Example #1
class Searcher():
    def __init__(self):
        self.conf = SparkConf().setMaster("local").setAppName("Searcher")
        self.sc = SparkContext(conf=self.conf)

    def load_data(self, data_file):
        raw_data = self.sc.textFile(data_file)
        fields = raw_data.map(lambda x: x.split("\t"))
        self.documents = fields.map(lambda x: x[3].split(" "))
        self.document_names = fields.map(lambda x: x[1])

    def hashing(self, size):
        self.hashing_TF = HashingTF(
            size)  # number of hash buckets; kept configurable to save memory
        tf = self.hashing_TF.transform(self.documents)

        tf.cache()
        idf = IDF(minDocFreq=2).fit(tf)
        self.tfidf = idf.transform(tf)

    def search(self, search_text):
        search_text_TF = self.hashing_TF.transform([search_text])
        search_text_hash_value = int(search_text_TF.indices[0])
        search_text_relevance = self.tfidf.map(
            lambda x: x[search_text_hash_value])

        return search_text_relevance.zip(self.document_names)
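A minimal usage sketch for the Searcher class above. The imports are the ones the class relies on; the TSV path and keyword are placeholders for a tab-separated file whose fields at index 1 and 3 hold the document name and body text:

from pyspark import SparkConf, SparkContext
from pyspark.mllib.feature import HashingTF, IDF

searcher = Searcher()
searcher.load_data("subset-small.tsv")  # placeholder TSV path
searcher.hashing(100000)                # 100K hash buckets
results = searcher.search("Apollo")     # RDD of (relevance, document name) pairs
print(results.max())                    # document scoring highest for the keyword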
def main():
    """
	Driver program for a spam filter using Spark and MLLib
	"""

    # Create the Spark Context for parallel processing
    sc = SparkContext(appName="Spam Filter")

    # Load the spam and ham data files into RDDs
    spam = sc.textFile(
        "E:\\Personal\\Imp Docs\\Spark Projects\\Spam-Ham\\20050311_spam_2.tar\\20050311_spam_2\\spam.txt"
    )
    ham = sc.textFile(
        "E:\\Personal\\Imp Docs\\Spark Projects\\Spam-Ham\\20030228_easy_ham.tar\\20030228_easy_ham\\ham.txt"
    )

    # Create a HashingTF instance to map email text to vectors of 10,000 features.
    tf = HashingTF(numFeatures=10000)

    # Each email is split into words, and each word is mapped to one feature.
    spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
    hamFeatures = ham.map(lambda email: tf.transform(email.split(" ")))

    # Create LabeledPoint datasets for positive (spam) and negative (ham) data points.
    positiveExamples = spamFeatures.map(
        lambda features: LabeledPoint(1, features))
    negativeExamples = hamFeatures.map(
        lambda features: LabeledPoint(0, features))

    # Combine positive and negative datasets into one
    data = positiveExamples.union(negativeExamples)

    # Split the data into 70% for training and 30% test data sets
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Cache the training data to optimize the Logistic Regression
    trainingData.cache()

    # Train the model with Logistic Regression using the SGD algorithm.
    model = LogisticRegressionWithSGD.train(trainingData)

    # Create tuples of actual and predicted values
    labels_and_predictions = testData.map(
        lambda email: (email.label, model.predict(email.features)))

    # Calculate the error rate as number wrong / total number
    error_rate = labels_and_predictions.filter(
        lambda (val, pred): val != pred).count() / float(testData.count())

    # End the Spark Context
    sc.stop()

    #  Print out the error rate
    print("*********** SPAM FILTER RESULTS **********")
    print("\n")
    print("Error Rate: " + str(error_rate))
    print("\n")

    # Serialize the model for persistence
    pickle.dump(model, open("spamFilter.pkl", "wb"))
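Note that the error-rate step above relies on Python 2 tuple unpacking in its lambda; an equivalent sketch that also runs under Python 3, using the same testData and model, is:

labels_and_predictions = testData.map(
    lambda lp: (lp.label, model.predict(lp.features)))
error_rate = labels_and_predictions.filter(
    lambda pair: pair[0] != pair[1]).count() / float(testData.count())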
Example #3
def init_tranining_set(sc):
    """
    Merge the positive/negative terms
    param: sc: the SparkContext object
    """

    words = sc.textFile('traning_words.csv')
    # Build the term-frequency matrix
    hashingTF = HashingTF()
    tf = hashingTF.transform(words)

    # Compute the TF-IDF matrix
    idfModel = IDF().fit(tf)
    tfidf = idfModel.transform(tf)
    tf.cache()

    with open('NBmodel.pkl', 'r') as f:
        NBmodel = pickle.load(f)

    session = get_session(settings.DB_URL)
    for r in session.execute('select * from traning_collection').fetchall():
        yourDocument = r[3]
        print r[3]
        yourwords="/".join(jieba.cut_for_search(yourDocument)).split("/")
        yourtf = hashingTF.transform(yourwords)
        yourtfidf=idfModel.transform(yourtf)
        print('NaiveBayes Model Predict:', NBmodel.predict(yourtfidf))
Example #4
    def analyse_data(self, data):
        """
        Run the appropriate analysis on the input data
        param data: file, unicode, str
        """
        words = self.sc.textFile(self.training_words_dir)
        # Build the term-frequency matrix
        hashingTF = HashingTF()
        tf = hashingTF.transform(words)

        # Compute the TF-IDF matrix
        idfModel = IDF().fit(tf)
        tfidf = idfModel.transform(tf)
        tf.cache()

        with open(self.NBmodel, 'r') as f:
            NBmodel = pickle.load(f)

        # Tokenize first, then analyse
        yourwords = set("/".join(jieba.cut_for_search(data)).split("/"))
        print 'Tokenization result: {}'.format(yourwords)
        yourtf = hashingTF.transform(yourwords)
        yourtfidf = idfModel.transform(yourtf)

        return NBmodel.predict(yourtfidf), data
Example #5
def column_search(words,row_filter):
    
    if row_filter == 'n' or row_filter == 'N':
        min_row = 0
    else:
        min_row = row_filter

    rawData = table_cols.join(master_index, master_index["Table_Name"]==table_cols["Name"]).rdd
    data = rawData.map(lambda x: (x['Doc_ID'], x['Columns'])).map(parse)

    titles = data.map(lambda x: x[0])
    documents = data.map(lambda x: x[1])
    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    normalizer = Normalizer()
    tfidf = normalizer.transform(idf.transform(tf))
    tfidfData = titles.zip(tfidf).toDF(["label", "features"])
    
    query = parse((0, words))[1]
    queryTF = hashingTF.transform(query)
    queryTFIDF = normalizer.transform(idf.transform(queryTF))
    queryRelevance = tfidfData.rdd.map(lambda x: (x[0], float(x[1].dot(queryTFIDF)))).sortBy(lambda x: -x[1]).filter(lambda x: x[1] > 0)
    queryRelevance = queryRelevance.toDF(["Doc_ID", "scores"])
    queryRelevance = queryRelevance.join(table_desc,queryRelevance.Doc_ID == table_desc.Doc_ID).select(table_desc.Doc_ID, queryRelevance.scores, table_desc.Columns)
    queryRelevance = queryRelevance.join(master_index, master_index.Doc_ID==queryRelevance.Doc_ID).select(master_index.Table_Name,master_index.Table_Length, queryRelevance.Columns, queryRelevance.scores)
    queryRelevance = queryRelevance.rdd.filter(lambda x: int(x['Table_Length']) >= int(min_row))
    if (queryRelevance.isEmpty()):
        print("Sorry, nothing matched in column search, please try a different keyword")
    else:
        print("Here is your column search result")
        queryRelevance.toDF().show()
Example #6
def task2():
    #Print title with Machine Learning Classification
    print("-------------------------------------------")
    startTitle = time.time()
    regex1 = re.compile(".*(title:).*")
    find1 = [m.group(0) for l in data for m in [regex1.search(l)] if m]
    title = [i.split('title: ', 1)[1] for i in find1]

    Programming = sc.textFile(fileProgramming)
    Other = sc.textFile(fileOther)

    # Create a HashingTF instance to map title text to vectors of 100,000 features.
    tf = HashingTF(numFeatures=100000)

    # Each title is split into words, and each word is mapped to one feature.
    programmingFeatures = Programming.map(
        lambda title: tf.transform(title.split(" ")))
    otherFeatures = Other.map(lambda title: tf.transform(title.split(" ")))

    # Create LabeledPoint datasets for positive (programming) and negative (other) examples.
    positiveExamples = programmingFeatures.map(
        lambda features: LabeledPoint(1, features))
    negativeExamples = otherFeatures.map(
        lambda features: LabeledPoint(0, features))
    trainingData = positiveExamples.union(negativeExamples)
    trainingData.cache()

    # Run Logistic Regression using the SGD algorithm.
    model = LogisticRegressionWithSGD.train(trainingData)

    listResult = []

    for row in title:
        test = tf.transform(row.split(" "))
        result = "null"
        if model.predict(test) == 1:
            result = "Programmings"
        else:
            result = "Non-Programming"
        joinResult = row + " = " + result
        listResult.append(joinResult)

    for i in listResult:
        if 'Non-Programming' in i:
            print(i)

    for i in listResult:
        if 'Programmings' in i:
            print(i)

    endTitle = time.time()
    elapsedTitle = endTitle - startTitle
    print(elapsedTitle)
    print("-------------------------------------------")
Example #7
def main():
	"""
	Driver program for a spam filter using Spark and MLLib
	"""

	# Consolidate the individual email files into a single spam file
	# and a single ham file
	makeDataFileFromEmails( "data/spam_2/", "data/spam.txt")
	makeDataFileFromEmails( "data/easy_ham_2/", "data/ham.txt" )

	# Create the Spark Context for parallel processing
	sc = SparkContext( appName="Spam Filter")

	# Load the spam and ham data files into RDDs
	spam = sc.textFile( "data/spam.txt" )
	ham = sc.textFile( "data/ham.txt" )

	# Create a HashingTF instance to map email text to vectors of 10,000 features.
	tf = HashingTF(numFeatures = 10000)

	# Each email is split into words, and each word is mapped to one feature.
	spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
	hamFeatures = ham.map(lambda email: tf.transform(email.split(" ")))

	# Create LabeledPoint datasets for positive (spam) and negative (ham) data points.
	positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
	negativeExamples = hamFeatures.map(lambda features: LabeledPoint(0, features))

	# Combine positive and negative datasets into one
	data = positiveExamples.union(negativeExamples)

	# Split the data into 70% for training and 30% test data sets 
	( trainingData, testData ) = data.randomSplit( [0.7, 0.3] )

	# Cache the training data to optimize the Logistic Regression
	trainingData.cache() 

	# Train the model with Logistic Regression using the SGD algorithm.
	model = LogisticRegressionWithSGD.train(trainingData)

	# Create tuples of actual and predicted values
	labels_and_predictions = testData.map( lambda email: (email.label, model.predict( email.features) ) )

	# Calculate the error rate as number wrong / total number
	error_rate = labels_and_predictions.filter( lambda (val, pred): val != pred ).count() / float(testData.count() )
	print( "*********** SPAM FILTER RESULTS **********" )
	print( "\n" )
	print( "Error Rate: " + str( error_rate ) )
	print( "\n" )

	# Serialize the model for persistence
	pickle.dump( model, open( "spamFilter.pkl", "wb" ) )

	sc.stop()
def main():
    sc = SparkContext(appName="BayesClassifer")
    htf = HashingTF(50000)
    data = sc.textFile('/home/varshav/work/PycharmProjects/Sentiment/cleaned_bayes_labels.csv')
    data_cleaned = data.map(lambda line : line.split(","))
    # Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors
    data_hashed = data_cleaned.map(lambda (label, text): LabeledPoint(label, htf.transform(text)))
    data_hashed.persist()
    # data = sc.textFile('/home/admin/work/spark-1.4.1-bin-hadoop2.4/data/mllib/sample_naive_bayes_data.txt').map(parseLine)
    #print data
    # Split data approximately into training (70%) and test (30%)
    training, test = data_hashed.randomSplit([0.70, 0.30], seed=0)

    sameModel = NaiveBayesModel.load(sc, "/home/varshav/work/PycharmProjects/StockAnalysis/myModel")

    print "----------"
    print sameModel.predict(htf.transform("posts jump in net profit"))

    predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features), p.label))
    predictionAndLabel1 = training.map(lambda p: (sameModel.predict(p.features), p.label))
    prediction = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
    prediction1 = 1.0 * predictionAndLabel1.filter(lambda (x, v): x == v).count() / training.count()
    buy_buy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == 1 and v ==1).count()


    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabel)

    # Overall statistics
    precision = metrics.precision()
    precision = normalize(precision)
    recall = metrics.recall()
    recall = normalize(recall)
    f1Score = metrics.fMeasure()
    f1Score = normalize(f1Score)
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    '''
    # Statistics by class
    labels = data_hashed.map(lambda lp: lp.label).distinct().collect()

    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
    '''
Example #9
def entrenar_spam(sc,
                  sql_context,
                  dir_spam,
                  dir_no_spam,
                  num_trees=20,
                  max_depth=8):
    input_spam = sc.textFile(dir_spam)
    input_no_spam = sc.textFile(dir_no_spam)

    spam = sql_context.read.json(input_spam).select("text").withColumn(
        "label", F.lit(1.0))
    no_spam = sql_context.read.json(input_no_spam).select("text").withColumn(
        "label", F.lit(0.0))

    training_data = spam.unionAll(no_spam)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    wordsData = tokenizer.transform(training_data)

    hashingTF = HashingTF(inputCol="words",
                          outputCol="rawFeatures",
                          numFeatures=140)
    featurizedData = hashingTF.transform(wordsData)
    """idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)"""

    seed = 1800009193L
    (split_20_df, split_80_df) = featurizedData.randomSplit([20.0, 80.0], seed)

    test_set_df = split_20_df.cache()
    training_set_df = split_80_df.cache()

    rf = RandomForestClassifier().setLabelCol("label") \
        .setPredictionCol("predicted_label") \
        .setFeaturesCol("rawFeatures") \
        .setSeed(100088121L) \
        .setMaxDepth(max_depth) \
        .setNumTrees(num_trees)

    rf_pipeline = Pipeline()
    rf_pipeline.setStages([rf])

    reg_eval = MulticlassClassificationEvaluator(
        predictionCol="predicted_label",
        labelCol="label",
        metricName="accuracy")

    crossval = CrossValidator(estimator=rf_pipeline,
                              evaluator=reg_eval,
                              numFolds=5)
    param_grid = ParamGridBuilder().addGrid(rf.maxBins, [50, 100]).build()
    crossval.setEstimatorParamMaps(param_grid)
    modelo = crossval.fit(training_set_df).bestModel

    predictions_and_labels_df = modelo.transform(test_set_df)

    accuracy = reg_eval.evaluate(predictions_and_labels_df)

    return modelo, accuracy
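A hedged call sketch for entrenar_spam, assuming an existing SparkContext and SQLContext and JSON-lines inputs with a "text" field (the paths are placeholders):

modelo, accuracy = entrenar_spam(sc, sql_context,
                                 "data/spam_tweets.json",
                                 "data/no_spam_tweets.json",
                                 num_trees=20,
                                 max_depth=8)
print("Accuracy: %s" % accuracy)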
Example #10
def main():
    # Initialize the SparkContext
    sc = spark_context(spark_master)

    # Read the input file
    data = sc.textFile(hdfs_path)

    # Tokenize
    documents = data.map(tokenize)
    documents.cache()

    # TF
    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)

    # IDF
    idf = IDF(minDocFreq=2).fit(tf)
    
    # TFIDF
    tfidf = idf.transform(tf)

    # Connect to MongoDB
    from pymongo import MongoClient
    mongo_client = MongoClient(mongo_host)
    mongo_client.admin.authenticate(mongo_user, mongo_pass, mechanism='SCRAM-SHA-1')
    clear_mongodb(mongo_client)

    # zip
    term_tfidf = documents.zip(tfidf).map(doc_tfidf)
    articles = term_tfidf.flatMap(lambda i: i).reduceByKey(lambda x, y: x + y)
    for article in articles.collect():
        item = {}
        item['text'] = article[0].encode('utf-8')
        item['size'] = int(article[1] * 10)
        send_mongodb(mongo_client, item)
Example #11
 def tfidf(self):
     self._create_rdd()
     hashingTF = HashingTF()
     tf = hashingTF.transform(self.token_rdd)
     idf = IDF(minDocFreq=2).fit(tf)
     tfidf = idf.transform(tf)
     return tfidf
Example #12
def createHashData(rdd):
    original = rdd.map(lambda line: line.split(", "))
    # load up the json string
    data = rdd.map(lambda line: line.split(", ")).collect()

    def fn(line):
        label = 0.0

        if line[9] == 'title':
            label = 1.0

        return (label, line[0:9])  # use the record's own fields rather than the collected RDD

    # create paired data
    data_pared = original.map(fn)

    print data_pared

    htf = HashingTF(100)

    # hash data
    data_hashed = data_pared.map(
        lambda (label, f): LabeledPoint(label, htf.transform(f)))

    return data_hashed
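A sketch of the input createHashData expects: an RDD of comma-plus-space separated records whose tenth field may equal 'title' (the sample rows and SparkContext are assumptions):

rdd = sc.parallelize(["f0, f1, f2, f3, f4, f5, f6, f7, f8, title",
                      "f0, f1, f2, f3, f4, f5, f6, f7, f8, other"])
data_hashed = createHashData(rdd)
print(data_hashed.collect())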
Example #13
File: ml.py Project: aditcoding/zfs
def main(sc):

    stopset = set(stopwords.words('english'))

    tweets = sc.textFile('hdfs:/adi/sample.txt')
    words = tweets.map(lambda word: word.split(" "))
    wordArr = []
    for wArr in words.collect():
        tempArr = []
        for w in wArr:
                if not w in stopset:
                        tempArr.append(w)
        wordArr.append(tempArr)
    # Open a file
   # print wordArr
    #tokens = sc.textFile("hdfs:/adi/tokens1.txt")

    # Load documents (one per line).
    documents = sc.textFile("hdfs:/adi/tokens1.txt").map(lambda line: line.split(" "))
    numDims = 100000
    hashingTF = HashingTF(numDims)
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    tfidf.count()
    model = KMeans.train(tfidf, 5)
    model.save(sc,"tweetModel1")
    print("Final centers: " + str(model.clusterCenters))
#    print("Total Cost: " + str(model.computeCost(data)))
    sc.stop()
Example #14
def process_data(data):

    print("Processing data ...")

    if (not data.isEmpty()):
        nbModel = bc_model.value
        hashingTF = HashingTF(100000)
        tf = hashingTF.transform(
            data.map(lambda x: x[0].encode('utf-8', 'ignore')))
        tf.cache()
        idf = IDF(minDocFreq=2).fit(tf)
        tfidf = idf.transform(tf)
        tfidf.cache()
        prediction = nbModel.predict(tfidf)

        temp = []
        i = 0
        for p, q, r in data.collect():
            temp.append([])
            temp[i].append(p.encode('utf-8', 'ignore'))
            temp[i].append(q)
            temp[i].append(r)
            i += 1
        i = 0
        for p in prediction.collect():
            temp[i].append(p)
            i += 1

        print(temp)
        for i in temp:

            insert_tweet(str(i[0]), str(i[1]), "0", int(i[3]), int(i[2]))
    else:
        print("Empty RDD !!!")
        pass
Example #15
def TFIDF(source, destination):

    if destination[-1] != '/':
        destination = destination + '/'
    ## typically define the source message
    rdd = sc.wholeTextFiles(source).map(lambda (name, text): text.split())
    tf = HashingTF()
    tfVectors = tf.transform(rdd).cache()
    a = tfVectors.collect()
    # Storing the TF values above in individual files, one per link
    ind = 0
    for vector in a:
        dest_path = destination + "TF_%d" % ind + ".txt"
        ind = ind + 1
        file = open(dest_path, 'w')
        file.write(str(vector))
        file.close()
    # Calculating IDF Values for each case.
    idf = IDF()
    idfModel = idf.fit(tfVectors)
    tfIdfVectors = idfModel.transform(tfVectors)
    # Writing TF-IDF values to a single file.
    file = open(destination + "TF-IDF.txt", 'w')
    file.write(str(tfIdfVectors.collect()))
    try:
        for i in range(0, 100):
            print ""  #Testing Printing"
    except KeyboardInterrupt:
        pass
Example #16
def run_tf_idf_spark_mllib(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    words = wordsData.select("words").rdd.map(lambda x: x.words)

    hashingTF = MllibHashingTF(numFeatures)
    tf = hashingTF.transform(words)
    tf.cache()

    idf = MllibIDF().fit(tf)
    tfidf = idf.transform(tf)

    # @TODO make this nicer
    tmp = sqlContext.createDataFrame(wordsData.rdd.zip(tfidf),
                                     ["data", "features"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, wordsData.columns))
    with_features = sqlContext.sql("SELECT %s, features FROM tmp" %
                                   old_columns)
    tmp = sqlContext.createDataFrame(with_features.rdd.zip(tf),
                                     ["data", "rawFeatures"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x,
                                with_features.columns))
    return sqlContext.sql("SELECT %s, rawFeatures FROM tmp" % old_columns)
def generatedHashedFeatures(tweet):
    #get label from tweet
    #get text from tweet

    htf = HashingTF(50000)
    lp = LabeledPoint("0", htf.transform(text))
    return lp
def main():
    #Reading the json file
    reviews_data = sqlContext.read.json(input)
    reviews=reviews_data.select('reviewText')
    reviews_rdd=reviews.rdd.cache()
    rdd_data=reviews_rdd.map(lambda line:str(line.reviewText))
    transformed_data=rdd_data.map(transform_data)

    #Finding Tf-IDF representation
    hashingTF = HashingTF()
    tf = hashingTF.transform(transformed_data)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf).collect()
    # Normalization
    # tfidf = idf.transform(tf)
    # normalizer1 = Normalizer()
    # normalized_vector=normalizer1.transform(tfidf).collect()

    score_rdd=reviews_data.rdd.map(lambda line:str(line.overall)).cache().collect()
    dates_rdd=reviews_data.rdd.map(lambda line:str(line.reviewTime)).map(lambda line:line.split(", ")).map(lambda (a,b):b).cache().collect()
    combinedList=zip(tfidf,score_rdd,dates_rdd)
    combinedRDD=sc.parallelize(combinedList).cache()
    TrainRDD=combinedRDD.filter(lambda (x,y,z):z!='2014').map(lambda (x,y,z):(x,y))
    TestRDD=combinedRDD.filter(lambda (x,y,z):z=='2014').map(lambda (x,y,z):(x,y))

    #Saving test and training data
    TrainRDD.saveAsPickleFile(output+'/Train_data_unnormalized.pickle')
    TestRDD.saveAsPickleFile(output+'/Test_data_unnormalized.pickle')
Example #19
def get_feature_vectors(sc, input_file, feature_dimensions):
    """Get feature vector from the lines in input_file_obj using
    TF/IDF.

    Returns:
        vectors RDD

    """

    # Load documents (one per line).
    tweet_file = sc.textFile(input_file)
    input_text_rdd = tweet_file.map(lambda line: _tokenize(line))
    input_text_rdd.cache()

    # The default feature dimension is 2^20; for a corpus with million
    # tweets recommended dimensions are 50000 or 100000. Use higher
    # dimensions for larger corpus of tweets.
    hashing_tf = HashingTF(feature_dimensions)
    tf = hashing_tf.transform(input_text_rdd)
    tf.cache()
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    tfidf.cache()

    return input_text_rdd, tfidf
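A usage sketch, assuming an existing SparkContext, a one-tweet-per-line text file (the path is a placeholder), and the module's _tokenize helper:

input_text_rdd, tfidf = get_feature_vectors(sc, "tweets.txt", 50000)
print(tfidf.first())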
Example #20
def TFIDF(source, destination):
    if destination[-1] != '/':
        destination=destination+'/'
## typically define the source message
    rdd=sc.wholeTextFiles(source).map(lambda (name,text): text.split())
    tf=HashingTF()
    tfVectors=tf.transform(rdd).cache()
    a = tfVectors.collect()
    # Storing the TF values above in individual files, one per link
    ind = 0
    for vector in a:
        dest_path = destination + "TF_%d"%ind + ".txt"
        ind = ind + 1
        file = open(dest_path,'w')
        file.write(str(vector))
        file.close()
    # Calculating IDF Values for each case.
    idf=IDF()
    idfModel=idf.fit(tfVectors)
    tfIdfVectors=idfModel.transform(tfVectors)
    # Writing TF-IDF values to a single file.
    file = open(destination+"TF-IDF.txt", 'w')
    file.write(str(tfIdfVectors.collect()))
    try:
        for i in range(0,100):
            print ""#Testing Printing"
    except KeyboardInterrupt:
            pass
Example #21
 def tfidf(self):
     self._create_rdd()
     hashingTF = HashingTF()
     tf = hashingTF.transform(self.token_rdd)
     idf = IDF(minDocFreq=2).fit(tf)
     tfidf = idf.transform(tf)
     return tfidf
def get_feature_vectors(sc, input_file, feature_dimensions):
    """Get feature vector from the lines in input_file_obj using
    TF/IDF.

    Returns:
        vectors RDD

    """

    # Load documents (one per line).
    tweet_file = sc.textFile(input_file)
    input_text_rdd = tweet_file.map(lambda line: _tokenize(line))
    input_text_rdd.cache()

    # The default feature dimension is 2^20; for a corpus with million
    # tweets recommended dimensions are 50000 or 100000. Use higher
    # dimensions for larger corpus of tweets.
    hashing_tf = HashingTF(feature_dimensions)
    tf = hashing_tf.transform(input_text_rdd)
    tf.cache()
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    tfidf.cache()

    return input_text_rdd, tfidf
Example #23
def main(sc):

    stopset = set(stopwords.words('english'))

    tweets = sc.textFile('hdfs:/adi/sample.txt')
    words = tweets.map(lambda word: word.split(" "))
    wordArr = []
    for wArr in words.collect():
        tempArr = []
        for w in wArr:
            if not w in stopset:
                tempArr.append(w)
        wordArr.append(tempArr)
    # Open a file

# print wordArr
#tokens = sc.textFile("hdfs:/adi/tokens1.txt")

# Load documents (one per line).
    documents = sc.textFile("hdfs:/adi/tokens1.txt").map(
        lambda line: line.split(" "))
    numDims = 100000
    hashingTF = HashingTF(numDims)
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    tfidf.count()
    model = KMeans.train(tfidf, 5)
    model.save(sc, "tweetModel1")
    print("Final centers: " + str(model.clusterCenters))
    #    print("Total Cost: " + str(model.computeCost(data)))
    sc.stop()
Example #24
def main():
    conf = SparkConf().setAppName("twitterclassifier")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)

    tweets = ssc.socketTextStream("localhost", PORT) \
                .map(lambda x: json.loads(x)) \
                .filter(lambda x: 'text' in x) \
                .map(lambda x: x['text'].encode('utf-8'))
    hasher = HashingTF(DIM)
    features = tweets.map(lambda x:
                          (x, hasher.transform(featurize(x)))).cache()

    # We create a model with random clusters and specify the number of clusters to find
    # decay = 1: total memory; decay = 0: no memory
    model = StreamingKMeans(k=N, decayFactor=0.1).setRandomCenters(DIM, 1.0, 0)
    model.trainOn(features.map(lambda x: x[1]))
    results = model.predictOnValues(features).cache()

    # Need a closure over i here.
    def print_group(i):
        results.filter(lambda x: x[1] == i).map(lambda x: '%i: %s' %
                                                (x[1], x[0])).pprint(3)

    for i in xrange(N):
        print_group(i)

    ssc.start()
    ssc.awaitTermination()
def generatedHashedFeatures(tweet):
    #get label from tweet
    #get text from tweet

    htf = HashingTF(50000)
    lp = LabeledPoint("0", htf.transform(text))
    return lp
def tfidf(rdd_doc):
    hasingTF = HashingTF()
    trainTf = hasingTF.transform(rdd_doc)
    trainTf.cache()
    idf = IDF().fit(trainTf)
    trainTfidf = idf.transform(trainTf)
    trainTfidf.cache()
    return trainTfidf, lambda x: hasingTF.indexOf(x)
Example #27
def transform(idf, article):
    """
    transform article to a sparse vector
    """
    token = tokenizing(article)
    hashingTF = HashingTF()
    tf_test = hashingTF.transform(token)
    return idf.transform(tf_test)
def get_tfidf_features(txt):
    hashingTF = HashingTF()
    tf = hashingTF.transform(txt)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)

    return tfidf
def generate_gender_tf(twProfilesRdd, numFe):
    """
    Generate Term Frequency tuple (gender,sparse vector) from rdd containing following tuples:
    (gender,(clean words tuple))
    """
    tf = HashingTF(numFeatures=numFe)
    return twProfilesRdd.map(lambda genderDescrTuple: (genderDict[
        genderDescrTuple[0]], tf.transform(genderDescrTuple[1])))
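A sketch of how the (gender, clean words) RDD might be fed in; the genderDict mapping and sample rows are assumptions, since the real mapping is defined elsewhere in the module:

genderDict = {"male": 0, "female": 1}   # assumed module-level mapping used above
profiles = sc.parallelize([("male", ("loves", "spark")),
                           ("female", ("data", "science"))])
tf_rdd = generate_gender_tf(profiles, 1 << 12)
print(tf_rdd.first())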
Example #30
def get_tfidf_features(txt_rdd):
    hashingTF = HashingTF()
    tf = hashingTF.transform(txt_rdd)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)

    return tfidf
Example #31
    def create_bayes(self):
        """ 创建贝叶斯训练模型 """

        if self._check_traning_exists():
            return

        # Fetch the positive text and build an RDD
        positive_file = os.path.join(settings.DATA_DIR, '分类词库/positive.txt')
        positive_data = self.sc.textFile(positive_file)
        # De-duplicate the data
        positive_data = positive_data.distinct()
        positive_data = positive_data.map(
            lambda line: line.split('###')).filter(lambda line: len(line) == 2)

        # Fetch the negative text and build an RDD
        negative_file = os.path.join(settings.DATA_DIR, '分类词库/negative.txt')
        negative_data = self.sc.textFile(negative_file)
        negative_data = negative_data.distinct()
        negative_data = negative_data.map(
            lambda line: line.split('###')).filter(lambda line: len(line) == 2)

        # Merge the training sets
        all_data = negative_data.union(positive_data)
        all_data.repartition(1)
        # The ratings were preprocessed beforehand to be only -1 and 1
        rate = all_data.map(lambda s: s[0])
        document = all_data.map(lambda s: s[1])

        words = document.map(lambda w:"/".\
                join(jieba.cut_for_search(w))).\
                map(lambda line: line.split("/"))

        # Build the term-frequency matrix
        hashingTF = HashingTF()
        tf = hashingTF.transform(words)

        # Compute the TF-IDF matrix
        idfModel = IDF().fit(tf)
        tfidf = idfModel.transform(tf)
        tf.cache()

        # Generate the training and test sets
        zipped = rate.zip(tfidf)
        data = zipped.map(lambda line: LabeledPoint(line[0], line[1]))
        training, test = data.randomSplit([0.6, 0.4], seed=0)

        # Train the Naive Bayes classification model
        NBmodel = NaiveBayes.train(training, 1.0)
        predictionAndLabel = test.map(lambda p:
                                      (NBmodel.predict(p.features), p.label))
        accuracy = 1.0 * predictionAndLabel.filter(lambda x: 1.0 \
                if x[0] == x[1] else 0.0).count() / test.count()

        # Store the words RDD
        words.repartition(1).saveAsTextFile(self.training_words_dir)
        # Store the Naive Bayes model with pickle
        with open(self.NBmodel, 'w') as f:
            pickle.dump(NBmodel, f)
Example #32
 def tfidf(self, tokenizer):
     """
     Get TFIDF matrix rdd with spark tfidf functions
     """
     self._create_rdd(tokenizer)
     hashingTF = HashingTF()
     tf = hashingTF.transform(self.token_rdd)
     idf = IDF(minDocFreq=2).fit(tf)
     tfidf = idf.transform(tf)
     return self.rdd, idf, tfidf
Example #33
def tf_idf(sc,title_token):
    hashingTF = HashingTF(100)
    title_token = sc.parallelize(title_token)
    tf = hashingTF.transform(title_token)
    print tf, ' tf'
   
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
   
    return tfidf
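A call sketch for tf_idf, assuming title_token is an in-driver list of token lists:

tfidf = tf_idf(sc, [["spark", "hashing", "tf"], ["idf", "weighting"]])
print(tfidf.first())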
Example #34
def vectorize(sc, rdd_words, size=0):
    '''
       Vectorize the words using TF
       The vector dimensionality must be specified; the default is 2^20
    '''
    if not size:
        size = rdd_words.flatMap(lambda x: x).distinct().count() + 10000
    hashingTF = HashingTF(size)
    tf = hashingTF.transform(rdd_words)
    return tf
Example #35
def mySpark(minFreq, keyWord):

    # text cleaning function
    def removePunctuation(text):
        res=text.lower().strip()
        res=re.sub("[^0-9a-zA-Z ]", "", res)
        return res.split(" ")

    # Function for printing each element in RDD
    def println(x):
        for i in x:
            print i

    # Boilerplate Spark stuff:
    conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
    sc = SparkContext(conf = conf)

    # Load documents content (one per line) + cleaning.
    rawData = sc.textFile("list_berita-30.tsv")
    fields = rawData.map(lambda x: x.split("\t"))
    documents = fields.map(lambda x: removePunctuation(x[3]))

    # Get documents content without word mapping
    documentNames = fields.map(lambda x: x[3])

    # TF processing
    hashingTF = HashingTF(100000)  #100K hash buckets just to save some memory
    tf = hashingTF.transform(documents)

    # IDF & TF-IDF processing
    tf.cache()
    idf = IDF(minDocFreq=int(minFreq)).fit(tf)
    tfidf = idf.transform(tf)

    # Get keyword relevance with content and zip it
    keywordTF = hashingTF.transform(removePunctuation(keyWord))
    keywordHashValue = int(keywordTF.indices[0])
    keywordRelevance = tfidf.map(lambda x: x[keywordHashValue])
    zippedResults = keywordRelevance.zip(documentNames)

    # print result
    print "Best document for keywords is:"
    print zippedResults.max()
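A call sketch; the minimum document frequency and keyword are arbitrary, and the hard-coded list_berita-30.tsv file is assumed to exist locally:

mySpark(2, "ekonomi")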
Example #36
def mySpark(minFreq, keyWord):

    # text cleaning function
    def removePunctuation(text):
        res = text.lower().strip()
        res = re.sub("[^0-9a-zA-Z ]", "", res)
        return res.split(" ")

    # Function for printing each element in RDD
    def println(x):
        for i in x:
            print i

    # Boilerplate Spark stuff:
    conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
    sc = SparkContext(conf=conf)

    # Load documents content (one per line) + cleaning.
    rawData = sc.textFile("list_berita-30.tsv")
    fields = rawData.map(lambda x: x.split("\t"))
    documents = fields.map(lambda x: removePunctuation(x[3]))

    # Get documents content without word mapping
    documentNames = fields.map(lambda x: x[3])

    # TF processing
    hashingTF = HashingTF(100000)  #100K hash buckets just to save some memory
    tf = hashingTF.transform(documents)

    # IDF & TF-IDF processing
    tf.cache()
    idf = IDF(minDocFreq=int(minFreq)).fit(tf)
    tfidf = idf.transform(tf)

    # Get keyword relevance with content and zip it
    keywordTF = hashingTF.transform(removePunctuation(keyWord))
    keywordHashValue = int(keywordTF.indices[0])
    keywordRelevance = tfidf.map(lambda x: x[keywordHashValue])
    zippedResults = keywordRelevance.zip(documentNames)

    # print result
    print "Best document for keywords is:"
    print zippedResults.max()
    def extractKeywords_Train(self):
        documents = self.sc.textFile(self.trainingfile).map(lambda line: line.split(" ")[1:])

        hashingTF = HashingTF()
        tf = hashingTF.transform(documents)
        tf.cache()

        idfIgnore = IDF(minDocFreq=2).fit(tf)
        tfidfIgnore = idfIgnore.transform(tf)

        tfidfIgnore.saveAsTextFile("AAA")
Example #38
def tf_idf_cal(words_rdd):
	hashingTF = HashingTF()
	tf = hashingTF.transform(words_rdd)

	idf = IDF().fit(tf)
	
	tfidf = idf.transform(tf).cache()

	tfidf_str = tfidf.map(lambda line: str(line)).cache()

	return tfidf_str
Example #39
 def test_binary_term_freqs(self):
     hashingTF = HashingTF(100).setBinary(True)
     doc = "a a b c c c".split(" ")
     n = hashingTF.numFeatures
     output = hashingTF.transform(doc).toArray()
     expected = Vectors.sparse(n, {hashingTF.indexOf("a"): 1.0,
                                   hashingTF.indexOf("b"): 1.0,
                                   hashingTF.indexOf("c"): 1.0}).toArray()
     for i in range(0, n):
         self.assertAlmostEqual(output[i], expected[i], 14, "Error at " + str(i) +
                                ": expected " + str(expected[i]) + ", got " + str(output[i]))
Example #40
 def test_binary_term_freqs(self):
     hashingTF = HashingTF(100).setBinary(True)
     doc = "a a b c c c".split(" ")
     n = hashingTF.numFeatures
     output = hashingTF.transform(doc).toArray()
     expected = Vectors.sparse(n, {hashingTF.indexOf("a"): 1.0,
                                   hashingTF.indexOf("b"): 1.0,
                                   hashingTF.indexOf("c"): 1.0}).toArray()
     for i in range(0, n):
         self.assertAlmostEqual(output[i], expected[i], 14, "Error at " + str(i) +
                                ": expected " + str(expected[i]) + ", got " + str(output[i]))
def predictSentiment(tweetText):
    nbModel = bc_model.value

    hashingTF = HashingTF()
    tf = hashingTF.transform(tweetText)
    tf.cache()
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    tfidf.cache()
    prediction = nbModel.predict(tfidf)
    print "Predictions for this window :"
    for i in range(0, prediction.count()):
        print prediction.collect()[i], tweetText.collect()[i]
Example #42
def calcTfidf(doc, source):
    """
    This method computes TF-IDF scores for the given document.
    While applying HashingTF only needs a single pass to the data, applying IDF needs two passes: first to compute the IDF vector and second to scale the term frequencies by IDF.
    """
    hashingTF = HashingTF(200000)
    tf = hashingTF.transform(doc)
    print "TF calculated for "+source.split('/')[-1]
    tf.cache()
    idf = IDF().fit(tf)  ##idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    print "TF-IDF calculated for "+source.split('/')[-1]  
    return hashingTF, tfidf
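A usage sketch for calcTfidf; the path is a placeholder and doc is an RDD of token lists, as the transform above expects:

source = "data/articles.txt"  # placeholder path
doc = sc.textFile(source).map(lambda line: line.split(" "))
hashingTF, tfidf = calcTfidf(doc, source)
print(tfidf.first())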
Example #43
def run():
    with pyspark.SparkContext('local', 'mapAndPartition') as sc:
        spam = sc.textFile('spam.txt')
        normal = sc.textFile('normal.txt')

        htf = HashingTF(numFeatures=10000)
        spamFeatures = spam.map(lambda email: htf.transform(email.split(' ')))
        normalFeatures = normal.map(lambda email: htf.transform(email.split(' ')))

        positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
        negativeExamples = normalFeatures.map(lambda features: LabeledPoint(0, features))

        trainingData = positiveExamples.union(negativeExamples)
        trainingData.cache()

        model = LogisticRegressionWithSGD.train(trainingData)

        posTest = htf.transform('D M G GET cheap stuff by sending money to ...'.split(' '))
        negTest = htf.transform('Hi Dad, I started studying Spark the other ...'.split(' '))

        print('Prediction for positive test example: {}'.format(model.predict(posTest)))
        print('Prediction for negative test example: {}'.format(model.predict(negTest)))
def vectorize(training):
    hashingTF = HashingTF()
    tf_training = training.map(lambda tup: hashingTF.transform(tup[1]))
    idf_training = IDF().fit(tf_training)
    tfidf_training = idf_training.transform(tf_training)
    tfidf_idx = tfidf_training.zipWithIndex()
    training_idx = training.zipWithIndex()
    idx_training = training_idx.map(lambda line: (line[1], line[0]))
    idx_tfidf = tfidf_idx.map(lambda l: (l[1], l[0]))
    joined_tfidf_training = idx_training.join(idx_tfidf)
    training_labeled = joined_tfidf_training.map(lambda tup: tup[1])
    labeled_training_data = training_labeled.map(lambda k: LabeledPoint(k[0][0], k[1]))
    return labeled_training_data
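A sketch of the input vectorize expects, an RDD of (label, token list) pairs (the sample rows are invented):

training = sc.parallelize([(1.0, ["free", "money", "now"]),
                           (0.0, ["meeting", "at", "noon"])])
labeled_training_data = vectorize(training)
print(labeled_training_data.first())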
Example #45
def main():
    sc = SparkContext(appName="BayesClassifer")
    htf = HashingTF(50000)
    data = sc.textFile('/home/varshav/work/PycharmProjects/Sentiment/1.csv')
    data_cleaned = data.map(lambda line: line.split(","))
    # Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors
    data_hashed = data_cleaned.map(
        lambda (label, text): LabeledPoint(label, htf.transform(text)))
    data_hashed.persist()
    # data = sc.textFile('/home/admin/work/spark-1.4.1-bin-hadoop2.4/data/mllib/sample_naive_bayes_data.txt').map(parseLine)
    #print data
    # Split data approximately into training (70%) and test (30%)
    training, test = data_hashed.randomSplit([0.70, 0.30], seed=0)

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)

    # Save and load model
    model.save(sc, "/home/varshav/Desktop/Bangalore")
    sameModel = NaiveBayesModel.load(sc, "/home/varshav/Desktop/Bangalore")

    print "----------"
    print model.predict(htf.transform("posts jump in net profit"))
    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p:
                                  (sameModel.predict(p.features), p.label))
    predictionAndLabel1 = training.map(
        lambda p: (sameModel.predict(p.features), p.label))
    prediction = 1.0 * predictionAndLabel.filter(
        lambda (x, v): x == v).count() / test.count()
    #buy_buy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == 1 and v == 1 ).count()
    #    print buy_buy
    prediction1 = 1.0 * predictionAndLabel1.filter(
        lambda (x, v): x == v).count() / training.count()

    print prediction
    print prediction1
    sc.stop()
Example #46
def main():
    # Initialize the SparkContext
    sc = spark_context(spark_master)

    # Load the data
    data = sc.textFile(hdfs_path)

    # Compute term frequencies
    documents = data.map(tokenize)
    hashingTF = HashingTF(2 << 10)
    tf = hashingTF.transform(documents)

    # Index the document term-frequency vectors
    corpus = tf.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

    # Mapping between hash indices and terms
    mapping = hashing_term_mapping(documents)
    mapping.cache()

    # Train the LDA model
    ldaModel = LDA.train(corpus, k=3)

    # Connect to MongoDB
    from pymongo import MongoClient

    mongo_client = MongoClient(mongo_host)
    mongo_client.admin.authenticate(mongo_user, mongo_pass, mechanism="SCRAM-SHA-1")
    clear_mongodb(mongo_client)

    # Save the results to MongoDB
    topics = ldaModel.describeTopics(maxTermsPerTopic=10)
    for topic in range(3):
        doc = {}
        doc["name"] = "topic " + str(topic)
        doc["terms"] = []
        for i in range(10):
            term_index = topics[topic][0][i]
            for term in mapping.lookup(term_index):
                doc["terms"].append([term.encode("utf8"), topics[topic][1][i]])
        send_mongodb(mongo_client, doc)
Example #47
def run_tf_idf_spark_mllib(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    words = wordsData.select("words").rdd.map(lambda x: x.words)

    hashingTF = MllibHashingTF(numFeatures)
    tf = hashingTF.transform(words)
    tf.cache()

    idf = MllibIDF().fit(tf)
    tfidf = idf.transform(tf)

    # @TODO make this nicer
    tmp = sqlContext.createDataFrame(wordsData.rdd.zip(tfidf), ["data", "features"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, wordsData.columns))
    with_features = sqlContext.sql("SELECT %s, features FROM tmp" % old_columns)
    tmp = sqlContext.createDataFrame(with_features.rdd.zip(tf), ["data", "rawFeatures"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, with_features.columns))
    return sqlContext.sql("SELECT %s, rawFeatures FROM tmp" % old_columns)
def featurize(tweet_tuple):
    """
    generate features for this tweet text
    returns: csv line with a the last field
    containing the feature vector for the tweet

    """
    ID_FIELD_IDX = 0
    CREATED_AT_IDX = 1
    TIMESTAMP_MS = 2
    LANG_FIELD_IDX = 3
    LON_FIELD_IDX = 4
    LAT_FIELD_IDX = 5
    TEXT_IDX = 6

    TWEET_IDX = 1

    #split the tweet into components id, lang, text, lon, lat etc
    tweet_attrib_list = tweet_tuple[TWEET_IDX].split(",")
    #get the text
    text = tweet_attrib_list[TEXT_IDX]
    #tokenize the text
    word_list = tokenize(text)
    #remove stop words
    word_list = removeStopWords(word_list)
    #remove punctuations
    word_list = removePunctuation(word_list)
    #stemmed the tokens
    word_list = stemmed_tokens(word_list)
    st = " ".join(word_list)
    #hash the words
    htf = HashingTF(50000)
    hashedfeatures = htf.transform(word_list)  # hash the cleaned tokens (a raw string would be hashed character by character)
    tweet = tweet_tuple[TWEET_IDX]
    results = {'tweet':tweet, 'features':hashedfeatures}
    return  results
Example #49
def createHashData(rdd):
    original = rdd.map(lambda line : line.split(", "))
    # load up the json string
    data = rdd.map(lambda line : line.split(", ")).collect();

    def fn(line):
        label = 0.0

        if line[9] == 'title':
            label = 1.0

        return (label, line[0:9])  # use the record's own fields rather than the collected RDD

    # create paired data
    data_pared = original.map(fn)

    print data_pared

    htf = HashingTF(100)

    # hash data
    data_hashed = data_pared.map(lambda (label, f) : LabeledPoint(label, htf.transform(f)))

    return data_hashed
Example #50
# Initialize a SparkContext
sc = SparkContext()
# Import full dataset of newsgroup posts as text file
#data_raw = sc.textFile('hdfs://ec2-54-213-237-76.us-west-2.compute.amazonaws.com:9000/trainingdata/trainingdata/bbcjsontxt')
data_raw = sc.textFile('bbcdataset.json')

# Parse JSON entries in dataset
data = data_raw.map(lambda line: json.loads(line))
# Extract relevant fields in dataset -- category label and text content
data_pared = data.map(lambda line: (line['label'], line['text']))
# Temporary print statement for testing partial script
print data_pared.first()

# Prepare text for analysis using our tokenize function to clean it up
data_cleaned = data_pared.map(lambda (label, text): (label, tokenize(text)))

# Hashing term frequency vectorizer with 50k features
htf = HashingTF(50000)

# Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors
data_hashed = data_cleaned.map(lambda (label, text): LabeledPoint(hash(label), htf.transform(text)))

# Ask Spark to persist the RDD so it won't have to be re-created later
data_hashed.persist()
# Train a Naive Bayes model on the training data
model = NaiveBayes.train(data_hashed)

#model.save(sc, "hdfs://ec2-54-213-237-76.us-west-2.compute.amazonaws.com:9000/trainingdata/trainingdata/bbcmodela")
model.save(sc, "bbcmodel")
Example #51
def getTFVector(review):
	htf = HashingTF(1000)
	doc = review.split()
	return htf.transform(doc).toArray()
Example #52
# Function for printing RDD contents
def println(x):
    print x

# Boilerplate Spark stuff:
conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
sc = SparkContext(conf = conf)

# Load documents (one per line).
rawData = sc.textFile("subset-small.tsv")
fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))

documentNames = fields.map(lambda x: x[1])


hashingTF = HashingTF(100000)  #100K hash buckets just to save some memory
tf = hashingTF.transform(documents)

tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

keywordTF = hashingTF.transform(["Apollo"])
keywordHashValue = int(keywordTF.indices[0])

keywordRelevance = tfidf.map(lambda x: x[keywordHashValue])

zippedResults = keywordRelevance.zip(documentNames)

print "Best document for keywords is:"
print zippedResults.max()
Example #53
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark.mllib.feature import HashingTF, IDF

sc = SparkContext()

rdd = sc.wholeTextFiles("/usr/local/Cellar/BigDataAdvanced/Assignment1/TwitterStuff/TweetData").map(lambda (name,text):text.split())
tf = HashingTF()
tfVectors = tf.transform(rdd).cache()
a = tfVectors.collect()
count = 0 
for vec in a:
        print vec
        count = count + 1
        with open("TF_Tweet"+str(count)+".txt","w") as f:
                f.write(str(vec))
        f.close()

idf = IDF()
idfModel = idf.fit(tfVectors)
tfIdfVectors = idfModel.transform(tfVectors)
file = open("TF-IDF_tweet.txt", 'w')
file.write(str(tfIdfVectors.collect()))

#count = 0
#output=tfIdfVectors.collect()
#for vec in output:
#	print vec
#	count = count + 1
#	with open("TF_Wiki"+str(count)+".txt","w") as f:
#		f.write(str(vec))
Example #54
    # collectVocab = vocab.collect()
    # remove top 3 lines from document
    doc_wo_counters = documents.mapPartitionsWithIndex(lambda i, iter: islice(iter, 3, None) if i == 0 else iter)

    final_doc = doc_wo_counters.map(lambda x: (int(x[0]), doc_to_words(int(x[1]), int(x[2])).encode("utf8"))).reduceByKey(lambda x, y: x + " " + y)

    vect_rep = final_doc.map(lambda x: x[1])

    raw_document = sc.textFile("test.txt")
    vect_rep = raw_document.map(lambda line: line.encode("utf8").split(" "))

    
    # TfIDF
    hashingTF = HashingTF()
    tf = hashingTF.transform(vect_rep)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf_vectors = idf.transform(tf)
    
    #Build the model (cluster the data)
    clusters = KMeans.train(tfidf_vectors, 10, maxIterations=100)
    
    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point.toArray() - center)]))

    WSSSE = tfidf_vectors.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

#Exports data to csv
train.to_csv("product_train.csv",",",index=False)
#all.to_csv("product_all.csv",",",index=False)


# Read the training data file created above into an RDD
train = sc.textFile( "product_train.csv" ).map(lambda line: (line.split(',')))
header = train.first() #extract header
train2 = train.filter(lambda x:x !=header)

train_title = sc.textFile( "product_train.csv" ).map(lambda line: (line.split(',')[1]))

hashingTF = HashingTF(50000)
tf = train_title.map(lambda title: hashingTF.transform(title.split(" ")))
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)
print(tfidf.first())

data_pared = train2.map(lambda line: (line[0], line[1]))
data_pared2 = train2.map(lambda line: (line[0]))
train_cleaned = data_pared.map(lambda (label, text): (label, tokenize(text)))
#parsedData = train_cleaned.map(lambda (label,text): LabeledPoint(label, idf.transform(text)))
parsedData = train_cleaned.map(lambda (label, text): LabeledPoint(label, hashingTF.transform(text)))


# Split the data into two RDDs. 70% for training and 30% test data sets
( trainingData, testData ) = parsedData.randomSplit( [0.7, 0.3] )
Example #56
from pyspark.mllib.linalg import SparseVector 
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
import math

dim = int(math.pow(2, 16))  # HashingTF expects an integer number of features
hashingTF = HashingTF(dim)
tokens = manytokens_final.map(lambda l:[k for (k,v) in l])
tf = hashingTF.transform(tokens)
tf.cache()


idf = IDF().fit(tf)
tfidf = idf.transform(tf)

#print(tfidf.count()) #=11314
Example #57
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF


# Load documents (one per line).
sc = SparkContext()
documents = sc.textFile("training/bigdata_documents_cat.txt").map(lambda line: line.split(" "))

hashingTF = HashingTF()
tf = hashingTF.transform(documents)

# ... continue from the previous example
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

# save the matrix
#tfidf.saveAsSequenceFile("training/matrix.txt")
tfidfmatrix = tfidf.collect()

count = str(tfidf.count())

givenIndex = 0
givenDocumentMatrix = tfidfmatrix[givenIndex]

def similarity(x):
	return givenDocumentMatrix.dot(x)

sim = tfidf.map(similarity)
indexedsim = sim.zipWithIndex().map(lambda keyval: (keyval[1],keyval[0]))
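A short follow-on sketch, continuing the variables above, to list the documents scoring highest against the one at givenIndex (the document itself will rank first):

for idx, score in indexedsim.sortBy(lambda kv: -kv[1]).take(5):
    print(idx, score)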
Example #58
from pyspark import SparkConf, SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD

conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf = conf)

spam = sc.textFile("/home/sakib/spark-1.3.1/spark_workspace/data/spam.txt")
normal = sc.textFile("/home/sakib/spark-1.3.1/spark_workspace/data/ham.txt")
# Create a HashingTF instance to map email text to vectors of 10,000 features.
tf = HashingTF(numFeatures = 10000)
# Each email is split into words, and each word is mapped to one feature.
spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
normalFeatures = normal.map(lambda email: tf.transform(email.split(" ")))
# Create LabeledPoint datasets for positive (spam) and negative (normal) examples.


positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
negativeExamples = normalFeatures.map(lambda features: LabeledPoint(0, features))
trainingData = positiveExamples.union(negativeExamples)
trainingData.cache() # Cache since Logistic Regression is an iterative algorithm.
# Run Logistic Regression using the SGD algorithm.
model = LogisticRegressionWithSGD.train(trainingData)
# Test on a positive example (spam) and a negative one (normal). We first apply
# the same HashingTF feature transformation to get vectors, then apply the model.
posTest = tf.transform("O M G GET cheap stuff by sending money to ...".split(" "))
negTest = tf.transform("Hi Dad, I started studying Spark the other ...".split(" "))
print "Prediction for positive test example: %g" % model.predict(posTest)
print "Prediction for negative test example: %g" % model.predict(negTest)
def text_parsing(text_content):
    # Regular expression to find the text content in the XML files
    TEXT_RE = re.compile(r'<text.+>([\s\S]*)<\/text>')
    
    liste = TEXT_RE.findall(text_content)
    str1 = re.split('[^a-zA-Z.]', liste[0].lower())
    str2 = filter (None, str1)
    return str2

splitRDD = dataRDD.values().map(text_parsing)


#Building tf-idf

hashingTF = HashingTF()
tf = hashingTF.transform(splitRDD)
from pyspark.mllib.feature import IDF

# ...from tf create IDF
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)


zipped = splitRDD.zip(tfidf)
fRDD = splitRDD.flatMap(lambda x: x).distinct()
#print fRDD.count()


wordRDD = fRDD.map(lambda x: (x, hashingTF.indexOf(x)))
listW = wordRDD.collect()