Example #1
def column_search(words,row_filter):
    
    if row_filter == 'n' or row_filter == 'N':
        min_row = 0
    else:
        min_row = row_filter

    rawData = table_cols.join(master_index, master_index["Table_Name"]==table_cols["Name"]).rdd
    data = rawData.map(lambda x: (x['Doc_ID'], x['Columns'])).map(parse)

    titles = data.map(lambda x: x[0])
    documents = data.map(lambda x: x[1])
    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    normalizer = Normalizer()
    tfidf = normalizer.transform(idf.transform(tf))
    tfidfData = titles.zip(tfidf).toDF(["label", "features"])
    
    query = parse((0, words))[1]
    queryTF = hashingTF.transform(query)
    queryTFIDF = normalizer.transform(idf.transform(queryTF))
    queryRelevance = tfidfData.rdd.map(lambda x: (x[0], float(x[1].dot(queryTFIDF)))).sortBy(lambda x: -x[1]).filter(lambda x: x[1] > 0)
    queryRelevance = queryRelevance.toDF(["Doc_ID", "scores"])
    queryRelevance = queryRelevance.join(table_desc,queryRelevance.Doc_ID == table_desc.Doc_ID).select(table_desc.Doc_ID, queryRelevance.scores, table_desc.Columns)
    queryRelevance = queryRelevance.join(master_index, master_index.Doc_ID==queryRelevance.Doc_ID).select(master_index.Table_Name,master_index.Table_Length, queryRelevance.Columns, queryRelevance.scores)
    queryRelevance = queryRelevance.rdd.filter(lambda x: int(x['Table_Length']) >= int(min_row))
    if (queryRelevance.isEmpty()):
        print("Sorry, nothing matched in column search, please try a different keyword")
    else:
        print("Here is your column search result")
        queryRelevance.toDF().show()
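Note: the Normalizer above is what turns the plain dot product used for queryRelevance into a cosine similarity. A minimal standalone sketch (not part of the example) illustrating this with pyspark.mllib:

from pyspark.mllib.feature import Normalizer
from pyspark.mllib.linalg import Vectors

norm = Normalizer()  # L2 normalization by default
a = norm.transform(Vectors.dense([1.0, 2.0, 0.0]))
b = norm.transform(Vectors.dense([2.0, 1.0, 1.0]))
# after L2 normalization, dot(a, b) equals the cosine similarity of the original vectors
print(a.dot(b))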
Example #2
def run_tf_idf_spark_mllib(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    words = wordsData.select("words").rdd.map(lambda x: x.words)

    hashingTF = MllibHashingTF(numFeatures)
    tf = hashingTF.transform(words)
    tf.cache()

    idf = MllibIDF().fit(tf)
    tfidf = idf.transform(tf)

    # @TODO make this nicer
    tmp = sqlContext.createDataFrame(wordsData.rdd.zip(tfidf),
                                     ["data", "features"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, wordsData.columns))
    with_features = sqlContext.sql("SELECT %s, features FROM tmp" %
                                   old_columns)
    tmp = sqlContext.createDataFrame(with_features.rdd.zip(tf),
                                     ["data", "rawFeatures"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x,
                                with_features.columns))
    return sqlContext.sql("SELECT %s, rawFeatures FROM tmp" % old_columns)
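Note: the zip/registerTempTable workaround flagged by the @TODO can usually be avoided with the DataFrame-based pyspark.ml API (Spark 1.4+). A minimal sketch of that alternative, assuming the same df with a "body" column:

from pyspark.ml.feature import Tokenizer, HashingTF, IDF

def run_tf_idf_spark_ml(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numFeatures)
    featurized = hashingTF.transform(wordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurized)
    # keeps the original columns plus rawFeatures and features, no zip needed
    return idfModel.transform(featurized)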
Example #3
def init_tranining_set(sc):
    """
    合并积极/消极的词性
    param: sc spark对象的context
    """

    words = sc.textFile('traning_words.csv')
    # Build the term-frequency matrix
    hashingTF = HashingTF()
    tf = hashingTF.transform(words)

    # Compute the TF-IDF matrix
    idfModel = IDF().fit(tf)
    tfidf = idfModel.transform(tf)
    tf.cache()

    with open('NBmodel.pkl', 'r') as f:
        NBmodel = pickle.load(f)

    session = get_session(settings.DB_URL)
    for r in session.execute('select * from traning_collection').fetchall():
        yourDocument = r[3]
        print r[3]
        yourwords="/".join(jieba.cut_for_search(yourDocument)).split("/")
        yourtf = hashingTF.transform(yourwords)
        yourtfidf=idfModel.transform(yourtf)
        print('NaiveBayes Model Predict:', NBmodel.predict(yourtfidf))
Example #4
File: ml.py Project: aditcoding/zfs
def main(sc):

    stopset = set(stopwords.words('english'))

    tweets = sc.textFile('hdfs:/adi/sample.txt')
    words = tweets.map(lambda word: word.split(" "))
    wordArr = []
    for wArr in words.collect():
        tempArr = []
        for w in wArr:
            if w not in stopset:
                tempArr.append(w)
        wordArr.append(tempArr)
    # Open a file
    # print wArr
    # tokens = sc.textFile("hdfs:/adi/tokens1.txt")

    # Load documents (one per line).
    documents = sc.textFile("hdfs:/adi/tokens1.txt").map(lambda line: line.split(" "))
    numDims = 100000
    hashingTF = HashingTF(numDims)
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    tfidf.count()
    model = KMeans.train(tfidf, 5)
    model.save(sc,"tweetModel1")
    print("Final centers: " + str(model.clusterCenters))
#    print("Total Cost: " + str(model.computeCost(data)))
    sc.stop()
Example #5
    def analyse_data(self, data):
        """
        针对入口数据进行合适的分析
        param data: file, unicode, str
        """
        words = self.sc.textFile(self.training_words_dir)
        # Build the term-frequency matrix
        hashingTF = HashingTF()
        tf = hashingTF.transform(words)

        # Compute the TF-IDF matrix
        idfModel = IDF().fit(tf)
        tfidf = idfModel.transform(tf)
        tf.cache()

        with open(self.NBmodel, 'r') as f:
            NBmodel = pickle.load(f)

        # Tokenize first, then analyze
        yourwords = set("/".join(jieba.cut_for_search(data)).split("/"))
        print 'Tokenization result: {}'.format(yourwords)
        yourtf = hashingTF.transform(yourwords)
        yourtfidf = idfModel.transform(yourtf)

        return NBmodel.predict(yourtfidf), data
Example #6
def get_feature_vectors(sc, input_file, feature_dimensions):
    """Get feature vector from the lines in input_file_obj using
    TF/IDF.

    Returns:
        vectors RDD

    """

    # Load documents (one per line).
    tweet_file = sc.textFile(input_file)
    input_text_rdd = tweet_file.map(lambda line: _tokenize(line))
    input_text_rdd.cache()

    # The default feature dimension is 2^20; for a corpus with million
    # tweets recommended dimensions are 50000 or 100000. Use higher
    # dimensions for larger corpus of tweets.
    hashing_tf = HashingTF(feature_dimensions)
    tf = hashing_tf.transform(input_text_rdd)
    tf.cache()
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    tfidf.cache()

    return input_text_rdd, tfidf
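A minimal usage sketch for the function above (the file name is an assumption; _tokenize is the module's own helper):

# assumes an existing SparkContext `sc` and a one-tweet-per-line input file
tokens_rdd, tfidf_rdd = get_feature_vectors(sc, "tweets.txt", 50000)
print(tfidf_rdd.first())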
Example #7
def main():
    # Initialize the SparkContext
    sc = spark_context(spark_master)

    # Read the input file
    data = sc.textFile(hdfs_path)

    # Tokenize the documents
    documents = data.map(tokenize)
    documents.cache()

    # TF
    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)

    # IDF
    idf = IDF(minDocFreq=2).fit(tf)
    
    # TFIDF
    tfidf = idf.transform(tf)

    # Connect to MongoDB
    from pymongo import MongoClient
    mongo_client = MongoClient(mongo_host)
    mongo_client.admin.authenticate(mongo_user, mongo_pass, mechanism='SCRAM-SHA-1')
    clear_mongodb(mongo_client)

    # zip
    term_tfidf = documents.zip(tfidf).map(doc_tfidf)
    articles = term_tfidf.flatMap(lambda i: i).reduceByKey(lambda x, y: x + y)
    for article in articles.collect():
        item = {}
        item['text'] = article[0].encode('utf-8')
        item['size'] = int(article[1] * 10)
        send_mongodb(mongo_client, item)
Example #8
def main(sc):

    stopset = set(stopwords.words('english'))

    tweets = sc.textFile('hdfs:/adi/sample.txt')
    words = tweets.map(lambda word: word.split(" "))
    wordArr = []
    for wArr in words.collect():
        tempArr = []
        for w in wArr:
            if not w in stopset:
                tempArr.append(w)
        wordArr.append(tempArr)
    # Open a file
    # print wArr
    # tokens = sc.textFile("hdfs:/adi/tokens1.txt")

    # Load documents (one per line).
    documents = sc.textFile("hdfs:/adi/tokens1.txt").map(
        lambda line: line.split(" "))
    numDims = 100000
    hashingTF = HashingTF(numDims)
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    tfidf.count()
    model = KMeans.train(tfidf, 5)
    model.save(sc, "tweetModel1")
    print("Final centers: " + str(model.clusterCenters))
    #    print("Total Cost: " + str(model.computeCost(data)))
    sc.stop()
Example #9
def main():
    #Reading the json file
    reviews_data = sqlContext.read.json(input)
    reviews=reviews_data.select('reviewText')
    reviews_rdd=reviews.rdd.cache()
    rdd_data=reviews_rdd.map(lambda line:str(line.reviewText))
    transformed_data=rdd_data.map(transform_data)

    #Finding Tf-IDF representation
    hashingTF = HashingTF()
    tf = hashingTF.transform(transformed_data)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf).collect()
    # Normalization
    # tfidf = idf.transform(tf)
    # normalizer1 = Normalizer()
    # normalized_vector=normalizer1.transform(tfidf).collect()

    score_rdd=reviews_data.rdd.map(lambda line:str(line.overall)).cache().collect()
    dates_rdd=reviews_data.rdd.map(lambda line:str(line.reviewTime)).map(lambda line:line.split(", ")).map(lambda (a,b):b).cache().collect()
    combinedList=zip(tfidf,score_rdd,dates_rdd)
    combinedRDD=sc.parallelize(combinedList).cache()
    TrainRDD=combinedRDD.filter(lambda (x,y,z):z!='2014').map(lambda (x,y,z):(x,y))
    TestRDD=combinedRDD.filter(lambda (x,y,z):z=='2014').map(lambda (x,y,z):(x,y))

    #Saving test and training data
    TrainRDD.saveAsPickleFile(output+'/Train_data_unnormalized.pickle')
    TestRDD.saveAsPickleFile(output+'/Test_data_unnormalized.pickle')
Example #10
 def tfidf(self):
     self._create_rdd()
     hashingTF = HashingTF()
     tf = hashingTF.transform(self.token_rdd)
     idf = IDF(minDocFreq=2).fit(tf)
     tfidf = idf.transform(tf)
     return tfidf
Example #11
def process_data(data):

    print("Processing data ...")

    if (not data.isEmpty()):
        nbModel = bc_model.value
        hashingTF = HashingTF(100000)
        tf = hashingTF.transform(
            data.map(lambda x: x[0].encode('utf-8', 'ignore')))
        tf.cache()
        idf = IDF(minDocFreq=2).fit(tf)
        tfidf = idf.transform(tf)
        tfidf.cache()
        prediction = nbModel.predict(tfidf)

        temp = []
        i = 0
        for p, q, r in data.collect():
            temp.append([])
            temp[i].append(p.encode('utf-8', 'ignore'))
            temp[i].append(q)
            temp[i].append(r)
            i += 1
        i = 0
        for p in prediction.collect():
            temp[i].append(p)
            i += 1

        print(temp)
        for i in temp:

            insert_tweet(str(i[0]), str(i[1]), "0", int(i[3]), int(i[2]))
    else:
        print("Empty RDD !!!")
        pass
Example #12
def get_feature_vectors(sc, input_file, feature_dimensions):
    """Get feature vector from the lines in input_file_obj using
    TF/IDF.

    Returns:
        vectors RDD

    """

    # Load documents (one per line).
    tweet_file = sc.textFile(input_file)
    input_text_rdd = tweet_file.map(lambda line: _tokenize(line))
    input_text_rdd.cache()

    # The default feature dimension is 2^20; for a corpus with million
    # tweets recommended dimensions are 50000 or 100000. Use higher
    # dimensions for larger corpus of tweets.
    hashing_tf = HashingTF(feature_dimensions)
    tf = hashing_tf.transform(input_text_rdd)
    tf.cache()
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    tfidf.cache()

    return input_text_rdd, tfidf
Example #13
def TFIDF(source, destination):

    if destination[-1] != '/':
        destination = destination + '/'
    ## typically define the source message
    rdd = sc.wholeTextFiles(source).map(lambda (name, text): text.split())
    tf = HashingTF()
    tfVectors = tf.transform(rdd).cache()
    a = tfVectors.collect()
    # Storing the TF values above in individual files, one per link
    ind = 0
    for vector in a:
        dest_path = destination + "TF_%d" % ind + ".txt"
        ind = ind + 1
        file = open(dest_path, 'w')
        file.write(str(vector))
        file.close()
    # Calculating IDF Values for each case.
    idf = IDF()
    idfModel = idf.fit(tfVectors)
    tfIdfVectors = idfModel.transform(tfVectors)
    # Writing TF-IDF values to a single file.
    file = open(destination + "TF-IDF.txt", 'w')
    file.write(str(tfIdfVectors.collect()))
    try:
        for i in range(0, 100):
            print ""  #Testing Printing"
    except KeyboardInterrupt:
        pass
Example #14
 def _compute_tfid(texts: RDD) -> RDD:
     tf = HashingTF().transform(texts.map(lambda t: t.words))
     tf.cache()
     idf = IDF().fit(tf)
     tfidfs = idf.transform(tf)
     text_tfs = texts.zip(tfidfs)
     return text_tfs.map(lambda t: t[0].set_tfidf(t[1]))
Example #15
def TFIDF(source, destination):
    if destination[-1] != '/':
        destination=destination+'/'
## typically define the source message
    rdd=sc.wholeTextFiles(source).map(lambda (name,text): text.split())
    tf=HashingTF()
    tfVectors=tf.transform(rdd).cache()
    a = tfVectors.collect()
    # Storing the TF values above in individual files, one per link
    ind = 0
    for vector in a:
        dest_path = destination + "TF_%d"%ind + ".txt"
        ind = ind + 1
        file = open(dest_path,'w')
        file.write(str(vector))
        file.close()
    # Calculating IDF Values for each case.
    idf=IDF()
    idfModel=idf.fit(tfVectors)
    tfIdfVectors=idfModel.transform(tfVectors)
    # Writing TF-IDF values to a single file.
    file = open(destination+"TF-IDF.txt", 'w')
    file.write(str(tfIdfVectors.collect()))
    try:
        for i in range(0,100):
            print ""#Testing Printing"
    except KeyboardInterrupt:
            pass
Example #16
def training_set(sc,
                 numFeatures,
                 pos_file = "data/training_positif_clean.csv",
                 neg_file = "data/training_negatif_clean.csv"
                 ):
    """
        Input : number of retained features in the tweet-term structure
        Output : 
            normalized tweet-term format training set
            IDF model (that will be used in the test phase)
    """
    
 
    text_negative = sc.textFile(neg_file)
    text_positive = sc.textFile(pos_file)
    
    train_text = text_negative.union(text_positive)
    train_labels = text_negative.map(lambda x: 0.0).union(text_positive.map(lambda x: 1.0))
    
    tf = HashingTF(numFeatures=numFeatures).transform(train_text.map(lambda x : x))
    idf = IDF().fit(tf)
    train_tfidf = idf.transform(tf)
    
    training = train_labels.zip(train_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    return (training, idf)
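A hedged sketch of the test phase the docstring refers to: reuse the returned IDF model so the test set gets the same weighting as the training set (assumes an existing SparkContext sc; the test file name is an assumption):

training, idf = training_set(sc, numFeatures=10000)
test_text = sc.textFile("data/test_clean.csv")
test_tf = HashingTF(numFeatures=10000).transform(test_text.map(lambda x: x))
test_tfidf = idf.transform(test_tf)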
Example #17
 def tfidf(self):
     self._create_rdd()
     hashingTF = HashingTF()
     tf = hashingTF.transform(self.token_rdd)
     idf = IDF(minDocFreq=2).fit(tf)
     tfidf = idf.transform(tf)
     return tfidf
Example #18
    def hashing(self, size):
        self.hashing_TF = HashingTF(size)  # e.g. 100K hash buckets just to save some memory
        tf = self.hashing_TF.transform(self.documents)

        tf.cache()
        idf = IDF(minDocFreq=2).fit(tf)
        self.tfidf = idf.transform(tf)
Example #19
def get_tfidf_features(txt_rdd):
    hashingTF = HashingTF()
    tf = hashingTF.transform(txt_rdd)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)

    return tfidf
Example #20
def tfidf(rdd_doc):
    hasingTF = HashingTF()
    trainTf = hasingTF.transform(rdd_doc)
    trainTf.cache()
    idf = IDF().fit(trainTf)
    trainTfidf = idf.transform(trainTf)
    trainTfidf.cache()
    return trainTfidf, lambda x: hasingTF.indexOf(x)
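A minimal usage sketch for the helper above (assumes docs is an RDD of token lists):

trainTfidf, index_of = tfidf(docs)
print(index_of("spark"))       # hash bucket assigned to the term
print(trainTfidf.first())      # tf-idf vector of the first document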
Example #21
def get_tfidf_features(txt):
    hashingTF = HashingTF()
    tf = hashingTF.transform(txt)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)

    return tfidf
Example #22
 def tfidf(self):
     tf = HashingTF().transform(self._sents)
     self._tf = tf
     tf.cache()
     idf = IDF().fit(tf)
     self.idf = idf
     tfidf = idf.transform(tf)
     self._tfidf = dict(enumerate(tfidf.collect()))
Example #23
    def create_bayes(self):
        """ 创建贝叶斯训练模型 """

        if self._check_traning_exists():
            return

        # Build an RDD from the positive text
        positive_file = os.path.join(settings.DATA_DIR, '分类词库/positive.txt')
        positive_data = self.sc.textFile(positive_file)
        # De-duplicate the data
        positive_data = positive_data.distinct()
        positive_data = positive_data.map(
            lambda line: line.split('###')).filter(lambda line: len(line) == 2)

        # Build an RDD from the negative text
        negative_file = os.path.join(settings.DATA_DIR, '分类词库/negative.txt')
        negative_data = self.sc.textFile(negative_file)
        negative_data = negative_data.distinct()
        negative_data = negative_data.map(
            lambda line: line.split('###')).filter(lambda line: len(line) == 2)

        # Merge into a single training set
        all_data = negative_data.union(positive_data)
        all_data.repartition(1)
        # The ratings were preprocessed in advance to be only -1 and 1
        rate = all_data.map(lambda s: s[0])
        document = all_data.map(lambda s: s[1])

        words = document.map(lambda w:"/".\
                join(jieba.cut_for_search(w))).\
                map(lambda line: line.split("/"))

        # Build the term-frequency matrix
        hashingTF = HashingTF()
        tf = hashingTF.transform(words)

        # Compute the TF-IDF matrix
        idfModel = IDF().fit(tf)
        tfidf = idfModel.transform(tf)
        tf.cache()

        # Generate the training and test sets
        zipped = rate.zip(tfidf)
        data = zipped.map(lambda line: LabeledPoint(line[0], line[1]))
        training, test = data.randomSplit([0.6, 0.4], seed=0)

        # Train the Naive Bayes classification model
        NBmodel = NaiveBayes.train(training, 1.0)
        predictionAndLabel = test.map(lambda p:
                                      (NBmodel.predict(p.features), p.label))
        accuracy = 1.0 * predictionAndLabel.filter(lambda x: 1.0 \
                if x[0] == x[1] else 0.0).count() / test.count()

        # Persist the words RDD
        words.repartition(1).saveAsTextFile(self.training_words_dir)
        # Store the Naive Bayes model with pickle
        with open(self.NBmodel, 'w') as f:
            pickle.dump(NBmodel, f)
Example #24
    def process(reviews):
        if (reviews.isEmpty()):
            pass
        else:
            start = time.time()

            #get reviews with overall rating > 3 and overall rating < 3
            pos_reviews = reviews.filter(lambda x: x[0] > 3.0)
            neg_reviews = reviews.filter(lambda x: x[0] < 3.0)

            #set label for each class. 0.0 is positive - 1.0 is negative
            review_labels = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)

            Words = Row('label', 'words')
            words = reviews.map(lambda r: Words(*r))
            words_df = spark.createDataFrame(words)

            #reviews tokenization
            token = RegexTokenizer(minTokenLength=2,
                                   pattern="[^A-Za-z]+",
                                   inputCol="words",
                                   outputCol="token",
                                   toLowercase=True)
            token_filtered = token.transform(words_df)

            #stopwords elimination
            remover = StopWordsRemover(inputCol="token",
                                       outputCol="stopwords",
                                       caseSensitive=False)
            stopwords_filtered = remover.transform(token_filtered)

            prep_filtered = (
                stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])

            #tf-idf calculation
            tf = HashingTF(numFeatures=numFeatures).transform(
                prep_filtered.map(porter_stem, preservesPartitioning=True))
            idf = IDF().fit(tf)
            train_tfidf = idf.transform(tf)

            #set training dataset with label
            training = review_labels.zip(train_tfidf).map(
                lambda x: LabeledPoint(x[0], x[1]))

            #train the model classifier
            model = SVMWithSGD.train(training, iterations=100)
            model_name = "svm" + str(counter_model)
            #save model classifier to HDFS
            output_dir = "hdfs://VM10-1-0-14:9000/classifier/" + model_name
            model.save(sc, output_dir)

            counter_model.add(1)

            end = time.time()
            print("Model Name : ", model_name, ", Total Reviews : ",
                  reviews.count(), "Processing Time : ", (end - start))
Example #25
 def tfidf(self, tokenizer):
     """
     Get TFIDF matrix rdd with spark tfidf functions
     """
     self._create_rdd(tokenizer)
     hashingTF = HashingTF()
     tf = hashingTF.transform(self.token_rdd)
     idf = IDF(minDocFreq=2).fit(tf)
     tfidf = idf.transform(tf)
     return self.rdd, idf, tfidf
Example #26
def tf_idf(sc,title_token):
    hashingTF = HashingTF(100)
    title_token = sc.parallelize(title_token)
    tf = hashingTF.transform(title_token)
    print tf, ' tf'
   
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
   
    return tfidf
Example #27
 def test_idf_model(self):
     data = [
         Vectors.dense([1, 2, 6, 0, 2, 3, 1, 1, 0, 0, 3]),
         Vectors.dense([1, 3, 0, 1, 3, 0, 0, 2, 0, 0, 1]),
         Vectors.dense([1, 4, 1, 0, 0, 4, 9, 0, 1, 2, 0]),
         Vectors.dense([2, 1, 0, 3, 0, 0, 5, 0, 2, 3, 9])
     ]
     model = IDF().fit(self.sc.parallelize(data, 2))
     idf = model.idf()
     self.assertEqual(len(idf), 11)
Example #28
 def test_idf_model(self):
     data = [
         Vectors.dense([1, 2, 6, 0, 2, 3, 1, 1, 0, 0, 3]),
         Vectors.dense([1, 3, 0, 1, 3, 0, 0, 2, 0, 0, 1]),
         Vectors.dense([1, 4, 1, 0, 0, 4, 9, 0, 1, 2, 0]),
         Vectors.dense([2, 1, 0, 3, 0, 0, 5, 0, 2, 3, 9])
     ]
     model = IDF().fit(self.sc.parallelize(data, 2))
     idf = model.idf()
     self.assertEqual(len(idf), 11)
Example #29
	def process(reviews):
		if(reviews.isEmpty()):
			pass
		else:
			model_name = "dt"
			updated_model = "dt0"
			model_path, data_path, metadata_path = '','',''
			
			#performing looping process to check the availability of new model classifier
			for i in range(25,-1,-1):
				model_path = "hdfs://VM10-1-0-14:9000/classifier/"+model_name+str(i)
				updated_model = model_name+str(i)
				data_path = model_path+"/data/part-r*"
				metadata_path = model_path+"/metadata/part-00000"
				if(patherror(data_path) == False and patherror(metadata_path) == False):
					break
			
			#load model classifier
			model = DecisionTreeModel.load(sc, model_path)

			start = time.time()
			reviews_label = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)
			
			Words = Row('label', 'words')
			words = reviews.map(lambda r: Words(*r))
			words_df = spark.createDataFrame(words)
			
			#review tokenization
			token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+", inputCol="words", outputCol="token", toLowercase=True)
			token_filtered = token.transform(words_df)
			
			#stopwords elimination
			remover = StopWordsRemover(inputCol="token", outputCol="stopwords", caseSensitive=False)
			stopwords_filtered = remover.transform(token_filtered)

			prep_filtered = (stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])
			
			#tf-idf calculation
			tf = HashingTF(numFeatures=numFeatures).transform(prep_filtered.map(porter_stem, preservesPartitioning=True))
			idf = IDF().fit(tf)
			tfidf = idf.transform(tf)
			
			prediction = model.predict(tfidf)
			
			labeled_prediction = reviews_label.zip(prediction).map(lambda x: (float(x[1]), x[0]))
			
			metrics = MulticlassMetrics(labeled_prediction)	
			
			output = reviews.zip(prediction)
				
			filename = "hdfs://VM10-1-0-14:9000/output/" + re.sub('[^0-9]','',str(datetime.now())) + ".out"
			output.saveAsTextFile(filename)
			
			end = time.time()	
			print(updated_model,';',reviews.count(),';',metrics.accuracy,';',metrics.precision(0.0),';',metrics.precision(1.0),';',metrics.recall(0.0),';',metrics.recall(1.0),';',metrics.fMeasure(0.0),';',metrics.fMeasure(1.0),';',(end-start))
Example #30
def generate_tf_idf(twProfilesRdd, numFe):
    """
    Generate TF IDF tuple (gender,sparse vector) from rdd containing following tuples:
    (gender,(clean words tuple))
    """
    gtlp = generate_gender_tf(twProfilesRdd, numFe)
    idf = IDF()
    tfVectorsRDD = gtlp.map(lambda tp: tp[1])
    idfModel = idf.fit(tfVectorsRDD)
    idfRdd = idfModel.transform(tfVectorsRDD)
    return (idfRdd.zip(gtlp).map(lambda tp: (tp[1][0], tp[0])), idfModel)
Example #31
    def extractKeywords_Train(self):
        documents = self.sc.textFile(self.trainingfile).map(lambda line: line.split(" ")[1:])

        hashingTF = HashingTF()
        tf = hashingTF.transform(documents)
        tf.cache()

        idfIgnore = IDF(minDocFreq=2).fit(tf)
        tfidfIgnore = idfIgnore.transform(tf)

        tfidfIgnore.saveAsTextFile("AAA")
Example #32
def tf_idf_cal(words_rdd):
	hashingTF = HashingTF()
	tf = hashingTF.transform(words_rdd)

	idf = IDF().fit(tf)
	
	tfidf = idf.transform(tf).cache()

	tfidf_str = tfidf.map(lambda line: str(line)).cache()

	return tfidf_str
Example #33
def generate_tf_idf(twProfilesRdd,numFe):
    """
    Generate TF IDF tuple (gender,sparse vector) from rdd containing following tuples:
    (gender,(clean words tuple))
    """
    gtlp=generate_gender_tf(twProfilesRdd, numFe)
    idf=IDF()
    tfVectorsRDD=gtlp.map(lambda tp: tp[1])
    idfModel=idf.fit(tfVectorsRDD)
    idfRdd=idfModel.transform(tfVectorsRDD)
    return (idfRdd.zip(gtlp).map(lambda tp:(tp[1][0],tp[0])),idfModel)
Example #34
def vectorize(training):
    hashingTF = HashingTF()
    tf_training = training.map(lambda tup: hashingTF.transform(tup[1]))
    idf_training = IDF().fit(tf_training)
    tfidf_training = idf_training.transform(tf_training)
    tfidf_idx = tfidf_training.zipWithIndex()
    training_idx = training.zipWithIndex()
    idx_training = training_idx.map(lambda line: (line[1], line[0]))
    idx_tfidf = tfidf_idx.map(lambda l: (l[1], l[0]))
    joined_tfidf_training = idx_training.join(idx_tfidf)
    training_labeled = joined_tfidf_training.map(lambda tup: tup[1])
    labeled_training_data = training_labeled.map(lambda k: LabeledPoint(k[0][0], k[1]))
    return labeled_training_data
Example #35
def predictSentiment(tweetText):
    nbModel = bc_model.value

    hashingTF = HashingTF()
    tf = hashingTF.transform(tweetText)
    tf.cache()
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    tfidf.cache()
    prediction = nbModel.predict(tfidf)
    print "Predictions for this window :"
    for i in range(0, prediction.count()):
        print prediction.collect()[i], tweetText.collect()[i]
Example #36
def calcTfidf(doc, source):
    """
    This method computes TF-IDF scores for the given document.
    While applying HashingTF only needs a single pass to the data, applying IDF needs two passes: first to compute the IDF vector and second to scale the term frequencies by IDF.
    """
    hashingTF = HashingTF(200000)
    tf = hashingTF.transform(doc)
    print "TF calculated for "+source.split('/')[-1]
    tf.cache()
    idf = IDF().fit(tf)  ##idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    print "TF-IDF calculated for "+source.split('/')[-1]  
    return hashingTF, tfidf
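A minimal usage sketch for calcTfidf (assumes an existing SparkContext sc; the path and the whitespace tokenization are assumptions):

source = "hdfs:/data/articles.txt"
doc = sc.textFile(source).map(lambda line: line.split(" "))
hashingTF, tfidf = calcTfidf(doc, source)
print(tfidf.first())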
Example #37
def tf_idf(data):

	# This hashes each of the word / feature using MurmurHash 3 and generates
	# an index for each, then calculates TF for each index
	# TODO : Need to check best numFeatures
	tf = HashingTF(numFeatures=10000).transform(data.map(lambda x : x.split()))

	# TF-IDF is calculated to understand how important a word is to a particular
	# document

	idf = IDF().fit(tf)
	tfidf = idf.transform(tf)

	return tfidf
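A minimal usage sketch for tf_idf (assumes an existing SparkContext sc; the path is an assumption; data is an RDD of raw text lines):

data = sc.textFile("hdfs:/data/docs.txt")
tfidf = tf_idf(data)
print(tfidf.first())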
Example #38
def training_set(pos_file, neg_file):
    text_negative = sc.textFile(neg_file)
    text_positive = sc.textFile(pos_file)

    train_text = text_negative.union(text_positive)
    train_labels = text_negative.map(lambda x: 0.0).union(
        text_positive.map(lambda x: 1.0))

    tf = HashingTF(numFeatures=10000).transform(train_text.map(lambda x: x))
    idf = IDF().fit(tf)
    train_tfidf = idf.transform(tf)

    training = train_labels.zip(train_tfidf).map(
        lambda x: LabeledPoint(x[0], x[1]))
    return (training, idf)
Example #39
def create_tfidf(sc):
    # start = time.time()
    docs = sc.textFile(FILE0, 4).map(split_docs).cache()
    tags = docs.map(lambda doc: doc[1].split()).cache()
    tag = tags.map(lambda tags: tags[0])
    words = docs.map(lambda doc: doc[0].split())
    words = words.map(preProcess).cache()

    # id_tag = tag.zipWithIndex().map(swapOder)

    hashingTF = HashingTF()
    tf = hashingTF.transform(words)
    tf.cache()
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf).cache()
    #tfidf = tfidf.collect()
    return tfidf, tags
Example #40
def normTFIDF(fn_tokens_RDD, vecDim, caching=True):
    keysRDD = fn_tokens_RDD.keys()
    tokensRDD = fn_tokens_RDD.values()
    tfVecRDD = tokensRDD.map(lambda tokens: hashing_vectorize(tokens, vecDim))  # passing the vecDim value. TIP: you need a lambda.
    if caching:
        tfVecRDD.persist(StorageLevel.MEMORY_ONLY)  # since we will read more than once, caching in memory makes things quicker.
    idf = IDF()  # create IDF object
    idfModel = idf.fit(tfVecRDD)  # calculate IDF values
    tfIdfRDD = idfModel.transform(tfVecRDD)  # 2nd pass needed (see lecture slides), transforms the RDD
    norm = Normalizer()  # create a Normalizer object like in the example linked above
    normTfIdfRDD = norm.transform(tfIdfRDD)  # and apply it to the tfIdfRDD
    zippedRDD = keysRDD.zip(normTfIdfRDD)  # zip the keys and values together
    return zippedRDD
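hashing_vectorize is not shown above; a minimal stand-in, under the assumption that it is a plain HashingTF lookup (not necessarily the author's implementation):

from pyspark.mllib.feature import HashingTF

def hashing_vectorize(tokens, vecDim):
    # hash one token list into a term-frequency SparseVector of size vecDim
    return HashingTF(vecDim).transform(tokens)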
Example #41
    def parseTextRDDToIndex(self, data, label=True):

        if label:
            labels = data.map(lambda line: float(line.split(" ", 1)[0]))
            documents = data.map(lambda line: line.split(" ", 1)[1].split(" "))
        else:
            documents = data.map(lambda line: line.split(" "))

        tf = HashingTF().transform(documents)
        tf.cache()

        idfIgnore = IDF(minDocFreq=2).fit(tf)
        index = idfIgnore.transform(tf)

        if label:
            return labels.zip(index).map(
                lambda line: LabeledPoint(line[0], line[1]))
        else:
            return index
Example #42
def mySpark(minFreq, keyWord):

    # text cleaning function
    def removePunctuation(text):
        res = text.lower().strip()
        res = re.sub("[^0-9a-zA-Z ]", "", res)
        return res.split(" ")

    # Function for printing each element in RDD
    def println(x):
        for i in x:
            print i

    # Boilerplate Spark stuff:
    conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
    sc = SparkContext(conf=conf)

    # Load documents content (one per line) + cleaning.
    rawData = sc.textFile("list_berita-30.tsv")
    fields = rawData.map(lambda x: x.split("\t"))
    documents = fields.map(lambda x: removePunctuation(x[3]))

    # Get documents content without word mapping
    documentNames = fields.map(lambda x: x[3])

    # TF processing
    hashingTF = HashingTF(100000)  #100K hash buckets just to save some memory
    tf = hashingTF.transform(documents)

    # IDF & TF-IDF processing
    tf.cache()
    idf = IDF(minDocFreq=int(minFreq)).fit(tf)
    tfidf = idf.transform(tf)

    # Get keyword relevance with content and zip it
    keywordTF = hashingTF.transform(removePunctuation(keyWord))
    keywordHashValue = int(keywordTF.indices[0])
    keywordRelevance = tfidf.map(lambda x: x[keywordHashValue])
    zippedResults = keywordRelevance.zip(documentNames)

    # print result
    print "Best document for keywords is:"
    print zippedResults.max()
Example #43
def mySpark(minFreq, keyWord):

    # text cleaning function
    def removePunctuation(text):
        res=text.lower().strip()
        res=re.sub("[^0-9a-zA-Z ]", "", res)
        return res.split(" ")

    # Function for printing each element in RDD
    def println(x):
        for i in x:
            print i

    # Boilerplate Spark stuff:
    conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
    sc = SparkContext(conf = conf)

    # Load documents content (one per line) + cleaning.
    rawData = sc.textFile("list_berita-30.tsv")
    fields = rawData.map(lambda x: x.split("\t"))
    documents = fields.map(lambda x: removePunctuation(x[3]))

    # Get documents content without word mapping
    documentNames = fields.map(lambda x: x[3])

    # TF processing
    hashingTF = HashingTF(100000)  #100K hash buckets just to save some memory
    tf = hashingTF.transform(documents)

    # IDF & TF-IDF processing
    tf.cache()
    idf = IDF(minDocFreq=int(minFreq)).fit(tf)
    tfidf = idf.transform(tf)

    # Get keyword relevance with content and zip it
    keywordTF = hashingTF.transform(removePunctuation(keyWord))
    keywordHashValue = int(keywordTF.indices[0])
    keywordRelevance = tfidf.map(lambda x: x[keywordHashValue])
    zippedResults = keywordRelevance.zip(documentNames)

    # print result
    print "Best document for keywords is:"
    print zippedResults.max()
Example #44
def run_tf_idf_spark_mllib(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    words = wordsData.select("words").rdd.map(lambda x: x.words)

    hashingTF = MllibHashingTF(numFeatures)
    tf = hashingTF.transform(words)
    tf.cache()

    idf = MllibIDF().fit(tf)
    tfidf = idf.transform(tf)

    # @TODO make this nicer
    tmp = sqlContext.createDataFrame(wordsData.rdd.zip(tfidf), ["data", "features"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, wordsData.columns))
    with_features = sqlContext.sql("SELECT %s, features FROM tmp" % old_columns)
    tmp = sqlContext.createDataFrame(with_features.rdd.zip(tf), ["data", "rawFeatures"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, with_features.columns))
    return sqlContext.sql("SELECT %s, rawFeatures FROM tmp" % old_columns)
Example #45
    def extract_features(self, feat='tfidf', **kwargs):
        """
        Converts each subtitle into its TF/TFIDF representation.
        Normalizes if necessary.

        Parameters
        --------
        feat: 'tf' or 'tfidf'.
        kwargs: num_features, minDocFreq, or other arguments to be passed
        to the MLLib objects.

        Returns
        --------
        RDD of features with key.
        """

        # transform BOW into TF vectors
        num_features = kwargs.get('num_features', 10000)
        htf = HashingTF(num_features)
        feat_rdd = self.RDD.mapValues(htf.transform).cache()

        # transform TF vectors into IDF vectors
        if feat == 'tfidf':
            keys, tf_vecs = feat_rdd.keys(), feat_rdd.values()
            minDocFreq = kwargs.get('minDocFreq', 2)
            idf = IDF(minDocFreq=minDocFreq)
            idf_model = idf.fit(tf_vecs)
            idf_rdd = idf_model.transform(tf_vecs.map(lambda vec: vec.toArray()))
            feat_rdd = keys.zip(idf_rdd)

        if self.model_type == 'log_reg':
            normalizer = StandardScaler(withMean=True, withStd=True)
            keys, vecs = feat_rdd.keys(), feat_rdd.values()
            norm_model = normalizer.fit(vecs)
            norm_rdd = norm_model.transform(vecs.map(lambda vec: vec.toArray()))
            feat_rdd = keys.zip(norm_rdd)

        return feat_rdd
Example #46
def use_naive_nayes():
    """
    Running the Naive Bayes from Spark's Mlib library
    """
    from pyspark.mllib.classification import NaiveBayes
    from pyspark.mllib.feature import HashingTF, IDF
    from pyspark.mllib.linalg import SparseVector, Vectors
    from pyspark.mllib.regression import LabeledPoint
    #loading the files
    path = "/Users/abhisheksingh29895/Desktop/courses/CURRENT/Advance_Machine_Learning/HW2/aclImdb/"
    train_pos = sc.textFile(path + "train/pos/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    train_neg = sc.textFile(path + "train/neg/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    test_pos = sc.textFile(path + "test/pos/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    test_neg = sc.textFile(path + "test/neg/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    #TF-IDF
    tr_pos = HashingTF().transform(train_pos)  ;  tr_pos_idf = IDF().fit(tr_pos)
    tr_neg = HashingTF().transform(train_neg)  ;  tr_neg_idf = IDF().fit(tr_neg)
    te_pos = HashingTF().transform(test_pos)  ;  te_pos_idf = IDF().fit(te_pos)
    te_neg = HashingTF().transform(test_neg)  ;  te_neg_idf = IDF().fit(te_neg)
    #IDF step
    tr_pos_tfidf = tr_pos_idf.transform(tr_pos)  ;  tr_neg_tfidf = tr_neg_idf.transform(tr_neg)
    te_pos_tfidf = te_pos_idf.transform(te_pos)  ;  te_neg_tfidf = te_neg_idf.transform(te_neg)
    #Creating labels
    pos_label = [1] * 12500  ;  pos_label = sc.parallelize(pos_label)
    neg_label = [0] * 12500  ;  neg_label = sc.parallelize(neg_label)  # 0 = negative class
    # Combine using zip
    train_pos_file = pos_label.zip(tr_pos_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    train_neg_file = neg_label.zip(tr_neg_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    test_pos_file = pos_label.zip(te_pos_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    test_neg_file = neg_label.zip(te_neg_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    #Joining 2 RDDS to form the final training set
    train_file = train_pos_file.union(train_neg_file)
    test_file = test_pos_file.union(test_neg_file)
    # Fitting a Naive bayes model
    model = NaiveBayes.train(train_file)
    # Make prediction and test accuracy
    predictionAndLabel = test_file.map(lambda p: (model.predict(p[1]), p[0]))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test_file.count()
    print ""
    print "Test accuracy is {}".format(round(accuracy,4))
Example #47
sc = SparkContext()

rdd = sc.wholeTextFiles("/usr/local/Cellar/BigDataAdvanced/Assignment1/TwitterStuff/TweetData").map(lambda (name,text):text.split())
tf = HashingTF()
tfVectors = tf.transform(rdd).cache()
a = tfVectors.collect()
count = 0 
for vec in a:
    print vec
    count = count + 1
    with open("TF_Tweet"+str(count)+".txt","w") as f:
        f.write(str(vec))

idf = IDF()
idfModel = idf.fit(tfVectors)
tfIdfVectors = idfModel.transform(tfVectors)
file = open("TF-IDF_tweet.txt", 'w')
file.write(str(tfIdfVectors.collect()))

#count = 0
#output=tfIdfVectors.collect()
#for vec in output:
#	print vec
#	count = count + 1
#	with open("TF_Wiki"+str(count)+".txt","w") as f:
#		f.write(str(vec))
#	f.close()
	
	
Example #48
    # remove top 3 lines from document
    doc_wo_counters = documents.mapPartitionsWithIndex(lambda i, iter: islice(iter, 3, None) if i == 0 else iter)

    final_doc = doc_wo_counters.map(lambda x: (int(x[0]), doc_to_words(int(x[1]), int(x[2])).encode("utf8"))).reduceByKey(lambda x, y: x + " " + y)

    vect_rep = final_doc.map(lambda x: x[1])

    raw_document = sc.textFile("test.txt")
    vect_rep = raw_document.map(lambda line: line.encode("utf8").split(" "))

    
    # TfIDF
    hashingTF = HashingTF()
    tf = hashingTF.transform(vect_rep)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf_vectors = idf.transform(tf)
    
    #Build the model (cluster the data)
    clusters = KMeans.train(tfidf_vectors, 10, maxIterations=100)
    
    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point.toArray() - center)]))

    WSSSE = tfidf_vectors.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save and load model
    clusters.save(sc, "myModelPath")
Example #49
def distributed_ops( corpus, sanit=False, recall=False, corpred=False, \
                     streams=False, segred=False, tfidf=False, lda=False, \
                     word2vec=False, fin=None, segclust=None):

    # Return item for end results
    return_list = []

    ##########################################

    # Default actions:
    if (segred):
        zipped_corpus = zip(segclust,corpus)
        #print zipped_corpus
    corpus = sc.parallelize(corpus).cache()

    if (sanit or recall):
        corpus = corpus.map(lambda doc: preprocess(doc))
        # Here we "recover all" text, after having removed multi-ws & ws-pad punctuation
        # & replace \n by NL etc... (see function "preprocess" above)
        # We use the same regex sub/filtration rules as in the implementation found
        # @ https://github.com/alexalemi/segmentation (from which we got files in
        # directory: representation.py, tools.py and splitters.py, and which
        # segmentSETxRes.py is based on)
        if (recall):
            return_list.append(recover_encoding(corpus.collect()))

    # Here we return only potentially "meaningful words" - see function "return_words" above
    # Keeps alpha-numeric (removes numeric and non-alphabetical/alphanumeric)
    corpus_distrib = corpus.map(lambda doc: return_words(doc))
    print 'Original number of docs in corpus {filtering *docs* for alpha(+alphanumeric)-only words}: %i'%corpus_distrib.count()
    
    # merge corpus docs into one continuous split text
    corpus_merge = []
    corpus_collect = corpus_distrib.collect() # rdd2list
    for list_of_words in corpus_collect:
        corpus_merge.extend(list_of_words) # list-of-wordslist2{single-wordslist}
    
    # use numpy functions to sort dict words based on term-frequency
    corpus_merge_array = np.array(corpus_merge)
    corpus_merge_sorted = np.sort(corpus_merge_array)
    corpus_merge_unique, counts = np.unique(corpus_merge_sorted,return_counts=True)
    sort_ixs = np.argsort(counts)[::-1]
    counts = counts[sort_ixs]
    corpus_merge_unique = corpus_merge_unique[sort_ixs]
    return_list.append(corpus_merge_unique)
    return_list.append(counts)
    print
    for i,w in enumerate(corpus_merge_unique):
        print ('Counted word "%s" _%i_ many times.'%(w,counts[i]))
    print

    #########################################################################################
    # Next we split the text based on "verbosity/density/sparsity" as would
    # befit an articulate document (i.e. articles/papers/journal entries)
    # or more conversational/blog-entry-like/Q&A style/headings-only-
    # -retrieved website results.
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point-center)]))

    # The following will further sanitize text.
    if (corpred):
        # Use pretrained term frequencies:
        # Experimentally, the following clustering has helped us get rid of
        # irrelevant search engine text results.
        corpus2vec = corpus.map(lambda doc: genre_score(doc,type2=False))
        corpus2vec = corpus2vec.map(lambda doc: process_doc2vec_word_counts(doc)).cache()
        # print 'Corpus vectorized'
        # collected = corpus2vec.collect()
        tempor = corpus.collect()
        print
        print
        for i,vec in enumerate(corpus2vec.collect()):
            print 'Got vecs:'
            print vec
            print 'Of text:'
            print tempor[i].split()
            print
        print

        # choose 5 clusters
        clusters = KMeans.train(corpus2vec, 5, maxIterations=90, runs=10, initializationMode="k-means||")
        WSSE = corpus2vec.map(lambda point: error(point)).reduce(lambda x,y: x+y) # cumsum
        print
        print 'Within Set Sum of Squared Error = ' + str(WSSE)
        print 'The cluster centers:'
        print clusters.centers
        print
        print
        return_list.append(corpus2vec.map(lambda pt: clusters.predict(pt)).collect())

    # The following will cluster for article length + content
    if (streams):
        corpus2vec = corpus.map(lambda doc: genre_score(doc,type2=True))
        temple = corpus.collect()
        print
        print
        for i,vec in enumerate(corpus2vec.collect()):
            print 'Got vecs:'
            print vec
            print 'Of text:'
            print temple[i].split()
            print
        print
        sumall = corpus2vec.reduce(lambda vecx,vecy: np.array([vecx[0]+vecy[0]]))
        corpus2vec = corpus2vec.map(lambda doc: process_doc2vec_word_counts(doc,normalizer=sumall)).cache()
        #
        clusters = KMeans.train(corpus2vec, 5, maxIterations=90, runs=10, initializationMode="k-means||")
        WSSE = corpus2vec.map(lambda point: error(point)).reduce(lambda x,y: x+y) # cumsum
        print
        print 'Within Set Sum of Squared Error = ' + str(WSSE)
        print 'The cluster centers:'
        print clusters.centers
        print
        print
        return_list.append(corpus2vec.map(lambda pt: clusters.predict(pt)).collect())

    #########################################################################################

    # Here we want to remove documents from the corpus which do not contain
    # 'english' dictionary words at all, or words that can be word2vec transformed
    # and "synonimized".
    if (segred):
        corpus_english_prose = sc.parallelize(zipped_corpus).filter(lambda doc: check(doc))
        zipped_corpus = zip(*corpus_english_prose.collect())
        red_clusts = list(zipped_corpus[0])
        red_text = recover_encoding(list(zipped_corpus[1]))
        return_list.append(red_clusts)
        return_list.append(red_text)
        print 'Number of docs in corpus {filtering *corpus* for alpha(+alphanumeric)-only words}: %i'%corpus_english_prose.count()

        f1 = open(''.join([filename,'-document_clusters.txt']),'w')
        f1.write('\n'.join(map(str,red_clusts)))
        f1.close()
        f2 = open(''.join([filename,'-documents_sanitized.txt']),'w')
        f2.write('\n'.join(red_text))
        f2.close()
        f3 = open(''.join([filename,'-documents_dict.txt']),'w')
        f3.write('\n'.join(corpus_merge_unique))
        f3.close()

    #########################################################################################

    if (tfidf):
        # generate document term frequences
        htf = HashingTF()
        tf = htf.transform(corpus_distrib)
        # generate idf = log{ frac{#docs}{#docs w. term} }
        idf = IDF().fit(tf)
        # scale tf * idf
        tfidf = idf.transform(tf)
        # collect tfidf for future use
        doc_tfidf = tfidf.collect()
        # generate unique word : HashingTF hash dict
        corpus_dict_tfidf_t = {}
        # uniquifie merged corpus into terms
        #corpus_merge_unique = sorted(set(corpus_merge))
        # fill in unique word : HashingTF hash dict
        for word in corpus_merge_unique:
            idx = htf.indexOf(word)
            corpus_dict_tfidf_t[word] = idx
            # index not necessarily found in doc_tfidf.

        # no return item

    #########################################################################################

    if (lda):
        corpus_dict = {}
        for c,word in enumerate(corpus_merge_unique):
            corpus_dict[word]=counts[c]
        def return_freq_words(doc,corpus_dict):
            return [word for word in doc if word in corpus_dict if corpus_dict[word]>2]
        corpus_distrib_red = corpus_distrib.map(lambda doc: return_freq_words(doc,corpus_dict)).cache()
        gensim_corpora_id2word = corpora.Dictionary(corpus_distrib_red.collect())
        gensim_doc2bow_doctf = corpus_distrib_red.map(lambda doc: gensim_corpora_id2word.doc2bow(doc)).collect()
        f1 = open(''.join([filename,'-gensim_corpora_id2word.pkl']),'w')
        pickle.dump(gensim_corpora_id2word,f1)
        f1.close()
        f2 = open(''.join([filename,'-gensim_doc2bow_doctf.pkl']),'w')
        pickle.dump(gensim_doc2bow_doctf,f2)
        f2.close()
        f3 = open(''.join([filename,'-corpus.pkl']),'w')
        pickle.dump(corpus_distrib.collect(),f3)
        f3.close()

    if (word2vec):
        #
        def increase_tf(doc): # only words with freq >= 5 are vectorized
            ret_doc = []
            for i in xrange(5):  # <<<
                ret_doc.extend(doc)  # <<<
            return ret_doc
        #
        corpus_distrib_ext = corpus_distrib.map(lambda doc: increase_tf(doc))
        word_mbd = Word2Vec().setVectorSize(50).setSeed(42L).fit(corpus_distrib_ext)
        word2vec_dict = {}
        for i,w in enumerate(corpus_merge_unique):
            #print ('Counted word "%s" _%i_ many times.'%(w,counts[i]))
            word2vec_dict[w] = word_mbd.transform(w)
            try:
                print ('Top 5 embedding cosine similarity synonyms of word "%s":'%w)
                proximal_synonyms = word_mbd.findSynonyms(w,5)
                for s,cs in proximal_synonyms:
                    print ('  "%s" with score _%f_'%(s,cs))
            except:
                print 'No synonyms found (word not in dict).'
        print
        print 'Processing + Spark MLLib has given us %i word2vec vectors.'%len(word2vec_dict)
        return_list.append(word2vec_dict)
        f4 = open(''.join([filename,'-word2vec_dict.pkl']),'w')
        pickle.dump(word2vec_dict,f4)
        f4.close()

    if len(return_list)==1:
        return_list = return_list[0]
    return return_list
Example #50
documents_neg = documents_neg_RDD.map(lambda x: x.replace(',',' ').replace('.',' ').replace('-',' ').lower())\
    .map(lambda line: line.split(" "))\
    .map(lambda x: filter_word(x))\
    .map(lambda x: (0.0, x))


documents_train = documents.union(documents_neg)

labels = documents_train.map(lambda x: x[0])
train_set = documents_train.map(lambda x: x[1])

hashingTF = HashingTF()
tf = hashingTF.transform(train_set)

tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

# Create a labeled point with a positive label and a dense feature vector
training = labels.zip(tfidf).map(lambda x: LabeledPoint(x[0], x[1]))

model = NaiveBayes.train(training)

######### Calculate TFIDF with test data ########

### test_pos data ###
documents_t_RDD = sc.textFile("/Users/tracy/msan-ml/hw2/aclImdb/test_pos.txt")
# This command is for running on EMR connecting to S3
# documents_RDD = sc.textFile("s3n://aml-aml/test_pos.txt")

documents_t = documents_t_RDD.map(lambda x: x.replace(',',' ').replace('.',' ').replace('-',' ').lower())\
Example #51
	def tfIdf_cluster(self, content, title, date, tfidf):
		tfidf_list = content
		inputRDD = sc.parallelize(tfidf_list)
		hasingTF = HashingTF(2 ** 20)
		trainTf = hasingTF.transform(inputRDD)
		idf = IDF().fit(trainTf)
		trainTfidf = idf.transform(trainTf)
		km = KMeans.train(trainTfidf, 2, maxIterations=100, runs=10)  # train a new model

		result = km.predict(trainTfidf)
		k_data = array(result.collect())
		
		grp1_news = []
		grp2_news = []
		
		# Store the scraped news as a list of key/value dicts ([{}, {}]) so the front end can use it easily
		# i = 0
		for idx, grp in enumerate(k_data):
		
			if grp == 0:
				news =  {
					'title':title[idx],
					'date':date[idx],
					'content':''.join(content[idx].split()),
					'tfidf':tfidf[idx],
				}
				grp1_news.append(news)

			
			if grp == 1:
				news =  {
					'title':title[idx],
					'date':date[idx],
					'content':''.join(content[idx].split()),
					'tfidf':tfidf[idx],
				}
				grp2_news.append(news)
				
		# Begin storing the TF-IDF word counts for the news clusters ------------------------------------
		tfidf_word_grp1=[]           # holds the TF-IDF words and their counts
		all_tfidf_grp1=[]                # holds all TF-IDF words
		for post in grp1_news:
			tfidf = post['tfidf']
			for i in tfidf:
				all_tfidf_grp1.append(i)
		tfidf_dic1 = {}
		for ele in all_tfidf_grp1: # n
			if not ele in tfidf_dic1:
				tfidf_dic1[ele] = 1
			else:
				tfidf_dic1[ele] = tfidf_dic1[ele] + 1
		for i in range(0,len(tfidf_dic1)):
			data = {
				"text":tfidf_dic1.keys()[i],
				"size":(tfidf_dic1.values()[i])*1.5,
			}
			tfidf_word_grp1.append(data)
		
		tfidf_word_grp1.sort(key=lambda d:d['size'],reverse=True)   # sort the sentiment words
		
		tfidf_word_grp1 = tfidf_word_grp1[0:50]
		tfidf_word_grp1 = json.dumps(tfidf_word_grp1)
		
		#---------------------------------------------------------------------------------------------
		tfidf_word_grp2=[]           # holds the TF-IDF words and their counts
		all_tfidf_grp2=[]                # holds all TF-IDF words
		for post in grp2_news:
			tfidf = post['tfidf']
			for i in tfidf:
				all_tfidf_grp2.append(i)
		tfidf_dic2 = {}
		for ele in all_tfidf_grp2: # n
			if not ele in tfidf_dic2:
				tfidf_dic2[ele] = 1
			else:
				tfidf_dic2[ele] = tfidf_dic2[ele] + 1
		for i in range(0,len(tfidf_dic2)):
			data = {
				"text":tfidf_dic2.keys()[i],
				"size":(tfidf_dic2.values()[i])*1.5,
			}
			tfidf_word_grp2.append(data)
		tfidf_word_grp2.sort(key=lambda d:d['size'],reverse=True)   # sort the sentiment words
		tfidf_word_grp2 = tfidf_word_grp2[0:50]
		tfidf_word_grp2 = json.dumps(tfidf_word_grp2)	
		
		# End storing the TF-IDF word counts for the news clusters ------------------------------------
			
		return grp1_news,grp2_news,tfidf_word_grp1,tfidf_word_grp2
Example #52
    print x

# Boilerplate Spark stuff:
conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
sc = SparkContext(conf = conf)

# Load documents (one per line).
rawData = sc.textFile("subset-small.tsv")
fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))

documentNames = fields.map(lambda x: x[1])


hashingTF = HashingTF(100000)  #100K hash buckets just to save some memory
tf = hashingTF.transform(documents)

tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

keywordTF = hashingTF.transform(["Apollo"])
keywordHashValue = int(keywordTF.indices[0])

keywordRelevance = tfidf.map(lambda x: x[keywordHashValue])

zippedResults = keywordRelevance.zip(documentNames)

print "Best document for keywords is:"
print zippedResults.max()
Example #53
def returnTFIDF(tokens, hashingTF):
	tf = hashingTF.transform(tokens)
	idf = IDF(minDocFreq=25).fit(tf)
	tfidf = idf.transform(tf)
	return tfidf
Example #54
dim=pow(2,18)

hashingTF = HashingTF(dim)

tf=hashingTF.transform(tokens)

tf.cache()

v=tf.first()

print(v.size)
print(v.values)
print(v.indices)

idf = IDF().fit(tf)

tfidf=idf.transform(tf)

v2=tfidf.first()

print(v2.size)
print(v2.values)
print(v2.indices)

minMaxVals = tfidf.map(lambda v: (min(v.values),max(v.values)))
globalMin=minMaxVals.reduce(min)
globalMax=minMaxVals.reduce(max)
globalMinMax=(globalMin[0],globalMax[1])

###Using a TF-IDF model
Example #55
from pyspark.mllib.util import MLUtils
#>>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])),                         LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
#>>> tempFile = NamedTemporaryFile(delete=True)
#>>> tempFile.close()
#>>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name)



from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF, IDF
from pyspark import SparkContext

sc=SparkContext("local","dd")
train = sc.parallelize(open("/home/madhura/ML_Spring16/MLProject/data/OriginalTraining.txt").read().splitlines()).map(lambda x: x.split(","))
trainlabels = train.map(lambda(a,b): int(b))
traintf = HashingTF().transform(train.map(lambda(a,b): a.split()))
trainidf = IDF().fit(traintf)
traintfidf = trainidf.transform(traintf)
#densetrain = traintfidf.map(lambda x: pyspark.mllib.linalg.DenseVector(x.toArray()))
#zippeddata = trainlabels.zip(densetrain)
#new = zippeddata.map(lambda (a,vec) : (a,vec.toArray()))
training = trainlabels.zip(traintfidf).map(lambda x : LabeledPoint(x[0], x[1]))
MLUtils.saveAsLibSVMFile(training.coalesce(1),"/home/madhura/ML_Spring16/MLProject/data/libsvmfile")
data = MLUtils.loadLibSVMFile(sc, "/home/madhura/ML_Spring16/MLProject/data/libsvmfile/part-00000")
(trainingData, testData) = data.randomSplit([0.7, 0.3])
model = RandomForest.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)
model.save(sc, "/home/madhura/ML_Spring16/MLProject/SentimentAnalysis_NLTK_NB/src/myRandomForestClassificationModel")
Example #56
def calculate_tfidf(documents):
    hashingTF = HashingTF()
    tf = hashingTF.transform(documents.map(lambda x: x[1]))
    tf.cache()
    idf = IDF().fit(tf)
    return idf.transform(tf)
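A usage sketch under the assumption that documents is an RDD of (id, token_list) pairs, which is what the x[1] access above implies (assumes an existing SparkContext sc):

docs = sc.parallelize([(0, ["spark", "tf", "idf"]), (1, ["hashing", "tf"])])
tfidf = calculate_tfidf(docs)
print(tfidf.collect())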
Example #57
def main():
	"""
	Driver program for a spam filter using Spark and MLLib
	"""

	# Consolidate the individual email files into a single spam file
	# and a single ham file
	makeDataFileFromEmails( "data/spam_2/", "data/spam.txt")
	makeDataFileFromEmails( "data/easy_ham_2/", "data/ham.txt" )

	# Create the Spark Context for parallel processing
	sc = SparkContext( appName="Spam Filter")

	# Load the spam and ham data files into RDDs
	spam = sc.textFile( "data/spam.txt" )
	ham = sc.textFile( "data/ham.txt" )

	# Create a HashingTF instance to map email text to vectors of 10,000 features.
	tf = HashingTF(numFeatures = 10000)

	# Each email is split into words, and each word is mapped to one feature.
	spamtf = spam.map(lambda email: tf.transform(email.split(" ")))
	hamtf = ham.map(lambda email: tf.transform(email.split(" ")))

	spamtf.cache()
	hamtf.cache()

	spamidf = IDF().fit(spamtf)
	hamidf = IDF().fit(hamtf)
	
	spamFeatures = spamidf.transform(spamtf)
	hamFeatures = hamidf.transform(hamtf)
	
	# Create LabeledPoint datasets for positive (spam) and negative (ham) data points.
	positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
	negativeExamples = hamFeatures.map(lambda features: LabeledPoint(0, features))

	# Combine positive and negative datasets into one
	data = positiveExamples.union(negativeExamples)

	# Split the data into 70% for training and 30% test data sets 
	( trainingData, testData ) = data.randomSplit( [0.7, 0.3] )

	# Cache the training data to optimize the model training
	trainingData.cache() 

	# Train an SVM model using the SGD algorithm.
	model = SVMWithSGD.train(trainingData, iterations=100)

	# Create tuples of actual and predicted values
	labels_and_predictions = testData.map( lambda email: (email.label, model.predict( email.features) ) )

	# Calculate the error rate as number wrong / total number
	error_rate = labels_and_predictions.filter( lambda (val, pred): val != pred ).count() / float(testData.count() )
	print( "*********** SPAM FILTER RESULTS **********" )
	print( "\n" )
	print( "Error Rate: " + str( error_rate ) )
	print( "\n" )

	# Serialize the model for persistence
	pickle.dump( model, open( "SpamSvm.pkl", "wb" ) )

	sc.stop()
Example #58
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="TFIDFExample")  # SparkContext

    # $example on$
    # Load documents (one per line).
    documents = sc.textFile("data/mllib/kmeans_data.txt").map(lambda line: line.split(" "))

    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)

    # While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
    # First to compute the IDF vector and second to scale the term frequencies by IDF.
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)

    # spark.mllib's IDF implementation provides an option for ignoring terms
    # which occur in less than a minimum number of documents.
    # In such cases, the IDF for these terms is set to 0.
    # This feature can be used by passing the minDocFreq value to the IDF constructor.
    idfIgnore = IDF(minDocFreq=2).fit(tf)
    tfidfIgnore = idfIgnore.transform(tf)
    # $example off$

    print("tfidf:")
    for each in tfidf.collect():
        print(each)

    print("tfidfIgnore:")
Example #59
        for word ,flag in psegCut:
            if(flag=="n"):
                words.append(word)
        data.append(list(words))

    data.remove("")
    documents = sc.parallelize(data)
    def hashing(x):
        return hashingTF.transform([x]).indices[0]
    hashed = documents.flatMap(lambda line: line).map(lambda word:(hashing(word), word)).distinct()
    hashed_word = pd.DataFrame(hashed.collect(), columns=['hash','word']).set_index('hash')
    # hashingTF = HashingTF()
    # Generate the TF-IDF vectors
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    tf_idf_data = idf.transform(tf)
    print dt.now().strftime('%Y/%m/%d %H:%M:%S')
    K = 5


	# Index documents with unique IDs
    corpus_data = tf_idf_data.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
    print corpus_data
	# Cluster the documents into three topics using LDA
    ldaModel = LDA.train(corpus_data, k=K)

	# Output topics. Each is a distribution over words (matching word count vectors)
    print "Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize()) + " words):"
    topics = ldaModel.topicsMatrix()
    print dt.now().strftime('%Y/%m/%d %H:%M:%S')
Example #60
def filterStopWords(x):
	filtered_x = []
	for word in x:
		if word not in stopwordsList and len(word)>1:
			filtered_x.append(word)
	return filtered_x

documents = documents.map(lambda x: filterStopWords(x)).filter(lambda x: len(x)>0)


## Step 3: Extract TF-IDF features
hashingTF = HashingTF(nFeature)   # default is 2^20
tf = hashingTF.transform(documents)
tf.cache()
idf = IDF(minDocFreq=5).fit(tf)
tfidf = idf.transform(tf).repartition(nPartition)
tf.unpersist()
del idf
tfidf.cache()

## Step 4: Clustering with k-mean algorithm

pool = [10, 100, 1000]
for nCluster in pool:
	# Build the model (cluster the data)
	kmeans_model = KMeans.train(tfidf, nCluster, maxIterations=10, runs=1, initializationMode="random")

	# Evaluate clustering by computing Within Set Sum of Squared Errors
	'''
	def error(point):