Example 1
 def test_binary_term_freqs(self):
     hashingTF = HashingTF(100).setBinary(True)
     doc = "a a b c c c".split(" ")
     n = hashingTF.numFeatures
     output = hashingTF.transform(doc).toArray()
     expected = Vectors.sparse(n, {hashingTF.indexOf("a"): 1.0,
                                   hashingTF.indexOf("b"): 1.0,
                                   hashingTF.indexOf("c"): 1.0}).toArray()
     for i in range(0, n):
         self.assertAlmostEqual(output[i], expected[i], 14, "Error at " + str(i) +
                                ": expected " + str(expected[i]) + ", got " + str(output[i]))
Example 3
def tfidf(rdd_doc):
    # Hash each tokenized document into a term-frequency vector, fit an IDF
    # model on the result, and return the tf-idf RDD together with a
    # term -> hash index lookup.
    hashingTF = HashingTF()
    trainTf = hashingTF.transform(rdd_doc)
    trainTf.cache()
    idf = IDF().fit(trainTf)
    trainTfidf = idf.transform(trainTf)
    trainTfidf.cache()
    return trainTfidf, lambda x: hashingTF.indexOf(x)
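
# A minimal usage sketch, not part of the original snippet: `sc` and the sample
# documents below are assumed; the returned lookup is used to read one term's
# tf-idf weight out of every document vector.
docs = sc.parallelize([["spark", "is", "fast"], ["spark", "mllib", "tfidf"]])
train_tfidf, index_of = tfidf(docs)
spark_idx = index_of("spark")
# SparseVector indexing returns 0.0 for hash buckets a document does not use.
print(train_tfidf.map(lambda vec: vec[spark_idx]).collect())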
Example 4
def makeDict(x):
	global data
	hash = HashingTF(100000)
	for i in range(len(x)):
		word = x[i]
		ind = hash.indexOf(word)
		if ind not in data:
			data.update({ind:word})
	print len(data)
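
# A caveat sketch, not part of the original function: HashingTF is not
# invertible, since distinct words can hash to the same bucket, so the
# index -> word dict above only keeps the first word seen per index.
# The small vocabulary below is illustrative.
from pyspark.mllib.feature import HashingTF

htf = HashingTF(100000)
index_to_words = {}
for w in ["good", "bad", "cheese", "music", "review"]:
    index_to_words.setdefault(htf.indexOf(w), []).append(w)
collisions = dict((i, ws) for i, ws in index_to_words.items() if len(ws) > 1)
print(collisions)  # usually empty for a handful of words and 100000 buckets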
Example 5
def idf_scores_for_terms(tfidf, group_key):

    group_keys = tfidf.select(group_key).collect()
    words = tfidf.select("words").collect()
    features = tfidf.select("features").collect()
    rawFeatures = tfidf.select("rawFeatures").collect()
    
    numFeatures = features[0].features.size
    mllibHashingTf = MllibHashingTF(numFeatures=numFeatures)
    
    idfscores = list()
    for line_idx, feature_row in enumerate(features):
        terms = dict()

        for term_array in words[line_idx]:
            for term in term_array:
                term_index = mllibHashingTf.indexOf(term)
                terms[term] = (feature_row.features[term_index], rawFeatures[line_idx].rawFeatures[term_index])
        for term, score in terms.iteritems():
            idfscores.append((group_keys[line_idx][group_key], term, float(score[0]), float(score[1])))

    return idfscores
Example 7
################################################################################

conf = SparkConf().setMaster("local").setAppName("My App")

#Create SparkContext
sc = SparkContext(conf = conf)

#Load all documents in a certain directory
documents = sc.textFile(dir_path + "/*.txt").map(lambda line: line.split(" "))

hashing = HashingTF()

# Condition to know if the user wants to get the tf.idf of the whole document
# set or only one word
if mot != "" :
	index = hashing.indexOf(mot)
	mot_precis = True
else:
	mot_precis = False



#Term Frequency
#transform() : Transforms the input document (a list of terms) to a term frequency vector, or transforms an RDD of documents to an RDD of term frequency vectors.
tf = hashing.transform(documents)



#RDD Persistence
#Caching a dataset in memory across operations. Each node stores any partitions of it that it computes in memory and reuses them in other actions on that dataset. This can make operations up to 10x faster.
tf.cache()
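
# A possible continuation, not in the original script: when a specific word was
# requested (mot_precis is True), its per-document frequency can be read back
# from the tf vectors through the hash index computed above.
if mot_precis:
    print(tf.map(lambda vec: vec[index]).take(10))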
hashingTF = HashingTF()
tf = hashingTF.transform(splitRDD)
from pyspark.mllib.feature import IDF

# ...from tf create IDF
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)


zipped = splitRDD.zip(tfidf)
fRDD = splitRDD.flatMap(lambda x: x).distinct()
#print fRDD.count()


wordRDD = fRDD.map(lambda x: (x, hashingTF.indexOf(x)))
listW = wordRDD.collect()


# Write the word -> hash index mapping to a JSON file

import json
dictW = dict(listW)

with open("wordHashD.json", 'w') as f:
    json.dump(dictW, f)




finalZip = dataRDD.map(lambda x: x[0]).zip(zipped)
def distributed_ops( corpus, sanit=False, recall=False, corpred=False, \
                     streams=False, segred=False, tfidf=False, lda=False, \
                     word2vec=False, fin=None, segclust=None):

    # Return item for end results
    return_list = []

    ##########################################

    # Default actions:
    if (segred):
        zipped_corpus = zip(segclust,corpus)
        #print zipped_corpus
    corpus = sc.parallelize(corpus).cache()

    if (sanit or recall):
        corpus = corpus.map(lambda doc: preprocess(doc))
        # Here we "recover all" text, after having removed multi-ws & ws-pad punctuation
        # & replace \n by NL etc... (see function "preprocess" above)
        # We use the same regex sub/filtration rules as in the implementation found
        # @ https://github.com/alexalemi/segmentation (from which we got files in
        # directory: representation.py, tools.py and splitters.py, and which
        # segmentSETxRes.py is based on)
        if (recall):
            return_list.append(recover_encoding(corpus.collect()))

    # Here we return only potentially "meaningful words" - see function "return_words" above
    # Keeps alpha-numeric (removes numeric and non-alphabetical/alphanumeric)
    corpus_distrib = corpus.map(lambda doc: return_words(doc))
    print 'Original number of docs in corpus {filtering *docs* for alpha(+alphanumeric)-only words}: %i'%corpus_distrib.count()
    
    # merge corpus docs into one continuous split text
    corpus_merge = []
    corpus_collect = corpus_distrib.collect() # rdd2list
    for list_of_words in corpus_collect:
        corpus_merge.extend(list_of_words) # list-of-wordslist2{single-wordslist}
    
    # use numpy functions to sort dict words based on term-frequency
    corpus_merge_array = np.array(corpus_merge)
    corpus_merge_sorted = np.sort(corpus_merge_array)
    corpus_merge_unique, counts = np.unique(corpus_merge_sorted,return_counts=True)
    sort_ixs = np.argsort(counts)[::-1]
    counts = counts[sort_ixs]
    corpus_merge_unique = corpus_merge_unique[sort_ixs]
    return_list.append(corpus_merge_unique)
    return_list.append(counts)
    print
    for i,w in enumerate(corpus_merge_unique):
        print ('Counted word "%s" _%i_ many times.'%(w,counts[i]))
    print

    #########################################################################################
    # Next we split the text based on "verbosity/density/sparsity" as would
    # befit an articulate document (i.e. articles/papers/journal entries)
    # or more conversational/blog-entry-like/Q&A-style/headings-only-retrieved
    # website results.
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point-center)]))

    # The following will further sanitize text.
    if (corpred):
        # Use pretrained term frequencies:
        # Experimentally, the following clustering has helped us get rid of
        # irrelevant search engine text results.
        corpus2vec = corpus.map(lambda doc: genre_score(doc,type2=False))
        corpus2vec = corpus2vec.map(lambda doc: process_doc2vec_word_counts(doc)).cache()
        # print 'Corpus vectorized'
        # collected = corpus2vec.collect()
        tempor = corpus.collect()
        print
        print
        for i,vec in enumerate(corpus2vec.collect()):
            print 'Got vecs:'
            print vec
            print 'Of text:'
            print tempor[i].split()
            print
        print

        # choose 5 clusters
        clusters = KMeans.train(corpus2vec, 5, maxIterations=90, runs=10, initializationMode="k-means||")
        WSSE = corpus2vec.map(lambda point: error(point)).reduce(lambda x,y: x+y) # cumsum
        print
        print 'Within Set Sum of Squared Error = ' + str(WSSE)
        print 'The cluster centers:'
        print clusters.centers
        print
        print
        return_list.append(corpus2vec.map(lambda pt: clusters.predict(pt)).collect())

    # The following will cluster for article length + content
    if (streams):
        corpus2vec = corpus.map(lambda doc: genre_score(doc,type2=True))
        temple = corpus.collect()
        print
        print
        for i,vec in enumerate(corpus2vec.collect()):
            print 'Got vecs:'
            print vec
            print 'Of text:'
            print temple[i].split()
            print
        print
        sumall = corpus2vec.reduce(lambda vecx,vecy: np.array([vecx[0]+vecy[0]]))
        corpus2vec = corpus2vec.map(lambda doc: process_doc2vec_word_counts(doc,normalizer=sumall)).cache()
        #
        clusters = KMeans.train(corpus2vec, 5, maxIterations=90, runs=10, initializationMode="k-means||")
        WSSE = corpus2vec.map(lambda point: error(point)).reduce(lambda x,y: x+y) # cumsum
        print
        print 'Within Set Sum of Squared Error = ' + str(WSSE)
        print 'The cluster centers:'
        print clusters.centers
        print
        print
        return_list.append(corpus2vec.map(lambda pt: clusters.predict(pt)).collect())

    #########################################################################################

    # Here we want to remove documents from the corpus which do not contain
    # 'English' dictionary words at all, or words that can be word2vec-transformed
    # and "synonymized".
    if (segred):
        corpus_english_prose = sc.parallelize(zipped_corpus).filter(lambda doc: check(doc))
        zipped_corpus = zip(*corpus_english_prose.collect())
        red_clusts = list(zipped_corpus[0])
        red_text = recover_encoding(list(zipped_corpus[1]))
        return_list.append(red_clusts)
        return_list.append(red_text)
        print 'Number of docs in corpus {filtering *corpus* for alpha(+alphanumeric)-only words}: %i'%corpus_english_prose.count()

        f1 = open(''.join([filename,'-document_clusters.txt']),'w')
        f1.write('\n'.join(map(str,red_clusts)))
        f1.close()
        f2 = open(''.join([filename,'-documents_sanitized.txt']),'w')
        f2.write('\n'.join(red_text))
        f2.close()
        f3 = open(''.join([filename,'-documents_dict.txt']),'w')
        f3.write('\n'.join(corpus_merge_unique))
        f3.close()

    #########################################################################################

    if (tfidf):
        # generate document term frequencies
        htf = HashingTF()
        tf = htf.transform(corpus_distrib)
        # generate idf = log(#docs / #docs containing the term)
        idf = IDF().fit(tf)
        # scale tf * idf
        tfidf = idf.transform(tf)
        # collect tfidf for future use
        doc_tfidf = tfidf.collect()
        # generate unique word : HashingTF hash dict
        corpus_dict_tfidf_t = {}
        # uniquify the merged corpus into its unique terms
        #corpus_merge_unique = sorted(set(corpus_merge))
        # fill in unique word : HashingTF hash dict
        for word in corpus_merge_unique:
            idx = htf.indexOf(word)
            corpus_dict_tfidf_t[word] = idx
            # index not necessarily found in doc_tfidf.

        # no return item

    #########################################################################################

    if (lda):
        corpus_dict = {}
        for c,word in enumerate(corpus_merge_unique):
            corpus_dict[word]=counts[c]
        def return_freq_words(doc,corpus_dict):
            return [word for word in doc if word in corpus_dict if corpus_dict[word]>2]
        corpus_distrib_red = corpus_distrib.map(lambda doc: return_freq_words(doc,corpus_dict)).cache()
        gensim_corpora_id2word = corpora.Dictionary(corpus_distrib_red.collect())
        gensim_doc2bow_doctf = corpus_distrib_red.map(lambda doc: gensim_corpora_id2word.doc2bow(doc)).collect()
        f1 = open(''.join([filename,'-gensim_corpora_id2word.pkl']),'w')
        pickle.dump(gensim_corpora_id2word,f1)
        f1.close()
        f2 = open(''.join([filename,'-gensim_doc2bow_doctf.pkl']),'w')
        pickle.dump(gensim_doc2bow_doctf,f2)
        f2.close()
        f3 = open(''.join([filename,'-corpus.pkl']),'w')
        pickle.dump(corpus_distrib.collect(),f3)
        f3.close()

    if (word2vec):
        #
        def increase_tf(doc): # only words with freq >= 5 are vectorized
            ret_doc = []
            for i in xrange(5):  # <<<
                ret_doc.extend(doc)  # <<<
            return ret_doc
        #
        corpus_distrib_ext = corpus_distrib.map(lambda doc: increase_tf(doc))
        word_mbd = Word2Vec().setVectorSize(50).setSeed(42L).fit(corpus_distrib_ext)
        word2vec_dict = {}
        for i,w in enumerate(corpus_merge_unique):
            #print ('Counted word "%s" _%i_ many times.'%(w,counts[i]))
            word2vec_dict[w] = word_mbd.transform(w)
            try:
                print ('Top 5 embedding cosine similarity synonyms of word "%s":'%w)
                proximal_synonyms = word_mbd.findSynonyms(w,5)
                for s,cs in proximal_synonyms:
                    print ('  "%s" with score _%f_'%(s,cs))
            except:
                print 'No synonyms found (word not in dict).'
        print
        print 'Processing + Spark MLLib has given us %i word2vec vectors.'%len(word2vec_dict)
        return_list.append(word2vec_dict)
        f4 = open(''.join([filename,'-word2vec_dict.pkl']),'w')
        pickle.dump(word2vec_dict,f4)
        f4.close()

    if len(return_list)==1:
        return_list = return_list[0]
    return return_list
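
# A minimal follow-up sketch, not part of distributed_ops (which keeps
# corpus_dict_tfidf_t and doc_tfidf local and does not return them): given that
# word -> hash index dict and the collected tf-idf vectors, one word's weight
# per document can be read directly; SparseVector indexing yields 0.0 when the
# word is absent from a document.
def tfidf_of_word(word, word_to_index, doc_vectors):
    idx = word_to_index.get(word)
    if idx is None:
        return []
    return [vec[idx] for vec in doc_vectors]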
Example 10
# Load documents (one per line).
documentsG = sc.textFile("/Users/daniellenash/Downloads/cleangoodreviews.txt").filter(lambda x : len(x) > 15)
documentsB = sc.textFile("/Users/daniellenash/Downloads/cleanbadreviews.txt").filter(lambda x : len(x) > 15)

docTokensG = documentsG.map(lambda x: x.split(" ")).map(cleanWord)
docTokensB = documentsB.map(lambda x: x.split(" ")).map(cleanWord)

hashingTF = HashingTF(100000)

wordList1 = docTokensB.reduce(lambda a,b : a+b) 
wordList2 = docTokensG.reduce(lambda a,b : a+b)


for word in set(wordList2):
	ind = hashingTF.indexOf(word)
	if ind not in data:
		data[ind] =  word


tfidfB = returnTFIDF(docTokensB, hashingTF)
tfidfG = returnTFIDF(docTokensG, hashingTF)

# 0 is bad, 1 is good
dataB = tfidfB.map(lambda x: LabeledPoint(0, x))
dataG = tfidfG.map(lambda x: LabeledPoint(1, x))

fullData = dataB.union(dataG)

model = LogisticRegressionWithLBFGS.train(fullData, iterations=100)
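
# A minimal evaluation sketch, not in the original snippet: score the fitted
# model on its own labeled tf-idf points to get a rough training accuracy
# (LogisticRegressionModel.predict works on plain feature vectors inside map).
preds_and_labels = fullData.map(lambda p: (model.predict(p.features), p.label))
accuracy = preds_and_labels.filter(lambda pl: pl[0] == pl[1]).count() / float(fullData.count())
print(accuracy)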
Example 11
def get_rdd(base_dir, input, num_part):
    input_path = os.path.join(input)
    file_name = os.path.join(base_dir, input_path)
    # load data
    rdd = sc.textFile(file_name, num_part)
    rdd_j = rdd.map(json.loads)
    rdd_j.cache()
    return rdd_j


revs = get_rdd('data', 'reviews_musical.json', 4)
prods = get_rdd('data', 'meta_musical.json', 4)

# Load documents (one per line).
rev_texts = ['cheese is good for your health', 'he like cheese', 'cheese is an important income resource here']
rev_texts = sc.parallelize(rev_texts).map(lambda line: line.split(" "))
#rev_texts = revs.map(lambda x: x['reviewText'].split(' '))

# term frequency
hashingTF = HashingTF()
tf = hashingTF.transform(rev_texts)

# ... continue from the previous example
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)
# from sparsevector, get indices and values
# get indices -> term PairRDD
keys = set(rev_texts.reduce(lambda x, y: x + y))
terms_dict = sc.parallelize(keys)
terms_pair = terms_dict.map(lambda x: (hashingTF.indexOf(x), x))
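
# A follow-up sketch, not in the original snippet: IDFModel exposes the fitted
# idf vector, so each (hash index, term) pair above can be mapped to that
# term's global IDF weight.
idf_weights = idf.idf()  # DenseVector indexed by hash bucket
term_idf = terms_pair.map(lambda kv: (kv[1], idf_weights[kv[0]])).collect()
print(sorted(term_idf))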
Example 12
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
doc = sc.textFile("target url").map(lambda line: line.split(' '))
hashingTF = HashingTF()
hashingTF.indexOf("COMPANY NAME")
tf = hashingTF.transform(doc)
idf = IDF().fit(tf)
tfidf = idf.transform(tf).collect()
print(tfidf)
Example 13
    for line in doc_test.collect():
        clean_dt.append(line.split('\t'))

    para = sc.parallelize(clean_dt)

    doc = para.map(lambda s: s[1])  ##get text
    splt = doc.map(lambda s: s.split())

    cn_dist = splt.flatMap(lambda s: s).distinct().count()  #distinct count

    htf1 = HashingTF()
    tf1 = htf1.transform(splt)
    idf1 = IDF().fit(tf1)
    tfidf1 = idf1.transform(tf1)

    ind_w = splt.map(lambda s: [htf1.indexOf(x) for x in s])  ##get index for each word in each doc

    tfidf_v_test = []
    for i in range(doc_test.count()):
        tfidf_v_test.append(tfidf1.collect()[i].values.tolist())
    tfidf_ind_test = []
    for i in range(doc_test.count()):
        tfidf_ind_test.append(tfidf1.collect()[i].indices.tolist())

    tfidf_v = []
    for i in range(doc_test.count()):
        tfidf_v.append(tfidf1.collect()[i].values.tolist())  ##tfidf value

    tfidf_ind = []
    for i in range(doc_test.count()):
        tfidf_ind.append(tfidf1.collect()[i].indices.tolist())
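
    # A minimal rewrite sketch, not part of the original code: collecting the
    # tf-idf RDD once, instead of inside every loop iteration, yields the same
    # values/indices lists far more cheaply.
    tfidf_rows = tfidf1.collect()
    tfidf_v = [row.values.tolist() for row in tfidf_rows]
    tfidf_ind = [row.indices.tolist() for row in tfidf_rows]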
Example 14
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF

sc = SparkContext()

# A single sample document, given as a list of terms.
documents = "abc bcd cde abc cde def cde".split(" ")
doc = documents
# print doc
#print documents

hashingTF = HashingTF()
tf = hashingTF.transform(documents)
tF = tf
# tf.cache()
# idf = IDF().fit(tf)
# tfidf = idf.transform(tf).collect()
print tF
print len(tF)
print doc
print hashingTF.indexOf('bcd')
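
# A minimal addition, not in the original script: the index printed above can
# also be used to read the hashed count of 'bcd' back out of the term-frequency
# vector (the sample document contains 'bcd' once).
print(tF[hashingTF.indexOf('bcd')])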