Example 1
 def test_idf_model(self):
     """Fitting IDF on four 11-dimensional term-frequency vectors
     should produce an IDF vector with exactly 11 entries (one per term)."""
     term_freqs = [
         Vectors.dense([1, 2, 6, 0, 2, 3, 1, 1, 0, 0, 3]),
         Vectors.dense([1, 3, 0, 1, 3, 0, 0, 2, 0, 0, 1]),
         Vectors.dense([1, 4, 1, 0, 0, 4, 9, 0, 1, 2, 0]),
         Vectors.dense([2, 1, 0, 3, 0, 0, 5, 0, 2, 3, 9]),
     ]
     # Distribute the vectors over 2 partitions and fit the IDF model.
     fitted = IDF().fit(self.sc.parallelize(term_freqs, 2))
     self.assertEqual(len(fitted.idf()), 11)
Example 2 (identical to Example 1)
 def test_idf_model(self):
     """Check that IDF().fit over a small corpus of 11-dim dense
     term-frequency vectors returns a per-term IDF vector of length 11."""
     corpus = (
         [1, 2, 6, 0, 2, 3, 1, 1, 0, 0, 3],
         [1, 3, 0, 1, 3, 0, 0, 2, 0, 0, 1],
         [1, 4, 1, 0, 0, 4, 9, 0, 1, 2, 0],
         [2, 1, 0, 3, 0, 0, 5, 0, 2, 3, 9],
     )
     vectors = [Vectors.dense(row) for row in corpus]
     model = IDF().fit(self.sc.parallelize(vectors, 2))
     idf_vector = model.idf()
     # One IDF weight per vocabulary term.
     self.assertEqual(len(idf_vector), 11)
Example 3
    # --- Build the list of document URLs from the corpus index file ---
    # Each line of corpus.txt appears to be "###"-delimited; the second
    # field (d[1]) is taken as the URL. NOTE(review): despite the name,
    # url_dict is a list, not a dict.
    url_dict = []
    url_file = open("../../data/corpus.txt", "r").readlines()
    for val in url_file:
        d = re.split("###", val.strip())
        url = d[1]
        url_dict.append(url)
    doc_num = len(url_dict)  # total number of documents in the corpus
    #calculate TF-IDF feature
    # Fixed hash-feature dimensionality for HashingTF (vocabulary-sized;
    # presumably matches the size of all_words.txt — TODO confirm).
    word_dict_size = 504927
    hashingTF = HashingTF(word_dict_size)
    # word_file / txt_file and the process* / filterWords helpers are
    # defined elsewhere in this file (not visible in this chunk).
    tf_words = hashingTF.transform(word_file.map(processWords))
    tf_raw = hashingTF.transform(txt_file.map(processCorpus))
    idf = IDF().fit(tf_raw)
    # Largest IDF value, used by processTFIDF for normalization (presumably).
    max_idf = float(idf.idf().max())
    tf = tf_raw.map(processTF)
    tfidf = idf.transform(tf).map(processTFIDF(max_idf))
    # Cache the RDDs that are reused downstream; spill to disk if they
    # do not fit in memory.
    tfidf.persist(StorageLevel.MEMORY_AND_DISK)
    tf.persist(StorageLevel.MEMORY_AND_DISK)
    tf_raw.persist(StorageLevel.MEMORY_AND_DISK)

    # --- Load the word dictionary (one word per line) ---
    words_all = open("../../data/all_words.txt", "r").readlines()
    words_all = [val.strip() for val in words_all]
    word_dict = {val: 1 for val in words_all}  # membership map: word -> 1

    # --- Build an inverted index: word -> iterable of documents ---
    # filterWords(word_dict) presumably emits (word, doc) pairs per line;
    # flatMap flattens them and groupByKey collects docs per word.
    temp1 = txt_file.map(filterWords(word_dict))
    temp2 = temp1.flatMap(lambda line: line)
    inverted_index = temp2.groupByKey()