コード例 #1
0
ファイル: test_tfidf.py プロジェクト: inpho/uroc
    def testKnownTFIDF(self):
        """
        Verify that the tfidf scores for hand-picked words in the articles
        match manually calculated values.
        """
        # Term-frequency maps, one per document in each corpus.
        articleTfs = [tfidf.tf(text) for text in self.strings]
        theTfs = [tfidf.tf(text) for text in self.theTwentyFive]

        # Inverse document frequencies computed over each corpus.
        artIdf = tfidf.idf(articleTfs)
        theIdf = tfidf.idf(theTfs)

        # Combine tf and idf into per-document score maps.
        artScores = tfidf.tfidf(artIdf, articleTfs)
        theScores = tfidf.tfidf(theIdf, theTfs)

        self.assertEqual(artScores[1]["Meditation"], math.log10(6/1) * (1/19))
        self.assertEqual(artScores[2]["books"], math.log10(6/1) * (1/18))
        self.assertEqual(artScores[5]["the"], math.log10(6/3) * (5/5))

        self.assertEqual(theScores[3]["the"], math.log10(5/5) * (5/5))
コード例 #2
0
ファイル: test_tfidf.py プロジェクト: inpho/uroc
    def testKnownIDF(self):
        """
        Verify that inverse document frequencies agree with manually
        calculated idf values for hand-picked words.
        """
        articleIdf = tfidf.idf(self.articleList)
        self.assertEqual(articleIdf["the"], math.log10(6/3))
        self.assertEqual(articleIdf["books"], math.log10(6/1))
        # "dog" appears in every document, so its idf collapses to zero.
        self.assertEqual(articleIdf["dog"], 0.0)

        theIdf = tfidf.idf(self.theList)
        self.assertEqual(theIdf[""], 0.0)
        self.assertEqual(theIdf["the"], math.log10(5/5))
コード例 #3
0
ファイル: lib.py プロジェクト: wagin/ppeople
def build_tfidf_model(job_posts, nlp_module='stanford'):
    """
    Tokenize every job post and build an idf lookup table over the corpus.

    As a side effect, each post in job_posts gains a 'tokens' list holding
    the unigrams and bigrams extracted from every sentence found under its
    'feature_sentence' mapping. The per-post token lists then serve as the
    document corpus for the idf computation.

    Parameters:
        job_posts: iterable of dicts, each carrying a 'feature_sentence'
            mapping of header -> list of sentences.
        nlp_module: backend name forwarded to lemmatized_tokens
            (default 'stanford').

    Returns:
        dict mapping each distinct token to its idf score.
    """
    tokens_list = []        # one token list per post: the idf corpus
    unique_tokens = set()   # deduplicate tokens as we go instead of
                            # building a flat list and set()-ing it later

    for post in job_posts:
        post['tokens'] = []
        # The header keys are irrelevant here; only the sentences matter.
        for sentences in post['feature_sentence'].values():
            for sent in sentences:
                sent = clean_sentence(sent)
                if not sent:
                    continue
                word_list = lemmatized_tokens(sent, nlp_module)
                # Unigrams plus bigrams together form the post's tokens.
                post['tokens'].extend(get_unigrams(word_list))
                post['tokens'].extend(get_bigrams(word_list))
        tokens_list.append(post['tokens'])
        unique_tokens.update(post['tokens'])

    # Score every distinct token against the full corpus.
    return {token: idf(token, tokens_list) for token in unique_tokens}
コード例 #4
0
def buildTfidfMatrix(queriedSentences, myLexicon,queryDictList):
    
    docTermMatrix = []
    for sentence1 in queriedSentences:
	tfVector = [tfidf.termfreq(word2, sentence1) for word2 in myLexicon]		
	docTermMatrix.append(tfVector)
 
    docTermNormalizedMatrix = []
    
    for vector in docTermMatrix:
	docTermNormalizedMatrix.append(tfidf.normalizer(vector))


    myIdfVector = [tfidf.idf(word3, queryDictList) for word3 in myLexicon]
    print "This is the idf vector ---->", myIdfVector
    tfidfMatrix = tfidf.build_tfidf_matrix(myIdfVector, docTermNormalizedMatrix)


    for vector in tfidfMatrix:
	print vector,"\n"

    return tfidfMatrix