def test_similarity():
    """Check TFIDF.similarity on two small hand-written samples."""
    # Each sample is a list of [term, weight] pairs over the same four terms.
    first_sample = [['Apple', 1], ['Orange', 2], ['Banana', 1], ['Kiwi', 0]]
    second_sample = [['Apple', 1], ['Orange', 0], ['Banana', 2], ['Kiwi', 1]]

    # Per the original note, the Tf-Idf cosine of these two samples must be 0.5.
    # Rounding the difference to 7 decimals tolerates floating-point noise.
    cross_score = TFIDF.similarity(first_sample, second_sample)
    cross_error = cross_score - 0.5
    assert round(cross_error, 7) == 0

    # A sample compared with itself must score 1.0.
    self_score = TFIDF.similarity(first_sample, first_sample)
    self_error = self_score - 1
    assert round(self_error, 7) == 0
def test_tfidf():
    """Print the Tf-Idf data of two Japanese Wikipedia pages and their similarity.

    This is a manual smoke check: it only prints results for inspection and
    makes no assertions, so it cannot fail unless one of the calls raises.

    NOTE(review): TFIDF.gen_web is given a URL and presumably downloads the
    page, so this likely needs network access -- confirm.
    """
    # Tokyo Yakult Swallows article (this is what the percent-encoded title
    # decodes to; the earlier comment named a different team).
    url = "https://ja.wikipedia.org/wiki/%E6%9D%B1%E4%BA%AC%E3%83%A4%E3%82%AF%E3%83%AB%E3%83%88%E3%82%B9%E3%83%AF%E3%83%AD%E3%83%BC%E3%82%BA"
    tfidf1 = TFIDF.gen_web(url)

    # Yamamoto Masa article.
    url = "https://ja.wikipedia.org/wiki/%E5%B1%B1%E6%9C%AC%E6%98%8C"
    tfidf2 = TFIDF.gen_web(url)

    # Single-argument print(...) gives the same output whether print is a
    # statement (Python 2) or a function (Python 3).
    for k, v in tfidf1:
        print("%s %s" % (k, v))

    print("~~~~~~~~~~~~~~~")

    for k, v in tfidf2:
        print("%s %s" % (k, v))

    print(TFIDF.similarity(tfidf1, tfidf2))