def test_build_lsh_forest(self):
    """Check that building the LSH forest leaves a truthy ``lsh_forest``.

    A two-company corpus is run through ``build_idf`` and
    ``filter_desc_by_idf`` (cutting one word), wrapped in a
    ``CompanyGraph``, and ``build_lsh_forest`` is called keyed on the
    'domain' column.
    """
    # Two-row fixture: company identifier plus a short description.
    columns = ['domain', 'description']
    rows = [
        ['company_1', 'Provider of software'],
        ['company_2', 'Provider of hardware'],
    ]
    frame = pd.DataFrame(rows, columns=columns)

    # Prepare the corpus: compute IDF, then filter descriptions with it.
    corpus = CompanyCorpus(frame)
    corpus.build_idf(description_column_name='description')
    corpus.filter_desc_by_idf(
        description_column_name='description',
        number_words_to_cut=1,
    )

    # Build the forest on top of the prepared corpus and check it is set.
    graph = CompanyGraph(corpus)
    graph.build_lsh_forest(company_name_column_name='domain')
    self.assertTrue(graph.lsh_forest)
def test_save_graph(self):
    """Check that ``save_graph`` produces a file for a built CompanyGraph.

    A four-company corpus is run through ``build_idf`` and
    ``filter_desc_by_idf`` (cutting one word); a ``CompanyGraph`` is built
    from it with ``build_lsh_forest`` and ``build_graph(sensitivity=3)``
    and then handed to ``save_graph``.

    The pickle is written into a temporary directory so the test does not
    leave a stray 'graph.pickle' in the working directory, and the test
    asserts that the file was actually created.
    """
    # Local imports keep this method self-contained regardless of what the
    # module header already imports.
    import os
    import tempfile

    # Create a CompanyCorpus instance, and initialize it with some data
    test_corpus = [
        ['company_1', 'Provider of software'],
        ['company_2', 'Provider of hardware technology'],
        ['company_3', 'Provider of Software technology'],
        ['company_4', 'Provider of software service'],
    ]
    test_input = pd.DataFrame(test_corpus, columns=['domain', 'description'])
    cc = CompanyCorpus(test_input)
    number_to_remove = 1
    cc.build_idf(description_column_name='description')
    cc.filter_desc_by_idf(
        description_column_name='description',
        number_words_to_cut=number_to_remove,
    )

    # Create a CompanyGraph instance
    cg = CompanyGraph(cc)
    cg.build_lsh_forest(company_name_column_name='domain')
    cg.build_graph(sensitivity=3)

    # NOTE(review): assumes save_graph writes to exactly the path passed as
    # `filename` - confirm against its implementation.
    with tempfile.TemporaryDirectory() as tmp_dir:
        target = os.path.join(tmp_dir, 'graph.pickle')
        save_graph(cg, filename=target)
        self.assertTrue(os.path.isfile(target))
def test_get_jaccard_similarity(self):
    """Check that the Jaccard similarity of two companies is not zero.

    A four-company corpus is run through ``build_idf`` and
    ``filter_desc_by_idf`` (cutting one word); a ``CompanyGraph`` is built
    from it with ``build_lsh_forest`` and ``build_graph(sensitivity=3)``.
    The value returned by ``get_jaccard_similarity`` for 'company_1' and
    'company_3' must differ from 0.
    """
    # Four-row fixture: company identifier plus a short description.
    columns = ['domain', 'description']
    rows = [
        ['company_1', 'Provider of software'],
        ['company_2', 'Provider of hardware technology'],
        ['company_3', 'Provider of Software technology'],
        ['company_4', 'Provider of software service'],
    ]
    frame = pd.DataFrame(rows, columns=columns)

    # Prepare the corpus: compute IDF, then filter descriptions with it.
    corpus = CompanyCorpus(frame)
    corpus.build_idf(description_column_name='description')
    corpus.filter_desc_by_idf(
        description_column_name='description',
        number_words_to_cut=1,
    )

    # Build the forest and the graph on top of the prepared corpus.
    graph = CompanyGraph(corpus)
    graph.build_lsh_forest(company_name_column_name='domain')
    graph.build_graph(sensitivity=3)

    # Query the pair under test.
    similarity = graph.get_jaccard_similarity('company_1', 'company_3')
    self.assertNotEqual(0., similarity)
def test_get_dot_product_score(self):
    """Check that the dot-product score of two companies is not zero.

    A four-company corpus is run through ``build_idf`` and
    ``filter_desc_by_idf`` with ``number_words_to_cut=0``; a
    ``CompanyGraph`` is built from it with ``build_lsh_forest`` and
    ``build_graph(sensitivity=3)``. The value returned by
    ``get_dot_product_score`` for 'company_1' and 'company_3' must differ
    from 0.
    """
    # Four-row fixture: company identifier plus a short description.
    columns = ['domain', 'description']
    rows = [
        ['company_1', 'business software application'],
        ['company_2', 'hardware technology'],
        ['company_3', 'consumer software service'],
        ['company_4', 'consumer saas application'],
    ]
    frame = pd.DataFrame(rows, columns=columns)

    # Prepare the corpus: compute IDF, then run the filter with a cut of 0.
    corpus = CompanyCorpus(frame)
    corpus.build_idf(description_column_name='description')
    corpus.filter_desc_by_idf(
        description_column_name='description',
        number_words_to_cut=0,
    )

    # Build the forest and the graph on top of the prepared corpus.
    graph = CompanyGraph(corpus)
    graph.build_lsh_forest(company_name_column_name='domain')
    graph.build_graph(sensitivity=3)

    # Query the pair under test.
    score = graph.get_dot_product_score('company_1', 'company_3')
    self.assertNotEqual(0., score)