コード例 #1
0
    def test_build_lsh_forest(self):
        """Check that ``build_lsh_forest`` leaves a truthy ``lsh_forest`` on the graph."""
        # Toy corpus: one [domain, description] row per company.
        rows = [
            ['company_1', 'Provider of software'],
            ['company_2', 'Provider of hardware'],
        ]
        frame = pd.DataFrame(rows, columns=['domain', 'description'])
        corpus = CompanyCorpus(frame)

        # Run the IDF build, then the IDF-based description filter with a cut of 1.
        corpus.build_idf(description_column_name='description')
        corpus.filter_desc_by_idf(description_column_name='description', number_words_to_cut=1)

        # Wrap the prepared corpus in a graph, using 'domain' as the company-name column.
        graph = CompanyGraph(corpus)
        graph.build_lsh_forest(company_name_column_name='domain')

        self.assertTrue(graph.lsh_forest)
コード例 #2
0
 def test_save_graph(self):
     """Smoke test: build a graph from a four-company corpus and pass it to ``save_graph``.

     No assertion is made; the test only fails if one of the calls raises.
     """
     # Toy corpus: one [domain, description] row per company.
     rows = [
         ['company_1', 'Provider of software'],
         ['company_2', 'Provider of hardware technology'],
         ['company_3', 'Provider of Software technology'],
         ['company_4', 'Provider of software service'],
     ]
     frame = pd.DataFrame(rows, columns=['domain', 'description'])
     corpus = CompanyCorpus(frame)

     # Run the IDF build, then the IDF-based description filter with a cut of 1.
     corpus.build_idf(description_column_name='description')
     corpus.filter_desc_by_idf(description_column_name='description', number_words_to_cut=1)

     # Build the forest and the graph before handing the instance to save_graph.
     graph = CompanyGraph(corpus)
     graph.build_lsh_forest(company_name_column_name='domain')
     graph.build_graph(sensitivity=3)

     save_graph(graph, filename='graph.pickle')
コード例 #3
0
    def test_get_jaccard_similarity(self):
        """Check that ``get_jaccard_similarity`` for company_1 and company_3 is not 0."""
        # Toy corpus: one [domain, description] row per company.
        rows = [
            ['company_1', 'Provider of software'],
            ['company_2', 'Provider of hardware technology'],
            ['company_3', 'Provider of Software technology'],
            ['company_4', 'Provider of software service'],
        ]
        frame = pd.DataFrame(rows, columns=['domain', 'description'])
        corpus = CompanyCorpus(frame)

        # Run the IDF build, then the IDF-based description filter with a cut of 1.
        corpus.build_idf(description_column_name='description')
        corpus.filter_desc_by_idf(description_column_name='description', number_words_to_cut=1)

        # Build the forest and the graph that the similarity query runs against.
        graph = CompanyGraph(corpus)
        graph.build_lsh_forest(company_name_column_name='domain')
        graph.build_graph(sensitivity=3)

        similarity = graph.get_jaccard_similarity('company_1', 'company_3')
        self.assertNotEqual(0., similarity)
コード例 #4
0
    def test_get_dot_product_score(self):
        """Check that ``get_dot_product_score`` for company_1 and company_3 is not 0."""
        # Toy corpus: one [domain, description] row per company.
        rows = [
            ['company_1', 'business software application'],
            ['company_2', 'hardware technology'],
            ['company_3', 'consumer software service'],
            ['company_4', 'consumer saas application'],
        ]
        frame = pd.DataFrame(rows, columns=['domain', 'description'])
        corpus = CompanyCorpus(frame)

        # Run the IDF build, then the description filter with a cut of 0.
        corpus.build_idf(description_column_name='description')
        corpus.filter_desc_by_idf(description_column_name='description', number_words_to_cut=0)

        # Build the forest and the graph that the score query runs against.
        graph = CompanyGraph(corpus)
        graph.build_lsh_forest(company_name_column_name='domain')
        graph.build_graph(sensitivity=3)

        score = graph.get_dot_product_score('company_1', 'company_3')
        self.assertNotEqual(0., score)