Esempio n. 1
0
    def test__same_text_correlation(self):
        """Comparing a text against itself must give a correlation of 1."""

        _log.info('-' * 80)

        # arrange: both sides of the comparison are the very same string
        sample = "love is rain as long story short"
        same_sample = sample

        source_dump = getInputFile("swiki_knowledge_output.xml")
        parsed_dump = getOutputFile("swiki_knowledge_output.parsed.xml")

        # act: make_dump -> parse_dump -> build_database_wrapper, then compare
        wn.make_dump(source_dump, ['Rain', 'Love', 'Tree'], compress=False)
        wn.parse_dump(source_dump, parsed_dump)
        stemmer = StopWordsStemmer([])
        wrapper = wn.build_database_wrapper(parsed_dump, stemmer)

        score = SemanticComparer(wrapper).compare(sample, same_sample)
        report = test_utils.get_texts_correlation_message(sample, same_sample, score)
        _log.info(report)

        # assert
        self.assertAlmostEqual(score, 1.0, msg="for same text correlation should be 1")
Esempio n. 2
0
    def test__same_text_correlation(self):
        """A text compared with itself is expected to correlate at 1.0."""

        separator = '-' * 80
        _log.info(separator)

        # arrange
        left_text = "love is rain as long story short"
        right_text = left_text  # identical on purpose

        raw_dump_path = getInputFile("swiki_knowledge_output.xml")
        parsed_dump_path = getOutputFile("swiki_knowledge_output.parsed.xml")
        wanted_articles = ['Rain', 'Love', 'Tree']

        # act
        wn.make_dump(raw_dump_path, wanted_articles, compress=False)
        wn.parse_dump(raw_dump_path, parsed_dump_path)
        database = wn.build_database_wrapper(parsed_dump_path, StopWordsStemmer([]))

        result = SemanticComparer(database).compare(left_text, right_text)
        _log.info(test_utils.get_texts_correlation_message(left_text, right_text, result))

        # assert
        expected_correlation = 1.0
        self.assertAlmostEqual(
            result, expected_correlation,
            msg="for same text correlation should be 1")
Esempio n. 3
0
    def test__many_articles(self):
        """Print the readable centroid of ``ibm_licence_text``.

        Calls ``wiki_knowledge.parse_dump`` on ``many_articles_dump.xml``,
        builds a database wrapper from the parsed output with a
        ``PorterStemmer`` and prints what ``get_readable_centroid`` returns.
        Nothing is asserted, so the test fails only if one of the calls raises.
        """
        wiki_dump_path = getInputFile("many_articles_dump.xml")
        parsed_xml_path = getOutputFile("many_articles_dump.parsed.xml")

        wiki_knowledge.parse_dump(wiki_dump_path, parsed_xml_path)

        db_wrapper = wiki_knowledge.build_database_wrapper(parsed_xml_path, PorterStemmer())

        c = db_wrapper.get_readable_centroid(ibm_licence_text)
        # Single-argument print(...) gives the same output as the Python 2
        # print statement and is also valid syntax on Python 3.
        print(c)
Esempio n. 4
0
    def test__many_articles(self):
        """Print the five highest-valued centroid entries for both licence texts.

        Loads a db wrapper from ``many_articles_dump.wdb`` and, for
        ``ibm_licence_text_full`` and then ``ibm_licence_text``, sorts the
        items of the readable centroid by value in descending order and prints
        the first five. Nothing is asserted, so the test fails only if one of
        the calls raises.
        """
        wdb_path = getInputFile("many_articles_dump.wdb")
        db_wrapper = wiki_knowledge.load_db_wrapper_from_wdb(wdb_path)

        # Same processing for both texts, in the original order.
        for text in (ibm_licence_text_full, ibm_licence_text):
            d = db_wrapper.get_readable_centroid(text)
            s = sorted(d.items(), key=lambda x: x[1], reverse=True)[:5]
            # Single-argument print(...) gives the same output as the Python 2
            # print statement and is also valid syntax on Python 3.
            print(s)
Esempio n. 5
0
    def test__many_articles(self):
        """Print the readable centroid computed for ``ibm_licence_text``.

        The dump ``many_articles_dump.xml`` is passed through
        ``wiki_knowledge.parse_dump``, a database wrapper is built from the
        parsed file using a ``PorterStemmer``, and the value returned by
        ``get_readable_centroid`` is printed. The test makes no assertion; it
        fails only when one of these calls raises.
        """
        wiki_dump_path = getInputFile("many_articles_dump.xml")
        parsed_xml_path = getOutputFile("many_articles_dump.parsed.xml")

        wiki_knowledge.parse_dump(wiki_dump_path, parsed_xml_path)

        db_wrapper = wiki_knowledge.build_database_wrapper(
            parsed_xml_path, PorterStemmer())

        c = db_wrapper.get_readable_centroid(ibm_licence_text)
        # print(c) works both as a Python 2 print statement with a
        # parenthesised expression and as the Python 3 print function.
        print(c)
Esempio n. 6
0
    def test__many_articles(self):
        """Print the top five centroid items for the full and the plain licence text.

        A db wrapper is loaded from ``many_articles_dump.wdb``. For each text
        the items of its readable centroid are sorted by value, largest first,
        and the first five are printed. There is no assertion; the test fails
        only if a call raises.
        """
        wdb_path = getInputFile("many_articles_dump.wdb")
        db_wrapper = wiki_knowledge.load_db_wrapper_from_wdb(wdb_path)

        # print(s) with one argument is valid on Python 2 and Python 3 alike
        # and prints the same text as the former ``print s`` statement.
        d = db_wrapper.get_readable_centroid(ibm_licence_text_full)
        s = sorted(d.items(), key=lambda x: x[1], reverse=True)[:5]
        print(s)

        d = db_wrapper.get_readable_centroid(ibm_licence_text)
        s = sorted(d.items(), key=lambda x: x[1], reverse=True)[:5]
        print(s)
Esempio n. 7
0
    def test__many_articles(self):
        """Check that "Computer" is among get_top(centroid, 5) for both licence texts."""
        wrapper = wiki_knowledge.load_db_wrapper_from_wdb(
            getInputFile("many_articles_dump.wdb"))

        # full licence text first
        centroid_full = wrapper.get_readable_centroid(ibm_licence_text_full)
        self.assertIn("Computer", dict(get_top(centroid_full, 5)))

        # then the other licence text
        centroid_plain = wrapper.get_readable_centroid(ibm_licence_text)
        self.assertIn("Computer", dict(get_top(centroid_plain, 5)))
Esempio n. 8
0
    def test__many_articles(self):
        """Both licence texts must have "Computer" as a key of their get_top(..., 5) result."""
        database_path = getInputFile("many_articles_dump.wdb")
        database = wiki_knowledge.load_db_wrapper_from_wdb(database_path)

        readable = database.get_readable_centroid(ibm_licence_text_full)
        best_entries = dict(get_top(readable, 5))
        self.assertIn("Computer", best_entries)

        readable = database.get_readable_centroid(ibm_licence_text)
        best_entries = dict(get_top(readable, 5))
        self.assertIn("Computer", best_entries)
Esempio n. 9
0
    def test_extract_pages(self):
        '''Regression check of the pages yielded by pt.iterate_wiki_pages.'''
        source_xml = getInputFile(FilesList.test__parse_tools)

        # one (id, title, length of wiki_text) tuple per page, in iteration order
        expected = [
            (243478, 'Ross Ice Shelf', 13734),
            (18798090, 'Southern Cross Expedition', 39110),
            (343246, 'Ice shelf', 8262),
        ]

        actual = []
        for page in pt.iterate_wiki_pages(source_xml):
            actual.append((page.id, page.title, len(page.wiki_text)))

        failure_text = "Assertion failure: \nActual={}\nExpected={}".format(actual, expected)
        self.assertSequenceEqual(actual, expected, failure_text)
Esempio n. 10
0
    def test_extract_pages(self):
        '''Compare (id, title, len(wiki_text)) of each iterated wiki page with known values.'''
        # Known-good values for the test__parse_tools input file.
        ross_ice_shelf = (243478, 'Ross Ice Shelf', 13734)
        southern_cross = (18798090, 'Southern Cross Expedition', 39110)
        ice_shelf = (343246, 'Ice shelf', 8262)
        expected = [ross_ice_shelf, southern_cross, ice_shelf]

        pages = pt.iterate_wiki_pages(getInputFile(FilesList.test__parse_tools))
        actual = [(p.id, p.title, len(p.wiki_text)) for p in pages]

        self.assertSequenceEqual(
            actual,
            expected,
            "Assertion failure: \nActual={}\nExpected={}".format(actual, expected))
Esempio n. 11
0
 def test_number_of_concepts(self):
     """Check that the db wrapper built from the parsed dump has 3 titles and 3 concepts.

     The dump ``wikidump_Knowledge_Love_War.xml`` is parsed, a database
     wrapper is built from the parsed file with an empty-list
     ``StopWordsStemmer``, and the sizes of ``title_index`` and
     ``concepts_index`` are each compared with 3.
     """

     _log.info('-'*80)

     # arrange
     dump_file = getInputFile("wikidump_Knowledge_Love_War.xml")
     parsed_file = getOutputFile("wikidump_Knowledge_Love_War.parsed.xml")

     # act
     wn.parse_dump(dump_file, parsed_file)
     db_wrapper = wn.build_database_wrapper(parsed_file, StopWordsStemmer([]))

     titles_count = len(db_wrapper.title_index)
     concepts_count = len(db_wrapper.concepts_index)

     # assert
     # Each failure message names the quantity it checks, so a concepts
     # mismatch is no longer reported as a titles mismatch.
     self.assertEqual(titles_count, 3, "number of titles should be 3, got {0}".format(titles_count))
     self.assertEqual(concepts_count, 3, "number of concepts should be 3, got {0}".format(concepts_count))
Esempio n. 12
0
    def test_number_of_concepts(self):
        """Check that the db wrapper built from the parsed dump has 3 titles and 3 concepts.

        The dump ``wikidump_Knowledge_Love_War.xml`` is parsed, a database
        wrapper is built from the parsed file with an empty-list
        ``StopWordsStemmer``, and the sizes of ``title_index`` and
        ``concepts_index`` are each compared with 3.
        """

        _log.info('-' * 80)

        # arrange
        dump_file = getInputFile("wikidump_Knowledge_Love_War.xml")
        parsed_file = getOutputFile("wikidump_Knowledge_Love_War.parsed.xml")

        # act
        wn.parse_dump(dump_file, parsed_file)
        db_wrapper = wn.build_database_wrapper(parsed_file,
                                               StopWordsStemmer([]))

        titles_count = len(db_wrapper.title_index)
        concepts_count = len(db_wrapper.concepts_index)

        # assert
        # Each failure message names the quantity it checks, so a concepts
        # mismatch is no longer reported as a titles mismatch.
        self.assertEqual(
            titles_count, 3,
            "number of titles should be 3, got {0}".format(titles_count))
        self.assertEqual(
            concepts_count, 3,
            "number of concepts should be 3, got {0}".format(concepts_count))
Esempio n. 13
0
    def test__parse_dump(self):
        """Smoke test: call wn.parse_dump on the test__parse_tools file; nothing is asserted."""
        fixture = io_tu.FilesList.test__parse_tools
        source_path = io_tu.getInputFile(fixture)
        target_path = io_tu.getOutputFile(fixture)

        wn.parse_dump(source_path, target_path)
Esempio n. 14
0
 def test__many_articles_files(self):
     """Check that "Computer" is a key of get_top(..., 5) for the file-based lookup."""
     top_entries = get_top(
         wiki_knowledge.get_value_from_file(
             getInputFile("many_articles_dump.wdb"),
             getInputFile("ibm_licence.txt")),
         5)
     self.assertIn("Computer", dict(top_entries))
Esempio n. 15
0
 def test__parse_dump(self):
     """Run wn.parse_dump from the input path to the output path of test__parse_tools."""
     # No assertion follows: the test passes as long as parse_dump does not raise.
     wn.parse_dump(
         io_tu.getInputFile(io_tu.FilesList.test__parse_tools),
         io_tu.getOutputFile(io_tu.FilesList.test__parse_tools))
Esempio n. 16
0
 def test__many_articles_files(self):
     """Values read via get_value_from_file must have "Computer" among get_top(..., 5)."""
     database_file = getInputFile("many_articles_dump.wdb")
     licence_file = getInputFile("ibm_licence.txt")

     values = wiki_knowledge.get_value_from_file(database_file, licence_file)
     best = dict(get_top(values, 5))

     self.assertIn("Computer", best)