Esempio n. 1
0
def get_db_builder(dump_file, stemmer):
    """Create a ``DbBuilder`` and feed it every document read from a dump.

    The documents come from ``parse_tools.iterate_wikidocs_from_dump``,
    which is called with ``dump_file`` and ``cleaner_WikiExtractor``; they
    are added to the builder in the order in which they are yielded.

    @param dump_file: dump passed on to the wikidocs iterator
    @param stemmer: stemmer handed to the ``DbBuilder`` constructor
    @return: the ``DbBuilder`` after all documents have been added
    """
    builder = DbBuilder(stemmer)
    wikidocs = parse_tools.iterate_wikidocs_from_dump(dump_file,
                                                      cleaner_WikiExtractor)
    for wikidoc in wikidocs:
        builder.add_document(wikidoc)
    return builder
Esempio n. 2
0
    def test__advanced_doc(self):
        """Check that a workflow built by ``DbBuilder`` matches ``simple_wf()``.

        The documents of the reference workflow are fed one by one into a
        ``DbBuilder`` (with an empty stop-words stemmer), the builder output
        is written into a fresh ``WorkFlow`` without normalization, and both
        ``df_vec`` and ``wieghts_mat`` are compared against the reference.
        """
        reference_wf = simple_wf()

        db_builder = DbBuilder(StopWordsStemmer([]))
        for document in reference_wf.docs:
            db_builder.add_document(document)

        built_wf = WorkFlow()
        db_builder.build(wf=built_wf, normalization=False)

        # Workaround to handle a dimensions mismatch: re-wrap the reference
        # df vector with ``matrix`` before comparing.
        reference_wf.df_vec = matrix(reference_wf.df_vec)

        built_df = built_wf.df_vec.todense()
        reference_df = reference_wf.df_vec.todense()
        assert_allclose(built_df, reference_df)

        built_weights = built_wf.wieghts_mat.todense()
        reference_weights = reference_wf.wieghts_mat.todense()
        assert_allclose(built_weights, reference_weights)
Esempio n. 3
0
def add_docs_from_parsed_xml_to_builder(parsed_dump, stemmer=None):
    """ Builds a WikiRep database builder from a parsed Wikipedia dump.

        Every document yielded by ``parse_tools.iterate_wiki_doc`` for the
        dump is added to a new ``DbBuilder``; progress is written to ``_log``.

        @param parsed_dump: path to the parsed Wikipedia xml
                            (e.g. wikiparsed.xml)
        @param stemmer: stemmer handed to ``DbBuilder``; when None a
                        ``stemmers.StopWordsStemmer()`` is created
        @return: db_builder with all documents of the dump added
        @raise Exception: if ``parsed_dump`` is not an existing file
    """
    _log.debug("-" * 80)
    _log.info("Building DB from parsed dump:{0}".format(parsed_dump))

    if stemmer is None:
        stemmer = stemmers.StopWordsStemmer()
    db_builder = DbBuilder(stemmer)
    if not os.path.isfile(parsed_dump):
        raise Exception(
            "Parsed dump doesnt exists: File {0}".format(parsed_dump))

    #TODO: add parsed page reader
    xml_pages = parse_tools.iterate_wiki_doc(parsed_dump)
    # Pre-set so the summary below still works when the dump yields nothing.
    doc_count = 0
    for doc_count, doc in enumerate(xml_pages, 1):
        # The second placeholder must be {1}: with {0} twice the counter was
        # printed two times and the document title never reached the log.
        _log.debug("Adding document #{0}:{1}".format(doc_count, doc.title))
        db_builder.add_document(doc)
    _log.info("Added #{0} documents".format(doc_count))
    return db_builder
Esempio n. 4
0
 def setUp(self):
     """Store a new ``DbBuilder`` built on ``StopWordsStemmer([])`` as
     ``self.db_builder``."""
     self.db_builder = DbBuilder(StopWordsStemmer([]))