def get_db_builder(dump_file, stemmer):
    """Build a DbBuilder populated from a raw Wikipedia dump.

    @param dump_file: path to the raw Wikipedia dump to read
    @param stemmer: stemmer instance handed to the DbBuilder
    @return: DbBuilder containing every cleaned document from the dump
    """
    builder = DbBuilder(stemmer)
    # (dump_file, keep_sections=False, keep_links=False)
    documents = parse_tools.iterate_wikidocs_from_dump(
        dump_file, cleaner_WikiExtractor
    )
    for document in documents:
        builder.add_document(document)
    return builder
def test__advanced_doc(self):
    """tests that new form of test equals to old one from test__advanced_doc"""
    reference_wf = simple_wf()
    db_builder = DbBuilder(StopWordsStemmer([]))
    for document in reference_wf.docs:
        db_builder.add_document(document)
    produced_wf = WorkFlow()
    db_builder.build(wf=produced_wf, normalization=False)
    # workaround to handle dimensions mismatch
    reference_wf.df_vec = matrix(reference_wf.df_vec)
    assert_allclose(
        produced_wf.df_vec.todense(), reference_wf.df_vec.todense())
    # NOTE: "wieghts_mat" is the attribute name as spelled by the project
    assert_allclose(
        produced_wf.wieghts_mat.todense(), reference_wf.wieghts_mat.todense())
def add_docs_from_parsed_xml_to_builder(parsed_dump, stemmer=None):
    """ builds WikiRep database.
    @param parsed_dump: Wikipedia parced xml (etc. wikiparsed.xml)
    @param stemmer: stemmer used by the builder; defaults to
        stemmers.StopWordsStemmer() when None
    @return: db_builder populated with every document in the parsed dump
    @raise Exception: if parsed_dump is not an existing file
    """
    _log.debug("-" * 80)
    _log.info("Building DB from parsed dump:{0}".format(parsed_dump))
    if stemmer is None:
        stemmer = stemmers.StopWordsStemmer()
    db_builder = DbBuilder(stemmer)
    if not os.path.isfile(parsed_dump):
        raise Exception(
            "Parsed dump doesnt exists: File {0}".format(parsed_dump))
    #TODO: add parsed page reader
    xml_pages = parse_tools.iterate_wiki_doc(parsed_dump)
    doc_count = 0
    for doc in xml_pages:
        doc_count += 1
        # BUG FIX: format string was "#{0}:{0}", which printed the count
        # twice and silently dropped doc.title
        _log.debug("Adding document #{0}:{1}".format(doc_count, doc.title))
        db_builder.add_document(doc)
    _log.info("Added #{0} documents".format(doc_count))
    return db_builder
def setUp(self):
    """Give each test a fresh DbBuilder backed by an empty stop-word stemmer."""
    self.db_builder = DbBuilder(StopWordsStemmer([]))