import codecs
import cStringIO
import os
import sys


def parse_wikipedia(path, corpus_path):
    '''Parse a WikiExtractor.py output tree into a corpus.

    path - the directory containing the documents extracted by
           WikiExtractor.py.
    corpus_path - the filename in which to store the parsed corpus.
    '''
    # PyCorpus and parse_plain_doc_from_stream are referenced here but
    # defined elsewhere in the codebase.
    corpus = PyCorpus(corpus_path)

    def from_path(path):
        # Recursively walk the extraction directory, parsing every file.
        sys.stderr.write('Processing path ' + path + '\n')
        for f in os.listdir(path):
            newpath = os.path.join(path, f)
            if os.path.isdir(newpath):
                from_path(newpath)
            else:
                sys.stderr.write('Processing file ' + newpath + '\n')
                get_documents(newpath)

    def get_documents(path):
        # Each extracted file holds a sequence of
        # <doc id="..." url="..." title="..."> ... </doc> blocks.
        f = codecs.open(path, 'r', 'utf-8')
        contents = f.read()
        f.close()
        for text in contents.split('<doc id="'):
            text = text.strip()
            if len(text) < 1:
                continue
            # Extract the document parts: id, title, and body text.
            doc_id = int(text[:text.index('"')])
            title_start = text.index('title="') + 7
            title = text[title_start:text.index('"', title_start)]  # currently unused
            text = text[text.index('\n'):text.index('</doc>')].strip()
            # Re-encode the body as a UTF-8 stream for the parser.
            text_stream = cStringIO.StringIO(text.encode('utf-8'))
            utf8_stream = codecs.getreader('utf-8')(text_stream)
            corpus[str(doc_id)] = parse_plain_doc_from_stream(utf8_stream)

    from_path(path)
    corpus.sync()
    return corpus
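
# Usage sketch: a minimal driver for parse_wikipedia. The paths
# 'extracted/' and 'wiki.corpus' are hypothetical examples, and the
# document id '12' in the lookup is illustrative only; the corpus is
# assumed to be dict-like because the function above stores parsed
# documents under str(doc_id) keys.
if __name__ == '__main__':
    corpus = parse_wikipedia('extracted/', 'wiki.corpus')
    # Look up one parsed document by its Wikipedia page id (as a string).
    doc = corpus['12']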