Example #1
import codecs
import cStringIO
import os
import sys

def parse_wikipedia(path, corpus_path):
    '''path - the directory containing the documents extracted by
       WikiExtractor.py.
       corpus_path - the filename under which to store the parsed corpus.
    '''
    # PyCorpus and parse_plain_doc_from_stream are assumed to be defined
    # elsewhere in the surrounding project.
    corpus = PyCorpus(corpus_path)

    def from_path(path):
        # recursively walk the directory tree, parsing every regular file
        sys.stderr.write('Processing path ' + path + '\n')
        files = os.listdir(path)
        for f in files:
            newpath = os.path.join(path, f)
            if os.path.isdir(newpath):
                from_path(newpath)
            else:
                sys.stderr.write('Processing file ' + newpath + '\n')
                get_documents(newpath)

    def get_documents(path):
        # split one WikiExtractor output file into its <doc> entries and
        # store each parsed document in the corpus, keyed by document id
        with codecs.open(path, 'r', 'utf-8') as f:
            contents = f.read()
        doctexts = contents.split('<doc id="')

        for text in doctexts:
            text = text.strip()
            if not text:
                continue
            # extract the document parts
            doc_id = int(text[:text.index('"')])
            title  = text[text.index('title="')+7 : text.index('"', text.index('title="')+7)]
            # keep only the body: everything after the header line,
            # up to the closing </doc> tag
            text   = text[text.index('\n') : text.index('</doc>')].strip()

            # re-encode the body as a UTF-8 byte stream so the downstream
            # parser can consume it like a file object
            text_stream = cStringIO.StringIO(text.encode('utf-8'))
            utf8_stream = codecs.getreader('utf-8')(text_stream)
            corpus[str(doc_id)] = parse_plain_doc_from_stream(utf8_stream)

    from_path(path)

    corpus.sync()
    return corpus
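
For reference, WikiExtractor.py wraps each article in a <doc> element, which is what the string splitting above relies on. A typical entry looks roughly like this (attributes can vary between WikiExtractor versions):

<doc id="12" url="https://en.wikipedia.org/wiki?curid=12" title="Anarchism">
Anarchism
Anarchism is a political philosophy ...
</doc>

A minimal usage sketch, assuming the extracted files live under extracted/ and that PyCorpus supports dictionary-style reads as well as the writes shown above (both paths and the printed key are placeholders, not part of the original example):

corpus = parse_wikipedia('extracted/', 'wikipedia.corpus')
print corpus['12']   # the parsed document, keyed by its Wikipedia page id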