Ejemplo n.º 1
0
def as_treetagger_corpus(orig_path, dest_path, encoding='latin-1', language='english'):
    '''Convert the corpus at orig_path into a new corpus at dest_path.

    Every document of the source corpus is passed through
    as_treetagger_doc (with the given encoding and language) and stored
    in the destination corpus under the same document id.

    orig_path and dest_path must differ.
    '''
    # The destination is the ``dest_path`` parameter; the previous code
    # referred to an undefined ``eng_path`` name here and below.
    assert (orig_path != dest_path)
    orig = PyCorpus(orig_path)
    dest = PyCorpus(dest_path)
    # Disable per-assignment commits; a single commit is issued after the loop.
    dest.autocommit(False)
    for doc_id in orig.keys():
        dest[doc_id] = as_treetagger_doc(orig[doc_id], encoding=encoding, language=language)
    dest.commit()
    orig.close()
    dest.close()
Ejemplo n.º 2
0
def as_eng_postagged_corpus(orig_path, eng_path):
    '''Write a POS-tagged copy of the corpus at orig_path to eng_path.

    Each document is converted with as_eng_postagged_doc (per the original
    note, this uses the nltk default tagger) and stored under the same id.
    The two paths must differ.
    '''
    assert orig_path != eng_path
    source = PyCorpus(orig_path)
    tagged = PyCorpus(eng_path)
    # Batch all writes into the single commit issued after the loop.
    tagged.autocommit(False)
    for key in source.keys():
        document = source[key]
        tagged[key] = as_eng_postagged_doc(document)
    tagged.commit()
    source.close()
    tagged.close()
Ejemplo n.º 3
0
def as_t3corpus(orig_path, t3_path):
    '''Convert the corpus at orig_path into a t3mesta corpus at t3_path.

    Documents whose key is already present in the destination corpus are
    left untouched; only the missing ones are converted with as_t3doc.
    '''
    source = PyCorpus(orig_path)
    target = PyCorpus(t3_path)
    target.autocommit(False)

    # Snapshot of the keys the destination held before this run.
    already_present = set(target.keys())
    for doc_id in source.keys():
        if doc_id in already_present:
            continue
        target[doc_id] = as_t3doc(source[doc_id])

    target.commit()

    source.close()
    target.close()
Ejemplo n.º 4
0
def parse_plain_corpus(plainpath, corpuspath):
    '''Load a plain-text file of documents into the corpus at corpuspath.

    The file at plainpath is read as UTF-8 and split into documents on
    blank lines. Within each document the first line is used as the corpus
    key (title) and the remaining lines are parsed with
    parse_plain_doc_from_stream. Chunks that are empty after stripping
    (e.g. produced by several consecutive blank lines or a trailing blank
    line) are skipped.
    '''
    corpus = PyCorpus(corpuspath)
    # Read inside a ``with`` block so the file handle is closed promptly.
    with codecs.open(plainpath, 'rb', 'utf-8') as plain_file:
        data = plain_file.read()
    # ``\s`` (whitespace), not a literal "s": the old pattern 's*?...' ate
    # trailing "s" letters from the last word before a blank line.
    docs = re.split(r'\s*?\r?\n\r?\n', data)
    # Drop the reference to the full text; only the split chunks are needed.
    data = None
    corpus.autocommit(False)
    for doc in docs:
        doc = doc.strip()
        if not doc:
            # Nothing to store: avoid creating an entry with an empty title.
            continue
        lines = re.split('\r?\n', doc)
        title = lines[0].strip()
        contents = '\n'.join(lines[1:]).strip()
        # parse_plain_doc_from_stream is handed a UTF-8 decoding reader
        # wrapped around the encoded document body.
        text_stream = cStringIO.StringIO(contents.encode('utf-8'))
        utf8_stream = codecs.getreader('utf-8')(text_stream)
        corpus[title] = parse_plain_doc_from_stream(utf8_stream)
    corpus.commit()
    corpus.close()