def as_treetagger_corpus(orig_path, dest_path, encoding='latin-1', language='english'):
    '''Convert the corpus at orig_path into a new corpus at dest_path.

    Every document of the source corpus is passed through
    as_treetagger_doc (with the given encoding and language) and the
    result is stored under the same document id in the destination corpus.

    orig_path -- path of the corpus to read.
    dest_path -- path of the corpus to write; must differ from orig_path.
    encoding  -- forwarded unchanged to as_treetagger_doc.
    language  -- forwarded unchanged to as_treetagger_doc.

    Raises AssertionError when both paths are equal.
    '''
    # Source and destination must be distinct corpora. The destination is
    # named dest_path here (not eng_path as in the sibling function).
    assert (orig_path != dest_path)
    orig = PyCorpus(orig_path)
    dest = PyCorpus(dest_path)
    # Batch all writes and commit once at the end.
    dest.autocommit(False)
    for doc_id in orig.keys():
        dest[doc_id] = as_treetagger_doc(orig[doc_id],
                                         encoding=encoding,
                                         language=language)
    dest.commit()
    orig.close()
    dest.close()
def as_eng_postagged_corpus(orig_path, eng_path):
    '''Build a POS-tagged copy of the corpus at orig_path in eng_path.

    Each document is converted with as_eng_postagged_doc (nltk default
    tagger) and stored under its original id. The two paths must differ.
    '''
    assert orig_path != eng_path
    source = PyCorpus(orig_path)
    tagged = PyCorpus(eng_path)
    # Writes are batched and committed once after the loop.
    tagged.autocommit(False)
    for key in source.keys():
        document = source[key]
        tagged[key] = as_eng_postagged_doc(document)
    tagged.commit()
    source.close()
    tagged.close()
def as_t3corpus(orig_path, t3_path):
    '''Convert the corpus at orig_path into a t3mesta corpus at t3_path.

    Documents whose key is already present in the destination corpus are
    left untouched; every other document is converted with as_t3doc.
    '''
    source = PyCorpus(orig_path)
    target = PyCorpus(t3_path)
    target.autocommit(False)
    # Snapshot of the keys present before conversion starts.
    already_converted = set(target.keys())
    for doc_key in source.keys():
        if doc_key in already_converted:
            continue
        target[doc_key] = as_t3doc(source[doc_key])
    target.commit()
    source.close()
    target.close()
def parse_plain_corpus(plainpath, corpuspath):
    '''Load a plain-text file of documents into the corpus at corpuspath.

    The file at plainpath is read as UTF-8 and split into documents on
    blank lines. Within each document the first line is the title, which
    becomes the corpus key; the remaining lines are the contents, which
    are handed to parse_plain_doc_from_stream as a UTF-8 decoding stream.
    Chunks that are empty after stripping (e.g. produced by trailing blank
    lines) are skipped instead of being stored under an empty title.

    plainpath  -- path of the UTF-8 plain-text input file.
    corpuspath -- path of the corpus to write.
    '''
    # Read the input first, inside a context manager, so the file handle
    # is always closed and no corpus is opened if the read fails.
    with codecs.open(plainpath, 'rb', 'utf-8') as plain_file:
        data = plain_file.read()
    # Optional trailing whitespace followed by a blank line separates
    # documents. The whitespace class must be written as \s: a bare "s"
    # would swallow a literal trailing "s" of the preceding word.
    docs = re.split(r'\s*?\r?\n\r?\n', data)
    data = None  # drop the reference to the whole-file string

    corpus = PyCorpus(corpuspath)
    # Batch all writes and commit once at the end.
    corpus.autocommit(False)
    for doc in docs:
        doc = doc.strip()
        if not doc:
            continue
        lines = re.split(r'\r?\n', doc)
        title = lines[0].strip()
        contents = '\n'.join(lines[1:]).strip()
        # parse_plain_doc_from_stream is given a decoding reader over the
        # UTF-8 encoded contents, as in the original implementation.
        text_stream = cStringIO.StringIO(contents.encode('utf-8'))
        utf8_stream = codecs.getreader('utf-8')(text_stream)
        corpus[title] = parse_plain_doc_from_stream(utf8_stream)
    corpus.commit()
    corpus.close()