def analyze_tei_files(fns):
    """Parse each TEI file in *fns* and display a morphology table.

    For every document found in a file, guessing and disambiguation are
    switched on and a word/root/POS/form dataframe is shown. Files that
    fail to parse are reported on stderr and skipped.
    """
    for fn in fns:
        try:
            documents = parse_tei_corpus(fn, target=get_target(fn))
            for text in documents:
                # enable guessing and disambiguation before analysis
                text['guess'] = True
                text['disambiguate'] = True
                table = text.get.word_texts.roots.postags.forms.as_dataframe
                show_df(table)
        except AttributeError:
            print('Warning: parse error: skipped: {0}'.format(fn), file=sys.stderr)
def process(start_dir, out_dir, encoding=None):
    """Walk *start_dir* and convert every TEI file found in a leaf
    directory into per-document plain-text files under *out_dir*.

    A source file is skipped when its first output file already exists,
    so the conversion can be resumed after an interruption.
    """
    for dirpath, dirnames, filenames in os.walk(start_dir):
        # only leaf directories that actually contain files; skip 'bin' paths
        if dirnames or not filenames or 'bin' in dirpath:
            continue
        for name in filenames:
            src = os.path.join(dirpath, name)
            prefix = os.path.join(out_dir, name)
            target = get_target(src)
            if os.path.exists(prefix + '_0.txt'):
                logger.info('Skipping file {0}, because it seems to be already processed'.format(src))
                continue
            logger.info('Processing file {0} with target {1}'.format(src, target))
            parsed = parse_tei_corpus(src, target=target, encoding=encoding)
            for idx, doc in enumerate(parsed):
                out_path = '{0}_{1}.txt'.format(prefix, idx)
                logger.info('Writing document {0}'.format(out_path))
                write_document(doc, out_path)
def process(start_dir, out_dir, encoding=None):
    """Recursively convert TEI corpus files under *start_dir* into
    numbered plain-text files in *out_dir*.

    Files whose first output file already exists are treated as
    processed and left untouched.
    """
    for dirpath, dirnames, filenames in os.walk(start_dir):
        is_leaf_with_files = len(dirnames) == 0 and len(filenames) > 0
        if not is_leaf_with_files or 'bin' in dirpath:
            continue
        for fnm in filenames:
            full_fnm = os.path.join(dirpath, fnm)
            out_prefix = os.path.join(out_dir, fnm)
            target = get_target(full_fnm)
            first_out = out_prefix + '_0.txt'
            if os.path.exists(first_out):
                logger.info(
                    'Skipping file {0}, because it seems to be already processed'
                    .format(full_fnm))
                continue
            logger.info('Processing file {0} with target {1}'.format(
                full_fnm, target))
            docs = parse_tei_corpus(full_fnm, target=target, encoding=encoding)
            doc_id = 0
            for doc in docs:
                out_fnm = '{0}_{1}.txt'.format(out_prefix, doc_id)
                logger.info('Writing document {0}'.format(out_fnm))
                write_document(doc, out_fnm)
                doc_id += 1
def simplifyRandom():
    """Sample random sentences from random 'korp/' corpus files until one
    is actually simplified, then print a before/after comparison.

    Keeps a running count of how many sentences were examined and
    reports it when the first simplified sentence is found.
    """
    examined = 0
    while True:
        # pick a random corpus file and parse it
        corpus_path = 'korp/' + random.choice(os.listdir('korp'))
        documents = teicorpus.parse_tei_corpus(
            corpus_path, target=["artikkel", "alaosa", "tervikteos"])
        for _ in range(5):
            # pick a random article, then sample sentences from it
            article = random.choice(documents)
            sentences = article.sentence_texts
            for _ in range(10):
                sentence = random.choice(sentences)
                simplified, trace = syntaks.lihtsusta(sentence)
                examined += 1
                # the trace marks sentences that were actually changed
                if '__LIHTSUSTATUD__' in trace:
                    print("_________________________")
                    print("Esialgne lause\n", sentence)
                    print("-------------------------")
                    print("Lihtsustatud lause\n", simplified)
                    print("_________________________")
                    print(str(examined) + ". lause, mida vaadati.")
                    return
'''
Read in an A&A TEI corpus and perform some operations:
'''
from estnltk.core import AA_PATH
from estnltk.teicorpus import parse_tei_corpora, parse_tei_corpus
from estnltk.corpus import *
from pprint import pprint
import os
import json

# read a single XML file
corp_path = os.path.join(AA_PATH, 'tea_AA_03_1.tasak.xml')
corpus = parse_tei_corpus(corp_path)

# do something with the corpora
from estnltk.corpus import Corpus
from estnltk.morf import PyVabamorfAnalyzer
from estnltk.ner import NerTagger

# ner tag the corpus
# the analyzer output is piped straight into the NER tagger
analyzer = PyVabamorfAnalyzer()
tagger = NerTagger()
corpus = tagger(analyzer(corpus))

from nltk import FreqDist

# collect the lemma of every recognised named entity and print the list
entities = [ne.lemma for ne in corpus.named_entities]
# PEP 8 / consistency fix: no space before the call parentheses
print(entities)
'''
Read in a A&A TEI corpus and perform some operations:
'''
from estnltk.core import AA_PATH
from estnltk.teicorpus import parse_tei_corpora, parse_tei_corpus
from estnltk.corpus import *
from pprint import pprint
import os
import json

# read a single XML file
# AA_PATH is joined with a TEI XML filename, so it presumably names the
# directory holding the A&A corpus files — confirm against estnltk.core.
corp_path = os.path.join(AA_PATH, 'tea_AA_03_1.tasak.xml')
corpus = parse_tei_corpus(corp_path)

# do something with the corpora
from estnltk.corpus import Corpus
from estnltk.morf import PyVabamorfAnalyzer
from estnltk.ner import NerTagger

# ner tag the corpus
# the morphological analyzer's output is fed to the NER tagger
analyzer = PyVabamorfAnalyzer()
tagger = NerTagger()
corpus = tagger(analyzer(corpus))

from nltk import FreqDist

# collect the lemma of every named entity found and print the list
entities = [ne.lemma for ne in corpus.named_entities]
print(entities)