Ejemplo n.º 1
0
def analyze_tei_files(fns):
    """Parse each TEI file in *fns* and display its morphological analysis.

    Every document found in a file is analysed with guessing and
    disambiguation enabled, and its words/roots/postags/forms are shown
    as a dataframe.  Files that raise an AttributeError while parsing
    are skipped with a warning printed to stderr.
    """
    for path in fns:
        try:
            docs = parse_tei_corpus(path, target=get_target(path))
            for doc in docs:
                doc['guess'] = True
                doc['disambiguate'] = True
                show_df(doc.get.word_texts.roots.postags.forms.as_dataframe)
        except AttributeError:
            print('Warning: parse error: skipped: {0}'.format(path), file=sys.stderr)
Ejemplo n.º 2
0
def process(start_dir, out_dir, encoding=None):
    """Convert every TEI file found under *start_dir* to plain-text files.

    Only leaf directories (no subdirectories) that contain files are
    considered, and any path containing 'bin' is ignored.  Each parsed
    document is written to *out_dir* as ``<filename>_<doc_id>.txt``.
    A file whose ``_0.txt`` output already exists is skipped.
    """
    for dirpath, dirnames, filenames in os.walk(start_dir):
        # Skip non-leaf dirs, empty dirs and anything under a 'bin' path.
        is_skippable = len(dirnames) > 0 or len(filenames) == 0 or 'bin' in dirpath
        if is_skippable:
            continue
        for name in filenames:
            src_path = os.path.join(dirpath, name)
            out_prefix = os.path.join(out_dir, name)
            target = get_target(src_path)
            # The first document's output acts as a "done" marker.
            if os.path.exists(out_prefix + '_0.txt'):
                logger.info('Skipping file {0}, because it seems to be already processed'.format(src_path))
                continue
            logger.info('Processing file {0} with target {1}'.format(src_path, target))
            parsed = parse_tei_corpus(src_path, target=target, encoding=encoding)
            for idx, doc in enumerate(parsed):
                dest = '{0}_{1}.txt'.format(out_prefix, idx)
                logger.info('Writing document {0}'.format(dest))
                write_document(doc, dest)
Ejemplo n.º 3
0
def process(start_dir, out_dir, encoding=None):
    """Walk *start_dir* and write each TEI document as a text file in *out_dir*.

    Processes only leaf directories that actually contain files,
    excluding 'bin' paths; inputs whose ``<name>_0.txt`` output already
    exists are treated as done and skipped.
    """
    for dirpath, dirnames, filenames in os.walk(start_dir):
        if len(dirnames) > 0 or len(filenames) == 0 or 'bin' in dirpath:
            # Not a processable leaf directory.
            continue
        for fname in filenames:
            source = os.path.join(dirpath, fname)
            prefix = os.path.join(out_dir, fname)
            tgt = get_target(source)
            marker = prefix + '_0.txt'
            if os.path.exists(marker):
                logger.info(
                    'Skipping file {0}, because it seems to be already processed'
                    .format(source))
                continue
            logger.info('Processing file {0} with target {1}'.format(
                source, tgt))
            for n, document in enumerate(
                    parse_tei_corpus(source, target=tgt, encoding=encoding)):
                out_name = '{0}_{1}.txt'.format(prefix, n)
                logger.info('Writing document {0}'.format(out_name))
                write_document(document, out_name)
Ejemplo n.º 4
0
def simplifyRandom():
    """Sample random sentences from random 'korp/' files until one gets simplified.

    Repeatedly picks a random corpus file, then 5 random articles from it,
    then 10 random sentences per article, running the simplifier on each.
    When the simplifier's debug trace contains '__LIHTSUSTATUD__' (i.e. a
    sentence was actually simplified), prints a before/after report plus
    the number of sentences inspected so far, and returns.
    """
    inspected = 0
    while True:
        fname = 'korp/' + random.choice(os.listdir('korp'))
        documents = teicorpus.parse_tei_corpus(
            fname, target=["artikkel", "alaosa", "tervikteos"])
        for _ in range(5):
            article = random.choice(documents)
            sentences = article.sentence_texts
            for _ in range(10):
                sentence = random.choice(sentences)
                simplified, trace = syntaks.lihtsusta(sentence)
                inspected += 1
                if '__LIHTSUSTATUD__' not in trace:
                    continue
                print("_________________________")
                print("Esialgne lause\n", sentence)
                print("-------------------------")
                print("Lihtsustatud lause\n", simplified)
                print("_________________________")
                print(str(inspected) + ". lause, mida vaadati.")
                return
Ejemplo n.º 5
0
'''
Read a single A&A TEI corpus file, run morphological analysis and
named-entity tagging on it, then print the lemmas of the recognised
named entities.
'''

from estnltk.core import AA_PATH
from estnltk.teicorpus import parse_tei_corpora, parse_tei_corpus
from estnltk.corpus import *
from pprint import pprint

import os
import json

# read a single XML file from the A&A corpus directory
corp_path = os.path.join(AA_PATH, 'tea_AA_03_1.tasak.xml')
corpus = parse_tei_corpus(corp_path)

# do something with the corpora
from estnltk.corpus import Corpus
from estnltk.morf import PyVabamorfAnalyzer
from estnltk.ner import NerTagger

# ner tag the corpus: morphological analysis must run before NER,
# hence tagger(analyzer(corpus))
analyzer = PyVabamorfAnalyzer()
tagger = NerTagger()
corpus = tagger(analyzer(corpus))

# NOTE(review): FreqDist, pprint and json are imported but never used below
from nltk import FreqDist

# collect the lemma of every named entity found by the tagger
entities = [ne.lemma for ne in corpus.named_entities]
print (entities)
Ejemplo n.º 6
0
'''
Read a single A&A TEI corpus file, run morphological analysis and
named-entity tagging on it, then print the lemmas of the recognised
named entities.
'''

from estnltk.core import AA_PATH
from estnltk.teicorpus import parse_tei_corpora, parse_tei_corpus
from estnltk.corpus import *
from pprint import pprint

import os
import json

# read a single XML file from the A&A corpus directory
corp_path = os.path.join(AA_PATH, 'tea_AA_03_1.tasak.xml')
corpus = parse_tei_corpus(corp_path)

# do something with the corpora
from estnltk.corpus import Corpus
from estnltk.morf import PyVabamorfAnalyzer
from estnltk.ner import NerTagger

# ner tag the corpus: morphological analysis must run before NER,
# hence tagger(analyzer(corpus))
analyzer = PyVabamorfAnalyzer()
tagger = NerTagger()
corpus = tagger(analyzer(corpus))

# NOTE(review): FreqDist, pprint and json are imported but never used below
from nltk import FreqDist

# collect the lemma of every named entity found by the tagger
entities = [ne.lemma for ne in corpus.named_entities]
print(entities)