Example No. 1
def lemmatize(file, output_file):
    # Requires MorphoDiTa's Python bindings (ufal.morphodita: Tagger, Forms,
    # TaggedLemmas, TokenRanges) plus the project's open_gz helper and the
    # dir_cur model directory.
    morphodita_model = os.path.join(
        dir_cur, 'czech-morfflex-pdt-131112-raw_lemmas.tagger-best_accuracy')
    tagger = Tagger.load(morphodita_model)
    assert tagger
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    tokenizer = tagger.newTokenizer()
    assert tokenizer
    with open_gz(output_file, 'w') as out, open_gz(file) as f:
        for line in f:
            tokenizer.setText(line)
            while tokenizer.nextSentence(forms, tokens):
                tagger.tag(forms, lemmas)
                # Write each token as "<lemma>___<tag>"; the trailing space
                # keeps consecutive sentences on the same line separated.
                out.write(" ".join(
                    str(lem.lemma).strip() + '___' + str(lem.tag).strip()
                    for lem in lemmas) + ' ')
            out.write('\n')
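All of the snippets on this page rely on the project's open_gz helper, imported in the later examples as from synonyms.in_out.readers import open_gz; its implementation is not shown here. A minimal sketch of what such a helper might look like, assuming it only needs to open *.gz files through gzip in text mode and everything else with plain open():

import gzip


def open_gz(filename, mode='r', encoding='utf-8'):
    # Hypothetical stand-in for synonyms.in_out.readers.open_gz.
    if filename.endswith('.gz'):
        return gzip.open(filename, mode if 't' in mode else mode + 't',
                         encoding=encoding)
    return open(filename, mode, encoding=encoding)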
Example No. 2
def syn2_to_plain(filename,
                  filename_out,
                  keep_punctuation=True,
                  keep_tags=False,
                  raw=False):
    # Streams a SYN2-style XML corpus (xml.etree.ElementTree as ET) and writes
    # plain text. Each <s> element holds one token per line in
    # "form<TAB>lemma<TAB>tag" format, so split()[::3] keeps the surface forms.
    with open_gz(filename_out, 'w') as file, open_gz(filename,
                                                     'r',
                                                     encoding='utf-8') as f:
        root = ET.iterparse(f)
        for event, element in root:
            if element.tag == 'block':
                file.write('\n')
            if element.tag == 's':
                file.write(' '.join(element.text.split()[::3]))
                # The commented-out variant below honours the keyword flags
                # (raw forms, lemma+tag output, punctuation filtering):
                # for word in element.text.split('\n'):
                #     if word:
                #         word, lemma, tags = word.split('\t')
                #         if raw:
                #             file.write(word + ' ')
                #         elif keep_tags:
                #             file.write(word + ' ' + lemma + ' ' + tags + '\n')
                #         elif keep_punctuation or not tags.startswith('Z'):
                #             file.write(lemma + ' ')
                file.write('\n')
            element.clear()
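The commented-out block above documents the layout the active line depends on: each token inside an <s> element is one "form<TAB>lemma<TAB>tag" line, so taking every third whitespace-separated item recovers the surface forms. A small self-contained illustration (the sample sentence and tags are made up):

# Hypothetical <s> text in the vertical format parsed above:
# one token per line, columns are form, lemma, morphological tag.
sample = "Praha\tPraha\tNNFS1-----A----\nje\tbýt\tVB-S---3P-AA---\n"

# split() flattens the columns into [form, lemma, tag, form, lemma, tag, ...],
# so every third item is a surface form, which is what split()[::3] keeps.
print(' '.join(sample.split()[::3]))   # -> Praha je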
Example No. 3
def lemmatize_and_replace_entities(file, output_file):
    # Requires MorphoDiTa (ufal.morphodita) and NameTag (ufal.nametag)
    # Python bindings plus the project's open_gz helper and dir_cur path.
    nametag_model = os.path.join(dir_cur, 'czech-cnec2.0-140304.ner')
    morphodita_model = os.path.join(dir_cur,
                                    'czech-morfflex-131112.tagger-fast')
    tagger = Tagger.load(morphodita_model)
    assert tagger
    ner = Ner.load(nametag_model)
    assert ner
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    entities = NamedEntities()
    tokenizer = ner.newTokenizer()
    assert tokenizer
    with open_gz(output_file, 'w') as out, open_gz(file) as f:
        for line in f:
            tokenizer.setText(line)
            while tokenizer.nextSentence(forms, tokens):
                tagger.tag(forms, lemmas)
                ner.recognize(forms, entities)
                sorted_entities = sort_entities(entities)
                open_entities = []       # end token index of each open entity
                open_entities_type = []  # its entity type
                e = 0
                for i in range(len(tokens)):
                    lemma = lemmas[i]
                    token = tokens[i]
                    word = line[token.start:token.start + token.length]
                    # Open every entity that starts at this token.
                    while e < len(
                            sorted_entities) and sorted_entities[e].start == i:
                        open_entities.append(sorted_entities[e].start +
                                             sorted_entities[e].length - 1)
                        open_entities_type.append(sorted_entities[e].type)
                        e += 1
                    # Outside an entity write the lemma; inside one, write a
                    # placeholder carrying the types of all open entities.
                    if len(open_entities) == 0:
                        out.write(str(lemma.lemma) + ' ')
                    else:
                        out.write("@!ENT!%s " % ('!'.join(open_entities_type)))
                    # Close the entities that end at this token.
                    while open_entities and open_entities[-1] == i:
                        open_entities.pop()
                        open_entities_type.pop()
            out.write('\n')
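sort_entities is a project helper that is not shown on this page. The loop above only works if the recognised entities come back ordered by their start token, with outer entities before the nested ones they contain, so here is a plausible sketch under that assumption, using only the start, length and type attributes the loop already relies on:

def sort_entities(entities):
    # Hypothetical version of the project's helper: order entities by start
    # token and, on ties, longest span first, so nested entities are opened
    # outermost-first and closed in reverse order by the loop above.
    return sorted(entities, key=lambda entity: (entity.start, -entity.length))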
Example No. 4
def create(file):
    # Builds a word -> most frequent POS tag mapping from a file of
    # "word_T" tokens (T is a single-character tag), one token per line.
    tags = defaultdict(lambda: defaultdict(int))
    with open_gz(file) as f:
        for line in f:
            if line != '\n' and len(line) >= 3:
                word = line.strip()
                if word[-2] == '_':
                    p = word[-1]
                    w = word[:-2]
                    tags[w][p] += 1
    pos = POS()
    for word, t in tags.items():
        max_num = 0
        for tag, num in t.items():
            if num > max_num:
                # Keep only the single most frequent tag for this word.
                pos.word_pos[word] = tag
                max_num = num
    return pos
Example No. 5
from synonyms.in_out.readers import open_gz

__author__ = 'veselt12'
import argparse
from synonyms.in_out.utils import check_input_file_exists


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Computes counts and ppmi matrix for given corpus and dictionary")
    parser.add_argument('corpus', type=str, help='Corpus')
    parser.add_argument('word_count', type=int, help='Word count')
    parser.add_argument('postfix_length', type=int)
    parser.add_argument('output_file', type=str, help='Name of the output file')
    args = parser.parse_args()
    check_input_file_exists(args.corpus)

    max_count = args.word_count
    with open_gz(args.corpus) as input:
        word_count = 0
        file_count = 0
        line_1 = None
        end_of_file = False
        output = None
        for line in input:
            line = line.strip()
            if not output:
                output = open_gz(args.output_file + ('.%0' + str(args.postfix_length) + 'd') % file_count + '.gz', 'w')
            if not line:
                # The line was stripped above, so blank lines are empty strings.
                output.write('\n')
                continue
            words = len(line.split())
            word_count += words
            if line_1:
Example No. 6
#!/usr/bin/env python3
from synonyms.dictionary import Dictionary
from synonyms.in_out.readers import open_gz

__author__ = 'veselt12'
import argparse
from synonyms.in_out.utils import check_input_file_exists


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="TODO")
    parser.add_argument('input_file', type=str, help='Input file with corpus in plain text')
    parser.add_argument('dictionary', type=str, help='Input file with dictionary')
    parser.add_argument('output_file', type=str, help='Output file where filtered version of corpus will be stored')
    args = parser.parse_args()
    check_input_file_exists(args.input_file)
    check_input_file_exists(args.dictionary)
    dictionary = Dictionary(filename=args.dictionary)
    with open_gz(args.output_file, 'w+', encoding='utf-8') as w, open_gz(args.input_file, encoding='utf-8') as r:
        for line in r:
            w.write(' '.join([word for word in line.lower().split() if word in dictionary])+'\n')
Example No. 7
def load(filename):
    # Restores a POS instance whose word_pos mapping was dumped as JSON.
    pos = POS()
    with open_gz(filename) as f:
        pos.word_pos = json.load(f)
    return pos
Example No. 8
def save(self, filename):
    # Serialises the word_pos mapping as JSON through open_gz.
    with open_gz(filename, 'w') as f:
        json.dump(self.word_pos, f)
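The create, load and save snippets above read like methods of a small POS helper class built around a word_pos dictionary. A minimal sketch of how they could fit together, assuming nothing beyond what the snippets themselves use:

class POS:
    # Hypothetical container implied by the create/load/save snippets:
    # word_pos maps each word to its most frequent POS tag.
    def __init__(self):
        self.word_pos = {}

    def tag_of(self, word, default=None):
        # Convenience lookup, not part of the original snippets.
        return self.word_pos.get(word, default)

Under that reading, create and load would be attached as staticmethods that return a populated instance, and save would be the instance method shown above.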