Example #1
def find_route(text,
               objects_dict,
               duplicates_filtering_window=0,
               far_objects_filtering_dist=0.0,
               splitting_min_dist=None):
    pos = 0
    morf = morfeusz2.Morfeusz()
    sets = prepare_text(text, morf)
    route = []
    while pos < len(sets):
        matches = find_next_object(sets, pos, objects_dict)
        if len(matches) == 0:
            pos += 1
            continue
        objects, lengths, positions = zip(*matches)
        max_length = max(lengths)
        pos += max_length
        route.append([(m[0], m[2]) for m in matches if m[1] == max_length])
    route = swap_elements(route)
    route = filter_variants(route)
    remove_needless_fields(route)
    if far_objects_filtering_dist > 0:
        route = filter_far_objects(route, far_objects_filtering_dist)
    if duplicates_filtering_window != 0:
        route = filter_duplicates(route, duplicates_filtering_window)
    route = filter_duplicates(route, 1)
    if splitting_min_dist is not None:
        route = split_route(route, splitting_min_dist)
    return route
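The helper functions used above (prepare_text, find_next_object, swap_elements and the filters) are internal to this project. As a rough, self-contained sketch of what the while loop does, here is the same greedy longest-match scan over stub data (every name and value below is hypothetical):

def longest_match_scan(tokens, match_fn):
    # match_fn(tokens, pos) returns (object_name, match_length, position) tuples
    pos, route = 0, []
    while pos < len(tokens):
        matches = match_fn(tokens, pos)
        if not matches:
            pos += 1
            continue
        max_length = max(length for _, length, _ in matches)
        pos += max_length
        route.append([(name, p) for name, length, p in matches
                      if length == max_length])
    return route

tokens = ['zamek', 'królewski', 'na', 'wawelu']

def demo_match(toks, pos):
    if toks[pos] == 'zamek':
        return [('Zamek Królewski', 2, pos), ('Zamek', 1, pos)]
    if toks[pos] == 'wawelu':
        return [('Wawel', 1, pos)]
    return []

print(longest_match_scan(tokens, demo_match))
# [[('Zamek Królewski', 0)], [('Wawel', 3)]]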
Example #2
    def __init__(self,
                 punfile="../data/punctation.txt",
                 stopfile="../data/stopwords.txt"):
        # Initialize Morfeusz
        if platform.system() == 'Windows':
            self.morf = morfeusz
        else:
            self.morf = morfeusz2.Morfeusz()
        # Initialize file paths
        self.__punfile = punfile
        self.__stopfile = stopfile
        self.tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')',
                                    re.UNICODE | re.VERBOSE | re.IGNORECASE)
        self.emoticon_re = re.compile(r'^' + emoticons_str + '$',
                                      re.UNICODE | re.VERBOSE | re.IGNORECASE)
        self.undef_re = re.compile(r'^' + regex_str[-1] + '$',
                                   re.UNICODE | re.VERBOSE | re.IGNORECASE)
        self.men_re = re.compile(r'^' + regex_str[2] + '$',
                                 re.UNICODE | re.VERBOSE | re.IGNORECASE)
        self.url_re = re.compile(
            r'(' + '|'.join([regex_str[1], regex_str[4]]) + ')',
            re.UNICODE | re.VERBOSE | re.IGNORECASE)
        # Load the list of punctuation characters from file
        with open(self.__punfile) as punf:
            self.punctation = ast.literal_eval(punf.read())
        # Load the list of stop words from file
        with open(self.__stopfile) as stopf:
            self.stop = ast.literal_eval(stopf.read())
Example #3
    def __init__(self, nlp):
        if imported_Morfeusz:
            self.morf = morfeusz2.Morfeusz(
                generate=False,
                whitespace=morfeusz2.KEEP_WHITESPACES,
                expand_tags=True)
        self.imported_Morfeusz = imported_Morfeusz
        self.nlp = nlp
        self.toygger = Toygger()
        self.vocab = self.nlp.vocab

        # reserve a custom attribute for storing morphological features
        Token.set_extension("feats", default="")

        # map NKJP tags to UD POS
        self.tag_map = {
            'adj': 'ADJ',
            'adja': 'ADJ',
            'adjc': 'ADJ',
            'adjp': 'ADJ',
            'adv': 'ADV',
            'aglt': 'AUX',
            'bedzie': 'VERB',
            'brev': 'X',
            'burk': 'ADV',
            'comp': 'SCONJ',
            'conj': 'CCONJ',
            'depr': 'NOUN',
            'fin': 'VERB',
            'ger': 'NOUN',
            'imps': 'VERB',
            'impt': 'VERB',
            'inf': 'VERB',
            'interj': 'INTJ',
            'interp': 'PUNCT',
            'num': 'NUM',
            'numcol': 'NUM',
            'pact': 'VERB',
            'pant': 'VERB',
            'pcon': 'VERB',
            'ppas': 'VERB',
            'ppron12': 'PRON',
            'ppron3': 'PRON',
            'praet': 'VERB',
            'pred': 'VERB',
            'prep': 'ADP',
            'qub': 'PART',
            'siebie': 'PRON',
            'subst': 'NOUN',
            'winien': 'VERB',
            'xxx': 'X',
            # additional tags beyond NKJP
            'part': 'PART',  # particle
            'ign': 'X',
            'dig': 'NUM',
            'romandig': 'NUM',
            'frag': 'X',
            'pacta': 'VERB',
            'numcomp': 'NUM'
        }
Example #4
def Get_Case(keyword):
    ''' Return the grammatical case of the first interpretation of keyword '''
    morf = morfeusz2.Morfeusz()
    list_of_morphosyntactic_forms = morf.analyse(keyword)
    element = list_of_morphosyntactic_forms[0][2][2]
    case = element.split(':')[2].split('.')[0]
    return case
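The chained indexing above depends on the shape of analyse() output: a list of (start_node, end_node, interpretation) tuples, where each interpretation is (orth, lemma, tag, labels, qualifiers). A small sketch; the exact tag depends on the dictionary version:

import morfeusz2

morf = morfeusz2.Morfeusz()
# take the first interpretation of the first segment, as Get_Case does
start, end, (orth, lemma, tag, labels, qualifiers) = morf.analyse('kotem')[0]
print(tag)                # e.g. 'subst:sg:inst:m2'
print(tag.split(':')[2])  # 'inst', the grammatical case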
Example #5
def count_nouns(titles: list):
    """Return dictionary of occurences of nouns(nominative) in article titles"""
    morf = morfeusz2.Morfeusz()
    unique_list: list = []

    for title in titles:
        sentence_analysis: list = morf.analyse(title)

        for idx in range(len(sentence_analysis)):
            list_matching_words = [item for item in sentence_analysis if item[0] == idx]
            unique_list.append(list_matching_words)

    words_to_check: list = []
    for unique_word in unique_list:

        for item in unique_word:
            compatible_types: bool = any(elem in item[-1][3]
                                         for elem in ['nazwisko', 'imiona', 'imię', 'nazwa_geograficzna'])

            if (('subst:sg:nom:f' in item[-1][2] or 'subst:sg:nom:m' in item[-1][2]
                 or 'subst:pl:nom:f' in item[-1][2] or 'subst:pl:nom:m' in item[-1][2])
                    and not (compatible_types or ':' in item[-1][1])):

                words_to_check.append(item[-1][1])

    results: dict = {}
    for word in words_to_check:
        results.setdefault(word, 0)
        results[word] += 1

    return results
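A hedged usage sketch. The exact counts depend on the dictionary version and on tag expansion: with default settings, syncretic cases may come back as 'nom.acc', which the substring checks above would not match.

titles = ['Kot w butach', 'Zemsta nietoperza']
print(count_nouns(titles))  # e.g. {'kot': 1, 'zemsta': 1, ...}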
Example #6
    def to_lemmas(words: List[str]) -> List[str]:

        w_str = " ".join(words)
        morf = morfeusz2.Morfeusz()  # (praet='composite')
        analysis = morf.analyse(w_str)

        prev = None
        result = []

        # keep only the first interpretation of each segment
        for i, j, (orth, base, tag, posp, kwal) in analysis:
            if i == prev:
                continue
            prev = i
            # strip homonym discriminators such as 'kot:s1'
            if ':' in base:
                result.append(re.findall('(.*):', base)[0])
            else:
                result.append(base)
        return result
Example #7
def stem2(text):
    if type(text) is str:
        morf = morfeusz2.Morfeusz()
        result_words = []
        for word in text.split(' '):
            analysis = morf.analyse(word.decode('utf-8'))
            for interpretation in analysis:
                result_words.append(interpretation[2][1].encode('utf-8'))
        return ' '.join(result_words)
    return text
Example #8
    def infinitive_of_word(self, word):
        morf = morfeusz2.Morfeusz()
        analysis = morf.analyse(word)

        # len == 1 took analysis[0], len == 2 took analysis[1], and the
        # fallback took analysis[len - 1]: every branch is the last item
        return analysis[-1][2][1]
Example #9
    def __init__(self, nlp):
        self.nlp = nlp
        try:
            self.nlp.tokenizer.morf.generate("")
        except RuntimeError:
            # morfeusz does not have the generator dictionary loaded
            self.nlp.tokenizer.morf = morfeusz2.Morfeusz(
                expand_tags=True,
                whitespace=morfeusz2.KEEP_WHITESPACES,
                generate=True)
        self.morf = self.nlp.tokenizer.morf
Example #10
def print_interpretation(df):
    morf = morfeusz2.Morfeusz()

    for line in df['text']:
        if type(line) is str:
            for word in line.split(' '):
                print("-----TEXT: ", word)
                analysis = morf.analyse(word.decode('utf-8'))
                for interpretation in analysis:
                    print('-----INTERPRETATION: ',
                          interpretation[2][1].encode('utf-8'))
Example #11
def get_morfeusz():
    import morfeusz2
    morf = morfeusz2.Morfeusz(
        analyse=True,       # load the analyser dictionary
        generate=False,     # don't load the generator dictionary
        expand_tags=True,   # expand tags (return tags without dots)
        aggl='isolated',    # token 'm' gets an aglt interpretation, token 'np' a brev one
        praet='composite',  # aglt segments and 'by' are not split off
        # whitespace=morfeusz2.KEEP_WHITESPACES
    )
    return morf
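A brief usage sketch for the factory above; the segmentation shown in the comment is illustrative and depends on the dictionary version:

morf = get_morfeusz()
for start, end, (orth, lemma, tag, labels, qualifiers) in morf.analyse('miałem'):
    print(start, end, orth, lemma, tag)
# with praet='composite', 'miałem' should stay one segment instead of
# being split into 'miał' + 'em'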
Example #12
    def __init__(self):
        self._morf = morfeusz2.Morfeusz(
            dict_path=f'{base_dir}/third parties/morfeusz2-dictionary-polimorf',
            dict_name="polimorf")

        self._base_form_extension = None
        self.reset_base_form_extension()

        self._base_form_removals = None
        self.reset_base_form_removals()

        self._reinterpret_mapping = None
        self.reset_reinterpret_mapping()
Example #13
class MorfeuszAnalyser:
    morf = morfeusz2.Morfeusz(generate=False)
    tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')

    def __init__(self,
                 ngram_range=(1, 3),
                 split_to_sentences=True,
                 use_multiprocessing=True):
        self.ngram_range = ngram_range
        self.split_to_sentences = split_to_sentences
        self.use_multiprocessing = use_multiprocessing

        with open('polish_stopwords.txt') as f:
            self.stop_words = [x.strip() for x in f]

        self.ignore_tags = ['interp', 'interj', 'part', 'conj', 'comp', 'pred']

    def _analyse(self, text):
        analysis = [
            x for x in self.morf.analyse(text)
            if x[2][1].split(':')[0] not in self.stop_words
            and x[2][2] not in self.ignore_tags
        ]
        org_tokens = []
        lem_tokens = []
        curr_index = -1

        for word_index, _, tup in analysis:
            if curr_index == word_index:
                continue

            curr_index = word_index
            org_tokens.append(tup[0])
            lem_tokens.append(tup[1].split(':')[0])

        return lem_tokens + ngrams(org_tokens, self.ngram_range)

    def __call__(self, text):
        if self.split_to_sentences:
            sentences = self.tokenizer.tokenize(text)
            if self.use_multiprocessing:
                with Pool(cpu_count() - 1) as p:
                    sentence_tokens = p.map(self._analyse, sentences)
            else:
                sentence_tokens = [
                    self._analyse(sentence) for sentence in sentences
                ]

            return [token for tokens in sentence_tokens for token in tokens]
        else:
            return self._analyse(text)
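The multiprocessing branch fans sentences out to a pool of workers. A minimal standalone sketch of that pattern, with a hypothetical stand-in for _analyse:

from multiprocessing import Pool, cpu_count

def fake_analyse(sentence):  # hypothetical stand-in for MorfeuszAnalyser._analyse
    return sentence.lower().split()

if __name__ == '__main__':
    sentences = ['Ala ma kota.', 'Kot ma Alę.']
    with Pool(max(1, cpu_count() - 1)) as p:
        sentence_tokens = p.map(fake_analyse, sentences)
    print([token for tokens in sentence_tokens for token in tokens])
    # ['ala', 'ma', 'kota.', 'kot', 'ma', 'alę.']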
Example #14
def stem(text):
    if type(text) is str and ' ' in text:
        morf = morfeusz2.Morfeusz()
        result_words = []
        for word in text.split(' '):
            try:
                analysis = morf.analyse(word.decode('utf-8'))
                if len(analysis) > 0:
                    result_words.append(analysis[0][2][1].encode('utf-8'))
            except Exception:
                result_words.append(word)
        result = ' '.join(result_words)
        return result
    else:
        return text
Example #15
def preprocess_sents(sents, stop_words):
    morf = morfeusz2.Morfeusz(generate=False)
    res = []

    for sent in sents:
        analysis = morf.analyse(sent)
        brief_list = [
            next(t) for _, t in itertools.groupby(analysis, lambda x: x[0])
        ]
        words = list(
            filter(lambda x: x.isalpha() and x not in stop_words,
                   map(lambda x: x[2][1].lower(), brief_list)))
        if len(words) > 0:
            res.append(' '.join(words))

    return res
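analyse() returns every interpretation of every segment, so consecutive items can share a start node; the groupby/next idiom above keeps only the first interpretation per segment. A sketch over stub tuples:

import itertools

analysis = [(0, 1, ('Ala', 'Ala', 'subst:sg:nom:f')),
            (0, 1, ('Ala', 'Alo', 'subst:sg:nom:m1')),
            (1, 2, ('ma', 'mieć', 'fin:sg:ter:imperf'))]
brief_list = [next(t) for _, t in itertools.groupby(analysis, lambda x: x[0])]
print([item[2][1] for item in brief_list])  # ['Ala', 'mieć']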
Example #16
def main(file: str):
    morf = morfeusz2.Morfeusz()

    with open(file, 'r') as f:
        text = f.read()

    result = morf.analyse(text)
    line = []
    node = 0
    for word in result:
        if word[0] != node:
            print(", ".join(line))
            line = []
            node = word[0]

        line.append(f"{word[2][1]}:{word[2][2]}")

    if line:  # flush the last segment, which the loop never printed
        print(", ".join(line))
Example #17
    def lemmatize(self, text):
        """Find the lemmas of the words in the given text. Returns a
        dictionary mapping each form occurring in the text to the set
        of its possible lemmas.
        """
        morf = morfeusz2.Morfeusz(whitespace=morfeusz2.SKIP_WHITESPACES,
                                  generate=False)
        analysis = morf.analyse(text)
        pairs = [
            (
                lemm[2][0],  # form occurring in the text
                lemm[2][1].split(":")[0],  # lemma
            ) for lemm in analysis
        ]
        lemmas = collections.defaultdict(set)
        for key, val in pairs:
            lemmas[key].add(val)
        return lemmas
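The defaultdict(set) step is what turns ambiguous analyses into a form-to-lemmas mapping. A standalone sketch with illustrative pairs ('ma' can be a form of both 'mieć' and 'mój'):

import collections

pairs = [('ma', 'mieć'), ('ma', 'mój'), ('kota', 'kot')]
lemmas = collections.defaultdict(set)
for form, lemma in pairs:
    lemmas[form].add(lemma)
print(dict(lemmas))  # {'ma': {'mieć', 'mój'}, 'kota': {'kot'}}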
Example #18
    def __init__(self):
        self.column_names = [
            "Wykładnik formy", "Lemat", "Znacznik morfosyntaktyczny",
            "Klasyfikacja nazw własnych", "Kwalifikatory"
        ]
        self.VERBS_SYMBOLS = ["fin", "praet"]
        self.NOUNS_SYMBOLS = ["subst", "depr"]
        self.ADJECTIVES_SYMBOLS = ["adj", "adja", "adjp"]
        self.GRADES_OF_ADJECTIVES = {
            "equal": "pos",
            "higher": "com",
            "top": "sup"
        }
        self.MORPHOSYNTACTIC_MARKER = "Znacznik morfosyntaktyczny"
        self.morfeusz_object = morfeusz2.Morfeusz(praet='composite')
        self.nouns = []
        self.adjectives = []
        self.verbs = []
Example #19
def tokenize_and_lemmatize(text):
    return_word_list = []
    next_word = 0
    try:
        for list_of_tuples in morfeusz2.Morfeusz().analyse(str(text)):
            morf_actual_word = list_of_tuples[0]

            # skip alternative interpretations of segments already consumed
            if next_word > morf_actual_word:
                continue

            next_word = list_of_tuples[1]
            analyse_tuple = list_of_tuples[2]
            return_word_list.append((str(analyse_tuple[1])).lower())
    except Exception:
        print("Error:", text)

    return return_word_list
Example #20
def parseString(ciag, pretty):
    ciagU = ciag.decode('utf8')
    objMorf = morfeusz2.Morfeusz()
    if pretty:
        print "So pretty"
    else:
        out = objMorf.analyse(ciagU)
        print out
        for wyraz in out:
            print "------------------------------"
            print "Word:	%s" % wyraz[2][0]
            print "Lexeme:	%s" % wyraz[2][1]
            print "Notes:	%s" % wyraz[2][3]
            print "Notes2:	%s" % wyraz[2][4]
            print "Morpho:	%s" % wyraz[2][2]
            for el in wyraz[2][2].split(":"):
                print el
Example #21
def prepare_objects(terms):
    morf = morfeusz2.Morfeusz()
    print(morf.dict_id())
    prepared_objects = []
    for term in terms:
        words = term['name'].split(' ')
        words_results = [morf.analyse(w) for w in words]
        prepared_words = []
        for word_result in words_results:
            info = process_word(word_result)
            prepared_result = [(w[1], w[2].split(':')[0], w[3]) for w in info]
            forms = set([r[0].split(':')[0].lower() for r in prepared_result])
            prepared_words.append(
                (forms, any([is_sufficient(t) for t in prepared_result])))
        prepared_objects.append({
            'name': term['name'],
            'keywords': prepared_words,
            'type': term['type'],
            'coords': (term['latitude'], term['longitude'])
        })
    return prepared_objects
Example #22
def Decline_Noun(keyword, case):
    ''' Decline a noun: return its singular form in the given case '''
    result = ""
    morf = morfeusz2.Morfeusz()
    list_of_morphosyntactic_forms = morf.generate(keyword)
    # Syncretic accusative forms may be tagged jointly with the nominative
    # ('nom.acc'), hence the extra marker for 'acc'.
    markers = {
        'nom': ("subst:sg:nom",),
        'gen': ("subst:sg:gen",),
        'dat': ("subst:sg:dat",),
        'acc': ("subst:sg:nom.acc", "subst:sg:acc"),
        'inst': ("subst:sg:inst",),
        'loc': ("subst:sg:loc",),
    }
    for form in list_of_morphosyntactic_forms:
        for element in form:
            if any(marker in element for marker in markers.get(case, ())):
                result = form[0]
    return result
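A hedged usage sketch; generate() returns (form, lemma, tag, labels, qualifiers) tuples, and the exact forms depend on the dictionary version:

print(Decline_Noun('kot', 'gen'))   # expected: 'kota'
print(Decline_Noun('kot', 'inst'))  # expected: 'kotem'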
Example #23
# -*- coding: utf-8 -*-
import pandas as pd
import morfeusz2
import pickle

morf = morfeusz2.Morfeusz()
data = pd.read_excel("wykaz_miejscowosci.xlsx")

values = data[u'Nazwa miejscowości '].values.tolist()
values_lexical = []

for value in values:
    try:
        value = value.lower()
        lexical = morf.analyse(value)
        lst_word = -1
        name = []

        for i in lexical:
            if i[0] == lst_word:
                continue

            lst_word = i[0]
            v = i[2][1]
            v = v.split(":")[0]
            v = v.lower()
            name.append(v)

        name = " ".join(name)
        if u"góra" in name:
            print(name)
Example #24
# encoding=utf8
import sys
from nltk.tokenize import word_tokenize
from tqdm import *
import morfeusz2
import os
import codecs
import json

reload(sys)
sys.setdefaultencoding('utf8')
if sys.platform == 'linux2':
    morfeusz = morfeusz2.Morfeusz()


class XmlParser:
    def get_stopwords(self):
        with open(self.stopwords_path) as f:
            words_list = f.readlines()[0]
            return words_list.split(", ")

    def __init__(self):
        self.stopwords_path = os.path.join(os.path.abspath('..'), "data",
                                           "stopwords.txt")
        self.stopwords = self.get_stopwords()
        self.special_char = "\'~*+§/\[](){}<>@=°„‚’\”&^|%_#-:;.!?,"
        self.xml_article_path = os.path.join(os.path.abspath('..'), "data",
                                             "wiki.xml")
        # self.articles_json_path = os.path.abspath('..') + '\\data\\articles.json'
Example #25
    * Paweł Płatek
"""

import argparse
import logging
import signal
import sys
from collections import defaultdict
from sys import exit
from typing import Callable, List, Optional, Set, Tuple

# http://morfeusz.sgjp.pl/download/
import morfeusz2  # type: ignore

# init morfeusz2 globally, because it is slow and leaks memory
morfeusz_analyser = morfeusz2.Morfeusz(whitespace=morfeusz2.KEEP_WHITESPACES)

# (start_segment, end_segment, (text_form, lemma, morphology marker, ordinariness, stylistic qualifiers))
Interpretation = Tuple[int, int, Tuple[str, str, str, List[str], List[str]]]
IsDiminutiveFunc = Callable[[str, List[Interpretation]], bool]
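
The Interpretation alias mirrors the tuples analyse() returns; a short sketch of unpacking one with the global analyser (output illustrative):

def show_interpretations(text: str) -> None:
    for start, end, (form, lemma, marker, ordinariness, qualifiers) in \
            morfeusz_analyser.analyse(text):
        print(start, end, form, lemma, marker)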

logging.basicConfig(format='%(message)s')
L = logging.getLogger(__name__)

def interrupt_handler(sig, frame):
    print('Exit')
    exit(0)
signal.signal(signal.SIGINT, interrupt_handler)

# http://www.ipipan.waw.pl/~wolinski/publ/znakowanie.pdf
GRAM_FLEX = defaultdict(lambda: 'nieznane', {
Example #26
    def __init__(self):
        self._analyzer = morfeusz2.Morfeusz()
Example #27
    def setUp(self):
        self.morfeusz = morfeusz2.Morfeusz()
Example #28
import collections
import functools

import morfeusz2
import matplotlib.pyplot as plt

KSIĄŻKA = 'jadro_ciemnosci.txt'
# HERE(3): enter the name of the file containing the book's text.

# Although the morfeusz2 library is usually used to analyse longer
# texts, here we use it only to analyse single words.
# Thanks to the praet='composite' parameter, past-tense and
# conditional forms are analysed as a single segment, and not
# as e.g. 'robił' + 'by' + 'm'.
MORFEUSZ = morfeusz2.Morfeusz(praet='composite')


def podaj_wyrazy(nazwa_pliku):
    # HERE(4): fill in according to the instructions.
    with open(nazwa_pliku, 'rt', encoding='utf-8') as plik:
        for czesc in plik.read().split():
            wyraz = czesc.strip(',.—;?!…:„”()*&-–/')
            if wyraz != '':
                yield wyraz


def wypisz_skrajne_znaki_wyrazów(nazwa_pliku):
    znaki = collections.Counter()
    for wyraz in podaj_wyrazy(nazwa_pliku):
        znaki[wyraz[0]] += 1
Example #29
def diminutive_probability(word: str, interpretation: Interpretation, allows_rerun: bool = True) -> float:
    """Returns probability of the word being diminutive, given its morphological interpretation.
    TODO: weights for sets of suffixes
    TODO: handle suffix combinations
    Args:
        word: word to check
        interpretation: one item from morfeusz2.analyse function
        allows_rerun: allows recursive calls to this function
    """
    _, _, word_morphology = interpretation
    text_form, lemma, morphology_marker, _, _ = word_morphology

    # remove "rozpodabniacze", because words can have completely different meanings
    # f.e. kot:s1 == animal, kot:s2 == young soldier
    lemma = lemma.split(':')[0]

    L.debug('Probability for `%s` (%s, %s, %s)',
            word, text_form, lemma, morphology_marker)

    # find word's part of speech
    is_noun = False
    is_adjective = False
    is_unknown = False

    # TODO: is the part of speech always the first field of the marker?
    marker = morphology_marker.split(':')[0]
    if GRAM_FLEX[marker] == 'rzeczownik':
        is_noun = True
    if GRAM_FLEX[marker] == 'przymiotnik':
        is_adjective = True
    if GRAM_FLEX[marker] == 'nieznane':
        is_unknown = True

    # sanity check
    if is_noun and is_adjective:
        L.warning('Strange, word `%s` is both noun and adjective', word)

    # results
    number_of_matches = 0
    number_of_checks = 0

    # general suffixes
    if is_noun or is_adjective or is_unknown:
        # Paweł Miczko
        number_of_checks += 1
        if has_diminutive_suffix(lemma, suf_miczko_general, 'Paweł Miczko'):
            number_of_matches += 1

    # noun only suffixes
    if is_noun:
        L.debug('    -> rzeczownik')
        # Długosz suffixes
        # find gender and grammatical number
        gender = None
        grammar_number = None
        subgender = None
        for marker_with_dots in morphology_marker.split(':'):
            for marker in marker_with_dots.split('.'):
                flex = GRAM_CATEGORY[marker]
                if flex == 'rodzaj':
                    gender = marker
                elif flex == 'liczba':
                    grammar_number = marker
                elif flex == 'przyrodzaj':
                    subgender = marker

        # suffixes valid for any gender/number
        suffixes_to_check = set()
        suffixes_to_check.update(suf_dlugosz_noun_other)

        # singular
        if grammar_number == 'sg':
            L.debug('        -> liczba pojedyncza')

            if gender:
                # masculine
                if gender.startswith('m'):
                    L.debug('        -> rodzaj męski')
                    suffixes_to_check.update(suf_dlugosz_noun_masculine)
                # feminine
                elif gender.startswith('f'):
                    L.debug('        -> rodzaj żeński')
                    suffixes_to_check.update(suf_dlugosz_noun_feminine)
                # neuter
                elif gender.startswith('n'):
                    L.debug('        -> rodzaj nijaki')
                    suffixes_to_check.update(suf_dlugosz_noun_neuter)
                # TODO: 'przymnogi' subgender, i.e. plural-like? Check word or lemma?
                elif gender.startswith('p'):
                    L.debug('        -> rodzaj przymnogi')
                    suffixes_to_check.update(
                        suf_dlugosz_noun_plural_and_plurale_tantum)

            # check the lemma, as it is always singular
            number_of_checks += 1
            if has_diminutive_suffix(lemma, suffixes_to_check, 'Długosz'):
                number_of_matches += 1

        else:
            # plural
            if grammar_number:
                L.debug('        -> liczba mnoga')
                suffixes_to_check.update(
                    suf_dlugosz_noun_plural_and_plurale_tantum)

            # plurale tantum
            elif subgender == 'pt':
                L.debug('        -> plurale tantum')
                suffixes_to_check.update(
                    suf_dlugosz_noun_plural_and_plurale_tantum)

            # check original word, not lemma, because lemma is singular
            number_of_checks += 1
            if has_diminutive_suffix(word, suffixes_to_check, 'Długosz'):
                number_of_matches += 1

            # run checks for pluralized lemma
            if allows_rerun and lemma.lower() != word.lower():
                L.debug('    -> re-running checks for lemma!')
                L.debug('~*' * 5)
                number_of_checks += 1
                morf = morfeusz2.Morfeusz(
                    whitespace=morfeusz2.SKIP_WHITESPACES)
                lemma_segments = morf.analyse(lemma)
                if is_diminutive(lemma, lemma_segments, allows_rerun=False):
                    number_of_matches += 1
                L.debug('~*' * 5)

        # Grzegorczykowa and Puzynina, Dobrzyński, Kaczorowska
        number_of_checks += 1
        if has_diminutive_suffix(lemma, suf_gpdk_noun, 'GPDK'):
            number_of_matches += 1

    # adjective only suffixes
    elif is_adjective:
        L.debug('    -> przymiotnik')
        # Grzegorczykowa
        number_of_checks += 1
        if has_diminutive_suffix(lemma, suf_grzeg_adjectives, 'Grzegorczykowa'):
            number_of_matches += 1

    # we care only about nouns and adjectives
    else:
        pass

    probability = 0.0
    if number_of_checks != 0:
        probability = float(number_of_matches) / number_of_checks
    L.debug('    -> probability: %f', probability)
    return probability
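has_diminutive_suffix and the suf_* suffix sets are defined elsewhere in this project; a hypothetical minimal sketch of such a check (the suffix subset below is illustrative, not the project's actual list):

def has_diminutive_suffix(word, suffixes, source_name):
    # hypothetical stand-in: true if the word ends with any known suffix;
    # the real helper presumably also logs which source list matched
    return any(word.lower().endswith(suffix) for suffix in suffixes)

suf_dlugosz_noun_masculine = {'ek', 'ik', 'czek', 'czyk'}  # illustrative subset
print(has_diminutive_suffix('kotek', suf_dlugosz_noun_masculine, 'Długosz'))  # True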
Example #30
        if ktext.tokens and ktext.tokens[-1].start_position == start_position and ktext.tokens[
            -1].end_position == end_position:
            ktext.tokens[-1].add_interpretation(kinterpretation)
        else:
            ktoken = KToken(form, space_before=None, start_offset=None, end_offset=None)
            ktoken.start_position = start_position
            ktoken.end_position = end_position
            ktoken.add_interpretation(kinterpretation)
            ktext.add_token(ktoken)
    return ktext


parser = ArgumentParser(description='Train')
parser.add_argument('jsonl_path', help='path to JSONL for getting text')
parser.add_argument('--dict_dir', default=None, help='path to directory with dict')
parser.add_argument('--dict_name', default=None, help='dict name')
parser.add_argument('output_path', help='path to merged JSONL')
args = parser.parse_args()

morfeusz = morfeusz2.Morfeusz(generate=False, expand_tags=True, dict_name=args.dict_name, dict_path=args.dict_dir)  # dict_name=None, dict_path=None
# example: --dict-dir /home/kwrobel/repos/poleval2020-task2/data/ --dict morfeusz-f19

with jsonlines.open(args.jsonl_path) as reader, jsonlines.open(args.output_path, mode='w') as writer:
    for data in reader:
        original_ktext = KText.load(data)
        text = original_ktext.text

        ktext = morfeusz_tokenize(text, original_ktext)
        ktext.fix_offsets2()
        writer.write(ktext.save())