def prepare_text(text: str, morf: morfeusz2.Morfeusz) -> List[Set[str]]:
    """Analyse *text* with Morfeusz and return one set of lemmas per token.

    Each input token may have several morphological interpretations; all of
    its candidate lemmas are collected into one set. Functional parts of
    speech (conjunctions, particles, interpunction, ...) are skipped, and
    unknown words ('ign') are passed through the external ``stem`` fallback.

    Parameters:
        text: raw input text.
        morf: a ``morfeusz2.Morfeusz`` analyser instance.

    Returns:
        A list of sets of lowercase, alphanumeric-only lemmas, one set per
        surviving input token.
    """
    analysed_text = morf.analyse(text)
    # Non-lexical parts of speech to drop entirely. A set gives O(1)
    # membership tests; the original list also contained 'interj' twice.
    skipped_pos = {
        'interj', 'conj', 'part', 'siebie', 'fin', 'bedzie', 'aglt',
        'impt', 'imps', 'inf', 'winien', 'pred', 'comp', 'interp',
    }
    pos = 0
    sets: List[Set[str]] = []
    current_set: Set[str] = set()
    for morf_tuple in analysed_text:
        part_of_speech = morf_tuple[2][2].split(':')[0]
        if part_of_speech in skipped_pos:
            continue
        # A new start position means we moved on to the next input token:
        # flush the lemma alternatives collected for the previous one.
        if morf_tuple[0] != pos:
            if current_set:
                sets.append(current_set)
                current_set = set()
            pos = morf_tuple[0]
        lemma = morf_tuple[2][1].split(':')[0]
        if part_of_speech == 'ign':
            # Unknown word: fall back to the external stemmer.
            lemma = stem(lemma)
        lemma = ''.join(c for c in lemma if c.isalnum())
        if lemma:
            current_set.add(lemma.lower())
    # BUG FIX: the original never appended the set for the last token,
    # so its lemmas were silently dropped from the result.
    if current_set:
        sets.append(current_set)
    return sets
def answer_sentiment_score(text: str,
                           sentiment_dict: Dict[str, int],
                           morf: morfeusz2.Morfeusz,
                           verbose=False):
    """Score the sentiment of *text* against *sentiment_dict*.

    The text is morphologically analysed with *morf*, reduced to a flat
    list of lemmas, and then scored via ``sentiment_score``.
    """
    morph_analysis = morf.analyse(text)
    return sentiment_score(lemmas_list(morph_analysis), sentiment_dict, verbose)
Exemple #3
0
class MorfeuszLemmatizer(object):
    """Lemmatizer backed by the Morfeusz morphological analyser."""

    def __init__(self):
        """Create the underlying Morfeusz analyser instance."""
        self.morf = Morfeusz()

    def lemmatize(self, form):
        """Return the base form from the first Morfeusz analysis of *form*.

        Returns ``None`` when Morfeusz yields no analysis at all.
        """
        first = next(iter(self.morf.analyse(form)), None)
        if first is None:
            return None
        _begin, _end, interp = first
        _wordform, baseform, _tags, _commonness, _qualifiers = interp
        return baseform
Exemple #4
0
def process_request(params):
    """Handle one Morfeusz web-service request described by *params*.

    Parses the request options, runs Morfeusz in either 'analyze' or
    'generate' mode, and returns a JSON-serialisable response dict with a
    'results' list plus version / dictionary metadata.

    NOTE(review): depends on MorfeuszOptionParser, tag_items and the
    AGGLUTINATION_RULES / PAST_TENSE_SEGMENTATION / *Handling constants
    defined elsewhere in this file — semantics assumed from usage here.
    """
    option_parser = MorfeuszOptionParser(params)
    # Map camelCase request parameter names onto Morfeusz's snake_case
    # constructor option names.
    option_parser.parse_bool('expandDag', 'expand_dag')
    option_parser.parse_bool('expandTags', 'expand_tags')
    option_parser.parse_bool('expandDot', 'expand_dot')
    option_parser.parse_bool('expandUnderscore', 'expand_underscore')
    option_parser.parse_string('agglutinationRules', 'aggl',
                               AGGLUTINATION_RULES)
    option_parser.parse_string('pastTenseSegmentation', 'praet',
                               PAST_TENSE_SEGMENTATION)
    option_parser.parse_enum('tokenNumbering', 'separate_numbering',
                             TokenNumbering, TokenNumbering.separate)
    option_parser.parse_enum('caseHandling', 'case_handling', CaseHandling)
    option_parser.parse_enum('whitespaceHandling', 'whitespace',
                             WhitespaceHandling)
    option_parser.parse_actions('action')

    # `results` is aliased inside `response`, so appending below fills
    # the response in place.
    results = []
    response = {'results': results}

    # On validation failure, validate() presumably records errors into
    # `response` and we return it without running Morfeusz.
    if option_parser.validate(response):
        option_parser.set_dictionary_path('MORFEUSZ_DICT_PATH')
        morfeusz = Morfeusz(**option_parser.get_opts())

        if option_parser.action == 'analyze':
            # analyse() may yield either flat interpretation tuples or
            # nested lists of them (DAG mode); mirror that nesting in
            # the results.
            for interp_list in morfeusz.analyse(option_parser.text):
                if isinstance(interp_list, list):
                    subitem = []
                    results.append(subitem)

                    for item in interp_list:
                        subitem.append(tag_items(item))
                else:
                    results.append(tag_items(interp_list))
        elif option_parser.action == 'generate':
            # Generation: one sub-list of inflected forms per input title.
            for title in option_parser.titles:
                subitem = []
                results.append(subitem)

                for interp_list in morfeusz.generate(title):
                    subitem.append(tag_items(interp_list))

        # Attach library/dictionary provenance metadata.
        response['version'] = morfeusz2.__version__
        response['dictionaryId'] = morfeusz.dict_id()
        response['copyright'] = morfeusz.dict_copyright()

    return response
Exemple #5
0
#! /usr/bin/python
# *-* coding: utf-8 *-*

from morfeusz2 import Morfeusz
from concraft_pl2 import Concraft, Server

# Demo: analyse a sentence with Morfeusz and disambiguate it with Concraft,
# via both the DAG-object API and the string-based API.
server = None
try:
    morfeusz = Morfeusz(expand_tags=True)
    server = Server(model_path="/home/kuba/work/ipipan/concraft/pre-trained/Sep-18/model-04-09-2018.gz", port=3001)
    concraft = Concraft(port=3001)

    # Disambiguation through the DAG object interface.
    dag = morfeusz.analyse(u'W Szczebrzeszynie chrząszcz brzmi w trzcinie.')
    res = concraft.disamb(dag)
    print(res)

    # Same sentence through the string-based interface.
    dag = morfeusz.analyse(u'W Szczebrzeszynie chrząszcz brzmi w trzcinie.')
    dag_str = concraft.dag_to_str(dag)
    dag_disamb_str = concraft.disamb_str(dag_str)
    print(dag_disamb_str)
finally:
    # BUG FIX: if Server(...) (or Morfeusz(...)) raised, `server` was never
    # bound and the original finally block crashed with NameError, masking
    # the real startup error. Only terminate a server that actually started.
    if server is not None:
        server.terminate()
Exemple #6
0
    r'<([a-z]*-[a-z]*)(\s+[a-z]*="\w*((\s+\w*)+)?")?>(.+?)<\/[a-z]*-[a-z]*>',
    letter_contents)
# print('one word tags')
# for tag in tags_word:
#     print(tag)
#     print('\n')
# print('two word tags')
# for tag in tags_words:
#     print(tag)
#     print('\n')


def remove_dashes(text):
    """Return *text* with every '-' character removed."""
    # str.replace does one C-level pass; the original built the result
    # with per-character string concatenation (quadratic in the worst case).
    return text.replace('-', '')


# Strip the markup from the first letter and join the remaining text
# fragments into a single plain-text string.
# NOTE(review): remove_tags and letters are defined earlier in the file,
# outside this excerpt.
letter1_no_tags = remove_tags(letters[0].contents)
letter1_nt_str = ' '.join(letter1_no_tags)
# letter_ntnd_str = remove_dashes(letter1_nt_str.decode('utf-8'))
# print(letter_ntnd_str)

# Morphologically analyse the cleaned letter text.
morf = Morfeusz()
# print(morf)
# NOTE(review): .decode('utf8') implies letter1_nt_str is a byte string,
# i.e. this snippet targets Python 2 — confirm before porting to Python 3.
print(letter1_nt_str.decode('utf8'))
letter1_analysed = morf.analyse(letter1_nt_str)
print(letter1_analysed)
Exemple #7
0
# Start from a clean slate: drop any output file left over from a
# previous run before regenerating it below.
if 'generated_with_lsa.txt' in listdir('.'):
    remove('generated_with_lsa.txt')

# Open the corpus as UTF-8 text. NOTE(review): corpus_filename is
# presumably defined earlier in the file — not visible in this excerpt.
f = codecs.open(corpus_filename, encoding='utf-8')

# main cycle
for raw_line in f:
    sentence = re.split('\W+', raw_line.lower(), flags=re.UNICODE)
    ngrams2file = []
    for ngram_tuple in ngrams(
            sentence, min(len(sentence[:-1]), max_n)
    ):  # [:-1] because sentence has empty word as the last element
        ngram = list(ngram_tuple)
        ngrams2file.append(ngram)  # initial form of ngram
        for c in xrange(0, len(ngram)):  # step by word in ngram
            w_desc = morph.analyse(ngram[c])
            if len(w_desc) > 0:
                init_form = w_desc[0][2][1].split(':')[0]
                #print 'init_form: ', type(init_form)
                try:
                    index = dictionary.index(init_form)  # + u'\n'
                    #print 'index: ', index
                    syns = find_syns(init_form, index,
                                     w_desc[0][2][2].split(':'))
                    #print 'syns: '
                    for syn in syns:
                        ngram2file = ngram[:]
                        #print(syn)
                        ngram2file[c] = syn
                        ngrams2file.append(ngram2file)  #u' '.join(ngram2file)
                #print 'n ', ngrams2file[-1]