from typing import Dict, List, Set

import morfeusz2


def answer_sentiment_score(text: str,
                           sentiment_dict: Dict[str, int],
                           morf: morfeusz2.Morfeusz,
                           verbose=False):
    analysis = morf.analyse(text)
    lemmas = lemmas_list(analysis)
    return sentiment_score(lemmas, sentiment_dict, verbose)


def prepare_text(text: str, morf: morfeusz2.Morfeusz) -> List[Set[str]]:
    analysed_text = morf.analyse(text)
    pos = 0
    sets = []
    current_set = set()
    for morf_tuple in analysed_text:
        part_of_speech = morf_tuple[2][2].split(':')[0]
        if part_of_speech in [
                'interj', 'conj', 'part', 'siebie', 'fin', 'bedzie', 'aglt',
                'impt', 'imps', 'inf', 'winien', 'pred', 'comp', 'interp'
        ]:
            continue
        if morf_tuple[0] != pos:
            if len(current_set) != 0:
                sets.append(current_set)
                current_set = set()
            pos = morf_tuple[0]
        lemma = morf_tuple[2][1].split(':')[0]
        if part_of_speech == 'ign':
            lemma = stem(lemma)
        lemma = ''.join(c for c in lemma if c.isalnum())
        if len(lemma) > 0:
            current_set.add(lemma.lower())
    # Append the lemma set collected for the final token position.
    if len(current_set) != 0:
        sets.append(current_set)
    return sets
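A minimal usage sketch for prepare_text, assuming a default Morfeusz dictionary is installed (lemmas_list, sentiment_score, and stem come from elsewhere in the original project):

morf = morfeusz2.Morfeusz()
print(prepare_text('Ala ma bardzo dobrego kota', morf))
# Prints roughly one set of candidate lemmas per kept token; the exact
# lemmas depend on the installed dictionary.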
Example #3
    def __init__(self):
        super(PolishLemmatizer, self).__init__()
        try:
            from morfeusz2 import Morfeusz
        except ImportError:
            raise ImportError(
                'The Polish lemmatizer requires the morfeusz2-python library')

        if PolishLemmatizer._morph is None:
            PolishLemmatizer._morph = Morfeusz(dict_name='polimorf')
Example #4
from morfeusz2 import Morfeusz


class MorfeuszLemmatizer(object):
    """Morfeusz-based lemmatizer"""
    def __init__(self):
        """Constructor"""
        self.morf = Morfeusz()

    def lemmatize(self, form):
        """Return the base form of the first Morfeusz interpretation, or None."""
        analysed = self.morf.analyse(form)
        for (
                _begin,
                _end,
            (_wordform, baseform, _tags, _commonness, _qualifiers),
        ) in analysed:
            # Only the first interpretation is used.
            return baseform
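A quick usage sketch for the class above (the exact base form returned depends on the installed dictionary):

lemmatizer = MorfeuszLemmatizer()
print(lemmatizer.lemmatize('kotami'))  # typically prints 'kot'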
Example #5
def process_request(params):
    option_parser = MorfeuszOptionParser(params)
    option_parser.parse_bool('expandDag', 'expand_dag')
    option_parser.parse_bool('expandTags', 'expand_tags')
    option_parser.parse_bool('expandDot', 'expand_dot')
    option_parser.parse_bool('expandUnderscore', 'expand_underscore')
    option_parser.parse_string('agglutinationRules', 'aggl',
                               AGGLUTINATION_RULES)
    option_parser.parse_string('pastTenseSegmentation', 'praet',
                               PAST_TENSE_SEGMENTATION)
    option_parser.parse_enum('tokenNumbering', 'separate_numbering',
                             TokenNumbering, TokenNumbering.separate)
    option_parser.parse_enum('caseHandling', 'case_handling', CaseHandling)
    option_parser.parse_enum('whitespaceHandling', 'whitespace',
                             WhitespaceHandling)
    option_parser.parse_actions('action')

    results = []
    response = {'results': results}

    if option_parser.validate(response):
        option_parser.set_dictionary_path('MORFEUSZ_DICT_PATH')
        morfeusz = Morfeusz(**option_parser.get_opts())

        if option_parser.action == 'analyze':
            for interp_list in morfeusz.analyse(option_parser.text):
                if isinstance(interp_list, list):
                    subitem = []
                    results.append(subitem)

                    for item in interp_list:
                        subitem.append(tag_items(item))
                else:
                    results.append(tag_items(interp_list))
        elif option_parser.action == 'generate':
            for title in option_parser.titles:
                subitem = []
                results.append(subitem)

                for interp_list in morfeusz.generate(title):
                    subitem.append(tag_items(interp_list))

        response['version'] = morfeusz2.__version__
        response['dictionaryId'] = morfeusz.dict_id()
        response['copyright'] = morfeusz.dict_copyright()

    return response
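A hypothetical invocation of process_request. It assumes MorfeuszOptionParser accepts a plain mapping of request parameters keyed by the names parsed above; the parser class, the constants, and tag_items are not shown in this snippet, and the 'text' key is a guess based on the option_parser.text attribute used above:

params = {
    'action': 'analyze',       # parsed by parse_actions('action')
    'text': 'W Szczebrzeszynie chrząszcz brzmi w trzcinie.',  # assumed key name
    'expandTags': 'true',      # parsed by parse_bool('expandTags', ...)
}
response = process_request(params)
print(response['results'])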
Example #6
import re
import string
from os import listdir
from sys import argv, exit

from morfeusz2 import Morfeusz

corpus_filename = 'pl.txt'
try:
    filename = argv[1]
    if filename in listdir('.'):
        corpus_filename = filename
    else:
        print('File %s not found in the current directory' % filename)
        exit(-1)
except IndexError:
    pass

exclude = string.digits
morph = Morfeusz()


def lemm(line):
    # Split on digits, non-word characters and underscores.
    sentence = re.split(r'\d+|\W+|_', line.lower(), flags=re.UNICODE)
    norm_sentence = []
    for token in sentence:
        if token != u'':
            w_desc = morph.analyse(token)
            if len(w_desc) > 0:
                # Keep the lemma of the first interpretation, dropping homonym markers.
                norm_sentence.append(w_desc[0][2][1].split(':')[0])
    return norm_sentence
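A minimal usage sketch for lemm (morph is the Morfeusz instance created above):

print(lemm(u'Ala ma kota'))  # prints a list of lemmas, one per recognised token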
Example #7
#! /usr/bin/python
# *-* coding: utf-8 *-*

from morfeusz2 import Morfeusz
from concraft_pl2 import Concraft, Server

morfeusz = Morfeusz(expand_tags=True)
server = Server(model_path="/home/kuba/work/ipipan/concraft/pre-trained/Sep-18/model-04-09-2018.gz", port=3001)
try:
  concraft = Concraft(port=3001)

  dag = morfeusz.analyse(u'W Szczebrzeszynie chrząszcz brzmi w trzcinie.')
  res = concraft.disamb(dag)
  print(res)

  dag = morfeusz.analyse(u'W Szczebrzeszynie chrząszcz brzmi w trzcinie.')
  dag_str = concraft.dag_to_str(dag)
  dag_disamb_str = concraft.disamb_str(dag_str)
  print(dag_disamb_str)
finally:
  # Stop the Concraft server process even if analysis or tagging fails.
  server.terminate()
Example #8
import pathlib, random, re, sys
from typing import Callable, Optional

from morfeusz2 import Morfeusz
from wordnet import query

morfeusz = Morfeusz(analyse=False)

DATASETS = ["new"]
DICT_LINES = {}
DICT_FUNCTIONS = {}

THESAURUS = {}

# Words from the thesaurus containing these tags will be ignored:
BLACKLISTED_TAGS = [
	"(bardzo potocznie)",
	"(potocznie)",
	"(częściej, ale wg niektórych niepoprawnie)",
	"(eufemistycznie)",  # :(
	#"(nieco potocznie)",  # Eh, it's fine
	"(obraźliwe)",
	"(obraźliwie)",
	#"(pieszczotliwie)",
	"(pogardliwie)",
	"(potoczne)",
	"(potocznie)",
	"(przestarzale)",
	"(ptoocznie)",
	"(regionalnie)",  # Contains some inappropriate words
	"(rzadko, wg niektórych niepoprawnie)",
Example #9
    def __init__(self):
        """Constructor"""
        self.morf = Morfeusz()
Example #10
    r'<([a-z]*-[a-z]*)(\s+[a-z]*="\w*((\s+\w*)+)?")?>(.+?)<\/[a-z]*-[a-z]*>',
    letter_contents)
# print('one word tags')
# for tag in tags_word:
#     print(tag)
#     print('\n')
# print('two word tags')
# for tag in tags_words:
#     print(tag)
#     print('\n')


def remove_dashes(text):
    """Return the text with all '-' characters removed."""
    return text.replace('-', '')


letter1_no_tags = remove_tags(letters[0].contents)
letter1_nt_str = ' '.join(letter1_no_tags)
# letter_ntnd_str = remove_dashes(letter1_nt_str.decode('utf-8'))
# print(letter_ntnd_str)

morf = Morfeusz()
# print(morf)
print(letter1_nt_str)
letter1_analysed = morf.analyse(letter1_nt_str)
print(letter1_analysed)
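For reference, each element returned by Morfeusz.analyse() is a (start_node, end_node, interpretation) tuple, where the interpretation is (orth, lemma, tag, common_name, qualifiers), as unpacked in Example #4 above; the exact lemma and tag strings depend on the dictionary in use:

for start, end, (orth, lemma, tag, posp, qualifiers) in morf.analyse('kotem'):
    print(start, end, orth, lemma, tag)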
Example #11
import codecs
from os import listdir
from sys import argv, exit

from morfeusz2 import Morfeusz
from numpy import fromfile  # svdMat.reshape below implies a NumPy array

corpus_filename = 'pl.txt'
try:
    filename = argv[1]
    if filename in listdir('.'):
        corpus_filename = filename
    else:
        print('File %s not found in the current directory' % filename)
        exit(-1)
except IndexError:
    pass

max_n = 3
morph = Morfeusz()

#load data
dictionary = []
f = codecs.open('dict.txt', encoding='utf-8')
dictionary = f.readlines()
f.close()
# Strip the trailing newline from each dictionary entry.
for term_n in range(0, len(dictionary)):
    dictionary[term_n] = dictionary[term_n][:-1]

svdMat = fromfile('svdMat.svd')
svdMat = svdMat.reshape(len(dictionary), svdMat.shape[0] // len(dictionary))


def get_form_similarity(form, form_test):
    similarity = 0