def answer_sentiment_score(text: str, sentiment_dict: Dict[str, int], morf: morfeusz2.Morfeusz, verbose=False):
    """Score the sentiment of *text*.

    Runs Morfeusz analysis on the raw text, extracts the lemma list via
    the module's `lemmas_list` helper, and delegates scoring to
    `sentiment_score` against *sentiment_dict*.
    """
    return sentiment_score(lemmas_list(morf.analyse(text)), sentiment_dict, verbose)
def prepare_text(text: str, morf: morfeusz2.Morfeusz) -> List[Set[str]]:
    """Analyse *text* with Morfeusz and group cleaned lemmas by token position.

    Returns one set per surface token: the lowercase, alphanumeric-only
    lemmas of every reading at that position. Function-word readings
    (conjunctions, particles, finite verb forms, punctuation, ...) are
    skipped; unknown words ('ign') fall back to the `stem` helper.
    """
    # POS tags whose readings are dropped outright. Hoisted to a frozenset:
    # the original re-tested a list (with a duplicated 'interj') per token.
    skipped_pos = frozenset({
        'interj', 'conj', 'part', 'siebie', 'fin', 'bedzie', 'aglt',
        'impt', 'imps', 'inf', 'winien', 'pred', 'comp', 'interp',
    })
    analysed_text = morf.analyse(text)
    pos = 0
    sets: List[Set[str]] = []
    current_set: Set[str] = set()
    for morf_tuple in analysed_text:
        # morf_tuple is (start_node, end_node, (orth, lemma, tag, ...));
        # the POS is the first colon-separated field of the tag.
        part_of_speech = morf_tuple[2][2].split(':')[0]
        if part_of_speech in skipped_pos:
            continue
        if morf_tuple[0] != pos:
            # A new token position started: flush the previous token's set.
            if current_set:
                sets.append(current_set)
                current_set = set()
            pos = morf_tuple[0]
        # Keep only the headword part of the lemma (before any ':' qualifier).
        lemma = morf_tuple[2][1].split(':')[0]
        if part_of_speech == 'ign':
            # Word unknown to Morfeusz - fall back to stemming.
            lemma = stem(lemma)
        lemma = ''.join(c for c in lemma if c.isalnum())
        if lemma:
            current_set.add(lemma.lower())
    # Bug fix: the original never flushed the final token's set, so the
    # last word of every text was silently dropped from the result.
    if current_set:
        sets.append(current_set)
    return sets
def __init__(self):
    """Initialise the lemmatizer, creating the class-level Morfeusz
    analyser on first instantiation (shared by all instances)."""
    super(PolishLemmatizer, self).__init__()
    try:
        from morfeusz2 import Morfeusz
    except ImportError:
        raise ImportError(
            'The Polish lemmatizer requires the morfeusz2-python library')
    if PolishLemmatizer._morph is not None:
        # Analyser already built by a previous instance - reuse it.
        return
    PolishLemmatizer._morph = Morfeusz(dict_name='polimorf')
class MorfeuszLemmatizer(object):
    """Lemmatizer that delegates morphological analysis to Morfeusz."""

    def __init__(self):
        """Create the underlying Morfeusz analyser."""
        self.morf = Morfeusz()

    def lemmatize(self, form):
        """Return the base form from the first Morfeusz reading of *form*.

        Returns None when the analyser yields no readings at all.
        """
        readings = self.morf.analyse(form)
        for _start, _stop, (_orth, base, _tag, _common, _qual) in readings:
            # Only the very first reading is consulted.
            return base
        return None
def process_request(params):
    """Handle one service request: parse Morfeusz options out of *params*,
    run analysis or generation, and return a JSON-serialisable response.

    The response dict always carries a 'results' list (possibly empty);
    when validation succeeds it also gets 'version', 'dictionaryId' and
    'copyright' metadata taken from the Morfeusz instance.
    """
    option_parser = MorfeuszOptionParser(params)
    # Map request parameter names (camelCase) onto Morfeusz constructor
    # keyword options, with defaults where given.
    option_parser.parse_bool('expandDag', 'expand_dag')
    option_parser.parse_bool('expandTags', 'expand_tags')
    option_parser.parse_bool('expandDot', 'expand_dot')
    option_parser.parse_bool('expandUnderscore', 'expand_underscore')
    option_parser.parse_string('agglutinationRules', 'aggl', AGGLUTINATION_RULES)
    option_parser.parse_string('pastTenseSegmentation', 'praet', PAST_TENSE_SEGMENTATION)
    option_parser.parse_enum('tokenNumbering', 'separate_numbering', TokenNumbering, TokenNumbering.separate)
    option_parser.parse_enum('caseHandling', 'case_handling', CaseHandling)
    option_parser.parse_enum('whitespaceHandling', 'whitespace', WhitespaceHandling)
    option_parser.parse_actions('action')
    results = []
    # `results` is aliased into the response up front; everything appended
    # below lands directly in the returned dict.
    response = {'results': results}
    # validate() presumably records errors into `response` on failure -
    # TODO confirm against MorfeuszOptionParser.
    if option_parser.validate(response):
        option_parser.set_dictionary_path('MORFEUSZ_DICT_PATH')
        morfeusz = Morfeusz(**option_parser.get_opts())
        if option_parser.action == 'analyze':
            for interp_list in morfeusz.analyse(option_parser.text):
                if isinstance(interp_list, list):
                    # Grouped (non-DAG) output: one sub-list per token.
                    subitem = []
                    results.append(subitem)
                    for item in interp_list:
                        subitem.append(tag_items(item))
                else:
                    # Flat (DAG) output: one entry per edge.
                    results.append(tag_items(interp_list))
        elif option_parser.action == 'generate':
            # One sub-list of generated forms per requested lemma.
            for title in option_parser.titles:
                subitem = []
                results.append(subitem)
                for interp_list in morfeusz.generate(title):
                    subitem.append(tag_items(interp_list))
        response['version'] = morfeusz2.__version__
        response['dictionaryId'] = morfeusz.dict_id()
        response['copyright'] = morfeusz.dict_copyright()
    return response
from sys import argv, exit #other imports corpus_filename = 'pl.txt' try: filename = argv[1] if filename in listdir('.'): corpus_filename = filename else: print('File %s not found in the current directory' % filename) exit(-1) except IndexError: pass exclude = string.digits #unicode(string.digits) # morph = Morfeusz() def lemm(line): sentence = re.split( '\d+|\W+|_', line.lower(), flags=re.UNICODE ) #re.split('\W+', line.lower(), flags=re.UNICODE) #line.split() norm_sentence = [] for i in xrange(0, len(sentence)): if sentence[i] != u'': #print 'sen: ', sentence[i], 'len: ', len(sentence[i]) w_desc = morph.analyse(sentence[i]) if len(w_desc) > 0: norm_sentence.append(w_desc[0][2][1].split(':')[0]) return norm_sentence
#! /usr/bin/python
# *-* coding: utf-8 *-*
from morfeusz2 import Morfeusz
from concraft_pl2 import Concraft, Server

# Demo: analyse a Polish sentence with Morfeusz and disambiguate the
# resulting DAG with a local Concraft server.
server = None  # pre-bind so the finally-clause is always safe
try:
    morfeusz = Morfeusz(expand_tags=True)
    server = Server(model_path="/home/kuba/work/ipipan/concraft/pre-trained/Sep-18/model-04-09-2018.gz", port=3001)
    concraft = Concraft(port=3001)

    dag = morfeusz.analyse(u'W Szczebrzeszynie chrząszcz brzmi w trzcinie.')
    res = concraft.disamb(dag)
    print(res)

    dag = morfeusz.analyse(u'W Szczebrzeszynie chrząszcz brzmi w trzcinie.')
    dag_str = concraft.dag_to_str(dag)
    dag_disamb_str = concraft.disamb_str(dag_str)
    print(dag_disamb_str)
finally:
    # Bug fix: the original unconditionally called server.terminate(),
    # which raised NameError (masking the real exception) whenever
    # Morfeusz(...) or Server(...) construction failed.
    if server is not None:
        server.terminate()
import pathlib, random, re, sys from typing import Callable, Optional from morfeusz2 import Morfeusz from wordnet import query morfeusz = Morfeusz(analyse=False) DATASETS = ["new"] DICT_LINES = {} DICT_FUNCTIONS = {} THESAURUS = {} # Words from the thesaurus containing these tags will be ignored: BLACKLISTED_TAGS = [ "(bardzo potocznie)", "(potocznie)", "(częściej, ale wg niektórych niepoprawnie)", "(eufemistycznie)", # :( #"(nieco potocznie)", # Eh, it's fine "(obraźliwe)", "(obraźliwie)", #"(pieszczotliwie)", "(pogardliwie)", "(potoczne)", "(potocznie)", "(przestarzale)", "(ptoocznie)", "(regionalnie)", # Contains some inappropriate words "(rzadko, wg niektórych niepoprawnie)",
def __init__(self):
    """Constructor: create the Morfeusz analyser this object delegates to."""
    self.morf = Morfeusz()
r'<([a-z]*-[a-z]*)(\s+[a-z]*="\w*((\s+\w*)+)?")?>(.+?)<\/[a-z]*-[a-z]*>', letter_contents) # print('one word tags') # for tag in tags_word: # print(tag) # print('\n') # print('two word tags') # for tag in tags_words: # print(tag) # print('\n') def remove_dashes(text): tmp_str = '' for letter in text: if letter != '-': tmp_str = tmp_str + letter return tmp_str letter1_no_tags = remove_tags(letters[0].contents) letter1_nt_str = ' '.join(letter1_no_tags) # letter_ntnd_str = remove_dashes(letter1_nt_str.decode('utf-8')) # print(letter_ntnd_str) morf = Morfeusz() # print(morf) print(letter1_nt_str.decode('utf8')) letter1_analysed = morf.analyse(letter1_nt_str) print(letter1_analysed)
from sys import argv, exit #other imports corpus_filename = 'pl.txt' try: filename = argv[1] if filename in listdir('.'): corpus_filename = filename else: print('File %s not found in the current directory' % filename) exit(-1) except IndexError: pass max_n = 3 morph = Morfeusz() #load data dictionary = [] f = codecs.open('dict.txt', encoding='utf-8') dictionary = f.readlines() f.close() for term_n in xrange(0, len(dictionary)): dictionary[term_n] = dictionary[term_n][:-1] svdMat = fromfile('svdMat.svd') svdMat = svdMat.reshape(len(dictionary), svdMat.shape[0] / len(dictionary)) def get_form_similarity(form, form_test): similarity = 0