def text_rank(text, language):
    """Rank the sentences of ``text`` using pairwise TF-IDF similarity.

    The text is split into sentences with the Ukrainian tokenizer when
    ``language == 'ukrainian'`` and with the Russian one otherwise.  Every
    pair of sentences is scored with ``similarity`` on the rows of the matrix
    returned by ``tfidf``; pairs with a falsy score are dropped and the rest
    are handed to ``rank_graph``.

    :param text: raw text to analyse
    :param language: ``'ukrainian'`` selects the Ukrainian resources, any
        other value selects the Russian ones
    :return: list of ``(sentence_index, rank, sentence)`` triples sorted by
        descending rank; ``[(1, 0, sentence)]`` for a single-sentence text and
        ``[]`` when no sentence was found
    """
    if language == 'ukrainian':
        sent_tokenizer, stop_words = sent_tokenizer_ua, stop_words_ua
    else:
        sent_tokenizer, stop_words = sent_tokenizer_ru, stop_words_ru

    sentences = sent_tokenizer(text)
    if not sentences:
        # Previously this crashed with IndexError on ``sentences[0]``.
        return []
    if len(sentences) < 2:
        return [(1, 0, sentences[0])]

    a = tfidf(text, language, sent_tokenizer, stop_words)
    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(a[i, :], a[j, :])) for i, j in pairs]
    # Keep only the edges with a non-zero similarity.
    scores = [edge for edge in scores if edge[2]]
    pr = rank_graph(scores)
    # Sort the triples by descending rank.
    return sorted(
        ((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
        key=lambda x: pr[x[0]],
        reverse=True,
    )
def text_rank(text, language):
    """Rank the sentences of ``text`` using similarity of their lemma sets.

    The text is split into sentences with the Ukrainian tokenizer when
    ``language == 'ukrainian'`` and with the Russian one otherwise.  Each
    sentence is lower-cased, tokenized with ``word_tokenizer``, filtered
    against the language's stop words and reduced to a set of normalized
    parses.  Sentence pairs with a falsy ``similarity`` are dropped and the
    rest are handed to ``rank_graph``.

    :param text: raw text to analyse
    :param language: ``'ukrainian'`` selects the Ukrainian resources, any
        other value selects the Russian ones
    :return: list of ``(sentence_index, rank, sentence)`` triples sorted by
        descending rank; ``[(1, 0, sentence)]`` for a single-sentence text and
        ``[]`` when no sentence was found
    """
    ukrainian = language == 'ukrainian'
    if ukrainian:
        sent_tokenizer, stop_words = sent_tokenizer_ua, stop_words_ua
    else:
        sent_tokenizer, stop_words = sent_tokenizer_ru, stop_words_ru

    sentences = sent_tokenizer(text)
    if not sentences:
        # Previously this crashed with IndexError on ``sentences[0]``.
        return []
    if len(sentences) < 2:
        return [(1, 0, sentences[0])]

    # The analyzer is only built once we know it is needed: loading its
    # dictionaries is expensive and pointless for the early returns above.
    morph = MorphAnalyzer(lang='uk') if ukrainian else MorphAnalyzer()

    # NOTE(review): ``.normalized`` yields Parse objects, not lemma strings;
    # confirm that ``similarity`` expects sets of Parse objects.
    words = [
        set(
            morph.parse(word)[0].normalized
            for word in word_tokenizer.tokenize(sentence.lower())
            if word not in stop_words
        )
        for sentence in sentences
    ]
    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    # Keep only the edges with a non-zero similarity.
    scores = [edge for edge in scores if edge[2]]
    pr = rank_graph(scores)
    return sorted(
        ((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
        key=lambda x: pr[x[0]],
        reverse=True,
    )
def connectUI(self):
    """Load the main-window layout, fill it with data and wire its signals.

    Loads ``ui_windows/main_window2.ui`` into ``self.parent``, draws the
    monthly exchange-rate chart, shows the current rate together with the
    currency name agreed with that number, connects the combo-box signals and
    finally shows the chart of the currently selected company (falling back
    to ``self.set_company()`` when its image file does not exist).
    """
    name_ui = 'ui_windows/main_window2.ui'
    uic.loadUi(name_ui, self.parent)
    self.setFixedSize(1010, 750)
    self.connect_buttons()
    self.update_info()
    self.create_labels(['', '', '', ''])
    self.set_group_companies()

    DATABASE['currency'].name = 'exchange_rates_month'
    money = DATABASE['currency'].draw_exchange_rates(
        self.parent.value.currentText(), my_type='Month')
    self.parent.chart.setPixmap(
        QPixmap(DATABASE['currency'].directory
                + DATABASE['currency'].name + '.png'))

    # One analyzer serves both words; every MorphAnalyzer() call loads the
    # dictionaries again, so creating two of them was pure overhead.
    morph = MorphAnalyzer()
    morph_country = morph.parse('Российский')[0]
    morph_currency = morph.parse('Рубль')[0]
    text = [
        str(money),
        morph_country.make_agree_with_number(money).word.capitalize() + ' '
        + morph_currency.make_agree_with_number(money).word.capitalize()
    ]
    self.parent.currensy.setText(
        "<html><head/><body><p align=\"center\"><span>" + text[0]
        + "</span></p></body></html>")
    self.parent.currensy_2.setText(
        "<html><head/><body><p align=\"center\"><span>" + text[1]
        + "</span></p></body></html>")

    self.parent.value.currentTextChanged.connect(
        self.change_type_exchange_rates)
    self.parent.current_company_2.currentTextChanged.connect(
        self.set_group_companies)
    self.parent.current_company.currentTextChanged.connect(
        self.set_company)

    try:
        company = self.parent.current_company.currentText().strip().lower()
        if not company:
            return
        image_path = (DATABASE[company].directory
                      + DATABASE[company].name + '.png')
        # Opening the file only checks that the chart image exists; the
        # context manager guarantees the handle is closed.
        with open(image_path):
            pass
        self.parent.chart_3.setPixmap(QPixmap(image_path))
    except FileNotFoundError:
        self.set_company()
def main():
    """Merge every NLC ``.csv`` file with its RuCor tokens and save the result.

    For each ``.csv`` file in the ``!all-in-one`` folder the numeric ``NNN-#``
    prefix and the ``.csv`` extension are removed to recover the original
    document name.  When that name is known to ``filenames_ids`` the NLC
    annotation and the RuCor tokens are merged, enriched with morphology and
    written with ``save_dataset``; a progress line is printed afterwards.
    """
    morph = MorphAnalyzer()
    reg_new_name = re.compile('[0-9]{,9}-#')
    extension = '.csv'

    rucor_root = os.path.join('..', '..', 'RuCor')
    tokens_path = os.path.join(rucor_root, '!new_tokens.txt')
    documents_path = tokens_path.replace('!new_tokens', 'Documents')
    nlc_folder = os.path.join(rucor_root, '!all-in-one')

    all_tokens = read_info(tokens_path, header=True)
    files_and_ids = filenames_ids(documents_path)
    filenames = set(files_and_ids.keys())

    # List the folder once so that the total and the iteration agree.
    items = os.listdir(nlc_folder)
    total = len(items)
    # NOTE(review): the collapsed source does not show whether the counter
    # advanced per directory entry or per processed file; per entry is used
    # here because ``total`` counts every entry.
    for counter, item in enumerate(items, start=1):
        if not item.endswith(extension):
            continue
        # Drop the extension by slicing: ``str.strip('.csv')`` removes any of
        # the characters '.', 'c', 's', 'v' from BOTH ends and therefore
        # mangled names such as "cats.csv".
        original_name = reg_new_name.sub('', item)[:-len(extension)]
        if original_name not in filenames:
            continue

        # NLC
        nlc_path = nlc_folder + os.sep + item
        nlc = NLC_to_dict(read_info(nlc_path, header=False))
        # RuCor
        original_id = files_and_ids[original_name]
        rucor = tokens_to_dict(all_tokens, doc_id=original_id)
        # merging
        rucor = text_and_tokens(rucor, nlc)
        rucor = do_morphology(rucor, morph)
        # save
        save_dataset(rucor, original_id)
        # progress report
        now = datetime.now()
        print('{0:2d}:{1:2d}:{2:2d}\t{3:3d}/{4:3d}\tfile: {5}\t\t\tdone: {6:.2f}%'.format(
            now.hour, now.minute, now.second, counter, total, nlc_path,
            counter / total * 100))
def pymorphying(filename):
    """Group the words of a text file by their grammemes and print the result.

    The file is read as UTF-8, tokenized with ``word_tokenize`` and every
    token is analysed with pymorphy2 (first parse only).  The comma-separated
    parts of the tag are used as successive keys into ``dictionary``; missing
    keys are created by ``add_to_dict``.  The word itself is appended to the
    object reached at the end of that walk.  Nothing is returned; the
    resulting structure is printed.

    :param filename: path of the UTF-8 text file to analyse
    """
    dictionary = dict()
    morph = MorphAnalyzer()
    words_and_grams = list()  # NOTE(review): never used below
    with open(filename, 'r', encoding='utf-8') as file:
        text = file.read()
    tokenized = word_tokenize(text)
    for one in tokenized:
        parsed = morph.parse(one)
        parsed = parsed[0]  # only the first parse is considered
        original_word = one
        gram_info = str(parsed.tag).split(',')
        first_gram = str(gram_info[0]).split()[0]
        # Punctuation and unknown tokens are skipped.
        if first_gram == 'PNCT' or first_gram == 'UNKN':
            continue
        # Tags without any comma-separated part beyond the first are skipped.
        if len(gram_info) == 1:
            continue
        # Walk down the nested structure, one tag part per level.
        loop = dictionary
        counter = 0
        for gram in gram_info:
            counter += 1
            check = gram
            checking = check in loop
            if checking == False:
                # presumably creates the entry for ``check`` inside ``loop``
                # (the helper is defined elsewhere) -- TODO confirm
                add_to_dict(check, loop, counter, len(gram_info))
            if type(loop) != list:
                loop = loop[check]
        # NOTE(review): the collapsed source does not show whether this
        # try/except sat inside or after the loop above; it is placed after
        # it here -- confirm against the original file.
        try:
            loop.append(original_word)
            loop.sort()
        except AttributeError:
            # NOTE(review): this fresh list is never stored in ``dictionary``,
            # so the word appears to be lost on this path.
            loop = list()
            loop.append(original_word)
            loop.sort()
    print(dictionary)
def __init__(
    self,
    vocab: Vocab,
    model: Optional[Model],
    name: str = "lemmatizer",
    *,
    mode: str = "pymorphy2",
    overwrite: bool = False,
    scorer: Optional[Callable] = lemmatizer_score,
) -> None:
    """Set up the lemmatizer and, in ``pymorphy2`` mode, its analyzer.

    In ``pymorphy2`` mode the pymorphy2 package is imported lazily and a
    Ukrainian ``MorphAnalyzer`` is stored in ``self._morph`` unless that
    attribute already holds a value.  All arguments are then forwarded to the
    parent initializer.

    :raises ImportError: when ``mode == "pymorphy2"`` and pymorphy2 cannot be
        imported
    """
    if mode == "pymorphy2":
        try:
            from pymorphy2 import MorphAnalyzer
        except ImportError:
            message = (
                "The Ukrainian lemmatizer mode 'pymorphy2' requires the "
                "pymorphy2 library and dictionaries. "
                "Install them with: pip install pymorphy2 pymorphy2-dicts-uk"
            )
            raise ImportError(message) from None
        else:
            # Keep an analyzer that is already present on the instance.
            already_loaded = getattr(self, "_morph", None) is not None
            if not already_loaded:
                self._morph = MorphAnalyzer(lang="uk")
    super().__init__(
        vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
    )
def addWord():
    """Analyse the text in ``inputText`` and list the inflected words found.

    Every token of the text is parsed with pymorphy2 (first parse only).
    Words whose surface form differs from their lemma are collected together
    with the lemma, the Cyrillic tag representation and the characters that
    occur in the word but not in the lemma.  The collected entries are
    inserted into ``outputText`` in alphabetical order and the elapsed time
    is printed.  The module-level ``vocabulary`` buffer receives the text for
    the duration of the call and is always emptied afterwards.
    """
    start_time = time.time()
    analyzer = MorphAnalyzer()
    text = inputText.get(1.0, END)
    vocabulary.append(text)
    try:
        words = {}
        # Tokenize the text just read rather than ``vocabulary[0]``, which
        # could be a stale entry left behind by an earlier failed call.
        for word in word_tokenize(text):
            parse_word = analyzer.parse(word)[0]
            word_word = parse_word.word
            word_lemma = parse_word.normal_form
            # Compare by value: ``is not`` tested object identity, which is
            # not a meaningful comparison for strings.
            if word_word != word_lemma:
                words[word_word] = {
                    'lemma': word_lemma,
                    'tag': parse_word.tag.cyr_repr,
                    'ending': list(set(word_word) - set(word_lemma)),
                }
        for key in sorted(words):
            entry = words[key]
            lexeme = Lexeme(entry['lemma'], entry['tag'], entry['ending'])
            outputText.insert(
                0,
                str(lexeme.lemma) + ' ' + str(lexeme.tags) + ' '
                + str(lexeme.endings))
        result_time = time.time() - start_time
        print(str(result_time) + " seconds")
    finally:
        # Empty the shared buffer even when the analysis fails.
        vocabulary.clear()
def read_tab_corpus(inc):
    """Yield pymorphy2-tagged sentences from a tab-separated corpus.

    Lines equal to ``sent`` open a sentence; lines equal to ``/sent`` or
    ``SENT`` close it.  Every other non-empty line is split on tabs: the
    second field is taken as the token, and lines with fewer than three
    fields are ignored.  When a sentence closes, each of its tokens is parsed
    with pymorphy2 and the first parse is kept.

    :param inc: iterable of text lines
    :return: generator of sentences, each a list of ``(word, tag)`` pairs;
        empty sentences are not yielded
    """
    m = MorphAnalyzer()
    sent = []
    for t in inc:
        t = t.rstrip()
        if not t:
            continue
        if t == u'sent':
            sent = []
            continue
        if t == u'/sent' or t == u'SENT':
            tokens = [x[0] for x in sent]
            if tokens:
                yield [
                    (p[0].word, p[0].tag)
                    for p in (m.parse(token) for token in tokens)
                ]
            # Start the next sentence from scratch.  Without this reset,
            # input delimited only by ``SENT`` kept accumulating tokens, and
            # the already flattened strings were cut down to their first
            # character on the next sentence boundary.
            sent = []
            continue
        t = t.split('\t')
        try:
            token = (t[1], ' '.join(t[2].split(' ')[2:]))
        except IndexError:
            continue
        sent.append(token)
def search(query):
    """Return article keys ranked by BM25 relevance to ``query``.

    The query is tokenized, lower-cased, stripped of punctuation, filtered
    against the Russian stop-word list and lemmatized with pymorphy2.  For
    every lemma present in the inverted index, ``score_BM25`` is accumulated
    per article.

    :param query: free-text search query
    :return: list of ``(a_info[0], a_info[1])`` keys taken from the article
        records, ordered from the most to the least relevant
    """
    relevance = defaultdict(float)
    m = MorphAnalyzer()
    inverted_index, articles, avdl = get_indices()
    N = len(articles)

    # Build the stop-word set once; it used to be rebuilt for every word.
    stop_words = set(stopwords.words('russian'))
    strip_chars = string.punctuation + '»«–…'
    words = [x.lower().strip(strip_chars) for x in word_tokenize(query)]
    lemmas = [
        m.parse(x)[0].normal_form
        for x in words
        if x and x not in stop_words
    ]

    for lemma in lemmas:
        if lemma not in inverted_index:
            continue
        articles_w_lemma = inverted_index[lemma]
        n = len(articles_w_lemma)
        for a in articles_w_lemma:
            a_info = articles[a[0]]
            qf = a[1]
            dl = a_info[2]
            relevance[(a_info[0], a_info[1])] += score_BM25(n, qf, N, dl, avdl)

    ranked = sorted(relevance.items(), key=lambda x: x[1], reverse=True)
    return [key for key, _ in ranked]
def load_russe_lemm_df(part='bts-rnc/train'):
    """Load a RUSSE data frame and lemmatize the context around each word.

    The context left and right of the target word (as delimited by
    ``positions``) is tokenized, parsed with pymorphy2 (first parse only),
    stripped of punctuation tokens and joined back from the normal forms.
    ``context`` and ``positions`` are then rebuilt around the untouched
    target word, and ``word_at`` holds the slice of the new context that
    ``positions`` points to.

    :param part: data set part passed on to ``load_russe_df``
    :return: the data frame with ``lctx``, ``rctx`` and ``word_at`` columns
        added and ``context``/``positions`` rewritten
    """
    df = load_russe_df(part)

    from pymorphy2.tokenizers import simple_word_tokenize
    from pymorphy2 import MorphAnalyzer

    df['lctx'] = df.apply(lambda row: row.context[:row.positions[0]], axis=1)
    df['rctx'] = df.apply(lambda row: row.context[row.positions[1]:], axis=1)

    analyzer = MorphAnalyzer()
    parse_cache = {}

    def first_parse(token):
        # Surrounding spaces are removed first: pymorphy2 does not handle
        # tokens that carry them correctly.
        key = token.strip()
        parses = parse_cache.get(key)
        if parses is None:
            parses = parse_cache[key] = analyzer.parse(key)
        return parses[0]

    def lemmatize(text):
        parsed = [first_parse(token) for token in simple_word_tokenize(text)]
        return [p.normal_form for p in parsed if 'PNCT' not in p.tag]

    for col in ('lctx', 'rctx'):
        df[col] = df[col].apply(lemmatize).str.join(' ')

    df.context = df.lctx + ' ' + df.word + ' ' + df.rctx

    def word_span(row):
        start = len(row.lctx) + 1
        return (start, start + len(row.word))

    df.positions = df.apply(word_span, axis=1)
    df['word_at'] = df.apply(
        lambda row: row.context[slice(*row.positions)], axis=1)
    return df
def make_tags(sentence):
    """Tokenize ``sentence`` and map every token to a coarse one-letter tag.

    The part of speech of the first pymorphy2 parse is translated through a
    fixed table (``N`` noun-like, ``A`` adjective-like, ``V`` verb-like,
    ``D`` adverb, ``P`` preposition/particle/conjunction).

    :param sentence: text to tag
    :return: ``[tokens, tokens_tags]`` -- two parallel lists
    :raises KeyError: when a token's part of speech is not in the table
        (the lookup is a plain dictionary access)
    """
    morph = MorphAnalyzer()
    tags = {
        'NOUN': 'N', 'NPRO': 'N',
        'ADJF': 'A', 'ADJS': 'A', 'PRTF': 'A', 'NUMR': 'A',
        'PRTS': 'V', 'VERB': 'V', 'INFN': 'V', 'GRND': 'V',
        'ADVB': 'D',
        'PREP': 'P', 'PRCL': 'P', 'CONJ': 'P',
    }
    # Tokenize once and reuse the result for both returned lists.
    tokens = list(nltk.word_tokenize(sentence))
    tokens_tags = [tags[morph.parse(token)[0].tag.POS] for token in tokens]
    return [tokens, tokens_tags]
def parse_text(self, string, flag):
    """Split ``string`` into sentences and lemmatize each of them.

    ``self.get_sentences(string, flag)`` is called after ``self.sentences``
    has been reset.  Every sentence in ``self.sentences`` is then tokenized,
    lower-cased, filtered against the Russian stop words and lemmatized; the
    lemma lists are collected in ``self.filtered_docs``.  When ``flag`` is
    falsy and a sentence contains the lemma of either keyword,
    ``self.find_further_development`` is set and the sentence is remembered
    in ``self.further_dev_sentence``.
    """
    morph = MorphAnalyzer()
    keywords = [
        morph.parse('дальнейшие')[0].normal_form,
        morph.parse('улучшения')[0].normal_form,
    ]

    self.sentences = []
    self.find_further_development = False
    self.filtered_docs = []
    stop_words = set(stopwords.words("russian"))

    self.get_sentences(string, flag)
    for sent in self.sentences:
        lowered = (token.lower() for token in word_tokenize(sent))
        lemmas = [
            morph.parse(token)[0].normal_form
            for token in lowered
            if token not in stop_words
        ]
        if not flag and any(lemma in keywords for lemma in lemmas):
            self.find_further_development = True
            self.further_dev_sentence = sent
        self.filtered_docs.append(lemmas)
def result():
    """Render ``result.html``, replacing each word by one of the same form.

    When the request carries arguments, the ``sentence`` argument is analysed
    with Mystem.  For every analysed word the first entry of ``clear_words``
    whose Mystem grammar string equals the word's own is written to
    ``sentence.txt``.  The file is then read back and passed to the template.
    Without request arguments the template is rendered without a sentence.
    """
    if not request.args:
        return render_template('result.html')

    sent = request.args['sentence']
    m = Mystem()
    ana = m.analyze(sent)

    # The context manager closes the file even if the analysis raises.
    with open('sentence.txt', 'w', encoding='utf-8') as new_sent:
        for word in ana:
            # An absent or empty analysis list carries no grammar string;
            # an empty list used to raise IndexError here.
            if not word.get('analysis'):
                continue
            forma_slova = word['analysis'][0]['gr']
            for w in clear_words:
                ana2 = m.analyze(w)
                if not ana2:
                    continue
                an_word = ana2[0]
                if 'analysis' not in an_word:
                    continue
                print(an_word)
                if not an_word['analysis']:
                    continue
                if forma_slova == an_word['analysis'][0]['gr']:
                    new_sent.write(w + ' ')
                    break

    with open('sentence.txt', 'r', encoding='utf-8') as f:
        read_sent = f.read()
    return render_template('result.html', sentence=read_sent)
def __init__(self):
    """Create an instance with no device, model or tokenizer loaded yet.

    A pymorphy2 analyzer is created immediately, the directory containing
    this module is remembered in ``module_path`` and ``data`` starts out as
    an empty ``Struct``.
    """
    # Nothing is loaded at construction time.
    self.device = self.model = self.tokenizer = None
    self.morph = MorphAnalyzer()
    here = os.path.abspath(__file__)
    self.module_path = os.path.dirname(here)
    self.data = Struct()
def __init__(self, app_id, mail_manager, chat_id=1,
             number_of_seconds_for_the_rest=60, chat_id_for_mails=None,
             admins=None, **kwargs):
    """
    Initializes Lama Bot.

    Extra keyword arguments (login/password or access_token are expected)
    are forwarded to ThreadSafeVkApi.

    :param app_id: Application identifier, also passed to ThreadSafeVkApi
    :param mail_manager: A manager for retrieving mails
    :type mail_manager: AbstractMailManager
    :param chat_id: Chat identifier
    :type chat_id: int
    :param number_of_seconds_for_the_rest: Stored as is on the instance
    :param chat_id_for_mails: Chat for mails. Falls back to chat_id when falsy
    :type chat_id_for_mails: int
    :param admins: Stored on the instance; an empty list when falsy
    :raise ValueError: When neither login/password nor access_token was
        provided (presumably raised while the API wrapper is built -- the
        check is not visible in this method)
    """
    self.exit_event = Event()
    self.morph = MorphAnalyzer()
    self.version = '0.1.1'
    self.app_id = app_id
    # Credentials are not known yet.
    self.access_token = self.password = self.login = None
    self.vkapi = ThreadSafeVkApi(app_id=app_id, **kwargs)
    self.commands = {}
    self._plugins = []
    self.mail_manager = mail_manager
    self.number_of_seconds_for_the_rest = number_of_seconds_for_the_rest
    self.chat_id = chat_id
    self.chat_id_for_mails = (
        chat_id_for_mails if chat_id_for_mails else self.chat_id
    )
    self.admins = admins if admins else []
    self.initialize_commands()
def get_correct_form_of_points_number_name(number: int) -> str:
    """Return the form of the word "Балл" that agrees with ``number``.

    :param number: the amount the word has to agree with
    :return: the word produced by pymorphy2's ``make_agree_with_number``;
        the literal ``"Балл"`` when ``number`` is not an ``int``
    """
    if not isinstance(number, int):
        # The input is not a number.
        return "Балл"
    # The parse of "Балл" never changes, so it is computed once and kept on
    # the function object instead of loading a new MorphAnalyzer (and its
    # dictionaries) on every call.
    cache = get_correct_form_of_points_number_name.__dict__
    analysis = cache.get("_analysis")
    if analysis is None:
        analysis = cache["_analysis"] = MorphAnalyzer().parse("Балл")[0]
    return analysis.make_agree_with_number(number).word
def to_normal_form(file_text):
    """Return the lemmas of the alphanumeric tokens of ``file_text``.

    The text is lower-cased and tokenized; tokens for which ``str.isalnum``
    is false are dropped, and the normal form of the first pymorphy2 parse of
    each remaining token is joined with single spaces.
    """
    analyzer = MorphAnalyzer()
    tokens = word_tokenize(file_text.lower())
    return " ".join(
        analyzer.parse(token)[0].normal_form
        for token in tokens
        if token.isalnum()
    )
def __init__(self, token_pat="[а-я]+", mode="normal", counter=None,
             threshold=3, allowed_pos=None, stop_words=None,
             stop_cities=False):
    """Configure the tokenizer.

    :param token_pat: token pattern, stored as ``self.token``
    :param mode: ``"normal"`` or ``"nospace"``
    :param counter: ``Counter`` instance, required in ``"nospace"`` mode,
        where it is also handed to ``NoSpaceSplitter``
    :param threshold: stored as is on the instance
    :param allowed_pos: stored as is on the instance
    :param stop_words: stop words to use; ``STOPWORDS`` when falsy
    :param stop_cities: when true, ``CITIES`` are added to the stop words
    :raises ValueError: for an unknown ``mode`` or when ``counter`` is not a
        ``Counter`` in ``"nospace"`` mode
    """
    self.token = token_pat
    self.mode = mode
    if self.mode not in {"normal", "nospace"}:
        raise ValueError("Unknown mode")
    if self.mode == "nospace":
        if not isinstance(counter, Counter):
            raise ValueError(
                "In 'nospace' mode the counter attribute should be passed")
        self.counter = counter
        self.nospace = NoSpaceSplitter(counter)
    self.threshold = threshold
    self.morph = MorphAnalyzer()
    self.allowed_pos = allowed_pos
    self.stop_words = stop_words or STOPWORDS
    if stop_cities:
        # ``union`` returns a new set; its result used to be thrown away, so
        # the cities were never added.  Building a new set also leaves the
        # shared STOPWORDS / caller-supplied collection untouched.
        self.stop_words = set(self.stop_words).union(CITIES)
def persistent_load(self, id):
    """Recreate an object that was pickled by persistent id.

    :param id: persistent id found in the pickle stream
    :return: a fresh ``MorphAnalyzer`` for ``"pymorphy2.MorphAnalyzer"`` and
        a fresh ``Accent`` for ``"russtress.Accent"``
    :raises pickle.UnpicklingError: for any other id
    """
    # The factories are wrapped in lambdas so that nothing is looked up or
    # constructed until the matching id is actually requested.
    factories = {
        "pymorphy2.MorphAnalyzer": lambda: MorphAnalyzer(),
        "russtress.Accent": lambda: Accent(),
    }
    for known_id, build in factories.items():
        if id == known_id:
            return build()
    raise pickle.UnpicklingError("unsupported persistent object")
def feminine_checker(self, w):
    '''
    Check if the word is feminine. Necessary for some variants of hieroglyphs

    The word that is actually analysed is the first space-separated part of
    ``self.input_word``; the ``w`` argument is overwritten by it.  The third
    comma-separated part of the pymorphy2 tag is searched for ``femn``.  When
    the tag has fewer than three parts, the decision falls back to the word
    ending (``а`` / ``я`` means feminine).  The result is also stored in
    ``self.sex``.

    Args:
        w: str, input Russian word (NOTE(review): currently ignored in
            favour of ``self.input_word`` -- confirm this is intended)
    Returns:
        sex: str, 'M' or 'F' - gender of a word
    '''
    morph = MorphAnalyzer()
    w = self.input_word.split(' ')[0]
    ana = morph.parse(w)[0]
    gram = str(ana.tag).split(',')
    try:
        feminine = 'femn' in gram[2]
    except IndexError:
        # Short tag: guess from the ending.  ``endswith`` is also safe for an
        # empty word, where ``w[-1]`` raised a second IndexError.
        feminine = w.endswith(('а', 'я'))
    self.sex = 'F' if feminine else 'M'
    return self.sex
class MorphParser:
    """Class-level wrapper around a single shared pymorphy2 analyzer."""

    # Created once, when the class body is executed, and shared by all calls.
    _morph = MorphAnalyzer()

    @classmethod
    def parse(cls, tokens: List[str]) -> List[ParsedToken]:
        """Return one ``ParsedToken`` per token with all of its parsings."""
        analyzer = cls._morph
        return [
            ParsedToken(
                token,
                [
                    Parsing(variant.word, variant.normal_form, variant.tag)
                    for variant in analyzer.parse(token)
                ],
            )
            for token in tokens
        ]

    @classmethod
    def lemmatize(cls, tokens: List[str]) -> List[str]:
        """Return ``str()`` of every parsed token; a bare string is one token."""
        batch = [tokens] if isinstance(tokens, str) else tokens
        return list(map(str, cls.parse(batch)))
def analyzeWord(self, word):
    """Return every pymorphy2 parse of ``word`` as a dictionary.

    Each dictionary holds the source word, the normal form, the grammatical
    attributes read from the parse's tag and the list of word forms of the
    lexeme.  The keys are Russian labels.
    """
    def describe(parse):
        tag = parse.tag
        return {
            'исходное слово': word,
            'нормальная форма': parse.normal_form,
            'часть речи': tag.POS,
            'одушевленность': tag.animacy,
            'вид': tag.aspect,
            'падеж': tag.case,
            'род': tag.gender,
            'включенность': tag.involvement,
            'наклонение': tag.mood,
            'число': tag.number,
            'лицо': tag.person,
            'время': tag.tense,
            'переходность': tag.transitivity,
            'залог': tag.voice,
            'лексема': [form[0] for form in parse.lexeme],
        }

    return [describe(parse) for parse in MorphAnalyzer().parse(word)]
def __init__(self, data_name, lemmatizing_method, max_examples=None,
             delete_word_parts=False, drop_duplicates=True,
             count_lemmas_weights=False, limit=None):
    """Store the data set options and create the matching analyzer.

    An analyzer is only created when ``lemmatizing_method`` is neither
    ``None`` nor ``'none'``: pymorphy2 when ``data_name`` contains ``'ru'``,
    the German spaCy model when it contains ``'german'`` and the English
    spaCy model when it contains ``'english'`` (checked in that order).

    :param data_name: name of the data set; selects the analyzer
    :param lemmatizing_method: lemmatization method, ``None``/``'none'`` to
        disable
    :param max_examples: stored as is on the instance
    :param delete_word_parts: stored as is on the instance
    :param drop_duplicates: stored as is on the instance
    :param count_lemmas_weights: stored as is on the instance
    :param limit: accepted but not used by this initializer
    :raises ValueError: when lemmatization is requested for a data name that
        matches none of the known languages
    """
    self.data_name = data_name
    self.lemmatizing_method = lemmatizing_method
    self.max_examples = max_examples
    self.delete_word_parts = delete_word_parts
    self.drop_duplicates = drop_duplicates
    self.count_lemmas_weights = count_lemmas_weights
    self.translation = str.maketrans('', '', string.punctuation)
    self.dfs = dict()
    self.nf_cnts = dict()
    self.cache = dict()
    self.pattern = re.compile(r'\b\w+\b')

    if lemmatizing_method is not None and lemmatizing_method != 'none':
        if 'ru' in data_name:
            self.analyzer = MorphAnalyzer()
        elif 'german' in data_name:
            self.analyzer = spacy.load("de_core_news_sm",
                                       disable=['ner', 'parser'])
        elif 'english' in data_name:
            self.analyzer = spacy.load("en_core_web_sm",
                                       disable=['ner', 'parser'])
        else:
            # The old ``assert "<message>"`` tested a non-empty string, which
            # is always true, so an unknown name slipped through and left the
            # instance without an analyzer.
            raise ValueError("unknown data name %s" % data_name)
def merge_files():
    """Merge one NLC ``.csv`` file with its RuCor tokens and save the result.

    The ``!all-in-one`` folder is scanned for ``.csv`` files.  The numeric
    ``NNN-#`` prefix and the ``.csv`` extension are removed to recover the
    original document name.  For the first name known to ``filenames_ids``
    the NLC annotation is added to the RuCor tokens, morphology is attached,
    every token is converted with ``dataset`` and the result is passed to
    ``add_embeddings_save``.  Processing stops after that file.
    """
    morph = MorphAnalyzer()
    reg_new_name = re.compile('[0-9]{,9}-#')
    extension = '.csv'

    rucor_root = os.path.join('..', '..', 'RuCor')
    tokens_path = os.path.join(rucor_root, '!new_tokens.txt')
    documents_path = tokens_path.replace('!new_tokens', 'Documents')
    nlc_folder = os.path.join(rucor_root, '!all-in-one')

    all_tokens = read_info(tokens_path, header=True)
    files_and_ids = filenames_ids(documents_path)
    filenames = set(files_and_ids.keys())
    emb_dict = embeddings()
    par, bin_par = syntpar()

    for item in os.listdir(nlc_folder):
        if not item.endswith(extension):
            continue
        # Drop the extension by slicing: ``str.strip('.csv')`` removes any of
        # the characters '.', 'c', 's', 'v' from BOTH ends and therefore
        # mangled names such as "cats.csv".
        original_name = reg_new_name.sub('', item)[:-len(extension)]
        if original_name not in filenames:
            continue

        nlc_path = nlc_folder + os.sep + item
        nlc = NLC_to_dict(read_info(nlc_path, header=False))
        original_id = files_and_ids[original_name]
        rucor = tokens_to_dict(all_tokens, doc_id=original_id)
        united_annotation = add_nlc_to_rucor(rucor, nlc)
        united_annotation = do_morphology(united_annotation, morph)
        data = [
            dataset(united_annotation[token]) for token in united_annotation
        ]
        add_embeddings_save(emb_dict, bin_par, data, original_name)
        # NOTE(review): the collapsed source does not show at which nesting
        # level this ``break`` sat; it is kept right after the first
        # processed file -- confirm against the original.
        break
def __init__(self):
    """Create empty vectorizers, dictionary and character set plus helpers.

    Two independent ``GrammemeVectorizer`` instances (input and output), an
    empty ``WordDictionary`` and an empty character set are created, together
    with a pymorphy2 analyzer and an ``opencorpora-int`` to ``ud14`` tag
    converter.
    """
    self.grammeme_vectorizer_input, self.grammeme_vectorizer_output = (
        GrammemeVectorizer(),
        GrammemeVectorizer(),
    )
    self.word_dictionary = WordDictionary()
    self.char_set = set()
    # pymorphy2 analyzer
    self.morph = MorphAnalyzer()
    source_tagset, target_tagset = 'opencorpora-int', 'ud14'
    self.converter = converters.converter(source_tagset, target_tagset)
def thanks():
    """Render ``thanks.html`` with a random sentence of the same grammar.

    When the request carries arguments, every whitespace-separated word of
    the ``sentence`` argument is parsed with pymorphy2 (first parse only).
    The lines of ``words.txt`` are shuffled and scanned for the first word
    whose tag equals the tag of the input word; that word is written to
    ``sentence.txt``.  The file is then read back and passed to the template.
    Without request arguments the client is redirected.
    """
    morph = MorphAnalyzer()
    if request.args:
        a = request.args['sentence'].split()
        # Context managers close all three handles; two of them used to be
        # left open.
        with open('words.txt', 'r', encoding='utf-8') as source:
            words = source.readlines()
        reg = re.compile('[^а-яА-Я ]')
        with open('sentence.txt', 'w', encoding='utf-8') as new_sent:
            for i in a:
                ana = morph.parse(i)[0]
                random.shuffle(words)
                for word in words:
                    # Keep only Cyrillic letters and spaces of the line.
                    candidate = morph.parse(reg.sub('', word))[0]
                    if candidate.tag == ana.tag:
                        new_sent.write(candidate.word)
                        new_sent.write(' ')
                        break
        with open('sentence.txt', 'r', encoding='utf-8') as written:
            new_sent1 = written.read()
        return render_template('thanks.html', sentence_answer=new_sent1)
    # NOTE(review): an empty endpoint name is passed to ``url_for`` --
    # confirm which view this redirect is meant to reach.
    return redirect(url_for(''))
def load_ae(paths):
    """Yield a ``Sim`` record for every word pair found in ``paths``.

    Each path is read with ``load_ae_``, which supplies ``(a, b, weight)``
    triples.  ``get_pos`` is called for both words with one shared pymorphy2
    analyzer, and the results are packed into ``Sim``.

    :param paths: iterable of file paths
    :return: generator of ``Sim(a, b, a_pos, b_pos, weight)``
    """
    analyzer = MorphAnalyzer()
    for path in paths:
        yield from (
            Sim(
                first,
                second,
                get_pos(first, analyzer),
                get_pos(second, analyzer),
                weight,
            )
            for first, second, weight in load_ae_(path)
        )
def place(message):
    """Return the bot's opinion about a place, or ``False`` for non-places.

    ``message`` is parsed with pymorphy2; when the Cyrillic representation of
    the first parse's tag does not contain ``'гео'`` the function returns
    ``False``.  Otherwise a random reply is chosen the first time a place is
    seen and remembered in the module-level ``places`` mapping, so the same
    place always gets the same reply.
    """
    parsed = MorphAnalyzer().parse(message)[0]
    if 'гео' not in parsed.tag.cyr_repr:
        return False
    if message in places:
        return places[message]

    replies = [
        "Отличное место! Бывало, что я заползал туда иногда, раз в месяцок",
        "О да, знаю, там подают таки-и-ие блюда!",
        "Ну, знаешь, насчет этого места. Тут точно дело вкуса, обычному туристу лучше сюда не соваться...",
        "Место, откровенно говоря, так себе...",
        "Это одно из моих любимых мест на планете! Когда будет возможность, обязательно посети",
        "Это место меня отталкивает, даже не планируй туда поездку",
        "Да ладно, нашел место для отдыха!",
        "Погодка там так себе",
        "Ну, ничего, норм выбор",
        "Как тебе вообще в голову пришло туда захотеть поехать?!",
        "Для питона как раз!)",
        "Как-то одним морозным дням я замечательно отдохнул там, но общее впечатление оставляет желать лучшего",
        "Там бывает мокро, но для меня, питона, это естественная среда)",
        "Брррррррр, не нада",
        "Питон одобряет",
        "Не трать время на это",
        "Конечно, там прекрасно!",
        "Что ты там будешь делать?",
        "Хммм, ничего!",
    ]
    places[message] = r.choice(replies)
    return places[message]
def agree(w1, w2, t1, t2):
    """Work out the case ``w1`` should take next to ``w2`` (unfinished).

    The grammemes of the first pymorphy2 tag of each word are extracted.  For
    a ``"person"`` followed by a ``"verb_right"`` the last grammeme of ``w1``
    is replaced by ``nomn`` or ``datv`` depending on the verb's grammemes.
    The adjusted grammemes are not applied yet: both words are returned
    unchanged.

    :param w1: current word
    :param w2: following word
    :param t1: role of ``w1`` (``"person"``, ``"verb_right"``,
        ``"adjective"`` or ``"property"``)
    :param t2: role of ``w2``
    :return: the tuple ``(w1, w2)``
    """
    morph = MorphAnalyzer()
    # ``morph.tag`` returns tag objects; ``re.findall`` needs text, so they
    # are converted explicitly (passing the objects raised TypeError).
    raw_cur_tags = str(morph.tag(w1)[0])
    # The following word is ``w2``; ``w1`` was analysed twice by mistake.
    raw_next_tags = str(morph.tag(w2)[0])
    cur_tags = re.findall(r"\w+", raw_cur_tags)
    next_tags = re.findall(r"\w+", raw_next_tags)

    if t1 == "person" and t2 == "verb_right" and cur_tags:
        # NOTE(review): the transitivity grammeme is looked up at a fixed
        # position; confirm index 3 is right for the verb tags in use.
        transitive = len(next_tags) > 3 and next_tags[3] == "tran"
        cur_tags[-1] = "nomn" if transitive else "datv"

    # TODO: the combinations verb_right/property, adjective/property,
    # property/person and property/adjective are recognised by the design
    # but have no agreement rule yet, and ``cur_tags`` is not used to
    # inflect ``w1``.
    return w1, w2
def generate_answer(input_sentence):
    """Print a sentence built from other words in the same forms.

    For every whitespace-separated word the first pymorphy2 parse is taken
    and the lemma list for its part of speech is read from ``<POS>.txt``.
    ``collect_inf_tags`` is called repeatedly until the parse it returns can
    be inflected with the grammemes it returns.  The inflected words, each
    followed by a space, are printed as one line.  Nothing is returned.
    """
    morph = MorphAnalyzer()
    pieces = []
    for token in input_sentence.split():
        parsed = morph.parse(token)[0]
        pos_tag = parsed.tag.POS
        with open(str(pos_tag) + '.txt', 'r', encoding='utf-8') as source:
            lemmas = source.readlines()
        # ``inflect`` yields None when the form cannot be built; keep asking
        # for another candidate until one can be inflected.
        inflected = None
        while inflected is None:
            candidate, inf_tags = collect_inf_tags(parsed, pos_tag, lemmas, morph)
            inflected = candidate.inflect(inf_tags)
        pieces.append(inflected.word + ' ')
    print(''.join(pieces))