import re

from pymorphy2 import MorphAnalyzer

# dative_verbs: a set of verb lemmas that govern the dative case,
# defined elsewhere in the original project.


def agree(w1, w2, t1, t2):
    if t1 == "comma" or t2 == "comma":
        return w1, w2
    morph = MorphAnalyzer()
    raw_cur_tags = morph.tag(w1)[-1]
    raw_next_tags = morph.tag(w2)[-1]  # tags of the second word
    cur_tags = re.findall(r"\w+", str(raw_cur_tags))
    next_tags = re.findall(r"\w+", str(raw_next_tags))
    if t1[:-2] == "person":
        if t2[:-2] == "verb_right":
            if morph.normal_forms(w2)[0] in dative_verbs:
                w1 = morph.parse(w1)[0].inflect({"datv"}).word
    if t1[:-2] == "verb_right":
        if t2[:-2] == "property":
            pass
        if t2[:-2] == "person":
            if cur_tags[3] == "tran":
                w2 = morph.parse(w2)[0].inflect({"accs"}).word
            else:
                w2 = morph.parse(w2)[0].inflect({"nomn"}).word
                # gender agreement with nomn only
                gender = next_tags[2]
                if gender == "inan":
                    gender = next_tags[3]
                w1 = morph.parse(w1)[0].inflect({gender}).word
    if t1[:-2] == "adjective":
        if t2[:-2] == "property":
            # gender
            gender = next_tags[2]
            if gender == "inan":
                gender = next_tags[3]
            try:
                w1 = morph.parse(w1)[0].inflect({gender}).word
            except Exception:
                print("failed to inflect:", w1, w2)
    if t1[:-2] == "property":
        if t2[:-2] == "person":
            pass
        if t2[:-2] == "adjective":
            gender = cur_tags[2]
            if gender == "inan":
                gender = cur_tags[3]
            try:
                w2 = morph.parse(w2)[0].inflect({gender}).word
            except Exception:
                print("failed to inflect:", w1, w2)
    return w1, w2
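
# A minimal, hypothetical driver for agree(). The tag names ("person_1",
# "verb_right_1") are guessed from the t1[:-2] slicing above, and this
# dative_verbs set exists only for the example:
dative_verbs = {"помогать"}

# The person is inflected into the dative because the verb governs it:
# ("девочка", "помогать") -> ("девочке", "помогать")
print(agree("девочка", "помогать", "person_1", "verb_right_1"))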

def is_hint_need(self, text):
    # Hint is a Django model defined elsewhere in the project.
    hints = Hint.objects.all()
    morph = MorphAnalyzer()
    defined_words = [hint.defined_word for hint in hints]
    normal_defined_words = [morph.normal_forms(hint.defined_word)[0] for hint in hints]
    print("DEFINED WORDS:", normal_defined_words)
    recognized_words = []
    if isinstance(text, list):
        text = ','.join(text)
    words_of_text = re.split(r'\.|,|-|\?|\*| ', text)
    normal_words = [morph.normal_forms(word) for word in words_of_text]
    print("NORMAL WORDS:", normal_words)
    for list_of_normals in normal_words:
        for word in list_of_normals:
            if word in normal_defined_words:
                recognized_words.append(defined_words[normal_defined_words.index(word)])
    print("RECOGNIZED WORDS:", recognized_words)
    return recognized_words

def make_base(x):
    all_word_str = " ".join(x)
    all_word_list = all_word_str.split()
    all_unique_word = pd.Series(all_word_list).unique()
    lemmatized_word_dict = {}
    lemmatizer = MorphAnalyzer()
    for word in all_unique_word:
        lemmatized_word_dict[word] = lemmatizer.normal_forms(word)[0]
    # look up every token of the input (the original iterated an
    # undefined name `text` here)
    x_list = ' '.join([lemmatized_word_dict[word] for word in all_word_list])
    return x_list
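
# Example: make_base() takes an iterable of strings and returns one
# space-joined string of lemmas (exact lemmas depend on pymorphy2's ranking):
print(make_base(["коты ловили", "мышей"]))  # -> roughly 'кот ловить мышь'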

def __lemmatization_rus(txt_tokenized_filtered):
    morph_rus = MorphAnalyzer_rus()
    txt_lemmatizated = []
    for elem in txt_tokenized_filtered:
        elem = elem.strip()
        txt_lemmatizated.append(morph_rus.normal_forms(elem)[0])
    return txt_lemmatizated

def insert_resources(username='', l_resources=''):
    # add the resources to the database
    morph = MorphAnalyzer()
    for word in l_resources.split(','):
        word = str(word).strip()
        word = morph.normal_forms(word)[0]
        l_verbs = _d.get(word)
        _resource.insert_one({
            'username': username,
            'word': word,
            'verbs': l_verbs
        })

def lemmatize(text):
    """Reduce words to their normal (dictionary) form."""
    patterns = r"[A-Za-z0-9!#$%&'()*+,./:;<=>?@[\]^_`{|}~—\"\-]+"
    stopwords_ru = set(stopwords.words("russian")) - set(['ты'])
    morph = MorphAnalyzer()
    tokens = []
    for token in re.sub(patterns, ' ', text.lower()).split():
        if token and token not in stopwords_ru:
            token = token.strip()
            token = morph.normal_forms(token)[0]
            tokens.append(token)
    return tokens
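
# Example (requires NLTK's "stopwords" corpus and pymorphy2; the exact
# lemmas depend on pymorphy2's ranking):
print(lemmatize("Мама мыла раму!"))  # -> roughly ['мама', 'мыть', 'рама']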

def generate_processor(keep_alpha_only=True, to_lower=True, stopwords_langs=[],
                       add_stopwords=None, stemmer_langs=[], normalize_russian=False):
    """
    Return word predicate filter and processor

    Arguments:
    keep_alpha_only (bool) - keep only alpha symbols
    to_lower (bool) - convert to lower case
    stopwords_langs (list str) - filter stopwords for languages, use [] for no filtering (default: [])
    add_stopwords (list str) - additional stopwords to filter
    stemmer_langs (list str) - stem words using a stemmer per specified language, use [] for no stemming (default: [])
    normalize_russian (bool) - normalize Russian with pymorphy2 (default: False)
    """
    def idf(w):
        return w

    def truef(w):
        return True

    stops = [w.lower() for w in add_stopwords] if add_stopwords is not None else []
    for stopwords_lang in stopwords_langs:
        stops += stopwords.words(stopwords_lang)

    def _stop_func(w):
        return w.lower() not in stops

    is_not_stop = _stop_func if stops else truef
    is_alpha = (lambda w: w.isalpha()) if keep_alpha_only else truef

    stemmers = []
    for stemmer_lang in stemmer_langs:
        stemmer = SnowballStemmer(stemmer_lang)
        stemmers += [stemmer.stem]

    analyzer = MorphAnalyzer()
    normalize = (lambda w: analyzer.normal_forms(w)[0]) if normalize_russian else idf

    def stem(w):
        return reduce((lambda w, s: s(w)), stemmers, w)

    lower = (lambda w: w.lower()) if to_lower else idf
    return {
        "word_predicate": lambda w: is_not_stop(w) and is_alpha(w),
        "word_processor": lambda w: stem(normalize(lower(w)))
    }
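
# Example: a processor that lowercases, drops stopwords and non-alpha
# tokens, and lemmatizes Russian (no stemming). "и" is an NLTK Russian
# stopword; "123" fails isalpha():
proc = generate_processor(stopwords_langs=["russian"], normalize_russian=True)
words = ["Коты", "и", "собаки", "123"]
print([proc["word_processor"](w) for w in words if proc["word_predicate"](w)])
# -> roughly ['кот', 'собака']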

def lemmotize(word):
    morph = MorphAnalyzer()
    return morph.normal_forms(word)[0]
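
# MorphAnalyzer() reloads its dictionaries on every call above, which is
# slow; a common pattern is one module-level instance. The _morph name and
# the cached variant below are illustrative, not from the original:
_morph = MorphAnalyzer()

def lemmotize_cached(word):
    return _morph.normal_forms(word)[0]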

def standartize_text(text: str, analyzer: pymorphy2.MorphAnalyzer) -> str:
    res = [
        analyzer.normal_forms(w.lower())[0]
        for w in re.split(r"[\W\d]+", text)
    ]
    return ' '.join(res)
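
# Example: the analyzer is injected, so it can be built once and reused
# across calls; digits and punctuation act as separators:
morph = pymorphy2.MorphAnalyzer()
print(standartize_text("Коты ловили мышей 24 часа", morph))
# -> roughly 'кот ловить мышь час'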

import re

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
from tensorflow import keras
from tensorflow.keras import layers


class PreTrainingFiles:
    def __init__(self):
        nltk.download('stopwords')
        self.patterns = r"[A-Za-z0-9!#$%&'()*+,./:;<=>?@[\]^_`{|}~—\"\-]+"
        self.stopwords_ru = stopwords.words("russian")
        self.morph = MorphAnalyzer()
        self.ann_model = keras.Sequential()
        self.__preparing_data__()
        print("Data Prepared")

    def __preparing_data__(self):
        data_fr = pd.read_csv('dataframe.csv', delimiter=';')
        data_fr2 = pd.read_csv('dataframe2.csv', delimiter=';')
        data_fr3 = pd.read_csv('dataframe3.csv', delimiter=';')
        data_fr4 = pd.read_csv('dataframe4.csv', delimiter=';')
        data_fr = data_fr.merge(data_fr2, how='outer')
        data_fr = data_fr.merge(data_fr3, how='outer')
        data_fr = data_fr.merge(data_fr4, how='outer')
        self.data_frame = pd.Series(data_fr['Comment'])
        self.data_frame = self.data_frame.dropna().drop_duplicates()
        self.data_frame = self.data_frame.apply(self.__lemmatize__)
        self.data_frame = self.data_frame.dropna()

    def __lemmatize__(self, doc):
        doc = re.sub(self.patterns, ' ', doc)
        tokens = []
        for token in doc.split():
            if token and token not in self.stopwords_ru:
                token = token.strip()
                token = self.morph.normal_forms(token)[0]
                tokens.append(token)
        # keep only comments that still have more than two tokens
        if len(tokens) > 2:
            return tokens
        return None

    def convert_to_vec(self):
        self.__create_w2v_model__()
        print("W2v model is completed")
        (X, y) = self.split_sentence(self.data_frame)
        X_all = self.convert_x(X)
        y_all = self.convert_y(y)
        print("X and y are converted")
        self.create_ann()
        train_info = self.ann_model.fit(X_all, y_all, epochs=150, verbose=1)
        self.ann_model.save("worked_ann_model_big.h5")

    def convert_x(self, data):
        arr = np.zeros(shape=(np.shape(data)[0], 3, 70))
        print(arr.shape)
        for i in range(np.shape(data)[0]):
            try:
                # wv[list_of_3_words] returns a (3, 70) matrix of word vectors;
                # the original wrapped it in an extra list, which cannot be
                # assigned into a (3, 70) slot
                arr[i] = self.w2v_model.wv[data[i]]
            except KeyError:
                pass  # out-of-vocabulary word: leave the row as zeros
        return arr

    def convert_y(self, data):
        arr = np.zeros(shape=(np.shape(data)[0], 70))
        print(arr.shape)
        for i in range(np.shape(data)[0]):
            try:
                arr[i] = self.w2v_model.wv[data[i]]
            except KeyError:
                pass  # out-of-vocabulary word: leave the row as zeros
        return arr

    def split_sentence(self, data):
        # slide a window of three words over every comment; the fourth
        # word becomes the prediction target
        x = list()
        lst_y = list()
        for words in data:
            i = 0
            z = 3
            lst_x = list()
            while i < len(words):
                if i == z:
                    lst_y.append(words[i])
                    i = i - 2
                    z += 1
                    x.append(lst_x)
                    lst_x = list()
                lst_x.append(words[i])
                i += 1
        return [x, lst_y]

    def create_ann(self):
        self.ann_model = keras.Sequential()
        self.ann_model.add(layers.Input(shape=(3, 70)))
        self.ann_model.add(layers.BatchNormalization())
        self.ann_model.add(layers.Dropout(0.2))
        self.ann_model.add(layers.Dense(500, activation='sigmoid'))
        self.ann_model.add(layers.Dropout(0.2))
        self.ann_model.add(layers.BatchNormalization())
        self.ann_model.add(layers.Dense(210, activation='sigmoid'))
        self.ann_model.add(layers.BatchNormalization())
        self.ann_model.add(layers.LSTM(64))
        self.ann_model.add(layers.Dense(150, activation='sigmoid'))
        self.ann_model.add(layers.Dropout(0.2))
        self.ann_model.add(layers.BatchNormalization())
        self.ann_model.add(layers.Dense(70, activation='tanh'))
        self.ann_model.compile(
            loss='mean_squared_error',
            optimizer='nadam',
            metrics=[tf.keras.metrics.RootMeanSquaredError()])

    def __create_w2v_model__(self):
        # gensim < 4.0 API: `size` was renamed to `vector_size` in 4.0
        self.w2v_model = Word2Vec(min_count=3,
                                  window=3,
                                  size=70,
                                  negative=10,
                                  alpha=0.03,
                                  min_alpha=0.0007,
                                  sample=6e-5,
                                  sg=1)
        self.w2v_model.build_vocab(self.data_frame)
        self.w2v_model.train(self.data_frame,
                             total_examples=self.w2v_model.corpus_count,
                             epochs=300,
                             report_delay=1)

    def predict_next_word(self, sentence):
        data = self.preparing_data_for_predict(sentence)
        (x, y) = self.split_sentence(data)
        x = self.convert_x(x)
        y = self.convert_y(y)
        pred = self.ann_model.predict(x)
        for vec in pred:
            next_possible_words = self.w2v_model.wv.similar_by_vector(vec, topn=5)
        return next_possible_words

    def preparing_data_for_predict(self, sentence):
        data_frame = pd.Series(sentence)
        data_frame = data_frame.dropna().drop_duplicates()
        data_frame = data_frame.apply(self.__lemmatize__)
        data_frame = data_frame.dropna()
        return data_frame
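
# A hypothetical driver for the class above; it assumes the four
# dataframe*.csv files (each with a "Comment" column) exist next to the
# script and that the long Word2Vec/Keras training runs are acceptable:
if __name__ == "__main__":
    trainer = PreTrainingFiles()
    trainer.convert_to_vec()  # trains Word2Vec, then the Keras model
    print(trainer.predict_next_word(
        ["нейронная сеть предсказывает следующее слово в предложении"]))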

import collections
import re

import munkres
import nltk
from nltk.corpus import wordnet
from pymorphy2 import MorphAnalyzer

# `Dictionary` (a StarDict .ifo reader) comes from elsewhere in the original
# project; word_info and sentence_info are assumed to be records like these:
word_info = collections.namedtuple(
    'word_info', 'word sentence_num normal_form translate_list')
sentence_info = collections.namedtuple(
    'sentence_info', 'sentence sentence_words')


class Parallel_Translate:
    def __init__(self, input_ru, input_en):
        self.morph_ru = MorphAnalyzer()
        self.sentences_ru = self.Pars_sentences(input_ru)
        wordPattern_ru = re.compile("((?:[а-яА-ЯёЁ]+[-']?)*[а-яА-ЯёЁ]+)")
        self.sentences_list_ru = self.Create_Word_List(
            wordPattern_ru, self.sentences_ru,
            self.Normalize_ru, self.Translate_ru)
        self.word_list_ru = []
        self.sentences_en = self.Pars_sentences(input_en)
        self.dict_en_ru = Dictionary('Dict/ER-LingvoUniversal.ifo')
        wordPattern_en = re.compile("((?:[a-zA-Z]+[-']?)*[a-zA-Z]+)")
        self.sentences_list_en = self.Create_Word_List(
            wordPattern_en, self.sentences_en,
            self.Normalize_en, self.Translate_en)
        self.word_list_en = []
        self.Graph = self.Create_Graph()
        munkres_algorithm = munkres.Munkres()
        #self.word_matching = munkres_algorithm.compute(self.Graph)

    # Read the input file and split its text into sentences.
    def Pars_sentences(self, file_name):
        sentences_list = []
        with open(file_name, 'r') as input_file:
            file_str = input_file.read()
        sentences_tokenize = nltk.tokenize.PunktSentenceTokenizer()
        for sentence in sentences_tokenize.sentences_from_text(file_str):
            sentences_list.append(sentence)
        return sentences_list

    def Create_Word_List(self, wordPattern, sentences, Normalize, Translate):
        word_list = []
        sentence_num = 0
        sent_list = []
        for sentence in sentences:
            sentence_word_list = []
            for word in wordPattern.findall(sentence):
                word = word.strip()
                word = word.lower()
                n_word = Normalize(word)
                translate_list = Translate(n_word)
                w_info = word_info(word, sentence_num, n_word, translate_list)
                word_list.append(w_info)
                sentence_word_list.append(w_info)
            sent_list.append(sentence_info(sentence, sentence_word_list))
            sentence_num = sentence_num + 1
        return sent_list

    def Translate_ru(self, n_word):
        return []

    def Translate_en(self, n_word):
        self.re_for_entry = re.compile("<dtrn>(.*?)</dtrn>")
        valueWord = []
        try:
            for normal_word in n_word:
                for entry in self.dict_en_ru[normal_word]:
                    result_pars = self.ParsEntry(entry.data)
                    valueWord = valueWord + result_pars
        except KeyError:
            pass
        return valueWord

    def ParsEntry(self, entry_data):
        l = entry_data.split("<abr><i><c><co>")
        result_first_step = []
        for data in l:
            result_first_step = result_first_step + self.re_for_entry.findall(data)
        result_second_step = []
        for data in result_first_step:
            temp = data.split("<")
            if temp[0] != "":
                result_second_step.append(temp[0])
        result = []
        for data in result_second_step:
            for data_prom in data.split(","):
                result = result + data_prom.split(";")
        for i in range(len(result)):
            result[i] = result[i].strip()
        return result

    def Normalize_ru(self, word):
        n_word = self.morph_ru.normal_forms(word)
        if n_word:
            # return a one-element list so normal_form is iterable
            # word-by-word, like the result of Normalize_en
            return [n_word[0]]
        return []

    def Normalize_en(self, word):
        n_word = wordnet.morphy(word)
        if n_word:
            return [n_word]
        return []

    def Create_Graph(self):
        graph_matrix = [[0 for i in range(len(self.sentences_list_ru))]
                        for j in range(len(self.sentences_list_en))]
        koef = abs(len(self.sentences_list_en) - len(self.sentences_list_ru))
        sentence_num = 0
        for sentence in self.sentences_list_en:
            sentence_left_num = sentence_num
            sentence_right_num = sentence_num + 1
            while (sentence_left_num >= 0) and (sentence_num - sentence_left_num <= koef):
                sum_eq_words = 0
                for w_info in sentence.sentence_words:
                    for translate_word in w_info.translate_list:
                        for w_info_ru in self.sentences_list_ru[sentence_left_num].sentence_words:
                            for w_normal in w_info_ru.normal_form:
                                if w_normal == translate_word:
                                    sum_eq_words = sum_eq_words + 1
                # match count minus distance from the diagonal, negated for munkres
                graph_matrix[sentence_num][sentence_left_num] = -(
                    sum_eq_words - sentence_num + sentence_left_num)
                sentence_left_num -= 1  # step left; without this the loop never ends
            while (sentence_right_num < len(self.sentences_list_ru)) and (sentence_right_num - sentence_num <= koef):
                sum_eq_words = 0
                for w_info in sentence.sentence_words:
                    for translate_word in w_info.translate_list:
                        for w_info_ru in self.sentences_list_ru[sentence_right_num].sentence_words:
                            for w_normal in w_info_ru.normal_form:
                                if w_normal == translate_word:
                                    sum_eq_words = sum_eq_words + 1
                # assumed symmetric with the left-hand penalty
                graph_matrix[sentence_num][sentence_right_num] = -(
                    sum_eq_words - sentence_right_num + sentence_num)
                sentence_right_num += 1  # step right
            sentence_num += 1
        return graph_matrix
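
# Hypothetical usage of the aligner above; the file paths and the StarDict
# dictionary path are placeholders that must exist locally:
aligner = Parallel_Translate('texts/source_ru.txt', 'texts/source_en.txt')
print(aligner.Graph)  # negated match scores; feed to munkres.Munkres().compute()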