Code example #1
import re

from pymorphy2 import MorphAnalyzer


def agree(w1, w2, t1, t2):
    # t1/t2 are word-role tags; [:-2] below strips their two-character index suffix.
    # dative_verbs is assumed to be defined elsewhere in the module.
    if t1 == "comma" or t2 == "comma":
        return w1, w2

    morph = MorphAnalyzer()
    raw_cur_tags = morph.tag(w1)[-1]
    raw_next_tags = morph.tag(w2)[-1]

    cur_tags = re.findall(r"\w+", str(raw_cur_tags))
    next_tags = re.findall(r"\w+", str(raw_next_tags))

    if t1[:-2] == "person":
        if t2[:-2] == "verb_right":
            if morph.normal_forms(w2)[0] in dative_verbs:
                w1 = morph.parse(w1)[0].inflect({"datv"}).word

    if t1[:-2] == "verb_right":
        if t2[:-2] == "property":
            pass
        if t2[:-2] == "person":
            if cur_tags[3] == "tran":
                w2 = morph.parse(w2)[0].inflect({"accs"}).word
            else:
                w2 = morph.parse(w2)[0].inflect({"nomn"}).word
                # in the nominative branch, the verb (w1) takes the subject's gender
                gender = next_tags[2]
                if gender == "inan":
                    gender = next_tags[3]
                w1 = morph.parse(w1)[0].inflect({gender}).word

    if t1[:-2] == "adjective":
        if t2[:-2] == "property":
            # take the gender grammeme; if animacy ("inan") occupies that slot, use the next one
            gender = next_tags[2]
            if gender == "inan":
                gender = next_tags[3]
            try:
                w1 = morph.parse(w1)[0].inflect({gender}).word
            except Exception:
                print("f**k")
                print(w1, w2)

    if t1[:-2] == "property":
        if t2[:-2] == "person":
            pass
        if t2[:-2] == "adjective":
            gender = cur_tags[2]
            if gender == "inan":
                gender = cur_tags[3]
            try:
                w2 = morph.parse(w2)[0].inflect({gender}).word
            except Exception:
                print("f**k")
                print(w1, w2)


    #w1 = morph.parse(w1)[0].inflect({}).word
    return w1, w2
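
A minimal, self-contained sketch of the pymorphy2 parse/inflect/tag calls the function above relies on; the sample word is only an illustration:

import pymorphy2

morph = pymorphy2.MorphAnalyzer()
parsed = morph.parse("мама")[0]           # best-scoring parse of the word
print(parsed.normal_form)                 # -> "мама"
print(parsed.inflect({"datv"}).word)      # inflect into the dative case -> "маме"
print(morph.tag("мама"))                  # list of OpencorporaTag candidates, e.g. NOUN,anim,femn sing,nomn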
Code example #2
File: models.py Project: Belket/Profession
 def is_hint_need(self, text):
     hints = Hint.objects.all()
     morph = MorphAnalyzer()
     defined_words = [hint.defined_word for hint in hints]
     normal_defined_words = [morph.normal_forms(hint.defined_word)[0] for hint in hints]
     print("DEFINED WORDS:", normal_defined_words)
     recognized_words = []
     if isinstance(text, list):
         text = ','.join(text)
     words_of_text = re.split(r'\.|,|-|\?|\*| ', text)
     normal_words = [morph.normal_forms(word) for word in words_of_text]
     print("NORMAL WORDS:", normal_words)
     for list_of_normals in normal_words:
         for word in list_of_normals:
             if word in normal_defined_words:
                 recognized_words.append(defined_words[normal_defined_words.index(word)])
     print("RECOGNIZED WORDS:", recognized_words)
     return recognized_words
Code example #3
import pandas as pd
from pymorphy2 import MorphAnalyzer


def make_base(x):
    all_word_str = " ".join(x)
    all_word_list = all_word_str.split()
    all_unique_word = pd.Series(all_word_list).unique()
    lemmatized_word_dict = {}
    lemmatizer = MorphAnalyzer()
    for word in all_unique_word:
        lemmatized_word_dict[word] = lemmatizer.normal_forms(word)[0]
    x_list = ' '.join([lemmatized_word_dict[word] for word in all_word_list])
    return x_list
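
A hypothetical call to make_base; the input phrases and the shown output are illustrative only:

docs = ["мамы мыли раму", "рамы были чистые"]
print(make_base(docs))   # e.g. "мама мыть рама рама быть чистый"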
Code example #4
    def __lemmatization_rus(txt_tokenized_filtered):
        morph_rus = MorphAnalyzer_rus()

        txt_lemmatizated = []

        for elem in txt_tokenized_filtered:
            elem = elem.strip()
            txt_lemmatizated.append(morph_rus.normal_forms(elem)[0])

        return txt_lemmatizated
Code example #5
def insert_resources(username='', l_resources=''):
    # add the resources to the database
    morph = MorphAnalyzer()
    for word in l_resources.split(','):
        word = str(word).strip()
        word = morph.normal_forms(word)[0]
        l_verbs = _d.get(word)
        _resource.insert_one({
            'username': username,
            'word': word,
            'verbs': l_verbs
        })
Code example #6
 def lemmatize(text):
     """
     Приведение слов в нормальню форму
     """
     patterns = "[A-Za-z0-9!#$%&'()*+,./:;<=>?@[\]^_`{|}~—\"\-]+"
     stopwords_ru = set(stopwords.words("russian")) - set(['ты'])
     morph = MorphAnalyzer()
     tokens = []
     for token in re.sub(patterns, ' ', text.lower()).split():
         if token and token not in stopwords_ru:
             token = token.strip()
             token = morph.normal_forms(token)[0]
             tokens.append(token)
     return tokens
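
A hypothetical call (it assumes the NLTK Russian stopword list has already been downloaded); the sample sentence and output are illustrative:

print(lemmatize("Ты видел новые дома?"))   # e.g. ['ты', 'видеть', 'новый', 'дом']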
Code example #7
File: corpus.py Project: xbphat/ap-sample
def generate_processor(keep_alpha_only=True, to_lower=True,
                       stopwords_langs=[], add_stopwords=None,
                       stemmer_langs=[],
                       normalize_russian=False):
    """
    Return word predicate filter and processor

    Arguments:
    keep_alpha_only (bool) - keep only alpha symbols
    to_lower (bool) - convert to lower case
    stopwords_langs (list str) - filter stopwords for these languages, use [] for no filtering (default: [])
    add_stopwords (list str) - additional stopwords to filter
    stemmer_langs (list str) - stem words with a SnowballStemmer for each listed language, use [] for no stemming (default: [])
    normalize_russian (bool) - normalize Russian with pymorphy2 (default: False)
    """

    def idf(w): return w

    def truef(w): return True

    stops = [w.lower() for w in add_stopwords] if add_stopwords is not None else []
    for stopwords_lang in stopwords_langs:
        stops += stopwords.words(stopwords_lang)

    def _stop_func(w):
        return w.lower() not in stops
    
    is_not_stop = _stop_func if stops else truef
    is_alpha = (lambda w: w.isalpha()) if keep_alpha_only else truef

    stemmers = []
    for stemmer_lang in stemmer_langs:
        stemmer = SnowballStemmer(stemmer_lang)
        stemmers += [stemmer.stem]

    analyzer = MorphAnalyzer()
    normalize = (lambda w: analyzer.normal_forms(w)[0]) if normalize_russian else idf

    def stem(w): return reduce((lambda w, s: s(w)), stemmers, w)

    lower = (lambda w: w.lower()) if to_lower else idf

    return {
        "word_predicate": lambda w: is_not_stop(w) and is_alpha(w),
        "word_processor": lambda w: stem(normalize(lower(w)))
    }
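
A minimal usage sketch of the returned predicate/processor pair; the token list is illustrative and the NLTK stopword corpus is assumed to be available:

proc = generate_processor(stopwords_langs=["russian"], normalize_russian=True)
tokens = ["Коты", "и", "123", "стены"]
cleaned = [proc["word_processor"](w) for w in tokens if proc["word_predicate"](w)]
print(cleaned)   # e.g. ['кот', 'стена'] - "и" is dropped as a stopword, "123" is not alphabetic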
Code example #8
def lemmotize(word):
    # Note: constructing a MorphAnalyzer is relatively expensive; reuse one instance for repeated calls.
    morph = MorphAnalyzer()
    return morph.normal_forms(word)[0]
Code example #9
def standartize_text(text: str, analyzer: pymorphy2.MorphAnalyzer) -> str:
    res = [
        analyzer.normal_forms(w.lower())[0] for w in re.split(r"[\W\d]+", text)
    ]
    return ' '.join(res)
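
A hypothetical call; the sample text and the shown result are illustrative:

import pymorphy2

analyzer = pymorphy2.MorphAnalyzer()
print(standartize_text("Коты спали на крышах 3 дня", analyzer))   # e.g. "кот спать на крыша день"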
Code example #10
class PreTrainingFiles:
    def __init__(self):
        nltk.download('stopwords')
        self.patterns = r"[A-Za-z0-9!#$%&'()*+,./:;<=>?@[\]^_`{|}~—\"\-]+"
        self.stopwords_ru = stopwords.words("russian")
        self.morph = MorphAnalyzer()
        self.ann_model = keras.Sequential()
        self.__preparing_data__()
        print("Data Prepared")

    def __preparing_data__(self):
        data_fr = pd.read_csv('dataframe.csv', delimiter=';')
        data_fr2 = pd.read_csv('dataframe2.csv', delimiter=';')
        data_fr3 = pd.read_csv('dataframe3.csv', delimiter=';')
        data_fr4 = pd.read_csv('dataframe4.csv', delimiter=';')
        data_fr = data_fr.merge(data_fr2, how='outer')
        data_fr = data_fr.merge(data_fr3, how='outer')
        data_fr = data_fr.merge(data_fr4, how='outer')
        self.data_frame = pd.Series(data_fr['Comment'])
        self.data_frame = self.data_frame.dropna().drop_duplicates()
        self.data_frame = self.data_frame.apply(self.__lemmatize__)
        self.data_frame = self.data_frame.dropna()

    def __lemmatize__(self, doc):
        doc = re.sub(self.patterns, ' ', doc)
        tokens = []
        for token in doc.split():
            if token and token not in self.stopwords_ru:
                token = token.strip()
                token = self.morph.normal_forms(token)[0]
                tokens.append(token)
        if len(tokens) > 2:
            return tokens
        return None

    def convert_to_vec(self):
        self.__create_w2v_model__()
        print("W2v model is comlited")
        (X, y) = self.split_sentence(self.data_frame)
        X_all = self.convert_x(X)
        y_all = self.convert_y(y)
        print("X and y is converted")
        self.create_ann()
        train_info = self.ann_model.fit(X_all, y_all, epochs=150, verbose=1)
        self.ann_model.save("worked_ann_model_big.h5")

    def convert_x(self, data):
        arr = np.zeros(shape=(np.shape(data)[0], 3, 70))
        print(arr.shape)
        for i in range(np.shape(data)[0]):
            try:
                # Look up the three-word window in the w2v vocabulary (shape (3, 70)).
                arr[i] = self.w2v_model.wv[data[i]]
            except KeyError:
                arr[i] = np.zeros(shape=(3, 70))
        return arr

    def convert_y(self, data):
        arr = np.zeros(shape=(np.shape(data)[0], 70))
        print(arr.shape)
        for i in range(np.shape(data)[0]):
            try:
                # Look up the target word's vector (shape (70,)).
                arr[i] = self.w2v_model.wv[data[i]]
            except KeyError:
                arr[i] = np.zeros(shape=(70,))
        return arr

    def split_sentence(self, data):
        x = list()
        lst_x = list()
        lst_y = list()
        for words in data:
            i = 0
            z = 3
            lst_x = list()
            while i < len(words):
                if i == z:
                    lst_y.append(words[i])
                    i = i - 2
                    z += 1
                    x.append(lst_x)
                    lst_x = list()
                lst_x.append(words[i])
                i += 1
        return [x, lst_y]

    def create_ann(self):
        self.ann_model = keras.Sequential()
        self.ann_model.add(layers.Input(shape=(3, 70)))
        self.ann_model.add(keras.layers.BatchNormalization())
        self.ann_model.add(layers.Dropout(0.2))
        self.ann_model.add(layers.Dense(500, activation='sigmoid'))
        self.ann_model.add(layers.Dropout(0.2))
        self.ann_model.add(keras.layers.BatchNormalization())
        self.ann_model.add(layers.Dense(210, activation='sigmoid'))
        self.ann_model.add(keras.layers.BatchNormalization())
        self.ann_model.add(layers.LSTM(64))
        self.ann_model.add(layers.Dense(150, activation='sigmoid'))
        self.ann_model.add(layers.Dropout(0.2))
        self.ann_model.add(keras.layers.BatchNormalization())
        self.ann_model.add(layers.Dense(70, activation='tanh'))
        self.ann_model.compile(
            loss='mean_squared_error',
            optimizer='nadam',
            metrics=[tf.keras.metrics.RootMeanSquaredError()])

    def __create_w2v_model__(self):
        self.w2v_model = Word2Vec(min_count=3,
                                  window=3,
                                  size=70,
                                  negative=10,
                                  alpha=0.03,
                                  min_alpha=0.0007,
                                  sample=6e-5,
                                  sg=1)
        self.w2v_model.build_vocab(self.data_frame)
        self.w2v_model.train(self.data_frame,
                             total_examples=self.w2v_model.corpus_count,
                             epochs=300,
                             report_delay=1)

    def predict_next_word(self, sentence):
        data = self.preparing_data_for_predict(sentence)
        (x, y) = self.split_sentence(data)
        x = self.convert_x(x)
        y = self.convert_y(y)
        pred = self.ann_model.predict(x)
        for vec in pred:
            next_possible_words = self.w2v_model.wv.similar_by_vector(vec,
                                                                      topn=5)
        return next_possible_words

    def preparing_data_for_predict(self, sentence):
        data_frame = pd.Series(sentence)
        data_frame = data_frame.dropna().drop_duplicates()
        data_frame = data_frame.apply(self.__lemmatize__)
        data_frame = data_frame.dropna()
        return data_frame
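
For clarity, split_sentence above turns each lemmatized comment into sliding 3-word windows with the following word as the prediction target; a rough standalone equivalent (sample tokens are placeholders):

words = ["я", "иду", "в", "магазин", "за", "хлебом"]
X, y = [], []
for i in range(len(words) - 3):
    X.append(words[i:i + 3])   # three consecutive lemmas as the input window
    y.append(words[i + 3])     # the next lemma as the target
print(X)   # [['я', 'иду', 'в'], ['иду', 'в', 'магазин'], ['в', 'магазин', 'за']]
print(y)   # ['магазин', 'за', 'хлебом']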
Code example #11
File: main.py Project: antonfait/SerchEngine
class Parallel_Translate:
    def __init__(self, input_ru, input_en):

        self.morph_ru = MorphAnalyzer()

        self.sentences_ru = self.Pars_sentences( input_ru )
        wordPattern_ru = re.compile( "((?:[а-яА-ЯёЁ]+[-']?)*[а-яА-ЯёЁ]+)" )
        self.sentences_list_ru = self.Create_Word_List( wordPattern_ru, self.sentences_ru,
                                                   self.Normalize_ru, self.Translate_ru )
        self.word_list_ru = []

        self.sentences_en = self.Pars_sentences( input_en )
        self.dict_en_ru = Dictionary('Dict/ER-LingvoUniversal.ifo')
        wordPattern_en = re.compile("((?:[a-zA-Z]+[-']?)*[a-zA-Z]+)")
        self.sentences_list_en = self.Create_Word_List( wordPattern_en, self.sentences_en,
                                                   self.Normalize_en, self.Translate_en )
        self.word_list_en = []
        self.Graph = self.Create_Graph()

        munkres_algorithm = munkres.Munkres()
        #self.word_matching = munkres_algorithm.compute( self.Graph )




# Read the text from the input file and split it into sentences
    def Pars_sentences(self,file_name ) :
        sentences_list = []

        with open(file_name, 'r') as input_file:
            file_str = input_file.read()
            sentences_tokenize = nltk.tokenize.PunktSentenceTokenizer()
            for sentence in sentences_tokenize.sentences_from_text( file_str ):
                sentences_list.append(  sentence )

        return sentences_list



    def Create_Word_List(self, wordPattern, sentences, Normalize, Translate ):
        word_list = []
        sentence_num = 0
        sent_list = []
        for sentence in sentences:
            sentence_word_list = []
            for word in wordPattern.findall( sentence ):
                word = word.strip()
                word = word.lower()
                n_word = Normalize( word )
                translate_list = Translate( n_word )
                w_info = word_info( word, sentence_num, n_word, translate_list )
                word_list.append( w_info )
                sentence_word_list.append(w_info)
            sent_list.append( sentence_info( sentence, sentence_word_list ) )
            sentence_num= sentence_num + 1
        return sent_list



    def Translate_ru( self, n_word ):
        # Translating Russian words is not implemented; only the EN->RU direction is used.
        return []

    def Translate_en( self, n_word ):

        self.re_for_entry = re.compile("<dtrn>(.*?)</dtrn>")

        valueWord = []
        try:
            for normal_word in n_word:
                for entry in self.dict_en_ru[ normal_word ]:
                    result_pars = self.ParsEntry( entry.data )
                    valueWord = valueWord + result_pars
        except KeyError:
            pass
        return valueWord

    def ParsEntry( self, entry_data  ) :
        l = entry_data.split( "<abr><i><c><co>" )
        result_first_step = []
        for data in l:
            result_first_step = result_first_step + self.re_for_entry.findall(data)
        result_second_step = []
        for data in result_first_step:
            temp = data.split("<")
            if temp[0] != "":
                result_second_step.append(temp[0])
        result = []
        for data in result_second_step:
            for data_prom in data.split(","):
                result = result + data_prom.split(";")
        for i in range( len( result ) ):
            result[i] = result[i].strip()
        return result


    def Normalize_ru( self, word ):
        # Return a one-element list so callers can iterate over normal forms, matching Normalize_en.
        n_word = self.morph_ru.normal_forms( word )
        if n_word:
            return [ n_word[0] ]
        else:
            return []

    def Normalize_en( self, word ):
        n_word = wordnet.morphy( word )
        if n_word:
            return [ n_word ]
        else:
            return []

    def Create_Graph(self):
        graph_matrix = [ [ 0 for i in range( len( self.sentences_list_ru ) ) ]
                            for j in range( len( self.sentences_list_en ) ) ]
        koef = abs( len( self.sentences_list_en ) - len( self.sentences_list_ru ) )
        sentence_num = 0
        for sentence in self.sentences_list_en:

            sentence_left_num = sentence_num
            sentence_right_num = sentence_num + 1

            # Scan Russian sentences to the left within the allowed offset and score each pair.
            while (sentence_left_num >= 0) and (sentence_num - sentence_left_num <= koef):
                sum_eq_words = 0
                for w_info in sentence.sentence_words:
                    for translate_word in w_info.translate_list:
                        for w_info_ru in self.sentences_list_ru[sentence_left_num].sentence_words:
                            for w_normal in w_info_ru.normal_form:
                                if w_normal == translate_word:
                                    sum_eq_words = sum_eq_words + 1
                # Negative cost: more shared words and a smaller offset mean a cheaper assignment.
                graph_matrix[sentence_num][sentence_left_num] = -( sum_eq_words - sentence_num + sentence_left_num )
                sentence_left_num = sentence_left_num - 1

            # Scan Russian sentences to the right within the allowed offset.
            while (sentence_right_num < len( self.sentences_list_ru )) and (sentence_right_num - sentence_num <= koef):
                sum_eq_words = 0
                for w_info in sentence.sentence_words:
                    for translate_word in w_info.translate_list:
                        for w_info_ru in self.sentences_list_ru[sentence_right_num].sentence_words:
                            for w_normal in w_info_ru.normal_form:
                                if w_normal == translate_word:
                                    sum_eq_words = sum_eq_words + 1
                graph_matrix[sentence_num][sentence_right_num] = -( sum_eq_words - sentence_right_num + sentence_num )
                sentence_right_num = sentence_right_num + 1

            sentence_num = sentence_num + 1

        return graph_matrix
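
The commented-out munkres call in __init__ would consume a cost matrix like the one Create_Graph builds; a minimal standalone sketch of that Hungarian-assignment step (the matrix values are made up):

import munkres

cost_matrix = [
    [-4, -1,  0],   # negative shared-word counts, so minimizing cost maximizes matches
    [-1, -5, -2],
    [ 0, -2, -6],
]
indexes = munkres.Munkres().compute(cost_matrix)
print(indexes)   # [(0, 0), (1, 1), (2, 2)] - each English sentence paired with its best Russian sentence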