import random
import re

import joblib
import numpy as np
import pymorphy2
from catboost import CatBoostClassifier
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Project-local helpers; the import path is an assumption.
from solvers.utils import AbstractSolver, standardize_task


class Solver(AbstractSolver):
    def __init__(self, seed=42):
        self.seed = seed
        self.init_seed()
        self.tokenizer = ToktokTokenizer()
        self.morph = pymorphy2.MorphAnalyzer()
        self.count_vectorizer = CountVectorizer(ngram_range=(1, 4), tokenizer=str.split)
        self.classifier = CatBoostClassifier(verbose=0, use_best_model=True)
        super().__init__()

    def init_seed(self):
        return random.seed(self.seed)

    def strs_to_pos_tags(self, texts):
        # Replace each token with its pymorphy2 POS tag; tokens without a POS
        # (punctuation) become "PNCT".
        result = []
        for text in texts:
            result.append(' '.join([
                "PNCT" if self.morph.parse(word)[0].tag.POS is None
                else self.morph.parse(word)[0].tag.POS
                for word in self.tokenizer.tokenize(text)
            ]))
        return result

    def save(self, path="data/models/solver16.pkl"):
        model = {
            "count_vectorizer": self.count_vectorizer,
            "classifier": self.classifier
        }
        joblib.dump(model, path)

    def load(self, path="data/models/solver16.pkl"):
        model = joblib.load(path)
        self.count_vectorizer = model["count_vectorizer"]
        self.classifier = model["classifier"]

    def fit(self, tasks):
        X, y = [], []
        for task in tasks:
            task = standardize_task(task)
            # The gold answer(s) as a list of sentence-number strings.
            correct = (task["solution"]["correct_variants"][0]
                       if "correct_variants" in task["solution"]
                       else [task["solution"]["correct"]])
            sentences = [
                re.sub(r"^\d\) ?", "", sentence['text'])
                for sentence in task["question"]["choices"]
            ]
            sentences = self.strs_to_pos_tags(sentences)
            X.extend(sentences)
            y.extend([
                1 if str(i + 1) in correct else 0
                for i in range(len(sentences))
            ])
        X = self.count_vectorizer.fit_transform(X).toarray()
        X_train, X_dev, y_train, y_dev = train_test_split(X, y, train_size=0.9)
        self.classifier.fit(X_train, y_train, eval_set=(X_dev, y_dev))

    def predict_from_model(self, task):
        task = standardize_task(task)
        sentences = [
            re.sub(r"^\d\) ?", "", sentence['text'])
            for sentence in task["question"]["choices"]
        ]
        sentences = self.strs_to_pos_tags(sentences)
        vector = self.count_vectorizer.transform(sentences).toarray()
        proba = self.classifier.predict_proba(vector)[:, 1]
        # The task asks for exactly two sentences, so keep the two top scores.
        two_highest = sorted([str(i + 1) for i in np.argsort(proba)[-2:]])
        return two_highest
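
# Usage sketch (assumptions: a model trained via fit() was saved earlier, and
# "data/task16_example.json" is a hypothetical file in the repo's task format).
if __name__ == "__main__":
    import json

    solver = Solver()
    solver.load()  # data/models/solver16.pkl
    with open("data/task16_example.json", encoding="utf-8") as f:
        task = json.load(f)
    print(solver.predict_from_model(task))  # two sentence numbers, e.g. ['1', '5']
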
import random
from difflib import get_close_matches
from string import punctuation

import pymorphy2
# The nltk import path is an assumption; plain nltk's ToktokTokenizer has no
# sentenize(), so the original likely used a project-local wrapper.
from nltk.tokenize.toktok import ToktokTokenizer

# Project-local BERT wrapper; the import path is an assumption.
from solvers.utils import BertEmbedder


class Solver(BertEmbedder):
    def __init__(self, seed=42):
        super(Solver, self).__init__()
        self.morph = pymorphy2.MorphAnalyzer()
        self.seed = seed
        self.init_seed()
        self.toktok = ToktokTokenizer()
        self.paronyms = self.get_paronyms()

    def init_seed(self):
        return random.seed(self.seed)

    def get_paronyms(self):
        # data/paronyms.csv: one tab-separated paronym pair per line.
        paronyms = []
        with open('data/paronyms.csv', 'r', encoding='utf-8') as in_file:
            for line in in_file.readlines():
                pair = line.strip(punctuation).strip().split('\t')
                paronyms.append(pair)
        return paronyms

    def lemmatize(self, token):
        token_all = self.morph.parse(token.lower().rstrip('.,/;!:?'))[0]
        lemma = token_all.normal_form
        return lemma

    def find_closest_paronym(self, par):
        # Fuzzy fallback for when the exact lemma is not in the paronym list.
        paronyms = set()
        for par1, par2 in self.paronyms:
            paronyms.add(par1)
            paronyms.add(par2)
        try:
            closest = get_close_matches(par, list(paronyms))[0]
        except IndexError:
            closest = None
        return closest

    def check_pair(self, token_norm):
        paronym = None
        for p1, p2 in self.paronyms:
            if token_norm == p1:
                paronym = p2
                break
            if token_norm == p2:
                paronym = p1
                break
        return paronym

    def find_paronyms(self, token):
        token_all = self.morph.parse(token.lower().rstrip('.,/;!:?'))[0]
        token_norm = token_all.normal_form
        paronym = self.check_pair(token_norm)
        if paronym is None:
            paronym_close = self.find_closest_paronym(token_norm)
            paronym = self.check_pair(paronym_close)
        if paronym is not None:
            # Inflect the paronym into the grammatical form of the original token.
            paronym_parse = self.morph.parse(paronym)[0]
            try:
                str_grammar = str(token_all.tag).split()[1]
            except IndexError:
                str_grammar = str(token_all.tag)
            gr = set(str_grammar.replace("Qual ", "").replace(' ', ',').split(','))
            try:
                final_paronym = paronym_parse.inflect(gr).word
            except AttributeError:
                final_paronym = paronym
        else:
            final_paronym = ''
        return final_paronym

    def predict(self, task):
        return self.predict_from_model(task)

    def fit(self, tasks):
        pass

    def load(self, path="data/models/solver5.pkl"):
        pass

    def save(self, path="data/models/solver5.pkl"):
        pass

    def get_score(self, a, b, paronym):
        # Score a candidate by BERT masked-token filling in its context.
        # (An alternative via cosine similarity of sentence embeddings sat
        # unreachably after this return in the original.)
        return self.fill_mask(a, b, paronym.lower())

    def predict_from_model(self, task):
        description = task["text"].replace('НЕВЕРНО ', "неверно ")
        sents = []
        for line in self.toktok.sentenize(description):
            line_tok = self.toktok.tokenize(line)
            for idx, token in enumerate(line_tok):
                if token.isupper() and len(token) > 2:  # candidate paronyms are in CAPS
                    second_pair = self.find_paronyms(token)
                    line_before = ' '.join(line_tok[:idx])
                    line_after = ' '.join(line_tok[idx + 1:])
                    if second_pair != '':
                        # Positive score: the given word fits better than its paronym.
                        score = (self.get_score(line_before, line_after, token)
                                 - self.get_score(line_before, line_after, second_pair))
                        sents.append((score, token, second_pair))
        sents.sort()
        # The lowest-scoring CAPS word is the misused one; answer with its paronym.
        return sents[0][2].strip(punctuation + '\n')
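
# Hedged smoke test: requires the BERT weights wired into BertEmbedder and
# data/paronyms.csv; the task text is a made-up example, not a real exam item.
if __name__ == "__main__":
    solver = Solver()
    task = {"text": "Выпишите слово, употреблённое неверно. "
                    "Он ОДЕЛ куртку и вышел на улицу."}
    print(solver.predict(task))  # the corrected paronym, e.g. "надел"
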
import operator
import random
import re

import pymorphy2
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.metrics.pairwise import cosine_similarity

# Project-local BERT wrapper; the import path is an assumption.
from solvers.utils import BertEmbedder


class Solver(BertEmbedder):
    def __init__(self, seed=42):
        super(Solver, self).__init__()
        self.is_train_task = False
        self.morph = pymorphy2.MorphAnalyzer()
        self.toktok = ToktokTokenizer()
        self.seed = seed
        self.init_seed()

    def init_seed(self):
        random.seed(self.seed)

    def predict(self, task):
        return self.predict_from_model(task)

    def get_num(self, text):
        # How many sentences does the task wording ask for?
        lemmas = [self.morph.parse(word)[0].normal_form
                  for word in self.toktok.tokenize(text)]
        if 'указывать' in lemmas and 'предложение' in lemmas:
            w = lemmas[lemmas.index('указывать') + 1]  # word right after "указывать"
            d = {'один': 1, 'два': 2, 'три': 3, 'четыре': 4, 'предложение': 1}
            if w in d:
                return d[w]
        elif 'указывать' in lemmas and 'вариант' in lemmas:
            return 'unknown'
        return 1

    def compare_text_with_variants(self, text, variants, num=1):
        # Rank answer variants by cosine similarity to the text embedding.
        text_vector = self.sentence_embedding([text])
        variant_vectors = self.sentence_embedding(variants)
        predictions = {}
        for i, vec in enumerate(variant_vectors):
            predictions[i] = cosine_similarity(text_vector[0].reshape(1, -1),
                                               vec.reshape(1, -1)).flatten()[0]
        indexes = sorted(predictions.items(), key=operator.itemgetter(1),
                         reverse=True)[:num]
        return sorted([str(i[0] + 1) for i in indexes])

    def sent_split(self, text):
        reg = r'\(*\d+\)'
        return re.split(reg, text)

    def process_task(self, task):
        first_phrase, task_text = re.split(r'\(*1\)', task['text'])[:2]
        variants = [t['text'] for t in task['question']['choices']]
        text, task = "", ""
        if 'Укажите' in task_text:
            text, task = re.split('Укажите ', task_text)
            task = 'Укажите ' + task
        elif 'Укажите' in first_phrase:
            text, task = task_text, first_phrase
        return text, task, variants

    def fit(self, tasks):
        pass

    def load(self, path=""):
        pass

    def save(self, path=''):
        pass

    def predict_from_model(self, task, num=2):
        text, task, variants = self.process_task(task)
        result = self.compare_text_with_variants(text, variants, num=num)
        return result
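
# Toy example in the assumed JSON shape: the text is split on "(1)", "(2)" ...
# markers, and each choice is ranked by embedding similarity to the text.
if __name__ == "__main__":
    toy = {
        "text": "(1)Первое предложение текста. (2)Второе предложение текста. "
                "Укажите два предложения, в которых верно передана главная "
                "информация, содержащаяся в тексте.",
        "question": {"choices": [{"text": "1) первый вариант"},
                                 {"text": "2) второй вариант"}]},
    }
    print(Solver().predict(toy))  # two choice numbers, e.g. ['1', '2']
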
import os
import random
import re

import pymorphy2
from nltk.tokenize.toktok import ToktokTokenizer


class Solver(object):
    def __init__(self, seed=42, data_path='data/'):
        self.is_train_task = False
        self.morph = pymorphy2.MorphAnalyzer()
        self.toktok = ToktokTokenizer()
        self.seed = seed
        self.init_seed()
        # Word lists: synonyms/antonyms hold rows of alternatives,
        # phraseologs.txt holds one idiom per line, lemmatized below.
        self.synonyms = open(os.path.join(data_path, 'synonyms.txt'),
                             'r', encoding='utf8').readlines()
        self.synonyms = [re.sub(r'\.', '', t.lower().strip('\n')).split(' ')
                         for t in self.synonyms]
        self.synonyms = [[t for t in l if t] for l in self.synonyms]
        self.antonyms = open(os.path.join(data_path, 'antonyms.txt'),
                             'r', encoding='utf8').readlines()
        self.antonyms = [t.strip(' \n').split(' - ') for t in self.antonyms]
        self.phraseology = open(os.path.join(data_path, 'phraseologs.txt'),
                                'r', encoding='utf8').readlines()
        self.phraseology = [
            [l for l in self.lemmatize(l)
             if l not in ['\n', ' ', '...', '', ',', '-', '.', '?', ' (', '/']]
            for l in self.phraseology
        ]

    def init_seed(self):
        random.seed(self.seed)

    def lemmatize(self, text):
        return [self.morph.parse(word)[0].normal_form
                for word in self.toktok.tokenize(text.strip())]

    def predict(self, task):
        return self.predict_from_model(task)

    def get_word(self, text):
        # Pull the word quoted in «...», if any.
        try:
            return re.split('»', re.split('«', text)[1])[0]
        except IndexError:
            return ''

    def get_pos(self, text):
        # Decide what the task asks for: phraseology, synonym, antonym,
        # or a quoted word ("DEF").
        lemmas = [l for l in self.lemmatize(text) if l != ' ']
        if 'фразеологизм' in lemmas:
            return "PHR"
        if 'синоним' in lemmas or 'синонимический' in lemmas:
            return "SYN"
        if 'антоним' in lemmas or 'антонимический' in lemmas:
            return "ANT"
        return "DEF"

    def full_intersection(self, small_lst, big_lst):
        return all(value in big_lst for value in small_lst)

    def sent_split(self, text):
        reg = r'\(*\n*\d+\n*\)'
        return re.split(reg, text)

    def search(self, text_lemmas, lst):
        for l in lst:
            if self.full_intersection(l, text_lemmas):
                return ''.join(l)
        return ''

    def get_num(self, text):
        # Parse a sentence number or an "N–M" range from the task wording.
        nums = 0
        res = re.search(r'\d+–*-*\d*', text)
        if res:
            res = res[0]
            if '–' in res:
                nums = res.split('–')
                nums = list(range(int(nums[0]), int(nums[1]) + 1))
            elif '-' in res:
                nums = res.split('-')
                nums = list(range(int(nums[0]), int(nums[1]) + 1))
            else:
                nums = [int(res)]
        return nums

    def compare_text_with_variants(self, pos, text, nums=[], word=''):
        sents = self.sent_split(text)
        lemmas_all = []
        for s in nums:
            lemmas = self.lemmatize(sents[s - 1])
            lemmas_all += [l for l in lemmas if l != ' ']
        # Keep only purely alphanumeric lemmas.
        lemmas_all = [l for l in lemmas_all if re.fullmatch(r'\w+', l)]
        if pos == 'SYN':
            variant = self.search(lemmas_all, self.synonyms)
        elif pos == 'ANT':
            variant = self.search(lemmas_all, self.antonyms)
        else:
            variant = self.search(lemmas_all, self.phraseology)
        if variant:
            return variant
        # Fall back to a random word from the referenced sentences.
        return str(random.choice(lemmas_all))

    def eat_json(self, task):
        try:
            firstphrase, tasktext = re.split(r'\(\n*1\n*\)', task['text'])
        except ValueError:
            firstphrase, tasktext = (
                ' '.join(re.split(r'\(\n*1\n*\)', task['text'])[:-1]),
                re.split(r'\(\n*1\n*\)', task['text'])[-1])
        if 'Из предложени' in tasktext:
            text, task = re.split('Из предложени', tasktext)
            task = 'Из предложени ' + task
        else:
            text, task = tasktext, firstphrase
        nums = self.get_num(task)
        pos = self.get_pos(task)
        word = ''
        if pos == 'DEF':
            word = self.get_word(task)
        return text, task, pos, nums, word

    def fit(self, tasks):
        pass

    def load(self, path='data/models/solver24.pkl'):
        pass

    def save(self, path='data/models/solver24.pkl'):
        pass

    def predict_from_model(self, task):
        text, task, pos, nums, word = self.eat_json(task)
        result = self.compare_text_with_variants(pos, text, nums=nums, word=word)
        return result
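
# Schematic usage: assumes data/synonyms.txt, data/antonyms.txt and
# data/phraseologs.txt exist; the task text is invented for illustration.
if __name__ == "__main__":
    solver = Solver()
    task = {"text": "(1)Большой дом стоял у реки. (2)Огромный сад окружал его. "
                    "Из предложений 1–2 выпишите синонимы."}
    print(solver.predict(task))
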
import os
import random

import numpy as np
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Project-local helper; the import path is an assumption.
from utils import read_config


class SubSolver(object):
    """Classifier that routes exam tasks to task numbers.

    Works on Tf-idf vectors with a multiclass SVM.

    Parameters
    ----------
    seed : int, optional (default=42)
        Random seed.
    ngram_range : tuple, optional (default=(1, 3))
        Tuple (min_n, max_n) used for TfidfVectorizer: the lower and upper
        boundary of the range of n-values for different n-grams to be
        extracted. All values of n such that min_n <= n <= max_n will be used.
    num_tasks : int, optional (default=27)
        Count of all tasks.

    Examples
    --------
    >>> # Basic usage
    >>> from solvers import classifier
    >>> import json
    >>> from utils import read_config
    >>> clf = classifier.Solver()
    >>> tasks = []
    >>> dir_path = "data/"
    >>> for file_name in os.listdir(dir_path):
    ...     if file_name.endswith(".json"):
    ...         data = read_config(os.path.join(dir_path, file_name))
    ...         tasks.append(data)
    >>> clf = clf.fit(tasks)
    >>> # Predict for the last file in the dir
    >>> numbers_of_tasks = clf.predict(read_config(os.path.join(dir_path, file_name)))
    >>> numbers_of_tasks
    array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 12, 12, 13, 14, 15, 16, 17,
           18, 19, 17, 21, 22, 23, 24, 25, 26, 24])
    >>> # Save classifier
    >>> clf.save("clf.pickle")
    >>> # Load classifier
    >>> clf.load("clf.pickle")
    """

    def __init__(self, t='text', seed=42, ngram_range=(1, 3)):
        self.seed = seed
        self.ngram_range = ngram_range
        self.vectorizer = TfidfVectorizer(ngram_range=ngram_range)
        self.vectorizer2 = TfidfVectorizer(ngram_range=ngram_range)
        self.clf = LinearSVC(multi_class="ovr")
        self.init_seed()
        self.word_tokenizer = ToktokTokenizer()
        self.type = t

    def init_seed(self):
        np.random.seed(self.seed)
        random.seed(self.seed)

    def convert_to_text(self, task):
        # Concatenate the task statement with all answer choices so the
        # vectorizer sees the full wording.
        text = self.word_tokenizer.tokenize(task['text'])
        if self.type in ["choice", "multiple_choice"]:
            choice_type = [t for t in task['question']['choices'][0].keys()
                           if t != 'id'][0]
            text.append(choice_type)
            for el in task['question']['choices']:
                text += self.word_tokenizer.tokenize(el[choice_type])
        text = ' '.join(text)
        return text

    def fit(self, tasks):
        texts = []
        classes = []
        for data in tasks:
            for task in data:
                if task['question']['type'] == self.type:
                    idx = int(task["id"])
                    # Tasks 17-20 share one format; collapse them into class 17.
                    if idx in range(17, 21):
                        idx = 17
                    texts.append(self.convert_to_text(task))
                    classes.append(idx)
        classes = np.array(classes)
        self.classes = np.unique(classes)
        if len(self.classes) > 1:
            vectors = self.vectorizer.fit_transform(texts)
            self.clf.fit(vectors, classes)
        return self

    def predict_one(self, task):
        if len(self.classes) == 1:
            return self.classes[0]
        text = self.convert_to_text(task)
        return int(self.clf.predict(self.vectorizer.transform([text])).ravel()[0])

    def fit_from_dir(self, dir_path):
        tasks = []
        for file_name in os.listdir(dir_path):
            if file_name.endswith(".json"):
                data = read_config(os.path.join(dir_path, file_name))
                tasks.append(data)
        tasks = [task for task in tasks if 'hint' not in task]
        return self.fit(tasks)

    def load(self, d):
        self.vectorizer = d['vec']
        self.clf = d['clf']
        self.classes = d['classes']

    def save(self):
        return {
            "vec": self.vectorizer,
            "clf": self.clf,
            "classes": self.classes
        }
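
# Usage sketch: fit the sub-classifier on every JSON task file in data/
# (directory layout assumed), then route a single task dict to its number.
if __name__ == "__main__":
    sub = SubSolver(t="text")
    sub.fit_from_dir("data/")
    # sub.predict_one(task) -> task number, e.g. 24
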
import random
import re

import pymorphy2
from nltk.tokenize.toktok import ToktokTokenizer


class Solver(object):
    def __init__(self, seed=42):
        self.is_train_task = False
        self.morph = pymorphy2.MorphAnalyzer()
        self.toktok = ToktokTokenizer()
        self.seed = seed
        self.init_seed()

    def init_seed(self):
        random.seed(self.seed)

    def lemmatize(self, text):
        return [self.morph.parse(word)[0].normal_form
                for word in self.toktok.tokenize(text.strip())]

    def predict(self, task):
        return self.predict_from_model(task)

    def get_word(self, text):
        # Pull the word quoted in «...», if any.
        try:
            return re.split('»', re.split('«', text)[1])[0]
        except IndexError:
            return ''

    def get_pos(self, text):
        # Collect every connector type the task statement mentions.
        pos = []
        lemmas = [l for l in self.lemmatize(text) if l != ' ']
        if 'сочинительный' in lemmas:
            pos.append("CCONJ")
        if 'подчинительный' in lemmas:
            pos.append("SCONJ")
        if 'наречие' in lemmas:
            pos.append("ADV")
        if 'союзный' in lemmas:
            pos.append("ADVPRO")
        if 'частица' in lemmas:
            pos.append("PART")
        if 'определительный' in lemmas:
            pos.append("OPRO")
        if 'личный' in lemmas:
            pos.append("LPRO")
        if 'указательный' in lemmas:
            pos.append("UPRO")
        return pos

    def sent_split(self, text):
        reg = r'\(*\n*\d+\n*\)'
        return re.split(reg, text)

    def get_num(self, text):
        # Parse a sentence number or an "N–M" range from the task wording.
        nums = 0
        res = re.search(r'\d+[–\-—]*\d*', text)
        if res:
            res = res[0]
            if re.search(r'[–\-—]', res):
                nums = re.split(r'[–\-—]', res)
                nums = list(range(int(nums[0]), int(nums[1]) + 1))
            else:
                nums = [int(res)]
        return nums

    def compare_text_with_variants(self, pos, text, nums=[]):
        indexes = []
        sents = self.sent_split(text)
        # Marker lemmas for each connector type.
        dic = {
            "CCONJ": ['но', 'а', 'и', 'да', 'тоже', 'также', 'зато', 'однако',
                      'же', 'или', 'либо'],
            "SCONJ": ['если', 'хотя', 'однако', 'когда', 'что', 'потомучто'],
            "ADV": ['сейчас', 'сегодня'],
            "ADVPRO": ['который', 'которая'],
            "OPRO": ['этот', 'это', 'эта', 'все', 'сам', 'самый', 'весь',
                     'всякий', 'каждый', 'любой', 'другой', 'иной', 'всяк',
                     'всяческий'],
            "LPRO": ['я', 'ты', 'он', 'она', 'оно', 'мы', 'вы', 'они'],
            "UPRO": ['этот', 'это', 'эта', 'все', 'тот', 'такой', 'таков',
                     'столько', 'сей', 'оный'],
            "PART": ['только', 'именно', 'не', 'ни', 'бы', 'лишь', 'пусть',
                     'дескать'],
        }
        if not pos:
            return [str(random.choice(nums))]
        for s in nums:
            lemmas = [l for l in self.lemmatize(sents[s - 1]) if l != ' ']
            conditions = 0
            for p in pos:
                if sum([v in lemmas for v in dic[p]]):
                    conditions += 1
            # A sentence qualifies only if every requested connector type is present.
            if conditions == len(pos):
                indexes.append(s)
        if not indexes:
            indexes = [random.choice(nums)]
        return [str(i) for i in sorted(indexes)]

    def eat_json(self, task):
        try:
            firstphrase, tasktext = re.split(r'\(\n*1\n*\)', task['text'])
        except ValueError:
            firstphrase, tasktext = (
                ' '.join(re.split(r'\(\n*1\n*\)', task['text'])[:-1]),
                re.split(r'\(\n*1\n*\)', task['text'])[-1])
        if 'Среди предложений' in tasktext:
            text, task = re.split('Среди предложений', tasktext)
            task = 'Среди предложений ' + task
        else:
            text, task = tasktext, firstphrase
        nums = self.get_num(task)
        pos = self.get_pos(task)
        return text, task, nums, pos

    def fit(self, tasks):
        pass

    def load(self, path='data/models/solver25.pkl'):
        pass

    def save(self, path='data/models/solver25.pkl'):
        pass

    def predict_from_model(self, task):
        text, task, nums, pos = self.eat_json(task)
        result = self.compare_text_with_variants(pos, text, nums=nums)
        return result
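
# Schematic task-25 item: a sentence linked to the previous one by a
# coordinating conjunction; the text is invented for illustration.
if __name__ == "__main__":
    task = {"text": "(1)Он ушёл. (2)Но вскоре вернулся. Среди предложений 1–2 "
                    "найдите такое, которое связано с предыдущим с помощью "
                    "сочинительного союза."}
    print(Solver().predict(task))  # e.g. ['2']
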
import random
import re
from string import punctuation

import nltk
import pymorphy2
from nltk import ngrams
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.metrics.pairwise import cosine_similarity

# Project-local BERT wrapper; the import path is an assumption.
from solvers.utils import BertEmbedder


class Solver(BertEmbedder):
    def __init__(self, seed=42):
        super(Solver, self).__init__()
        self.seed = seed
        self.init_seed()
        self.morph = pymorphy2.MorphAnalyzer()
        self.has_model = True
        self.toktok = ToktokTokenizer()
        self.mode = 1  # 1 - find wrong word, 2 - replace word

    def init_seed(self):
        return random.seed(self.seed)

    def predict_random(self, task_desc):
        """Random variant."""
        task_desc = re.sub(r"[^а-я0-9\-]", " ", task_desc)
        result = random.choice(task_desc.split())
        return result

    def exclude_word(self, task_sent):
        """Find the redundant word with BERT similarities."""
        tokens = self.toktok.tokenize(task_sent)
        to_tokens = []
        for token in tokens:
            parse_res = self.morph.parse(token)[0]
            # Skip function words and the verb "быть".
            if parse_res.tag.POS not in ["CONJ", "PREP", "PRCL", "INTJ", "PRED", "NPRO"]:
                if parse_res.normal_form != 'быть':
                    to_tokens.append((parse_res.word, parse_res.tag.POS))
        bigrams = list(ngrams(to_tokens, 2))
        results = []
        # Prefer adjective-noun pairs: a pleonasm is usually a modifier that
        # duplicates the meaning of its noun.
        for bigram in bigrams:
            if bigram[0] != bigram[1] and bigram[0][1] == 'ADJF' and bigram[1][1] == 'NOUN':
                b1 = self.sentence_embedding([bigram[0][0]])[0].reshape(1, -1)
                b2 = self.sentence_embedding([bigram[1][0]])[0].reshape(1, -1)
                sim = cosine_similarity(b1, b2)[0][0]
                # Keep both words' POS tags (the original repeated bigram[0][1]
                # twice, which broke the NOUN/NOUN check below).
                results.append((sim, bigram[0][0], bigram[1][0], bigram[0][1], bigram[1][1]))
        if not results:
            for bigram in bigrams:
                if bigram[0] != bigram[1]:
                    b1 = self.sentence_embedding([bigram[0][0]])[0].reshape(1, -1)
                    b2 = self.sentence_embedding([bigram[1][0]])[0].reshape(1, -1)
                    sim = cosine_similarity(b1, b2)[0][0]
                    results.append((sim, bigram[0][0], bigram[1][0], bigram[0][1], bigram[1][1]))
        results = sorted(results)
        # The most similar pair wins; drop its second word if both are nouns,
        # otherwise drop the first.
        final_pair = results[-1]
        if final_pair[-1] == 'NOUN' and final_pair[-2] == 'NOUN':
            return results[-1][2], tokens
        else:
            return results[-1][1], tokens

    def fit(self, tasks):
        pass

    def load(self, path="data/models/solver6.pkl"):
        pass

    def save(self, path="data/models/solver6.pkl"):
        pass

    def predict(self, task):
        if not self.has_model:
            return self.predict_random(task)
        else:
            return self.predict_from_model(task)

    def predict_from_model(self, task):
        description = task["text"]
        task_desc = ""
        if "заменив" in description:
            self.mode = 2
        else:
            self.mode = 1
        for par in description.split("\n"):
            for sentence in nltk.sent_tokenize(par):
                sentence = sentence.lower().rstrip(punctuation).replace('6.', "")
                # Drop the instruction sentence, keep only the text to edit.
                if re.match('.*(отредактируйте|выпишите|запишите|исправьте|исключите).*', sentence):
                    continue
                else:
                    task_desc += sentence
        result, tokens = self.exclude_word(task_desc)
        return result.strip(punctuation)
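
# Made-up pleonasm example ("в ноябре месяце"); needs the BERT weights used
# by BertEmbedder, so treat this as a sketch rather than a guaranteed output.
if __name__ == "__main__":
    task = {"text": "Отредактируйте предложение: исключите лишнее слово.\n"
                    "В ноябре месяце начались морозы."}
    print(Solver().predict(task))  # ideally "месяце"
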
import operator
import random
import re

import pymorphy2
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.metrics.pairwise import cosine_similarity

# Project-local BERT wrapper; the import path is an assumption.
from solvers.utils import BertEmbedder


class Solver(BertEmbedder):
    def __init__(self, seed=42):
        super(Solver, self).__init__()
        self.is_train_task = False
        self.morph = pymorphy2.MorphAnalyzer()
        self.toktok = ToktokTokenizer()
        self.seed = seed
        self.init_seed()

    def init_seed(self):
        random.seed(self.seed)

    def predict(self, task):
        return self.predict_from_model(task)

    def clean_text(self, text):
        newtext, logic = [], ["PREP", "CONJ", "Apro", "PRCL", "INFN", "VERB", "ADVB"]
        for token in self.toktok.tokenize(text):
            if any(tag in self.morph.parse(token)[0].tag for tag in logic):
                newtext.append(self.morph.parse(token)[0].normal_form)
        return ' '.join(newtext)

    def get_pos(self, text):
        pos, lemmas = 'word', [self.morph.parse(word)[0].normal_form
                               for word in self.toktok.tokenize(text)]
        if 'сочинительный' in lemmas:
            pos = "CCONJ"
        elif 'подчинительный' in lemmas:
            pos = "SCONJ"
        elif 'наречие' in lemmas:
            pos = "ADV"
        elif 'союзный' in lemmas:
            pos = "ADVPRO"
        elif 'местоимение' in lemmas:
            pos = "PRO"
        elif 'частица' in lemmas:
            pos = "PART"
        return pos

    def get_num(self, text):
        lemmas = [self.morph.parse(word)[0].normal_form
                  for word in self.toktok.tokenize(text)]
        if 'слово' in lemmas and 'предложение' in lemmas:
            d = {'один': 1, 'два': 2, 'три': 3, 'четыре': 4,
                 'первый': 1, 'второй': 2, 'третий': 3, 'четвертый': 4}
            for i in lemmas:
                if i in d:
                    return d[i]
        return 1

    def sent_split(self, text):
        reg = r'\(\n*\d+\n*\)'
        return re.split(reg, text)

    def compare_text_with_variants(self, query_word, text, variants):
        # Embed the query word in its sentence context (marked with "|"),
        # then rank the dictionary definitions by cosine similarity.
        sents = self.sent_split(text)
        text_vector = None
        for sent in sents:
            sent = re.sub('Прочитайте фрагмент.*', '', sent)
            words = self.toktok.tokenize(sent)
            lemmas = [self.morph.parse(word) for word in words]
            word_idx = None
            for idx in range(len(lemmas)):
                if query_word.lower() in [el.normal_form for el in lemmas[idx]]:
                    word_idx = idx
                    break
            if word_idx is not None:
                text = " ".join(words[:word_idx] + ['|', query_word.lower(), '|']
                                + words[word_idx + 1:])
                text_vector = self.contextual_word_embedding([text])[0]
                break
        else:
            # The query word was not found in any sentence; embed the whole text.
            text_vector = self.sentence_embedding([text])[0]
        pretext = query_word.lower() + ' - это '
        variants = [pretext + re.sub(r'\d+[.)]', '', variant) for variant in variants]
        variant_vectors = self.sentence_embedding(variants)
        predictions = {}
        for i, vec in enumerate(variant_vectors):
            predictions[i] = cosine_similarity(text_vector.reshape(1, -1),
                                               vec.reshape(1, -1)).flatten()[0]
        indexes = sorted(predictions.items(), key=operator.itemgetter(1),
                         reverse=True)[:1]
        return sorted([str(i[0] + 1) for i in indexes])

    def process_task(self, task):
        try:
            first_phrase, task_text = re.split(r'\(\n*1\n*\)', task['text'])
        except ValueError:
            first_phrase, task_text = (
                ' '.join(re.split(r'\(\n*1\n*\)', task['text'])[:-1]),
                re.split(r'\(\n*1\n*\)', task['text'])[-1])
        variants = [t['text'] for t in task['question']['choices']]
        text, task, word = "", "", ""
        if 'Определите' in task_text:
            text, task = re.split('Определите', task_text)
            task = 'Определите ' + task
            word = re.split(r'\.', re.split('значения слова ', text)[1])[0]
        elif 'Определите' in first_phrase:
            text, task = task_text, first_phrase
            word = re.split(r'\.', re.split('значения слова ', task)[1])[0]
        return text, task, variants, word

    def fit(self, tasks):
        pass

    def load(self, path="data/models/solver3.pkl"):
        pass

    def save(self, path='data/models/solver3.pkl'):
        pass

    def predict_from_model(self, task):
        text, task, variants, word = self.process_task(task)
        result = self.compare_text_with_variants(word, text, variants)
        return result
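
# Schematic task-3 item (pick the dictionary sense used in the text); the
# fragment and the choices are invented, and BERT weights must be available.
if __name__ == "__main__":
    task = {
        "text": "Определите значение, в котором это слово употреблено в тексте. "
                "Прочитайте фрагмент словарной статьи, в которой приводятся "
                "значения слова ДОМ. (1)Старый дом стоял на холме.",
        "question": {"choices": [{"text": "1) жилое здание"},
                                 {"text": "2) царствующий род, династия"}]},
    }
    print(Solver().predict(task))  # e.g. ['1']
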