def index(name=None):
    if request.args:
        story = request.args['joke']
        mystem = Mystem()
        gramm = mystem.analyze(story)
        characters = set()
        for i in gramm:
            # keep only animate lexemes ("од=" present, "неод=" absent)
            if (str(i).find("од=") != -1) and (str(i).find("неод=") == -1):
                s1 = str(i)[str(i).find("'lex': '") + 8:]
                characters.add(s1[:s1.find("'")])
        file = open("corp.txt", 'r', encoding="UTF-8")
        f = file.read()[1:].split('\n\n')
        file.close()
        file = open("ans.txt", 'w', encoding="UTF-8")
        for i in f:
            words = re.sub('[,\.\?\!\—\-\(\)\:\;]', '', i).lower().split(' ')
            if characters <= set(words):
                file.write(i + '\n\n')  # write directly; do not overwrite f with the return value
        file.close()
        with open("ans.txt", "r", encoding='utf-8') as f:
            content = f.read().split('\n\n')
        return render_template("index.html", content=content)
    return render_template('index.html')
def without_pronouns(directory):
    input_files = filter(lambda x: not x.endswith('~'), os.listdir(directory))
    output_data = {}
    m = Mystem()
    # iterate over the documents
    for input_file in input_files:
        with open(directory + '/' + input_file) as data_file:
            data = json.load(data_file)
        list_of_terms = filter(lambda x: x not in ('', ' ', '\n'), data['text'].split(' '))
        my_list = list_of_terms
        list_of_terms = []
        for term in my_list:
            analysis = m.analyze(term)[0].get(u'analysis')
            if analysis:
                # drop pronouns (SPRO, APRO); keep everything else
                if not analysis[0][u'gr'].startswith((u'SPRO', u'APRO')):
                    list_of_terms.append(term)
            else:
                list_of_terms.append(term)
        text_of_output = ' '.join(['%s' % term for term in list_of_terms])
        output_data[input_file] = {}
        output_data[input_file]['id'] = data['id']
        output_data[input_file]['positive'] = data['positive']
        output_data[input_file]['sarcasm'] = data['sarcasm']
        output_data[input_file]['text'] = text_of_output
        with open(directory + '/' + input_file, 'w') as output_file:
            json.dump(output_data[input_file], output_file)
def mystem_using_with_considering_of_multiple_letters(input_directory, output_directory): input_files = filter(lambda x: not x.endswith('~'), os.listdir(input_directory)) output_data = {} m = Mystem() #иду по документам for input_file in input_files: with open(input_directory + '/' + input_file) as data_file: data = json.load(data_file) list_of_terms = filter(lambda x: x != '', re.split(''' |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +''', data['text'])) my_list_of_terms = [] for term in list_of_terms: if term == m.lemmatize(term)[0]: my_term = term term = u'' prev_letter = my_term[0] term += my_term[0] for i in range(1, len(my_term)): if my_term[i] != prev_letter: term += my_term[i] prev_letter = my_term[i] my_list_of_terms.append(term) else: my_list_of_terms.append(term) list_of_terms = my_list_of_terms text = ' '.join(['%s' % term for term in list_of_terms]) list_of_terms = filter(lambda x: x not in ('', ' ', '\n'), m.lemmatize(text)) text_of_output = ' '.join(['%s' % term for term in list_of_terms]) output_data[input_file] = {} output_data[input_file]['id'] = data['id'] output_data[input_file]['positive'] = data['positive'] output_data[input_file]['sarcasm'] = data['sarcasm'] output_data[input_file]['text'] = text_of_output with open(output_directory + '/' + input_file, 'w') as output_file: json.dump(output_data[input_file], output_file)
def __init__(self, path, doc_id, limit):
    """
    :param doc_id: numerical id of a document, pass manually
    """
    self.text = open(path).read().lower().replace('\n', '.')
    # need a better regex
    self.sentences = [sentence for sentence in re.split(r'(?:[.]\s*){3}|[.?!]', self.text)
                      if sentence and len(sentence.split()) > 2]
    self.pos_data = []
    self.testing_data = []
    self.id = doc_id
    m = Mystem()
    counter = Counter(DEFAULTS)
    if not limit or limit > len(self.sentences):
        limit = len(self.sentences)
    for sentence in self.sentences[:limit]:
        # parse with mystem
        data = m.analyze(sentence)
        # get POS and count for each sentence
        pos = [word.get('analysis', None)[0]['gr'].split('(')[0].split(',')[0].split('=')[0]
               for word in data if word.get('analysis', None)]
        counter.update(pos)
        # append to dataset
        self.pos_data.append([counter[key] for key in sorted(counter)])
        # reset counter
        counter = Counter(DEFAULTS)
def __init__(self, path):
    self.text = open(path).read().lower()
    self.sentences = [sentence for sentence in re.split(r'(?:[.]\s*){3}|[.?!]', self.text)
                      if len(sentence) > 1]
    self.pos_data = []
    m = Mystem()
    counter = [0, 0, 0, 0, 0]
    for sentence in self.sentences:
        # parse with mystem
        # count adjectives A, nouns S, verbs V, adverbs ADV, pronouns PR
        data = m.analyze(sentence)
        for word in data:
            analysis = word.get('analysis', None)
            if analysis:
                best = analysis[0]
                gr = best['gr']
                if 'S' in gr:
                    counter[3] += 1
                elif 'ADV' in gr:
                    counter[1] += 1
                elif 'A' in gr:
                    counter[0] += 1
                elif 'V' in gr:
                    counter[4] += 1
                elif 'PR' in gr:
                    counter[2] += 1
        self.pos_data.append(counter)
        counter = [0, 0, 0, 0, 0]
    self.data = np.array(self.pos_data)
def extract(self): try: #вычисляем, сколько в директории лежит файлов input_files = filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory)) output_data = {} list_of_all_terms = {} m = Mystem() #иду по документам for file in input_files: with open(self.input_directory + '/' + file) as data_file: data = json.load(data_file) list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|<|>|\*|!|@|_ +""", data['text'])) text = " ".join(["%s" % term for term in list_of_terms]) list_of_terms = filter(lambda x: x != " ", m.lemmatize(text)) count_of_rows = 0 for i in range(0, len(list_of_terms)): if list_of_terms[i] == '\n' or list_of_terms[i] == ' \n': count_of_rows += 1 if list_of_terms[i] == ' \n': list_of_terms[i] = '\n' if count_of_rows < self.threshold_of_rows_count: first_list_of_terms = list_of_terms list_of_terms = [] for i in range(0, len(first_list_of_terms)): if first_list_of_terms[i] != '\n': list_of_terms.append(first_list_of_terms[i]) output_data[file] = {} output_data[file]['id'] = data['id'] output_data[file]['positive'] = data['positive'] output_data[file]['sarcasm'] = data['sarcasm'] output_data[file]['terms'] = {} #убираю повторяющиеся слова for term in list_of_terms: if term not in output_data[file]['terms']: output_data[file]['terms'][term] = 1 else: output_data[file]['terms'][term] += 1 for term in output_data[file]['terms'].keys(): if term not in list_of_all_terms: list_of_all_terms[term] = 1 else: list_of_all_terms[term] += 1 #подсчёт tf count_of_terms = output_data[file]['terms'][term] output_data[file]['terms'][term] = {'tf': float(count_of_terms)/len(list_of_terms), 'idf': 0, 'count': count_of_terms} for file in input_files: #подсчёт idf for term in output_data[file]['terms'].keys(): output_data[file]['terms'][term]['idf'] = math.log(float(len(input_files))/list_of_all_terms[term]) #запись результата with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file: json.dump(output_data[file], output_file) except Exception: return False else: return True
def lmtze(textfile): m = Mystem() text = open(textfile, encoding='utf-8').readlines() newfile = open(textfile.replace('txt', 'lem.txt'), 'w', encoding='utf-8') result_full = [] for line in text: try: element = etree.fromstring(line.strip('\n')) text_ = element.xpath('text()') entities = element.xpath('*') result = ['<sent>'] while text_: l = text_.pop(0) # open('temp.txt', 'w', encoding='utf-8').write(l) # subprocess.call(['C:\\Mystem\\mystem', 'i']) l = m.analyze(l) # print(l) for x in l: if x.get('analysis') is not None: if x.get('analysis') == []: result.append(x['text']) else: result.append(x['analysis'][0]['lex'] + '_' + x['analysis'][0]['gr'].split(',')[0].split('=')[0]) else: continue if text_: e = entities.pop(0) e_ = m.analyze(e.text) result.append('<' + e.tag + '>') for x in e_: if x.get('analysis') is not None: if x.get('analysis') == []: result.append(x['text']) else: result.append(x['analysis'][0]['lex']) else: continue result.append('</' + e.tag + '>') except Exception: continue result.append('</sent>') result_full.append(result) result = [] print(len(result_full), ' разобралось') for sent in result_full: prev = '' for x in sent: if '<' in x and '/' not in x: newfile.write(prev + x) prev = '' elif '_' in x or x.isalpha(): newfile.write(prev + x) prev = ' ' else: newfile.write(x) newfile.write('\n')
def extract(self): try: #вычисляем, сколько в директории лежит файлов input_files = filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory)) output_data = {} list_of_all_terms = {} m = Mystem() #иду по документам for file in input_files: with open(self.input_directory + '/' + file) as data_file: data = json.load(data_file) list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data['text'])) text = " ".join(["%s" % term for term in list_of_terms]) list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text)) my_list = list_of_terms list_of_terms = [] for term in my_list: if m.analyze(term)[0].get(u'analysis'): if not m.analyze(term)[0][u'analysis'][0][u'gr'].startswith(self.service_parts_of_speech) and len(term) > 1: list_of_terms.append(term) if term == u'не': list_of_terms.append(term) else: list_of_terms.append(term) output_data[file] = {} output_data[file]['id'] = data['id'] output_data[file]['positive'] = data['positive'] output_data[file]['sarcasm'] = data['sarcasm'] output_data[file]['terms'] = {} #убираю повторяющиеся слова for term in list_of_terms: if term not in output_data[file]['terms']: output_data[file]['terms'][term] = 1 else: output_data[file]['terms'][term] += 1 for term in output_data[file]['terms'].keys(): if term not in list_of_all_terms: list_of_all_terms[term] = 1 else: list_of_all_terms[term] += 1 #подсчёт tf count_of_terms = output_data[file]['terms'][term] output_data[file]['terms'][term] = {'tf': float(count_of_terms)/len(list_of_terms), 'idf': 0, 'count': count_of_terms} for file in input_files: #подсчёт idf for term in output_data[file]['terms'].keys(): output_data[file]['terms'][term]['idf'] = math.log(float(len(input_files))/list_of_all_terms[term]) #запись результата with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file: json.dump(output_data[file], output_file) except Exception: return False else: return True
def extract(self): try: #вычисляем, сколько в директории лежит файлов input_files = filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory)) output_data = {} list_of_all_n_grams = {} m = Mystem() #иду по документам for file in input_files: with open(self.input_directory + '/' + file) as data_file: data = json.load(data_file) list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data['text'])) text = " ".join(["%s" % term for term in list_of_terms]) list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text)) list_of_n_grams_tuples = {} for j in range(0, self.n): list_of_n_grams_tuples[j] = zip(*[list_of_terms[i:] for i in range(j + 1)]) list_of_n_grams_strings = [] for j in range(0, self.n): for gram_tuple in list_of_n_grams_tuples[j]: string_of_n_gram = " ".join(["%s" % term for term in gram_tuple]) list_of_n_grams_strings.append(string_of_n_gram) output_data[file] = {} output_data[file]['id'] = data['id'] output_data[file]['positive'] = data['positive'] output_data[file]['sarcasm'] = data['sarcasm'] output_data[file]['terms'] = {} #убираю повторяющиеся слова for gram in list_of_n_grams_strings: if gram not in output_data[file]['terms']: output_data[file]['terms'][gram] = 1 else: output_data[file]['terms'][gram] += 1 for gram in output_data[file]['terms'].keys(): if gram not in list_of_all_n_grams: list_of_all_n_grams[gram] = 1 else: list_of_all_n_grams[gram] += 1 #подсчёт tf count_of_n_grams = output_data[file]['terms'][gram] output_data[file]['terms'][gram] = {'tf': float(count_of_n_grams)/len(list_of_n_grams_strings), 'idf': 0, 'count': float(count_of_n_grams)} for file in input_files: #подсчёт idf for gram in output_data[file]['terms'].keys(): output_data[file]['terms'][gram]['idf'] = math.log(float(len(input_files))/list_of_all_n_grams[gram]) #запись результата with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file: json.dump(output_data[file], output_file) except Exception: return False else: return True
def preprocess_corpus(id_data, text_data, lemmatization): print("Preprocessing the text corpus ...") reg = re.compile('[^a-z^A-Z^0-9^А-я^\s*]') descriptions = [] item_ids = [] rows_with_nan_desc = [] id_data = id_data.tolist() if lemmatization == True: mystem = Mystem() print("Lemmatization: " + str(lemmatization)) for i, descrption in tqdm(enumerate(text_data)): if lemmatization == True: temp_line = preprocess_line(descrption, reg, mystem) else: temp_line = preprocess_line(descrption, reg) if len(temp_line) > 0: descriptions.append(temp_line) item_ids.append(int(id_data[i])) else: rows_with_nan_desc.append(i) d = {} d['itemID'] = item_ids d['descriptions'] = descriptions d = pd.DataFrame(data=d) print("Number of description that are empty after preprocessing: " + str(len(rows_with_nan_desc))) return d, rows_with_nan_desc
def preprocess_sent(text):
    mystem = Mystem()
    russian_stopwords = stopwords.words("russian")
    letter = re.compile(r'[А-Яа-я]+')
    tokens = mystem.lemmatize(text.lower())
    tokens = [
        token for token in tokens
        if letter.findall(token) and letter.findall(token)[0] == token
        and token not in russian_stopwords
    ]
    text = " ".join(tokens)
    vectors = new_preproc.preproc_texts([text])
    return vectors
def df_from_preproc():
    """
    Preprocessing data and creation of a DataFrame for the LDA model
    """
    result = get_data_gp()
    reviews_list = []
    date = []
    for i in range(len(result)):
        if result[i]["content"]:
            reviews_list.append(result[i]["content"])
            date.append(result[i]["at"])
    df = pd.DataFrame(data={"date": date, "text": reviews_list})
    df.date = pd.to_datetime(df.date).dt.normalize()
    try:
        mystem = Mystem()
    except FileExistsError:
        print("Directory exists")
    df["text_preproc"] = df.text.apply(preprocess_text_rus, mystem=mystem)
    df = df[df["text_preproc"].apply(len) > 2].reset_index(drop=True)
    return df
class Lemmatizer(BaseProcessor):
    def __init__(self):
        self.m = Mystem()

    def transform(self, tokens, *args):
        lemm_str = " ".join(tokens)
        return list(filter(lambda s: s.strip(), self.m.lemmatize(lemm_str)))
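# Hedged usage sketch for the Lemmatizer above, assuming this module already imports
# Mystem and defines BaseProcessor; the sample tokens are illustrative only.
if __name__ == "__main__":
    lemmatizer = Lemmatizer()
    # joins the tokens, lemmatizes the string with Mystem, and drops whitespace-only pieces
    print(lemmatizer.transform(["кошки", "бежали", "домой"]))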
class HHParser: def __init__(self) -> None: self.mystem = Mystem() self.term_extractor = rutermextract.TermExtractor() self.russian_stopwords = stopwords.words("russian") with open( os.path.dirname(os.path.realpath(__file__)) + '/models.json', 'rb') as file: self.models = dict(json.load(file)) nltk.download("stopwords") def preprocess_text(self, text: str, word_limit: int): tokens = self.mystem.lemmatize(text.lower()) tokens = [token.split(" ") for token in tokens] tokens = np.concatenate(tokens) tokens = [token.strip() for token in tokens if token not in self.russian_stopwords \ and token != " " \ and token.strip() not in punctuation] text = " ".join(tokens) terms = self.term_extractor(text, limit=word_limit, strings=True) return terms def answer_questions(self, uid: str, questions: List[str]): answers = {} for question in questions: question_terms = self.preprocess_text(question, 2) answer = parsehh(uid, question_terms=question_terms) if answer is not None and answer is not {}: answers[question] = answer return answers
def init_model():
    """Init Word2Vec model."""
    logging.info("Loading model '%s' ..." % config.MODEL_NAME)
    global model
    model = gensim.downloader.load(config.MODEL_NAME)
    logging.info("Model is loaded.")
    global topics
    topics = utils.load_topics()
    logging.info("Topics: %s." % topics)
    global tags_model
    standard_library.install_aliases()
    # Conversion table from Mystem part-of-speech tags to UPoS tags:
    mapping_url = 'https://raw.githubusercontent.com/akutuzov/universal-pos-tags/4653e8a9154e93fe2f417c7fdb7a357b7d6ce333/ru-rnc.map'
    global tag_mapping
    mystem2upos = {}
    r = requests.get(mapping_url, stream=True)
    for pair in r.text.split('\n'):
        pair = pair.split()
        if len(pair) > 1:
            mystem2upos[pair[0]] = pair[1]
    tag_mapping = mystem2upos
    logging.info('Loading the tags model ...')
    tags_model = Mystem()
def text_analysis(texts): token_texts, part_texts = [], [] token_text, part_text = [], [] token_sent, part_sent = [], [] tokens = Mystem().analyze(texts) tokens = list(filter(lambda t: t != {"text": " "} and t != {"text": "-"}, tokens))[:-1] for token in tokens: if token['text'] == 'sent': token_text.append(token_sent) part_text.append(part_sent) token_sent, part_sent = [], [] else: if token['text'] == 'stop': token_text.append(token_sent) part_text.append(part_sent) token_texts.append(token_text) part_texts.append(part_text) token_sent, part_sent = [], [] token_text, part_text = [], [] else: try: if token['analysis'][0]['lex'] not in stopwords.words("russian"): token_sent.append(token['analysis'][0]['lex']) part_sent.append(re.split(r'[,=]', token['analysis'][0]['gr'])[0]) except IndexError: token_sent.append(token['text']) part_sent.append('DL') except KeyError: pass return token_texts, part_texts
def get_mystem():
    # return a free Mystem instance from the pool; create a new one only if all are in use
    for mys in mystems:
        if not mys['in_use']:
            return mys['mystem']
    new_mys = Mystem()
    return new_mys
class NewsTextDataset: def __init__(self): self.data = [] self.unique_ids = [] self.mystem = Mystem() self.russian_stopwords = stopwords.words("russian") def append(self, article: Article): if article.article_id not in self.unique_ids: self.unique_ids.append(article.article_id) self.data.append(article) return True else: return False def save(self, path): with open(path, "w") as fp: data = { "catalog": [ob.__dict__ for ob in self.data], } json.dump( data, fp, sort_keys=True, indent=4, ensure_ascii=False, ) def load(self, path): with open(path) as json_file: data = json.load(json_file) self.data = [Article(dict_object=obj) for obj in data["catalog"]] def preprocess(self): for idx, article in tqdm(enumerate(self.data)): # r"[a-zA-Z]|\$|\d*|\(|\)|/@" pattern = r"[^а-яА-Я\s]" text = re.sub(pattern, "", article.text) tokens = self.mystem.lemmatize(text.lower()) tokens = [ token for token in tokens if token not in self.russian_stopwords and token != " " and token.strip() not in punctuation and ad.is_cyrillic(token) ] article.tokenized_text = tokens self.update(article, idx) def dump_to_pandas(self): return pd.DataFrame.from_records( [article.to_dict() for article in self.data], ) def __len__(self): return len(self.data) def update(self, article, idx): self.data[idx].tokenized_text = article.tokenized_text # Useless for now def __getitem__(self, idx): return self.data[idx]
def process_mystem(words, lang):
    m = Mystem()
    analysis = m.analyze(words)
    with open(lang + '_processed.txt', 'w', encoding='utf-8') as file:
        for elem in analysis:
            if elem['text'] != ' ' and elem['text'] != '\n':
                try:
                    token = elem['text']
                    lemma = elem['analysis'][0]['lex']
                    pos_tag = elem['analysis'][0]['gr'].split(',')[0].split('=')[0]
                    info = '%s\t%s\t%s\n' % (token, lemma, pos_tag)
                    file.write(info)
                except:
                    pass
class Word2vecProcessor(object): """Объект для работы с моделью word2vec сходства слов""" def __init__(self, w2v_model_file): self.mystem = Mystem() self.word2vec = KeyedVectors.load_word2vec_format(w2v_model_file, binary=True) self.lemma2word = { word.split('_')[0]: word for word in self.word2vec.index2word } def word_vector(self, word): lemma = self.mystem.lemmatize(word)[0] word = self.lemma2word.get(lemma) return self.word2vec[word] if word in self.word2vec else None def text_vector(self, text): """Вектор текста, получается путем усреднения векторов всех слов в тексте""" word_vectors = [ self.word_vector(token) for token in word_tokenize(text.lower()) if token.isalpha() ] word_vectors = [vec for vec in word_vectors if vec is not None] return np.mean(word_vectors, axis=0) def distance(self, vec1, vec2): if vec1 is None or vec2 is None: return 2 return cosine(vec1, vec2)
class ActionDocs(Action): def __init__(self): self.m = Mystem() self.countries = json.load(open(file, "r")) def name(self) -> Text: return "action_get_docs" def run(self, dispatcher: CollectingDispatcher, tracker: Tracker, domain: Dict[Text, Any]) -> List[Dict[Text, Any]]: input_country = tracker.get_slot('country') lemmas = self.m.lemmatize(input_country) country = lemmas[0].capitalize() found = False for i in self.countries: if country == i["country"] or input_country == i["country"].lower( ): if input_country == i["country"]: country = i["country"] found = True dispatcher.utter_message(text=f"{i['documents']}") break if not found: dispatcher.utter_message( text=f"Я не знаю такую страну '{country}'") return []
def preprocessing(path):
    stem = Mystem()
    stop = set(stopwords.words("russian"))
    stop.update([
        '.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',
        '#', '№', '*', '_', '\n'
    ])

    def preprocess_text(input_text):
        param = re.sub('[^a-zA-Zа-яА-Я]', ' ', input_text)
        param = param.lower()  # str.lower() returns a new string, so the result must be reassigned
        param = stem.lemmatize(param)
        param = [
            token for token in param
            if token not in stop and token != " " and token.strip() not in punctuation
        ]
        input_text = " ".join(param)
        input_text = ' '.join(word for word in input_text.split() if len(word) > 3)
        return input_text

    html_report_part1 = open(path, 'r')
    soup = BeautifulSoup(html_report_part1, 'html.parser')
    return preprocess_text(soup.get_text())
def pos_bi(text):
    pos_tags = []
    m = Mystem()
    sents = sent_tokenize(text)
    for sent in sents:
        sent_an = []
        analy = m.analyze(sent)
        for x in analy:
            try:
                if 'analysis' in x.keys():
                    tag = x['analysis'][0]['gr']
                    sent_an.append(re.sub(r'[=|,].*', '', tag).lower())
            except IndexError:
                pass
        pos_tags.append(sent_an)
    return pos_tags  # return the collected tags, not the function object itself
class ActionInZone(Action): def __init__(self): self.countries = json.load(open(file, "r")) self.m = Mystem() self.schengens = [ "Австрия", "Бельгия", "Чешская Республика", "Дания", "Эстония", "Финляндия", "Франция", "Германия", "Греция", "Венгрия", "Исландия", "Италия", "Латвия", "Литва", "Люксембург", "Мальта", "Голландия", "Норвегия", "Польша", "Португалия", "Словакия", "Словения", "Испания", "Швеция", "Швейцария", "Лихтенштейн" ] def name(self) -> Text: return "is_schengen_zone" def run(self, dispatcher: CollectingDispatcher, tracker: Tracker, domain: Dict[Text, Any]) -> List[Dict[Text, Any]]: country = tracker.get_slot('country') lemmas = self.m.lemmatize(country) country = lemmas[0].capitalize() if country in self.schengens: dispatcher.utter_message(text=f"{country} член шенгенской зоны") else: dispatcher.utter_message( text=f"{country} не входит в шенгенскую зону") return []
def text_preprocessing(text_col: pd.Series, stopwords: list) -> pd.Series:
    """
    Preprocess product titles and text descriptions.

    Args:
        text_col: Column with the text data.
        stopwords: List of stop words.
    """
    lemmatize_func = Mystem().lemmatize
    pattern = r'\b(?:{})\b'.format('|'.join(stopwords))
    text = (
        text_col.str.lower()
        .str.replace(r'<[^>]+>|[^a-zа-яё0-9]', ' ')
        .str.replace(r'(\s)', ' ')
        .str.strip()  # strip leading and trailing spaces
        # .apply(lambda x: lemmatize_func(x) if isinstance(x, str) else None)  # lemmatization
        # .apply(lambda x: ' '.join(x) if isinstance(x, list) else None)  # join the lemmatized words
        .str.replace(pattern, ' ')  # remove stop words
        .str.replace(r'\b(\w)\b', '')  # remove one-letter words
        .str.replace(r'\s+', ' ')  # collapse any run of whitespace into one space
        .str.strip()  # strip leading and trailing spaces
    )
    return text
def __init__(self, language='english', maxsents=0, lemma=False, cstlemma_dir=None):
    self.filepaths = None
    self.sentence_list = None
    self.language = language
    self.m = Mystem()
    self.maxsents = maxsents
    self.count = 0
    self.word_token_count = 0
    self.lemma = lemma
    self.cstlemma_dir = cstlemma_dir
    if not self.cstlemma_dir:
        self.cstlemma_dir = w2vconfig.cstlemma_dir
class Lemmatizer:
    def __init__(self, stop_words=None):
        self.stemmer = Mystem()
        self.cache = dict()  # MyCache(maxsize=1000000)
        stop_words = stop_words if stop_words is not None else []
        self.stop_words = set(stop_words + [' ', '\n', '\r\n', '\t'])

    def lemmatize_word(self, word):
        res = self.cache.get(word, None)
        if res is not None:
            return res
        lm = self.stemmer.lemmatize(word)
        lm = [w for w in lm if w not in self.stop_words]
        if len(lm) == 0:
            return None
        lemmatized_word = max(lm, key=lambda x: len(x))
        self.cache[word] = lemmatized_word
        return lemmatized_word

    def fit_transform(self, words):
        if len(words) == 0:
            return []
        res = [self.lemmatize_word(w) for w in words]
        res = [w for w in res if w is not None]
        return res
class MyStemWrapper:
    def __init__(self, join_string: str = ''):
        self._join_string = join_string
        self._stemmer = Mystem()

    def filter_string(self, s: str):
        return self._join_string.join(self._stemmer.lemmatize(s)[:-1])
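# Hedged usage sketch for MyStemWrapper, assuming pymystem3's Mystem is imported in this
# module; the sample sentence is illustrative only. Note that Mystem.lemmatize() appends a
# trailing "\n" token, which filter_string drops via the [:-1] slice.
if __name__ == "__main__":
    wrapper = MyStemWrapper(join_string=" ")
    print(wrapper.filter_string("Мама мыла раму"))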
class TextsLematizer():
    def __init__(self):
        self.m = Mystem()

    # text preprocessing function
    def text_hangling(self, text: str):
        try:
            txt = re.sub('[^a-zа-я\d]', ' ', text.lower())
            txt = re.sub('\s+', ' ', txt)
            # any other kind of processing, including variational, can be plugged in here
            return txt
        except:
            return ""

    # lemmatize a single text
    def text_lemmatize(self, text: str):
        try:
            lemm_txt = self.m.lemmatize(text)
            lemm_txt = [w for w in lemm_txt if w not in [' ', '\n']]
            return lemm_txt
        except:
            return ['']

    # lemmatize a list of texts
    def texts_lemmatize(self, texts_list):
        return [
            self.text_lemmatize(self.text_hangling(tx)) for tx in texts_list
        ]
def load_data(data_dir='news', parts=('train', 'test')): """ Loads data from specified directory. Returns dictionary part->(list of texts, list of corresponding labels). """ part2xy = { } # tuple(list of texts, list of their labels) for train and test parts myStem = Mystem() for part in parts: print('Loading %s set ' % part) xpath = os.path.join(data_dir, '%s.texts' % part) with codecs.open(xpath, 'r', encoding='utf-8') as inp: wholeText = inp.read().strip() texts = lemmatize(myStem, wholeText).split('\n') ypath = os.path.join(data_dir, '%s.labels' % part) if os.path.exists(ypath): with codecs.open(ypath, 'r', encoding='utf-8') as inp: labels = [s.strip() for s in inp.readlines()] assert len(labels) == len( texts), 'Number of labels and texts differ in %s set!' % part for cls in set(labels): print(cls, sum((1 for l in labels if l == cls))) else: labels = None print('unlabeled', len(texts)) part2xy[part] = (texts, labels) return part2xy
def prep_lemmatize(self, text): """ Eng: =============================================================================== :param text: Text for preprocessing; :return: Preprocessed text with all lemmatized words. Lemmatize all words with WordNet Lemmatizer. =============================================================================== Ru: =============================================================================== :param text: Текст для предобработки; :return: Обработанный текст, в котором каждое слово подвергнулось лемматизации. Лемматизирует все слова с помощью WordNet лемматизатора. =============================================================================== """ if isinstance(text, str): if self.lang == "ru": return "".join(Mystem().lemmatize(text)) return " ".join( [WordNetLemmatizer().lemmatize(word) for word in text.split()]) else: raise TypeError("Argument must be str!")
def lemmatize(s):
    global m
    try:
        return ''.join(m.lemmatize(s)).strip()
    except BrokenPipeError as ex:
        # the mystem subprocess died: restart it and retry
        m = Mystem()
        return lemmatize(s)
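# Hedged setup sketch for the lemmatize() helper above: it relies on a module-level
# Mystem instance named m, so something like the following (an assumption, not part of
# the original snippet) must run at import time.
from pymystem3 import Mystem

m = Mystem()

if __name__ == "__main__":
    print(lemmatize("Чайники закипели"))  # prints the lemmatized form of the input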
def _lemmatize(self, text):
    from pymystem3 import Mystem
    mystem = Mystem()
    russian_stopwords = stopwords.words("russian")
    tokens = mystem.lemmatize(text.lower())
    tokens = [
        token for token in tokens
        if token not in russian_stopwords and token != " "
        and token.strip() not in punctuation
    ]
    text = " ".join(tokens)
    text = re.sub(" +", " ", text)
    return text
class Lemmatisation(object): def __init__(self): self.ru_lem = Mystem() self.en_lem = nltk.stem.WordNetLemmatizer() self.ru_stop_words = set( nltk.corpus.stopwords.words('russian') + [chr(i) for i in range(ord('а'), ord('я') + 1)]) self.en_stop_words = set( nltk.corpus.stopwords.words('english') + [chr(i) for i in range(ord('a'), ord('z') + 1)]) def visible(self, term): if re.search( NOT_DIGIT_OR_LETTER, term ) or term in self.ru_stop_words or term in self.en_stop_words: return False return True def _lemmatize(self, doc): lemmas = self.ru_lem.lemmatize(doc) lemmas = [ self.en_lem.lemmatize(lemma) for lemma in lemmas if self.visible(lemma) ] return ' '.join(lemmas) def lemmatize(self, doc_id, doc): try: return self._lemmatize(doc) except Exception as e: print(doc_id, e)
def mystem_analyze(str):
    global m
    try:
        return m.analyze(str)
    except BrokenPipeError as ex:
        m = Mystem()
        return mystem_analyze(str)
def calc_query_score(self, query): self.result_query[query] = {} result_docs = set(range(len(self.articles))) substr_list = set() query_1 = ''.join(Mystem().lemmatize(query)).strip() term = list() for word in query_1.split(): if word[0] == '-': substr_list.update(set(self.word_map.get(word[1:], list()))) elif self.word_map.get(word, None) is not None: term.append(word) result_docs.intersection_update(set(self.word_map.get(word))) else: result_docs = set() break result_docs.difference_update(substr_list) self.result_query[query] = {} for doc_num in result_docs: self.result_query[query][doc_num] = {} self.result_query[query][doc_num]['score'] = 0 self.result_query[query][doc_num]['score_full'] = 0 self.result_query[query][doc_num]['score_title'] = 0 self.result_query[query][doc_num]['score_annotate'] = 0 for word in term: self.add_tf_idf(query, word, doc_num, 'tf_idf', 'score') self.add_tf_idf(query, word, doc_num, 'tf_idf_full', 'score_full') self.add_tf_idf(query, word, doc_num, 'tf_idf_title', 'score_title') self.add_tf_idf(query, word, doc_num, 'tf_idf_annotate', 'score_annotate')
class Tokenizer: def __init__(self): self.space_pattern = re.compile(r'[^.А-ЯA-ZЁ]+', re.I) self.m = Mystem() try: with open('nw_model/stopwords.txt') as f: self.stop_words = set(f.read().split('\n')) | {''} except FileNotFoundError: self.stop_words = set() print( f'{Fore.RED}WARNING!!! Stop-words file not found!{Style.RESET_ALL}' ) def tokenize_line(self, line): """ Токенизирует одну строку :param line: :return: набор лексем (pymysteam) """ try: return [ word for word in self.m.lemmatize( self.space_pattern.sub(' ', line.lower())) if word.strip() not in self.stop_words ] except BrokenPipeError: self.m = Mystem() return self.tokenize_line(line) def join(self, lst): return self.space_pattern.sub(' ', ' '.join(lst))
def get_okapi(query): """ Returns Okapi BM25 score for every document given word in corpus :param query: :return: """ m = Mystem() query = text_to_list(query, m) # list of lemmas with open('result_1.json', 'r', encoding='utf-8') as f: f = f.read() data = json.loads(f) total_score = defaultdict(int) for word in query: try: all_info = (data[word]) n = (len(all_info)) for article in all_info: fq = article['freq'] dl = get_dl(article['doc_name']) score = score_BM25(n=n, fq=fq, dl=dl) total_score[article['doc_name']] += score except KeyError: pass result = sorted(total_score.items(), key=lambda x: x[1], reverse=True)[:10] return result
def preprocess(word):
    stem = Mystem()
    stop = set(stopwords.words("russian"))
    stop.update([
        '.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',
        '#', '№', '*', '_', '\n'
    ])
    param = re.sub('[^a-zA-Zа-яА-Я]', ' ', word)
    param = param.lower()  # reassign: str.lower() does not modify the string in place
    param = stem.lemmatize(param)
    param = [
        token for token in param
        if token not in stop and token != " " and token.strip() not in punctuation
    ]
    word = " ".join(param)
    word = ' '.join(word for word in word.split() if len(word) > 3)
    return word
def with_not(directory): input_files = filter(lambda x: not x.endswith('~'), os.listdir(directory)) output_data = {} m = Mystem() #иду по документам for input_file in input_files: with open(directory + '/' + input_file) as data_file: data = json.load(data_file) list_of_terms = filter(lambda x: x not in ('', ' ', '\n'), data['text'].split(' ')) # обработка не + (слово) nums_of_bigrams = [] helping_words = [u'совсем', u'очень', u'слишком', u'самый'] for i in range(0, len(list_of_terms)): if list_of_terms[i] == u'не' and list_of_terms[i+1] not in helping_words: if m.analyze(list_of_terms[i+1])[0].get(u'analysis'): if not m.analyze(list_of_terms[i+1])[0][u'analysis'][0][u'gr'].startswith(u'S,'): nums_of_bigrams.append((i, i+1)) elif list_of_terms[i] == u'не' and list_of_terms[i+1] in helping_words: if m.analyze(list_of_terms[i+2])[0].get(u'analysis'): if not m.analyze(list_of_terms[i+2])[0][u'analysis'][0][u'gr'].startswith(u'S,'): nums_of_bigrams.append((i, i+2)) for i in range(0, len(nums_of_bigrams)): if nums_of_bigrams[i][0] + 1 == nums_of_bigrams[i][1]: list_of_terms[nums_of_bigrams[i][0]] = list_of_terms[nums_of_bigrams[i][0]] + list_of_terms[nums_of_bigrams[i][1]] list_of_terms[nums_of_bigrams[i][1]] = '' elif nums_of_bigrams[i][0] + 2 == nums_of_bigrams[i][1]: list_of_terms[nums_of_bigrams[i][0]] = list_of_terms[nums_of_bigrams[i][0]] + list_of_terms[nums_of_bigrams[i][1]] list_of_terms[nums_of_bigrams[i][1] - 1] = '' list_of_terms[nums_of_bigrams[i][1]] = '' list_of_terms = filter(lambda x: x != '', list_of_terms) text_of_output = ' '.join(['%s' % term for term in list_of_terms]) output_data[input_file] = {} output_data[input_file]['id'] = data['id'] output_data[input_file]['positive'] = data['positive'] output_data[input_file]['sarcasm'] = data['sarcasm'] output_data[input_file]['text'] = text_of_output with open(directory + '/' + input_file, 'w') as output_file: json.dump(output_data[input_file], output_file)
class Runner(object): def __init__(self, input_text): self.lemmatize = None while True: response = raw_input("Do you want to lemmatize text first? (yes/no)\n").lower() if response == "yes": print "You should wait for a while" self.lemmatize = True self.stemmer = Mystem() break elif response == "no": self.lemmatize = False break self.word_lists = list() with open(input_text, "r") as f: for line in f: line += "." if self.lemmatize: lexemes = self.stemmer.lemmatize(line) word_list = list() # список слов, неразделенных знаками пунктуации for lexeme in lexemes: lexeme = lexeme.strip() if lexeme: if lexeme.translate(None, '.,?!:;()"\' -\t\n'): # проверка, что лексема не является знаком пунктуации lexeme = lexeme.decode("utf-8") if is_cyrillic(lexeme): word_list.append(lexeme) else: # иначе, добавить биграмы из списка и завести новый пустой список self.word_lists.append(word_list) word_list = list() else: line = line.replace(".", " . ").replace(",", " , ").replace(":", " : ").replace(";", " ; ")\ .replace("?", " ? ").replace("!", " ! ").replace("(", " ( ").replace(")", " ) ")\ .replace("--", " -- ").replace(".", " . ") word_list = list() for lexeme in line.split(): # проверка, что лексема не является знаком пунктуации lexeme = lexeme.translate(None, '.,?!:;()"\'').replace("--", "").decode("utf-8").strip().lower() if lexeme: if is_cyrillic(lexeme): word_list.append(lexeme) else: if word_list: self.word_lists.append(word_list) word_list = list() train, test = self.split() self.lid = Lid(train, test) self.lid.run() def split(self): n = len(self.word_lists) train = self.word_lists[:n*9/10] test = self.word_lists[n*9/10:] return train, test
def mystem_using(input_directory, output_directory):
    input_files = filter(lambda x: not x.endswith('~'), os.listdir(input_directory))
    output_data = {}
    m = Mystem()
    for input_file in input_files:
        with open(input_directory + '/' + input_file) as data_file:
            data = json.load(data_file)
        list_of_terms = filter(lambda x: x != '',
                               re.split(''' |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +''', data['text']))
        text = " ".join(["%s" % term for term in list_of_terms])
        list_of_terms = filter(lambda x: x not in ('', ' ', '\n'), m.lemmatize(text))
        text_of_output = ' '.join(['%s' % term for term in list_of_terms])
        output_data[input_file] = {}
        output_data[input_file]['id'] = data['id']
        output_data[input_file]['positive'] = data['positive']
        output_data[input_file]['sarcasm'] = data['sarcasm']
        output_data[input_file]['text'] = text_of_output
        with open(output_directory + '/' + input_file, 'w') as output_file:
            json.dump(output_data[input_file], output_file)
class Index(object): def __init__(self, input_file): self.stemmer = Mystem() self.documents = dict() self.tokens = list() self.terms = dict() self.index = list() # reading documents, making tokenization with open(input_file, "r") as f: for i, line in enumerate(f, start=1): self.documents[i] = line.decode("utf-8") for word in self.stemmer.lemmatize(line): token = word.translate(None, '.,?!:;()"\'-').decode("utf-8").strip() if token: self.tokens.append((token, i)) # sorting by tokens first, then by frequency self.tokens.sort(key=lambda tup: (tup[0], tup[1])) # terminization and building index current_term = self.tokens[0][0] current_doc_id = self.tokens[0][1] doc_ids = [current_doc_id] for token, doc_id in self.tokens: term = token.lower() if term == current_term: if doc_id != current_doc_id: doc_ids.append(doc_id) current_doc_id = doc_id else: self.terms[current_term] = (len(doc_ids), doc_ids) self.index.append((current_term, len(doc_ids), doc_ids)) current_term = term current_doc_id = doc_id doc_ids = [doc_id] self.terms[current_term] = (len(doc_ids), doc_ids) self.index.append((current_term, len(doc_ids), doc_ids)) def print_to_file(self): with open("result.txt", "w") as f: for term, count, doc_ids in self.index: f.write("{},\t{},\t{}\n".format(term.encode("utf-8"), count, doc_ids)) def print_statistics(self): terms_num = len(self.terms) terms_len = 0. for term in self.terms: terms_len += len(term) print "***********************" print "Number of terms = {}".format(terms_num) print "Average term length = {}".format(terms_len / terms_num) print "***********************"
def search(): cn = None file = codecs.open('static/articles.xml', 'r', 'utf-8') rfile = file.read() tree = lxml.etree.fromstring(rfile) res = tree.xpath('entry') categ = { 'cat': 'Категория', 'wgroup': 'Группа слов с близким значением', 'comm': 'Комментарии', 'stdiff': 'Стилистические различия', 'overlap': 'Совпадающая часть значения', 'dom': 'Доминанта группы', 'diffmark': 'Различительные признаки, релевантные для данной группы', 'diff': 'Смысловые различия', 'rare': 'Редкие слова, примыкающие к группе', 'anmean': 'Другие значения слов, входящих в группу', 'comb': 'Сочетаемость', 'reg': 'Региональные варианты', 'adict': 'Данные академических словарей', 'doc': 'Нормативные документы', 'etim': 'Этимология', 'ill': 'Иллюстрации' } file.close() ms = Mystem() wordsearch = ms.lemmatize(request.form['search'].lower())[0] for i in res: if wordsearch == '': cn = 'Пустой запрос' elif i.text.lower().startswith(wordsearch): arr = [] for j in i.iter(): for k in dict.keys(categ): if j.tag == k: if j.text != 'null': arr.append('<font size="4"><b>' + str(categ[j.tag]) + '</b></font><br>' + str(j.text)) text = '<br><br>'.join([j for j in arr[1:]]) text = re.sub('\*', '<b>', text) text = re.sub('\#', '</b>', text) text = re.sub('\$', '<i>', text) text = re.sub('\%', '</i>', text) text = re.sub('\@', '<font color="#696969">', text) text = re.sub('\+', '</font>', text) cn = '<strong><big>' + i.text + '</big></strong><br><br>' + re.sub('\n', '<br>', text) break else: cn = 'По Вашему запросу ничего не найдено. <br>' \ 'Попробуйте использовать "Поиск по тегу" или измените запрос.' return render_template('search.html', cn=Markup(cn))
def build_pos(self):
    m = Mystem()
    counter = Counter(DEFAULTS)
    for doc in self.documents:
        # parse with mystem
        data = m.analyze(doc.text)
        # get POS and count for each sentence
        pos = [word.get('analysis', None)[0]['gr'].split('(')[0].split(',')[0].split('=')[0]
               for word in data if word.get('analysis', None)]
        counter.update(pos)
        # append to dataset
        self.pos_data.append([counter[key] for key in sorted(counter)])
        # reset counter
        counter = Counter(DEFAULTS)
def produce_lemmas(connection, tableName, outputTableName):
    mystem = Mystem()
    cursor = connection.cursor()
    inserter = connection.cursor()
    query = 'DELETE FROM `%s`' % outputTableName
    inserter.execute(query)
    connection.commit()
    query = 'SELECT * FROM `%s`' % tableName
    cursor.execute(query)
    query = 'INSERT INTO `' + outputTableName + '` (`' + tableName + '_id`, `word_class_id`, `lex`, `gr`)' \
            'SELECT %i, `id`, "%s", "%s" FROM `word_classes` WHERE `abbr`="%s"'
    for id, concept, scheme in cursor:
        lemmas = mystem.analyze(concept)
        for lemma in lemmas:
            for analysis in lemma.get('analysis', []):
                inserter.execute(query % prepare_content(id, analysis))
    connection.commit()
    cursor.close()
class Index(object): def __init__(self, input_file): self.stemmer = Mystem() self.tokens = list() self.index = dict() self.number_of_documents = 0 try: self.read_from_file_compressed("index_compressed.txt") except: # reading documents, making tokenization with open(input_file, "r") as f: for line in f: self.number_of_documents += 1 # self.documents[i] = line.decode("utf-8") for word in self.stemmer.lemmatize(line): token = word.translate(None, '.,?!:;()"\'-').decode("utf-8").strip() if token: self.tokens.append((token, self.number_of_documents)) # sorting by tokens first, then by frequency self.tokens.sort(key=lambda tup: (tup[0], tup[1])) # terminization and building index current_term = self.tokens[0][0] current_doc_id = self.tokens[0][1] doc_ids = [current_doc_id] for token, doc_id in self.tokens: term = token.lower() if term == current_term: if doc_id != current_doc_id: doc_ids.append(doc_id) current_doc_id = doc_id else: self.index[current_term] = (len(doc_ids), pack_doc_ids(doc_ids)) current_term = term current_doc_id = doc_id doc_ids = [doc_id] self.index[current_term] = (len(doc_ids), pack_doc_ids(doc_ids)) del self.tokens self.write_index_in_file() def write_index_in_file(self): with open("index_compressed.txt", "w") as f: pickle.dump(self.index, f) def read_from_file_compressed(self, index_file): with open(index_file, "r") as f: self.index = pickle.load(f)
def fill_mystem(): from pymystem3 import Mystem m = Mystem() for sentence in get_sentences(1): lemmas = m.analyze(sentence.source) items = list() for lemma in lemmas: text = lemma['text'] analysis = lemma.get('analysis') if not analysis: text = text.strip() if not len(text): print 'spaces = "%s"' % text continue if ' ' in text: for item in re.split('\s+', text): items.append("%s %s ?" % (item, item)) print 'several =', "|".join(re.split('\s+', text)) continue print 'delimiter = "%s"' % text items.append("%s %s ?" % (text, text)) continue if not len(text.strip()): raise Exception('Impossible') if ' ' in text: raise Exception('Impossible') lexemes = list() for lexeme in analysis: print 'lex=', lexeme.get('lex', '-') print 'gr=', lexeme.get('gr', '-') lexemes.append("%s %s" % (lexeme['lex'], lexeme['gr'])) items.append("%s %s" % (text, ' '.join(lexemes))) sentence.mystem = '\n'.join(items) sentence.save()
def __init__(self, input_file): self.stemmer = Mystem() self.documents = dict() self.tokens = list() self.terms = dict() self.index = list() # reading documents, making tokenization with open(input_file, "r") as f: for i, line in enumerate(f, start=1): self.documents[i] = line.decode("utf-8") for word in self.stemmer.lemmatize(line): token = word.translate(None, '.,?!:;()"\'-').decode("utf-8").strip() if token: self.tokens.append((token, i)) # sorting by tokens first, then by frequency self.tokens.sort(key=lambda tup: (tup[0], tup[1])) # terminization and building index current_term = self.tokens[0][0] current_doc_id = self.tokens[0][1] doc_ids = [current_doc_id] for token, doc_id in self.tokens: term = token.lower() if term == current_term: if doc_id != current_doc_id: doc_ids.append(doc_id) current_doc_id = doc_id else: self.terms[current_term] = (len(doc_ids), doc_ids) self.index.append((current_term, len(doc_ids), doc_ids)) current_term = term current_doc_id = doc_id doc_ids = [doc_id] self.terms[current_term] = (len(doc_ids), doc_ids) self.index.append((current_term, len(doc_ids), doc_ids))
import json import requests import pandas from pymystem3 import Mystem API_KEY = "api_key" if __name__ == "__main__": not_translated = [] dictionary = {} print(len(dictionary.keys())) m = Mystem() df = pandas.read_csv("/media/alexander/b32bf4b4-8724-4107-9d19-abf6615c2f60/alexander/HELP_FILE/query.yaHotelId.showInTop.sure.final.tsv", sep="\t") df_size = len(df["query"]) k = 1 for line in df["query"]: print(k, "query from", df_size) k += 1 for word in line.strip().split(): lema_word = m.lemmatize(word)[0] if dictionary.get(lema_word) is None: params = {"key": API_KEY, "text": lema_word, "lang": "ru-en"} try: r = requests.get("https://translate.yandex.net/api/v1.5/tr.json/translate", params=params) r_json = r.json() trans_word = r_json["text"][0] if r_json["code"] != 200: print("ERROR", r_json["code"]) not_translated.append(lema_word) continue
from pymystem3 import Mystem

m = Mystem()
t = 'Чайника, сегодня не было'
lemma = m.lemmatize(t)


def lemmas(text):
    punc = list('.?!-;:",')
    text = [i for i in text if i not in punc]
    text = ''.join(text)
    text = m.lemmatize(text)
    textn = ''
    for w in text:
        # the original `if w is not ' ' or '\n'` was always true; drop only newline
        # tokens so the spaces between lemmas are preserved for the vectorizer below
        if w != '\n':
            textn += w
    return textn


from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import os

s_w = stopwords.words('russian')
sw = [i for i in s_w]
v = TfidfVectorizer(stop_words=sw)  # remove stop words
# v = TfidfVectorizer()  # keep stop words
totalCorpus = []
suspenseCorpus = ''
class MystemOCTagger(object): def __init__(self): self.mystem_inst = Mystem() def run_and_convert(self, input_file, output_file, strict_match = False): f_in = open(input_file, 'rb') f_out = open(output_file, 'w+') context = etree.iterparse(f_in, tag='sentence') for event, sentence_elem in context: sentence = sentence_elem.find('source') analyzed = self.analyze_sentence(sentence.text) tokens_tree = sentence_elem.find('tokens') tokens = self.extract_tokens(tokens_tree) matched = self.match_analyzed_tokens(tokens, analyzed, strict_match) result = self.analyzed_to_csv_list(matched) for s in result: f_out.write(s+'\n') sentence_elem.clear() def analyze_sentence(self, sentence): return self.mystem_inst.analyze(sentence) # builds word-index mapping, indices sorted in order of appearance def extract_tokens(self, tokens_tree): tokens_dict = {} for t in tokens_tree.iter('token'): idx = t.get('id') token = t.get('text') token = strip_word(token) if (len(token) > 0): if token in tokens_dict: tokens_dict.get(token).append(idx) else: tokens_dict[token] = [idx] return tokens_dict # matches analysis with original tokens indices def match_analyzed_tokens(self, tokens_index, analyzed, strict_match = False): analysis_indexed = {} unindexed = [] for t in analyzed: t_text = t.get('text') t_text = strip_word(t_text) if len(t_text) > 0: if t_text in tokens_index: idx = tokens_index.get(t_text).pop(0) if (len(tokens_index.get(t_text)) == 0): tokens_index.pop(t_text) analysis_indexed[idx] = t.get('analysis') else: unindexed.append(t) if (not strict_match): analysis_not_strict = {} if len(tokens_index) > 0: analysis_not_strict = self.match_not_strict(tokens_index, unindexed) analysis_indexed.update(analysis_not_strict) not_analyzed = [] if len(tokens_index) > 0: for t in tokens_index: not_analyzed.append(t) # if len(not_analyzed) > 0: # f_unindexed = open('mismatch.txt', 'a+') # f_unindexed.write('oc ') # f_unindexed.write(str(not_analyzed)+' ') # # if len(unindexed) > 0: # f_unindexed = open('mismatch.txt', 'a+') # for u in unindexed: # f_unindexed.write(' ') # f_unindexed.write(str(u.get('text'))) # f_unindexed.write('\n') return analysis_indexed def match_not_strict(self, tokens_index, analyzed): analysis_indexed = {} for t_indexed, idx_list in tokens_index.items(): for idx in idx_list: for i in range(0, len(analyzed)): t_analyzed = analyzed[i] if t_indexed.endswith(t_analyzed.get('text')): analysis_indexed[idx] = t_analyzed.get('analysis') #print(t_analyzed.get('text')+' '+t_indexed) analyzed.pop(i) idx_list.remove(idx) break idx_copy = tokens_index.copy() for t, i in idx_copy.items(): if len(i) == 0: del tokens_index[t] return analysis_indexed def analyzed_to_csv_list(self, analyzed): out = [] for idx, analysis in sorted(analyzed.items()): if analysis and len(analysis) > 0: #do we need only grammar? s = str(idx) + ', ' + str(analysis[0].get('gr')) out.append(s) return out
def __init__(self):
    self.mystem_inst = Mystem()
def poehali(csv_input): ''' Основная функция csv_input -- файл с таблицей ссылок На выходе |-xmlFile/ |---------year/ |--------------month/ ========= |-plain/ |-------year/ |------------month/ ========= |-html/ |------year/ |-----------month/ |csv_file.csv ''' data = [] i = 0 m = Mystem() gusina() col = ["path", "author", "sex", "birthday", "header", "created", "sphere", "genre_fi", "type", "topic", "chronotop", "style", "audience_age", "audience_level", "audience_size", "source", "publication", "publisher", "publ_year", "medium", "country", "region", "language"] time.sleep(3) path = os.getcwd() path = path + "/" csv_file = open(path + "csv_file.csv", "w") writer = csv.writer(csv_file,delimiter = ",") writer.writerow(col) dosugvbryanske = re.compile("^(http://www.briansk.ru/)(.+)") with open(csv_input) as csvfile: reader = csv.DictReader(csvfile) for row in reader: if re.search(dosugvbryanske, row['url']): print('passing on ' + str(i)) test = urllib.request.urlopen(row['url']).read().decode('cp1251') file_html = path+"/"+str(i)+".html" file_html1 = path+"/"+str(i-1)+".html" dest_html = str(i)+".html" plain = str(i)+".txt" plain_new = str(i)+"_plained.txt" plain_stem = str(i)+"_mystem.txt" output_plain_stem = str(i)+"_out_mystem.txt" xmlFile = str(i) + ".xml" #dir_for_stem = "XML_STEM" page1_html = open(file_html, 'w') page1_html.write(str(test)) page1_html.close() print("FILE EX: "+ str(os.path.exists(file_html))) pageMoving = open(file_html, 'r') #print(file_html + " PATH " + dest_html+"\n") if os.path.exists(file_html1): os.remove(file_html1) print("FILE "+str(i-1)+" HB REMOVED") else: print("FILE "+str(i-1)+" HB ALREADY MOVED") for line in pageMoving: data = re.search(r"\">[0-9]{1,2}\s{1}((янв|февр|март|апре|май|июнь|июль|авг|сентя|октяб|нояб|декаб)[а-я]{1,}\s[0-9]{4})|\">[0-9]{1,2}\s{1}(ма(а|я)\s[0-9]{4})", line) if data: ''' Определение датирования статьи ''' dates = data.group() dates2 = dates.split() year = dates2[2] month = dates2[1] create_folder(path, year, transpose_month(month), "html") shutil.move(file_html, path+"html/"+year+"/"+transpose_month(month)+"/"+dest_html) print("FILE "+str(i)+" HB MOVED") ''' Созидание директории для XML ''' create_folder(path, year, transpose_month(month), "xmlFile") forxml = path+"xmlFile/"+year+"/"+transpose_month(month)+"/"+dest_html forxml_dir = path+"xmlFile/"+year+"/"+transpose_month(month)+"/" xml_stem = forxml_dir + str(i) + "_mystem.xml" rofxml = path+"xmlFile/"+year+"/"+transpose_month(month)+"/"+xmlFile ''' Копирование html -> xmldir для дальнейшей обработки ''' shutil.copy(path+"html/"+year+"/"+transpose_month(month)+"/"+dest_html, forxml) print("FILE "+str(i)+" HB COPIED TO XML") openindosug_xml = open(forxml, "r") read_and_clean_xml = openindosug_xml.read() xml_data = amixml(read_and_clean_xml) #print(xml_data[2]) openindosug_xml.close() ''' Созидание директории для plain текста ''' create_folder(path, year, transpose_month(month), "plain") forplain = path+"plain/"+year+"/"+transpose_month(month)+"/"+dest_html forplain_dir = path+"plain/"+year+"/"+transpose_month(month)+"/" shutil.copy(path+"html/"+year+"/"+transpose_month(month)+"/"+dest_html, forplain) print("FILE "+str(i)+" HB COPIED TO PLAIN") openindosug = open(forplain, "r") dates = re.sub("\">", "", dates) ''' wri = лист для генерации ИНФО о статьи ''' wri = ["briansk.ru", str(xml_data[1]), toddmmyyy(dates), "", row['url']] page2_txt = open(str(forplain_dir)+str(plain), 'w') for datline in openindosug: page2_txt.write(str(make_it_clean(datline))) page2_txt.close() print("PLAIN 
FOR "+str(i)+" HB CREATED") ''' Окончательная очистка plain файла; оставляем только текст статьи или текст + ИНФО ''' provide_clean_file(forplain_dir+str(plain),forplain_dir+str(plain_new), wri, "extra") provide_clean_file(forplain_dir+str(plain),forplain_dir+str(plain_stem), wri, "mystem") os.remove(forplain_dir+str(plain)) os.remove(forplain) openindosug.close() ''' xml_data[0] -- content xml_data[1] -- headerTag xml_data[2] -- content date ''' ''' Генерация XML ''' pageEtree = etree.Element('html') doc = etree.ElementTree(pageEtree) infoTag = etree.SubElement(pageEtree, "body") dateTag = etree.SubElement(infoTag, "h1") dateTag.text = str(xml_data[2]) headerTag = etree.SubElement(infoTag, "h2") headerTag.text = str(xml_data[1]) mainTag = etree.SubElement(infoTag, "h3") contentTag = etree.SubElement(infoTag, "h4") contentTag.text = str(xml_data[0]) outFile = open(str(forxml_dir)+str(i)+".xml", 'wb') doc.write(outFile, xml_declaration=True, encoding='utf-16') outFile.close() print("FILE "+str(i)+" HB CODED TO XML") writer.writerow([str(path+"html/"+year+"/"+transpose_month(month)+"/"+dest_html) , "briansk.ru" , "" , "" , str(xml_data[1]) , toddmmyyy(dates), 'публицистика' , "" , "" , "категория" , "" , "нейтральный" , "н-возраст" , "н-уровень" , "городская" , str(row['url']) , "брянск.ru" , "" , str(year) , "газета" , "Россия" , "БРЯНСК" , "ru"]) os.remove(forxml) input_plain = forplain_dir + plain_stem output_plain = forplain_dir + output_plain_stem ''' pystem mystem ''' with open(input_plain) as file: text = file.read() lemmas = m.lemmatize(text) with open(input_plain, 'w') as file: file.write(''.join(lemmas)) os.system(r'/home/haniani/Загрузки/mystem -icd '+ input_plain + ' ' + output_plain) os.system(r'/home/haniani/Загрузки/mystem -icd --format xml '+ input_plain +' '+ xml_stem) print("MYSTEM'ed "+str(i)) break i += 1 print("PASSED ; NEXT: "+str(i)+"\n") csv_file.close() for file in glob.glob(path+"*.html"): os.remove(file)
# coding:utf-8
"""
Script normalizing sentences from sentences.txt and saving ones to
"""
import re
from pymystem3 import Mystem

normalizer = Mystem()
PREFIX = "mp_"
with open("../data/" + PREFIX + "norm_sentences.txt", "w") as writer:
    count = 0
    raw = []
    normalized = []
    for line in open("../data/" + PREFIX + "parsed.txt"):
        if count % 1000 == 0:
            print count
        line = re.sub("[\W\d]+", " ", line.strip().decode("utf-8").strip(), 0, re.UNICODE)
        line = re.sub("\s+", " ", line.strip(), 0, re.UNICODE).lower()
        raw.extend(line.split(" "))
        writer.write("* " + line.encode("utf-8") + " **;")
        # print line, '->',
        line = " ".join(normalizer.lemmatize(line))
        line = re.sub("\s+", " ", line, 0, re.UNICODE)
        lemmatized = filter(lambda x: len(x.strip()) > 0, normalizer.lemmatize(line))
        normalized.extend(lemmatized)
def extract(self): try: #вычисляем, сколько в директории лежит файлов input_files = filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory)) output_data = {} list_of_all_terms = {} m = Mystem() #иду по документам for file in input_files: with open(self.input_directory + '/' + file) as data_file: data = json.load(data_file) list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data['text'])) text = " ".join(["%s" % term for term in list_of_terms]) list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text)) # обработка не + (слово) nums_of_bigrams = [] helping_words = [u'совсем', u'очень', u'слишком', u'самый'] for i in range(0, len(list_of_terms)): if list_of_terms[i] == u'не' and list_of_terms[i+1] not in helping_words: nums_of_bigrams.append((i, i+1)) elif list_of_terms == u'не' and list_of_terms[i+1] in helping_words: nums_of_bigrams.append((i, i+2)) for i in range(0, len(nums_of_bigrams)): if nums_of_bigrams[i][0] + 1 == nums_of_bigrams[i][1]: list_of_terms[nums_of_bigrams[i][0]] = list_of_terms[nums_of_bigrams[i][0]] + ' ' + list_of_terms[nums_of_bigrams[i][1]] list_of_terms[nums_of_bigrams[i][1]] = '' elif nums_of_bigrams[i][0] + 2 == nums_of_bigrams[i][1]: list_of_terms[nums_of_bigrams[i][0]] = list_of_terms[nums_of_bigrams[i][0]] + ' ' + list_of_terms[nums_of_bigrams[i][1]] list_of_terms[nums_of_bigrams[i][1] - 1] = '' list_of_terms[nums_of_bigrams[i][1]] = '' list_of_terms = filter(lambda x: x != '', list_of_terms) output_data[file] = {} output_data[file]['id'] = data['id'] output_data[file]['positive'] = data['positive'] output_data[file]['sarcasm'] = data['sarcasm'] output_data[file]['terms'] = {} #убираю повторяющиеся слова for term in list_of_terms: if term not in output_data[file]['terms']: output_data[file]['terms'][term] = 1 else: output_data[file]['terms'][term] += 1 for term in output_data[file]['terms'].keys(): if term not in list_of_all_terms: list_of_all_terms[term] = 1 else: list_of_all_terms[term] += 1 #подсчёт tf count_of_terms = output_data[file]['terms'][term] output_data[file]['terms'][term] = {'tf': float(count_of_terms)/len(list_of_terms), 'idf': 0, 'count': count_of_terms} for file in input_files: #подсчёт idf for term in output_data[file]['terms'].keys(): output_data[file]['terms'][term]['idf'] = math.log(float(len(input_files))/list_of_all_terms[term]) #запись результата with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file: json.dump(output_data[file], output_file) except Exception: return False else: return True
def lemma(text):
    m = Mystem()
    lemmas = m.lemmatize(text)
    titleStemmed = ''.join(lemmas)
    return titleStemmed
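# Hedged usage note for lemma() above: it constructs a new Mystem instance on every call,
# which is slow for large corpora; reusing a single instance is usually preferable
# (an assumption-level suggestion, not part of the original snippet).
if __name__ == "__main__":
    print(lemma("Стеклянные чайники закипели"))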
def extract(self): try: # вычисляем, сколько в директории лежит файлов input_files = filter(lambda x: not x.endswith("~"), os.listdir(self.input_directory)) output_data = {} list_of_all_terms = {} m = Mystem() # иду по документам for file in input_files: with open(self.input_directory + "/" + file) as data_file: data = json.load(data_file) list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data["text"])) text = " ".join(["%s" % term for term in list_of_terms]) list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text)) my_list_of_terms = [] for term in list_of_terms: my_term = term term = u"" prev_letter = my_term[0] term += my_term[0] for i in range(1, len(my_term)): if my_term[i] != prev_letter: term += my_term[i] prev_letter = my_term[i] my_list_of_terms.append(term) list_of_terms = my_list_of_terms output_data[file] = {} output_data[file]["id"] = data["id"] output_data[file]["positive"] = data["positive"] output_data[file]["sarcasm"] = data["sarcasm"] output_data[file]["terms"] = {} # убираю повторяющиеся слова for term in list_of_terms: if term not in output_data[file]["terms"]: output_data[file]["terms"][term] = 1 else: output_data[file]["terms"][term] += 1 for term in output_data[file]["terms"].keys(): if term not in list_of_all_terms: list_of_all_terms[term] = 1 else: list_of_all_terms[term] += 1 # подсчёт tf count_of_terms = output_data[file]["terms"][term] output_data[file]["terms"][term] = { "tf": float(count_of_terms) / len(list_of_terms), "idf": 0, "count": count_of_terms, } for file in input_files: # подсчёт idf for term in output_data[file]["terms"].keys(): output_data[file]["terms"][term]["idf"] = math.log( float(len(input_files)) / list_of_all_terms[term] ) # запись результата with open(self.output_directory + "/" + file + "_tf-idf", "w") as output_file: json.dump(output_data[file], output_file) except Exception: return False else: return True
from pymystem3 import Mystem
import logging
import re

logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(message)s',
                    handlers=[logging.StreamHandler()])

logging.info("Loading mystem")
m = Mystem()
logging.info("Loaded mystem")


def parse_gr(gr):
    options = re.search('\(([^\)]*)\)', gr, re.IGNORECASE)
    if options:
        title = options.group(1)
        for stuff in title.split('|'):
            yield gr.replace("(" + title + ")", stuff)
    else:
        yield gr


lines = set([])
with open("data/test.txt", "r") as input_file:
    logging.info("file opened")
    for line in input_file:
        for w in m.analyze(line):
import os, json, dicttoxml from pymystem3 import Mystem m = Mystem() top = 'C:\\Users\\John\\Desktop\\py_files\\питон\\korpus\\no_marks' for root, dirs, files in os.walk(top): for name in files: loc = os.path.join(root, name) loc_list = loc.split('\\') #creates list in order to remove path content new_root = loc.replace('\\no_marks\\{0}\\{1}\\{2}'.format(loc_list[8], loc_list[9], loc_list[10]), '') #removes path ending dir_marks = os.path.join(new_root + '\\marks\\{0}\\{1}'.format(loc_list[8], loc_list[9])) #adds new path ending for json.docs dir_xml = os.path.join(new_root + '\\xml\\{0}\\{1}'.format(loc_list[8], loc_list[9])) #adds new path ending for xml docs new_name = name.replace('.txt', '') if not os.path.exists(dir_marks): #makes nesessary dirs if not present os.makedirs(dir_marks) if not os.path.exists(dir_xml): os.makedirs(dir_xml) with open(loc, "r", encoding = 'utf-8') as doc: text_doc = doc.read() lines = doc.readlines() info = json.dumps(m.analyze(text_doc), ensure_ascii = False) #creates text file with gram and lem info with open("{0}\\{1}.json".format(dir_marks, new_name), 'w', encoding = 'utf-8') as doc_marks: doc_marks.write(info) xml = dicttoxml.dicttoxml(info).decode('utf-8') #converts json to xml with open("{0}\\{1}.xml".format(dir_xml, new_name), 'w', encoding = 'utf-8') as doc_xml: doc_xml.write(xml)