class Text:
    def __init__(self):
        self.stops = self.stopsget()
        self.mystem = Mystem(mystem_bin=None, grammar_info=False, disambiguation=False)
        self.stops_to_nil = [
            re.compile(st) for st in ['[0-9]+', '[.!?"\-,:—%*();»«]+']
        ]

    def stopsget(self):
        with open('finstops.txt') as f:
            stops = [
                re.compile(u'(\s|^){}(\s)'.format(line.strip()))
                for line in f.readlines()
            ]
        return stops

    def normalize(self, text):
        for stop_nil in self.stops_to_nil:
            text = re.sub(stop_nil, '', text)
        for stop in self.stops:
            text = re.sub(stop, '\\1\\2', text.lower())
        text = re.sub(' +', ' ', text)
        text = re.sub('\n ', '\n', text)
        tr = []
        for word in text.split():
            lemm = self.mystem.lemmatize(word)[0]
            tr.append(lemm)
        text = u' '.join(tr)
        return text

    def lemmat(self, line):
        res = [self.mystem.lemmatize(word.lower())[0] for word in line.split()]
        return res
def pars(self, result, lemmataize=True):
    '''
    Parse an extraction result into a dict where keys are entity tags and
    values are lists of word lists, one list per detected entity.

    lemmataize -- flag to lemmatize the text with Mystem by Yandex
    input: extraction result as (word, tag) pairs with BIO tags
    output: dictionary {tag: [[words], [words]]}
    '''
    if lemmataize:
        m = Mystem()
    d = {}
    s = []
    for word, tag in result:
        if tag == 'O':
            if len(s) != 0:
                if key_tag in d.keys():
                    d[key_tag].append(s)
                else:
                    d[key_tag] = [s]
                s = []
            else:
                continue
        elif tag[0] == 'B':
            key_tag = tag[2:]
            s = []
            if lemmataize and key_tag != 'ORG':
                word = m.lemmatize(word)[0]
            s.append(word)
        elif tag[0] == 'I':
            if lemmataize and key_tag != 'ORG':
                word = m.lemmatize(word)[0]
            s.append(word)
    return d
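# A minimal usage sketch for pars() above, not from the original source: the
# (word, tag) pairs, the `extractor` object, and the expected dict are
# illustrative; actual lemmas depend on Mystem's output.
example_result = [
    ("Иван", "B-PER"), ("Иванов", "I-PER"),
    ("работает", "O"), ("в", "O"),
    ("Яндекс", "B-ORG"), (".", "O"),
]
# extractor.pars(example_result) would return something like:
# {'PER': [['иван', 'иванов']], 'ORG': [['Яндекс']]}
# (ORG tokens are kept as-is because the code skips lemmatization for 'ORG').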
def mystem_using_with_considering_of_multiple_letters(input_directory, output_directory):
    input_files = filter(lambda x: not x.endswith('~'), os.listdir(input_directory))
    output_data = {}
    m = Mystem()
    # iterate over the documents
    for input_file in input_files:
        with open(input_directory + '/' + input_file) as data_file:
            data = json.load(data_file)
            list_of_terms = filter(lambda x: x != '',
                                   re.split(''' |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +''', data['text']))
            my_list_of_terms = []
            for term in list_of_terms:
                if term == m.lemmatize(term)[0]:
                    # term equals its own lemma: collapse runs of repeated letters
                    my_term = term
                    term = u''
                    prev_letter = my_term[0]
                    term += my_term[0]
                    for i in range(1, len(my_term)):
                        if my_term[i] != prev_letter:
                            term += my_term[i]
                            prev_letter = my_term[i]
                    my_list_of_terms.append(term)
                else:
                    my_list_of_terms.append(term)
            list_of_terms = my_list_of_terms
            text = ' '.join(['%s' % term for term in list_of_terms])
            list_of_terms = filter(lambda x: x not in ('', ' ', '\n'), m.lemmatize(text))
            text_of_output = ' '.join(['%s' % term for term in list_of_terms])
            output_data[input_file] = {}
            output_data[input_file]['id'] = data['id']
            output_data[input_file]['positive'] = data['positive']
            output_data[input_file]['sarcasm'] = data['sarcasm']
            output_data[input_file]['text'] = text_of_output
            with open(output_directory + '/' + input_file, 'w') as output_file:
                json.dump(output_data[input_file], output_file)
def lambdaFunc(self, node, A, B): # A - is array # B - is array # item - is object m = Mystem() if (node["parent"] == "null"): return False nodeName = ''.join(m.lemmatize(str(node["name"]))) parentName = ''.join(m.lemmatize(str(node["parent"]))) lemA = list(map(lambda x: m.lemmatize(str(x))[0], A)) lemB = list(map(lambda x: m.lemmatize(str(x))[0], B)) AInNodeName = True AInParentName = True BInNodeName = True BInParentName = True for lem in lemA: if lem.upper() not in nodeName.upper(): AInNodeName = False for lem in lemA: if lem.upper() not in parentName.upper(): AInParentName = False for lem in lemB: if lem.upper() not in nodeName.upper(): BInNodeName = False for lem in lemB: if lem.upper() not in parentName.upper(): BInParentName = False if AInNodeName is True and BInParentName is True or BInNodeName is True and AInParentName is True: return True else: return False
def preprocess_text(text):
    mystem = Mystem()
    # lemmatize the stopword list itself so lemmatized tokens can be matched against it
    rs = ''
    for x in stopwords.words('russian'):
        rs += x + " "
    rs = mystem.lemmatize(rs)
    russian_stopwords = list(set(
        stopwords.words("russian") + rs +
        ["который", "это", "сказать", "/ТАСС/", "тыс", "млн", "млрд",
         "президент", "весь", "год", "“", "”, - ", " “", "сообщать", ') - ',
         "”", "мочь", "также", "” ", "время", ""]))
    mystem = Mystem()
    tokens = mystem.lemmatize(text.lower())
    tokens = [token for token in tokens
              if (token not in russian_stopwords)
              and (token != " ")
              and (token.strip() not in punctuation)]
    return tokens
def tokeniz(self, df): #########COMMENTS##################### for i in range(len(df)): df["comment"][i] = list(df["comment"][i][2:-2].replace( "'", '').split(',')) tw = TweetTokenizer() det = TreebankWordDetokenizer() for i in (range(len(df))): for j in range(len(df["comment"][i])): tokenized_example = (tw.tokenize(df["comment"][i][j])) filtered_example = [ word for word in tokenized_example if not word in self.sum_noise ] df["comment"][i][j] = det.detokenize(filtered_example) mystem_analyzer = Mystem(entire_input=False) for i in (range(len(df))): df["comment"][i] = [ mystem_analyzer.lemmatize(w) for w in df["comment"][i] ] df["comment"][i] = list(filter(None, df["comment"][i])) for i in range(len(df)): for j in range(len(df['comment'][i])): df['comment'][i][j] = [ word for word in df['comment'][i][j] if not word in self.sum_noise ] ##########POSTS############## for i in (range(len(df))): tokenized_example = (tw.tokenize(df["post"][i])) filtered_example = [ word for word in tokenized_example if not word in self.sum_noise ] df["post"][i] = det.detokenize(filtered_example) for i in (range(len(df))): a = [] a.append(df['post'][i]) df["post"][i] = a for i in (range(len(df))): df["post"][i] = [ mystem_analyzer.lemmatize(w) for w in df["post"][i] ][0] for i in range(len(df)): df['post'][i] = [ word for word in df['post'][i] if not word in self.sum_noise ] return df
def _get_russian(soup, word = None): russian = None if(soup.find(class_='t_inline_en')): if(word is not None): yandex_url = "https://translate.yandex.net/api/v1.5/tr.json/translate?lang=en-ru&format=plain&key=trnsl.1.1.20181026T095610Z.0f9e5b3c50d78498.83dff75a74e7d95e0712640c87b207295ef8842a&text=" + word.replace(' ','%20') yandex_url_to = "https://translate.yandex.net/api/v1.5/tr.json/translate?lang=en-ru&format=plain&key=trnsl.1.1.20181026T095610Z.0f9e5b3c50d78498.83dff75a74e7d95e0712640c87b207295ef8842a&text=" +'to%20' + word.replace(' ','%20') yandex_translate = urllib.request.urlopen(yandex_url).read() yandex_translate_to = urllib.request.urlopen(yandex_url_to).read() yd = json.loads(yandex_translate.decode("utf-8"))['text'][0].replace('чтобы','',1).strip().replace('себе','',1).strip() yd_to = json.loads(yandex_translate_to.decode("utf-8"))['text'][0] russian = soup.find(class_='t_inline_en').text.replace('\u2002',' ').replace(' ',' ').strip() mystem = Mystem() lemmas = mystem.lemmatize(yd) ws = russian.split(',') b = False for idx in range(1, 3): for w in ws: if((not w.find(yd[:-idx]) == -1 or not w.find(lemmas[0][:-idx]) == -1) and b == False): russian = russian.replace(w,'<b>' + w.upper() + '</b>', 1) b = True if(b == False): wsl = '' for idw, w in enumerate(ws): wsl += str(idw) + ' — ' + w + ', ' ii = input('Выберети основной перевод слова «' + word + ' (to ' + word + ')» — «'+ yd + ' ('+ yd_to + ')»:\n' + wsl[:-2] + ': ') russian = russian.replace(ws[int(ii)],'<b>' + ws[int(ii)].upper() + '</b>', 1) print(russian) else: russian = soup.find(class_='t_inline_en').text.replace('\u2002',' ').replace(' ',' ').strip() elif(soup.find(class_='light_tr')): russian = soup.find(class_='light_tr').text.replace('\u2002',' ').replace(' ',' ').strip() return russian
def tokenize_sentences_lemmatized(rawSentences):
    print('LEMMATIZED total = ' + str(rawSentences.__len__()))
    sentences = []
    m = Mystem()
    index = 0
    for c in rawSentences:
        tokenized_sents = m.lemmatize(c)
        cleaned_set = []
        for tokenized in tokenized_sents:
            if tokenized == "":
                break
            tokenized = tokenized.lower()
            if tokenized in stopwords.words('russian'):
                continue
            token = tokenized[0]
            if (token >= 'а' and token <= 'я'):
                cleaned_set.append(tokenized)
            elif ((token >= 'а' and token <= 'я') or (token >= 'a' and token <= 'z')):
                cleaned_set.append(tokenized)
        if cleaned_set.__len__() > 0:
            sentences.append(cleaned_set)
        if index % 100 == 0:
            print(index)
        index += 1
    return sentences
class Tokenizer: def __init__(self): self.space_pattern = re.compile(r'[^.А-ЯA-ZЁ]+', re.I) self.m = Mystem() try: with open('nw_model/stopwords.txt') as f: self.stop_words = set(f.read().split('\n')) | {''} except FileNotFoundError: self.stop_words = set() print( f'{Fore.RED}WARNING!!! Stop-words file not found!{Style.RESET_ALL}' ) def tokenize_line(self, line): """ Токенизирует одну строку :param line: :return: набор лексем (pymysteam) """ try: return [ word for word in self.m.lemmatize( self.space_pattern.sub(' ', line.lower())) if word.strip() not in self.stop_words ] except BrokenPipeError: self.m = Mystem() return self.tokenize_line(line) def join(self, lst): return self.space_pattern.sub(' ', ' '.join(lst))
def extract(input_filename, output_filename, number_of_documents, log_step, whole_size, index, verbose): m = Mystem() with open(output_filename, 'w', encoding='utf-8') as csvfile_out: with open(input_filename, "r", encoding="utf-8") as csvfile_in: datareader = csv.reader(csvfile_in) datawriter = csv.writer(csvfile_out) abs_step = int(whole_size * log_step / 100) count = 0 for row in datareader: if count == 0: datawriter.writerow( insert_lemmatized_text_into_row( row, 'text_lemmas', index)) else: datawriter.writerow( insert_lemmatized_text_into_row( row, ''.join(m.lemmatize(row[index])), index)) if (number_of_documents > 0) and (count >= number_of_documents): return count elif (whole_size > 0) and (log_step > 0) and ( abs_step > 0) and (count % abs_step == 0): log_percents(count / whole_size * 100, verbose) count += 1 return count
class HHParser: def __init__(self) -> None: self.mystem = Mystem() self.term_extractor = rutermextract.TermExtractor() self.russian_stopwords = stopwords.words("russian") with open( os.path.dirname(os.path.realpath(__file__)) + '/models.json', 'rb') as file: self.models = dict(json.load(file)) nltk.download("stopwords") def preprocess_text(self, text: str, word_limit: int): tokens = self.mystem.lemmatize(text.lower()) tokens = [token.split(" ") for token in tokens] tokens = np.concatenate(tokens) tokens = [token.strip() for token in tokens if token not in self.russian_stopwords \ and token != " " \ and token.strip() not in punctuation] text = " ".join(tokens) terms = self.term_extractor(text, limit=word_limit, strings=True) return terms def answer_questions(self, uid: str, questions: List[str]): answers = {} for question in questions: question_terms = self.preprocess_text(question, 2) answer = parsehh(uid, question_terms=question_terms) if answer is not None and answer is not {}: answers[question] = answer return answers
def collection(folder, stop): m = Mystem() data = {} data_lemmas = [] for root, dirs, files in os.walk(folder): for fname in files: f = open(root + '/' + fname, 'r', encoding='utf-8') article = f.read() f.close() article += '.' title = re.findall('@ti (.*?)\n', article)[0] url = re.findall('@url (.*?)\n', article)[0] text = re.findall('@url.*?\n(.*)\.', article, flags=re.DOTALL)[0] data[title] = [url, text] for key in data: wo_stop = [] data[key][1] = re.sub('\n', ' ', data[key][1]) data[key][1] = re.sub(' – ', ' ', data[key][1]) data[key][1] = re.sub('[.,!?:;\'\"\(\)\[\]«»]', '', data[key][1]) while ' ' in data[key][1]: data[key][1] = re.sub(' ', ' ', data[key][1]) all_words = data[key][1].split(' ') dl = len(all_words) lemmas = m.lemmatize(data[key][1]) for lem in lemmas: if lem not in stop: wo_stop.append(lem) data_lemmas.append([key, data[key][0], wo_stop, dl, all_words]) return data_lemmas, data
def preprocess_text(str1):
    mystem = Mystem()
    tokens = mystem.lemmatize(str1.lower())
    str1 = " ".join(tokens)
    words = []
    for word in str1.split():
        if (word.isalpha()) and (not isEnglish(word)):
            words.append(word)
    res = set()
    for word in words:
        word_adv = word + '_ADJ'
        word_noun = word + '_NOUN'
        try:
            model.similarity(word_adv, 'слово_NOUN')
            res.add(word_adv)
        except BaseException:
            try:
                model.similarity(word_noun, 'слово_NOUN')
                res.add(word_noun)
            except BaseException:
                pass
    return res
def preprocess_text_lemmatize(text, setting="mystem"): text = text.lower() if text == '@@@': return text if setting == "mystem": mystem = Mystem() tokens = mystem.lemmatize(text) elif setting == "pymorphy": tokens = word_tokenize(text, language="russian") morph = pymorphy2.MorphAnalyzer() tokens = [morph.parse(token)[0].normal_form for token in tokens] else: raise Exception('parameter setting should be fill') # if len(tokens) == 1 and tokens[0] == "@@@": # return " ".join(tokens) tokens = [token for token in tokens if token.strip() not in punctuation] tokens = " ".join(tokens) tokens = tokens.replace('-', ' ').replace("``", '').replace( "''", '').replace(".", '').replace("«", '').replace("»", '').replace("—", '').replace("№", '') for symbol in punctuation: tokens = tokens.replace(symbol, '') for symbol in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']: tokens = tokens.replace(symbol, '') # print(tokens) return ' '.join(tokens.split())
class Lemmatizer(BaseProcessor):
    def __init__(self):
        self.m = Mystem()

    def transform(self, tokens, *args):
        lemm_str = " ".join(tokens)
        return list(filter(lambda s: s.strip(), self.m.lemmatize(lemm_str)))
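# Hedged usage sketch for the Lemmatizer processor above (assumes BaseProcessor
# and Mystem are importable in the surrounding module): transform() joins the
# tokens, lemmatizes the whole string in one Mystem call, and drops
# whitespace-only pieces.
lemmatizer = Lemmatizer()
print(lemmatizer.transform(["мама", "мыла", "раму"]))
# expected roughly: ['мама', 'мыть', 'рама']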
def get_tags(text: str) -> List[str]:
    """Get text key words"""
    language = detect(text)
    original_text = text
    normalize_text = None
    keywords_dict = None
    rake_obj = None
    if language == 'ru':
        rake_obj = RAKE.Rake(STOP_WORDS_DIR_RU)
        m = Mystem()
        normalize_text = ''.join(m.lemmatize(original_text))
    if language == 'en':
        rake_obj = RAKE.Rake(STOP_WORDS_DIR_ENG)
        normalize_text = original_text  # English text is used as-is, without lemmatization
    if rake_obj is not None:
        keywords_dict = rake_obj.run(normalize_text, maxWords=2, minCharacters=2)
    keywords = []
    if keywords_dict:
        mean_rate = reduce(lambda item1, item2: item1 + item2[1], keywords_dict, 0) / len(keywords_dict)
        keywords = [item[0] for item in keywords_dict if item[1] >= mean_rate]
    return keywords
def TokenizeSentencesLemmatized(rawSentences, needStemming): print('total = ' + str(rawSentences.__len__())) sentences = [] index = 0 #st = nltk.stem.SnowballStemmer('russian') m = Mystem() for c in rawSentences: #start = time.time() tokenized_sents = m.lemmatize(c) cleaned_set = [] for tokenized in tokenized_sents: if tokenized == "": break tokenized = tokenized.lower() if tokenized in stopwords.words('russian'): continue token = tokenized[0] if (token >= 'а' and token <= 'я') and needStemming: cleaned_set.append(tokenized) elif ((token >= 'а' and token <= 'я') or (token >= 'a' and token <= 'z')): cleaned_set.append(tokenized) if cleaned_set.__len__() > 0: sentences.append(cleaned_set) #end = time.time() #print('Time: ' + str(end - start)) print(index) index += 1 return sentences
def __tokenize_sentences_lemmatized(self, rawSentences): sentences = [] m = Mystem() index = 0 for c in rawSentences: logging.warning(str(datetime.now()) + " tokinizeing " + str(index)) tokenized_sents = m.lemmatize(c) cleaned_set = [] for tokenized in tokenized_sents: if tokenized == "": break tokenized = tokenized.lower() if tokenized in stopwords.words('russian'): continue token = tokenized[0] if (token >= 'а' and token <= 'я'): cleaned_set.append(tokenized) elif ((token >= 'а' and token <= 'я') or (token >= 'a' and token <= 'z')): cleaned_set.append(tokenized) if cleaned_set.__len__() > 0: sentences.append(cleaned_set) index += 1 return sentences
def search():
    if request.args:
        search = request.args['search']
        with open('request.txt', 'w', encoding='utf-8') as fl:
            fl.write(search)
        with open('request.txt', 'r', encoding='utf-8') as f1:
            req = f1.read()
        m = Mystem()
        lemma_text = m.lemmatize(req)
        lemma_text = ' ' + str(''.join(lemma_text)) + ' '
        lemma_res = '%' + str(lemma_text).replace('\n', '') + '%'
        conn = sqlite3.connect('newspaper.db')
        c = conn.cursor()
        c.execute("SELECT title, url, plain FROM newspaper WHERE lemma LIKE ?", (lemma_res,))
        rows = c.fetchall()
        res = []
        for row in rows:
            res.append(row)
        return render_template('search.html', search=search, res=res)
    return render_template('search.html')
def search(request): appropriate = defaultdict(float) m = Mystem() article_data, avdl, inverted_index = main_func() N = len(article_data) text = re.sub('[&!?*&@#/.,:.,"––)(«»№]', '', request) words = [i.lower() for i in text.split()] lemmas = [] for word in words: if word not in stopwords.words('russian'): lll = m.lemmatize(word) lemmas.append(lll[0]) for lemma in lemmas: if lemma in inverted_index: lemma_count = inverted_index[lemma] n = len(lemma_count) for l in lemma_count: data = article_data[l[0]] qf = l[1] dl = data[2] appropriate[(data[0], data[1])] += score_BM25(n, qf, N, dl, avdl) result = sorted(appropriate) return result
def detect_event(self, add_detected_word=False):
    """
    Determines whether the given message contains information about an event

    :param add_detected_word: bool, default=False
        Whether to prepend to the message the word that triggered event detection
    :return: bool
    """
    # Flag showing whether this text contains information about an event
    is_event = False
    # Word that triggered the algorithm
    detected_word = ''
    # Initialize the lemmatizer
    lemmatizer = Mystem(grammar_info=False, entire_input=False)
    # Iterate over all words
    for norm_word in lemmatizer.lemmatize(
            self.convert_text(self.message.description)):
        # Is this word in the dictionary of information-security words
        if norm_word in self.DETECT_EVENT_SET:
            is_event = True
            detected_word = norm_word
            break
    if add_detected_word:
        self.message.description = '#' + detected_word + '\n' + self.message.description
    return is_event
class Lemmatisation(object): def __init__(self): self.ru_lem = Mystem() self.en_lem = nltk.stem.WordNetLemmatizer() self.ru_stop_words = set( nltk.corpus.stopwords.words('russian') + [chr(i) for i in range(ord('а'), ord('я') + 1)]) self.en_stop_words = set( nltk.corpus.stopwords.words('english') + [chr(i) for i in range(ord('a'), ord('z') + 1)]) def visible(self, term): if re.search( NOT_DIGIT_OR_LETTER, term ) or term in self.ru_stop_words or term in self.en_stop_words: return False return True def _lemmatize(self, doc): lemmas = self.ru_lem.lemmatize(doc) lemmas = [ self.en_lem.lemmatize(lemma) for lemma in lemmas if self.visible(lemma) ] return ' '.join(lemmas) def lemmatize(self, doc_id, doc): try: return self._lemmatize(doc) except Exception as e: print(doc_id, e)
def main_func(): all_lemmas = {} article_data = {} avdl = 0 for article in os.listdir('./articles'): textwithtags = open('./articles/' + article, 'r', encoding='utf-8-sig').read() url = re.findall('@url (.*)', textwithtags)[0] name = re.findall('@ti (.*)', textwithtags)[0] text = re.findall('article=[0-9]+(.*)', textwithtags, flags=re.DOTALL) m = Mystem() if len(text) > 0: text = text[0] text = re.sub('[&!?*&@#/.,:.,"––)(«»№]', '', text) words = [i.lower() for i in text.split()] lemmas = [] for word in words: if word not in stopwords.words('russian'): lll = m.lemmatize(word) lemmas.append(lll[0]) all_lemmas[article] = lemmas article_data[article] = (url, name, len(lemmas)) avdl += len(lemmas) avdl = avdl / len(all_lemmas) inverted_index = invert_index(all_lemmas) return article_data, avdl, inverted_index
class Lemmatizer:
    def __init__(self, stop_words=None):
        self.stemmer = Mystem()
        self.cache = dict()  # MyCache(maxsize=1000000)
        stop_words = stop_words if stop_words is not None else []
        self.stop_words = set(stop_words + [' ', '\n', '\r\n', '\t'])

    def lemmatize_word(self, word):
        res = self.cache.get(word, None)
        if res is not None:
            return res
        lm = self.stemmer.lemmatize(word)
        lm = [w for w in lm if w not in self.stop_words]
        if len(lm) == 0:
            return None
        lemmatized_word = max(lm, key=lambda x: len(x))
        self.cache[word] = lemmatized_word
        return lemmatized_word

    def fit_transform(self, words):
        if len(words) == 0:
            return []
        res = [self.lemmatize_word(w) for w in words]
        res = [w for w in res if w is not None]
        return res
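# A small sketch, not from the original source, of how the caching Lemmatizer
# above might be used (stop word list and inputs are made up); repeated words
# hit the in-memory cache and avoid extra Mystem calls.
stop = ['и', 'в', 'не']
lemmatizer = Lemmatizer(stop_words=stop)
print(lemmatizer.fit_transform(['мама', 'мыла', 'раму', 'раму']))
# roughly: ['мама', 'мыть', 'рама', 'рама'] (the second 'раму' is served from the cache)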
def process_text(text, min_word_size=4, min_sent_size=10, extra_stop=None, remove_short=True): html_cleaner = re.compile('<.*?>') cyrillic = re.compile(r'[^а-яА-Я ]') esc_punctuation = re.compile('[%s]' % re.escape(string.punctuation)) extra_spaces = re.compile(r'\s{2,}') stop_list = stopwords.words('russian') if extra_stop: stop_list.extend(['']) lmtzr = Mystem() n_sents = len(text) for i in tqdm(range(n_sents)): # html removal text[i] = html_cleaner.sub(' ', text[i]) # there's NO NoneType error # punctuation and numbers removal text[i] = esc_punctuation.sub(' ', text[i]) # leaving only cyrillic words text[i] = cyrillic.sub(' ', text[i]) # extrace spacing text[i] = extra_spaces.sub(' ', text[i]) # lemmatization (Hint: look to pymystem3 docs) text[i] = ''.join(lmtzr.lemmatize(text[i])).strip() # stopwords removal + lowercasing text[i] = ' '.join([word.lower() for word in text[i].split() \ if len(word) >= min_word_size and word not in stop_list]) text = [sent for sent in text if len(sent) >= min_sent_size] print('done!') return text
class NewsTextDataset: def __init__(self): self.data = [] self.unique_ids = [] self.mystem = Mystem() self.russian_stopwords = stopwords.words("russian") def append(self, article: Article): if article.article_id not in self.unique_ids: self.unique_ids.append(article.article_id) self.data.append(article) return True else: return False def save(self, path): with open(path, "w") as fp: data = { "catalog": [ob.__dict__ for ob in self.data], } json.dump( data, fp, sort_keys=True, indent=4, ensure_ascii=False, ) def load(self, path): with open(path) as json_file: data = json.load(json_file) self.data = [Article(dict_object=obj) for obj in data["catalog"]] def preprocess(self): for idx, article in tqdm(enumerate(self.data)): # r"[a-zA-Z]|\$|\d*|\(|\)|/@" pattern = r"[^а-яА-Я\s]" text = re.sub(pattern, "", article.text) tokens = self.mystem.lemmatize(text.lower()) tokens = [ token for token in tokens if token not in self.russian_stopwords and token != " " and token.strip() not in punctuation and ad.is_cyrillic(token) ] article.tokenized_text = tokens self.update(article, idx) def dump_to_pandas(self): return pd.DataFrame.from_records( [article.to_dict() for article in self.data], ) def __len__(self): return len(self.data) def update(self, article, idx): self.data[idx].tokenized_text = article.tokenized_text # Useless for now def __getitem__(self, idx): return self.data[idx]
class ActionDocs(Action): def __init__(self): self.m = Mystem() self.countries = json.load(open(file, "r")) def name(self) -> Text: return "action_get_docs" def run(self, dispatcher: CollectingDispatcher, tracker: Tracker, domain: Dict[Text, Any]) -> List[Dict[Text, Any]]: input_country = tracker.get_slot('country') lemmas = self.m.lemmatize(input_country) country = lemmas[0].capitalize() found = False for i in self.countries: if country == i["country"] or input_country == i["country"].lower( ): if input_country == i["country"]: country = i["country"] found = True dispatcher.utter_message(text=f"{i['documents']}") break if not found: dispatcher.utter_message( text=f"Я не знаю такую страну '{country}'") return []
def func_lemma(text):
    m = Mystem()  # lemmatization model
    lemmas = []
    for i in range(len(text)):
        # join all words of the i-th topic after lemmatization
        lemmas.append(''.join(m.lemmatize(text[i])))
    return lemmas
class ActionInZone(Action): def __init__(self): self.countries = json.load(open(file, "r")) self.m = Mystem() self.schengens = [ "Австрия", "Бельгия", "Чешская Республика", "Дания", "Эстония", "Финляндия", "Франция", "Германия", "Греция", "Венгрия", "Исландия", "Италия", "Латвия", "Литва", "Люксембург", "Мальта", "Голландия", "Норвегия", "Польша", "Португалия", "Словакия", "Словения", "Испания", "Швеция", "Швейцария", "Лихтенштейн" ] def name(self) -> Text: return "is_schengen_zone" def run(self, dispatcher: CollectingDispatcher, tracker: Tracker, domain: Dict[Text, Any]) -> List[Dict[Text, Any]]: country = tracker.get_slot('country') lemmas = self.m.lemmatize(country) country = lemmas[0].capitalize() if country in self.schengens: dispatcher.utter_message(text=f"{country} член шенгенской зоны") else: dispatcher.utter_message( text=f"{country} не входит в шенгенскую зону") return []
class Word2vecProcessor(object):
    """Object for computing word similarity with a word2vec model"""

    def __init__(self, w2v_model_file):
        self.mystem = Mystem()
        self.word2vec = KeyedVectors.load_word2vec_format(w2v_model_file, binary=True)
        self.lemma2word = {
            word.split('_')[0]: word
            for word in self.word2vec.index2word
        }

    def word_vector(self, word):
        lemma = self.mystem.lemmatize(word)[0]
        word = self.lemma2word.get(lemma)
        return self.word2vec[word] if word in self.word2vec else None

    def text_vector(self, text):
        """Text vector, obtained by averaging the vectors of all words in the text"""
        word_vectors = [
            self.word_vector(token)
            for token in word_tokenize(text.lower())
            if token.isalpha()
        ]
        word_vectors = [vec for vec in word_vectors if vec is not None]
        return np.mean(word_vectors, axis=0)

    def distance(self, vec1, vec2):
        if vec1 is None or vec2 is None:
            return 2
        return cosine(vec1, vec2)
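# Hedged usage sketch for Word2vecProcessor above. The model file name is an
# assumption (a typical tagged Russian word2vec dump), not from the original
# source; the guard keeps the snippet harmless when the file is absent.
import os

MODEL_FILE = 'ruscorpora_upos_skipgram_300_5_2018.vec.bin'  # hypothetical model file
if os.path.exists(MODEL_FILE):
    w2v = Word2vecProcessor(MODEL_FILE)
    v1 = w2v.text_vector('кошка сидит на окне')
    v2 = w2v.text_vector('кот лежит на подоконнике')
    print(w2v.distance(v1, v2))  # smaller cosine distance means more similar texts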
class TextsLematizer():
    def __init__(self):
        self.m = Mystem()

    # function that preprocesses a text
    def text_hangling(self, text: str):
        try:
            txt = re.sub('[^a-zа-я\d]', ' ', text.lower())
            txt = re.sub('\s+', ' ', txt)
            # any other processing, including variational, can be plugged in here
            return txt
        except:
            return ""

    # function that lemmatizes a single text
    def text_lemmatize(self, text: str):
        try:
            lemm_txt = self.m.lemmatize(text)
            lemm_txt = [w for w in lemm_txt if w not in [' ', '\n']]
            return lemm_txt
        except:
            return ['']

    # function that lemmatizes a list of texts
    def texts_lemmatize(self, texts_list):
        return [
            self.text_lemmatize(self.text_hangling(tx)) for tx in texts_list
        ]
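# Brief sketch (inputs are made up, not from the original source) of the
# TextsLematizer pipeline above: each text is cleaned with text_hangling()
# and then lemmatized with whitespace tokens dropped.
lematizer = TextsLematizer()
texts = ['Мама мыла раму!', 'Коты спали на диване...']
print(lematizer.texts_lemmatize(texts))
# roughly: [['мама', 'мыть', 'рама'], ['кот', 'спать', 'на', 'диван']]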
def extract(self): try: #вычисляем, сколько в директории лежит файлов input_files = filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory)) output_data = {} list_of_all_terms = {} m = Mystem() #иду по документам for file in input_files: with open(self.input_directory + '/' + file) as data_file: data = json.load(data_file) list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|<|>|\*|!|@|_ +""", data['text'])) text = " ".join(["%s" % term for term in list_of_terms]) list_of_terms = filter(lambda x: x != " ", m.lemmatize(text)) count_of_rows = 0 for i in range(0, len(list_of_terms)): if list_of_terms[i] == '\n' or list_of_terms[i] == ' \n': count_of_rows += 1 if list_of_terms[i] == ' \n': list_of_terms[i] = '\n' if count_of_rows < self.threshold_of_rows_count: first_list_of_terms = list_of_terms list_of_terms = [] for i in range(0, len(first_list_of_terms)): if first_list_of_terms[i] != '\n': list_of_terms.append(first_list_of_terms[i]) output_data[file] = {} output_data[file]['id'] = data['id'] output_data[file]['positive'] = data['positive'] output_data[file]['sarcasm'] = data['sarcasm'] output_data[file]['terms'] = {} #убираю повторяющиеся слова for term in list_of_terms: if term not in output_data[file]['terms']: output_data[file]['terms'][term] = 1 else: output_data[file]['terms'][term] += 1 for term in output_data[file]['terms'].keys(): if term not in list_of_all_terms: list_of_all_terms[term] = 1 else: list_of_all_terms[term] += 1 #подсчёт tf count_of_terms = output_data[file]['terms'][term] output_data[file]['terms'][term] = {'tf': float(count_of_terms)/len(list_of_terms), 'idf': 0, 'count': count_of_terms} for file in input_files: #подсчёт idf for term in output_data[file]['terms'].keys(): output_data[file]['terms'][term]['idf'] = math.log(float(len(input_files))/list_of_all_terms[term]) #запись результата with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file: json.dump(output_data[file], output_file) except Exception: return False else: return True
def extract(self): try: #вычисляем, сколько в директории лежит файлов input_files = filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory)) output_data = {} list_of_all_terms = {} m = Mystem() #иду по документам for file in input_files: with open(self.input_directory + '/' + file) as data_file: data = json.load(data_file) list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data['text'])) text = " ".join(["%s" % term for term in list_of_terms]) list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text)) my_list = list_of_terms list_of_terms = [] for term in my_list: if m.analyze(term)[0].get(u'analysis'): if not m.analyze(term)[0][u'analysis'][0][u'gr'].startswith(self.service_parts_of_speech) and len(term) > 1: list_of_terms.append(term) if term == u'не': list_of_terms.append(term) else: list_of_terms.append(term) output_data[file] = {} output_data[file]['id'] = data['id'] output_data[file]['positive'] = data['positive'] output_data[file]['sarcasm'] = data['sarcasm'] output_data[file]['terms'] = {} #убираю повторяющиеся слова for term in list_of_terms: if term not in output_data[file]['terms']: output_data[file]['terms'][term] = 1 else: output_data[file]['terms'][term] += 1 for term in output_data[file]['terms'].keys(): if term not in list_of_all_terms: list_of_all_terms[term] = 1 else: list_of_all_terms[term] += 1 #подсчёт tf count_of_terms = output_data[file]['terms'][term] output_data[file]['terms'][term] = {'tf': float(count_of_terms)/len(list_of_terms), 'idf': 0, 'count': count_of_terms} for file in input_files: #подсчёт idf for term in output_data[file]['terms'].keys(): output_data[file]['terms'][term]['idf'] = math.log(float(len(input_files))/list_of_all_terms[term]) #запись результата with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file: json.dump(output_data[file], output_file) except Exception: return False else: return True
class Runner(object): def __init__(self, input_text): self.lemmatize = None while True: response = raw_input("Do you want to lemmatize text first? (yes/no)\n").lower() if response == "yes": print "You should wait for a while" self.lemmatize = True self.stemmer = Mystem() break elif response == "no": self.lemmatize = False break self.word_lists = list() with open(input_text, "r") as f: for line in f: line += "." if self.lemmatize: lexemes = self.stemmer.lemmatize(line) word_list = list() # список слов, неразделенных знаками пунктуации for lexeme in lexemes: lexeme = lexeme.strip() if lexeme: if lexeme.translate(None, '.,?!:;()"\' -\t\n'): # проверка, что лексема не является знаком пунктуации lexeme = lexeme.decode("utf-8") if is_cyrillic(lexeme): word_list.append(lexeme) else: # иначе, добавить биграмы из списка и завести новый пустой список self.word_lists.append(word_list) word_list = list() else: line = line.replace(".", " . ").replace(",", " , ").replace(":", " : ").replace(";", " ; ")\ .replace("?", " ? ").replace("!", " ! ").replace("(", " ( ").replace(")", " ) ")\ .replace("--", " -- ").replace(".", " . ") word_list = list() for lexeme in line.split(): # проверка, что лексема не является знаком пунктуации lexeme = lexeme.translate(None, '.,?!:;()"\'').replace("--", "").decode("utf-8").strip().lower() if lexeme: if is_cyrillic(lexeme): word_list.append(lexeme) else: if word_list: self.word_lists.append(word_list) word_list = list() train, test = self.split() self.lid = Lid(train, test) self.lid.run() def split(self): n = len(self.word_lists) train = self.word_lists[:n*9/10] test = self.word_lists[n*9/10:] return train, test
class Index(object): def __init__(self, input_file): self.stemmer = Mystem() self.documents = dict() self.tokens = list() self.terms = dict() self.index = list() # reading documents, making tokenization with open(input_file, "r") as f: for i, line in enumerate(f, start=1): self.documents[i] = line.decode("utf-8") for word in self.stemmer.lemmatize(line): token = word.translate(None, '.,?!:;()"\'-').decode("utf-8").strip() if token: self.tokens.append((token, i)) # sorting by tokens first, then by frequency self.tokens.sort(key=lambda tup: (tup[0], tup[1])) # terminization and building index current_term = self.tokens[0][0] current_doc_id = self.tokens[0][1] doc_ids = [current_doc_id] for token, doc_id in self.tokens: term = token.lower() if term == current_term: if doc_id != current_doc_id: doc_ids.append(doc_id) current_doc_id = doc_id else: self.terms[current_term] = (len(doc_ids), doc_ids) self.index.append((current_term, len(doc_ids), doc_ids)) current_term = term current_doc_id = doc_id doc_ids = [doc_id] self.terms[current_term] = (len(doc_ids), doc_ids) self.index.append((current_term, len(doc_ids), doc_ids)) def print_to_file(self): with open("result.txt", "w") as f: for term, count, doc_ids in self.index: f.write("{},\t{},\t{}\n".format(term.encode("utf-8"), count, doc_ids)) def print_statistics(self): terms_num = len(self.terms) terms_len = 0. for term in self.terms: terms_len += len(term) print "***********************" print "Number of terms = {}".format(terms_num) print "Average term length = {}".format(terms_len / terms_num) print "***********************"
def extract(self): try: #вычисляем, сколько в директории лежит файлов input_files = filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory)) output_data = {} list_of_all_n_grams = {} m = Mystem() #иду по документам for file in input_files: with open(self.input_directory + '/' + file) as data_file: data = json.load(data_file) list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data['text'])) text = " ".join(["%s" % term for term in list_of_terms]) list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text)) list_of_n_grams_tuples = {} for j in range(0, self.n): list_of_n_grams_tuples[j] = zip(*[list_of_terms[i:] for i in range(j + 1)]) list_of_n_grams_strings = [] for j in range(0, self.n): for gram_tuple in list_of_n_grams_tuples[j]: string_of_n_gram = " ".join(["%s" % term for term in gram_tuple]) list_of_n_grams_strings.append(string_of_n_gram) output_data[file] = {} output_data[file]['id'] = data['id'] output_data[file]['positive'] = data['positive'] output_data[file]['sarcasm'] = data['sarcasm'] output_data[file]['terms'] = {} #убираю повторяющиеся слова for gram in list_of_n_grams_strings: if gram not in output_data[file]['terms']: output_data[file]['terms'][gram] = 1 else: output_data[file]['terms'][gram] += 1 for gram in output_data[file]['terms'].keys(): if gram not in list_of_all_n_grams: list_of_all_n_grams[gram] = 1 else: list_of_all_n_grams[gram] += 1 #подсчёт tf count_of_n_grams = output_data[file]['terms'][gram] output_data[file]['terms'][gram] = {'tf': float(count_of_n_grams)/len(list_of_n_grams_strings), 'idf': 0, 'count': float(count_of_n_grams)} for file in input_files: #подсчёт idf for gram in output_data[file]['terms'].keys(): output_data[file]['terms'][gram]['idf'] = math.log(float(len(input_files))/list_of_all_n_grams[gram]) #запись результата with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file: json.dump(output_data[file], output_file) except Exception: return False else: return True
class Index(object): def __init__(self, input_file): self.stemmer = Mystem() self.tokens = list() self.index = dict() self.number_of_documents = 0 try: self.read_from_file_compressed("index_compressed.txt") except: # reading documents, making tokenization with open(input_file, "r") as f: for line in f: self.number_of_documents += 1 # self.documents[i] = line.decode("utf-8") for word in self.stemmer.lemmatize(line): token = word.translate(None, '.,?!:;()"\'-').decode("utf-8").strip() if token: self.tokens.append((token, self.number_of_documents)) # sorting by tokens first, then by frequency self.tokens.sort(key=lambda tup: (tup[0], tup[1])) # terminization and building index current_term = self.tokens[0][0] current_doc_id = self.tokens[0][1] doc_ids = [current_doc_id] for token, doc_id in self.tokens: term = token.lower() if term == current_term: if doc_id != current_doc_id: doc_ids.append(doc_id) current_doc_id = doc_id else: self.index[current_term] = (len(doc_ids), pack_doc_ids(doc_ids)) current_term = term current_doc_id = doc_id doc_ids = [doc_id] self.index[current_term] = (len(doc_ids), pack_doc_ids(doc_ids)) del self.tokens self.write_index_in_file() def write_index_in_file(self): with open("index_compressed.txt", "w") as f: pickle.dump(self.index, f) def read_from_file_compressed(self, index_file): with open(index_file, "r") as f: self.index = pickle.load(f)
def mystem_using(input_directory, output_directory):
    input_files = filter(lambda x: not x.endswith('~'), os.listdir(input_directory))
    output_data = {}
    m = Mystem()
    for input_file in input_files:
        with open(input_directory + '/' + input_file) as data_file:
            data = json.load(data_file)
            list_of_terms = filter(lambda x: x != '',
                                   re.split(''' |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +''', data['text']))
            text = " ".join(["%s" % term for term in list_of_terms])
            list_of_terms = filter(lambda x: x not in ('', ' ', '\n'), m.lemmatize(text))
            text_of_output = ' '.join(['%s' % term for term in list_of_terms])
            output_data[input_file] = {}
            output_data[input_file]['id'] = data['id']
            output_data[input_file]['positive'] = data['positive']
            output_data[input_file]['sarcasm'] = data['sarcasm']
            output_data[input_file]['text'] = text_of_output
            with open(output_directory + '/' + input_file, 'w') as output_file:
                json.dump(output_data[input_file], output_file)
def search(): cn = None file = codecs.open('static/articles.xml', 'r', 'utf-8') rfile = file.read() tree = lxml.etree.fromstring(rfile) res = tree.xpath('entry') categ = { 'cat': 'Категория', 'wgroup': 'Группа слов с близким значением', 'comm': 'Комментарии', 'stdiff': 'Стилистические различия', 'overlap': 'Совпадающая часть значения', 'dom': 'Доминанта группы', 'diffmark': 'Различительные признаки, релевантные для данной группы', 'diff': 'Смысловые различия', 'rare': 'Редкие слова, примыкающие к группе', 'anmean': 'Другие значения слов, входящих в группу', 'comb': 'Сочетаемость', 'reg': 'Региональные варианты', 'adict': 'Данные академических словарей', 'doc': 'Нормативные документы', 'etim': 'Этимология', 'ill': 'Иллюстрации' } file.close() ms = Mystem() wordsearch = ms.lemmatize(request.form['search'].lower())[0] for i in res: if wordsearch == '': cn = 'Пустой запрос' elif i.text.lower().startswith(wordsearch): arr = [] for j in i.iter(): for k in dict.keys(categ): if j.tag == k: if j.text != 'null': arr.append('<font size="4"><b>' + str(categ[j.tag]) + '</b></font><br>' + str(j.text)) text = '<br><br>'.join([j for j in arr[1:]]) text = re.sub('\*', '<b>', text) text = re.sub('\#', '</b>', text) text = re.sub('\$', '<i>', text) text = re.sub('\%', '</i>', text) text = re.sub('\@', '<font color="#696969">', text) text = re.sub('\+', '</font>', text) cn = '<strong><big>' + i.text + '</big></strong><br><br>' + re.sub('\n', '<br>', text) break else: cn = 'По Вашему запросу ничего не найдено. <br>' \ 'Попробуйте использовать "Поиск по тегу" или измените запрос.' return render_template('search.html', cn=Markup(cn))
def __init__(self, input_text): self.number_of_words = 0 self.number_of_bigrams = 0 self.words_frequency = dict() self.bigrams_frequency = dict() self.words_position = dict() # как часто слово W находится в первой и во второй позиции в биграмме while True: response = raw_input("Do you want to lemmatize text first? (yes/no)\n").lower() if response == "yes": print "You should wait for a while" LEMMATIZE = True stemmer = Mystem() break elif response == "no": LEMMATIZE = False break with open(input_text, "r") as f: for i, line in enumerate(f, start=1): line = line + "." if LEMMATIZE: lexemes = stemmer.lemmatize(line) words_list = list() # список слов, неразделенных знаками пунктуации for lexeme in lexemes: lexeme = lexeme.strip() if lexeme: if lexeme.translate(None, '.,?!:;()"\' -\t\n'): # проверка, что лексема не является знаком пунктуации lexeme = lexeme.decode("utf-8") if is_cyrillic(lexeme): words_list.append(lexeme) else: # иначе, добавить биграмы из списка и завести новый пустой список n = len(words_list) if n > 1: w1 = words_list[0] self.__add_word(w1) for w2 in words_list[1:]: self.__add_word(w2) self.__add_bigram(w1, w2) w1 = w2 words_list = list() else: line = line.replace(".", " . ").replace(",", " , ").replace(":", " : ").replace(";", " ; ")\ .replace("?", " ? ").replace("!", " ! ").replace("(", " ( ").replace(")", " ) ")\ .replace("--", " -- ").replace(".", " . ") words_list = list() for lexeme in line.split(): # проверка, что лексема не является знаком пунктуации lexeme = lexeme.translate(None, '.,?!:;()"\'').replace("--", "").decode("utf-8").strip().lower() if lexeme: if is_cyrillic(lexeme): words_list.append(lexeme) else: n = len(words_list) if n > 1: w1 = words_list[0] self.__add_word(w1) for w2 in words_list[1:]: self.__add_word(w2) self.__add_bigram(w1, w2) w1 = w2 words_list = list() if i % 1000 == 0: print "Computing line {}".format(i) print "total words = {}".format(self.number_of_words) print "unique words = {}".format(len(self.words_frequency)) print "total bigrams = {}".format(self.number_of_bigrams) print "unique bigrams = {}".format(len(self.bigrams_frequency)) with open("bigrams.txt", "w") as f: bigrams = list(self.bigrams_frequency.items()) bigrams.sort(key=lambda tup: (-tup[1], tup[0])) for bigram in bigrams: f.write("{}\n".format(bigram[0].encode("utf-8")))
from pymystem3 import Mystem

m = Mystem()
t = 'Чайника, сегодня не было'
lemma = m.lemmatize(t)


def lemmas(text):
    punc = list('.?!-;:",')
    text = [i for i in text if i not in punc]
    text = ''.join(text)
    text = m.lemmatize(text)
    textn = ''
    for w in text:
        if w not in (' ', '\n'):
            textn += w
    return textn


from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import os

s_w = stopwords.words('russian')
sw = [i for i in s_w]
v = TfidfVectorizer(stop_words=sw)  # remove stop words
# v = TfidfVectorizer()  # keep stop words
totalCorpus = []
suspenseCorpus = ''
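# A short, hedged continuation sketch (the documents are made up, not from the
# original source): fit the vectorizer configured above on already lemmatized,
# space-separated strings, the way totalCorpus is presumably filled later.
sample_docs = ['чайник сегодня не быть', 'чайник вчера быть на кухня']
matrix = v.fit_transform(sample_docs)
print(matrix.shape)  # (number of documents, vocabulary size)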
with open(file_in) as parsed_in, \
        open("..\\data\\stemmed\\" + name + "_mystem.tsv", "wb") as mystem_out:
    # open("..\\data\\stemmed\\" + name + "_porter.tsv", "wb") as porter_out, \
    parsed_in = csv.reader(parsed_in, delimiter='\t')
    mystem_out = csv.writer(mystem_out, delimiter='\t')  # , quoting=csv.QUOTE_NONE
    mystem = Mystem()
    prep_counter = 0
    for row in parsed_in:
        exclude = ['\'', '\"', '.', ',', '!', '?', u'«', u'»']
        s = ''.join(ch for ch in row[1].decode("utf-8") if ch not in exclude)
        stemmed_tokens = mystem.lemmatize(s)
        stemmed_tokens = [token if emoticon_re.search(token) else token.lower()
                          for token in stemmed_tokens]
        # punctuation = list(string.punctuation.decode("utf-8"))
        # stop = punctuation
        # stop = ['!', '"', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/',
        #         ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']  # '@',
        stop = ['rt', 'via', '...', "…".decode("utf-8")]
        stemmed_tokens = [token if token not in stop else '' for token in stemmed_tokens]
        stemmed_str = "".join([token for token in stemmed_tokens])
        mystem_out.writerow([row[0], stemmed_str.encode("utf-8").replace('\n', ' ')])
        prep_counter += 1
        # Print a status message every 100th row
        if prep_counter % 100. == 0.:
            print "Lemmatize %d strings" % (prep_counter)
with open("../data/" + PREFIX + "norm_sentences.txt", "w") as writer: count = 0 raw = [] normalized = [] for line in open("../data/" + PREFIX + "parsed.txt"): if count % 1000 == 0: print count line = re.sub("[\W\d]+", " ", line.strip().decode("utf-8").strip(), 0, re.UNICODE) line = re.sub("\s+", " ", line.strip(), 0, re.UNICODE).lower() raw.extend(line.split(" ")) writer.write("* " + line.encode("utf-8") + " **;") # print line, '->', line = " ".join(normalizer.lemmatize(line)) line = re.sub("\s+", " ", line, 0, re.UNICODE) lemmatized = filter(lambda x: len(x.strip()) > 0, normalizer.lemmatize(line)) normalized.extend(lemmatized) # print line writer.write("* " + " ".join(lemmatized).encode("utf-8") + " **\n") count += 1 # print 'saving raw' # # with open("../data/raw_terms.txt", "w") as f: # for term in set(raw): # f.write(term.encode("utf-8") + "\n") # # print 'saving norm'
# Lemmatize a text using pymystem3
import sys

from pymystem3 import Mystem

text = sys.argv[1]
m = Mystem()
lemmas = m.lemmatize(text)
print(''.join(lemmas))
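# Illustrative invocation of the script above (the file name lemmatize_cli.py
# is an assumption, not from the original source):
#
#   $ python lemmatize_cli.py "Мама мыла раму"
#   мама мыть рама
#
# Spaces between lemmas are preserved because Mystem keeps the non-lemmatized
# parts of the input (entire_input=True by default).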
def test_mystem_abc(self):
    m = Mystem()
    tokens = m.lemmatize("ABC")
    assert ["ABC", "\n"] == tokens
def test_mystem_not_entireinput(self):
    m = Mystem(entire_input=False)
    tokens = m.lemmatize("Мама мыла раму")
    assert ["мама", "мыть", "рама"] == tokens
def test_mystem(self):
    m = Mystem()
    tokens = m.lemmatize("Мама мыла раму")
    assert ["мама", " ", "мыть", " ", "рама", "\n"] == tokens
def lemma(text):
    m = Mystem()
    lemmas = m.lemmatize(text)
    titleStemmed = ''.join(lemmas)
    return titleStemmed
temporal.append(lines[i]) if temporal: if lines[i].find('>0<') != -1: temporal.append('0') elif lines[i].find('>-1<') != -1: temporal.append('-1') elif lines[i].find('>1<') != -1: temporal.append('1') if len(temporal) == 2: strings.append(temporal[0]) labels.append(temporal[1]) temporal = [] #lemmatization mystem = Mystem() lemmas_norm = list(set(mystem.lemmatize(words_str))) #cleaning up the sentences rus_symbols = re.compile('[а-я]|\s') text = '' for string in strings: for symbol in string: if rus_symbols.search(symbol): text += symbol text += '. ' sentences = text.split('. ') for i in range(len(sentences)): sentences[i] = sentences[i].strip()
def extract(self): try: #вычисляем, сколько в директории лежит файлов input_files = filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory)) output_data = {} list_of_all_terms = {} m = Mystem() #иду по документам for file in input_files: with open(self.input_directory + '/' + file) as data_file: data = json.load(data_file) list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data['text'])) text = " ".join(["%s" % term for term in list_of_terms]) list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text)) # обработка не + (слово) nums_of_bigrams = [] helping_words = [u'совсем', u'очень', u'слишком', u'самый'] for i in range(0, len(list_of_terms)): if list_of_terms[i] == u'не' and list_of_terms[i+1] not in helping_words: nums_of_bigrams.append((i, i+1)) elif list_of_terms == u'не' and list_of_terms[i+1] in helping_words: nums_of_bigrams.append((i, i+2)) for i in range(0, len(nums_of_bigrams)): if nums_of_bigrams[i][0] + 1 == nums_of_bigrams[i][1]: list_of_terms[nums_of_bigrams[i][0]] = list_of_terms[nums_of_bigrams[i][0]] + ' ' + list_of_terms[nums_of_bigrams[i][1]] list_of_terms[nums_of_bigrams[i][1]] = '' elif nums_of_bigrams[i][0] + 2 == nums_of_bigrams[i][1]: list_of_terms[nums_of_bigrams[i][0]] = list_of_terms[nums_of_bigrams[i][0]] + ' ' + list_of_terms[nums_of_bigrams[i][1]] list_of_terms[nums_of_bigrams[i][1] - 1] = '' list_of_terms[nums_of_bigrams[i][1]] = '' list_of_terms = filter(lambda x: x != '', list_of_terms) output_data[file] = {} output_data[file]['id'] = data['id'] output_data[file]['positive'] = data['positive'] output_data[file]['sarcasm'] = data['sarcasm'] output_data[file]['terms'] = {} #убираю повторяющиеся слова for term in list_of_terms: if term not in output_data[file]['terms']: output_data[file]['terms'][term] = 1 else: output_data[file]['terms'][term] += 1 for term in output_data[file]['terms'].keys(): if term not in list_of_all_terms: list_of_all_terms[term] = 1 else: list_of_all_terms[term] += 1 #подсчёт tf count_of_terms = output_data[file]['terms'][term] output_data[file]['terms'][term] = {'tf': float(count_of_terms)/len(list_of_terms), 'idf': 0, 'count': count_of_terms} for file in input_files: #подсчёт idf for term in output_data[file]['terms'].keys(): output_data[file]['terms'][term]['idf'] = math.log(float(len(input_files))/list_of_all_terms[term]) #запись результата with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file: json.dump(output_data[file], output_file) except Exception: return False else: return True
API_KEY = "api_key" if __name__ == "__main__": not_translated = [] dictionary = {} print(len(dictionary.keys())) m = Mystem() df = pandas.read_csv("/media/alexander/b32bf4b4-8724-4107-9d19-abf6615c2f60/alexander/HELP_FILE/query.yaHotelId.showInTop.sure.final.tsv", sep="\t") df_size = len(df["query"]) k = 1 for line in df["query"]: print(k, "query from", df_size) k += 1 for word in line.strip().split(): lema_word = m.lemmatize(word)[0] if dictionary.get(lema_word) is None: params = {"key": API_KEY, "text": lema_word, "lang": "ru-en"} try: r = requests.get("https://translate.yandex.net/api/v1.5/tr.json/translate", params=params) r_json = r.json() trans_word = r_json["text"][0] if r_json["code"] != 200: print("ERROR", r_json["code"]) not_translated.append(lema_word) continue except Exception as exc: print("ERROR") not_translated.append(lema_word) continue if (len(trans_word.split()) > 1):
def poehali(csv_input): ''' Основная функция csv_input -- файл с таблицей ссылок На выходе |-xmlFile/ |---------year/ |--------------month/ ========= |-plain/ |-------year/ |------------month/ ========= |-html/ |------year/ |-----------month/ |csv_file.csv ''' data = [] i = 0 m = Mystem() gusina() col = ["path", "author", "sex", "birthday", "header", "created", "sphere", "genre_fi", "type", "topic", "chronotop", "style", "audience_age", "audience_level", "audience_size", "source", "publication", "publisher", "publ_year", "medium", "country", "region", "language"] time.sleep(3) path = os.getcwd() path = path + "/" csv_file = open(path + "csv_file.csv", "w") writer = csv.writer(csv_file,delimiter = ",") writer.writerow(col) dosugvbryanske = re.compile("^(http://www.briansk.ru/)(.+)") with open(csv_input) as csvfile: reader = csv.DictReader(csvfile) for row in reader: if re.search(dosugvbryanske, row['url']): print('passing on ' + str(i)) test = urllib.request.urlopen(row['url']).read().decode('cp1251') file_html = path+"/"+str(i)+".html" file_html1 = path+"/"+str(i-1)+".html" dest_html = str(i)+".html" plain = str(i)+".txt" plain_new = str(i)+"_plained.txt" plain_stem = str(i)+"_mystem.txt" output_plain_stem = str(i)+"_out_mystem.txt" xmlFile = str(i) + ".xml" #dir_for_stem = "XML_STEM" page1_html = open(file_html, 'w') page1_html.write(str(test)) page1_html.close() print("FILE EX: "+ str(os.path.exists(file_html))) pageMoving = open(file_html, 'r') #print(file_html + " PATH " + dest_html+"\n") if os.path.exists(file_html1): os.remove(file_html1) print("FILE "+str(i-1)+" HB REMOVED") else: print("FILE "+str(i-1)+" HB ALREADY MOVED") for line in pageMoving: data = re.search(r"\">[0-9]{1,2}\s{1}((янв|февр|март|апре|май|июнь|июль|авг|сентя|октяб|нояб|декаб)[а-я]{1,}\s[0-9]{4})|\">[0-9]{1,2}\s{1}(ма(а|я)\s[0-9]{4})", line) if data: ''' Определение датирования статьи ''' dates = data.group() dates2 = dates.split() year = dates2[2] month = dates2[1] create_folder(path, year, transpose_month(month), "html") shutil.move(file_html, path+"html/"+year+"/"+transpose_month(month)+"/"+dest_html) print("FILE "+str(i)+" HB MOVED") ''' Созидание директории для XML ''' create_folder(path, year, transpose_month(month), "xmlFile") forxml = path+"xmlFile/"+year+"/"+transpose_month(month)+"/"+dest_html forxml_dir = path+"xmlFile/"+year+"/"+transpose_month(month)+"/" xml_stem = forxml_dir + str(i) + "_mystem.xml" rofxml = path+"xmlFile/"+year+"/"+transpose_month(month)+"/"+xmlFile ''' Копирование html -> xmldir для дальнейшей обработки ''' shutil.copy(path+"html/"+year+"/"+transpose_month(month)+"/"+dest_html, forxml) print("FILE "+str(i)+" HB COPIED TO XML") openindosug_xml = open(forxml, "r") read_and_clean_xml = openindosug_xml.read() xml_data = amixml(read_and_clean_xml) #print(xml_data[2]) openindosug_xml.close() ''' Созидание директории для plain текста ''' create_folder(path, year, transpose_month(month), "plain") forplain = path+"plain/"+year+"/"+transpose_month(month)+"/"+dest_html forplain_dir = path+"plain/"+year+"/"+transpose_month(month)+"/" shutil.copy(path+"html/"+year+"/"+transpose_month(month)+"/"+dest_html, forplain) print("FILE "+str(i)+" HB COPIED TO PLAIN") openindosug = open(forplain, "r") dates = re.sub("\">", "", dates) ''' wri = лист для генерации ИНФО о статьи ''' wri = ["briansk.ru", str(xml_data[1]), toddmmyyy(dates), "", row['url']] page2_txt = open(str(forplain_dir)+str(plain), 'w') for datline in openindosug: page2_txt.write(str(make_it_clean(datline))) page2_txt.close() print("PLAIN 
FOR "+str(i)+" HB CREATED") ''' Окончательная очистка plain файла; оставляем только текст статьи или текст + ИНФО ''' provide_clean_file(forplain_dir+str(plain),forplain_dir+str(plain_new), wri, "extra") provide_clean_file(forplain_dir+str(plain),forplain_dir+str(plain_stem), wri, "mystem") os.remove(forplain_dir+str(plain)) os.remove(forplain) openindosug.close() ''' xml_data[0] -- content xml_data[1] -- headerTag xml_data[2] -- content date ''' ''' Генерация XML ''' pageEtree = etree.Element('html') doc = etree.ElementTree(pageEtree) infoTag = etree.SubElement(pageEtree, "body") dateTag = etree.SubElement(infoTag, "h1") dateTag.text = str(xml_data[2]) headerTag = etree.SubElement(infoTag, "h2") headerTag.text = str(xml_data[1]) mainTag = etree.SubElement(infoTag, "h3") contentTag = etree.SubElement(infoTag, "h4") contentTag.text = str(xml_data[0]) outFile = open(str(forxml_dir)+str(i)+".xml", 'wb') doc.write(outFile, xml_declaration=True, encoding='utf-16') outFile.close() print("FILE "+str(i)+" HB CODED TO XML") writer.writerow([str(path+"html/"+year+"/"+transpose_month(month)+"/"+dest_html) , "briansk.ru" , "" , "" , str(xml_data[1]) , toddmmyyy(dates), 'публицистика' , "" , "" , "категория" , "" , "нейтральный" , "н-возраст" , "н-уровень" , "городская" , str(row['url']) , "брянск.ru" , "" , str(year) , "газета" , "Россия" , "БРЯНСК" , "ru"]) os.remove(forxml) input_plain = forplain_dir + plain_stem output_plain = forplain_dir + output_plain_stem ''' pystem mystem ''' with open(input_plain) as file: text = file.read() lemmas = m.lemmatize(text) with open(input_plain, 'w') as file: file.write(''.join(lemmas)) os.system(r'/home/haniani/Загрузки/mystem -icd '+ input_plain + ' ' + output_plain) os.system(r'/home/haniani/Загрузки/mystem -icd --format xml '+ input_plain +' '+ xml_stem) print("MYSTEM'ed "+str(i)) break i += 1 print("PASSED ; NEXT: "+str(i)+"\n") csv_file.close() for file in glob.glob(path+"*.html"): os.remove(file)
def extract(self): try: # вычисляем, сколько в директории лежит файлов input_files = filter(lambda x: not x.endswith("~"), os.listdir(self.input_directory)) output_data = {} list_of_all_terms = {} m = Mystem() # иду по документам for file in input_files: with open(self.input_directory + "/" + file) as data_file: data = json.load(data_file) list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data["text"])) text = " ".join(["%s" % term for term in list_of_terms]) list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text)) my_list_of_terms = [] for term in list_of_terms: my_term = term term = u"" prev_letter = my_term[0] term += my_term[0] for i in range(1, len(my_term)): if my_term[i] != prev_letter: term += my_term[i] prev_letter = my_term[i] my_list_of_terms.append(term) list_of_terms = my_list_of_terms output_data[file] = {} output_data[file]["id"] = data["id"] output_data[file]["positive"] = data["positive"] output_data[file]["sarcasm"] = data["sarcasm"] output_data[file]["terms"] = {} # убираю повторяющиеся слова for term in list_of_terms: if term not in output_data[file]["terms"]: output_data[file]["terms"][term] = 1 else: output_data[file]["terms"][term] += 1 for term in output_data[file]["terms"].keys(): if term not in list_of_all_terms: list_of_all_terms[term] = 1 else: list_of_all_terms[term] += 1 # подсчёт tf count_of_terms = output_data[file]["terms"][term] output_data[file]["terms"][term] = { "tf": float(count_of_terms) / len(list_of_terms), "idf": 0, "count": count_of_terms, } for file in input_files: # подсчёт idf for term in output_data[file]["terms"].keys(): output_data[file]["terms"][term]["idf"] = math.log( float(len(input_files)) / list_of_all_terms[term] ) # запись результата with open(self.output_directory + "/" + file + "_tf-idf", "w") as output_file: json.dump(output_data[file], output_file) except Exception: return False else: return True