def test_split_simple(self):
    assert simple_word_tokenize('Мама мыла раму') == ['Мама', 'мыла', 'раму']
    assert simple_word_tokenize('Постой, паровоз!') == ['Постой', ',', 'паровоз', '!']

def general_data():
    path_opencorpora = "." + os.sep + "!data" + os.sep + \
                       "newcorpus" + os.sep + "OpenCorpora_txt_clean"
    files = [
        item for item in os.listdir(path_opencorpora) if item.endswith(".txt")
    ]
    total_pars = []
    total_avs = []
    total_pars_count = 0
    for file in files:
        with open(path_opencorpora + os.sep + file, "r", encoding="utf-8") as f:
            pars = f.readlines()
            total_pars_count += len(pars)
            total_pars.append(len(pars))
            try:
                av = []
                for par in pars:
                    av.append(len(tokenizers.simple_word_tokenize(par)))
                total_avs.append(mean(av))
            except:  # these are empty files
                pass
    print("В среднем слов в абзаце: {:.2f}\n".format(mean(total_avs)) +
          "Всего абзацев: {:,}\n".format(total_pars_count).replace(",", " ") +
          "В среднем абзацев в тексте: {:.2f}".format(mean(total_pars)))

def test_exctract_words(self):
    text = '''Это отразилось: на количественном,и на качествен_ном - росте карельско-финляндского сотрудничества - офигеть! кони+лошади=масло. -сказал кто-то --нет--'''
    assert simple_word_tokenize(text) == [
        'Это', 'отразилось', ':', 'на', 'количественном', ',', 'и', 'на',
        'качествен_ном', '-', 'росте', 'карельско-финляндского',
        'сотрудничества', '-', 'офигеть', '!', 'кони', '+', 'лошади', '=',
        'масло', '.', '-сказал', 'кто-то', '--нет--',
    ]

def lemmatize(s):
    l = [
        morph.parse(w)[0].normal_form
        for w in simple_word_tokenize(s)
        if not is_punctuation(w)
    ]
    l = [w for w in l if w not in stop_words]
    return " ".join(l)

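# A minimal sketch (assumed, not part of the original snippet) of the
# module-level names that lemmatize() above relies on: a shared MorphAnalyzer,
# a stop-word set, and a punctuation test. is_punctuation may well be
# pymorphy2.shapes.is_punctuation in the original source; a simple local
# fallback is shown here instead.
from pymorphy2 import MorphAnalyzer
from pymorphy2.tokenizers import simple_word_tokenize

morph = MorphAnalyzer()
stop_words = {'и', 'в', 'на', 'не'}  # illustrative subset only


def is_punctuation(token):
    # Treat a token as punctuation if it contains no letters or digits at all.
    return not any(ch.isalnum() for ch in token)

# Example call: lemmatize('Мама мыла раму')
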
def api_morphy(request):
    query = request.json_body
    if query["all"]:
        words = simple_word_tokenize(query["phrase"])
        new_phrase = []
        for word in words:
            new_phrase.append(lean(word, case=query["case"]))
        return {"phrase": " ".join(new_phrase)}

def pymorphy_simple(text):
    """
    Splits the text into words using the pymorphy2 tokenizer.

    :param text: input text
    :return: list of words
    """
    words = simple_word_tokenize(text)
    return words

def lemmatize_text(text):
    text = re.sub(r"\s+", " ", text)
    split_text = tokenizers.simple_word_tokenize(text)
    lemmatized_text = [lemmatize(t) for t in split_text]
    lemmas = [l[0] for l in lemmatized_text]
    t_tags = [l[1] for l in lemmatized_text]
    tags = [t_tags.count(gm) for gm in gram]
    words_known = sum([word_is_known(t) for t in split_text])
    return ' '.join(lemmas), words_known, tags

def tokenize(definition: str) -> List[str]:
    """
    Splits the string into tokens.

    :param definition: the definition string
    :return: tokens (without punctuation marks)
    """
    return [
        x for x in simple_word_tokenize(definition) if x not in punctuation
    ]

def combine_sent(sent, lemma_seq, pos_seq):
    tokens = simple_word_tokenize(sent)
    norm_tokens = [token for token in tokens if token[0].isalpha()]
    if not isinstance(lemma_seq, str):
        return []
    lemmas = lemma_seq.split()
    pos = pos_seq.split()
    combined_list = []
    for i in range(len(pos)):
        combined_list.append(' ' + norm_tokens[i] + '|' + lemmas[i] + '|' + pos[i])
    return combined_list

def str_handler(in_string):
    """String handler: removes extra characters, detransliterates, and
    normalizes words (reduces them to their base form).

    Keyword arguments:
    in_string -- the string to process
    """
    tokens = tokenizers.simple_word_tokenize(re.sub('[!?,.%]', '', in_string))
    new_string = ''
    for word_in in tokens:
        if re.search(latin_pattern, word_in):
            word_in = translit.detranslify(word_in)
        new_string += morph.parse(word_in)[0].normal_form + ' '
    return new_string

def pos(self, line):
    self.pos_dict = {}
    # parsing
    if self.language == "rus":
        play_pos = [
            self.analyzer.parse(token)[0].tag.POS
            for token in simple_word_tokenize(line)
        ]
    elif self.language in self.spacy_analyzers.keys():
        play_pos = [token.pos_ for token in self.analyzer(line)]
    elif self.language in self.cltk_analyzers.keys():
        print(self.language)
        play_pos = self.analyzer.analyze(text=line).pos
    return play_pos

def tokenize(text_data):
    text_data = str(text_data).lower()  # lowercase
    text_data = re.sub(
        r'\s+', ' ',
        text_data)  # normalize all line breaks and tabs to a single space
    text_data = re.sub(r'\[id\d*\|\w*\]', 'username',
                       text_data)  # replace all user mentions
    text_data = re.sub('"', '', text_data)  # remove double quotes
    text_data = re.sub("'", '', text_data)  # remove apostrophes
    text_data = re.sub('ё', 'е', text_data)  # replace ё with е
    text_data = re.sub(r'(?<=\w)\*', 'ё',
                       text_data)  # replace * with ё when preceded by a letter
    tokens = simple_word_tokenize(text_data)  # tokenization
    return ' '.join(tokens)

def tokenizeSingleText(text, configurations):
    minimal_words_in_sentence = 1
    if configurations is not None:
        minimal_words_in_sentence = configurations.get(
            "minimal_words_in_sentence", 1)

    remove_index_list = []
    for index, sentence in enumerate(text.original_sentences):
        if len(sentence) > 1:
            tokenized_sentence = tokenizers.simple_word_tokenize(sentence)
            updated_tokenized_sentence = []
            for word in tokenized_sentence:
                if word.isalpha() and len(word) > 1:
                    updated_tokenized_sentence.append(word)
            tokenized_sentence = updated_tokenized_sentence
            if len(tokenized_sentence) >= minimal_words_in_sentence:
                text.tokenized_sentences.append(tokenized_sentence)
            else:
                remove_index_list.append(index)
        else:
            remove_index_list.append(index)

    # Debug printout of sentences before the cut
    # string_for_print = ''
    # for index, sentence in enumerate(text.original_sentences):
    #     string_for_print = string_for_print + str(index) + ')' + sentence + '\n'
    # writeStringToFile(string_for_print, 'output_files/preProcessing_before_cut.txt')

    need_agresive_filtration = False
    if configurations is not None:
        need_agresive_filtration = configurations.get(
            "need_agresive_filtration", False)

    sorted_remove_index_list = sorted(remove_index_list,
                                      key=lambda x: x,
                                      reverse=True)
    if need_agresive_filtration:
        for index in sorted_remove_index_list:
            text.original_sentences.pop(index)

    # Debug printout of sentences after the cut
    # string_for_print = ''
    # for index, sentence in enumerate(text.original_sentences):
    #     string_for_print = string_for_print + str(index) + ')' + sentence + '\n'
    # writeStringToFile(string_for_print, 'output_files/preProcessing_after_cut.txt')

    return text.tokenized_sentences

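# A minimal sketch (assumed, not from the original source) of the inputs that
# tokenizeSingleText() above expects: a text object exposing original_sentences
# and tokenized_sentences, plus an optional configurations dict. TextStub is a
# hypothetical stand-in; the key spellings follow the original code.
from pymorphy2 import tokenizers


class TextStub:
    def __init__(self, sentences):
        self.original_sentences = list(sentences)
        self.tokenized_sentences = []


configurations = {
    "minimal_words_in_sentence": 2,
    "need_agresive_filtration": True,
}

# Example call:
# tokenizeSingleText(TextStub(['Мама мыла раму сегодня', 'Ок']), configurations)
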
def lemmatize(self, line):
    self.lemmas = []
    play_lemmas = []
    if self.language == "rus":
        play_lemmas = [
            self.analyzer.parse(token)[0].normal_form
            for token in simple_word_tokenize(line)
        ]
    elif self.language in self.spacy_analyzers.keys():
        play_lemmas = [token.lemma_ for token in self.analyzer(line)]
    elif self.language in self.cltk_analyzers.keys():
        print(self.language)
        play_lemmas = self.analyzer.analyze(text=line).lemmata
    self.lemmas += play_lemmas
    return " ".join(play_lemmas)

def main():
    morph = MorphAnalyzer()
    texts_path = ('..' + os.sep + '..' + os.sep + '..' + os.sep + '..' +
                  os.sep + 'RuCoref' + os.sep + 'rucoref_texts')
    for folder in os.listdir(texts_path):
        text_folder = texts_path + os.sep + folder
        for filename in os.listdir(text_folder):
            if filename.endswith('.txt'):
                with open(text_folder + os.sep + filename, 'r',
                          encoding='utf-8') as source_file:
                    source_text = source_file.read()
                # get tokens
                tokens = tokenizers.simple_word_tokenize(source_text)
                # parse tokens
                tokens_with_tags = morphology_features(tokens, morph)
                # write tokens to new file
                write_info(tokens_with_tags, text_folder, filename)

def tokenize_me(file_text):
    file_text = file_text.lower()
    tokens = tokenizers.simple_word_tokenize(file_text)
    tokens = [morph.parse(w)[0].normal_form for w in tokens]

    # deleting punctuation symbols
    tokens = [i for i in tokens if (i not in string.punctuation)]

    # deleting stop_words
    stop_words = stopwords.words('russian')
    stop_words.extend([
        'что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на', '...'
    ])
    tokens = [i for i in tokens if (i not in stop_words)]

    return ' '.join(tokens)

def _get_gender(self, profession: str) -> GENDER:
    """
    Predict gender, without using cache
    """
    if not profession.strip():
        # Empty string
        return GENDER.unknown

    toks = simple_word_tokenize(profession)
    observed_genders = [self.get_word_gender(tok) for tok in toks]

    if not observed_genders:
        # No observed gendered words - return neutral
        return GENDER.neutral

    # Return the most commonly observed gender
    return Counter(observed_genders).most_common()[0][0]

def normalized(twit):
    twit = re.sub(r'#\S+', '',              # strip hashtags
           re.sub(r'@\S+', '',              # strip @name mentions
           re.sub(r'http\S+', '',           # strip links
           re.sub(r'RT ', '',               # drop the RT marker
           re.sub(r"\d+", "", twit))))      # drop all digits
           ).strip()
    tokens = simple_word_tokenize(twit)
    parses = [
        morph.parse(w)[0] for w in tokens if w.lower() not in stopwords
    ]
    parses = [
        p for p in parses
        if p.tag.POS not in {'PNCT', 'UNKN', 'NUMB', 'CONJ', 'LATN'}
    ]
    return [p.normal_form.lower() for p in parses]

def api_morphy(request):
    query = request.json_body
    command = query["command"]
    phrase = query["phrase"]
    if command == "all":
        words = simple_word_tokenize(phrase)
        gender = russian.gender(words, "M", "F", "_")
        if gender == "_":
            new_phrase = []
            for word in words:
                new_phrase.append(lean(word, case=query["case"]))
        else:
            case_ = query["case"]
            c = TR.get(case_, case_)
            new_phrase = russian.make_human_case(words, c)
        return {"phrase": " ".join(new_phrase)}
    elif command == "gender":
        p = russian.gender(phrase, query["M"], query["F"], ";-)")
        return {"phrase": p}

def preprocessing(sentence, clist):
    s = re.sub('[^а-яА-Яa-zA-Z]+', ' ', sentence).strip().lower()
    s = re.sub('ё', 'е', s)
    result = []
    for word in tokenizers.simple_word_tokenize(s):
        if word not in clist:
            if not morph.word_is_known(word):
                new_words = split_word(word)
            else:
                new_words = word,
            for new_word in new_words:
                parse = morph.parse(new_word)[0]
                pos = parse.tag.POS
                if pos is not None and pos not in [
                        'NPRO', 'PREP', 'CONJ', 'PRCL', 'INTJ'
                ]:
                    result.append(parse.normal_form)
        else:
            result.append(word)
    return ' '.join(result)

def normalize_text(text):
    lemmas = []
    for t in simple_word_tokenize(text):
        if t not in stops:
            lemmas.append(m.parse(t)[0].normal_form)
    return ' '.join(lemmas)

def tokenize_if_needed(tokens):
    if not isinstance(tokens, (list, tuple)):
        return simple_word_tokenize(tokens)
    return tokens

def predict(self, tokens):
    if not isinstance(tokens, (list, tuple)):
        tokens = simple_word_tokenize(tokens)
    return [self.morph.parse(token)[0] for token in tokens]

def tokenize(self, text):
    return simple_word_tokenize(text)

def test_split_signs(self):
    assert simple_word_tokenize('a+b=c_1') == ['a', '+', 'b', '=', 'c_1']

def read_text_lemmas(fileobj):
    m = MorphAnalyzer()
    for line in fileobj:
        yield ' '.join(m.parse(t)[0].normal_form
                       for t in simple_word_tokenize(line.decode('utf-8')))

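# Usage sketch (assumed, not from the original source): read_text_lemmas()
# decodes each line as UTF-8, so it expects a binary file object; io.BytesIO
# stands in here for a file opened with mode 'rb'.
import io

sample = io.BytesIO('Мама мыла раму\nПостой, паровоз!\n'.encode('utf-8'))
for lemmatized_line in read_text_lemmas(sample):
    print(lemmatized_line)
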
def tokenize(text):
    return tokenizers.simple_word_tokenize(text)

def pos_tag(text):
    return [
        f"{m.parse(word)[0].normal_form}_{m.parse(word)[0].tag.POS}"
        for word in simple_word_tokenize(text)
    ]

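# Assumed module-level context for pos_tag() above (a sketch, not shown in the
# original snippet): m is a shared pymorphy2 analyzer. Output items have the
# form '<normal_form>_<POS>', e.g. 'рама_NOUN' for the token 'раму'.
from pymorphy2 import MorphAnalyzer
from pymorphy2.tokenizers import simple_word_tokenize

m = MorphAnalyzer()

# Example call: pos_tag('Мама мыла раму')
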
import os
import pickle

from pymorphy2 import MorphAnalyzer, tokenizers

from constants import PAGES_PATH

morph = MorphAnalyzer()
inverted_index = {}
page_occurrences = {}

pages = os.listdir(path=PAGES_PATH)
for index, page in enumerate(pages):
    with open(PAGES_PATH + page, 'r', encoding="utf-8") as file:
        text = file.read()
    tokens = tokenizers.simple_word_tokenize(text)
    page_occurrences[index] = len(tokens)
    for token in tokens:
        lemma = morph.parse(token)[0].normal_form.lower()
        value = inverted_index.get(lemma)
        if value is None:
            inverted_index[lemma] = {index: 1}
        elif inverted_index[lemma].get(index) is None:
            inverted_index[lemma][index] = 1
        else:
            inverted_index[lemma][index] += 1

inverted_index_file = open("inverted_index.pkl", "wb")
# The snippet ends here after opening the output file; presumably the index is
# then serialized, e.g.:
# pickle.dump(inverted_index, inverted_index_file)
# inverted_index_file.close()

def test_split_hyphen(self):
    assert simple_word_tokenize('Ростов-на-Дону') == ['Ростов-на-Дону']
    assert simple_word_tokenize('Ура - победа') == ['Ура', '-', 'победа']

def get_sents(input_data):
    return [normalized(simple_word_tokenize(sent)) for sent in input_data]

def _iter_tokens_tokenize(fp):
    """ Return an iterator of input tokens; each line is tokenized """
    return (token for line in fp for token in simple_word_tokenize(line))

def lemmatize(text):
    return [
        m.parse(word)[0].normal_form for word in simple_word_tokenize(text)
    ]

def tokenize(line):
    # Note: `token not in string.punctuation` is a substring check, so only
    # single-character marks are filtered; multi-character tokens such as
    # '...' or '--' pass through.
    return [
        token for token in simple_word_tokenize(line)
        if token not in string.punctuation
    ]