def get_entity_pymorphy(self, q_text):
    """
    Look for (capitalized) entities in q_text.
    For this specific application pymorphy2 tagging is enough.
    """
    forbidden = ['ВВП', 'HDI', 'ISO', 'ООН', 'UN', 'UTC', 'Utc-Поправка']
    words = fix_hyphens(tokenize_words(q_text))
    phrase = []
    # Skip the first word: sentence-initial capitalization is not informative.
    for i, w in enumerate(words[1:]):
        if w in forbidden:
            continue
        if w[0] == w[0].upper():
            w_parsed = morph.parse(w.strip(' ?'))[0]
            w_lemma = w_parsed.normal_form
            if w_lemma in self.lem_dict:
                if 'ADJF' in w_parsed.tag:
                    # Adjective: agree it in gender and attach the following word's lemma.
                    phrase.append(gender_agree(w_parsed).title())
                    phrase.append(
                        morph.parse(words[i + 2].strip(' ?'))[0].normal_form)
                    return ' '.join(phrase).title()
                elif 'NOUN' in w_parsed.tag:
                    return w_lemma.title()
                elif 'UNKN' in w_parsed.tag:
                    return w_lemma.title()
            # Fall back to fuzzy matching against the disambiguation dictionary.
            matches = get_close_matches(w_lemma.title(),
                                        list(self.disamb_dict.keys()))
            if matches:
                return matches[0]
            else:
                continue
    return None
def org_form(self):
    if self.short_name.strip():
        tokenized = tokenize_words(self.short_name)
        if tokenized:
            return tokenized[0].upper()
    return "NONE"
def get_unique_delimiters(texts):
    delimiters = set()
    for text in texts:
        for word in tokenize_uk.tokenize_words(text):
            if (len(word) == 1 and word not in delimiters
                    and not word.isalpha() and not word.isdigit()):
                delimiters.add(word)
    return delimiters
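# A minimal usage sketch for get_unique_delimiters (not part of the original
# source): with these sample sentences the function should collect the
# single-character punctuation tokens produced by tokenize_uk.
sample_texts = ["Привіт, світе!", "Як справи?"]
print(get_unique_delimiters(sample_texts))  # expected: {',', '!', '?'}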
def tokenize_words(raw, is_tokenize_uk=False):
    """
    Uses nltk by default.
    If 'is_tokenize_uk' is True, uses tokenize_uk instead.
    """
    if is_tokenize_uk:
        return tokenize_uk.tokenize_words(raw)
    else:
        return nltk.word_tokenize(raw)
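# A hedged usage sketch for the tokenize_words wrapper above (the sample
# sentence is illustrative only): it shows switching between the default nltk
# backend and tokenize_uk. The nltk path assumes the 'punkt' tokenizer data
# has been downloaded.
sample = "Комп'ютер працює добре."
print(tokenize_words(sample))                       # nltk backend
print(tokenize_words(sample, is_tokenize_uk=True))  # tokenize_uk backend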
def clean_texts(texts):
    dashes = {'–', '—', '―', '~'}  # replace with '-'
    special_symbols = {'№', '_', '<', '>', '|', ']', '*', '[', '^', '&'}  # replace with ''
    apostrophes = {'’', '‘'}  # replace with "'"
    direct_speech = {'“', '»', '«'}  # replace with '"'
    three_dots = {'…'}  # replace with '.'
    counter = 0
    for i in range(len(texts)):
        print("Processing text: ", i)
        text = texts[i]
        words = []
        tokenized_words = tokenize_uk.tokenize_words(text)
        for word in tokenized_words:
            # Apply every replacement to the same word and append it exactly once.
            new_word = word
            for dash in dashes:
                new_word = new_word.replace(dash, "-")
            for special_symbol in special_symbols:
                new_word = new_word.replace(special_symbol, "")
            for apostrophe in apostrophes:
                new_word = new_word.replace(apostrophe, "'")
            for direct in direct_speech:
                new_word = new_word.replace(direct, '"')
            for dots in three_dots:
                if dots in new_word:
                    counter += 1
                new_word = new_word.replace(dots, '.')
            words.append(new_word)
        reconstructed_text = " ".join(words)
        texts[i] = reconstructed_text
    return texts
def transform_sentences(self, sentences):
    result = []
    for sentence in sentences:
        words = tk.tokenize_words(sentence)
        words = [re.sub(self.reg, '', x) for x in words]
        words = [x for x in words if x.strip()]
        words = [x.lower() for x in words]
        words = lemmatize(words, morph=self.morph)
        words = delete_stop_words(words, stop_words=self.stop_words)
        result.append(words)
    return result
def sentiment_features(comment, stars, check_stars=True, lemmatization=True):
    features = {}
    for word in tokenize_words(comment):
        if lemmatization:
            word = normalize_word(word)
        if word.lower() in uk_sentiment_dict:
            features['sentiment'] = word.lower()
    if 'sentiment' not in features:
        features['sentiment'] = None
    if check_stars:
        if stars == '5':
            features['stars'] = '5'
    return features
def preprocess_sent(self, s):
    s = str(s).lower()
    words = tokenize_uk.tokenize_words(s)
    words = [word for word in words if word not in self.stop_words]
    words = [self.ld[word] if word in self.ld else word for word in words]
    words = [
        self.emb[self.word2id[word]] for word in words if word in self.word2id
    ]
    words = np.array(words)
    if words.shape[0] > self.max_words:
        words = np.array([])
    return words
def get_features_sklearn(self, ent, sent):
    features = dict()
    words = fix_hyphens(tokenize_words(sent))
    bigrams = ['_'.join(b) for b in nltk.bigrams(words)]
    n = 3
    char_trigrams = [sent[i:i + n] for i in range(len(sent) - n + 1)]
    for w in words:
        features[w] = 1
    for b in bigrams:
        features[b] = 1
    for c in char_trigrams:
        features[c] = 1
    return ent, features
def process_sentence(s):
    tok = tokenize_uk.tokenize_words(s)
    if len(tok) <= 1:
        return
    for (i, token) in enumerate(tok):
        m = morph.parse(token)
        p = m[0]
        if 'NOUN' in p.tag and 'anim' in p.tag:
            left = i - 1
            # right = i + 1
            if left >= 0:
                p_left = find_adjective(morph.parse(tok[left]))
                if p_left:
                    add_to_result(p_left, p)
def lemmatize_phrase(phrase):
    """
    Also we can stem instead of lemmatizing...
    """
    words = fix_hyphens(tokenize_words(phrase))
    if len(words) == 1:
        wparsed = morph.parse(phrase)[0]
        if not wparsed:
            return phrase
        return wparsed.normal_form
    else:
        new_phrase = ''
        for w in words:
            new_phrase += morph.parse(w)[0].normal_form + ' '
        return new_phrase.strip()
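# A minimal usage sketch for lemmatize_phrase (an illustration, not part of
# the original file). It assumes the module-level pymorphy2 analyzer `morph`
# and the helpers fix_hyphens/tokenize_words used above are available.
print(lemmatize_phrase("зеленого дерева"))  # likely "зелений дерево"
print(lemmatize_phrase("книжки"))           # likely "книжка"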
def transform_lines_into_rythm(lines, accent_vocab):
    rythm = ''
    for line in lines.split('\n'):
        sent_tokens = tokenize_sents(line)
        for sent in sent_tokens:
            for word in filter(lambda w: w in accent_vocab,
                               tokenize_words(sent)):
                accent_options = accent_vocab[word]
                # Use only the first accent option, ignore the others.
                word, index = accent_options[0]
                rythm_map = transform_into_rythm_map(word, index)
                rythm += rythm_map
        rythm += '\n'
    return rythm
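# Sketch of the accent_vocab shape that transform_lines_into_rythm appears to
# expect, inferred from how it is indexed (the example values are hypothetical):
# each word maps to a list of (word, accent_index) options, and only the first
# option is used.
accent_vocab_example = {
    "калина": [("калина", 3)],  # accent index semantics depend on transform_into_rythm_map
}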
def mask_token_in_sentence(sentence):
    tokenized_sentence = tokenize_uk.tokenize_words(sentence)
    number_of_tokens = len(tokenized_sentence)
    masked_token = False
    while not masked_token:
        # print(number_of_tokens)
        index = randrange(number_of_tokens)
        # Do not mask punctuation tokens like . , ; :
        delims = {",", ".", "!", ":", "?", "'", ";"}
        if tokenized_sentence[index] not in delims:
            tokenized_sentence[index] = "[MASK]"
            masked_token = True
    # print(tokenized_sentence)
    reconstructed_sentence = " ".join(tokenized_sentence)
    return reconstructed_sentence
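# A hedged usage sketch for mask_token_in_sentence (illustrative only): one
# randomly chosen non-punctuation token is replaced with "[MASK]", so the
# output differs between runs. Assumes `from random import randrange`.
print(mask_token_in_sentence("Я люблю читати книжки"))
# e.g. "Я люблю [MASK] книжки"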
def get_features_perc(self, ent, sent):
    """
    Given a question, extract features from it.
    """
    features = {}
    words = fix_hyphens(tokenize_words(sent))
    for i, w in enumerate(words):
        features['word_{i}={w}'.format(i=i, w=w)] = 1
    features['words'] = [('w={w}'.format(w=w), 1) for w in words]
    bigrams = ['_'.join(b) for b in nltk.bigrams(words)]
    features['bigrams'] = [('bg={bg}'.format(bg=bg), 1) for bg in bigrams]
    n = 3
    char_trigrams = [sent[i:i + n] for i in range(len(sent) - n + 1)]
    features['trigrams'] = [('t={t}'.format(t=t), 1) for t in char_trigrams]
    return ent, features
def label_data(data, lemmatization=True, check_stars=True):
    for comment in data['results']:
        comment['sentiment'] = 0
        for word in tokenize_words(comment['comment']):
            if lemmatization:
                word = normalize_word(word)
            if word.lower() in uk_sentiment_dict:
                comment['sentiment'] += int(uk_sentiment_dict[word.lower()])
        if comment['sentiment'] > 0:
            comment['sentiment'] = 'positive'
        elif comment['sentiment'] < 0:
            comment['sentiment'] = 'negative'
        else:
            comment['sentiment'] = 'neutral'
        if check_stars:
            if comment['stars'] == '5':
                comment['sentiment'] = 'positive'
def main(args):
    lm_files = get_wiki_files(args.wiki_files)
    df = load_files_to_dataframe(lm_files)
    print(df.head().to_string())
    print(df.shape)
    df['text'] = df['text'].apply(lambda x: split_title_from_text(x))
    df['len'] = df['text'].apply(lambda x: len(tokenize_uk.tokenize_words(x)))
    print('Overall number of tokens', df['len'].sum())
    print('Decreasing to ~100 million tokens')
    df = df[df['len'] > 600]
    print('New number of tokens', df['len'].sum())
    df['labels'] = 0
    df = df[['labels', 'text']]
    tokens = UKTokenizer().proc_all_mp(partition_by_cores(df['text'].values))
    labels = list(df['labels'].values.astype(np.int64))
    tokens_trn, tokens_val, labels_trn, labels_val = train_test_split(
        tokens, labels, test_size=0.1, random_state=1234, shuffle=True)
    # Limit the vocabulary to ignore rare words.
    freq = Counter(p for o in tokens_trn for p in o)
    itos = [o for o, c in freq.most_common(args.max_vocab) if c > args.min_freq]
    itos.insert(0, '_pad_')
    itos.insert(0, '_unk_')
    stoi = collections.defaultdict(lambda: 0, {v: k for k, v in enumerate(itos)})
    trn_lm = np.array([[stoi[o] for o in p] for p in tokens_trn])
    val_lm = np.array([[stoi[o] for o in p] for p in tokens_val])
    np.save(os.path.join(args.output_dir, 'trn_ids.npy'), trn_lm)
    np.save(os.path.join(args.output_dir, 'val_ids.npy'), val_lm)
    with open(os.path.join(args.output_dir, 'itos.pkl'), 'wb') as f:
        pickle.dump(itos, f)
def ner_recognize(self, sent):
    sent = sent.strip(string.punctuation)
    tokens = fix_hyphens(tokenize_words(sent))
    feats = []
    for (i, t) in enumerate(tokens):
        if i == 0:
            prev_word = '.'
        else:
            prev_word = tokens[i - 1]
        if i == len(tokens) - 1:
            next_word = '.'
        else:
            next_word = tokens[i + 1]
        feats.append(self._get_ner_features(t, prev_word, next_word))
    labels = self.ner_model.predict(feats)
    first_res = list(zip(tokens, labels))
    res = []
    for token, label in first_res:
        # Hard-code LOC for a few water-body words regardless of the model output.
        if token in ['море', "моря", "озеро", "озера", "океан", "океану"]:
            res.append((token, 'LOC'))
        else:
            res.append((token, label))
    return res
import pymorphy2
import tokenize_uk

morph_analyzer = pymorphy2.MorphAnalyzer(lang='uk')

for i in range(20):
    with open("sampling" + str(i) + ".txt", "r", encoding="utf-8") as file:
        sents = file.readlines()
    pos = {}
    for sentence in sents:
        tokens = tokenize_uk.tokenize_words(sentence)
        for token in tokens:
            if token.isalnum():
                token_pos = morph_analyzer.parse(token)[0].tag.POS
                if token_pos not in pos:
                    pos[token_pos] = 1
                else:
                    pos[token_pos] += 1
    print("Sampling ", i + 1)
    for k, v in pos.items():
        print(k, "-", v)
def test_word_tokenization(self):
    assert tokenize_words("Геогра́фія або земле́пис") == [
        "Геогра́фія", "або", "земле́пис"
    ]
    assert tokenize_words("Комп'ютер") == ["Комп'ютер"]
data = f.read()
log('processing file ' + src_file)
text = data.decode('utf-8')
tokens_text = tokenize_uk.tokenize_sents(text)
log('tokenization finished')
sents_number = int(math.ceil(len(tokens_text) / float(sents_per_chunk)))
for i in range(0, sents_number):
    sentences = []
    chunk = tokens_text[i * sents_per_chunk:(i + 1) * sents_per_chunk]
    for sentence in chunk:
        sentences.append(tokenize_uk.tokenize_words(sentence))
    if items_processed % log_interval == 0:
        log('items processed {}'.format(items_processed))
    items_processed += 1
    result_file = os.path.basename(src_file) + str(i) + '.msg'
    with open(sents_folder + result_file, 'wb') as f:
        msgpack.pack(sentences, f)
    log('file {} saved'.format(result_file))
log('done', Fore.GREEN)
def lemmatize(text):
    tokens = tokenize_uk.tokenize_words(text)
    lemmas = [morph.parse(t)[0].normal_form for t in tokens]
    return ' '.join(lemmas)
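# A minimal usage sketch for lemmatize above (not from the original file);
# it assumes the module-level pymorphy2 MorphAnalyzer(lang='uk') named `morph`
# used elsewhere in these examples.
print(lemmatize("Кішки сплять на дивані"))  # space-joined lemmas of the tokens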
def tokenize(self, x):
    return tokenize_uk.tokenize_words(self.sub_br(x))
def check_spelling(sentence):
    tokens = tokenize_uk.tokenize_words(sentence)
    candidates = generate_candidates(tokens)
    return tokens, candidates
def parse_sentence(self, sentence):
    '''
    Tokenizes the sentence into words using tokenize_uk, converts to CoNLL-U
    and adds pymorphy2 info; see the parse_tree method.
    '''
    tokens = tokenize_uk.tokenize_words(sentence)
    return self.parse_tree(tokens)
from collections import Counter

import pymorphy2
import tokenize_uk

morph = pymorphy2.MorphAnalyzer(lang='uk')


def get_collocations(parsed_words):
    """Count adjective + animate-noun collocations in a list of pymorphy2 parses."""
    collocations = Counter()
    for i in range(1, len(parsed_words)):
        prev_word = parsed_words[i - 1]
        word = parsed_words[i]
        if (word.tag.POS == "NOUN" and word.tag.animacy == "anim"
                and prev_word.tag.POS == "ADJF"):
            collocations[(prev_word.normal_form, word.normal_form)] += 1
    return collocations


with open(
        "/home/dasha/Документы/курс/prj-nlp-2020/tasks/02-structural-linguistics/data/tyhrolovy.txt",
        "r") as f:
    text = f.read()

words = tokenize_uk.tokenize_words(text)
parsed = [morph.parse(word)[0] for word in words]
collocations = get_collocations(parsed)
for c, freq in sorted(collocations.items(), key=lambda x: x[1], reverse=True):
    print('{0}: {1}'.format(freq, ' '.join(c)))
    for t in tokens:
        tag = morph.parse(t)[0].tag.POS
        if tag_mapping.get(tag, str(tag)) in useful_tags:
            filtered_words.append(t)
    return filtered_words


### Preprocessing
tokenized_data = []
for item in dataset:
    sents_list = []
    sents = tokenize_uk.tokenize_sents(item[1])
    for s in sents:
        sents_list.append(tokenize_uk.tokenize_words(s))
    tokenized_data.append(sents_list)
tokenized_lengths = [len(t) for t in tokenized_data]
print("tokenized")

lemmatized_data = [[lemmatize_tokens(i) for i in t] for t in tokenized_data]
print("lemmatized")

digits_cleared_data = [[[item for item in i if not item.isdigit()] for i in l]
                       for l in lemmatized_data]
print("digits")

punct_cleared_data = [[[
    item for item in i if item not in f'{string.punctuation}”№«»'
] for i in d] for d in digits_cleared_data]
print("punct")
print(punct_cleared_data[0])

pos_filtered_data = [[
def ner_nlp_extracting(text, model, vesum, word2indx, tag2indx, sess, graph):
    # Split the text into sentences and tokenize each sentence into words.
    X = list(
        map(
            lambda sentence: tokenize_uk.tokenize_words(sentence),
            tokenize_uk.tokenize_sents(' '.join(
                tokenize_uk.tokenize_words(text)))))
    X_tokenized = np.array([[word for word in sentence] for sentence in X])
    # Map each word (via its VESUM main form) to a vocabulary index.
    X = [[
        word2indx.get(vesum.get_main_form_from_vesum(word),
                      word2indx['UNKNOWN']) for word in sentence
    ] for sentence in X]
    X = pad_sequences(X, maxlen=70, padding='post', truncating='post',
                      value=word2indx['ENDPAD'])
    with graph.as_default():
        set_session(sess)
        pred = np.argmax(model.predict(X), axis=-1)
    # Convert predicted tag indices back to tag names.
    res = [(sent,
            list(
                map(
                    lambda tag: list(
                        filter(lambda key: tag2indx[key] == tag, tag2indx))[0],
                    tags[:len(sent)]))) for sent, tags in zip(X_tokenized, pred)]
    tokens = list()
    tags = list()
    for tokens_tmp, tags_tmp in res:
        tokens.extend(tokens_tmp)
        tags.extend(tags_tmp)
    # Group consecutive B-/I- tags into entity spans with character offsets.
    find_tags = list()
    start_index = 0
    finish_index = 0
    for ind, tag in enumerate(tags):
        if (ind == 0 or (ind > 0 and tags[ind - 1] == 'O')) and tag != 'O':
            token = tokens[ind]
            start_index = text.index(token, finish_index)
            finish_index = text.index(token, finish_index) + len(token)
        elif tag != 'O':
            token = tokens[ind]
            finish_index = text.index(token, finish_index) + len(token)
        elif ind > 0 and (tags[ind - 1][0] == 'B'
                          or tags[ind - 1][0] == 'I') and tag == 'O':
            ner = tags[ind - 1][2:]
            ner_dict = dict()
            ner_dict['entity_type'] = ner
            ner_dict['start_index'] = start_index
            ner_dict['finish_index'] = finish_index
            ner_dict['text_entity'] = text[start_index:finish_index]
            find_tags.append(ner_dict)
    return find_tags