class Tokenizer(object):
    def __init__(self):
        self.segmenter = Segmenter()
        self.morph_vocab = MorphVocab()
        self.emb = NewsEmbedding()
        self.morph_tagger = NewsMorphTagger(self.emb)
        self.syntax_parser = NewsSyntaxParser(self.emb)
        self.ner_tagger = NewsNERTagger(self.emb)
        self.names_extractor = NamesExtractor(self.morph_vocab)
        self.doc = []
        self.term_extractor = TermExtractor()

    def init_doc(self, text):
        self.doc = Doc(text)
        self.doc.segment(self.segmenter)
        self.doc.tag_ner(self.ner_tagger)

    def get_sentance(self, text):
        self.init_doc(text)
        sentences = []
        for sentence in self.doc.sents:
            sentences.append(sentence.text)
        return sentences

    def get_tokens(self, sentence):
        tokens = []
        for term in self.term_extractor(sentence):
            tokens.append(term.normalized)
        return tokens
def __init__(self, text: str):
    self.doc = Doc(text)
    self.doc.segment(segmenter)
    self.doc.tag_morph(morph_tagger)
    self.doc.parse_syntax(syntax_parser)
    self.doc.tag_ner(ner_tagger)
    for span in self.doc.spans:
        span.normalize(morph_vocab)
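# Shared setup (a minimal sketch, not taken from the original snippets): the
# constructor above and most of the functions below reference module-level
# Natasha components without defining them. Assuming the standard natasha API,
# the globals they rely on would look roughly like this; the names simply
# mirror the identifiers used throughout the snippets.
from natasha import (
    Doc,
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,
    NamesExtractor,
    MoneyExtractor,
    PER,
)

segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)
names_extractor = NamesExtractor(morph_vocab)
money_extractor = MoneyExtractor(morph_vocab)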
def get_doc(self, text: str) -> Doc:
    doc = Doc(text)
    doc.segment(self.segmenter)
    doc.tag_morph(self.morph_tagger)
    doc.parse_syntax(self.syntax_parser)
    doc.tag_ner(self.ner_tagger)
    return doc
def segmentate(text: str, date: typing.Optional[datetime.datetime] = None):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)
    for span in doc.spans:
        span.normalize(morph_vocab)
    return {_.type: _.normal for _ in doc.spans}
def __call__(self, text):
    doc = Doc(text)
    doc.segment(self.segmenter)
    doc.tag_morph(self.morph_tagger)
    for token in doc.tokens:
        token.lemmatize(self.morph_vocab)
    doc.parse_syntax(self.syntax_parser)
    doc.tag_ner(self.ner_tagger)
    for span in doc.spans:
        span.normalize(self.morph_vocab)
    return doc
def tag_text(text):
    if text not in tag_text_cache:
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)
        doc.tag_ner(ner_tagger)
        doc.parse_syntax(syntax_parser)
        for span in doc.spans:
            span.normalize(morph_vocab)
        tag_text_cache[text] = doc
    return tag_text_cache[text]
def __init__(self, text):
    # Build the embedding and morph vocabulary once and reuse them for all taggers.
    emb = NewsEmbedding()
    morph_vocab = MorphVocab()
    self.doc = Doc(text)
    self.doc.segment(Segmenter())
    self.doc.tag_morph(NewsMorphTagger(emb))
    for token in self.doc.tokens:
        token.lemmatize(morph_vocab)
    self.doc.parse_syntax(NewsSyntaxParser(emb))
    self.doc.tag_ner(NewsNERTagger(emb))
    for span in self.doc.spans:
        span.normalize(morph_vocab)
    self.words = tuple(filter(lambda x: x.pos not in ('X', 'PUNCT'), self.doc.tokens))
    self.tokens_nouns = tuple(filter(lambda t: t.pos in ['NOUN', 'PROPN'], self.doc.tokens))
    self.tokens_adjs = tuple(filter(lambda t: t.pos == 'ADJ', self.doc.tokens))
    self.tokens_verbs = tuple(filter(lambda t: t.pos == 'VERB', self.doc.tokens))
def process_text_file(text_file, mongo=None):
    # nlp = spacy.load('ru_core_news_sm')
    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    with open(text_file, 'r', encoding='utf-8') as file:
        file_name = file.name[2:]
        line_number = 0
        for line in file:
            line_number += 1
            if line_number % 100 == 0:
                logging.info(f'Processed line {line_number}')
            if line_number >= 100000:
                return
            sents = [sent.text for sent in sentenize(line)]
            sentence_number = 0
            for sentence in sents:
                doc = Doc(sentence)
                doc.segment(segmenter)
                doc.tag_morph(morph_tagger)
                doc.parse_syntax(syntax_parser)
                sentence_number += 1
                sentence_tokens = doc.tokens
                # sentence_tokens = [
                #     {
                #         'text': token.text,
                #         'lemma': token.lemma_,
                #         'pos': token.pos_,
                #         'tag': token.tag_,
                #         'dep': token.dep_,
                #         'shape': token.shape_,
                #         'is_alpha': token.is_alpha,
                #         'is_stop': token.is_stop
                #     } for token in sentence]
                words = markup_words(doc.syntax)
                deps = token_deps(doc.syntax.tokens)
                html = show_dep_markup(words, deps)
                save_html(
                    html,
                    f'./htmls/dependency_plot_{file_name}_{line_number}_{sentence_number}.html'
                )
                # svg = displacy.render(sentence, style='dep',
                #                       options={'compact': False, 'bg': '#09a3d5',
                #                                'color': 'white', 'font': 'Source Sans Pro'})
                # output_path = Path(f'./images/dependency_plot_{file_name}_{line_number}_{sentence_number}.svg')
                # output_path.open('w', encoding='utf-8').write(svg)
                PatternExtractor.extract_relations(
                    file_name,
                    line_number,
                    sentence_number,
                    sentence,
                    sentence_tokens,
                    # noun_phrases,
                    # mongo=mongo
                )
def __call__(self, text):
    doc = Doc(text)
    doc.segment(self.segmenter)
    doc.tag_morph(self.morph_tagger)
    doc.parse_syntax(self.syntax_parser)
    doc.tag_ner(self.ner_tagger)
    for token in doc.tokens:
        token.lemmatize(self.morph_vocab)
    for span in doc.spans:
        span.normalize(self.morph_vocab)
        if span.type == PER:
            span.extract_fact(self.names_extractor)
    return doc
def test_doc(segmenter, morph_vocab, morph_tagger, syntax_parser,
             ner_tagger, names_extractor, capsys):
    doc = Doc(TEXT)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)

    for span in doc.spans:
        span.normalize(morph_vocab)
        if span.type == PER:
            span.extract_fact(names_extractor)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)

    doc.ner.print()
    assert strip(capsys.readouterr().out) == NER

    sent = doc.sents[0]
    sent.morph.print()
    assert strip(capsys.readouterr().out) == MORPH

    sent.syntax.print()
    assert strip(capsys.readouterr().out) == SYNTAX

    lemmas = {
        _.text: _.lemma
        for _ in doc.tokens
        if _.text.lower() != _.lemma
    }
    assert lemmas == LEMMAS

    normals = {
        _.text: _.normal
        for _ in doc.spans
    }
    assert normals == NORMALS

    facts = {
        _.normal: _.fact.as_dict
        for _ in doc.spans
        if _.fact
    }
    assert facts == FACTS
def calculate_skills_assessment(text, ca):
    vacancy_key_skills = [
        title.lower()
        for title in ca.core_vacancy.key_skills.all().values_list('title', flat=True)
    ]
    vacancy_additional_skills = [
        title.lower()
        for title in ca.core_vacancy.additional_skills.all().values_list('title', flat=True)
    ]

    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    morph_vocab = MorphVocab()

    # Note: the `text` argument is overwritten; the CV text is re-extracted from the attached file.
    text = extract_text(ca.cv_file.path)
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    cv_key_skills = []
    cv_additional_skills = []
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        print(token)
        if token.lemma in vacancy_key_skills and token.lemma not in cv_key_skills:
            cv_key_skills.append(token.lemma)
            print(token.lemma)
        if token.lemma in vacancy_additional_skills and token.lemma not in cv_additional_skills:
            cv_additional_skills.append(token.lemma)
            print(token.lemma)

    candidate_conformity = {
        "key_skills": {
            "vacancy_key_skills": vacancy_key_skills,
            "cv_key_skills": cv_key_skills,
            "conformity_percent": len(cv_key_skills) / len(vacancy_key_skills)
        },
        "additional_skills": {
            "vacancy_additional_skills": vacancy_additional_skills,
            "cv_additional_skills": cv_additional_skills,
            "conformity_percent": len(cv_additional_skills) / len(vacancy_additional_skills)
        }
    }
    return candidate_conformity
def nat_parse(textDf, textCol='text', columns=tokenCols):
    t0 = time.time()
    # initialize collective token dataframe
    tokenDf = pd.DataFrame(columns=columns)
    # gather row list
    for an_id in tqdm(textDf.index.to_list(), desc="Text DF Index id"):
        # initialize list of token data dicts
        pDict = []
        # create Natasha Doc object with text
        doc = Doc(textDf.loc[an_id][textCol])
        # apply segmenter (sentenizer + tokenizer)
        doc.segment(segmenter)
        # apply morphology tagger
        doc.tag_morph(morph_tagger)
        # apply lemmatizer
        for token in doc.tokens:
            token.lemmatize(morph_vocab)
        # apply syntax parser
        doc.parse_syntax(syntax_parser)
        # apply NER tagger
        doc.tag_ner(ner_tagger)
        # gather all tokens' data (excluding punctuation, which Natasha treats as tokens)
        for token in tqdm([x for x in doc.tokens if x.pos != 'PUNCT'],
                          desc="Token id", leave=False):
            start = token.start
            stop = token.stop
            text = token.text
            token_id = token.id
            head_id = token.head_id
            rel = token.rel
            pos = token.pos
            lemma = token.lemma
            # Animacy, Aspect, Case, Degree, Gender, Mood, Number, Person, Tense, VerbForm, Voice:
            # several to many of these will be missing for each token and raise an error
            try:
                anim = token.feats['Animacy']
            except:
                anim = None
            try:
                aspect = token.feats['Aspect']
            except:
                aspect = None
            try:
                case = token.feats['Case']
            except:
                case = None
def process(self, text: str) -> Doc:
    doc = Doc(text)
    doc.segment(self.segmenter)
    doc.tag_morph(self.morph_tagger)
    for token in doc.tokens:
        token.lemmatize(self.morph_vocab)
    doc.parse_syntax(self.syntax_parser)
    return doc
def select_corefs(self, text: str) -> Tuple[List, List]:
    '''Extracts NER-based coreferences from the text.'''
    doc = Doc(text)
    doc.segment(self.segmenter)
    doc.tag_morph(self.morph_tagger)
    for token in doc.tokens:
        token.lemmatize(self.morph_vocab)
    doc.tag_ner(self.ner_tagger)

    # Collect lemmas and find NER entities that occur more than once
    extracted_lemmas = {}
    for span in doc.spans:
        for token in span.tokens:
            if token.lemma in extracted_lemmas:
                extracted_lemmas[token.lemma] += 1
            else:
                extracted_lemmas[token.lemma] = 1
    selected_items = [
        item for item in extracted_lemmas if extracted_lemmas[item] > 1
    ]

    # Pick antecedents and mentions
    coref_sequence = []
    for item in selected_items:
        antecedent_found = -100
        for span in doc.spans:
            for token in span.tokens:
                if token.lemma == item:
                    if antecedent_found == -100:
                        antecedent_found = span.start
                        coref_sequence.append(
                            CorefItem(span.text, token.lemma, span.type,
                                      span.start, span.stop))
                    else:
                        coref_sequence.append(
                            CorefItem(span.text, token.lemma, span.type,
                                      span.start, span.stop, antecedent_found))

    # Map character offsets to token indexes
    sequence = [token for token in doc.tokens]
    indexes = {}
    for item in coref_sequence:
        for i, token in enumerate(doc.tokens):
            if item.start == token.start:
                indexes[item.start] = i
                item.start = i
            if item.stop == token.stop:
                item.stop = i
    for item in coref_sequence:
        if item.coref != -100:
            item.coref = indexes[item.coref]
    return sequence, coref_sequence
def extract_names(text):
    """Extracts person names from the text."""
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)
    for span in doc.spans:
        if span.type == PER:
            span.normalize(morph_vocab)
            span.extract_fact(names_extractor)
    names = [{
        'normal': _.normal,
        'fio': _.fact.as_dict,
        'start': _.start,
        'end': _.stop
    } for _ in doc.spans if _.fact]
    return names
def check_in_sent(text, word1, word2):
    doc = Doc(text)
    doc.segment(segmenter)
    for sents in doc.sents:
        text2 = Doc(sents.text)
        text2.segment(segmenter)
        list_token_sents = text2.tokens[:]
        for k in range(len(list_token_sents)):
            for j in range(k + 1, len(list_token_sents)):
                if text2.tokens[k].text == word1 and text2.tokens[j].text == word2:
                    return 'TRUE'
def anon_ner(text):
    # requires the `phonenumbers` package for phone-number matching below
    result = ''
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)

    # Replace PER/ORG/LOC spans with placeholder words
    result_temp = ''
    last = 0
    for span in doc.spans:
        if span.type == 'PER':
            result_temp += text[last:span.start]
            result_temp += 'ИМЯ'
        if span.type == 'ORG':
            result_temp += text[last:span.start]
            result_temp += 'ОРГАНИЗАЦИЯ'
        if span.type == 'LOC':
            result_temp += text[last:span.start]
            result_temp += 'ЛОКАЦИЯ'
        if span.type == 'PER' or span.type == 'ORG' or span.type == 'LOC':
            last = span.stop
    result_temp += text[last:]
    result = result_temp

    # Replace phone numbers matched against CIS country formats
    result_temp = ""
    last = 0
    countries = [
        'AZ', 'AM', 'BY', 'KZ', 'KG', 'MD', 'RU', 'TJ', 'TM', 'UZ', 'UA'
    ]
    for country in countries:
        for match in phonenumbers.PhoneNumberMatcher(result, country):
            result_temp += result[last:match.start]
            result_temp += 'ТЕЛЕФОН '
            last = match.end
    result_temp += result[last:]
    result = result_temp
    return result
def delete_NER(words):
    nf_words = ' '.join(words)
    per_words = []
    loc_words = []
    doc = Doc(nf_words)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)
    for span in doc.spans:
        span.normalize(morph_vocab)
    for span in doc.spans:
        if span.type == 'PER':
            span.extract_fact(names_extractor)
            per_words.append(span.text)
        if span.type == 'LOC':
            span.extract_fact(names_extractor)
            loc_words.append(span.text)
    for word in per_words:
        if word in nf_words:
            nf_words = nf_words.replace(word, ' PER ')
    for word in loc_words:
        if word in nf_words:
            nf_words = nf_words.replace(word, ' LOC ')
    words = nf_words.split(' ')
    return words
def tag_ner(self, text):
    doc = Doc(text)
    doc.segment(self.segmenter)
    doc.tag_ner(self.ner_tagger)
    return [(sp.start, sp.stop, sp.text.replace("\n", " "), sp.type)
            for sp in doc.spans]
def __tag_text(text):
    doc = Doc(text)
    doc.segment(Segmenter())
    ner_tagger = NewsNERTagger(NewsEmbedding())
    doc.tag_ner(ner_tagger)
    return doc
def extract_entities(text: str):
    """Returns a dictionary with all recognized entities in the format
    {
        locations: [],
        people: [],
        organizations: [],
        money: []
    }
    """
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)
    for span in doc.spans:
        span.normalize(morph_vocab)

    locations = list(filter(lambda span: span.type == 'LOC', doc.spans))
    locations = list(set(location.normal for location in locations))

    organizations = list(filter(lambda span: span.type == 'ORG', doc.spans))
    organizations = list(set(org.normal for org in organizations))

    people = list(filter(lambda span: span.type == 'PER', doc.spans))
    people = list(set(person.normal for person in people))

    money = list(match.fact for match in money_extractor(text))
    money = list(set(f'{m.amount} {m.currency}' for m in money))

    return {
        'locations': locations,
        'people': people,
        'organizations': organizations,
        'money': money
    }


# text = 'Минздрав Украины проверит медицинские учреждения Харьковской, Одесской и Запорожской областей из-за того, что они не до конца использовали индийскую вакцину от коронавируса Covishield компании AstraZeneca из первой партии. Об этом сегодня, 23 апреля, во время брифинга сказал главный государственный санитарный врач Виктор Ляшко. По его словам, только в трех областях до сих пор не использовали полностью вакцину Covishield из первой партии, нарушив тем самым указания Минздрава. Ляшко сообщил, что с 26 апреля в Харьковскую, Одесскую и Запорожскую области направятся представители Минздрава, чтобы выяснить, почему сложилась такая ситуация. Напомним, что в Украине вакцинация от коронавируса началась 24 февраля 2021 года. По состоянию на утро 23 апреля прививки получили 508 046 человек. Из них пять человек получили две дозы вакцины. Ранее сообщалось, что с начала пандемии в Украине по состоянию на утро 23 апреля 2021 года было подтверждено 2 004 630 случаев СOVID-19. Выздоровели 1 552 267 человек, а 41 700 – умерли.'
# print(extract_entities(text))
def lemmatize(text):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    return [token.lemma for token in doc.tokens]
def preprocess_sent(incoming_sent):
    doc = Doc(incoming_sent)
    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    return doc.sents[0]
def __FuncTokLem(text):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    # Note: returns the surface text of the first token, not its lemma
    return doc.tokens[0].text
def _text_preprocess(text):
    if text is None:
        return []
    text = text.strip().replace('`', "'")
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    tokens = [t.lemma for t in doc.tokens]
    return tokens
def preprocess_ner(text):
    """Removes named entities from the text."""
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)
    new_text = text
    for entity in doc.spans:
        new_text = new_text.replace(text[entity.start:entity.stop], '')
    return new_text
def respond(self, ctx: Context):
    if not ctx.message_text:
        return Response('привет!')
    doc = Doc(ctx.message_text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    return Response('Леммы: ' + ' '.join([t.lemma for t in doc.tokens]))
def get_tokens(self, str_):
    lemms = list()
    doc = Doc(str_)
    doc.segment(self.segmenter)
    doc.tag_morph(self.morph_tagger)
    for token in doc.tokens:
        token.lemmatize(self.morph_vocab)
        lemms.append(token.text)  # appends the surface text, not token.lemma
    return [lemms]
def get_extended_lemms(self, str_):
    doc = Doc(str_)
    doc.segment(self.segmenter)
    doc.tag_morph(self.morph_tagger)
    lemms = list()
    for token in doc.tokens:
        token.lemmatize(self.morph_vocab)
        lemms.append([token.lemma, token.text])
    return lemms
def anonymoize():
    entities = request.json['entities']
    raw_text = request.json['raw_text']
    if "DATE" in entities:
        raw_text = anonymize_date(raw_text)
    doc = Doc(raw_text)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)
    return filter_data(doc.spans, raw_text, entities)