def __init__(self):
    self.morph = pymorphy2.MorphAnalyzer()
    self.segmenter = Segmenter()
    self.morph_vocab = MorphVocab()
    self.emb = NewsEmbedding()
    self.morph_tagger = NewsMorphTagger(self.emb)
    self.ner_tagger = NewsNERTagger(self.emb)
def promocode_expiration_date(self) -> 'datetime.datetime | None':
    morph_vocab = MorphVocab()
    dates_extractor = DatesExtractor(morph_vocab)
    now = datetime.datetime.now()

    matches = list(dates_extractor(self.text))
    if not matches:
        return None

    # Take the last date mentioned in the text.
    expiration_date = matches[-1].fact
    if not expiration_date.month or not expiration_date.day:
        return None

    # If the year is omitted, assume the current year.
    year = expiration_date.year or now.year
    month = expiration_date.month
    day = expiration_date.day
    return datetime.datetime(year, month, day)
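# A minimal, self-contained sketch of the DatesExtractor behaviour the method above
# relies on. The sample text is hypothetical and only for illustration; each match
# carries a Date fact whose year, month and day fields may individually be None.
from natasha import MorphVocab, DatesExtractor

_vocab = MorphVocab()
_dates = DatesExtractor(_vocab)
for match in _dates("Акция действует до 31 декабря 2024 года"):
    print(match.fact.year, match.fact.month, match.fact.day)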
def __init__(self):
    self.ner_model = build_model(configs.ner.ner_ontonotes_bert_mult, download=False)
    self.segmenter = Segmenter()
    self.morph_vocab = MorphVocab()
    self.emb = NewsEmbedding()
    self.morph_tagger = NewsMorphTagger(self.emb)
def __init__(self):
    self.segmenter = Segmenter()
    self.morph_vocab = MorphVocab()
    self.emb = NewsEmbedding()
    self.morph_tagger = NewsMorphTagger(self.emb)
    self.ner_tagger = NewsNERTagger(self.emb)
    self.syntax_parser = NewsSyntaxParser(self.emb)
def calculate_skills_assessment(text, ca):
    # Skill titles attached to the vacancy, lower-cased for comparison with lemmas.
    vacancy_key_skills = [
        title.lower()
        for title in ca.core_vacancy.key_skills.all().values_list('title', flat=True)
    ]
    vacancy_additional_skills = [
        title.lower()
        for title in ca.core_vacancy.additional_skills.all().values_list('title', flat=True)
    ]

    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    morph_vocab = MorphVocab()

    # Note: the `text` argument is ignored; the CV text is re-extracted from the attached file.
    text = extract_text(ca.cv_file.path)
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    cv_key_skills = []
    cv_additional_skills = []
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        if token.lemma in vacancy_key_skills and token.lemma not in cv_key_skills:
            cv_key_skills.append(token.lemma)
        if token.lemma in vacancy_additional_skills and token.lemma not in cv_additional_skills:
            cv_additional_skills.append(token.lemma)

    candidate_conformity = {
        "key_skills": {
            "vacancy_key_skills": vacancy_key_skills,
            "cv_key_skills": cv_key_skills,
            # Share of required skills found in the CV (0..1); guard against empty skill lists.
            "conformity_percent":
                len(cv_key_skills) / len(vacancy_key_skills) if vacancy_key_skills else 0,
        },
        "additional_skills": {
            "vacancy_additional_skills": vacancy_additional_skills,
            "cv_additional_skills": cv_additional_skills,
            "conformity_percent":
                len(cv_additional_skills) / len(vacancy_additional_skills)
                if vacancy_additional_skills else 0,
        },
    }
    return candidate_conformity
def __init__(self):
    self.segmenter = Segmenter()
    self.morph_vocab = MorphVocab()
    self.emb = NewsEmbedding()
    self.morph_tagger = NewsMorphTagger(self.emb)
    self.syntax_parser = NewsSyntaxParser(self.emb)
    self.ner_tagger = NewsNERTagger(self.emb)
    self.names_extractor = NamesExtractor(self.morph_vocab)
    self.doc = []
    self.term_extractor = TermExtractor()
def __init__(self, text):
    emb = NewsEmbedding()  # reuse a single embedding for all taggers and parsers
    morph_vocab = MorphVocab()

    self.doc = Doc(text)
    self.doc.segment(Segmenter())
    self.doc.tag_morph(NewsMorphTagger(emb))
    for token in self.doc.tokens:
        token.lemmatize(morph_vocab)
    self.doc.parse_syntax(NewsSyntaxParser(emb))
    self.doc.tag_ner(NewsNERTagger(emb))
    for span in self.doc.spans:
        span.normalize(morph_vocab)

    # Token views filtered by part of speech.
    self.words = tuple(t for t in self.doc.tokens if t.pos not in ('X', 'PUNCT'))
    self.tokens_nouns = tuple(t for t in self.doc.tokens if t.pos in ('NOUN', 'PROPN'))
    self.tokens_adjs = tuple(t for t in self.doc.tokens if t.pos == 'ADJ')
    self.tokens_verbs = tuple(t for t in self.doc.tokens if t.pos == 'VERB')
def main():
    news_sites = {'m24.ru': M24_accidents,
                  'mosday.ru': Mosday_accidents,
                  'vm.ru': VM_accidents}

    # Initialise Natasha
    morph_vocab = MorphVocab()
    extractor = AddrExtractor(morph_vocab)

    # Fetch the news, check the items for addresses and collect the records
    # into a temporary list
    news_list = []
    for key in news_sites:
        try:
            ScrapeClass = news_sites[key]
            source = ScrapeClass()
            rec = get_news(source, extractor)
            news_list += rec
        except TypeError:
            print("Source {} is unavailable.".format(key))

    for item in news_list:
        published = item['time'] + ' ' + item['date']
        published = datetime.strptime(published, '%H:%M %d.%m.%Y')
        record = News(
            title=item['title'],
            link=item['link'],
            date_and_time=published,
            text=item['text'],
            address=item['location']['address'],
            street=item['location']['street'],
            lat=item['location']['coordinates'][0],
            lon=item['location']['coordinates'][1],
        )
        # Skip items that are already stored in the database.
        record_in_db = News.query.filter_by(link=item['link']).first()
        if record_in_db:
            continue
        db.session.add(record)
        db.session.commit()
def get_date(text):
    # `segmenter`, `morph_tagger`, `syntax_parser`, `today` and `tomorrow`
    # are assumed to be defined at module level in the original file.
    text = text.lower()
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    from natasha import MorphVocab, DatesExtractor
    morph_vocab = MorphVocab()
    dates_extractor = DatesExtractor(morph_vocab)

    if 'завтр' in text or tomorrow in str(list(dates_extractor(text))):
        return "завтра"  # "tomorrow"
    elif 'сегодня' in text or 'сейчас' in text or today in str(list(dates_extractor(text))):
        return "сегодня"  # "today"
    else:
        return None
import pandas as pd
import numpy as np
from natasha import (MorphVocab, DatesExtractor)

data = pd.read_table('/content/overhumanized-dev-fp.tsv')

vocab = MorphVocab()
extractor = DatesExtractor(vocab)

result_data = []


def get_date_from_string(s):
    res = []
    matches = [x for x in extractor(s)]
    for mch in matches:
        result = ""
        y = mch.__dict__['fact'].__dict__['year']
        # m = mch.__dict__['fact'].__dict__['month']
        # d = mch.__dict__['fact'].__dict__['day']
        if y is not None:
            result += str(y)
        # if m is not None:
        #     if m//10 == 0:
        #         result += "-0"+str(m)
        #     else:
        #         result += "-"+str(m)
        # if d is not None:
        #     if d//10 == 0:
def main():
    parser = ArgumentParser()
    parser.add_argument('--input_request_str',
                        default=r"Классическая литература",
                        type=str,
                        help="Search query string")
    parser.add_argument('--input_dict_path',
                        default=r"../task_2/tokenized_texts/dict.txt",
                        type=str,
                        help="Path to the dictionary")
    parser.add_argument(
        '--input_df_path',
        default="../task_4/tf_idf/df.txt",
        type=str,
        help=r"Path to the file with document frequencies of the terms. "
        r"The DF values are written to a file simply because it is more illustrative that way")
    parser.add_argument(
        '--input_tf_idf_path',
        default="../task_4/tf_idf/tf_idf.txt",
        type=str,
        help=r"Path to the file with TF-IDF values. Each line corresponds to one "
        r"document. A line holds space-separated <term, its idf, its tf-idf> triples, "
        r"and a term and its tf-idf are separated by the string '~~~'")
    parser.add_argument('--input_raw_documents_dir',
                        default=r"../task_1/reviews/reviews/",
                        type=str,
                        help="Path to the directory with raw, unpreprocessed documents")
    parser.add_argument('--output_log_path',
                        default=r"search_log.txt",
                        type=str,
                        help="Path to the search query log file")
    args = parser.parse_args()

    input_request_str = args.input_request_str
    input_dict_path = args.input_dict_path
    input_df_path = args.input_df_path
    input_tf_idf_path = args.input_tf_idf_path
    input_raw_documents_dir = args.input_raw_documents_dir
    output_log_path = args.output_log_path

    output_dir = os.path.dirname(output_log_path)
    if not os.path.exists(output_dir) and output_dir != '':
        os.makedirs(output_dir)

    # Load the dictionary into memory
    token2id = load_dict(input_dict_path)

    # Load the precomputed TF-IDF matrix from the file
    tf_idf_matrix = load_tf_idf_matrix_from_file(
        tf_idf_file_path=input_tf_idf_path,
        token2id=token2id,
    )
    num_documents = tf_idf_matrix.shape[0]

    # Load the inverse document frequencies (IDF) of the terms
    token_idfs = load_vocab_idfs(vocab_dfs_path=input_df_path,
                                 token2id=token2id,
                                 num_documents=num_documents)

    segmenter = Segmenter()
    morph_vocab = MorphVocab()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)

    request_vector = vectorize_request_tf_idf(
        request_raw_text=input_request_str,
        segmenter=segmenter,
        morph_tagger=morph_tagger,
        morph_vocab=morph_vocab,
        token2id=token2id,
        token_idfs=token_idfs)

    # Id of the document whose vector is closest to the query;
    # the similarity measure is cosine similarity.
    response_document_id = cosine_similarity(tf_idf_matrix, request_vector).argmax()

    # Path to the original raw, unpreprocessed document
    response_document_path = os.path.join(
        input_raw_documents_dir, f"review_{response_document_id}.txt")

    # Log the query result
    write_request_log(log_file_path=output_log_path,
                      request_str=input_request_str,
                      response_document_id=response_document_id,
                      response_document_path=response_document_path)
def main():
    parser = ArgumentParser()
    parser.add_argument(
        '--input_data_dir',
        default=r"../../task_1/reviews/reviews",
        type=str,
        help="Directory with raw, unpreprocessed texts, in this case reviews")
    parser.add_argument(
        '--output_dir',
        default=r"../tokenized_texts",
        type=str,
        help="Output directory that will contain the dictionary "
        "and the file with lemmatized texts")
    parser.add_argument('--output_dict_fname',
                        default=r"dict.txt",
                        type=str,
                        help="Dictionary file name")
    parser.add_argument('--output_documents_fname',
                        default=r"documents.txt",
                        type=str,
                        help="Name of the file with lemmatized documents")
    args = parser.parse_args()

    input_data_dir = args.input_data_dir
    output_dir = args.output_dir
    if not os.path.exists(output_dir) and output_dir != '':
        os.makedirs(output_dir)
    output_dict_fname = args.output_dict_fname
    output_documents_fname = args.output_documents_fname
    output_dict_path = os.path.join(output_dir, output_dict_fname)
    output_documents_path = os.path.join(output_dir, output_documents_fname)

    segmenter = Segmenter()
    morph_vocab = MorphVocab()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)

    # List of lemma lists for all documents
    lemmatized_tokens_lists = []
    # Lemma dictionary
    lemmas_dictionary = set()

    for document_fname in sorted(os.listdir(input_data_dir),
                                 key=lambda x: get_doc_id_word_key(x)):
        document_path = os.path.join(input_data_dir, document_fname)
        with codecs.open(document_path, 'r', encoding="utf-8") as review_file:
            document_raw_text = review_file.read()

        # Get the list of the document's lemmas
        lemmatized_tokens = get_lemmatized_doc(raw_text=document_raw_text,
                                               segmenter=segmenter,
                                               morph_tagger=morph_tagger,
                                               morph_vocab=morph_vocab)
        # Append the lemma list to the collection of all documents' lemma lists
        lemmatized_tokens_lists.append(lemmatized_tokens)
        # Update the lemma dictionary
        lemmas_dictionary.update(lemmatized_tokens)

    # Write the dictionary to a file
    with codecs.open(output_dict_path, 'w+', encoding="utf-8") as dict_file:
        for token in lemmas_dictionary:
            dict_file.write(f"{token}\n")

    # Write the lemmatized documents to a file
    with codecs.open(output_documents_path, 'w+', encoding="utf-8") as documents_file:
        for doc_lemmas_list in lemmatized_tokens_lists:
            documents_file.write(f"{' '.join(doc_lemmas_list)}\n")
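# get_lemmatized_doc is defined elsewhere in the original project and is not shown here.
# The sketch below is only an assumption about how such a helper could be written with
# Natasha, included to make the lemmatization pipeline above self-explanatory.
def get_lemmatized_doc_sketch(raw_text, segmenter, morph_tagger, morph_vocab):
    from natasha import Doc
    doc = Doc(raw_text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    lemmas = []
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        # keep only word-like tokens; punctuation is skipped
        if token.pos != 'PUNCT':
            lemmas.append(token.lemma.lower())
    return lemmas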
def get_morph_vocab(cls):
    morph_vocab = getattr(cls, "_morph_vocab", None)
    if not morph_vocab:
        morph_vocab = MorphVocab()
        cls._morph_vocab = morph_vocab
    return morph_vocab
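# Hypothetical usage sketch: the enclosing class is not visible in this excerpt, so a
# stand-in class is used here purely to illustrate the lazy, class-level caching of a
# single MorphVocab instance.
from natasha import MorphVocab


class _VocabHolder:
    @classmethod
    def get_morph_vocab(cls):
        morph_vocab = getattr(cls, "_morph_vocab", None)
        if not morph_vocab:
            morph_vocab = MorphVocab()
            cls._morph_vocab = morph_vocab
        return morph_vocab


# Repeated calls reuse one MorphVocab instead of rebuilding it each time.
assert _VocabHolder.get_morph_vocab() is _VocabHolder.get_morph_vocab()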
def morph_vocab():
    return MorphVocab()
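# If the function above is registered as a pytest fixture (an assumption; the decorator
# is not visible in this excerpt), a test could request it by name. The test below is a
# hypothetical example of that pattern.
def test_lemmatize_with_vocab(morph_vocab):
    from natasha import Doc, Segmenter, NewsEmbedding, NewsMorphTagger
    doc = Doc("Привет, мир")
    doc.segment(Segmenter())
    doc.tag_morph(NewsMorphTagger(NewsEmbedding()))
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    assert all(token.lemma for token in doc.tokens)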
n = 10000
n_sql = f"SELECT * FROM instagram_post WHERE note is NULL LIMIT {n};"
df = pd.io.sql.read_sql_query(n_sql, conn)

# In[5]:

if df.size == 0:
    break  # only valid inside the enclosing batch loop, which is not part of this excerpt

# # NER extraction

# In[6]:

segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
ner_tagger = NewsNERTagger(emb)

# In[7]:

insert_nameentity_sql = '''INSERT INTO instagram_nameentity (name, type)
    VALUES (%s, %s) ON CONFLICT DO NOTHING;'''
insert_nameentity_post_sql = '''INSERT INTO instagram_postnameentity (name_entity_id, post_id)
    VALUES (%s, %s) ON CONFLICT DO NOTHING;'''

# In[8]:

db = conn.cursor()

# In[9]:

df.loc[:, 'caption_ner_locs'] = None
def ca_details(request, ca_id):
    ca = get_object_or_404(CandidateApplication, id=ca_id)

    # Skill titles attached to the vacancy, lower-cased for comparison with lemmas.
    vacancy_key_skills = [
        title.lower()
        for title in ca.core_vacancy.key_skills.all().values_list('title', flat=True)
    ]
    vacancy_additional_skills = [
        title.lower()
        for title in ca.core_vacancy.additional_skills.all().values_list('title', flat=True)
    ]

    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    morph_vocab = MorphVocab()

    text = extract_text(ca.cv_file.path)
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    cv_key_skills = []
    cv_additional_skills = []
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        if token.lemma in vacancy_key_skills and token.lemma not in cv_key_skills:
            cv_key_skills.append(token.lemma)
        if token.lemma in vacancy_additional_skills and token.lemma not in cv_additional_skills:
            cv_additional_skills.append(token.lemma)

    candidate_conformity = {
        "key_skills": {
            "vacancy_key_skills": vacancy_key_skills,
            "cv_key_skills": cv_key_skills,
            # Share of required skills found in the CV (0..1); guard against empty skill lists.
            "conformity_percent":
                len(cv_key_skills) / len(vacancy_key_skills) if vacancy_key_skills else 0,
        },
        "additional_skills": {
            "vacancy_additional_skills": vacancy_additional_skills,
            "cv_additional_skills": cv_additional_skills,
            "conformity_percent":
                len(cv_additional_skills) / len(vacancy_additional_skills)
                if vacancy_additional_skills else 0,
        },
    }
    return render(request, 'demo_data.html',
                  context={'data': json.dumps(candidate_conformity)})
from flask import Flask
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (Dense, Dropout, Bidirectional, GlobalMaxPooling1D, Input,
                                     Activation, concatenate, GlobalAveragePooling1D, GRU)
import gensim
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from natasha import Segmenter, NewsEmbedding, PER, MorphVocab, NewsMorphTagger, Doc
from helpers import to_serializable, make_keras_picklable

app = Flask(__name__)

STOP_WORDS = set(open('stop_words.txt', encoding='utf-8').read().split())

# for tokenization
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

# max sentence length
WORD_LIMIT = 10
EMBEDDING_DIM = 300

INTENTS_URL = 'http://localhost:6969/intents'


# load intents from json
def load_labels():
    labels = []
    url = INTENTS_URL
def Main(docType, text):
    status = 1
    res = {}

    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    ner_tagger = NewsNERTagger(emb)
    morph_vocab = MorphVocab()
    names_extractor = NamesExtractor(morph_vocab)
    money_extractor = MoneyExtractor(morph_vocab)

    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)
    for span in doc.spans:
        span.normalize(morph_vocab)

    # Court order
    if docType == 'coast':
        # Full names
        for span in doc.spans:
            if span.type == PER:
                span.extract_fact(names_extractor)
        x = [_.fact.as_dict for _ in doc.spans if _.type == PER]
        if x:
            res['ФИО'] = x
        else:
            status = 0
        # Taxpayer number (INN)
        y = myextractors.findINN(text)
        if y:
            res['ИНН'] = y
        else:
            status = 0
        # Court order number
        y = myextractors.findNCOASTCASE(text)
        if y:
            res['номер судебного приказа'] = y
        else:
            status = 0
        # Court order date
        y = myextractors.findDATECOAST(text)
        if y:
            res['дата судебного приказа'] = y
        else:
            status = 0
        # Organisations
        y = []
        for span in doc.spans:
            if span.type == ORG:
                d = {}
                d['name'] = span.text
                y = y + [d]
        if y:
            res['организации'] = y
        else:
            status = 0

    # Letter
    if docType == 'mail':
        # Full names
        for span in doc.spans:
            if span.type == PER:
                span.extract_fact(names_extractor)
        x = [_.fact.as_dict for _ in doc.spans if _.type == PER]
        if x:
            res['ФИО'] = x
        else:
            status = 0
        # Taxpayer number (INN)
        y = myextractors.findINN(text)
        if y:
            res['ИНН'] = y
        else:
            status = 0
        # Contract number
        y = myextractors.findNCONTRACT(text)
        if y:
            res['номер договора'] = y
        else:
            status = 0
        # Contract date
        y = myextractors.findDATECONT(text)
        if y:
            res['дата договора'] = y
        else:
            status = 0

    # Payment order
    if docType == 'order':
        # Full names
        for span in doc.spans:
            if span.type == PER:
                span.extract_fact(names_extractor)
        x = [_.fact.as_dict for _ in doc.spans if _.type == PER]
        if x:
            res['ФИО'] = x
        else:
            status = 0
        # Taxpayer number (INN)
        y = myextractors.findINN(text)
        if y:
            res['ИНН'] = y
        else:
            status = 0
        # Organisations
        y = []
        for span in doc.spans:
            if span.type == ORG:
                d = {}
                d['name'] = span.text
                y = y + [d]
        if y:
            res['организации'] = y
        else:
            status = 0
        # Contract number
        y = myextractors.findNCONTRACT(text)
        if y:
            res['номер договора'] = y
        else:
            status = 0
        # Contract date (findDATECONT assumed here, matching the 'mail' branch)
        y = myextractors.findDATECONT(text)
        if y:
            res['дата договора'] = y
        else:
            status = 0
        # Amount of money
        matches = list(money_extractor(text))
        y = [_.fact for _ in matches]
        ret = []
        for i in y:
            z = {}
            z['amount'] = i.amount
            z['currency'] = i.currency
            ret = ret + [z]
        if ret:
            res['сумма'] = ret
        else:
            status = 0

    returning = {}
    if status == 1:
        returning['status'] = 'успех'
    else:
        returning['status'] = 'не успех'
    returning['entities'] = res
    return returning