def fit_model(self):
    self.model_config = read_json(configs.doc_retrieval.ru_ranker_tfidf_wiki)
    self.model_config["dataset_reader"]["data_path"] = os.path.abspath(os.getcwd()) + "/Resourses"
    self.model_config["dataset_reader"]["dataset_format"] = "txt"
    self.model_config["train"]["batch_size"] = 100
    print("work!")
    # Train the TF-IDF document ranker, then build the reader and the full ODQA pipeline.
    self.doc_retrieval = train_model(self.model_config)
    self.squad = build_model(configs.squad.squad_ru_rubert_infer, download=True)
    self.odqa = build_model(configs.odqa.ru_odqa_infer_wiki_rubert, download=False)
def __init__(self, config_dict):
    # tf.compat.v1.random.set_random_seed(1234)
    # self.elmo_lm = build_model(config_dict, download=True)
    # Prefer a locally cached model; fall back to downloading it.
    try:
        self.elmo_lm = build_model(config_dict, download=False)
    except Exception:
        self.elmo_lm = build_model(config_dict, download=True)
    self.words = self.elmo_lm.pipe[-1][-1].get_vocab()
    self.word_index = {word: i for i, word in enumerate(self.words)}
    self.INIT_STATE_OF_ELMO = self.elmo_lm.pipe[-1][-1].init_states
    # index of unknown token:
    self.IDX_UNK_TOKEN = self.word_index.get("<UNK>")
def deeppavlov_ner_cell(x, *args):
    from deeppavlov import configs, build_model

    which = args[0]
    ner_model = None
    if which == 'onto_bert_mult':
        ner_model = build_model(configs.ner.ner_ontonotes_bert_mult, download=True)  # done
    elif which == 'onto_bert':
        ner_model = build_model(configs.ner.ner_ontonotes_bert, download=True)  # done
    elif which == 'onto':
        ner_model = build_model(configs.ner.ner_ontonotes, download=True)  # done
    elif which == 'conl_bert':
        ner_model = build_model(configs.ner.ner_conll2003_bert, download=True)  # done
    elif which == 'conl':
        ner_model = build_model(configs.ner.ner_conll2003, download=True)  # done
    # elif which == 'dstc2':  # deprecated
    #     ner_model = build_model(configs.ner.ner_dstc2, download=True)  # done, but miss
    if ner_model is None:
        raise ValueError("Unknown NER model: {!r}".format(which))

    y = ner_model([x])
    enha = {}
    current_token_l = ''
    for j in range(len(y[1][0])):
        token = y[0][0][j]
        code = y[1][0][j]
        if code == 'O':
            continue
        code_mark = code[0]    # 'B' (begin) or 'I' (inside)
        code_label = code[2:]  # entity type, e.g. 'PERSON'
        if code_mark == 'B':
            current_token_l = token
        elif code_mark == 'I':
            # Continuation of the current entity: drop the partial key and extend it.
            enha.pop(current_token_l, None)
            current_token_l = current_token_l + ' ' + token
        if current_token_l in enha:
            if code_label not in enha[current_token_l]:
                enha[current_token_l].append(code_label)
        else:
            enha[current_token_l] = [code_label]
    return enha
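# Usage sketch for deeppavlov_ner_cell above. The sample sentence and the
# grouping shown are illustrative assumptions, not verified model output.
# DeepPavlov NER models return aligned batches: y[0][0] holds tokens and
# y[1][0] holds BIO tags, which the loop folds into {entity_text: [labels]}.
entities = deeppavlov_ner_cell('Elon Musk founded SpaceX in California.', 'onto_bert_mult')
print(entities)  # e.g. {'Elon Musk': ['PERSON'], 'SpaceX': ['ORG'], 'California': ['GPE']}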
def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    # pos_model = build_model(configs.morpho_tagger.UD2_0.morpho_ru_syntagrus_pymorphy, download=True)
    pos_model = build_model(configs.morpho_tagger.BERT.morpho_ru_syntagrus_bert, download=True)
    syntax_model = build_model(configs.syntax.syntax_ru_syntagrus_bert, download=True)

    for in_path in glob.glob(args.inglob, recursive=True):
        try:
            print(in_path)
            docname = os.path.splitext(os.path.basename(in_path))[0]
            out_path = os.path.join(args.outdir, docname + '.pickle')
            if os.path.exists(out_path) and not args.f:
                print('Already processed')
                continue

            with open(in_path, 'r') as f:
                full_text = clean_text(f.read())

            sentences_spans = list(sentenize(full_text))
            sentences_spans = [
                split_sent
                for sent in sentences_spans
                for split_sent in split_long_sentence(sent, max_len=args.max_sent_len)
            ]
            sentences_texts = [s.text for s in sentences_spans]

            sentences_pos = pos_model.batched_call(sentences_texts, batch_size=args.batch_size)
            sentences_syntax = syntax_model.batched_call(sentences_texts, batch_size=args.batch_size)
            assert len(sentences_spans) == len(sentences_pos) == len(sentences_syntax)

            doc_sentences = [
                dict(span=(span.start, span.stop), text=span.text, pos=pos, syntax=synt)
                for span, pos, synt in zip(sentences_spans, sentences_pos, sentences_syntax)
            ]
            with open(out_path, 'wb') as f:
                pickle.dump(doc_sentences, f)
        except Exception as ex:
            print(f'Failed to process {in_path} due to {ex}\n{traceback.format_exc()}')
def __init__(self, use_noans=False, download=False):
    if use_noans:
        config = configs.squad.multi_squad_noans
    else:
        config = configs.squad.squad
    self.model = build_model(config, download=download)
def __init__(self, data_path: Optional[str] = None,
             config_type: Optional[str] = 'tfidf_autofaq',
             x_col_name: Optional[str] = 'Question',
             y_col_name: Optional[str] = 'Answer',
             save_load_path: Optional[str] = './similarity_matching',
             edit_dict: Optional[dict] = None,
             train: Optional[bool] = True):
    if config_type not in configs.faq:
        raise ValueError("There is no config named '{0}'. Possible options are: {1}"
                         .format(config_type, ", ".join(configs.faq.keys())))
    model_config = read_json(configs.faq[config_type])

    if x_col_name is not None:
        model_config['dataset_reader']['x_col_name'] = x_col_name
    if y_col_name is not None:
        model_config['dataset_reader']['y_col_name'] = y_col_name
    model_config['metadata']['variables']['MODELS_PATH'] = save_load_path

    if data_path is not None:
        if expand_path(data_path).exists():
            if 'data_url' in model_config['dataset_reader']:
                del model_config['dataset_reader']['data_url']
            model_config['dataset_reader']['data_path'] = data_path
        else:
            if 'data_path' in model_config['dataset_reader']:
                del model_config['dataset_reader']['data_path']
            model_config['dataset_reader']['data_url'] = data_path

    if edit_dict is not None:
        update_dict_recursive(model_config, edit_dict)

    if train:
        self.model = train_model(model_config, download=True)
        log.info('Your model was saved at: \'' + save_load_path + '\'')
    else:
        self.model = build_model(model_config, download=False)
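# Usage sketch for the FAQ skill __init__ above. 'SimilarityMatchingSkill' is a
# stand-in name for the class this __init__ belongs to, and faq.csv an assumed
# file with 'Question'/'Answer' columns; both are hypothetical.
skill = SimilarityMatchingSkill(data_path='./faq.csv',
                                config_type='tfidf_autofaq',
                                save_load_path='./similarity_matching',
                                train=True)
answers = skill.model(['How do I reset my password?'])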
def __init__(self):
    self.ner_model = build_model(configs.ner.ner_ontonotes_bert_mult, download=False)
    self.segmenter = Segmenter()
    self.morph_vocab = MorphVocab()
    self.emb = NewsEmbedding()
    self.morph_tagger = NewsMorphTagger(self.emb)
def start_alice_server(model_config, https=False, ssl_key=None, ssl_cert=None, port=None):
    server_config_path = get_settings_path() / SERVER_CONFIG_FILENAME
    server_params = get_server_params(server_config_path, model_config)

    https = https or server_params['https']
    if not https:
        ssl_key = ssl_cert = None
    else:
        ssl_key_path = Path(ssl_key or server_params['https_key_path']).resolve()
        if not ssl_key_path.is_file():
            e = FileNotFoundError('SSL key file not found: please provide correct path in --key param or '
                                  'https_key_path param in server configuration file')
            log.error(e)
            raise e
        ssl_cert_path = Path(ssl_cert or server_params['https_cert_path']).resolve()
        if not ssl_cert_path.is_file():
            e = FileNotFoundError('SSL certificate file not found: please provide correct path in --cert param or '
                                  'https_cert_path param in server configuration file')
            log.error(e)
            raise e

    host = server_params['host']
    port = port or server_params['port']
    model_endpoint = server_params['model_endpoint']

    model = build_model(model_config)
    skill = DefaultStatelessSkill(model, lang='ru')
    agent = DefaultAgent([skill], skills_processor=DefaultRichContentWrapper())
    start_agent_server(agent, host, port, model_endpoint, ssl_key, ssl_cert)
def ask_model(self, model_name, question: str):
    if self.__model_is_exist(model_name):
        model = build_model(self.file_util.get_config_model_path(model_name))
        result = model([question])
        return result
    else:
        raise ModelNotFoundException("model {} not found".format(model_name))
def __init__(self, data_path: Optional[str] = None,
             x_col_name: Optional[str] = None,
             y_col_name: Optional[str] = None,
             save_load_path: Optional[str] = './similarity_matching',
             edit_dict: Optional[dict] = None,
             train: bool = True):
    model_config = read_json(configs.faq.tfidf_autofaq)

    if x_col_name is not None:
        model_config['dataset_reader']['x_col_name'] = x_col_name
    if y_col_name is not None:
        model_config['dataset_reader']['y_col_name'] = y_col_name
    model_config['metadata']['variables']['ROOT_PATH'] = save_load_path

    if data_path is not None:
        if expand_path(data_path).exists():
            if 'data_url' in model_config['dataset_reader']:
                del model_config['dataset_reader']['data_url']
            model_config['dataset_reader']['data_path'] = data_path
        else:
            if 'data_path' in model_config['dataset_reader']:
                del model_config['dataset_reader']['data_path']
            model_config['dataset_reader']['data_url'] = data_path

    if edit_dict is not None:
        update_dict_recursive(model_config, edit_dict)

    if train:
        self.model = train_model(model_config)
        log.info('Your model was saved at: \'' + save_load_path + '\'')
    else:
        self.model = build_model(model_config)
def ner_rec(dataframe):
    """Run multilingual NER over every sentence of every text in a dataframe.

    :param dataframe: DataFrame with a 'text' column; each entry is an iterable of sentences
    :return: list with one entry per text, each a list of (token, tag) tuples
    """
    # build model
    ner_model = build_model(configs.ner.ner_ontonotes_bert_mult)
    # make empty list to hold all results
    res_all = []
    # iterate over each news text
    for text in tqdm(dataframe['text'], desc='entity recognition'):
        # make empty list to hold results for each text
        res_text = []
        # iterate over each sentence in text
        for sentence in text:
            # find entities
            res = ner_model([sentence])
            # concat results with text into list of tuples
            tokenized_text = res[0][0]
            tokenized_entity = res[1][0]
            res_list = list(zip(tokenized_text, tokenized_entity))
            # add to text
            res_text += res_list
        # add processed text to overall results
        res_all.append(res_text)
    return res_all
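# Usage sketch for ner_rec. It assumes pandas is installed and that each entry
# of the 'text' column is already split into sentences, as the inner loop expects.
import pandas as pd

df = pd.DataFrame({'text': [['Главный офис находится в Москве.', 'Компания основана в 2010 году.']]})
results = ner_rec(df)
# results[0] is a flat list of (token, BIO-tag) tuples for the first text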
def get_dp_model():
    global model
    if model is None:
        # model = build_model(configs.squad.squad, download=True)
        model = build_model(configs.squad.multi_squad_ru_retr_noans_rubert_infer, download=False)
        # model = build_model(configs.squad.squad_bert_infer, download=True)
    return model
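# Usage sketch for get_dp_model. It assumes the module-level `model` global is
# initialised to None; the two-list call follows DeepPavlov's SQuAD interface
# (a batch of contexts and a batch of questions).
model = None

def answer(context: str, question: str):
    qa = get_dp_model()
    return qa([context], [question])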
def __init__(self): """ Модель из библиотеки deeppavlov, определяет эмоционыльный окрас русского предложения Подробнее на docs.deeppavlov """ self.model = build_model( configs.classifiers.rusentiment_elmo_twitter_cnn, download=True)
def __init__(self, squad_model_config: str,
             vocab_file: str,
             do_lower_case: bool,
             max_seq_length: int = 512,
             batch_size: int = 10,
             lang: str = 'en',
             **kwargs) -> None:
    with open(squad_model_config) as f:
        config = json.load(f)
    config['chainer']['pipe'][0]['max_seq_length'] = max_seq_length
    self.model = build_model(config)
    self.max_seq_length = max_seq_length

    if Path(vocab_file).is_file():
        vocab_file = str(expand_path(vocab_file))
        # AutoTokenizer cannot be constructed directly from a vocab file;
        # assuming transformers.BertTokenizer for the local-file case.
        self.tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
    else:
        self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case)

    self.batch_size = batch_size

    if lang == 'en':
        from nltk import sent_tokenize
        self.sent_tokenizer = sent_tokenize
    elif lang == 'ru':
        from ru_sent_tokenize import ru_sent_tokenize
        self.sent_tokenizer = ru_sent_tokenize
    else:
        raise RuntimeError('only en and ru languages are supported')
def correct(self, word: str) -> str:
    if self.model is None:
        self.model = build_model(configs.spelling_correction.levenshtein_corrector_ru, download=False)
    return self.model([word])[0]
def useLevenstein(self):
    self.originalText, self.errorText = FP().prepareFiles()
    originalSentencesList, errorSentencesList = EC().textToSentences(self.originalText, self.errorText)
    print(len(originalSentencesList), len(errorSentencesList))
    correctorModel = build_model(configs.spelling_correction.levenshtein_corrector_ru, download=True)
    processedSentencesList = correctorModel(errorSentencesList)
    Metrics().estimateWords(self.originalText, processedSentencesList)
class NER:
    config = "./models/ner_config.json"
    ner_model = build_model(config, download=True)

    @staticmethod
    def train():
        train_model(NER.config, download=True)
        NER.ner_model = build_model(NER.config, download=True)

    @staticmethod
    def NamedEntityRecognition(message):
        ner = NER.ner_model([message])
        sentence, labels = ner[0][0], ner[1][0]
        print("###NER: ", sentence)
        print("###NER: ", labels)
        entities, slots = DstcSlotFillingNetwork._chunk_finder(sentence, labels)
        s = {}
        for i, slot in enumerate(slots):
            if slot not in s:
                s[slot] = set()
            s[slot].add(entities[i])
        if 'GENRE' in s:
            # Also add genre phrases with English stopwords stripped out.
            for genre in s['GENRE']:
                s['GENRE'] = set.union(
                    set(word for word in genre.split() if word not in stopwords.words('english')),
                    s['GENRE'])
        return s
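# Usage sketch for the NER class above. The utterance and the returned slot
# names are illustrative; the real keys depend on the trained model's tag set.
slots = NER.NamedEntityRecognition('find me a comedy movie from the nineties')
print(slots)  # e.g. {'GENRE': {'comedy'}}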
def __init__(self, gobot_config_path):
    gobot_config = read_json(f"{gobot_config_path}/gobot_config.json")
    domain_yml_path = "dp_minimal_demo_dir/domain.yml"
    self.response_templates = read_yaml(domain_yml_path)["responses"]
    self.gobot = build_model(gobot_config)
    self.DATABASE, self.PREV_UPDATE_TIME = self._update_database()
def __init__(self, toml_file=None):
    super().__init__(toml_file)
    # Do your init work here
    with open('./insults_kaggle_conv_bert.json') as f:
        self.configs = json.load(f)
    self.model = build_model(self.configs)
    self.ready()
def __init__(self, texts=(), embeddings=()):
    # Immutable defaults avoid the shared-mutable-default pitfall.
    self.m = build_model(DUMB_MODEL_CONFIG_PATH)
    if len(texts) and not len(embeddings):
        self.sent_max_embs, _, _ = self.m(texts)
    elif not len(texts) and len(embeddings):
        self.sent_max_embs = embeddings
    else:
        raise RuntimeError('no texts or embeddings were provided')
def _parse_syntax(sents: Iterable[str]):
    """Parse syntax with a DeepPavlov model."""
    model = build_model("ru_syntagrus_joint_parsing")
    model['main'].to_output_string = False
    model['main'].output_format = 'json'
    output = model(sents)
    model.destroy()
    return output
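# Usage sketch for _parse_syntax. Treating the result as one JSON parse per
# input sentence is an assumption about the output mode configured above.
parses = _parse_syntax(['Мама мыла раму.', 'Кошка спит на диване.'])
for parse in parses:
    print(parse)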
def __init__(self, nlp):
    """Initialization method of :class:`dragonfire.odqa.ODQA` class.

    Args:
        nlp: :mod:`spacy` model instance.
    """
    self.nlp = nlp  # Load en_core_web_sm, English, 50 MB, default model
    self.model = build_model(configs.squad.squad, download=True)
def handle_messages():
    print("Handling Messages")
    model = build_model('faq.json')
    payload = request.get_data()
    print(payload)
    for sender, message in messaging_events(payload):
        print("Incoming from %s: %s" % (sender, message))
        send_message(PAT, sender, message, model)
    return "ok"
def _infer(config, inputs, download=False):
    chainer = build_model(config, download=download)
    if inputs:
        prediction = chainer(*inputs)
        if len(chainer.out_params) == 1:
            prediction = [prediction]
    else:
        prediction = []
    return prediction
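# Usage sketch for _infer, shown with a DeepPavlov NER config (any config
# accepted by build_model should work the same way; the unpacking below
# assumes a two-output pipeline of tokens and tags).
from deeppavlov import configs

tokens_batch, tags_batch = _infer(configs.ner.ner_ontonotes_bert_mult,
                                  (['Paris is the capital of France.'],),
                                  download=True)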
def __init__(self):
    self.model = build_model(configs.squad.squad, download=True)
    # self.model = ""
    self.stopwords = {
        "organizations", "sectors", "entities", "organization", "sector",
        "entity", "actor", "actors", "target", "targets", "compromises",
        "compromise", "threat", "threats", "computer", "computers",
        "network", "networks", "institute", "institutes", "republic",
        "middle", "purpose", "purposes", "firms", "firm", "application",
        "applications"}
    self.lemmatizer = WordNetLemmatizer()
def __init__(self, model=None, download_model=False, empty=False):
    if empty:
        return
    if model is None:
        self.model = build_model(configs.syntax.syntax_ru_syntagrus_bert, download=download_model)
    else:
        self.model = model
def __init__(self):
    self.model_deeppavlov = build_model(configs.syntax.syntax_ru_syntagrus_bert, download=True)
    # Russian coordinating conjunctions.
    self.coordinative_conjunction = [
        'и', 'да', 'ни-ни', 'тоже', 'также', 'а', 'но', 'зато',
        'однако', 'же', 'или', 'либо', 'то-то'
    ]
    self.morph = pymorphy2.MorphAnalyzer()
    self.like_root = ['acl:relcl', 'advcl', 'root', 'parataxis', 'ccomp']
    self.can_be_root = ['nsubj', 'conj']
def _deserialize(config, raw_bytes, examples):
    chainer = build_model(config, serialized=raw_bytes)
    for *query, expected_response in examples:
        query = [[q] for q in query]
        actual_response = chainer(*query)
        if expected_response is not None:
            if actual_response is not None and len(actual_response) > 0:
                actual_response = actual_response[0]
            # model_dir and conf_file come from the enclosing test scope.
            assert expected_response == str(actual_response), \
                f"Error in interacting with {model_dir} ({conf_file}): {query}"
def __init__(self, delay_init=False):
    if not delay_init:
        self.init()  # self.init() is assumed to be defined elsewhere in the class
    self.MODELNAME = 'ru_syntagrus_joint_parsing'
    self.model = build_model(self.MODELNAME, download=True)
    self._enable_tagger = True
    self._enable_parser = True
    self.converter_conll = ConverterConllUDV1()
def __init__(self, model_settings: dict, doc2vec: Doc2Vec,
             dist_class: Type[LinearizedDist] = Dist,
             linearization_settings: dict = {}):
    self.model_settings = model_settings
    model = build_model(model_settings, download=True)
    self.doc2vec = doc2vec
    self.dist = dist_class(model, self.doc2vec, linearization_settings)
    VPTreeSearchEngine.__init__(self, self.dist)
def _serialize(config):
    chainer = build_model(config, download=True)
    return chainer.serialize()
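# Round-trip sketch tying _serialize and _deserialize together. The config is
# a stand-in and the example tuple shape (context, question, expected) is an
# assumption; passing None as the expected response skips the assertion.
from deeppavlov import configs

cfg = configs.squad.squad
raw = _serialize(cfg)
_deserialize(cfg, raw, examples=[('Paris is the capital of France.',
                                  'What is the capital of France?',
                                  None)])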