def predict(self): f = open("model/data_map.pkl", "rb") maps = cPickle.load(f) f.close() self.batch_size = 1 self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(FLAGS.spm) self.train_length = 10 self.tag_map = maps.get("tag_map", {}) self.nums_tags = len(self.tag_map.values()) self.__creat_model() with tf.Session() as sess: ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir) if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): print("[->] restore model") self.saver.restore(sess, ckpt.model_checkpoint_path) else: print("[->] no model, initializing") sess.run(tf.global_variables_initializer()) trans = self.trans.eval() dataset = [] with open('data/test_raw_big.txt', 'r', encoding="utf-8") as f: for data in f.readlines(): ent, raw_con, info = data.replace('\n', '').split('\t') dataset.append([ent, raw_con, info]) f.close() for ele in dataset: info = ele[2] text = info feed = self.prepare_xlnet_pred_data(text) paths, length = sess.run([self.pred_ids, self.length], feed_dict=feed) print(format_tags(paths[0], self.tag_map)) org = get_tags(paths[0], "", self.tag_map) org_entity = format_result(org, text, "") per = get_tags(paths[0], "", self.tag_map) per_entity = format_result(per, text, "") loc = get_tags(paths[0], "", self.tag_map) loc_entity = format_result(loc, text, "") resp = org_entity["entities"] + per_entity[ "entities"] + loc_entity["entities"] ele.append(str(resp)) with open('data/test_result.txt', 'w', encoding="utf-8") as f1: for ele in dataset: f1.write(ele[2]) f1.write('\t') f1.write(ele[1]) f1.write('\t') f1.write(ele[3]) f1.write('\n') f1.close() '''
def predict(self): f = open("data/data_map.pkl", "rb") maps = cPickle.load(f) f.close() self.vocab = maps.get("vocab", {}) self.tag_map = maps.get("tag_map", {}) self.nums_tags = len(self.tag_map.values()) self.input_size = maps.get("input_size", 10000) + 1 self.__creat_model() with tf.Session() as sess: ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir) if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): print("[->] restore model") self.saver.restore(sess, ckpt.model_checkpoint_path) else: print("[->] no model, initializing") sess.run(tf.global_variables_initializer()) trans = self.trans.eval() while True: text = input(" > ") feed = self.prepare_pred_data(text) logits, length = sess.run([self.logits, self.length], feed_dict=feed) paths = self.decode(logits, length, trans) org = get_tags(paths[0], "ORG", self.tag_map) org_entity = format_result(org, text, "ORG") per = get_tags(paths[0], "PER", self.tag_map) per_entity = format_result(per, text, "PER") resp = org_entity["entities"] + per_entity["entities"] print(json.dumps(resp, indent=2, ensure_ascii=False))
def act(self, action):
    assert (self.context.strip() + action.strip())
    assert (settings.getint('top-keks') is not None)
    self.actions.append(format_result(action))
    result = self.generator.generate(
        self.get_story() + action,
        self.context + ' '.join(self.memory),
        temperature=settings.getfloat('temp'),
        top_p=settings.getfloat('top-p'),
        top_k=settings.getint('top-keks'),
        repetition_penalty=settings.getfloat('rep-pen'))
    self.results.append(format_result(result))
    return self.results[-1]
def predict(self, input_str="", input_path=None): if input_path is not None: tests = pd.read_csv(input_path) with open('output.txt', 'w', encoding='utf-8') as o: #o.write('id,aspect,opinion\n') for ids in range(1, 2235): input_str = self.get_string( str(tests.loc[ids - 1:ids - 1, ['Review']])) index = int( self.get_string(str(tests.loc[ids - 1:ids - 1, ['id']]))) input_vec = [self.vocab.get(i, 0) for i in input_str] # convert to tensor if (self.use_gpu): # gpu加速 sentences = torch.tensor(input_vec).view(1, -1).cuda() else: sentences = torch.tensor(input_vec).view(1, -1) _, paths = self.model(sentences) entities = [] for tag in self.tags: tags = get_tags(paths[0], tag, self.tag_map) entities += format_result(tags, input_str, tag) entities = sorted(entities, key=lambda x: x['start']) #print(str(index) + " " + input_str + " " +str(len(entities))) for entity in entities: #print(entity) o.write( str(index) + ',' + entity['type'] + ',' + entity['word'] + '\n') else: if not input_str: input_str = input("请输入文本: ") input_vec = [self.vocab.get(i, 0) for i in input_str] # convert to tensor if (self.use_gpu): # gpu加速 sentences = torch.tensor(input_vec).view(1, -1).cuda() else: sentences = torch.tensor(input_vec).view(1, -1) _, paths = self.model(sentences) entities = [] for tag in self.tags: tags = get_tags(paths[0], tag, self.tag_map) entities += format_result(tags, input_str, tag) return entities
def decode(s, **kwargs):
    # accept either a '0b'-prefixed or a bare binary string
    if s.startswith('0b'):
        s = int(s[2:], 2)
    else:
        s = int(s, 2)
    result = long_to_bytes(s)
    return format_result(s, hex(s), str(result, 'utf-8', errors='backslashreplace'))
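# Usage sketch (hedged): assuming long_to_bytes comes from Crypto.Util.number
# and format_result merely packages the (int, hex, text) triple, the bytes
# b'hi' (0x6869 == 0b110100001101001) decode like this:
#
#   decode('0b110100001101001')   # -> 26729, '0x6869', 'hi'
#   decode('110100001101001')     # same result without the '0b' prefix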
def test(self):
    with torch.no_grad():
        id2vocab = {self.vocab[i]: i for i in self.vocab}
        print(len(id2vocab))
        f = open('./result/test_tag.json', 'w')
        # rows are the tag types (component, disease&symptom, people);
        # columns are recall, precision, f1
        total_matrix = np.zeros([len(self.tags), 3])
        count = 0
        for batch in self.dev_manager.get_batch():
            count += 1
            print(count)
            sentences, labels, length = zip(*batch)
            strs = [[id2vocab[w] for w in s] for s in sentences]
            _, paths = self.model(sentences)
            for i in range(len(self.tags)):
                recall, precision, f1 = f1_score(labels, paths, self.tags[i],
                                                 self.model.tag_map)
                total_matrix[i][0] += recall
                total_matrix[i][1] += precision
                total_matrix[i][2] += f1
            entities = []
            for i in range(len(paths)):
                tmp = []
                for tag in self.tags:
                    tags = get_tags(paths[i], tag, self.tag_map)
                    tmp += format_result(tags, strs[i], tag)
                entities.append(tmp)
            for i in range(len(entities)):
                dic = {'sentense': ''.join(strs[i]), 'entities': entities[i]}
                json.dump(dic, f, ensure_ascii=False)
        # average the accumulated metrics over the number of batches
        total_matrix /= count
        for i in range(len(self.tags)):
            print("{}\tcount\t{}\trecall {:.2f}\tprecision {:.2f}\tf1 {:.2f}"
                  .format(count, self.tags[i], total_matrix[i][0],
                          total_matrix[i][1], total_matrix[i][2]))
        f.close()
def from_query(request):
    """
    Serve an index page that knows a user's location from their browser,
    with details passed in via query parameters.
    """
    lat = float(request.GET.get("lat", 0.0))
    lon = float(request.GET.get("lon", 0.0))
    conditions = weather.current_conditions(lat, lon)
    raining = weather.is_it_raining_at(lat, lon, conditions)
    return utils.format_result(raining, (lat, lon), conditions,
                               "Location determined from your browser")
def predict(self): f = open("model/data_map.pkl", "rb") maps = cPickle.load(f) f.close() self.batch_size = 1 self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(FLAGS.spm) self.train_length = 10 self.tag_map = maps.get("tag_map", {}) self.nums_tags = len(self.tag_map.values()) self.__creat_model() with tf.Session() as sess: ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir) if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): print("[->] restore model") self.saver.restore(sess, ckpt.model_checkpoint_path) else: print("[->] no model, initializing") sess.run(tf.global_variables_initializer()) trans = self.trans.eval() while True: text = input(" > ") feed = self.prepare_xlnet_pred_data(text) paths, length = sess.run([self.pred_ids, self.length], feed_dict=feed) print(format_tags(paths[0], self.tag_map)) org = get_tags(paths[0], "ORG", self.tag_map) org_entity = format_result(org, text, "ORG") per = get_tags(paths[0], "PER", self.tag_map) per_entity = format_result(per, text, "PER") loc = get_tags(paths[0], "LOC", self.tag_map) loc_entity = format_result(loc, text, "LOC") resp = org_entity["entities"] + per_entity[ "entities"] + loc_entity["entities"] print(json.dumps(resp, indent=2, ensure_ascii=False))
def predict(self, tag, input_str=""): model.load_state_dict(torch.load("./model/params.pkl")) if not input_str: input_str = input("请输入文本: ") input_vec = [word2id.get(i, 0) for i in input_str] # convert to tensor sentences = torch.tensor(input_vec).view(1, -1) paths = model(sentences) entities = [] tags = get_tags(paths[0], tag, tag2id) entities += format_result(tags, input_str, tag) print(entities)
def print_story(self, wrap=True, color=True):
    first_result = format_result(self.actions[0] + ' ' + self.results[0])
    col1 = 'user-text' if color else None
    col2 = 'ai-text' if color else None
    output(self.context, col1, first_result, col2, wrap=wrap)
    maxactions = len(self.actions)
    maxresults = len(self.results)
    for i in range(1, max(maxactions, maxresults)):
        if i < maxactions and self.actions[i].strip() != "":
            caret = "> " if re.match("^[Yy]ou +", self.actions[i]) else ""
            output(caret + self.actions[i], col1, wrap=wrap)
        if i < maxresults and self.results[i].strip() != "":
            output(self.results[i], col2, wrap=wrap)
def predict(self, input_str=""): if not input_str: input_str = input("请输入文本: ") input_vec = [self.vocab.get(i, 0) for i in input_str] # convert to tensor sentences = torch.tensor(input_vec).view(1, -1) _, paths = self.model(sentences) entities = [] for tag in self.tags: tags = get_tags(paths[0], tag, self.tag_map) entities += format_result(tags, input_str, tag) return entities
def print_action_result(self, i, wrap=True, color=True):
    col1 = 'user-text' if color else None
    col2 = 'ai-text' if color else None
    if i == 0 or len(self.actions) == 1:
        start = format_result(self.context + ' ' + self.actions[0])
        result = format_result(self.results[0])
        # does the start end on sentence-final punctuation?
        # (re.search, not re.match: we test the end of the string)
        is_start_end = re.search(r"[.!?]\s*$", start)
        # does the result read as a continuation?
        is_result_continue = re.match(r"^\s*[a-z.!?,\"]", result)
        sep = ' ' if not is_start_end and is_result_continue else '\n'
        if not self.actions[0]:
            output(self.context, col1, self.results[0], col2, sep=sep)
        else:
            output(self.context, col1)
            output(self.actions[0], col1, self.results[0], col2, sep=sep)
    else:
        if i < len(self.actions) and self.actions[i].strip() != "":
            caret = "> " if re.match(r"^ *you +", self.actions[i], flags=re.I) else ""
            output(format_result(caret + self.actions[i]), col1, wrap=wrap)
        if i < len(self.results) and self.results[i].strip() != "":
            output(format_result(self.results[i]), col2, wrap=wrap)
def encode(s, **kwargs):
    # use .get() so a missing key reports cleanly instead of raising KeyError
    if not kwargs.get('key'):
        print("Key not provided")
        return None
    if s.startswith('0x'):
        s = int(s[2:], 16)
    key = kwargs['key']
    if key.startswith('0x'):
        key = int(key[2:], 16)
    if isint(s) and isint(key):
        result = xor_int(s, key)
    else:
        result = xor_strings(s, key)
    return format_result(result, hex(result), long_to_bytes(result).decode('latin-1'))
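# Hedged sketch: isint, xor_int, and xor_strings are helpers the snippet above
# relies on but does not define. One plausible shape, given that format_result()
# receives an int alongside its hex form, is the following; the exact behavior
# of the real helpers is an assumption.
from Crypto.Util.number import bytes_to_long  # pycryptodome

def isint(x):
    # true for ints or decimal strings
    return isinstance(x, int) or (isinstance(x, str) and x.isdigit())

def xor_int(a, b):
    # XOR two integers directly
    return int(a) ^ int(b)

def xor_strings(s, key):
    # XOR the text against a repeating key, returning the result as an int
    sb, kb = s.encode('latin-1'), key.encode('latin-1')
    out = bytes(c ^ kb[i % len(kb)] for i, c in enumerate(sb))
    return bytes_to_long(out)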
def predict_model(model, word2id, tag2id):
    input_str = input("Enter text: ")
    # map each character of the input to its id
    input2id = [word2id.get(i, 0) for i in input_str]
    # convert to a tensor of shape (1, seq_len)
    sentences = torch.tensor(input2id).view(1, -1)
    # convert back to a list
    sentences = sentences.tolist()
    _, paths = model(sentences)
    entities = []
    for tag in ["ORG", "PER"]:
        positions = get_entity_position(paths[0], tag, tag2id)
        entities += format_result(positions, input_str, tag)
    # print the result
    print(entities)
def perform_result(self, items):
    """
    Filter and sort the search results.
    :param items: maximum number of elements in the output
    :return: formatted results
    """
    # drop empty results
    result = [i for i in self.counts.items() if len(i[1]) > 0]

    def sort_order(dict_pair):
        return (
            len(dict_pair[1]),  # number of matched words
            sum(map(lambda x: x[1], dict_pair[1].items()))  # total occurrences
        )

    # sort the results
    result.sort(key=sort_order, reverse=True)
    return map(lambda x: format_result(x[0], x[1]), result[:items])
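# Hedged usage sketch: self.counts is assumed to map a document key to a
# {word: occurrence_count} dict, which is what the len()/sum() sort keys imply.
# With counts like:
#
#   self.counts = {
#       'doc1.txt': {'cat': 2, 'dog': 1},   # 2 words matched, 3 hits
#       'doc2.txt': {'cat': 5},             # 1 word matched, 5 hits
#       'doc3.txt': {},                     # empty: filtered out
#   }
#
# perform_result(10) would rank doc1.txt before doc2.txt, since the primary
# sort key is the number of distinct matched words, not total hits.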
def predict(self, input_str=""): if not input_str: input_str = input("请输入文本: ") # 获取输入句子所有汉字的在vocab的索引 input_vec = [self.vocab.get(i, 0) for i in input_str] # convert to tensor sentences = torch.tensor(input_vec, dtype=torch.long).view(1, -1) sentences = sentences.cuda() # paths 预测出来的标签索引 shape 为 [1,1] _, paths = self.model(sentences) entities = [] # "tags": ["ORG", "PER"] for tag in self.tags: tags = get_tags(paths[0], tag, self.tag_map) entities += format_result(tags, input_str, tag) print(entities) print(json.dumps(entities, indent=4, ensure_ascii=False)) return entities
def predict(self, path):
    # collect sentences from the raw text file, splitting on the full stop
    sentences = []
    with open('./data/' + path + '.txt', 'r', encoding='utf-8') as f:
        for i in f:
            sentences += i.strip().split('。')
    f = open('./result/tag_' + path + '.json', 'w')
    for input_str in sentences:
        input_vec = [self.vocab.get(i, 0) for i in input_str]
        # convert to tensor; use a fresh name so the sentences list is not clobbered
        sentence_tensor = torch.tensor(input_vec).view(1, -1)
        _, paths = self.model(sentence_tensor)
        entities = []
        for tag in self.tags:
            tags = get_tags(paths[0], tag, self.tag_map)
            entities += format_result(tags, input_str, tag)
        dic = {'sentense': input_str, 'entities': entities}
        json.dump(dic, f, ensure_ascii=False)
    f.close()
def predict(text, config, params, is_export=False):
    """Model prediction."""
    # read the vocab and tag dictionaries
    vocab2id, id2vocab = read_vocab(config["vocab_file"])
    tag2id, id2tag = read_vocab(config["tag_file"])
    # build the model
    model = BiLSTMCRF(hidden_num=params["hidden_num"],
                      vocab_size=len(vocab2id),
                      label_size=len(tag2id),
                      embedding_size=params["embedding_size"])
    model.load_weights(config["ckpt_path"])
    # preprocess the input
    dataset = tf.keras.preprocessing.sequence.pad_sequences(
        [[vocab2id.get(char, 0) for char in text]],
        padding='post',
        maxlen=params["maxlen"])
    # run the model
    result = model.predict(dataset)[0]
    result = np.argmax(result, axis=-1)
    result = [id2tag[i] for i in result]
    print(result)
    # post-process the result
    entities_result = format_result(list(text), result)
    print(json.dumps(entities_result, indent=4, ensure_ascii=False))
    if is_export:
        # export the model
        tf.keras.models.save_model(model,
                                   config["export_dir"],
                                   overwrite=True,
                                   include_optimizer=True,
                                   save_format=None,
                                   options=None)
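# Hedged follow-up: a model exported with tf.keras.models.save_model() as above
# can be restored with the matching load call; config["export_dir"] is whatever
# path the config supplies.
#
#   restored = tf.keras.models.load_model(config["export_dir"])
#
# Depending on how BiLSTMCRF defines its custom layers (e.g. a CRF layer),
# loading may additionally require custom_objects; that detail is not shown in
# the snippet above.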
def predict(text, config, params):
    """Model prediction."""
    # read the vocab and tag dictionaries
    vocab2id, id2vocab = read_vocab(config["vocab_file"])
    tag2id, id2tag = read_vocab(config["tag_file"])
    # build the model
    model = BiLSTMCRF(hidden_num=params["hidden_num"],
                      vocab_size=len(vocab2id),
                      label_size=len(tag2id),
                      embedding_size=params["embedding_size"])
    model.load_weights(config["ckpt_path"])
    # preprocess the input
    dataset = tf.keras.preprocessing.sequence.pad_sequences(
        [[vocab2id.get(char, 0) for char in text]], padding='post')
    # run the model
    result = model.predict(dataset)[0]
    result = np.argmax(result, axis=-1)
    result = [id2tag[i] for i in result]
    print(result)
    # post-process the result
    entities_result = format_result(list(text), result)
    print(json.dumps(entities_result, indent=4, ensure_ascii=False))
import json

vocab2id, id2vocab = read_vocab(args.vocab_file)
tag2id, id2tag = read_vocab(args.tag_file)
text_sequences, label_sequences = tokenize(args.test_path, vocab2id, tag2id)

optimizer = tf.keras.optimizers.Adam(args.lr)
model = NerModel(hidden_num=args.hidden_num,
                 vocab_size=len(vocab2id),
                 label_size=len(tag2id),
                 embedding_size=args.embedding_size)

# restore model
ckpt = tf.train.Checkpoint(optimizer=optimizer, model=model)
ckpt.restore(tf.train.latest_checkpoint(args.output_dir))

while True:
    text = input("input:")
    dataset = tf.keras.preprocessing.sequence.pad_sequences(
        [[vocab2id.get(char, 0) for char in text]], padding='post')
    print(dataset)
    logits, text_lens = model.predict(dataset)
    paths = []
    for logit, text_len in zip(logits, text_lens):
        # Viterbi-decode the best tag path under the learned transition matrix
        viterbi_path, _ = tf_ad.text.viterbi_decode(logit[:text_len],
                                                    model.transition_params)
        paths.append(viterbi_path)
    print(paths[0])
    print([id2tag[id] for id in paths[0]])
    entities_result = format_result(list(text), [id2tag[id] for id in paths[0]])
    print(json.dumps(entities_result, indent=4, ensure_ascii=False))
def sample_sequence(model, length, context, temperature=1, top_k=0, top_p=0.9,
                    repetition_penalty=1.0, device="cpu", stop_tokens=None,
                    tokenizer=None):
    """Actually generate the tokens"""
    logger.debug('temp: {} top_k: {} top_p: {} rep-pen: {}'.format(
        temperature, top_k, top_p, repetition_penalty))
    context_tokens = context
    context = torch.tensor(context, dtype=torch.long, device=device)
    # context = context.repeat(num_samples, 1)
    generated = context
    USE_PAST = True
    next_token = context
    pasts = None
    clines = 0
    with torch.no_grad():
        for j in range(length):
            # why would we ever not use past?
            # is generated and next_token always same thing?
            if not USE_PAST:
                input_ids_next = generated
                pasts = None
            else:
                input_ids_next = next_token
            # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet/CTRL (cached hidden-states)
            logits, pasts = model(input_ids=input_ids_next, past=pasts)
            logits = logits[-1, :].float()
            # TODO: rewrite this logic
            if settings.getboolean('sparse-gen'):
                # entmax sampling; 'sparse-level' is assumed to be read the same
                # way as the other float settings
                probs = entmax_bisect(logits, dim=-1,
                                      alpha=settings.getfloat('sparse-level'))
                next_token = torch.multinomial(probs, num_samples=1)
            else:
                # Originally the order was Temperature, Repetition Penalty, then top-k/p
                if settings.getboolean('top-p-first'):
                    logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
                logits = logits / (temperature if temperature > 0 else 1.0)
                # repetition penalty from CTRL (https://arxiv.org/abs/1909.05858)
                for k in set(generated.tolist()):
                    logits[k] /= repetition_penalty
                if not settings.getboolean('top-p-first'):
                    logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
                if temperature == 0:  # greedy sampling
                    next_token = torch.argmax(logits, dim=-1).unsqueeze(-1)
                else:
                    next_token = torch.multinomial(F.softmax(logits, dim=-1),
                                                   num_samples=1)
            generated = torch.cat((generated, next_token), dim=-1)
            # Decode into plain text
            o = generated[len(context_tokens):].tolist()
            generated.text = tokenizer.decode(o,
                                              clean_up_tokenization_spaces=False,
                                              skip_special_tokens=True)
            if use_ptoolkit():
                clear_lines(clines)
                generated.text = format_result(generated.text)
                clines = output(generated.text, "ai-text")
            if (stop_tokens is not None) and (j > 4) and (next_token[0] in stop_tokens):
                # Why the minimum tokens, j > 4? Sometimes the model starts with
                # whitespace, which gets stripped away anyway; requiring a minimum
                # number of tokens means we don't stop just because of "\n " or similar.
                logger.debug(
                    "Stopping generation as we found stop tokens. One of `%s`, in '%s'. token generated `%s`",
                    stop_tokens, next_token, j)
                break
    clear_lines(clines)
    return generated
def decode(s, **kwargs):
    bs = b64decode(s)
    long_bs = bytes_to_long(bs)
    return format_result(long_bs, hex(long_bs), str(bs))
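# Usage sketch (hedged): with b64decode from the standard base64 module and
# bytes_to_long from Crypto.Util.number, decoding 'aGk=' (base64 for b'hi')
# yields the triple (26729, '0x6869', "b'hi'"). Note that str(bs) keeps the
# b'...' wrapper; bs.decode() would yield the bare text instead.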
def sample_sequence(model, length, context, temperature=1, top_k=0, top_p=0.9,
                    repetition_penalty=1.0, repetition_penalty_range=512,
                    repetition_penalty_slope=3.33, device="cpu",
                    stop_tokens=None, tokenizer=None):
    """Actually generate the tokens"""
    logger.debug(
        'temp: {} top_k: {} top_p: {} rep-pen: {} rep-pen-range: {} rep-pen-slope: {}'
        .format(temperature, top_k, top_p, repetition_penalty,
                repetition_penalty_range, repetition_penalty_slope))
    context_tokens = context
    context = torch.tensor(context, dtype=torch.long, device=device)
    # context = context.repeat(num_samples, 1)
    generated = context
    USE_PAST = True
    next_token = context
    pasts = None
    clines = 0
    penalty = None
    if repetition_penalty_range is not None and repetition_penalty_slope is not None and repetition_penalty_range > 0:
        # build a sigmoid-like ramp over the penalty window so recent tokens
        # are penalized more strongly than distant ones
        penalty = (torch.arange(repetition_penalty_range) /
                   (repetition_penalty_range - 1)) * 2. - 1
        penalty = (repetition_penalty_slope * penalty) / (
            1 + torch.abs(penalty) * (repetition_penalty_slope - 1))
        penalty = 1 + ((penalty + 1) / 2) * (repetition_penalty - 1)
    with torch.no_grad():
        for j in range(length):
            # why would we ever not use past?
            # is generated and next_token always same thing?
            if not USE_PAST:
                input_ids_next = generated
                pasts = None
            else:
                input_ids_next = next_token
            # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet/CTRL (cached hidden-states)
            model_kwargs = {"past": pasts, "use_cache": True}
            model_inputs = model.prepare_inputs_for_generation(
                generated.unsqueeze(0), **model_kwargs)
            model_outputs = model(**model_inputs, return_dict=True)
            logits, pasts = model_outputs.logits, model_outputs.past_key_values
            logits = logits[0, -1, :].float()
            # Originally the order was Temperature, Repetition Penalty, then top-k/p
            if settings.getboolean('top-p-first'):
                logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
            logits = logits / (temperature if temperature > 0 else 1.0)
            # repetition penalty from CTRL (https://arxiv.org/abs/1909.05858) plus range limit
            if repetition_penalty != 1.0:
                if penalty is not None:
                    penalty_len = min(generated.shape[0], repetition_penalty_range)
                    penalty_context = generated[-repetition_penalty_range:]
                    score = torch.gather(logits, 0, penalty_context)
                    penalty = penalty.type(score.dtype).to(score.device)
                    penalty_window = penalty[-penalty_len:]
                    score = torch.where(score < 0, score * penalty_window,
                                        score / penalty_window)
                    logits.scatter_(0, penalty_context, score)
                else:
                    score = torch.gather(logits, 0, generated)
                    score = torch.where(score < 0, score * repetition_penalty,
                                        score / repetition_penalty)
                    logits.scatter_(0, generated, score)
            if not settings.getboolean('top-p-first'):
                logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
            if temperature == 0:  # greedy sampling
                next_token = torch.argmax(logits, dim=-1).unsqueeze(-1)
            else:
                next_token = torch.multinomial(F.softmax(logits, dim=-1),
                                               num_samples=1)
            generated = torch.cat((generated, next_token), dim=-1)
            # Decode into plain text
            o = generated[len(context_tokens):].tolist()
            generated.text = tokenizer.decode(o,
                                              clean_up_tokenization_spaces=False,
                                              skip_special_tokens=True)
            if use_ptoolkit():
                clear_lines(clines)
                generated.text = format_result(generated.text)
                clines = output(generated.text, "ai-text")
            if (stop_tokens is not None) and (j > 4) and (next_token[0] in stop_tokens):
                # Why the minimum tokens, j > 4? Sometimes the model starts with
                # whitespace, which gets stripped away anyway; requiring a minimum
                # number of tokens means we don't stop just because of "\n " or similar.
                logger.debug(
                    "Stopping generation as we found stop tokens. One of `%s`, in '%s'. token generated `%s`",
                    stop_tokens, next_token, j)
                break
    clear_lines(clines)
    return generated
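# Hedged worked example of the penalty ramp above, runnable on its own. With
# repetition_penalty=1.2, range=8, slope=3.33 (small numbers for illustration),
# the ramp rises from 1.0 at the oldest position in the window to the full 1.2
# at the newest, so recent repeats are punished hardest.
import torch

repetition_penalty, rp_range, rp_slope = 1.2, 8, 3.33
penalty = (torch.arange(rp_range) / (rp_range - 1)) * 2. - 1        # [-1, 1]
penalty = (rp_slope * penalty) / (1 + torch.abs(penalty) * (rp_slope - 1))
penalty = 1 + ((penalty + 1) / 2) * (repetition_penalty - 1)        # [1, rep_pen]
print(penalty)
# ≈ tensor([1.0000, 1.0107, 1.0286, 1.0643, 1.1357, 1.1714, 1.1893, 1.2000])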