def read_squad_data(self, file):
    """ read squad data file in string form """
    logger("Reading SQuAD data.")

    def extract(sample_data):
        document = sample_data["context"]
        for qas in sample_data["qas"]:
            question = qas["question"]
            for ans in qas["answers"]:
                answer_len = len(ans["text"])
                answer_span = [ans["answer_start"], ans["answer_start"] + answer_len]
                assert ans["text"] == document[ans["answer_start"]:(ans["answer_start"] + answer_len)]
                documents.append(document)
                questions.append(question)
                answer_spans.append(answer_span)

    documents, questions, answer_spans = [], [], []
    f = json.load(open(file, encoding="utf-8"))
    data_list, version = f["data"], f["version"]
    logger("SQuAD version: {}".format(version))
    for data in data_list:
        for sample in data["paragraphs"]:
            extract(sample)
    if self.args.debug:
        documents, questions, answer_spans = documents[:500], questions[:500], answer_spans[:500]
    return documents, questions, answer_spans
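# A minimal sketch (values made up) of the SQuAD-style JSON that read_squad_data
# above walks: data -> paragraphs -> {context, qas}, where each answer carries the
# raw text and its character offset in the context.
squad_like_sample = {
    "version": "1.1",
    "data": [{
        "paragraphs": [{
            "context": "TensorFlow was released in 2015.",
            "qas": [{
                "question": "When was TensorFlow released?",
                "answers": [{"text": "2015", "answer_start": 27}],
            }],
        }],
    }],
}
# The assert in extract() checks exactly this invariant:
assert squad_like_sample["data"][0]["paragraphs"][0]["context"][27:31] == "2015"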
def load_weight(self):
    ckpt = tf.train.get_checkpoint_state(self.args.weight_path)
    if ckpt is not None:
        logger("Load models from {}.".format(ckpt.model_checkpoint_path))
        self.saver.restore(self.sess, ckpt.model_checkpoint_path)
    else:
        logger("No previous models.")
def gen_vocab(data_file, tokenizer=default_tokenizer, old_counter=None, max_count=None):
    """ generate vocabulary according to train corpus. """
    logger("Creating word dict from data {}.".format(data_file))
    word_counter = old_counter if old_counter else Counter()
    counter = 0
    with gfile.FastGFile(data_file) as f:
        for line in f:
            counter += 1
            if max_count and counter > max_count:
                break
            tokens = tokenizer(line.rstrip('\n'))
            word_counter.update(tokens)
            if counter % 100000 == 0:
                logger("Process line %d Done." % counter)
    # summary statistics
    total_words = sum(word_counter.values())
    distinct_words = len(list(word_counter))
    logger("STATISTICS" + "-" * 20)
    logger("Total words: " + str(total_words))
    logger("Total distinct words: " + str(distinct_words))
    return word_counter
def test(self):
    if not self.args.train:
        self.sess.run(tf.global_variables_initializer())
        self.load_weight()
    batch_size = self.args.batch_size
    batch_num = self.test_num // batch_size
    batch_num = batch_num + 1 if (self.test_num % batch_size) != 0 else batch_num
    correct_num, total_num = 0, 0
    result = list()
    for i in range(batch_num):
        data, samples = self.get_batch_data("test", i)
        # TODO: this can be removed once this test feed is deprecated
        data = dict(data, **{'keep_prob:0': 1.})
        if samples != 0:
            correct, pred = self.sess.run(
                [self.correct_prediction, self.prediction], feed_dict=data)
            correct_num, total_num = correct_num + correct, total_num + samples
            result.extend(pred.tolist())
    assert total_num == self.test_num == len(result)
    logger("Test on : {}/{}".format(total_num, self.test_num))
    test_acc = correct_num / total_num
    logger("Test accuracy is : {:.5f}".format(test_acc))
    res = {"model": self.model_name, "test_acc": test_acc}
    self.test_save(pred=result)
    save_obj_to_json(self.args.weight_path, res, "result.json")
def cbt_data_to_token_ids(self, data_file, target_file, vocab_file, max_count=None):
    """
    22 lines for one sample.
    first 20 lines: document sentences, each with its line number in front.
    21st line: line-number question\tAnswer\t\tCandidate1|...|Candidate10.
    22nd line: blank.
    """
    if gfile.Exists(target_file):
        return
    logger("Tokenizing data in {}".format(data_file))
    word_dict = self.load_vocab(vocab_file)
    counter = 0
    with gfile.FastGFile(data_file) as f:
        with gfile.FastGFile(target_file, mode="wb") as tokens_file:
            for line in f:
                counter += 1
                if counter % 100000 == 0:
                    logger("Tokenizing line %d" % counter)
                if max_count and counter > max_count:
                    break
                if counter % 22 == 21:
                    q, a, _, A = line.split("\t")
                    token_ids_q = self.sentence_to_token_ids(q, word_dict)[1:]
                    token_ids_A = [word_dict.get(cand.lower(), self.UNK_ID)
                                   for cand in A.rstrip("\n").split("|")]
                    tokens_file.write(" ".join([str(tok) for tok in token_ids_q]) + "\t" +
                                      str(word_dict.get(a.lower(), self.UNK_ID)) + "\t" +
                                      "|".join([str(tok) for tok in token_ids_A]) + "\n")
                else:
                    token_ids = self.sentence_to_token_ids(line, word_dict)
                    token_ids = token_ids[1:] if token_ids else token_ids
                    tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
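# Illustrative only (the token ids are made up): for the 21st line of a CBT block,
# cbt_data_to_token_ids above writes "question ids \t answer id \t candidate ids
# joined by |", which can be read back like this:
line = "14 9 87 3 5\t42\t42|17|95"
q_ids, answer_id, cand_ids = line.split("\t")
q_ids = [int(tok) for tok in q_ids.split()]
cand_ids = [int(tok) for tok in cand_ids.split("|")]
print(q_ids, answer_id, cand_ids)  # [14, 9, 87, 3, 5] 42 [42, 17, 95]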
def testAllFile(self):
    log.logger().info("======= Start running all test case files =======")
    # create the test utility object
    commonUtils = CommonUtils()
    # run every test case file under the given data directory
    commonUtils.executeAllFile("../data")
    # send the test report to the configured mailbox
    commonUtils.exportReport()
def testSingleFile(self):
    log.logger().info("======= Start running a single test case file =======")
    # create the test utility object
    commonUtils = CommonUtils()
    # run the single test case file at the given path
    commonUtils.executeSingleFile("../data/case1.xls")
    # send the test report to the configured mailbox
    commonUtils.exportReport()
def load_weight(self):
    ckpt = tf.train.get_checkpoint_state(self.args.weight_path)
    if ckpt is not None and ckpt.model_checkpoint_path.startswith(
            os.path.join(self.args.weight_path, self.__class__.__name__)):
        logger("Load models from {}.".format(ckpt.model_checkpoint_path))
        self.saver.restore(self.sess, ckpt.model_checkpoint_path)
    else:
        logger("No previous models. model :%s" % self.__class__.__name__)
def random_proxy(self, protocal=None, domain=None, nick_type=0, count=0):
    """ Return one random proxy IP that satisfies the given conditions. """
    try:
        proxy_list = self.get_proxies(protocol=protocal, domain=domain,
                                      count=count, nick_type=nick_type)
        return random.choice(proxy_list)
    except Exception as e:
        logger("No proxy IP currently satisfies the given conditions.")
def save_weight(self, val_acc, step):
    path = self.saver.save(
        self.sess,
        os.path.join(self.args.weight_path,
                     "{}-val_acc-{:.4f}.models".format(self.model_name, val_acc)),
        global_step=step)
    logger("Save models to {}.".format(path))
def get_word_index(self, path=None):
    if not path:
        path = self.args.tmp_dir + self.__class__.__name__ + self.args.word_file
    word2id = dict()
    with open(path, mode='r', encoding='utf-8') as f:
        for l in f:
            word2id.setdefault(l.strip(), len(word2id))
    logger('Word2id size : %d' % len(word2id))
    return word2id
def early_stopping(self, val_acc, val_loss, step):
    if val_acc > self.best_val_acc:
        self.patience = self.args.patience
        self.best_val_acc = val_acc
        self.save_weight(val_acc, step)
    elif self.patience == 1:
        logger("Out of patience, stop training.")
        exit(0)
    else:
        self.patience -= 1
        logger("Remaining/Patience : {}/{} .".format(self.patience, self.args.patience))
def softmax_with_mask(logits, axis, mask, epsilon=10e-8, name=None):
    # subtract the max for numerical stability, then apply a masked softmax
    with tf.name_scope(name, 'softmax', [logits, mask]):
        max_axis = tf.reduce_max(logits, axis, keep_dims=True)
        target_exp = tf.exp(logits - max_axis) * mask
        normalize = tf.reduce_sum(target_exp, axis, keep_dims=True)
        softmax = target_exp / (normalize + epsilon)
        logger("softmax shape {}".format(softmax.get_shape()))
        return softmax
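# A NumPy sketch of the same masked-softmax arithmetic (not the project's API):
# subtract the per-row max for stability, exponentiate, zero out the masked
# positions, then renormalize with a small epsilon to avoid division by zero.
import numpy as np

def softmax_with_mask_np(logits, mask, axis=-1, epsilon=1e-8):
    max_axis = np.max(logits, axis=axis, keepdims=True)
    target_exp = np.exp(logits - max_axis) * mask
    normalize = np.sum(target_exp, axis=axis, keepdims=True)
    return target_exp / (normalize + epsilon)

logits = np.array([[2.0, 1.0, -1.0, 0.0]])
mask = np.array([[1.0, 1.0, 1.0, 0.0]])    # last position is padding
print(softmax_with_mask_np(logits, mask))  # padded slot gets ~0 probability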
def token_idx_map(self, context, answer_span):
    logger("Convert answer to position in the context.")
    answer_se = []
    for i in range(len(context)):
        answer_tokens = process_tokens(
            default_tokenizer(context[i][answer_span[i][0]:answer_span[i][1]]))
        con = process_tokens(default_tokenizer(context[i][:answer_span[i][0]]))
        a_start_idx = len(con)
        a_end_idx = len(con) + len(answer_tokens) - 1
        answer_se.append([a_start_idx, a_end_idx])
    return answer_se
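# A toy walk-through of token_idx_map's span conversion, assuming the real
# tokenizer behaves roughly like whitespace splitting (default_tokenizer and
# process_tokens are replaced by str.split here, so this is only illustrative).
def toy_token_idx(context, span):
    answer_tokens = context[span[0]:span[1]].split()
    prefix_tokens = context[:span[0]].split()
    start = len(prefix_tokens)
    return [start, start + len(answer_tokens) - 1]

# character span [8, 11) covers "sat", which is the third token (index 2)
print(toy_token_idx("the cat sat on the mat", [8, 11]))  # -> [2, 2]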
def save_weight(self, val_acc, step):
    path = self.saver.save(
        self.sess,
        os.path.join(self.args.weight_path,
                     "{}-val_acc-{:.4f}.models-{}".format(
                         self.model_name, val_acc, datetime.datetime.now())),
        global_step=step)
    if self.args.tensorboard and self.args.visualize_embedding:
        visualize_embedding(word2id=self.dataset.word2id,
                            embedding_matrix_name=self.embedding.name,
                            writer=self.writer)
    logger("Save models to {}.".format(path))
def read_cbt_data(self, file, max_count=None):
    """
    read CBT data in id format.
    :return: (documents, questions, answers, candidates), each element is a numpy array.
    """
    documents, questions, answers, candidates = [], [], [], []
    with FastGFile(file, mode="r") as f:
        counter = 0
        d, q, a, A = [], [], [], []
        for line in f:
            counter += 1
            if max_count and counter > max_count:
                break
            if counter % 100000 == 0:
                logger("Reading line %d in %s" % (counter, file))
            if counter % 22 == 21:
                tmp = line.strip().split("\t")
                q = tmp[0].split(" ") + [self.EOS_ID]
                a = [1 if tmp[1] == i else 0 for i in d]
                A = tmp[2].split("|")
                A.remove(tmp[1])
                A.insert(0, tmp[1])  # put the correct answer first among the candidates
            elif counter % 22 == 0:
                documents.append(d)
                questions.append(q)
                answers.append(a)
                candidates.append(A)
                d, q, a, A = [], [], [], []
            else:
                d.extend(line.strip().split(" ") + [self.EOS_ID])  # append the EOS id at the end of each sentence
    d_lens = [len(i) for i in documents]
    q_lens = [len(i) for i in questions]
    avg_d_len = reduce(lambda x, y: x + y, d_lens) / len(documents)
    logger("Document average length: %d." % avg_d_len)
    logger("Document median length: %d." % len(sorted(documents, key=len)[len(documents) // 2]))
    avg_q_len = reduce(lambda x, y: x + y, q_lens) / len(questions)
    logger("Question average length: %d." % avg_q_len)
    logger("Question median length: %d." % len(sorted(questions, key=len)[len(questions) // 2]))
    return documents, questions, answers, candidates
def draw_graph(self):
    log_file = '../logs/log-%s-%s-%s-emb%d-id%s' % (
        self.args.activation, self.args.dataset, self.args.rnn_type,
        self.args.embedding_dim, str(datetime.datetime.now()))
    self.writer = tf.summary.FileWriter(log_file)
    self.writer.add_graph(self.sess.graph)
    tf.summary.scalar('loss', self.loss)
    tf.summary.scalar('accuracy', self.accuracy)
    config = projector.ProjectorConfig()
    embedding_conf = config.embeddings.add()
    embedding_conf.tensor_name = 'embedding_matrix'
    # embedding_conf.metadata_path = os.path.join(log_file, 'metadata.tsv')
    projector.visualize_embeddings(self.writer, config)
    self.merged_summary = tf.summary.merge_all()
    logger('Save log to %s' % log_file)
def squad_data_to_idx(self, vocab_file, *args):
    """ convert string list to index list form. """
    logger("Convert string data to index.")
    word_dict = self.load_vocab(vocab_file)
    res_data = [0, ] * len(args)
    for idx, i in enumerate(args):
        tmp = [self.sentence_to_token_ids(document, word_dict) for document in i]
        res_data[idx] = tmp.copy()
    logger("Convert string2index done.")
    return res_data
def prepare_data(self, data_dir, train_file, valid_file, max_vocab_num, output_dir=""):
    """ build word vocabulary and character vocabulary. """
    if not gfile.Exists(os.path.join(data_dir, output_dir)):
        os.mkdir(os.path.join(data_dir, output_dir))
    os_train_file = os.path.join(data_dir, train_file)
    os_valid_file = os.path.join(data_dir, valid_file)
    vocab_file = os.path.join(data_dir, output_dir, "vocab.%d" % max_vocab_num)
    char_vocab_file = os.path.join(data_dir, output_dir, "char_vocab")
    vocab_data_file = os.path.join(data_dir, output_dir, "data.txt")

    def save_data(d_data, q_data):
        """ save all data to a file and use it to build the vocabulary. """
        # append so that both the train and valid text end up in the vocabulary data file
        with open(vocab_data_file, mode="a", encoding="utf-8") as f:
            f.write("\t".join(d_data) + "\n")
            f.write("\t".join(q_data) + "\n")

    if not gfile.Exists(vocab_data_file):
        d, q, _ = self.read_squad_data(os_train_file)
        v_d, v_q, _ = self.read_squad_data(os_valid_file)
        save_data(d, q)
        save_data(v_d, v_q)
    if not gfile.Exists(vocab_file):
        logger("Start creating vocabulary.")
        word_counter = self.gen_vocab(vocab_data_file, max_count=self.args.max_count)
        self.save_vocab(word_counter, vocab_file, max_vocab_num)
    if not gfile.Exists(char_vocab_file):
        logger("Start creating character vocabulary.")
        char_counter = self.gen_char_vocab(vocab_data_file)
        self.save_char_vocab(char_counter, char_vocab_file, max_vocab_num=70)
    return os_train_file, os_valid_file, vocab_file, char_vocab_file
def test(self):
    if not self.args.train:
        self.sess.run(tf.global_variables_initializer())
        self.load_weight()
    batch_size = self.args.batch_size
    batch_num = self.test_num // batch_size
    batch_num = batch_num + 1 if (self.test_num % batch_size) != 0 else batch_num
    correct_num, total_num = 0, 0
    for i in range(batch_num):
        data, samples = self.get_batch_data("test", i)
        if samples != 0:
            correct, = self.sess.run([self.correct_prediction], feed_dict=data)
            correct_num, total_num = correct_num + correct, total_num + samples
    assert total_num == self.test_num
    logger("Test on : {}/{}".format(total_num, self.test_num))
    test_acc = correct_num / total_num
    logger("Test accuracy is : {:.5f}".format(test_acc))
    res = {"model": self.model_name, "test_acc": test_acc}
    save_obj_to_json(self.args.weight_path, res, "result.json")
def doGET(self, url, params, headers):
    # execute a GET request
    result = requests.get(url=url, params=params, headers=headers, timeout=10)
    # a 200 status code means the request succeeded
    if result.status_code == 200:
        # extract the Cookie information from the response headers
        self.setCookies(result.headers)
        # the body may not be JSON, so guard the parsing with try/except
        try:
            # return the JSON-decoded response
            return result.json()
        except:
            # fall back to the raw text response
            return result.text
    else:
        # log the error status code
        log.logger().info("Request failed, status code: %d" % result.status_code)
        try:
            error = result.raise_for_status()
            # log the error details
            log.logger().info("Request failed, error: %s" % str(error))
        except Exception as e:
            # log the raised exception
            log.logger().info("Request failed, error: %s" % e)
        # return an empty dict so callers do not crash
        return {}
def gen_embeddings(word_dict, embed_dim, in_file=None, init=np.random.uniform):
    """ Init embedding matrix with (or without) pre-trained word embeddings. """
    num_words = max(word_dict.values()) + 1
    # init is expected to take (low, high, shape), e.g. np.random.uniform
    embedding_matrix = init(-0.05, 0.05, (num_words, embed_dim))
    logger('Embeddings: %d x %d' % (num_words, embed_dim))
    if not in_file:
        return embedding_matrix

    def get_dim(file):
        first = gfile.FastGFile(file, mode='r').readline()
        return len(first.split()) - 1

    assert get_dim(in_file) == embed_dim
    logger('Loading embedding file: %s' % in_file)
    pre_trained = 0
    for line in codecs.open(in_file, encoding="utf-8"):
        sp = line.split()
        if sp[0] in word_dict:
            pre_trained += 1
            embedding_matrix[word_dict[sp[0]]] = np.asarray([float(x) for x in sp[1:]], dtype=np.float32)
    logger("Pre-trained: {}, {:.3f}%".format(pre_trained, pre_trained * 100.0 / num_words))
    return embedding_matrix
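# Hypothetical usage of gen_embeddings above: the embedding text file is assumed
# to be GloVe-style, one "word v1 v2 ... v_dim" row per line, which is what
# get_dim() verifies against embed_dim. The path and vocabulary are made up.
word_dict = {"the": 1, "cat": 2}  # id 0 is usually reserved for padding
embedding_matrix = gen_embeddings(word_dict, embed_dim=100,
                                  in_file="glove.6B.100d.txt",
                                  init=np.random.uniform)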
def squad_data_to_idx(self, vocab_file, char_vocab_file=None, *args):
    """ convert string list to index list form. """
    logger("Convert string data to index.")
    word_dict = self.load_vocab(vocab_file)
    if self.args.use_char_embedding:
        char_dict = self.load_vocab(self.char_vocab_file)
    res_data = []
    for idx, i in enumerate(args):
        tmp = [self.sentence_to_token_ids(document, word_dict) for document in i]
        res_data.append(tmp.copy())
        if self.args.use_char_embedding:
            tmp_c = [self.words_to_char_ids(document, char_dict) for document in i]
            res_data.append(tmp_c.copy())
    logger("Convert string2index done.")
    return res_data
def validate(self):
    batch_size = self.args.batch_size
    v_batch_num = self.valid_nums // batch_size
    # ensure the entire valid set is selected
    v_batch_num = v_batch_num + 1 if (self.valid_nums % batch_size) != 0 else v_batch_num
    # logger("Validate on {} batches, {} samples per batch, {} total."
    #        .format(v_batch_num, batch_size, self.valid_nums))
    val_num, val_corrects, v_loss = 0, 0, 0
    preds = list()
    for i in range(v_batch_num):
        data, samples = self.get_batch_data("valid", i)
        # TODO: this can be removed once this validation feed is deprecated
        data = dict(data, **{'keep_prob:0': 1.})
        if samples != 0:
            loss, v_correct, prediction = self.sess.run(
                [self.loss, self.correct_prediction, self.prediction], feed_dict=data)
            val_num += samples
            val_corrects += v_correct
            v_loss += loss * samples
            preds.extend(prediction.tolist())
    # call the custom metric
    # self.metric(preds=preds, label=self.dataset.valid_y.tolist())
    assert val_num == self.valid_nums
    val_acc = val_corrects / val_num
    val_loss = v_loss / val_num
    logger("Evaluate on : {}/{}.\tVal acc : {:.4f}.\tVal Loss : {:.4f}.\tBest acc : {:.4f}.\tDataset : {}."
           .format(val_num, self.valid_nums, val_acc, val_loss, self.best_val_acc, self.args.dataset))
    return val_acc, val_loss
def execute(self):
    """ main method to train and test """
    # self.confirm_model_dataset_fitness()
    self.dataset = getattr(sys.modules["tf.datasets"], self.args.dataset)(self.args)
    if hasattr(self.dataset, 'get_embedding_matrix'):
        self.embedding_matrix = self.dataset.get_embedding_matrix(is_char_embedding=False)
    else:
        logger('No get_embedding_matrix found in dataset %s, using random initialization'
               % self.dataset.__class__.__name__)
    self.max_len = self.dataset.max_len
    self.word2id_size = self.dataset.word2id_size
    self.train_nums, self.valid_nums, self.test_num = (
        self.dataset.train_nums, self.dataset.valid_nums, self.dataset.test_nums)
    self.num_class = self.dataset.num_class

    self.create_model()
    self.make_sure_model_is_valid()
    self.saver = tf.train.Saver(max_to_keep=20)
    if self.args.train:
        if self.args.tensorboard:
            self.draw_graph()
        self.train()
    if self.args.test:
        self.test()
    self.sess.close()
def get_equipment_image(verbose=False):
    url = "https://wiki.biligame.com/pcr/%E8%A3%85%E5%A4%87%E4%B8%80%E8%A7%88"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    table = soup.select('#wiki_table')[0]
    equipments = table.select('span')
    result = {}
    save_path = os.path.join(app_config["data_path"], "equipments")
    if not os.path.exists(save_path):
        try:
            os.mkdir(save_path)
        except OSError:
            logger("Error", "Create download path %s failed!" % save_path, verbose)
    for index, equip in enumerate(equipments):
        logger("Info", "Downloading equipment image %d/%d" % (index, len(equipments)), verbose)
        name = equip.select('a')[0]["title"]
        image_url = equip.select('img')[0]["src"]
        equip_save_path = os.path.join(save_path, "%d.png" % index)
        urllib.request.urlretrieve(image_url, equip_save_path)
        result[name] = {
            "id": index,
            "equipment_name": name,
            "equipment_image_url": image_url,
            "equipment_image_path": "%d.png" % index
        }
    with open(os.path.join(save_path, "equipments.json"), "w+") as file:
        file.write(json.dumps(result, indent=4, ensure_ascii=False))
    return result
def validate(self):
    batch_size = self.args.batch_size
    v_batch_num = self.valid_nums // batch_size
    # ensure the entire valid set is selected
    v_batch_num = v_batch_num + 1 if (self.valid_nums % batch_size) != 0 else v_batch_num
    logger("Validate on {} batches, {} samples per batch, {} total.".format(
        v_batch_num, batch_size, self.valid_nums))
    val_num, val_corrects, v_loss = 0, 0, 0
    for i in range(v_batch_num):
        data, samples = self.get_batch_data("valid", i)
        if samples != 0:
            loss, v_correct = self.sess.run([self.loss, self.correct_prediction], feed_dict=data)
            val_num += samples
            val_corrects += v_correct
            v_loss += loss * samples
    assert val_num == self.valid_nums
    val_acc = val_corrects / val_num
    val_loss = v_loss / val_num
    logger("Evaluate on : {}/{}.\tVal acc : {:.4f}.\tVal Loss : {:.4f}.".format(
        val_num, self.valid_nums, val_acc, val_loss))
    return val_acc, val_loss
def gen_char_vocab(data_file, tokenizer=default_tokenizer, old_counter=None):
    """ generate character level vocabulary according to train corpus. """
    logger("Creating character dict from data {}.".format(data_file))
    char_counter = old_counter if old_counter else Counter()
    with gfile.FastGFile(data_file) as f:
        for line in f:
            tokens = tokenizer(line.rstrip("\n"))
            char_counter.update([char for word in tokens for char in word])
    # summary statistics
    total_chars = sum(char_counter.values())
    distinct_chars = len(list(char_counter))
    logger("STATISTICS" + "-" * 20)
    logger("Total characters: " + str(total_chars))
    logger("Total distinct characters: " + str(distinct_chars))
    return char_counter
def train(self):
    """ train model """
    self.step = tf.Variable(0, name="global_step", trainable=False)
    batch_size = self.args.batch_size
    epochs = self.args.num_epoches
    self.get_train_op()
    self.sess.run(tf.global_variables_initializer())
    self.load_weight()

    # early stopping params, by default val_acc is the metric
    self.patience, self.best_val_acc = self.args.patience, 0.

    # start training
    corrects_in_epoch, samples_in_epoch, loss_in_epoch = 0, 0, 0
    batch_num = self.train_nums // batch_size
    logger("Train on {} batches, {} samples per batch, {} total.".format(
        batch_num, batch_size, self.train_nums))
    step = self.sess.run(self.step)
    while step < batch_num * epochs:
        step = self.sess.run(self.step)
        # on epoch start
        if step % batch_num == 0:
            corrects_in_epoch, samples_in_epoch, loss_in_epoch = 0, 0, 0
            logger("{}Epoch : {}{}".format("-" * 40, step // batch_num + 1, "-" * 40))
            self.dataset.shuffle()
        data, samples = self.get_batch_data("train", step % batch_num)
        loss, _, corrects_in_batch = self.sess.run(
            [self.loss, self.train_op, self.correct_prediction], feed_dict=data)
        corrects_in_epoch += corrects_in_batch
        loss_in_epoch += loss * samples
        samples_in_epoch += samples
        # logging
        if step % self.args.print_every_n == 0:
            logger("Samples : {}/{}.\tStep : {}/{}.\tLoss : {:.4f}.\tAccuracy : {:.4f}".format(
                samples_in_epoch, self.train_nums, step % batch_num, batch_num,
                loss_in_epoch / samples_in_epoch, corrects_in_epoch / samples_in_epoch))
        # evaluate on the valid set and early stopping
        if step and step % self.args.evaluate_every_n == 0:
            val_acc, val_loss = self.validate()
            self.early_stopping(val_acc, val_loss, step)
def get_characters_illustration(verbose=False):
    logger("Info", "Start downloading characters' illustrations...", verbose)
    url = "https://wiki.biligame.com/pcr/%E8%A7%92%E8%89%B2%E5%9B%BE%E9%89%B4"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    characters_data = soup.select(".box-js")
    result = {}
    # create the download directory
    save_path = os.path.join(app_config["data_path"], "illustrations")
    if not os.path.exists(save_path):
        try:
            os.mkdir(save_path)
        except OSError:
            logger("Error", "Create download path %s failed!" % save_path, verbose)
    # download the illustrations
    for index, character in enumerate(characters_data):
        logger("Info", "Downloading illustrations %d/%d" % (index + 1, len(characters_data)), verbose)
        character_name = character.select("a")[0]["title"]
        character_illustration_url = character.select("img")[0]["src"]
        illustration_save_path = os.path.join(save_path, "%d.jpg" % index)
        urllib.request.urlretrieve(character_illustration_url, illustration_save_path)
        result[character_name] = {
            "id": index,
            "character_name": character_name,
            "illustration_url": character_illustration_url,
            "illustration_path": "%d.jpg" % index
        }
    logger("OK", "Characters' illustrations have been downloaded to %s!" % save_path, verbose)
    with open(os.path.join(save_path, "illustration.json"), "w+") as file:
        file.write(json.dumps(result, indent=4, ensure_ascii=False))
    return result