Example #1
    def read_squad_data(self, file):
        """
        read squad data file in string form
        """
        logger("Reading SQuAD data.")

        def extract(sample_data):
            document = sample_data["context"]
            for qas in sample_data["qas"]:
                question = qas["question"]
                for ans in qas["answers"]:
                    answer_len = len(ans["text"])
                    answer_span = [
                        ans["answer_start"], ans["answer_start"] + answer_len
                    ]
                    assert (ans["text"] == document[ans["answer_start"]:(
                        ans["answer_start"] + answer_len)])
                    documents.append(document)
                    questions.append(question)
                    answer_spans.append(answer_span)

        documents, questions, answer_spans = [], [], []
        with open(file, encoding="utf-8") as fp:
            f = json.load(fp)
        data_list, version = f["data"], f["version"]
        logger("SQuAD version: {}".format(version))
        for data in data_list:
            for sample in data["paragraphs"]:
                extract(sample)
        if self.args.debug:
            documents = documents[:500]
            questions = questions[:500]
            answer_spans = answer_spans[:500]
        return documents, questions, answer_spans
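
For orientation, a minimal sketch of the SQuAD-style JSON layout this reader assumes; the field names match the code above, while the sample text and offset are made up.

# Hypothetical miniature sample with the keys accessed by extract().
sample = {
    "context": "TensorFlow was released in 2015.",
    "qas": [{
        "question": "When was TensorFlow released?",
        "answers": [{"text": "2015", "answer_start": 27}],
    }],
}
# The assert inside extract() checks exactly this invariant:
ans = sample["qas"][0]["answers"][0]
span = [ans["answer_start"], ans["answer_start"] + len(ans["text"])]
assert sample["context"][span[0]:span[1]] == ans["text"]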
Example #2
 def load_weight(self):
     ckpt = tf.train.get_checkpoint_state(self.args.weight_path)
     if ckpt is not None:
         logger("Load models from {}.".format(ckpt.model_checkpoint_path))
         self.saver.restore(self.sess, ckpt.model_checkpoint_path)
     else:
         logger("No previous models.")
Example #3
    def gen_vocab(data_file, tokenizer=default_tokenizer, old_counter=None, max_count=None):
        """
        generate vocabulary according to train corpus.
        """
        logger("Creating word dict from data {}.".format(data_file))
        word_counter = old_counter if old_counter else Counter()
        counter = 0
        with gfile.FastGFile(data_file) as f:
            for line in f:
                counter += 1
                if max_count and counter > max_count:
                    break
                tokens = tokenizer(line.rstrip('\n'))
                word_counter.update(tokens)
                if counter % 100000 == 0:
                    logger("Process line %d Done." % counter)

        # summary statistics
        total_words = sum(word_counter.values())
        distinct_words = len(list(word_counter))

        logger("STATISTICS" + "-" * 20)
        logger("Total words: " + str(total_words))
        logger("Total distinct words: " + str(distinct_words))

        return word_counter
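
A self-contained sketch of what the counting loop accumulates, using str.split in place of default_tokenizer (which is defined elsewhere in the project); the toy corpus is invented.

from collections import Counter

word_counter = Counter()
for line in ["the quick brown fox", "the lazy dog"]:
    word_counter.update(line.split())
# sum(word_counter.values()) == 7 total words,
# len(word_counter) == 6 distinct words, word_counter["the"] == 2.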
Example #4
    def test(self):
        if not self.args.train:
            self.sess.run(tf.global_variables_initializer())
            self.load_weight()
        batch_size = self.args.batch_size
        batch_num = self.test_num // batch_size
        batch_num = batch_num + 1 if (self.test_num %
                                      batch_size) != 0 else batch_num
        correct_num, total_num = 0, 0
        result = list()
        for i in range(batch_num):
            data, samples = self.get_batch_data("test", i)
            # TODO: this can be removed if this test path is deprecated
            data = dict(data, **{'keep_prob:0': 1.})

            if samples != 0:
                correct, pred = self.sess.run(
                    [self.correct_prediction, self.prediction], feed_dict=data)
                correct_num, total_num = correct_num + correct, total_num + samples
                result.extend(pred.tolist())
        assert (total_num == self.test_num == len(result))
        logger("Test on : {}/{}".format(total_num, self.test_num))
        test_acc = correct_num / total_num
        logger("Test accuracy is : {:.5f}".format(test_acc))
        res = {"model": self.model_name, "test_acc": test_acc}
        self.test_save(pred=result)
        save_obj_to_json(self.args.weight_path, res, "result.json")
Example #5
    def cbt_data_to_token_ids(self, data_file, target_file, vocab_file, max_count=None):
        """
        22 lines for one sample.
        first 20 lines:documents with line number in the front.
        21st line:line-number question\tAnswer\t\tCandidate1|...|Candidate10.
        22nd line:blank.
        """
        if gfile.Exists(target_file):
            return
        logger("Tokenizing data in {}".format(data_file))
        word_dict = self.load_vocab(vocab_file)
        counter = 0

        with gfile.FastGFile(data_file) as f:
            with gfile.FastGFile(target_file, mode="wb") as tokens_file:
                for line in f:
                    counter += 1
                    if counter % 100000 == 0:
                        logger("Tokenizing line %d" % counter)
                    if max_count and counter > max_count:
                        break
                    if counter % 22 == 21:
                        q, a, _, A = line.split("\t")
                        token_ids_q = self.sentence_to_token_ids(q, word_dict)[1:]
                        token_ids_A = [word_dict.get(a.lower(), self.UNK_ID) for a in A.rstrip("\n").split("|")]
                        tokens_file.write(" ".join([str(tok) for tok in token_ids_q]) + "\t"
                                          + str(word_dict.get(a.lower(), self.UNK_ID)) + "\t"
                                          + "|".join([str(tok) for tok in token_ids_A]) + "\n")
                    else:
                        token_ids = self.sentence_to_token_ids(line, word_dict)
                        token_ids = token_ids[1:] if token_ids else token_ids
                        tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
Example #6
 def testAllFile(self):
     log.logger().info("=======开始执行所有测试用例文件=======")
     # 创建测试工具类对象
     commonUtils = CommonUtils()
     # 调用测试所有用例方法传入要执行用例的文件夹路径
     commonUtils.executeAllFile("../data")
     # 调用发送报告方法将测试结果发送到指定邮箱
     commonUtils.exportReport()
Example #7
 def testSingleFile(self):
     log.logger().info("=======开始执行单个测试用例文件=======")
     # 创建测试工具类对象
     commonUtils = CommonUtils()
     # 调用测试所有用例方法传入要执行用例文件路径
     commonUtils.executeSingleFile("../data/case1.xls")
     # 调用发送报告方法将测试结果发送到指定邮箱
     commonUtils.exportReport()
Example #8
 def load_weight(self):
     ckpt = tf.train.get_checkpoint_state(self.args.weight_path)
     if ckpt is not None and ckpt.model_checkpoint_path.startswith(
             os.path.join(self.args.weight_path, self.__class__.__name__)):
         logger("Load models from {}.".format(ckpt.model_checkpoint_path))
         self.saver.restore(self.sess, ckpt.model_checkpoint_path)
     else:
         logger("No previous models. model :%s" % self.__class__.__name__)
Example #9
 def random_proxy(self, protocal=None, domain=None, nick_type=0, count=0):
     """返回满足要求的一个随机代理IP
     """
     try:
         proxy_list = self.get_proxies(protocol=protocal, domain=domain, count=count, nick_type=nick_type)
         return random.choice(proxy_list)
     except Exception as e:
         logger("当前没有满足要求的代理IP")
Example #10
 def save_weight(self, val_acc, step):
     path = self.saver.save(self.sess,
                            os.path.join(
                                self.args.weight_path,
                                "{}-val_acc-{:.4f}.models".format(
                                    self.model_name, val_acc)),
                            global_step=step)
     logger("Save models to {}.".format(path))
Example #11
 def get_word_index(self, path=None):
     if not path:
         path = self.args.tmp_dir + self.__class__.__name__ + self.args.word_file
     word2id = dict()
     with open(path, mode='r', encoding='utf-8') as f:
         for l in f:
             word2id.setdefault(l.strip(), len(word2id))
     logger('Word2id size : %d' % len(word2id))
     return word2id
Example #12
 def early_stopping(self, val_acc, val_loss, step):
     if val_acc > self.best_val_acc:
         self.patience = self.args.patience
         self.best_val_acc = val_acc
         self.save_weight(val_acc, step)
     elif self.patience == 1:
         logger("Patience exhausted, stopping training.")
         exit(0)
     else:
         self.patience -= 1
         logger("Patience remaining : {}/{}.".format(
             self.patience, self.args.patience))
Example #13
 def softmax_with_mask(logits,
                       axis,
                       mask,
                       epsilon=10e-8,
                       name=None):  # 1. normalize 2. softmax
     with tf.name_scope(name, 'softmax', [logits, mask]):
         max_axis = tf.reduce_max(logits, axis, keep_dims=True)
         target_exp = tf.exp(logits - max_axis) * mask
         normalize = tf.reduce_sum(target_exp, axis, keep_dims=True)
         softmax = target_exp / (normalize + epsilon)
         logger("softmax shape {}".format(softmax.get_shape()))
         return softmax
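
A NumPy sketch of the same arithmetic (not the TensorFlow op above), showing that masked positions receive essentially zero probability.

import numpy as np

logits = np.array([[2.0, 1.0, 0.5]])
mask = np.array([[1.0, 1.0, 0.0]])  # the last position is padding
shifted = np.exp(logits - logits.max(axis=1, keepdims=True)) * mask
probs = shifted / (shifted.sum(axis=1, keepdims=True) + 1e-7)
# probs is roughly [[0.731, 0.269, 0.0]]; the masked slot contributes nothing.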
Example #14
 def token_idx_map(self, context, answer_span):
     logger("Convert answer to position in the context.")
     answer_se = []
     for i in range(len(context)):
         answer_tokens = process_tokens(
             default_tokenizer(
                 context[i][answer_span[i][0]:answer_span[i][1]]))
         con = process_tokens(
             default_tokenizer(context[i][:answer_span[i][0]]))
         a_start_idx = len(con)
         a_end_idx = len(con) + len(answer_tokens) - 1
         answer_se.append([a_start_idx, a_end_idx])
     return answer_se
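
A small worked example of the character-span to token-span conversion, assuming plain whitespace tokenization in place of default_tokenizer/process_tokens.

context = "the quick brown fox jumps"
char_span = (10, 19)                                        # characters of "brown fox"
answer_tokens = context[char_span[0]:char_span[1]].split()  # ["brown", "fox"]
left_tokens = context[:char_span[0]].split()                # ["the", "quick"]
a_start_idx = len(left_tokens)                              # 2
a_end_idx = a_start_idx + len(answer_tokens) - 1            # 3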
Example #15
 def save_weight(self, val_acc, step):
     path = self.saver.save(self.sess,
                            os.path.join(
                                self.args.weight_path,
                                "{}-val_acc-{:.4f}.models-{}".format(
                                    self.model_name, val_acc,
                                    datetime.datetime.now())),
                            global_step=step)
     if self.args.tensorboard and self.args.visualize_embedding:
         visualize_embedding(word2id=self.dataset.word2id,
                             embedding_matrix_name=self.embedding.name,
                             writer=self.writer)
     logger("Save models to {}.".format(path))
Example #16
    def read_cbt_data(self, file, max_count=None):
        """
        read CBT data in id format.
        :return: (documents,questions,answers,candidates) each elements is a numpy array.
        """
        documents, questions, answers, candidates = [], [], [], []
        with FastGFile(file, mode="r") as f:
            counter = 0
            d, q, a, A = [], [], [], []
            for line in f:
                counter += 1
                if max_count and counter > max_count:
                    break
                if counter % 100000 == 0:
                    logger("Reading line %d in %s" % (counter, file))
                if counter % 22 == 21:
                    tmp = line.strip().split("\t")
                    q = tmp[0].split(" ") + [self.EOS_ID]
                    a = [1 if tmp[1] == i else 0 for i in d]
                    A = [a for a in tmp[2].split("|")]
                    A.remove(tmp[1])
                    A.insert(
                        0,
                        tmp[1])  # put right answer in the first of candidate
                elif counter % 22 == 0:
                    documents.append(d)
                    questions.append(q)
                    answers.append(a)
                    candidates.append(A)
                    d, q, a, A = [], [], [], []
                else:
                    # add the EOS id at the end of each sentence
                    d.extend(line.strip().split(" ") + [self.EOS_ID])

        d_lens = [len(i) for i in documents]
        q_lens = [len(i) for i in questions]
        avg_d_len = sum(d_lens) / len(documents)
        logger("Document average length: %d." % avg_d_len)
        logger("Document median length: %d." %
               len(sorted(documents, key=len)[len(documents) // 2]))
        avg_q_len = sum(q_lens) / len(questions)
        logger("Question average length: %d." % avg_q_len)
        logger("Question median length: %d." %
               len(sorted(questions, key=len)[len(questions) // 2]))

        return documents, questions, answers, candidates
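
The answer vector built on the 21st line marks every position where the answer id occurs in the document; a tiny illustration with made-up token ids.

d = ["5", "17", "3", "17", "2"]                # token ids of a toy document
answer_id = "17"
a = [1 if answer_id == i else 0 for i in d]    # [0, 1, 0, 1, 0]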
Example #17
    def draw_graph(self):
        log_file = '../logs/log-%s-%s-%s-emb%d-id%s' % (
            self.args.activation, self.args.dataset, self.args.rnn_type,
            self.args.embedding_dim, str(datetime.datetime.now()))
        self.writer = tf.summary.FileWriter(log_file)
        self.writer.add_graph(self.sess.graph)
        tf.summary.scalar('loss', self.loss)
        tf.summary.scalar('accuracy', self.accuracy)

        config = projector.ProjectorConfig()
        embedding_conf = config.embeddings.add()
        embedding_conf.tensor_name = 'embedding_matrix'
        # embedding_conf.metadata_path = os.path.join(log_file, 'metadata.tsv')
        projector.visualize_embeddings(self.writer, config)

        self.merged_summary = tf.summary.merge_all()
        logger('Save log to %s' % log_file)
Example #18
 def squad_data_to_idx(self, vocab_file, *args):
     """
     convert string list to index list form.         
     """
     logger("Convert string data to index.")
     word_dict = self.load_vocab(vocab_file)
     res_data = [0] * len(args)
     for idx, i in enumerate(args):
         tmp = [
             self.sentence_to_token_ids(document, word_dict)
             for document in i
         ]
         res_data[idx] = tmp.copy()
     logger("Convert string2index done.")
     return res_data
Example #19
    def prepare_data(self,
                     data_dir,
                     train_file,
                     valid_file,
                     max_vocab_num,
                     output_dir=""):
        """
        build word vocabulary and character vocabulary.
        """
        if not gfile.Exists(os.path.join(data_dir, output_dir)):
            os.mkdir(os.path.join(data_dir, output_dir))
        os_train_file = os.path.join(data_dir, train_file)
        os_valid_file = os.path.join(data_dir, valid_file)
        vocab_file = os.path.join(data_dir, output_dir,
                                  "vocab.%d" % max_vocab_num)
        char_vocab_file = os.path.join(data_dir, output_dir, "char_vocab")

        vocab_data_file = os.path.join(data_dir, output_dir, "data.txt")

        def save_data(d_data, q_data):
            """
            save all data to a file and use it build vocabulary.
            """
            with open(vocab_data_file, mode="w", encoding="utf-8") as f:
                f.write("\t".join(d_data) + "\n")
                f.write("\t".join(q_data) + "\n")

        if not gfile.Exists(vocab_data_file):
            d, q, _ = self.read_squad_data(os_train_file)
            v_d, v_q, _ = self.read_squad_data(os_valid_file)
            save_data(d, q)
            save_data(v_d, v_q)
        if not gfile.Exists(vocab_file):
            logger("Start create vocabulary.")
            word_counter = self.gen_vocab(vocab_data_file,
                                          max_count=self.args.max_count)
            self.save_vocab(word_counter, vocab_file, max_vocab_num)
        if not gfile.Exists(char_vocab_file):
            logger("Start create character vocabulary.")
            char_counter = self.gen_char_vocab(vocab_data_file)
            self.save_char_vocab(char_counter,
                                 char_vocab_file,
                                 max_vocab_num=70)

        return os_train_file, os_valid_file, vocab_file, char_vocab_file
Example #20
 def test(self):
     if not self.args.train:
         self.sess.run(tf.global_variables_initializer())
         self.load_weight()
     batch_size = self.args.batch_size
     batch_num = self.test_num // batch_size
     batch_num = batch_num + 1 if (self.test_num %
                                   batch_size) != 0 else batch_num
     correct_num, total_num = 0, 0
     for i in range(batch_num):
         data, samples = self.get_batch_data("test", i)
         if samples != 0:
             correct, = self.sess.run([self.correct_prediction],
                                      feed_dict=data)
             correct_num, total_num = correct_num + correct, total_num + samples
     assert (total_num == self.test_num)
     logger("Test on : {}/{}".format(total_num, self.test_num))
     test_acc = correct_num / total_num
     logger("Test accuracy is : {:.5f}".format(test_acc))
     res = {"model": self.model_name, "test_acc": test_acc}
     save_obj_to_json(self.args.weight_path, res, "result.json")
Example #21
 def doGET(self, url, params, headers):
     # Execute a GET request
     result = requests.get(url=url,
                           params=params,
                           headers=headers,
                           timeout=10)
     # A status code of 200 means the request succeeded
     if result.status_code == 200:
         # Extract the Cookie information from the response headers
         self.setCookies(result.headers)
         # result.json() raises if the body is not valid JSON, so guard it
         try:
             # Return the JSON-decoded response body
             return result.json()
         except ValueError:
             # Fall back to returning the raw text body
             return result.text
     else:
         # Log the failing status code
         log.logger().info("Request failed with status code %d" % result.status_code)
         try:
             # raise_for_status() raises an HTTPError for 4xx/5xx responses
             result.raise_for_status()
         except Exception as e:
             # Log the exception details
             log.logger().info("Request failed with error: %s" % e)
         # Return an empty dict so callers do not break
         return {}
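
The JSON-or-text fallback in doGET can be exercised on its own; a sketch against a public echo endpoint (httpbin.org is just an assumption, any JSON API would do).

import requests

resp = requests.get("https://httpbin.org/get", params={"q": "demo"}, timeout=10)
try:
    body = resp.json()   # dict when the response body is JSON
except ValueError:
    body = resp.text     # raw text otherwise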
Example #22
    def gen_embeddings(word_dict, embed_dim, in_file=None, init=np.random.uniform):
        """
        Init embedding matrix with (or without) pre-trained word embeddings.
        """
        num_words = max(word_dict.values()) + 1
        # init must accept (low, high, shape), e.g. np.random.uniform
        embedding_matrix = init(-0.05, 0.05, (num_words, embed_dim))
        logger('Embeddings: %d x %d' % (num_words, embed_dim))

        if not in_file:
            return embedding_matrix

        def get_dim(file):
            first = gfile.FastGFile(file, mode='r').readline()
            return len(first.split()) - 1

        assert get_dim(in_file) == embed_dim
        logger('Loading embedding file: %s' % in_file)
        pre_trained = 0
        for line in codecs.open(in_file, encoding="utf-8"):
            sp = line.split()
            if sp[0] in word_dict:
                pre_trained += 1
                embedding_matrix[word_dict[sp[0]]] = np.asarray([float(x) for x in sp[1:]], dtype=np.float32)
        logger("Pre-trained: {}, {:.3f}%".format(pre_trained, pre_trained * 100.0 / num_words))
        return embedding_matrix
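
The pre-trained file is expected to hold one token followed by its vector per line (GloVe-style); a sketch of how a single line becomes a row of the matrix (the line itself is made up).

import numpy as np

line = "fox 0.01 0.44 -0.20"          # hypothetical 3-dimensional entry
sp = line.split()
vector = np.asarray([float(x) for x in sp[1:]], dtype=np.float32)
# gen_embeddings() would store `vector` at row word_dict["fox"].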
Example #23
 def squad_data_to_idx(self, vocab_file, char_vocab_file=None, *args):
     """
     convert string list to index list form.         
     """
     logger("Convert string data to index.")
     word_dict = self.load_vocab(vocab_file)
     if self.args.use_char_embedding:
         char_dict = self.load_vocab(self.char_vocab_file)
     res_data = []
     for idx, i in enumerate(args):
         tmp = [
             self.sentence_to_token_ids(document, word_dict)
             for document in i
         ]
         res_data.append(tmp.copy())
         if self.args.use_char_embedding:
             tmp_c = [
                 self.words_to_char_ids(document, char_dict)
                 for document in i
             ]
             res_data.append(tmp_c.copy())
     logger("Convert string2index done.")
     return res_data
Example #24
    def validate(self):
        batch_size = self.args.batch_size
        v_batch_num = self.valid_nums // batch_size
        # ensure the entire valid set is selected
        v_batch_num = v_batch_num + 1 if (self.valid_nums %
                                          batch_size) != 0 else v_batch_num
        # logger("Validate on {} batches, {} samples per batch, {} total."
        #        .format(v_batch_num, batch_size, self.valid_nums))
        val_num, val_corrects, v_loss = 0, 0, 0

        preds = list()
        for i in range(v_batch_num):
            data, samples = self.get_batch_data("valid", i)

            # TODO: this can be removed if this test path is deprecated
            data = dict(data, **{'keep_prob:0': 1.})

            if samples != 0:
                loss, v_correct, prediction = self.sess.run(
                    [self.loss, self.correct_prediction, self.prediction],
                    feed_dict=data)
                val_num += samples
                val_corrects += v_correct
                v_loss += loss * samples
                preds.extend(prediction.tolist())

        # call the custom metric
        # self.metric(preds = preds, label = self.dataset.valid_y.tolist())

        assert (val_num == self.valid_nums)
        val_acc = val_corrects / val_num
        val_loss = v_loss / val_num
        logger(
            "Evaluate on : {}/{}.\tVal acc : {:.4f}.\tVal Loss : {:.4f}.\tBest acc : {:.4f}.\tDataset : {}."
            .format(val_num, self.valid_nums, val_acc, val_loss,
                    self.best_val_acc, self.args.dataset))
        return val_acc, val_loss
Example #25
    def execute(self):
        """
        main method to train and test
        """
        # self.confirm_model_dataset_fitness()

        self.dataset = getattr(sys.modules["tf.datasets"],
                               self.args.dataset)(self.args)

        if hasattr(self.dataset, 'get_embedding_matrix'):
            self.embedding_matrix = self.dataset.get_embedding_matrix(
                is_char_embedding=False)
        else:
            logger(
                'No get_embedding_matrix in dataset %s; falling back to random initialization.'
                % self.dataset.__class__.__name__)

        self.max_len = self.dataset.max_len
        self.word2id_size = self.dataset.word2id_size
        self.train_nums, self.valid_nums, self.test_num = self.dataset.train_nums, self.dataset.valid_nums, self.dataset.test_nums

        self.num_class = self.dataset.num_class

        self.create_model()

        self.make_sure_model_is_valid()

        self.saver = tf.train.Saver(max_to_keep=20)

        if self.args.train:
            if self.args.tensorboard:
                self.draw_graph()
            self.train()
        if self.args.test:
            self.test()

        self.sess.close()
Example #26
def get_equipment_image(verbose=False):
    url = "https://wiki.biligame.com/pcr/%E8%A3%85%E5%A4%87%E4%B8%80%E8%A7%88"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    table = soup.select('#wiki_table')[0]
    equipments = table.select('span')

    result = {}
    save_path = os.path.join(app_config["data_path"], "equipments")
    if not os.path.exists(save_path):
        try:
            os.mkdir(save_path)
        except OSError:
            logger("Error", "Create download path %s failed!" % save_path,
                   verbose)

    for index, equip in enumerate(equipments):
        logger("Info",
               "Downloading equipment image %d/%d" % (index, len(equipments)),
               verbose)

        name = equip.select('a')[0]["title"]
        image_url = equip.select('img')[0]["src"]
        equip_save_path = os.path.join(save_path, "%d.png" % index)
        urllib.request.urlretrieve(image_url, equip_save_path)
        result[name] = {
            "id": index,
            "equipment_name": name,
            "equipment_image_url": image_url,
            "equipment_image_path": "%d.png" % index
        }

    with open(os.path.join(save_path, "equipments.json"), "w+") as file:
        file.write(json.dumps(result, indent=4, ensure_ascii=False))

    return result
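
A self-contained sketch of the selection pattern used above, run on inline HTML (html.parser is used here to avoid the lxml dependency; the markup is invented).

from bs4 import BeautifulSoup

html = ('<div id="wiki_table">'
        '<span><a title="Sword">Sword</a><img src="img/1.png"/></span>'
        '</div>')
soup = BeautifulSoup(html, "html.parser")
span = soup.select("#wiki_table")[0].select("span")[0]
name = span.select("a")[0]["title"]         # "Sword"
image_url = span.select("img")[0]["src"]    # "img/1.png"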
Example #27
 def validate(self):
     batch_size = self.args.batch_size
     v_batch_num = self.valid_nums // batch_size
     # ensure the entire valid set is selected
     v_batch_num = v_batch_num + 1 if (self.valid_nums %
                                       batch_size) != 0 else v_batch_num
     logger(
         "Validate on {} batches, {} samples per batch, {} total.".format(
             v_batch_num, batch_size, self.valid_nums))
     val_num, val_corrects, v_loss = 0, 0, 0
     for i in range(v_batch_num):
         data, samples = self.get_batch_data("valid", i)
         if samples != 0:
             loss, v_correct = self.sess.run(
                 [self.loss, self.correct_prediction], feed_dict=data)
             val_num += samples
             val_corrects += v_correct
             v_loss += loss * samples
     assert (val_num == self.valid_nums)
     val_acc = val_corrects / val_num
     val_loss = v_loss / val_num
     logger("Evaluate on : {}/{}.\tVal acc : {:.4f}.\tVal Loss : {:.4f}".
            format(val_num, self.valid_nums, val_acc, val_loss))
     return val_acc, val_loss
Example #28
    def gen_char_vocab(data_file, tokenizer=default_tokenizer, old_counter=None):
        """
         generate character level vocabulary according to train corpus.
        """
        logger("Creating character dict from data {}.".format(data_file))
        char_counter = old_counter if old_counter else Counter()
        with gfile.FastGFile(data_file) as f:
            for line in f:
                tokens = tokenizer(line.rstrip("\n"))
                char_counter.update([char for word in tokens for char in word])

        # summary statistics
        total_chars = sum(char_counter.values())
        distinct_chars = len(list(char_counter))

        logger("STATISTICS" + "-" * 20)
        logger("Total characters: " + str(total_chars))
        logger("Total distinct characters: " + str(distinct_chars))
        return char_counter
Example #29
    def train(self):
        """
        train model
        """
        self.step = tf.Variable(0, name="global_step", trainable=False)
        batch_size = self.args.batch_size
        epochs = self.args.num_epoches
        self.get_train_op()
        self.sess.run(tf.global_variables_initializer())
        self.load_weight()

        # early stopping params, by default val_acc is the metric
        self.patience, self.best_val_acc = self.args.patience, 0.
        # Start training
        corrects_in_epoch, samples_in_epoch, loss_in_epoch = 0, 0, 0
        batch_num = self.train_nums // batch_size
        logger("Train on {} batches, {} samples per batch, {} total.".format(
            batch_num, batch_size, self.train_nums))

        step = self.sess.run(self.step)
        while step < batch_num * epochs:
            step = self.sess.run(self.step)
            # on Epoch start
            if step % batch_num == 0:
                corrects_in_epoch, samples_in_epoch, loss_in_epoch = 0, 0, 0
                logger("{}Epoch : {}{}".format("-" * 40, step // batch_num + 1,
                                               "-" * 40))
                self.dataset.shuffle()

            data, samples = self.get_batch_data("train", step % batch_num)
            loss, _, corrects_in_batch = self.sess.run(
                [self.loss, self.train_op, self.correct_prediction],
                feed_dict=data)
            corrects_in_epoch += corrects_in_batch
            loss_in_epoch += loss * samples
            samples_in_epoch += samples

            # logger
            if step % self.args.print_every_n == 0:
                logger(
                    "Samples : {}/{}.\tStep : {}/{}.\tLoss : {:.4f}.\tAccuracy : {:.4f}"
                    .format(samples_in_epoch, self.train_nums,
                            step % batch_num, batch_num,
                            loss_in_epoch / samples_in_epoch,
                            corrects_in_epoch / samples_in_epoch))

            # evaluate on the valid set and early stopping
            if step and step % self.args.evaluate_every_n == 0:
                val_acc, val_loss = self.validate()
                self.early_stopping(val_acc, val_loss, step)
Example #30
def get_characters_illustration(verbose=False):
    logger("Info", "Start downloading character illustrations...", verbose)

    url = "https://wiki.biligame.com/pcr/%E8%A7%92%E8%89%B2%E5%9B%BE%E9%89%B4"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    characters_data = soup.select(".box-js")
    result = {}

    # Create the output folder
    save_path = os.path.join(app_config["data_path"], "illustrations")
    if not os.path.exists(save_path):
        try:
            os.mkdir(save_path)
        except OSError:
            logger("Error", "Create download path %s failed!" % save_path, verbose)

    # Download the illustrations
    for index, character in enumerate(characters_data):
        logger("Info", "Downloading illustrations %d/%d" % (index + 1, len(characters_data)), verbose)
        character_name = character.select("a")[0]["title"]
        character_illustration_url = character.select("img")[0]["src"]
        illustration_save_path = os.path.join(save_path, "%d.jpg" % index)
        urllib.request.urlretrieve(character_illustration_url, illustration_save_path)
        result[character_name] = {
            "id": index,
            "character_name": character_name,
            "illustration_url": character_illustration_url,
            "illustration_path": "%d.jpg" % index
        }

    logger("OK", "Character' illustrations has been downloaded to %s!" % save_path, verbose)
    with open(os.path.join(save_path, "illustration.json"), "w+") as file:
        file.write(json.dumps(result, indent=4, ensure_ascii=False))

    return result