Example #1
    def __init__(self, base_model_list=bagging_config.base_model_list):
        self.base_model_list = base_model_list.split("-")
        self.num_random = len(self.base_model_list)
        self.dataDir = general_config.data_dir + "/random"
        createRandomData(self.num_random)

        self.models = []
        self.models_name = []
        for i in range(self.num_random):
            base_model = self.base_model_list[i]
            assert base_model in ["1", "2", "3", "4", "5"], "Invalid base model type!"
            if base_model == "1":
                model = TextCNN()
            elif base_model == "2":
                model = TextRNN()
            elif base_model == "3":
                model = CRNN()
            elif base_model == "4":
                model = RCNN()
            else:
                model = HAN()
            self.models.append(model)
            self.models_name.append(modelDict[base_model])
        self.logDir = ensure_dir_exist(general_config.log_dir + "/bagging/" + "-".join(self.models_name))
        self.saveDir = ensure_dir_exist(general_config.save_dir + "/bagging/" + "-".join(self.models_name))
        self.logger = my_logger(self.logDir + "/log.txt")
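For context, a minimal usage sketch of this constructor. The enclosing class name (Bagging here) and the call site are assumptions; only the dash-separated base_model_list codes ("1"=TextCNN, "2"=TextRNN, "3"=CRNN, "4"=RCNN, "5"=HAN) come from the snippet itself.

# Hypothetical usage; the class name "Bagging" is assumed.
ensemble = Bagging(base_model_list="1-2-5")   # TextCNN, TextRNN and HAN base learners
print(ensemble.models_name)                   # e.g. ['TextCNN', 'TextRNN', 'HAN']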
Example #2
    def __init__(self):
        self.embeddings = load_embedding_matrix(
            wv_path=general_config.wv_path,
            int2vocabPath=general_config.global_static_i2v_path)
        self.model = GaussianNB()
        self.log_dir = ensure_dir_exist(general_config.log_dir + "/NB")
        self.save_dir = ensure_dir_exist(general_config.save_dir + "/NB")
        self.logger = my_logger(self.log_dir + "/log.txt")
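As a standalone illustration of the scikit-learn API used here, GaussianNB is fit on dense feature vectors such as averaged word embeddings; the data below is made up.

import numpy as np
from sklearn.naive_bayes import GaussianNB

# Toy data: four documents represented as 3-dimensional averaged embeddings.
X = np.array([[0.1, 0.2, 0.3],
              [0.0, 0.1, 0.0],
              [0.9, 0.8, 0.7],
              [1.0, 0.9, 0.8]])
y = np.array([0, 0, 1, 1])

clf = GaussianNB()
clf.fit(X, y)
print(clf.predict([[0.05, 0.15, 0.1]]))  # expected: [0]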
Example #3
    def predict(self,
                testFile=None,
                vocab2intPath=None,
                load_path=general_config.load_path_test,
                is_save=True,
                resPath=None):
        if testFile is None or vocab2intPath is None:
            testFile = os.path.join(general_config.data_dir,
                                    "testing_data_new.txt")
            vocab2intPath = general_config.global_nonstatic_v2i_path
        test_generator = BucketedDataIteratorForDoc(
            loadPath=testFile, vocab2intPath=vocab2intPath)
        load_dir = load_path if os.path.isdir(load_path) else os.path.dirname(
            load_path)
        log_dir = load_dir.replace("checkpoints", "logs")
        logger = my_logger(log_dir + "/log_predict.txt")
        os.environ['CUDA_VISIBLE_DEVICES'] = str(0)
        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 0.8
        with tf.Session(config=config, graph=self.graph) as sess:
            logger.info("Loading model...")
            saver = tf.train.Saver()
            if os.path.isdir(load_path):
                ckpt = tf.train.get_checkpoint_state(load_path)
                saver.restore(sess, ckpt.model_checkpoint_path)
                global_step = ckpt.model_checkpoint_path.split("-")[-1]
            else:
                saver.restore(sess, load_path)
                global_step = load_path.split("-")[-1]
            logger.info("Loading successfully, loading epoch is %s" %
                        global_step)

            cur_loop = test_generator.loop
            batch_idx, batch_docs, _, batch_doc_lens, batch_sent_lens = test_generator.next(
                batch_size=1024, need_all=True)
            res = {}
            while test_generator.loop == cur_loop:
                predicted = sess.run(self.predicted,
                                     feed_dict=self._feed_dict_test(
                                         batch_docs,
                                         batch_doc_len=batch_doc_lens,
                                         batch_sent_len=batch_sent_lens))
                for (id, label) in zip(batch_idx, predicted):
                    res[id] = int(label)
                batch_idx, batch_docs, _, batch_doc_lens, batch_sent_lens = test_generator.next(
                    1024, need_all=True)
            if is_save:
                if resPath is None:
                    res_dir = ensure_dir_exist(
                        load_dir.replace("checkpoints", "results"))
                    resPath = os.path.join(res_dir,
                                           "predicted.csv-" + str(global_step))
                res_save = [[key, value] for (key, value) in res.items()]
                # Store the test predictions for the submission file.
                WriteToSubmission(fileName=resPath, res=res_save)
        return res
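WriteToSubmission is project-specific and not shown above; purely as an illustration, a minimal stand-in that writes the [id, label] rows built in res_save to a CSV file might look like this (the header row and column names are assumptions):

import csv

def write_to_submission(rows, file_name):
    # rows is a list of [id, predicted_label] pairs, as built from res.items() above.
    with open(file_name, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["id", "label"])  # assumed header
        writer.writerows(rows)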
Example #4
    def __init__(self,
                 base_model_list=stacking_config.base_model_list,
                 num_cv=stacking_config.num_cv):
        self.base_model_list = base_model_list.split("-")
        self.num_models = len(self.base_model_list)
        self.num_cv = num_cv
        self.dataDir = general_config.data_dir + "/cv/" + str(self.num_cv)
        if not os.path.exists(self.dataDir):
            createCrossValidationData(self.num_cv)

        self.models = []
        self.models_name = []
        for n in range(self.num_models):
            base_model = self.base_model_list[n]
            assert base_model in ["1", "2", "3", "4",
                                  "5"], "Invalid base model type!"
            if base_model == "1":
                model = TextCNN()
            elif base_model == "2":
                model = TextRNN()
            elif base_model == "3":
                model = CRNN()
            elif base_model == "4":
                model = RCNN()
            else:
                model = HAN()
            self.models.append(model)
            self.models_name.append(modelDict[base_model])
        self.logDir = ensure_dir_exist(general_config.log_dir + "/stacking/" +
                                       "-".join(self.models_name) + "/" +
                                       str(self.num_cv))
        self.saveDir = ensure_dir_exist(general_config.save_dir +
                                        "/stacking/" +
                                        "-".join(self.models_name) + "/" +
                                        str(self.num_cv))
        self.classifier = LogisticRegression()
        self.logger = my_logger(self.logDir + "/log.txt")
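The LogisticRegression instance acts as the stacking meta-classifier: it is trained on the base models' out-of-fold predictions rather than on the raw text. A self-contained sketch of that second stage, with made-up prediction matrices, is shown below.

import numpy as np
from sklearn.linear_model import LogisticRegression

# Toy out-of-fold predictions from 3 base models for 6 training samples.
base_preds_train = np.array([[0, 0, 1],
                             [1, 1, 1],
                             [0, 1, 0],
                             [2, 2, 2],
                             [1, 1, 2],
                             [0, 0, 0]])
y_train = np.array([0, 1, 0, 2, 1, 0])

meta = LogisticRegression(max_iter=1000)
meta.fit(base_preds_train, y_train)

# At test time, the base models' predictions for new samples are stacked the same way.
base_preds_test = np.array([[0, 0, 0], [2, 2, 1]])
print(meta.predict(base_preds_test))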
Example #5
def createRandomData(num_random):
    trainingFile = general_config.training_file
    with open(trainingFile, 'r') as f:
        raw_data = np.asarray(f.readlines())
    total_size = len(raw_data)
    saveDir = ensure_dir_exist(general_config.data_dir + "/random")
    for i in range(num_random):
        trainFile = saveDir + "/training" + str(i) + ".txt"
        if os.path.exists(trainFile):
            continue
        np.random.seed(seed=10 * i)
        indices = np.random.choice(total_size, total_size, replace=True)
        with open(trainFile, 'w') as f:
            f.writelines(raw_data[indices])
        getNonstaticWordDict(trainFile=trainFile,
                             global_v2i_path=general_config.global_nonstatic_v2i_path)
        create_visual_metadata(int2vocab_path=trainFile.replace(".txt", "_i2v.json"))
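createRandomData builds one bootstrap sample per base model: np.random.choice with replace=True draws total_size indices with repetition, so each trainingN.txt is a resample of the original training file. A small self-contained demonstration of that sampling step:

import numpy as np

data = np.asarray(["line1\n", "line2\n", "line3\n", "line4\n", "line5\n"])
np.random.seed(0)
indices = np.random.choice(len(data), len(data), replace=True)
print(indices)        # some indices repeat, others are missing
print(data[indices])  # the bootstrap resample that would be written to trainingN.txt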
Example #6
    def predict(self, load_epochs_list=bagging_config.load_epochs_list,
                testFile=None):
        if load_epochs_list is None:
            load_epochs = None
        else:
            load_epochs = load_epochs_list.split("-")
            assert len(load_epochs) == self.num_random
        if testFile is None:
            testFile = general_config.testing_file
        tmp_res = {}
        res_dir = ensure_dir_exist(self.saveDir.replace("checkpoints", "results"))
        for i in range(self.num_random):
            model = self.models[i]
            model_name = self.models_name[i]
            vocab2intPath = self.dataDir + "/training" + str(i) + "_v2i.json"
            if model_name == "TextCNN":
                load_path = self.saveDir + "/" + str(i) + "_" + model_name + "/nonstatic/train"
            else:
                load_path = self.saveDir + "/" + str(i) + "_" + model_name + "/train"
            resPath = res_dir + "/" + str(i) + "_predicted.csv"
            if load_epochs is not None:
                load_path += "/model.ckpt-" + load_epochs[i]
                resPath = resPath + "-" + load_epochs[i]
            res = model.predict(testFile=testFile, vocab2intPath=vocab2intPath,
                                load_path=load_path, resPath=resPath)
            for key, value in res.items():
                try:
                    tmp_res[key][value] += 1
                except KeyError:
                    tmp = {}
                    for j in range(general_config.num_classes):
                        tmp[j] = 0
                    tmp[value] += 1
                    tmp_res[key] = tmp
        res = []
        for id, item in tmp_res.items():
            tmp = sorted(item.items(), key=lambda d: d[1], reverse=True)[0][0]
            res.append([id, tmp])
        if load_epochs_list is None:
            WriteToSubmission(res, fileName=res_dir + "/predicted.csv")
        else:
            WriteToSubmission(res, fileName=res_dir + "/predicted.csv-" + load_epochs_list)
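The vote-counting loop above implements per-sample majority voting over the base models' predictions; an equivalent compact sketch using collections.Counter (with toy predictions) is:

from collections import Counter

# Toy predictions of three base models for the same sample ids.
per_model_preds = [{"id1": 0, "id2": 2}, {"id1": 0, "id2": 1}, {"id1": 1, "id2": 2}]

votes = {}
for preds in per_model_preds:
    for sample_id, label in preds.items():
        votes.setdefault(sample_id, Counter())[label] += 1

majority = [[sample_id, counter.most_common(1)[0][0]] for sample_id, counter in votes.items()]
print(majority)  # [['id1', 0], ['id2', 2]]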
Example #7
def createCrossValidationData(num_cv=5):
    trainingFile = general_config.training_file
    with open(trainingFile, 'r') as f:
        raw_data = np.asarray(f.readlines())
    saveDir = ensure_dir_exist(general_config.data_dir + "/cv/" + str(num_cv))
    kf = KFold(num_cv, random_state=1234 + num_cv, shuffle=True)
    count = 0
    for train_index, test_index in kf.split(raw_data):
        train = raw_data[train_index]
        test = raw_data[test_index]
        with open(saveDir + "/train" + str(count) + ".txt", 'w') as f:
            f.writelines(train)
        getNonstaticWordDict(
            trainFile=saveDir + "/train" + str(count) + ".txt",
            global_v2i_path=general_config.global_nonstatic_v2i_path)
        create_visual_metadata(int2vocab_path=saveDir + "/train" + str(count) +
                               "_i2v.json")
        with open(saveDir + "/valid" + str(count) + ".txt", 'w') as f:
            f.writelines(test)
        count += 1
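createCrossValidationData relies on sklearn.model_selection.KFold; in isolation the split behaves as follows (toy array):

import numpy as np
from sklearn.model_selection import KFold

data = np.arange(10)
kf = KFold(n_splits=5, random_state=1234 + 5, shuffle=True)
for fold, (train_index, test_index) in enumerate(kf.split(data)):
    # Each of the 5 folds uses 8 samples for training and holds out 2 for validation.
    print(fold, train_index, test_index)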
Example #8
    def fit(self, trainFile=None, with_validation=general_config.with_validation,
            log_dir=general_config.log_dir + "/TextCNN",
            save_dir=general_config.save_dir + "/TextCNN",
            load_path=general_config.load_path_train,
            num_epochs=general_config.num_epochs,
            steps_every_epoch=general_config.steps_every_epoch,
            batch_size=general_config.batch_size,
            learning_rate=general_config.learning_rate,
            lr_changing=general_config.lr_changing,
            min_learning_rate=general_config.min_learning_rate,
            learning_rate_decay=general_config.learning_rate_decay,
            save_epochs=general_config.save_epochs,
            early_stopping=general_config.early_stopping,
            num_visual=general_config.num_visualize):

        self.learning_rate_value = learning_rate

        self.trainFile = trainFile
        self.validFile = None
        self.with_validation = with_validation
        if self.trainFile is None:
            if self.with_validation:
                self.trainFile = general_config.train_file
            else:
                self.trainFile = general_config.training_file
        if self.with_validation:
            self.validFile = self.trainFile.replace("train", "valid")
        tmp = os.path.join(os.path.dirname(self.trainFile),
                           os.path.basename(self.trainFile).replace(".txt", "").split("_")[0])
        if self.model_type in ["static","multichannel"]:
            self.int2vocabPath = general_config.global_static_i2v_path
            self.vocab2intPath = general_config.global_static_v2i_path
        else:
            self.int2vocabPath = tmp + "_i2v.json"
            self.vocab2intPath = tmp + "_v2i.json"
        metadataPath = {
            "static": "/home/leechen/code/python/TextSentimentClassification/data_helpers/dataset/training_testing_metadata.tsv"}
        metadataPath["nonstatic"] = "/home/leechen/code/python/TextSentimentClassification/" \
                                    + self.vocab2intPath.replace("v2i.json", "metadata.tsv")
        train_loss = []
        train_accuracy = []
        valid_loss = []
        valid_accuracy = []
        # Log files written during training and the model save paths.
        if self.with_validation:
            log_dir = ensure_dir_exist(log_dir + "/" + self.model_type + "/train_valid")
            train_dir = os.path.join(log_dir, "train")
            val_dir = os.path.join(log_dir, "valid")
            save_dir = ensure_dir_exist(save_dir + "/" + self.model_type + "/train_valid")
        else:
            log_dir = ensure_dir_exist(log_dir + "/" + self.model_type + "/train")
            train_dir = os.path.join(log_dir, "train")
            val_dir = None
            save_dir = ensure_dir_exist(save_dir + "/" + self.model_type + "/train")

        # Create the logger.
        logger = my_logger(log_dir + "/log_fit.txt")
        msg = "\n--filter_size_list: %s\n" % self.filter_size_list \
              + "--filter_num: %s\n" % self.filter_num \
              + "--fc_layer_size_list: %s\n" % self.fc_layer_size_list \
              + "--embedding_size: %s\n" % self.embedding_size \
              + "--dropout: %s\n" % self.dropout_value \
              + "--max_l2_norm: %s\n" % self.max_l2_norm \
              + "--learning_rate: %s\n" % self.learning_rate_value \
              + "--lr_changing: %s\n" % lr_changing \
              + "--min_learning_rate: %s\n" % min_learning_rate\
              + "--learning_rate_decay: %s\n" % learning_rate_decay\
              +"--load_path: %s\n" % load_path\
              +"--num_epochs: %s\n" % num_epochs\
              +"--steps_every_epoch: %s\n" % steps_every_epoch\
              +"--batch_size: %s\n" % batch_size\
              +"--save_epochs: %s\n" % save_epochs\
              +"--early_stopping: %s\n" % early_stopping\
              +"--num_visual: %s\n"%num_visual
        logger.info(msg)

        # Define the data generators.
        train_generator = PaddedDataIterator(loadPath=self.trainFile,
                                             vocab2intPath=self.vocab2intPath)
        val_generator = None if self.validFile is None else PaddedDataIterator(
            loadPath=self.validFile, vocab2intPath=self.vocab2intPath)

        os.environ["CUDA_VISIBLE_DEVICES"] = str(0)
        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 0.8

        with tf.Session(config=config, graph=self.graph) as sess:
            train_writer = tf.summary.FileWriter(train_dir, sess.graph)
            val_writer = None if val_dir is None else tf.summary.FileWriter(val_dir)
            saver = tf.train.Saver(max_to_keep=5)
            sess.run(tf.global_variables_initializer())
            start = 0
            if isinstance(load_path, str):
                if os.path.isdir(load_path):
                    ckpt = tf.train.get_checkpoint_state(load_path)
                    saver.restore(sess, ckpt.model_checkpoint_path)
                    start = int(ckpt.model_checkpoint_path.split("-")[-1])
                else:
                    saver.restore(sess, load_path)
                    start = int(load_path.split("-")[-1])
                logger.info("Loading successfully, loading epoch is %s" % start)
            logger.info("The total number of trainable variables: %s" % get_num_params())
            cur_early_stopping = 0
            cur_max_acc = 0.

            logger.info('******* start training with %d *******' % start)
            epoch = 0
            for epoch in range(start, num_epochs):
                if lr_changing:
                    try:
                        if train_loss[-1] > train_loss[-2]:
                            tmp = self.learning_rate_value * learning_rate_decay
                            if tmp >= min_learning_rate:
                                self.learning_rate_value = tmp
                                logger.info("Learning rate multiplied by %s at epoch %s."
                                            % (learning_rate_decay, epoch + 1))
                        elif train_loss[-1] < train_loss[-2] - 0.015:
                            self.learning_rate_value *= 1.05
                            logger.info("Learning rate multiplied by 1.05 at epoch %s." % (epoch + 1))
                    except IndexError:
                        # Fewer than two recorded epochs yet; keep the current learning rate.
                        pass
                avg_loss_t, avg_accuracy_t = 0, 0
                avg_loss_v, avg_accuracy_v = 0, 0
                for step in range(steps_every_epoch):
                    _, batch_seqs, batch_labels, _ = train_generator.next(batch_size)
                    batch_seqs_ns = None
                    if self.model_type == "multichannel":
                        batch_seqs_ns = self._X2X_ns(batch_seqs)
                    sess.run(self.train_op,
                             self._feed_dict_train(batch_x=batch_seqs, batch_y=batch_labels,
                                                   batch_x_ns=batch_seqs_ns))
                    loss_t, acc_t = sess.run([self.loss_op, self.acc_op],
                                             self._feed_dict_valid(batch_x=batch_seqs,
                                                                   batch_y=batch_labels,
                                                                   batch_x_ns=batch_seqs_ns))
                    avg_loss_t += loss_t
                    avg_accuracy_t += acc_t
                avg_loss_t /= steps_every_epoch
                avg_accuracy_t /= steps_every_epoch
                train_loss.append(avg_loss_t)
                train_accuracy.append(avg_accuracy_t)
                self.loss_accuracy_summary.value[0].simple_value = avg_loss_t
                self.loss_accuracy_summary.value[1].simple_value = avg_accuracy_t
                train_writer.add_summary(summary=self.loss_accuracy_summary, global_step=epoch + 1)
                if self.with_validation:
                    # Evaluate on the validation set.
                    cur_loop = val_generator.loop
                    _, batch_seqs, batch_labels, _ = val_generator.next(1024, need_all=True)
                    batch_seqs_ns = None
                    if self.model_type == "multichannel":
                        batch_seqs_ns = self._X2X_ns(batch_seqs)
                    cur_count = 0
                    while val_generator.loop == cur_loop:
                        loss_v, acc_v = sess.run([self.loss_op, self.acc_op],
                                                 feed_dict=self._feed_dict_valid(batch_x=batch_seqs,
                                                                                 batch_y=batch_labels,
                                                                                 batch_x_ns=batch_seqs_ns))
                        avg_loss_v += loss_v
                        avg_accuracy_v += acc_v
                        cur_count += 1
                        _, batch_seqs, batch_labels, _ = val_generator.next(1024, need_all=True)
                        batch_seqs_ns = None
                        if self.model_type == "multichannel":
                            batch_seqs_ns = self._X2X_ns(batch_seqs)
                    avg_loss_v /= cur_count
                    avg_accuracy_v /= cur_count
                    valid_loss.append(avg_loss_v)
                    valid_accuracy.append(avg_accuracy_v)
                    self.loss_accuracy_summary.value[0].simple_value = avg_loss_v
                    self.loss_accuracy_summary.value[1].simple_value = avg_accuracy_v
                    val_writer.add_summary(summary=self.loss_accuracy_summary, global_step=epoch + 1)
                    logger.info("Epoch: [%04d/%04d], "
                          "Training Loss: %.4f, Training Accuracy: %.4f, "
                          "Validation Loss: %.4f, Validation Accuracy: %.4f" \
                          % (epoch + 1, num_epochs,
                             avg_loss_t, avg_accuracy_t, avg_loss_v, avg_accuracy_v))

                    # Stop early if validation accuracy has failed to exceed the best value seen so far for more than early_stopping consecutive epochs.
                    if (avg_accuracy_v > cur_max_acc):
                        cur_max_acc = avg_accuracy_v
                        cur_early_stopping = 0
                        logger.info("Saving model-%s" % (epoch + 1))
                        saver.save(sess, os.path.join(save_dir, 'model.ckpt'), global_step=epoch + 1)
                    else:
                        cur_early_stopping += 1
                    if cur_early_stopping > early_stopping:
                        logger.info("Early stopping after epoch %s !" % (epoch + 1))
                        break
                else:
                    logger.info("Epoch: [%04d/%04d], "
                                "Training Loss: %.4f, Training Accuracy: %.4f " \
                                % (epoch + 1, num_epochs,avg_loss_t, avg_accuracy_t))
                # Periodically save the model.
                if (epoch - start + 1) % save_epochs == 0:
                    logger.info("Saving model-%s" % (epoch + 1))
                    saver.save(sess, os.path.join(save_dir, 'model.ckpt'), global_step=epoch + 1)

            if num_visual > 0:
                # Visualize the final word embeddings.
                config = projector.ProjectorConfig()
                final_embeddings = {}
                try:
                    final_embeddings["static"] = self.embedding_matrix_s.eval()[:num_visual]
                except AttributeError:
                    # The static embedding matrix only exists for some model types.
                    pass
                try:
                    final_embeddings["nonstatic"] = self.embedding_matrix_ns.eval()[:num_visual]
                except AttributeError:
                    # The non-static embedding matrix only exists for some model types.
                    pass
                for (name, final_embedding) in final_embeddings.items():
                    embedding_var = tf.Variable(final_embedding, name="word_embeddings_" + name)
                    sess.run(embedding_var.initializer)
                    saver = tf.train.Saver([embedding_var])
                    saver.save(sess, log_dir + "/embeddings_" + name + ".ckpt-" + str(epoch + 1))
                    embedding = config.embeddings.add()
                    embedding.tensor_name = embedding_var.name
                    embedding.metadata_path = metadataPath[name]
                projector.visualize_embeddings(train_writer, config)
        return train_loss, train_accuracy, valid_loss, valid_accuracy
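fit returns the per-epoch loss and accuracy histories, which can be inspected afterwards. A minimal sketch, assuming a trained model instance named model (not defined in the snippet):

import matplotlib.pyplot as plt

# Hypothetical call; "model" stands in for an instance of the class this fit method belongs to.
train_loss, train_acc, valid_loss, valid_acc = model.fit()

plt.plot(train_loss, label="train loss")
if valid_loss:
    plt.plot(valid_loss, label="valid loss")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.legend()
plt.show()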
Example #9
        # Interpret the second CLI argument as a boolean flag ("true"/"false");
        # bool() on a non-empty string would always evaluate to True.
        archive_results = sys.argv[2].lower() == 'true'

    start_time = time.time()
    snp_500_symbols = SimulateTrade.get_snp_symbols(
        SimulateTrade.SNP_SYMBOLS_FILE_PATH)
    zip_files = SimulateTrade.get_zip_files_in_folder(src_dir)
    for curr_file in zip_files:
        files_by_zip = {}
        file_path = os.path.join(src_dir, curr_file)
        files_by_zip[file_path] = SimulateTrade.get_files_from_zip_by_date(
            file_path)
        for zip_file in files_by_zip:
            print(f'Filtering {zip_file}')
            dir_path = DEST_DIR if not archive_results else os.path.join(
                DEST_DIR, os.path.basename(os.path.splitext(zip_file)[0]))
            ensure_dir_exist(dir_path)
            zip_file_obj = zipfile.ZipFile(zip_file)
            for curr_date in files_by_zip[zip_file]:
                file_time = time.time()
                date_info = files_by_zip[zip_file][curr_date]
                day = date_info['day']
                month = date_info['month']
                year = date_info['year']

                stock_quotes_file = date_info['stockquotes']
                stock_quotes_data = pd.read_csv(
                    zip_file_obj.open(stock_quotes_file))
                snp_quotes = SimulateTrade.filter_equity_snp_symbols(
                    stock_quotes_data, snp_500_symbols)
                snp_quotes.to_csv(os.path.join(
                    dir_path, f'stockquotes_{year}{month:02}{day:02}.csv'),