def __init__(self, base_model_list=bagging_config.base_model_list):
    self.base_model_list = base_model_list.split("-")
    self.num_random = len(self.base_model_list)
    self.dataDir = general_config.data_dir + "/random"
    createRandomData(self.num_random)
    self.models = []
    self.models_name = []
    for i in range(self.num_random):
        base_model = self.base_model_list[i]
        assert base_model in ["1", "2", "3", "4", "5"], "Invalid base model type!"
        if base_model == "1":
            model = TextCNN()
        elif base_model == "2":
            model = TextRNN()
        elif base_model == "3":
            model = CRNN()
        elif base_model == "4":
            model = RCNN()
        else:
            model = HAN()
        self.models.append(model)
        self.models_name.append(modelDict[base_model])
    self.logDir = ensure_dir_exist(general_config.log_dir + "/bagging/" + "-".join(self.models_name))
    self.saveDir = ensure_dir_exist(general_config.save_dir + "/bagging/" + "-".join(self.models_name))
    self.logger = my_logger(self.logDir + "/log.txt")
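# Example (hypothetical usage sketch; assumes the enclosing class is the
# bagging ensemble and that codes "1"/"2"/"3" map to TextCNN/TextRNN/CRNN
# via modelDict, as in the constructor above):
#   >>> ensemble = Bagging(base_model_list="1-2-3")
#   >>> len(ensemble.models)
#   3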
def __init__(self):
    self.embeddings = load_embedding_matrix(wv_path=general_config.wv_path,
                                            int2vocabPath=general_config.global_static_i2v_path)
    self.model = GaussianNB()
    self.log_dir = ensure_dir_exist(general_config.log_dir + "/NB")
    self.save_dir = ensure_dir_exist(general_config.save_dir + "/NB")
    self.logger = my_logger(self.log_dir + "/log.txt")
def predict(self, testFile=None, vocab2intPath=None,
            load_path=general_config.load_path_test,
            is_save=True, resPath=None):
    if testFile is None or vocab2intPath is None:
        testFile = os.path.join(general_config.data_dir, "testing_data_new.txt")
        vocab2intPath = general_config.global_nonstatic_v2i_path
    test_generator = BucketedDataIteratorForDoc(loadPath=testFile, vocab2intPath=vocab2intPath)
    load_dir = load_path if os.path.isdir(load_path) else os.path.dirname(load_path)
    log_dir = load_dir.replace("checkpoints", "logs")
    logger = my_logger(log_dir + "/log_predict.txt")
    os.environ['CUDA_VISIBLE_DEVICES'] = str(0)
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.8
    with tf.Session(config=config, graph=self.graph) as sess:
        logger.info("Loading model...")
        saver = tf.train.Saver()
        if os.path.isdir(load_path):
            ckpt = tf.train.get_checkpoint_state(load_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
            global_step = ckpt.model_checkpoint_path.split("-")[-1]
        else:
            saver.restore(sess, load_path)
            global_step = load_path.split("-")[-1]
        logger.info("Loading successfully, loading epoch is %s" % global_step)
        cur_loop = test_generator.loop
        batch_idx, batch_docs, _, batch_doc_lens, batch_sent_lens = test_generator.next(
            batch_size=1024, need_all=True)
        res = {}
        while test_generator.loop == cur_loop:
            predicted = sess.run(self.predicted,
                                 feed_dict=self._feed_dict_test(batch_docs,
                                                                batch_doc_len=batch_doc_lens,
                                                                batch_sent_len=batch_sent_lens))
            for (sample_id, label) in zip(batch_idx, predicted):
                res[sample_id] = int(label)
            batch_idx, batch_docs, _, batch_doc_lens, batch_sent_lens = test_generator.next(
                1024, need_all=True)
        if is_save:
            if resPath is None:
                res_dir = ensure_dir_exist(load_dir.replace("checkpoints", "results"))
                resPath = os.path.join(res_dir, "predicted.csv-" + str(global_step))
            # Collect the predicted labels for the test set before writing them out.
            res_save = [[key, value] for (key, value) in res.items()]
            WriteToSubmission(fileName=resPath, res=res_save)
    return res
def __init__(self, base_model_list=stacking_config.base_model_list,
             num_cv=stacking_config.num_cv):
    self.base_model_list = base_model_list.split("-")
    self.num_models = len(self.base_model_list)
    self.num_cv = num_cv
    self.dataDir = general_config.data_dir + "/cv/" + str(self.num_cv)
    if not os.path.exists(self.dataDir):
        createCrossValidationData(self.num_cv)
    self.models = []
    self.models_name = []
    for n in range(self.num_models):
        base_model = self.base_model_list[n]
        assert base_model in ["1", "2", "3", "4", "5"], "Invalid base model type!"
        if base_model == "1":
            model = TextCNN()
        elif base_model == "2":
            model = TextRNN()
        elif base_model == "3":
            model = CRNN()
        elif base_model == "4":
            model = RCNN()
        else:
            model = HAN()
        self.models.append(model)
        self.models_name.append(modelDict[base_model])
    self.logDir = ensure_dir_exist(general_config.log_dir + "/stacking/"
                                   + "-".join(self.models_name) + "/" + str(self.num_cv))
    self.saveDir = ensure_dir_exist(general_config.save_dir + "/stacking/"
                                    + "-".join(self.models_name) + "/" + str(self.num_cv))
    self.classifier = LogisticRegression()
    self.logger = my_logger(self.logDir + "/log.txt")
def createRandomData(num_random):
    trainingFile = general_config.training_file
    with open(trainingFile, 'r') as f:
        raw_data = np.asarray(f.readlines())
    total_size = len(raw_data)
    saveDir = ensure_dir_exist(general_config.data_dir + "/random")
    for i in range(num_random):
        trainFile = saveDir + "/training" + str(i) + ".txt"
        if os.path.exists(trainFile):
            continue
        np.random.seed(seed=10 * i)
        # Bootstrap sample: draw total_size indices with replacement.
        indices = np.random.choice(total_size, total_size, replace=True)
        with open(trainFile, 'w') as f:
            f.writelines(raw_data[indices])
        getNonstaticWordDict(trainFile=trainFile,
                             global_v2i_path=general_config.global_nonstatic_v2i_path)
        create_visual_metadata(int2vocab_path=trainFile.replace(".txt", "_i2v.json"))
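# A minimal sketch of the bootstrap draw used above (numpy only):
#   >>> import numpy as np
#   >>> np.random.seed(0)
#   >>> idx = np.random.choice(8, 8, replace=True)   # 8 draws, with replacement
#   >>> len(idx)
#   8
# Because replace=True, some rows typically repeat and others are left out,
# which is what gives each base model a slightly different training set.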
def predict(self, load_epochs_list=bagging_config.load_epochs_list, testFile=None):
    if load_epochs_list is None:
        load_epochs = None
    else:
        load_epochs = load_epochs_list.split("-")
        assert len(load_epochs) == self.num_random
    if testFile is None:
        testFile = general_config.testing_file
    tmp_res = {}
    res_dir = ensure_dir_exist(self.saveDir.replace("checkpoints", "results"))
    for i in range(self.num_random):
        model = self.models[i]
        model_name = self.models_name[i]
        vocab2intPath = self.dataDir + "/training" + str(i) + "_v2i.json"
        if model_name == "TextCNN":
            load_path = self.saveDir + "/" + str(i) + "_" + model_name + "/nonstatic/train"
        else:
            load_path = self.saveDir + "/" + str(i) + "_" + model_name + "/train"
        resPath = res_dir + "/" + str(i) + "_predicted.csv"
        if load_epochs is not None:
            load_path += "/model.ckpt-" + load_epochs[i]
            resPath = resPath + "-" + load_epochs[i]
        res = model.predict(testFile=testFile, vocab2intPath=vocab2intPath,
                            load_path=load_path, resPath=resPath)
        # Accumulate one vote per base model for each sample id.
        for key, value in res.items():
            try:
                tmp_res[key][value] += 1
            except KeyError:
                tmp = {j: 0 for j in range(general_config.num_classes)}
                tmp[value] += 1
                tmp_res[key] = tmp
    # Majority vote: for each sample id, pick the class with the most votes.
    res = []
    for sample_id, item in tmp_res.items():
        label = sorted(item.items(), key=lambda d: d[1], reverse=True)[0][0]
        res.append([sample_id, label])
    if load_epochs_list is None:
        WriteToSubmission(res, fileName=res_dir + "/predicted.csv")
    else:
        WriteToSubmission(res, fileName=res_dir + "/predicted.csv-" + load_epochs_list)
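# The per-id vote counting above is equivalent to this sketch with
# collections.Counter (illustrative only; the dicts stand in for the res
# mappings returned by each model.predict call, keyed by sample id):
#   >>> from collections import Counter
#   >>> votes = Counter()
#   >>> for res in [{7: 1}, {7: 1}, {7: 2}]:   # three base models, one sample
#   ...     votes[res[7]] += 1
#   >>> votes.most_common(1)[0][0]             # label 1 wins the majority vote
#   1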
def createCrossValidationData(num_cv=5):
    trainingFile = general_config.training_file
    with open(trainingFile, 'r') as f:
        raw_data = np.asarray(f.readlines())
    saveDir = ensure_dir_exist(general_config.data_dir + "/cv/" + str(num_cv))
    kf = KFold(num_cv, random_state=1234 + num_cv, shuffle=True)
    count = 0
    for train_index, test_index in kf.split(raw_data):
        train = raw_data[train_index]
        test = raw_data[test_index]
        with open(saveDir + "/train" + str(count) + ".txt", 'w') as f:
            f.writelines(train)
        getNonstaticWordDict(trainFile=saveDir + "/train" + str(count) + ".txt",
                             global_v2i_path=general_config.global_nonstatic_v2i_path)
        create_visual_metadata(int2vocab_path=saveDir + "/train" + str(count) + "_i2v.json")
        with open(saveDir + "/valid" + str(count) + ".txt", 'w') as f:
            f.writelines(test)
        count += 1
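# Sketch of the resulting folds (assumes sklearn.model_selection.KFold,
# matching the random_state=1234+num_cv used above, here for num_cv=5):
#   >>> from sklearn.model_selection import KFold
#   >>> kf = KFold(5, random_state=1239, shuffle=True)
#   >>> [len(valid) for _, valid in kf.split(list(range(10)))]
#   [2, 2, 2, 2, 2]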
def fit(self, trainFile=None, with_validation=general_config.with_validation,
        log_dir=general_config.log_dir + "/TextCNN",
        save_dir=general_config.save_dir + "/TextCNN",
        load_path=general_config.load_path_train,
        num_epochs=general_config.num_epochs,
        steps_every_epoch=general_config.steps_every_epoch,
        batch_size=general_config.batch_size,
        learning_rate=general_config.learning_rate,
        lr_changing=general_config.lr_changing,
        min_learning_rate=general_config.min_learning_rate,
        learning_rate_decay=general_config.learning_rate_decay,
        save_epochs=general_config.save_epochs,
        early_stopping=general_config.early_stopping,
        num_visual=general_config.num_visualize):
    self.learning_rate_value = learning_rate
    self.trainFile = trainFile
    self.validFile = None
    self.with_validation = with_validation
    if self.trainFile is None:
        if self.with_validation:
            self.trainFile = general_config.train_file
        else:
            self.trainFile = general_config.training_file
    if self.with_validation:
        self.validFile = self.trainFile.replace("train", "valid")
    tmp = os.path.join(os.path.dirname(self.trainFile),
                       os.path.basename(self.trainFile).replace(".txt", "").split("_")[0])
    if self.model_type in ["static", "multichannel"]:
        self.int2vocabPath = general_config.global_static_i2v_path
        self.vocab2intPath = general_config.global_static_v2i_path
    else:
        self.int2vocabPath = tmp + "_i2v.json"
        self.vocab2intPath = tmp + "_v2i.json"
    metadataPath = {"static": "/home/leechen/code/python/TextSentimentClassification/"
                              "data_helpers/dataset/training_testing_metadata.tsv"}
    metadataPath["nonstatic"] = "/home/leechen/code/python/TextSentimentClassification/" \
                                + self.vocab2intPath.replace("v2i.json", "metadata.tsv")
    train_loss = []
    train_accuracy = []
    valid_loss = []
    valid_accuracy = []
    # Log files and model checkpoints written during training.
    if self.with_validation:
        log_dir = ensure_dir_exist(log_dir + "/" + self.model_type + "/train_valid")
        train_dir = os.path.join(log_dir, "train")
        val_dir = os.path.join(log_dir, "valid")
        save_dir = ensure_dir_exist(save_dir + "/" + self.model_type + "/train_valid")
    else:
        log_dir = ensure_dir_exist(log_dir + "/" + self.model_type + "/train")
        train_dir = os.path.join(log_dir, "train")
        val_dir = None
        save_dir = ensure_dir_exist(save_dir + "/" + self.model_type + "/train")
    # Create the logger and record the hyperparameters.
    logger = my_logger(log_dir + "/log_fit.txt")
    msg = "\n--filter_size_list: %s\n" % self.filter_size_list \
          + "--filter_num: %s\n" % self.filter_num \
          + "--fc_layer_size_list: %s\n" % self.fc_layer_size_list \
          + "--embedding_size: %s\n" % self.embedding_size \
          + "--dropout: %s\n" % self.dropout_value \
          + "--max_l2_norm: %s\n" % self.max_l2_norm \
          + "--learning_rate: %s\n" % self.learning_rate_value \
          + "--lr_changing: %s\n" % lr_changing \
          + "--min_learning_rate: %s\n" % min_learning_rate \
          + "--learning_rate_decay: %s\n" % learning_rate_decay \
          + "--load_path: %s\n" % load_path \
          + "--num_epochs: %s\n" % num_epochs \
          + "--steps_every_epoch: %s\n" % steps_every_epoch \
          + "--batch_size: %s\n" % batch_size \
          + "--save_epochs: %s\n" % save_epochs \
          + "--early_stopping: %s\n" % early_stopping \
          + "--num_visual: %s\n" % num_visual
    logger.info(msg)
    # Define the data generators.
    train_generator = PaddedDataIterator(loadPath=self.trainFile, vocab2intPath=self.vocab2intPath)
    val_generator = None if self.validFile is None else PaddedDataIterator(
        loadPath=self.validFile, vocab2intPath=self.vocab2intPath)
    os.environ["CUDA_VISIBLE_DEVICES"] = str(0)
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.8
    with tf.Session(config=config, graph=self.graph) as sess:
        train_writer = tf.summary.FileWriter(train_dir, sess.graph)
        val_writer = None if val_dir is None else tf.summary.FileWriter(val_dir)
        saver = tf.train.Saver(max_to_keep=5)
        sess.run(tf.global_variables_initializer())
        start = 0
        if isinstance(load_path, str):
            if os.path.isdir(load_path):
                ckpt = tf.train.get_checkpoint_state(load_path)
                saver.restore(sess, ckpt.model_checkpoint_path)
                # The checkpoint suffix is the epoch number; cast so range() below works.
                start = int(ckpt.model_checkpoint_path.split("-")[-1])
            else:
                saver.restore(sess, load_path)
                start = int(load_path.split("-")[-1])
            logger.info("Loading successfully, loading epoch is %s" % start)
        logger.info("The total number of trainable variables: %s" % get_num_params())
        cur_early_stopping = 0
        cur_max_acc = 0.
        logger.info('******* start training with %d *******' % start)
        epoch = 0
        for epoch in range(start, num_epochs):
            if lr_changing:
                try:
                    if train_loss[-1] > train_loss[-2]:
                        tmp = self.learning_rate_value * learning_rate_decay
                        if tmp >= min_learning_rate:
                            self.learning_rate_value = tmp
                            logger.info("Learning rate multiplied by %s at epoch %s."
                                        % (learning_rate_decay, epoch + 1))
                    else:
                        if train_loss[-1] < train_loss[-2] - 0.015:
                            self.learning_rate_value *= 1.05
                            logger.info("Learning rate multiplied by 1.05 at epoch %s." % (epoch + 1))
                except IndexError:
                    # Fewer than two recorded epochs: keep the learning rate unchanged.
                    pass
            avg_loss_t, avg_accuracy_t = 0, 0
            avg_loss_v, avg_accuracy_v = 0, 0
            for step in range(steps_every_epoch):
                _, batch_seqs, batch_labels, _ = train_generator.next(batch_size)
                batch_seqs_ns = None
                if self.model_type == "multichannel":
                    batch_seqs_ns = self._X2X_ns(batch_seqs)
                sess.run(self.train_op, self._feed_dict_train(batch_x=batch_seqs,
                                                              batch_y=batch_labels,
                                                              batch_x_ns=batch_seqs_ns))
                loss_t, acc_t = sess.run([self.loss_op, self.acc_op],
                                         self._feed_dict_valid(batch_x=batch_seqs,
                                                               batch_y=batch_labels,
                                                               batch_x_ns=batch_seqs_ns))
                avg_loss_t += loss_t
                avg_accuracy_t += acc_t
            avg_loss_t /= steps_every_epoch
            avg_accuracy_t /= steps_every_epoch
            train_loss.append(avg_loss_t)
            train_accuracy.append(avg_accuracy_t)
            self.loss_accuracy_summary.value[0].simple_value = avg_loss_t
            self.loss_accuracy_summary.value[1].simple_value = avg_accuracy_t
            train_writer.add_summary(summary=self.loss_accuracy_summary, global_step=epoch + 1)
            if self.with_validation:
                # Evaluate on the validation set.
                cur_loop = val_generator.loop
                _, batch_seqs, batch_labels, _ = val_generator.next(1024, need_all=True)
                batch_seqs_ns = None
                if self.model_type == "multichannel":
                    batch_seqs_ns = self._X2X_ns(batch_seqs)
                cur_count = 0
                while val_generator.loop == cur_loop:
                    loss_v, acc_v = sess.run([self.loss_op, self.acc_op],
                                             feed_dict=self._feed_dict_valid(batch_x=batch_seqs,
                                                                             batch_y=batch_labels,
                                                                             batch_x_ns=batch_seqs_ns))
                    avg_loss_v += loss_v
                    avg_accuracy_v += acc_v
                    cur_count += 1
                    _, batch_seqs, batch_labels, _ = val_generator.next(1024, need_all=True)
                    batch_seqs_ns = None
                    if self.model_type == "multichannel":
                        batch_seqs_ns = self._X2X_ns(batch_seqs)
                avg_loss_v /= cur_count
                avg_accuracy_v /= cur_count
                valid_loss.append(avg_loss_v)
                valid_accuracy.append(avg_accuracy_v)
                self.loss_accuracy_summary.value[0].simple_value = avg_loss_v
                self.loss_accuracy_summary.value[1].simple_value = avg_accuracy_v
                val_writer.add_summary(summary=self.loss_accuracy_summary, global_step=epoch + 1)
                logger.info("Epoch: [%04d/%04d], "
                            "Training Loss: %.4f, Training Accuracy: %.4f, "
                            "Validation Loss: %.4f, Validation Accuracy: %.4f"
                            % (epoch + 1, num_epochs, avg_loss_t, avg_accuracy_t,
                               avg_loss_v, avg_accuracy_v))
                # Stop early once validation accuracy has failed to beat the best
                # value for more than early_stopping consecutive epochs.
                if avg_accuracy_v > cur_max_acc:
                    cur_max_acc = avg_accuracy_v
                    cur_early_stopping = 0
                    logger.info("Saving model-%s" % (epoch + 1))
                    saver.save(sess, os.path.join(save_dir, 'model.ckpt'), global_step=epoch + 1)
                else:
                    cur_early_stopping += 1
                if cur_early_stopping > early_stopping:
                    logger.info("Early stopping after epoch %s !" % (epoch + 1))
                    break
            else:
                logger.info("Epoch: [%04d/%04d], "
                            "Training Loss: %.4f, Training Accuracy: %.4f "
                            % (epoch + 1, num_epochs, avg_loss_t, avg_accuracy_t))
                # Without a validation set, save a checkpoint every save_epochs epochs.
                if (epoch - start + 1) % save_epochs == 0:
                    logger.info("Saving model-%s" % (epoch + 1))
                    saver.save(sess, os.path.join(save_dir, 'model.ckpt'), global_step=epoch + 1)
        if num_visual > 0:
            # Visualize the final word embeddings with the TensorBoard projector.
            config = projector.ProjectorConfig()
            final_embeddings = {}
            try:
                final_embeddings["static"] = self.embedding_matrix_s.eval()[:num_visual]
            except AttributeError:
                pass  # this model_type has no static embedding matrix
            try:
                final_embeddings["nonstatic"] = self.embedding_matrix_ns.eval()[:num_visual]
            except AttributeError:
                pass  # this model_type has no nonstatic embedding matrix
            for (name, final_embedding) in final_embeddings.items():
                embedding_var = tf.Variable(final_embedding, name="word_embeddings_" + name)
                sess.run(embedding_var.initializer)
                saver = tf.train.Saver([embedding_var])
                saver.save(sess, log_dir + "/embeddings_" + name + ".ckpt-" + str(epoch + 1))
                embedding = config.embeddings.add()
                embedding.tensor_name = embedding_var.name
                embedding.metadata_path = metadataPath[name]
            projector.visualize_embeddings(train_writer, config)
    return train_loss, train_accuracy, valid_loss, valid_accuracy
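# Example (hypothetical usage sketch; all defaults come from general_config):
#   >>> model = TextCNN()
#   >>> train_loss, train_acc, valid_loss, valid_acc = model.fit(
#   ...     with_validation=True, num_epochs=20)
# With with_validation=False the two validation lists come back empty.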
# bool() of any non-empty string is True, so compare against "true" explicitly.
archive_results = sys.argv[2].lower() == 'true'
start_time = time.time()
snp_500_symbols = SimulateTrade.get_snp_symbols(SimulateTrade.SNP_SYMBOLS_FILE_PATH)
zip_files = SimulateTrade.get_zip_files_in_folder(src_dir)
for curr_file in zip_files:
    files_by_zip = {}
    file_path = os.path.join(src_dir, curr_file)
    files_by_zip[file_path] = SimulateTrade.get_files_from_zip_by_date(file_path)
    for zip_file in files_by_zip:
        print(f'Filtering {zip_file}')
        dir_path = DEST_DIR if not archive_results else os.path.join(
            DEST_DIR, os.path.basename(os.path.splitext(zip_file)[0]))
        ensure_dir_exist(dir_path)
        zip_file_obj = zipfile.ZipFile(zip_file)
        for curr_date in files_by_zip[zip_file]:
            file_time = time.time()
            date_info = files_by_zip[zip_file][curr_date]
            day = date_info['day']
            month = date_info['month']
            year = date_info['year']
            stock_quotes_file = date_info['stockquotes']
            stock_quotes_data = pd.read_csv(zip_file_obj.open(stock_quotes_file))
            snp_quotes = SimulateTrade.filter_equity_snp_symbols(stock_quotes_data, snp_500_symbols)
            snp_quotes.to_csv(os.path.join(dir_path, f'stockquotes_{year}{month:02}{day:02}.csv'),
                              index=False)  # closing argument assumed: the original call was truncated
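# Assumed invocation (script name hypothetical; src_dir is presumably read
# from sys.argv[1] in a part of the script not shown here):
#   python filter_snp_quotes.py /path/to/zips true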