def changeWord2index(path_word2vec=None, path_word_dic=None, path_save=None):
    file_word_dic = open(path_word_dic, 'r', encoding='utf-8')
    file_word2vec = open(path_word2vec, 'r', encoding='utf-8')
    not_found_word2vec = ' '.join(['0.0'] * 300)
    word2vec_list = []
    line = file_word_dic.readline()
    word2index_dic = {}
    word2vec_dic = {}
    while line:
        cur_word = line.split()[0]
        cur_index = line.strip('\n\r').split()[1]
        word2index_dic[cur_word] = cur_index
        line = file_word_dic.readline()
    line = file_word2vec.readline()
    while line:
        # print(line)
        line = line.strip('\n\r').split()
        cur_key = str(line[0])
        cur_value = ' '.join(line[1:])
        word2vec_dic[cur_key] = cur_value
        line = file_word2vec.readline()
    for cur_key in word2index_dic:
        try:
            cur_value = word2vec_dic[cur_key]
        except KeyError:
            cur_value = not_found_word2vec
            print(cur_key, word2index_dic[cur_key])
        cur_string = str(word2index_dic[cur_key]) + ' ' + cur_value
        word2vec_list.append(cur_string)
    DataUtil.save_vector(path_save, word2vec_list, 'w')
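# A minimal, self-contained sketch of the alignment step above: map each word's
# index to its embedding row, falling back to a zero vector for out-of-vocabulary
# words. The file paths and the 300-dimension default are placeholders, not the
# project's actual configuration; DataUtil.save_vector is replaced by a plain
# file write for illustration only.
def align_index_to_vectors_sketch(path_word2vec, path_word_dic, path_save, dim=300):
    zero_vec = ' '.join(['0.0'] * dim)
    word2index = {}
    with open(path_word_dic, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if len(parts) >= 2:
                word2index[parts[0]] = parts[1]
    word2vec = {}
    with open(path_word2vec, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip('\n').split()
            if len(parts) > 1:
                word2vec[parts[0]] = ' '.join(parts[1:])
    with open(path_save, 'w', encoding='utf-8') as f:
        for word, index in word2index.items():
            f.write('%s %s\n' % (index, word2vec.get(word, zero_vec)))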
def generate_samples(config):
    sess_config = tf.compat.v1.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True
    default_graph = tf.Graph()
    with default_graph.as_default():
        sess = tf.compat.v1.Session(config=sess_config, graph=default_graph)
        logger = logging.getLogger('')
        du = DataUtil(config=config)
        du.load_vocab()
        generator = Model(config=config, graph=default_graph, sess=sess)
        generator.build_train_model()
        generator.build_generate(max_len=config.train.max_length,
                                 generate_devices=config.train.devices,
                                 optimizer=config.train.optimizer)
        generator.init_and_restore(config.train.modelFile)
        infile = config.train.src_path
        generate_batch = config.train.batch_size
        outfile = config.train.out_path
        print("begin generate the data and save the negative in %s" % outfile)
        generator.generate_and_save(du, infile, generate_batch, outfile)
        print("generate the data done!")
def generate_samples(config):
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True
    default_graph = tf.Graph()
    with default_graph.as_default():
        sess = tf.Session(config=sess_config, graph=default_graph)
        logger = logging.getLogger('')
        du = DataUtil(config=config)
        du.load_vocab(src_vocab=config.src_vocab,
                      dst_vocab=config.dst_vocab,
                      src_vocab_size=config.src_vocab_size_a,
                      dst_vocab_size=config.dst_vocab_size_b)
        generator = Model(config=config, graph=default_graph, sess=sess)
        generator.build_variational_train_model()
        generator.init_and_restore(config.train.modelFile)
        print("begin generate the data and save the negative")
        generator.generate_and_save(du, config.train.src_path, config.train.batch_size,
                                    config.train.t_domain_generated_data, direction='ab')
        generator.generate_and_save(du, config.train.dst_path, config.train.batch_size,
                                    config.train.s_domain_generated_data, direction='ba')
        print("generate the data done!")
def save_all_qid2question():
    # Read the configuration file
    cf = ConfigParser.ConfigParser()
    cf.read("../conf/python.conf")
    # Load train.csv
    train_data = pd.read_csv('%s/train.csv' % cf.get('DEFAULT', 'origin_pt')).fillna(value="")  # [:100]
    # Load test.csv
    test_data = pd.read_csv('%s/test_with_qid.csv' % cf.get('DEFAULT', 'devel_pt')).fillna(value="")  # [:100]
    # Paths of the index files to save
    qid2question_qid_fp = '%s/qid2question.all.qid' % cf.get('DEFAULT', 'devel_pt')
    qid2question_question_fp = '%s/qid2question.all.question' % cf.get('DEFAULT', 'devel_pt')
    # Build qid2question
    all_qid2question = BTM.get_all_qid2question(train_data, test_data)
    all_qid = []
    all_question = []
    for qid in all_qid2question:
        all_qid.append(qid)
        all_question.append(all_qid2question[qid])
    # Save the indexes
    DataUtil.save_vector(qid2question_qid_fp, all_qid, 'w')
    DataUtil.save_vector(qid2question_question_fp, all_question, 'w')
def generate_samples(config):
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True
    default_graph = tf.Graph()
    with default_graph.as_default():
        sess = tf.Session(config=sess_config, graph=default_graph)
        logger = logging.getLogger('')
        du = DataUtil(config=config)
        du.load_vocab()
        generator = Model(config=config, graph=default_graph, sess=sess)
        generator.build_train_model()
        generator.build_generate(max_len=config.train.max_length,
                                 generate_devices=config.train.devices,
                                 optimizer=config.train.optimizer)
        generator.init_and_restore(config.train.modelFile)
        infile = config.train.src_path
        outfile = config.train.out_path
        refile = [config.train.dst_path]
        generate_batch = config.train.batch_size
        print("begin generate the data and save the negative in %s" % outfile)
        generator.generate_and_save(du, infile, generate_batch, outfile)
        print("generate the data done!")
        SARI_results, BLEU_results = process_evaluation_file_multi(infile, outfile, refile)
        logging.info("SARI: {}, BLEU: {}".format(SARI_results, BLEU_results))
def rescale(online_preds_fp):
    online_preds = DataUtil.load_vector(online_preds_fp, 'float')
    print(PostProcessor.getResultMean(online_preds))
    for index in range(len(online_preds)):
        score = online_preds[index]
        score = PostProcessor.adj(score, te=0.35, tr=0.25)
        online_preds[index] = score
    print(PostProcessor.getResultMean(online_preds))
    DataUtil.save_vector(online_preds_fp + '.rescale', online_preds, 'w')
def saveSmallWord2Vec(path_save, small_word2vec_dict):
    word2vec_list = []
    count = 0
    for key in small_word2vec_dict:
        count += 1
        if count < 20:
            print(key)
        cur_string = str(key) + ' ' + str(small_word2vec_dict[key])
        word2vec_list.append(cur_string)
    DataUtil.save_vector(path_save, word2vec_list, 'w')
def saveNameAndCalc(self, save_filename, mul_feature_name, feature_calcpearson):
    feature_analysis_pt = self.config.get('DIRECTORY', 'feature_analysis_pt') + save_filename
    merge_string = []
    for cur_feature_name, cur_feature_calc in zip(mul_feature_name, feature_calcpearson):
        merge_string.append(str(cur_feature_name) + ':' + str(cur_feature_calc))
    DataUtil.save_vector(feature_analysis_pt, merge_string, 'w')
def random_split_index_offline(config):
    question_offline_fp = config.get('DIRECTORY', 'source_pt') + '/question_train_set.txt'
    question_offline = open(question_offline_fp, 'r').readlines()
    [train, valid] = DataUtil.random_split(range(len(question_offline)), [0.966, 0.034])
    train_fp = config.get('DIRECTORY', 'index_pt') + 'train_996.offline.index'
    valid_fp = config.get('DIRECTORY', 'index_pt') + 'valid_034.offline.index'
    DataUtil.save_vector(train_fp, train, 'w')
    DataUtil.save_vector(valid_fp, valid, 'w')
def train(config):
    """Train a model with a config file."""
    logger = logging.getLogger('')
    du = DataUtil(config=config)
    du.load_vocab()

    model = Model(config=config)
    model.build_train_model()

    sess_config = tf.compat.v1.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True

    with model.graph.as_default():
        saver = tf.compat.v1.train.Saver(var_list=tf.compat.v1.global_variables())
        summary_writer = tf.compat.v1.summary.FileWriter(config.train.logdir, graph=model.graph)
        # saver_partial = tf.train.Saver(var_list=[v for v in tf.trainable_variables() if 'Adam' not in v.name])
        with tf.compat.v1.Session(config=sess_config) as sess:
            # Initialize all variables.
            sess.run(tf.compat.v1.global_variables_initializer())
            try:
                # saver_partial.restore(sess, tf.train.latest_checkpoint(config.train.logdir))
                # print('Restore partial model from %s.' % config.train.logdir)
                saver.restore(sess, tf.train.latest_checkpoint(config.train.logdir))
            except Exception:
                logger.info('Failed to reload model.')
            for epoch in range(1, config.train.num_epochs + 1):
                for batch in du.get_training_batches_with_buckets():
                    start_time = time.time()
                    step = sess.run(model.global_step)
                    # Summary
                    if step % config.train.summary_freq == 0:
                        step, lr, gnorm, loss, acc, summary, _ = sess.run(
                            [model.global_step, model.learning_rate, model.grads_norm,
                             model.loss, model.acc, model.summary_op, model.train_op],
                            feed_dict={model.src_pl: batch[0], model.dst_pl: batch[1]})
                        summary_writer.add_summary(summary, global_step=step)
                    else:
                        step, lr, gnorm, loss, acc, _ = sess.run(
                            [model.global_step, model.learning_rate, model.grads_norm,
                             model.loss, model.acc, model.train_op],
                            feed_dict={model.src_pl: batch[0], model.dst_pl: batch[1]})
                    logger.info(
                        'epoch: {0}\tstep: {1}\tlr: {2:.6f}\tgnorm: {3:.4f}\tloss: {4:.4f}\tacc: {5:.4f}\ttime: {6:.4f}'.
                        format(epoch, step, lr, gnorm, loss, acc, time.time() - start_time))
                    # Save model
                    if step % config.train.save_freq == 0:
                        mp = config.train.logdir + '/model_epoch_%d_step_%d' % (epoch, step)
                        saver.save(sess, mp)
                        logger.info('Save model in %s.' % mp)
    logger.info("Finish training.")
def random_split_dataset(config):
    all_fp = config.get('DIRECTORY', 'dataset_pt') + 'title_content_word.all.csv'
    all_data = open(all_fp, 'r').readlines()
    all_data = [line.strip('\n') for line in all_data]
    [train, valid] = DataUtil.random_split(all_data, [0.966, 0.034])
    train_fp = config.get('DIRECTORY', 'dataset_pt') + 'title_content_word.train_996.csv'
    valid_fp = config.get('DIRECTORY', 'dataset_pt') + 'title_content_word.valid_034.csv'
    DataUtil.save_vector(train_fp, train, 'w')
    DataUtil.save_vector(valid_fp, valid, 'w')
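# DataUtil.random_split is not shown in this file; the sketch below is an
# assumption about the kind of ratio-based shuffle split the two callers above
# appear to rely on, not the project's actual implementation.
import random

def random_split_sketch(items, ratios, seed=None):
    """Shuffle `items` and cut them into consecutive chunks sized by `ratios`."""
    items = list(items)
    rng = random.Random(seed)
    rng.shuffle(items)
    parts, start = [], 0
    for i, ratio in enumerate(ratios):
        # The last chunk takes the remainder so no item is dropped by rounding.
        end = len(items) if i == len(ratios) - 1 else start + int(round(ratio * len(items)))
        parts.append(items[start:end])
        start = end
    return parts

# Example: [train, valid] = random_split_sketch(range(1000), [0.966, 0.034])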
def init_from_config(self, config):
    # self.model = Model(config)
    self.model = Transformer(config, config.test.devices)
    self.model.build_test_model()

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True
    self.sess = tf.Session(config=sess_config, graph=self.model.graph)

    # Restore model.
    self.model.saver.restore(self.sess, tf.train.latest_checkpoint(config.train.logdir))

    self.du = DataUtil(config)
def save_all_wordtoken(cf):
    # Load train.csv
    train_data = pd.read_csv('%s/train.csv' % cf.get('DEFAULT', 'origin_pt')).fillna(value="")  # [:100]
    # Load test.csv
    test_data = pd.read_csv('%s/test_with_qid.csv' % cf.get('DEFAULT', 'devel_pt')).fillna(value="")  # [:100]
    # Path of the output file
    all_wt_fp = '%s/all.wordtoken' % cf.get('DEFAULT', 'devel_pt')
    # Build all_wordtoken
    all_wt = BTM.get_all_wordtoken(train_data, test_data)
    # Save
    DataUtil.save_vector(all_wt_fp, all_wt, 'w')
def generate_index_with_swap(self):
    """
    Generate the index file of `train_with_swap.csv`
    :return: none
    """
    train_index_fp = '%s/train_311.train.index' % self.config.get('DEFAULT', 'feature_index_pt')
    train_with_swap_index_fp = '%s/train_311.train_with_swap.index' % self.config.get('DEFAULT', 'feature_index_pt')
    train_index = DataUtil.load_vector(train_index_fp, False)
    train_index = [int(x) for x in train_index]
    offset = 404290
    train_swap_index = [x + offset for x in train_index]
    train_with_swap_index = train_index + train_swap_index
    DataUtil.save_vector(train_with_swap_index_fp, train_with_swap_index, 'w')
def exe_train(sess, data, epoch, batch_size, hf, feature_shape,
              train, loss, input_video, y,
              bidirectional=False, step=False, modality='rgb'):
    np.random.shuffle(data)
    total_data = len(data)
    num_batch = int(math.ceil(total_data / batch_size)) + 1
    total_loss = 0.0
    for batch_idx in xrange(num_batch):
        batch_data = data[batch_idx * batch_size:min((batch_idx + 1) * batch_size, total_data)]
        tic = time.time()
        data_v, data_y = DataUtil.getOversamplingBatchVideoFeature(
            batch_data, hf,
            (10, feature_shape[1], feature_shape[2], feature_shape[3]),
            modality=modality)
        if bidirectional:
            flag = np.random.randint(0, 2)
            if flag == 1:
                data_v = data_v[:, ::-1]
        data_time = time.time() - tic
        tic = time.time()
        # print('data_v mean:', np.mean(data_v), ' std:', np.std(data_v))
        _, l = sess.run([train, loss], feed_dict={input_video: data_v, y: data_y})
        run_time = time.time() - tic
        total_loss += l
        print(' batch_idx:%d/%d, loss:%.5f, data_time:%.3f, run_time:%.3f'
              % (batch_idx + 1, num_batch, l, data_time, run_time))
    total_loss = total_loss / num_batch
    return total_loss
def generate(config, argv):
    data_name = argv[0]
    LogUtil.log('INFO', 'data_name=%s' % data_name)

    # load data set
    if 'offline' == data_name:
        # load offline valid dataset index
        valid_index_off_fp = '%s/%s.offline.index' % (config.get('DIRECTORY', 'index_pt'),
                                                      config.get('TITLE_CONTENT_CNN', 'valid_index_offline_fn'))
        valid_index_off = DataUtil.load_vector(valid_index_off_fp, 'int')
        valid_index_off = [num - 1 for num in valid_index_off]
        source_file_path = config.get('DIRECTORY', 'source_pt') + '/question_train_set.txt'
        source_data = load_raw_line_from_file(config, source_file_path, valid_index_off)
    elif 'online' == data_name:
        source_file_path = config.get('DIRECTORY', 'source_pt') + '/question_eval_set.txt'
        source_data = open(source_file_path, 'r').readlines()
    else:
        source_data = None

    feature_file_path = '%s/instance_fs_length.%s.smat' % (config.get('DIRECTORY', 'dataset_pt'), data_name)
    feature_file = open(feature_file_path, 'w')
    feature_file.write('%d %d\n' % (len(source_data), 4))
    for line in source_data:
        qid, tc, tw, dc, dw = parse_question_set(line)
        feature = list()
        feature.append(len(tc))
        feature.append(len(tw))
        feature.append(len(dc))
        feature.append(len(dw))
        Feature.save_feature(feature, feature_file)
    feature_file.close()
def saveLabel(data_set_name):
    config_fp = '../conf/featwheel.conf'
    config = ConfigParser.ConfigParser()
    config.read(config_fp)
    data = pd.read_csv('%s/%s' % (config.get('DIRECTORY', 'csv_spanish_cleaning_pt'), data_set_name)).fillna(value="")
    labels_pt = '%s/%s.label' % (config.get('DIRECTORY', 'label_pt'),
                                 config.get('FEATURE', 'offline_rawset_name'))
    labels = []
    for index, row in data.iterrows():
        cur_label = str(row['is_duplicate'])
        labels.append(cur_label)
    DataUtil.save_vector(labels_pt, labels, 'w')
def __init__(self, data_dir, split):
    self.split = split
    self.data_dir = data_dir
    if split == "train":
        self.alldata = DU.load_json(os.path.join(data_dir, "train_data.json"))
    elif split == "test":
        self.alldata = DU.load_json(os.path.join(data_dir, "test_data.json"))
    else:
        print("split should be train or test")
        return
    self.graph_lists = []
    self.graph_labels = []
    self.n_samples = len(self.alldata)
    self._prepare()
def demo():
    '''
    Example usage
    '''
    # Read the configuration file
    cf = ConfigParser.ConfigParser()
    cf.read("../conf/python.conf")

    # Load a feature file
    features = Feature.load("%s/feature1.demo.smat" % cf.get('DEFAULT', 'feature_question_pt'))
    # Save a feature file
    Feature.save(features, "%s/feature2.demo.smat" % cf.get('DEFAULT', 'feature_question_pt'))
    # Merge features column-wise
    Feature.merge_col(features, features)
    # Get the names of the features in the <question> feature pool
    Feature.get_feature_names_question(cf)
    # Load an index file
    indexs = Feature.load_index("%s/vali.demo.index" % cf.get('DEFAULT', 'feature_index_pt'))
    # Sample feature rows according to the index
    features = Feature.sample_row(features, indexs)

    # Balance positive and negative samples
    rate = 0.165
    train311_train_indexs_fp = '%s/train_311.train.index' % cf.get('DEFAULT', 'feature_index_pt')
    train311_train_indexs = Feature.load_index(train311_train_indexs_fp)
    train_labels_fp = '%s/train.label' % cf.get('DEFAULT', 'feature_label_pt')
    train_labels = DataUtil.load_vector(train_labels_fp, True)
    balanced_indexs = Feature.balance_index(train311_train_indexs, train_labels, rate)
def exe_test(sess, data, batch_size, v2i, i2v, hf, feature_shape,
             predict_words, input_video, input_captions, y, capl=16):
    caption_output = []
    total_data = len(data)
    num_batch = int(round(total_data * 1.0 / batch_size)) + 1
    for batch_idx in xrange(num_batch):
        batch_caption = data[batch_idx * batch_size:min((batch_idx + 1) * batch_size, total_data)]
        data_v = DataUtil.getBatchStepVideoFeature(batch_caption, hf, feature_shape)
        data_c, data_y = DataUtil.getBatchTestCaption(batch_caption, v2i, capl=capl)
        [gw] = sess.run([predict_words],
                        feed_dict={input_video: data_v, input_captions: data_c, y: data_y})
        generated_captions = DataUtil.convertCaptionI2V(batch_caption, gw, i2v)
        for idx, sen in enumerate(generated_captions):
            print('%s : %s' % (batch_caption[idx].keys()[0], sen))
            caption_output.append({'image_id': batch_caption[idx].keys()[0], 'caption': sen})
    js = {}
    js['val_predictions'] = caption_output
    return js
def run_for_set():
    start_time = LoggingUtil.log_start_time()
    data = DataUtil.prepare_data()
    cls = classifier.Classifier()
    cls.perform_with_cross_validation(data, load_from_pickle=False)
    LoggingUtil.log_end_time(start_time)
def generate(config, argv):
    # load valid dataset index
    valid_index_fp = '%s/%s.offline.index' % (config.get('DIRECTORY', 'index_pt'),
                                              config.get('TITLE_CONTENT_CNN', 'valid_index_offline_fn'))
    valid_index = DataUtil.load_vector(valid_index_fp, 'int')
    valid_index = [num - 1 for num in valid_index]

    # load topic btm vec
    topic_btm_vec = load_topic_btm_vec(config)

    # offline / online
    data_name = argv[0]

    dis_func_names = ["cosine", "cityblock", "jaccard", "canberra",
                      "euclidean", "minkowski", "braycurtis"]
    btm_dis_feature_fn = ['vote_fs_btm_dis_%s' % dis_func_name for dis_func_name in dis_func_names]
    btm_dis_feature_f = [open('%s/%s.%s.csv' % (config.get('DIRECTORY', 'dataset_pt'), fn, data_name), 'w')
                         for fn in btm_dis_feature_fn]

    if 'offline' == data_name:
        btm_tw_cw_features = load_features_from_file(config, 'fs_btm_tw_cw', data_name, valid_index)
        LogUtil.log('INFO', 'load_features_from_file, len=%d' % len(btm_tw_cw_features))
        for line_id in range(len(btm_tw_cw_features)):
            doc_vec = btm_tw_cw_features[line_id]
            for dis_id in range(len(dis_func_names)):
                vec = [0.] * 1999
                for topic_id in range(1999):
                    topic_vec = topic_btm_vec[topic_id]
                    if 'minkowski' == dis_func_names[dis_id]:
                        vec[topic_id] = eval(dis_func_names[dis_id])(doc_vec, topic_vec, 3)
                    else:
                        vec[topic_id] = eval(dis_func_names[dis_id])(doc_vec, topic_vec)
                btm_dis_feature_f[dis_id].write('%s\n' % ','.join([str(num) for num in vec]))
    else:
        btm_vec_fp = '%s/fs_btm_tw_cw.%s.csv' % (config.get('DIRECTORY', 'dataset_pt'), data_name)
        btm_vec_f = open(btm_vec_fp, 'r')
        for line in btm_vec_f:
            doc_vec = np.nan_to_num(parse_feature_vec(line))
            for dis_id in range(len(dis_func_names)):
                vec = [0.] * 1999
                for topic_id in range(1999):
                    topic_vec = topic_btm_vec[topic_id]
                    if 'minkowski' == dis_func_names[dis_id]:
                        vec[topic_id] = eval(dis_func_names[dis_id])(doc_vec, topic_vec, 3)
                    else:
                        vec[topic_id] = eval(dis_func_names[dis_id])(doc_vec, topic_vec)
                btm_dis_feature_f[dis_id].write('%s\n' % ','.join([str(num) for num in vec]))

    for f in btm_dis_feature_f:
        f.close()
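# The names in `dis_func_names` above match functions of the same name in
# scipy.spatial.distance, which is presumably what the eval-based dispatch
# resolves to. The toy vectors below are placeholders; the snippet only shows
# how those calls behave, including minkowski's extra order argument (3 in the
# branch above).
import numpy as np
from scipy.spatial.distance import cosine, cityblock, minkowski, braycurtis

doc_vec = np.array([0.1, 0.7, 0.2])
topic_vec = np.array([0.3, 0.3, 0.4])

print(cosine(doc_vec, topic_vec))        # 1 - cosine similarity
print(cityblock(doc_vec, topic_vec))     # L1 / Manhattan distance
print(minkowski(doc_vec, topic_vec, 3))  # order-3 Minkowski, matching the p=3 branch
print(braycurtis(doc_vec, topic_vec))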
def save_tmp_grid_jsons(tmp_grid_loc, accumulation_data, raw_data_loc):
    for i in range(5):
        for j in range(10):
            DU.mkdir(os.path.join(tmp_grid_loc, "p%s" % str(i + 1), "m%s" % str(j + 1)))

    grid_data_info = {}
    if os.path.exists(os.path.join(tmp_grid_loc, "config.json")) and accumulation_data:
        grid_data_info = DU.load_json(os.path.join(tmp_grid_loc, "config.json"))
        row_p = grid_data_info['row_p']
        col_m = grid_data_info['col_m']
        dir_count = grid_data_info['json_count']
    else:
        row_p = [5 + 2 * (i + 1) for i in range(5)]
        col_m = [4 + 4 * (i + 1) for i in range(10)]
        dir_count = np.zeros([len(row_p), len(col_m)], dtype=int)
        grid_data_info['row_p'] = row_p
        grid_data_info['col_m'] = col_m

    print(dir_count)
    print(row_p)
    print(col_m)

    all_data_json = DU.load_json(raw_data_loc)
    print(len(all_data_json))

    # Save the data into per-cell directories of the grid
    for data in all_data_json.values():
        petri_net = data["petri_net"]
        v_list = data["arr_vlist"]
        p_num = get_lowest_idx(len(petri_net), row_p)
        m_num = get_lowest_idx(len(v_list), col_m)
        dir_count[p_num - 1][m_num - 1] = dir_count[p_num - 1][m_num - 1] + 1
        DU.save_data_to_json(
            os.path.join(tmp_grid_loc, "p%s" % str(p_num), "m%s" % str(m_num),
                         "data%s.json" % str(int(dir_count[p_num - 1][m_num - 1]))),
            data)

    print(dir_count)
    # print(get_lowest_idx(7, row_p))
    if isinstance(dir_count, list):
        grid_data_info["json_count"] = dir_count
    else:
        grid_data_info["json_count"] = dir_count.tolist()
    DU.save_data_to_json(os.path.join(tmp_grid_loc, "config.json"), grid_data_info)
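# get_lowest_idx is not defined in this file. Judging from how p_num / m_num are
# used as 1-based directory indices above, it presumably returns the 1-based
# position of the first threshold that the value does not exceed; the sketch
# below is written under that assumption, not taken from the project.
def get_lowest_idx_sketch(value, thresholds):
    for idx, bound in enumerate(thresholds, start=1):
        if value <= bound:
            return idx
    return len(thresholds)  # clamp values beyond the last bound into the final bin

# Example with the default row_p = [7, 9, 11, 13, 15] above:
# a net with 10 places falls into bin 3, i.e. directory "p3".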
def __init__(self, config):
    self.config = config

    # Load model
    self.model = Model(config)
    self.model.build_test_model()

    self.du = DataUtil(config)
    self.du.load_vocab()

    # Create session
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True
    self.sess = tf.Session(config=sess_config, graph=self.model.graph)

    # Restore model.
    with self.model.graph.as_default():
        saver = tf.train.Saver(tf.global_variables())
        saver.restore(self.sess, tf.train.latest_checkpoint(config.train.logdir))
def exe_train(sess, data, batch_size, v2i, hf, feature_shape,
              train, loss, input_video, input_captions, y, capl=16):
    np.random.shuffle(data)
    total_data = len(data)
    num_batch = int(round(total_data * 1.0 / batch_size))
    total_loss = 0.0
    for batch_idx in xrange(num_batch):
        # if batch_idx < 100:
        batch_caption = data[batch_idx * batch_size:min((batch_idx + 1) * batch_size, total_data)]
        data_v = DataUtil.getBatchStepVideoFeature(batch_caption, hf, feature_shape)
        data_c, data_y = DataUtil.getNewBatchTrainCaption(batch_caption, v2i, capl=capl)
        _, l = sess.run([train, loss],
                        feed_dict={input_video: data_v, input_captions: data_c, y: data_y})
        total_loss += l
        print(' batch_idx:%d/%d, loss:%.5f' % (batch_idx + 1, num_batch, l))
    total_loss = total_loss / num_batch
    return total_loss
def generate(config, argv):
    data_name = argv[0]

    word_idf_fp = '%s/words.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(word_idf_fp, 'r') as word_idf_f:
        word_idf = json.load(word_idf_f)
    LogUtil.log("INFO", "load word_idf done, len(word_idf)=%d" % len(word_idf))

    char_idf_fp = '%s/chars.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(char_idf_fp, 'r') as char_idf_f:
        char_idf = json.load(char_idf_f)
    LogUtil.log("INFO", "load char_idf done, len(char_idf)=%d" % len(char_idf))

    # load data set
    if 'offline' == data_name:
        # load offline valid dataset index
        valid_index_off_fp = '%s/%s.offline.index' % (
            config.get('DIRECTORY', 'index_pt'),
            config.get('TITLE_CONTENT_CNN', 'valid_index_offline_fn'))
        valid_index_off = DataUtil.load_vector(valid_index_off_fp, 'int')
        valid_index_off = [num - 1 for num in valid_index_off]
        source_file_path = config.get('DIRECTORY', 'source_pt') + '/question_train_set.txt'
        source_data = load_raw_line_from_file(config, source_file_path, valid_index_off)
        features = valid_index_off
    elif 'online' == data_name:
        source_file_path = config.get('DIRECTORY', 'source_pt') + '/question_eval_set.txt'
        source_data = open(source_file_path, 'r').readlines()
        features = range(len(source_data))
    else:
        source_data = None
        features = None

    id_feature_file_path = '%s/instance_fs_id.%s.smat' % (config.get('DIRECTORY', 'dataset_pt'), data_name)
    feature_file = open(id_feature_file_path, 'w')
    feature_file.write('%d %d\n' % (len(source_data), 1))
    for id_num in features:
        feature = list()
        feature.append(id_num % 100000)
        Feature.save_feature(feature, feature_file)
    feature_file.close()
def generate_spn(config, write_loc, data_idx):
    place_upper_bound = config['place_upper_bound']
    marks_lower_limit = config['marks_lower_limit']
    marks_upper_limit = config['marks_upper_limit']
    prune_flag = config['prune_flag']
    add_token = config['add_token']
    max_place_num = config['max_place_num']
    min_place_num = config['min_place_num']

    finish = False
    while not finish:
        place_num = np.random.randint(min_place_num, max_place_num + 1)
        tran_num = place_num + np.random.randint(-3, 1)
        petri_matrix = PeGen.rand_generate_petri(place_num, tran_num)
        if prune_flag:
            petri_matrix = PeGen.prune_petri(petri_matrix)
        if add_token:
            petri_matrix = PeGen.add_token(petri_matrix)
        results_dict, finish = SPN.filter_spn(petri_matrix, place_upper_bound,
                                              marks_lower_limit, marks_upper_limit)
    DU.save_data_to_json(os.path.join(write_loc, "data%s.json" % str(data_idx)), results_dict)
def save_all_question2wordtoken(cf):
    # Load train.csv
    train_data = pd.read_csv('%s/train.csv' % cf.get('DEFAULT', 'origin_pt')).fillna(value="")  # [:100]
    # Load test.csv
    test_data = pd.read_csv('%s/test_with_qid.csv' % cf.get('DEFAULT', 'devel_pt')).fillna(value="")  # [:100]
    # Paths of the output files
    q2wt_q_fp = '%s/q2wt.all.question' % cf.get('DEFAULT', 'devel_pt')
    q2wt_wt_fp = '%s/q2wt.all.wordtoken' % cf.get('DEFAULT', 'devel_pt')
    # Build the question-to-wordtoken mapping
    all_q2wt = BTM.get_all_question2wordtoken(train_data, test_data)
    all_q = []
    all_wt = []
    for q in all_q2wt:
        all_q.append(q)
        all_wt.append(all_q2wt[q])
    # Save the results
    DataUtil.save_vector(q2wt_q_fp, all_q, 'w')
    DataUtil.save_vector(q2wt_wt_fp, all_wt, 'w')
def __init__(self, config):
    self.config = config

    # Load model
    self.model = Model(config)
    # self.model.build_test_model()
    self.model.build_variational_test_model(mode=config.test.mode)
    logging.info('build_test_variational_model done!')

    self.du = DataUtil(config)
    self.du.load_vocab(src_vocab=config.src_vocab,
                       dst_vocab=config.dst_vocab,
                       src_vocab_size=config.src_vocab_size_a,
                       dst_vocab_size=config.src_vocab_size_b)

    # Create session
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True
    self.sess = tf.Session(config=sess_config, graph=self.model.graph)

    # Restore model.
    with self.model.graph.as_default():
        saver = tf.train.Saver(tf.global_variables())
        saver.restore(self.sess, tf.train.latest_checkpoint(config.train.logdir))
def generate_cv_subset_index(cf, argv):
    """
    Generate index used for 5-fold cross validation
    :param cf: configuration file
    :param argv: parameter list
    :return: none
    """
    tag = argv[0]
    cv_num = 5
    cv_rawset_name = 'train_with_swap'
    train_data_size = 404290

    index_all = []
    for i in range(cv_num):
        index_all.append([])
    for i in range(train_data_size):
        index_all[int(random.random() * cv_num)].append(i)
    for i in range(cv_num):
        LogUtil.log('INFO', 'size(part%d)=%d' % (i, len(index_all[i])))

    index_fp = cf.get('DEFAULT', 'feature_index_pt')
    for i in range(cv_num):
        fold_id = i
        # train
        fp = '%s/cv_tag%s_n%d_f%d_train.%s.index' % (index_fp, tag, cv_num, fold_id, cv_rawset_name)
        for j in range(cv_num - 2):
            part_id = (i + j) % cv_num
            DataUtil.save_vector(fp, index_all[part_id], 'a')
        for j in range(cv_num - 2):
            part_id = (i + j) % cv_num
            DataUtil.save_vector(fp, [index + train_data_size for index in index_all[part_id]], 'a')
        # valid
        fp = '%s/cv_tag%s_n%d_f%d_valid.%s.index' % (index_fp, tag, cv_num, fold_id, cv_rawset_name)
        part_id = (fold_id + cv_num - 2) % cv_num
        DataUtil.save_vector(fp, index_all[part_id], 'w')
        # test
        fp = '%s/cv_tag%s_n%d_f%d_test.%s.index' % (index_fp, tag, cv_num, fold_id, cv_rawset_name)
        part_id = (fold_id + cv_num - 1) % cv_num
        DataUtil.save_vector(fp, index_all[part_id], 'w')
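# A small standalone check of the fold layout used above: for each fold,
# cv_num - 2 parts become train, the next part valid, and the last part test,
# so every part appears exactly once as the test part across the cv_num folds.
# Pure illustration; it does not touch the index files the function writes.
cv_num = 5
for fold_id in range(cv_num):
    train_parts = [(fold_id + j) % cv_num for j in range(cv_num - 2)]
    valid_part = (fold_id + cv_num - 2) % cv_num
    test_part = (fold_id + cv_num - 1) % cv_num
    print('fold %d -> train parts %s, valid part %d, test part %d'
          % (fold_id, train_parts, valid_part, test_part))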
def save_train_qid2question():
    # Read the configuration file
    cf = ConfigParser.ConfigParser()
    cf.read("../conf/python.conf")
    # Load train.csv
    train_data = pd.read_csv('%s/train.csv' % cf.get('DEFAULT', 'origin_pt')).fillna(value="")  # [:100]
    # Paths of the output files
    qid2question_qid_fp = '%s/qid2question.train.qid' % cf.get('DEFAULT', 'devel_pt')
    qid2question_question_fp = '%s/qid2question.train.question' % cf.get('DEFAULT', 'devel_pt')
    # Build qid2question
    train_qid2question = BTM.get_qid2question(train_data)
    train_qid = []
    train_question = []
    for qid in train_qid2question:
        train_qid.append(qid)
        train_question.append(train_qid2question[qid])
    # Save the results
    DataUtil.save_vector(qid2question_qid_fp, train_qid, 'w')
    DataUtil.save_vector(qid2question_question_fp, train_question, 'w')
def run_for_examples():
    start_time = LoggingUtil.log_start_time()
    data = DataUtil.prepare_data()

    cls = MultinomialNB()
    vect = CountVectorizer(ngram_range=(1, 2))
    train_labels = data['label'].values
    train_features = vect.fit_transform(data['email'].values)
    cls.fit(train_features, train_labels)

    examples = ['Congrats! Boss is proud of your promotion. Keep doing well. Regards.',
                'Congrats! You are lucky one to be offered a promotion!',
                'Congrats! You are promoted!',
                'Congrats! You won one million!']
    test_features = vect.transform(examples)
    predictions = cls.predict(test_features)
    print(predictions)
    LoggingUtil.log_end_time(start_time)
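# A self-contained variant of the pipeline above, using a tiny inline toy corpus
# instead of DataUtil.prepare_data(), so the CountVectorizer(ngram_range=(1, 2))
# + MultinomialNB combination can be run on its own. The toy emails and labels
# are invented placeholders, not project data.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

toy_emails = ['You won one million, claim your prize now',
              'Meeting moved to 3pm, see agenda attached',
              'Claim your free prize now',
              'Please review the attached report before the meeting']
toy_labels = ['spam', 'ham', 'spam', 'ham']

vect = CountVectorizer(ngram_range=(1, 2))
features = vect.fit_transform(toy_emails)
cls = MultinomialNB().fit(features, toy_labels)

print(cls.predict(vect.transform(['Congrats! You won a free prize!'])))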