def mdr_falt_business(qg_info, name):
    qg_set = []
    sanlei = []
    IsMatchingFault = None
    IsADRorAccident_tag = None
    for SuperFaultName in qg_info:
        qg_sql = (
            "SELECT SuperClassName,subname FROM `mdr_deviceinstrument` where Name='%s' limit 1" % name
        )
        rows_qg = mdrsql.mdr_select(qg_sql)
        if rows_qg:
            IsMatchingFault = u'是'
            for row_qg_data in rows_qg:
                s_name_1 = row_qg_data[0]
                s_name_2 = row_qg_data[1]
                compose_data = s_name_1 + ':' + s_name_2 + ':' + name
                accident_sql = (
                    "replace into mdr_fault(SuperClassName,SubName,Name,NonStandardName,IsDeviceMatching,RealFaultName) "
                    "values(%s,%s,%s,%s,%s,%s)"
                )
                accident_data = (s_name_1, s_name_2, name, None, u'是', SuperFaultName)
                mdrsql.mdr_insert_alone(accident_sql, accident_data)
                qg_set.append(SuperFaultName)
                qg_setdata = utils.data_set(qg_set)
                UnMatchFault = ""
                IsADRorAccident_tag = "2"
                sanlei.append(s_name_1)
                sanlei.append(s_name_2)
                sanlei.append(name)
        else:
            accident_sql = (
                "replace into mdr_fault(SuperClassName,SubName,Name,NonStandardName,IsDeviceMatching,RealFaultName) "
                "values(%s,%s,%s,%s,%s,%s)"
            )
            accident_data = (None, None, None, name, u'否', SuperFaultName)
            mdrsql.mdr_insert_alone(accident_sql, accident_data)
            # no standard device matched for this fault name
            IsMatchingFault = u'否'
            UnMatchFault = utils.str_to_unicode(SuperFaultName)
            qg_set.append(UnMatchFault)
            qg_setdata = utils.data_set(qg_set)
            sanlei.append(name)
    # qg_data = (
    #     data["BianMa"], data["ProvinceName"], data["District"], data["County"],
    #     data["ReportUnitName"], data["ReportUnitAddress"], data["ReportUnitTel"], data["Postalcode"],
    #     data["UnitType"], data["HappenDate"], data["KnowDate"], data["ReportDate"],
    #     data["ReportDate"], data["StateReportDate"], data["State"], StandardFault,
    #     StandardFault, IsMatchingFault, SuperFaultName, UnMatchFault)
    # qg_sql = (
    #     "replace into mdr_faultbusiness(BianMa,ProvinceName,District,County,ReportUnitName,ReportUnitAddress,ReportUnitTel,Postalcode,UnitType ,HappenDate,KnowDate,ReportDate,AcceptDate,StateReportDate,State,StandardFault,Name,IsMatchingFault,SuperFaultName,UnMatchFault)"
    #     "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    # )
    # mdrsql.mdr_insert_alone(qg_sql, qg_data)
    return (IsMatchingFault, IsADRorAccident_tag, qg_setdata, sanlei)
def read_input_data(data_dir):
    train_url = os.path.join(data_dir, 'out_x20.txt')
    train_url_y = os.path.join(data_dir, 'out_y20.txt')
    test_url = os.path.join(data_dir, 'out_x_test20.txt')
    test_url_y = os.path.join(data_dir, 'out_y_test20.txt')

    train_set_y = utils.data_set_y(train_url_y)
    test_set_y = utils.data_set_y(test_url_y)
    train_set, train_count = utils.data_set(train_url)
    test_set, test_count = utils.data_set(test_url)

    # merge the train and test portions into one set
    train_set_y.extend(test_set_y)
    train_set.extend(test_set)
    train_count.extend(test_count)
    return train_set_y, train_set, train_count
def prepare_data(bs, rate=0.95, sample=False):
    """Prepare the train set and evaluation set for the model.

    Parameters
    ----------
    bs: batch size
    rate: fraction of the labelled data used as the training set

    Returns
    -------
    train_set, eval_set: DataLoader
    """
    import numpy as np

    text_a, text_b, label = load_data("../source/train.txt")
    if sample:
        corpus_a, corpus_b, corpus_label = sample_corpus("../source/corpus.txt")
        text_a.extend(corpus_a)
        text_b.extend(corpus_b)
        label.extend(corpus_label)
    data = data_set(text_a, text_b, label)

    # randomly pick `rate` of the samples for training, the rest for evaluation
    nSamples = int(len(data) * rate)
    selected = np.random.choice(len(data), nSamples, replace=False)
    ind_train = np.zeros(len(data), dtype=bool)
    ind_train[selected] = True
    ind_eval = np.ones(len(data), dtype=bool)
    ind_eval[selected] = False

    train_set = DataLoader(TensorDataset(*data[ind_train]), bs, shuffle=True)
    eval_set = DataLoader(TensorDataset(*data[ind_eval]), 2 * bs, shuffle=False)
    return train_set, eval_set
def test(sess, model, test_url, batch_size):
    test_set, test_count, _ = utils.data_set(test_url)
    test_batches = utils.create_batches(len(test_set), batch_size, shuffle=False)
    loss_sum = 0.0
    kld_sum = 0.0
    ppx_sum = 0.0
    word_count = 0
    doc_count = 0
    for idx_batch in test_batches:
        data_batch, count_batch, mask = utils.fetch_data(
            test_set, test_count, idx_batch, FLAGS.vocab_size)
        input_feed = {model.x.name: data_batch, model.mask.name: mask}
        loss, kld = sess.run([model.objective, model.kld], input_feed)
        loss_sum += np.sum(loss)
        kld_sum += np.sum(kld) / np.sum(mask)
        word_count += np.sum(count_batch)
        # avoid division by zero for empty documents
        count_batch = np.add(count_batch, 1e-12)
        ppx_sum += np.sum(np.divide(loss, count_batch))
        doc_count += np.sum(mask)
    print_ppx = np.exp(loss_sum / word_count)
    print_ppx_perdoc = np.exp(ppx_sum / doc_count)
    print_kld = kld_sum / len(test_batches)
    print('| Epoch test: {:d} |'.format(1),
          '| Perplexity: {:.9f}'.format(print_ppx),
          '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
          '| KLD: {:.5}'.format(print_kld))
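# Every evaluation loop in this section reports the same two quantities: corpus
# perplexity, exp(total loss / total token count), and per-document perplexity,
# exp of the mean per-token loss averaged over documents. The helper below is an
# illustrative restatement of that bookkeeping only; its name and signature are
# not part of the original code.
import numpy as np

def corpus_and_perdoc_perplexity(doc_losses, doc_word_counts):
    # doc_losses: per-document negative log-likelihood (summed over words)
    # doc_word_counts: number of tokens in each document
    doc_losses = np.asarray(doc_losses, dtype=np.float64)
    doc_word_counts = np.asarray(doc_word_counts, dtype=np.float64)
    corpus_ppx = np.exp(doc_losses.sum() / doc_word_counts.sum())
    perdoc_ppx = np.exp(np.mean(doc_losses / np.maximum(doc_word_counts, 1e-12)))
    return corpus_ppx, perdoc_ppx

# Example: two documents with 10 and 20 tokens and losses of 30.0 and 80.0.
print(corpus_and_perdoc_perplexity([30.0, 80.0], [10, 20]))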
def predict(
        self,
        file="../XuChuanyi_NJU_predict.txt",
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
    self.eval()
    text_a, text_b = load_data("../source/test.txt")
    text_a.append(torch.zeros(32, dtype=torch.int))
    text_b.append(torch.zeros(32, dtype=torch.int))
    test_set = data_set(text_a, text_b)
    text_a, text_b, *_ = test_set[:-1]
    text_a.squeeze_(-1)
    text_b.squeeze_(-1)
    text_a.unsqueeze_(1)
    text_b.unsqueeze_(1)
    x = torch.cat((text_a, text_b), dim=1).to(dtype=torch.float32, device=device)
    with torch.no_grad():
        pred = self.forward(x)
    with open(file, 'w') as obj:
        for label in pred:
            # threshold the sigmoid output at 0.5
            label = 1 if label.item() > 0.5 else 0
            label = str(label) + '\n'
            obj.write(label)
    return
def get_sh_info(sh_info, data):
    sh_set = []
    is_adr = None
    clinicdetail_Name = None
    clinicdetail_SubID = None
    clinicsub_ID = None
    clinicsub_Name = None
    clinic_ID = None
    clinic_NAME = None
    IsADRorAccident_tag = ""
    for sh_info_item in sh_info:
        sh_query_sql = (
            "SELECT clinicdetail.SubID, clinicdetail.Name,clinicsub.ID,clinicsub.Name,clinic.ID,clinic.NAME "
            "FROM clinicdetail, clinicsub,clinic "
            "WHERE clinicdetail.SubID=clinicsub.ID and clinicsub.PID=clinic.ID "
            "and clinicdetail.Name= '%s' limit 1" % (sh_info_item)
        )
        rows_sh = mdrsql.mdr_select(sh_query_sql)
        if rows_sh:
            is_adr = u'是'
            for row_sh_data in rows_sh:
                # sh_s_name = row_sh_data[0]
                clinicdetail_Name = row_sh_data[1]
                clinicdetail_SubID = row_sh_data[0]
                clinicsub_ID = row_sh_data[2]
                clinicsub_Name = row_sh_data[3]
                clinic_ID = row_sh_data[4]
                clinic_NAME = row_sh_data[5]
                sh_set.append(sh_info_item)
                adr_data_list = sh_info_item
                IsADRorAccident_tag = "1"
                _un_sh_info = ""
        else:
            is_adr = u'否'
            _un_sh_info = u"[非标准:" + sh_info_item + u"]"
            sh_set.append(_un_sh_info)
            adr_data_list = utils.data_set(sh_set)
            # sh_s_name = ""
            clinicdetail_Name = ""
            clinicdetail_SubID = ""
            clinicsub_ID = ""
            clinicsub_Name = ""
            clinic_ID = ""
            clinic_NAME = ""
        sh_data = [
            data["BianMa"], data["ProvinceName"], data["District"], data["County"],
            data["ReportUnitName"], data["ReportUnitAddress"], data["ReportUnitTel"], data["Postalcode"],
            data["UnitType"], data["HappenDate"], data["KnowDate"], data["ReportDate"],
            data["ReportDate"], data["StateReportDate"], data["State"], is_adr,
            clinicdetail_Name, clinicdetail_Name, clinicdetail_SubID, clinicsub_ID,
            clinicsub_Name, clinic_ID, clinic_NAME, _un_sh_info]
        sh_sql = (
            "replace into mdr_adrbusiness(BianMa,ProvinceName,District,County,ReportUnitName,ReportUnitAddress,ReportUnitTel,Postalcode,UnitType ,HappenDate,KnowDate,ReportDate,AcceptDate,StateReportDate,State,IsMatchingADR,ADRStandardID,Name,SID1,SubID,SubName,PID,PName,UnMatchADR)"
            "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        )
        mdrsql.mdr_insert_alone(sh_sql, sh_data)
    return (is_adr, adr_data_list, IsADRorAccident_tag)
def get_icd_info(data, icd_ok):
    icd_standname = ""
    icd_set = []
    IsMatchingAffect = None
    UnMatchAffect = None
    icd_setdata = None
    _un_tag_icd_1 = utils.str_to_unicode("[非标准:")
    _un_tag_icd_2 = utils.str_to_unicode("]")
    for item in icd_ok:
        # _item = item.strip(_trim_tag).strip()
        icd_sql = (
            "SELECT StandardIcdName,icd_a_name,icd_b_name,icd_c_name,PathName "
            "FROM `mdr_icd` where StandardIcdName='%s' limit 1" % (item)
        )
        rows_icd = mdrsql.mdr_select(icd_sql)
        if rows_icd:
            IsMatchingAffect = u'是'
            UnMatchAffect = ""
            for icd_ok_info in rows_icd:
                icd_a_name = icd_ok_info[0]
                icd_b_name = icd_ok_info[1]
                AffectStandardName = icd_ok_info[2]
                PathName = icd_ok_info[3]
                icd_set.append(item)
                icd_setdata = item
                icd_standname = AffectStandardName
        else:
            IsMatchingAffect = u'否'
            un_icd_info = _un_tag_icd_1 + item + _un_tag_icd_2
            _un_icd_info = utils.str_to_unicode(un_icd_info)
            UnMatchAffect = _un_icd_info
            icd_a_name = ""
            icd_b_name = ""
            AffectStandardName = ""
            PathName = ""
            icd_standname = AffectStandardName
            icd_set.append(UnMatchAffect)
            icd_setdata = utils.data_set(icd_set)
        icd_sql = (
            "replace into mdr_icdbusiness(BianMa,ProvinceName,District,County,ReportUnitName,ReportUnitAddress,ReportUnitTel,Postalcode,UnitType ,HappenDate,KnowDate,ReportDate,AcceptDate,StateReportDate,State,IsMatchingAffect,AffectStandardName,icd_a_name,icd_b_name,icd_c_name,PathName,UnMatchAffect)"
            "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        )
        icd_data = [
            data["BianMa"], data["ProvinceName"], data["District"], data["County"],
            data["ReportUnitName"], data["ReportUnitAddress"], data["ReportUnitTel"], data["Postalcode"],
            data["UnitType"], data["HappenDate"], data["KnowDate"], data["ReportDate"],
            data["ReportDate"], data["StateReportDate"], data["State"], IsMatchingAffect,
            AffectStandardName, icd_a_name, icd_b_name, AffectStandardName,
            PathName, UnMatchAffect]
        mdrsql.mdr_insert_alone(icd_sql, icd_data)
    return (IsMatchingAffect, UnMatchAffect, icd_setdata)
def train(nvdm, train_url, optimizer, batch_size=64, training_epochs=1000):
    train_set, train_count = utils.data_set(train_url)
    for epoch in range(training_epochs):
        train_batches = utils.create_batches(len(train_set), batch_size)
        loss_sum = 0.0
        for idx_batch in train_batches:
            data_batch, count_batch, mask = utils.fetch_data(
                train_set, train_count, idx_batch, 2000)
            data_batch = torch.FloatTensor(data_batch)
            mask = torch.FloatTensor(mask)
            loss = nvdm(data_batch, mask)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_sum += loss.item()
        print(loss_sum / len(train_batches))
def main(argv=None):
    if FLAGS.non_linearity == 'tanh':
        non_linearity = tf.nn.tanh
    elif FLAGS.non_linearity == 'sigmoid':
        non_linearity = tf.nn.sigmoid
    else:
        non_linearity = tf.nn.relu

    nvdm = NVDM(vocab_size=FLAGS.vocab_size,
                n_hidden=FLAGS.n_hidden,
                n_topic=FLAGS.n_topic,
                n_sample=FLAGS.n_sample,
                learning_rate=FLAGS.learning_rate,
                batch_size=FLAGS.batch_size,
                non_linearity=non_linearity)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    init = tf.initialize_all_variables()
    sess.run(init)

    train_url = os.path.join(FLAGS.data_dir, 'train.feat')
    if not FLAGS.test:
        train(sess, nvdm, train_url, FLAGS.batch_size, FLAGS.epochs)
    else:
        # Test
        saver = tf.train.Saver()
        saver.restore(sess, os.path.join(ckpt, 'model.ckpt'))
        print("Model restored.")
        # Training data
        train_set, train_count = utils.data_set(train_url)
        evaluate(nvdm, train_set, train_count, sess, 'test')
    net['pool4'] = PoolLayer(net['conv4_2'], pool_size=2, stride=2)
    net['fc3'] = DenseLayer(net['pool4'], num_units=256)
    net['fc4'] = DenseLayer(net['fc3'], num_units=30)
    net['prob'] = NonlinearityLayer(net['fc4'], nonlinearity=identity)
    return net


if __name__ == "__main__":
    # path to train and testing data
    PATH_train = "../data/training.csv"
    PATH_test = "../data/test.csv"

    # load data
    print 'loading data'
    data = data_set(path_train=PATH_train, path_test=PATH_test)

    # drop the missing values
    print 'drop missing values'
    data.drop_missing_values()

    # center data VGG style
    print 'center alexnet'
    data.center_alexnet()

    # generate test validation split
    train_set_x, valid_set_x, train_set_y, valid_set_y = train_test_split(
        data.X, data.y, test_size=0.2, random_state=42)

    # change type and load to GPU
    print 'load data to gpu'
    train_set_x = train_set_x.reshape(-1, 1, 96, 96).astype(theano.config.floatX)
    valid_set_x = valid_set_x.reshape(-1, 1, 96, 96).astype(theano.config.floatX)
def train(sess, model, train_url, test_url, dev_url, model_url, batch_size, saver, training_epochs=400, alternate_epochs=1): """train nvctm model.""" train_set, train_count = utils.data_set(train_url) dev_set, dev_count = utils.data_set(dev_url) test_set, test_count = utils.data_set(test_url) dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False) test_batches = utils.create_batches(len(test_set), batch_size, shuffle=False) train_theta = [] train_beta = [] for epoch in range(training_epochs): train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True) # ------------------------------- # train for switch in range(0, 2): if switch == 0: optim = model.optim_dec print_mode = 'updating decoder' else: optim = model.optim_enc print_mode = 'updating encoder' for i in range(alternate_epochs): loss_sum = 0.0 ppx_sum = 0.0 kld_sum = 0.0 word_count = 0 doc_count = 0 res_sum = 0 log_sum = 0 mean_sum = 0 var_sum = 0 m = None Um = None enc = None for idx_batch in train_batches: data_batch, count_batch, mask = utils.fetch_data( train_set, train_count, idx_batch, FLAGS.vocab_size) input_feed = { model.x.name: data_batch, model.mask.name: mask } _, (loss, kld, mean, Umean, enc, rec_loss, log_s, mean_s, vk_show, theta, beta, lp, v) = sess.run((optim, [ model.objective, model.kld, model.mean, model.U, model.vk, model.recons_loss, model.log_squre, model.mean_squre, model.vk_show, model.theta, model.beta, model.log_prob, model.variance ]), input_feed) m = mean Um = Umean # print('*********************vk show', vk_show) # print('Umean', Umean[0]) loss_sum += np.sum(loss) kld_sum += np.sum(kld) / np.sum(mask) word_count += np.sum(count_batch) res_sum += np.sum(rec_loss) log_sum += np.sum(log_s) mean_sum += np.sum(mean_s) var_sum += np.sum(v) / np.sum(mask) # to avoid nan error count_batch = np.add(count_batch, 1e-12) # per document loss ppx_sum += np.sum(np.divide(loss, count_batch)) doc_count += np.sum(mask) if epoch == training_epochs - 1 and switch == 1 and i == alternate_epochs - 1: train_theta.extend(theta) train_beta.extend(beta) print_ppx = np.exp(loss_sum / word_count) # print_ppx_perdoc = np.exp(ppx_sum / doc_count) print_kld = kld_sum / len(train_batches) print_res = res_sum / len(train_batches) print_log = log_sum / len(train_batches) print_mean = mean_sum / len(train_batches) print_var = var_sum / len(train_batches) print( '| Epoch train: {:d} |'.format(epoch + 1), print_mode, '{:d}'.format(i), '| Corpus ppx: {:.5f}'.format( print_ppx), # perplexity per word # '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc), # perplexity for per doc '| KLD: {:.5}'.format(print_kld), '| stddev {:.5}'.format(print_var), '| res_loss: {:5}'.format(print_res), '| log_loss: {:5}'.format(print_log), '| mean_loss: {:5}'.format(print_mean)) with codecs.open('./nvctm_train_theta', 'wb') as fp: pickle.dump(np.array(train_theta), fp) fp.close() if (epoch + 1 ) % 50 == 0 and switch == 1 and i == alternate_epochs - 1: with codecs.open('./nvctm_train_beta', 'wb') as fp: pickle.dump(beta, fp) fp.close() npmi.print_coherence('nvctm', FLAGS.data_dir + '/train.feat', FLAGS.vocab_size) # dev loss_sum = 0.0 kld_sum = 0.0 ppx_sum = 0.0 var_sum = 0 word_count = 0 doc_count = 0 for idx_batch in dev_batches: data_batch, count_batch, mask = utils.fetch_data( dev_set, dev_count, idx_batch, FLAGS.vocab_size) input_feed = {model.x.name: data_batch, model.mask.name: mask} loss, kld, v = sess.run( [model.objective, model.kld, model.variance], input_feed) loss_sum += np.sum(loss) kld_sum += np.sum(kld) / 
np.sum(mask) var_sum += np.sum(v) / np.sum(mask) word_count += np.sum(count_batch) count_batch = np.add(count_batch, 1e-12) ppx_sum += np.sum(np.divide(loss, count_batch)) doc_count += np.sum(mask) print_ppx = np.exp(loss_sum / word_count) print_var = var_sum / len(train_batches) # print_ppx_perdoc = np.exp(ppx_sum / doc_count) print_kld = kld_sum / len(dev_batches) print('\n| Epoch dev: {:d}'.format(epoch + 1), '| Perplexity: {:.9f}'.format(print_ppx), '| stddev {:.5}'.format(print_var), '| KLD: {:.5}'.format(print_kld)) # test if FLAGS.test: loss_sum = 0.0 kld_sum = 0.0 ppx_sum = 0.0 var_sum = 0.0 word_count = 0 doc_count = 0 for idx_batch in test_batches: data_batch, count_batch, mask = utils.fetch_data( test_set, test_count, idx_batch, FLAGS.vocab_size) input_feed = {model.x.name: data_batch, model.mask.name: mask} loss, kld, v = sess.run( [model.objective, model.kld, model.variance], input_feed) loss_sum += np.sum(loss) kld_sum += np.sum(kld) / np.sum(mask) var_sum += np.sum(v) / np.sum(mask) word_count += np.sum(count_batch) count_batch = np.add(count_batch, 1e-12) ppx_sum += np.sum(np.divide(loss, count_batch)) doc_count += np.sum(mask) print_ppx = np.exp(loss_sum / word_count) print_var = var_sum / len(train_batches) # print_ppx_perdoc = np.exp(ppx_sum / doc_count) print_kld = kld_sum / len(test_batches) print('| Epoch test: {:d}'.format(epoch + 1), '| Perplexity: {:.9f}'.format(print_ppx), '| stddev {:.5}'.format(print_var), '| KLD: {:.5}\n'.format(print_kld)) npmi.print_coherence('nvctm', FLAGS.data_dir + '/train.feat', FLAGS.vocab_size) saver.save(sess, model_url)
#     temp = ConvLayer(net, num_filters=n_f, filter_size=3, stride=1, pad=1,
#                      nonlinearity=identity, flip_filters=False)
#     temp = ConvLayer(temp, num_filters=n_f, filter_size=1, stride=1, pad=0,
#                      nonlinearity=identity, flip_filters=False)
#
#     return net


if __name__ == "__main__":
    # path to train and testing data
    PATH_train = "../data/training.csv"
    PATH_test = "../data/test.csv"

    # load data
    print 'loading data \n'
    data = data_set(path_train=PATH_train, path_test=PATH_test)

    print 'sobel stacking image'
    data.stack_origi_sobel()

    # augmentation
    # data.augment()

    # center data
    # print 'center alexnet \n'
    # data.center_alexnet()
    # print 'center Xs VGG Style, X doesnt have missing values \n'
    # data.center_VGG()

    # generate test validation split
    data.split_trainval()
def train(sess, model, train_url, batch_size, training_epochs=1000, alternate_epochs=10):
    train_set, train_count = utils.data_set(train_url)
    summaries = None  # get_summaries(sess)
    writer = None  # tf.summary.FileWriter(ckpt + '/logs/', sess.graph)
    saver = tf.train.Saver()
    sess.graph.finalize()
    total_mem = 0
    mem = 0
    for epoch in range(training_epochs):
        train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True)
        for switch in range(0, 2):
            if switch == 0:
                optim = model.optim_dec
                print_mode = 'updating decoder'
            else:
                optim = model.optim_enc
                print_mode = 'updating encoder'
            for i in range(alternate_epochs):
                loss_sum = 0.0
                ppx_sum = 0.0
                kld_sum = 0.0
                word_count = 0
                doc_count = 0
                for idx_batch in train_batches:
                    data_batch, count_batch, mask = utils.fetch_data(
                        train_set, train_count, idx_batch, FLAGS.vocab_size)
                    input_feed = {model.x.name: data_batch, model.mask.name: mask}
                    _, (loss, kld) = sess.run((optim, [model.objective, model.kld]), input_feed)
                    # loss, kld = tf.cast(loss, tf.float64), tf.cast(kld, tf.float64)
                    loss_sum += np.sum(loss)
                    kld_sum += np.sum(kld) / np.sum(mask)
                    word_count += np.sum(count_batch)
                    # to avoid nan error
                    count_batch = np.add(count_batch, 1e-12)
                    # per document loss
                    ppx_sum += np.sum(np.divide(loss, count_batch))
                    doc_count += np.sum(mask)
                print_ppx = np.exp(loss_sum / word_count)
                print_ppx_perdoc = np.exp(ppx_sum / doc_count)
                print_kld = kld_sum / len(train_batches)
                print('| Epoch train: {:d} |'.format(epoch + 1),
                      print_mode, '{:d}'.format(i),
                      '| Corpus ppx: {:.5f}'.format(print_ppx),  # perplexity for all docs
                      '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),  # perplexity for per doc
                      '| KLD: {:.5}'.format(print_kld))
        evaluate(model, train_set, train_count, sess, 'val', (loss_sum + kld_sum),
                 epoch, summaries, writer, saver)
        # track the resident memory of this process (in MB) across epochs
        current_mem = process.memory_info().rss / (1024 ** 2)
        total_mem += (current_mem - mem)
        print("Memory increase: {}, Cumulative memory: {}, and current {} in MB".format(
            current_mem - mem, total_mem, current_mem))
        mem = current_mem
        gc.collect()
def train_cnn():
    import time

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    BS = 64
    LR = 0.001
    EPOCH = 30

    data = data_set(*load_data())
    boundary = 200000
    train_set = DataLoader(TensorDataset(*data[:boundary]), BS, shuffle=True)
    eval_set = DataLoader(TensorDataset(*data[boundary:]), 2 * BS, shuffle=False)

    cnn = CNN().to(device=device)
    loss_fn = nn.BCELoss()
    optim = torch.optim.Adam(cnn.parameters(), lr=LR)

    begin = time.time()
    ACC_train, ACC = [], []
    for epoch in range(EPOCH):
        cnn.train()
        acc_train = 0
        for i, (a, b, y, *_) in enumerate(train_set):
            a.squeeze_(-1)
            b.squeeze_(-1)
            a.unsqueeze_(1)
            b.unsqueeze_(1)
            x = torch.cat((a, b), dim=1).to(dtype=torch.float32, device=device)
            y = y.to(dtype=torch.float32, device=device)

            optim.zero_grad()
            y_pre = cnn(x)
            loss = loss_fn(y_pre, y)
            loss.backward()
            optim.step()

            # threshold predictions at 0.5 to count training accuracy
            y_pre[y_pre > 0.5] = 1
            y_pre[y_pre <= 0.5] = 0
            acc_train += len(torch.nonzero(y == y_pre))

            if (i + 1) % 500 == 0 or (i + 1) == len(train_set):
                time_cost = int(time.time() - begin)
                print('Time cost so far: {}h {}min {}s'.format(
                    time_cost // 3600, time_cost % 3600 // 60, time_cost % 3600 % 60 // 1))
                print("Epoch[{}/{}], Step [{}/{}], Loss {:.4f}".format(
                    epoch + 1, EPOCH, i + 1, len(train_set), loss.item()))
        acc_train /= len(train_set.dataset)
        ACC_train.append(acc_train)

        cnn.eval()
        with torch.no_grad():
            eval_loss, acc = 0, 0
            for i, (a, b, y, *_) in enumerate(eval_set):
                a.squeeze_(-1)
                b.squeeze_(-1)
                a.unsqueeze_(1)
                b.unsqueeze_(1)
                x = torch.cat((a, b), dim=1).to(dtype=torch.float32, device=device)
                y = y.to(dtype=torch.float32, device=device)
                y_pre = cnn(x)
                eval_loss += loss_fn(y_pre, y)
                y_pre[y_pre > 0.5] = 1
                y_pre[y_pre <= 0.5] = 0
                acc += len(torch.nonzero(y == y_pre))
            eval_loss /= len(eval_set)
            acc /= len(eval_set.dataset)
            ACC.append(acc)
            time_cost = int(time.time() - begin)
            print('\nTime cost so far: {}h {}min {}s'.format(
                time_cost // 3600, time_cost % 3600 // 60, time_cost % 3600 % 60 // 1))
            print('Evaluation set: loss: {:.4f}, acc: {:.4f}\n'.format(eval_loss, acc))
        if acc > 0.8:
            cnn.predict(device=device)
    return cnn, ACC_train, ACC
def train(sess, model, train_url, test_url, batch_size, vocab_size, training_epochs=200, alternate_epochs=1,#10 lexicon=[], result_file='test.txt', B=1, warm_up_period=100): """train nvdm model.""" train_set, train_count = utils.data_set(train_url) test_set, test_count = utils.data_set(test_url) # hold-out development dataset train_size=len(train_set) validation_size=int(train_size*0.1) dev_set = train_set[:validation_size] dev_count = train_count[:validation_size] train_set = train_set[validation_size:] train_count = train_count[validation_size:] print('sizes',train_size,validation_size,len(dev_set),len(train_set)) optimize_jointly = True dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False) test_batches = utils.create_batches(len(test_set), batch_size, shuffle=False) warm_up = 0 start_min_alpha = 0.00001 min_alpha = start_min_alpha warm_up_alpha=False start_B=4 curr_B=B #for early stopping best_print_ana_ppx=1e10 early_stopping_iters=30 no_improvement_iters=0 stopped=False epoch=-1 #for epoch in range(training_epochs): while not stopped: epoch+=1 train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True) if warm_up<1.: warm_up += 1./warm_up_period else: warm_up=1. # train #for switch in range(0, 2): if optimize_jointly: optim = model.optim_all print_mode = 'updating encoder and decoder' elif switch == 0: optim = model.optim_dec print_mode = 'updating decoder' else: optim = model.optim_enc print_mode = 'updating encoder' for i in range(alternate_epochs): loss_sum = 0.0 ana_loss_sum = 0.0 ppx_sum = 0.0 kld_sum = 0.0 ana_kld_sum = 0.0 word_count = 0 doc_count = 0 recon_sum=0.0 for idx_batch in train_batches: data_batch, count_batch, mask = utils.fetch_data( train_set, train_count, idx_batch, vocab_size) input_feed = {model.x.name: data_batch, model.mask.name: mask,model.keep_prob.name: 0.75,model.warm_up.name: warm_up,model.min_alpha.name:min_alpha,model.B.name: curr_B} _, (loss,recon, kld,ana_loss,ana_kld) = sess.run((optim, [model.true_objective, model.recons_loss, model.kld,model.analytical_objective,model.analytical_kld]), input_feed) loss_sum += np.sum(loss) ana_loss_sum += np.sum(ana_loss) kld_sum += np.sum(kld) / np.sum(mask) ana_kld_sum += np.sum(ana_kld) / np.sum(mask) word_count += np.sum(count_batch) # to avoid nan error count_batch = np.add(count_batch, 1e-12) # per document loss ppx_sum += np.sum(np.divide(loss, count_batch)) doc_count += np.sum(mask) recon_sum+=np.sum(recon) print_loss = recon_sum/len(train_batches) dec_vars = utils.variable_parser(tf.trainable_variables(), 'decoder') phi = dec_vars[0] phi = sess.run(phi) utils.print_top_words(phi, lexicon,result_file=None) print_ppx = np.exp(loss_sum / word_count) print_ana_ppx = np.exp(ana_loss_sum / word_count) print_ppx_perdoc = np.exp(ppx_sum / doc_count) print_kld = kld_sum/len(train_batches) print_ana_kld = ana_kld_sum/len(train_batches) print('| Epoch train: {:d} |'.format(epoch+1), print_mode, '{:d}'.format(i), '| Corpus ppx: {:.5f}'.format(print_ppx), # perplexity for all docs '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc), # perplexity for per doc '| KLD: {:.5}'.format(print_kld), '| Loss: {:.5}'.format(print_loss), '| ppx anal.: {:.5f}'.format(print_ana_ppx), '|KLD anal.: {:.5f}'.format(print_ana_kld)) if warm_up_alpha: if min_alpha>0.0001: min_alpha-=(start_min_alpha-0.0001)/training_epochs #------------------------------- # dev loss_sum = 0.0 kld_sum = 0.0 ppx_sum = 0.0 word_count = 0 doc_count = 0 recon_sum=0.0 print_ana_ppx = 0.0 ana_loss_sum = 0.0 for idx_batch in 
dev_batches: data_batch, count_batch, mask = utils.fetch_data( dev_set, dev_count, idx_batch, vocab_size) input_feed = {model.x.name: data_batch, model.mask.name: mask,model.keep_prob.name: 1.0,model.warm_up.name: 1.0,model.min_alpha.name:min_alpha,model.B.name: B}#,model.B.name: B loss,recon, kld,ana_loss = sess.run([model.objective, model.recons_loss, model.analytical_kld,model.analytical_objective], input_feed) loss_sum += np.sum(loss) ana_loss_sum += np.sum(ana_loss) kld_sum += np.sum(kld) / np.sum(mask) word_count += np.sum(count_batch) count_batch = np.add(count_batch, 1e-12) ppx_sum += np.sum(np.divide(loss, count_batch)) doc_count += np.sum(mask) recon_sum+=np.sum(recon) print_ana_ppx = np.exp(ana_loss_sum / word_count) print_ppx = np.exp(loss_sum / word_count) print_ppx_perdoc = np.exp(ppx_sum / doc_count) print_kld = kld_sum/len(dev_batches) print_loss = recon_sum/len(dev_batches) if print_ana_ppx<best_print_ana_ppx: no_improvement_iters=0 best_print_ana_ppx=print_ana_ppx #check on validation set, if ppx better-> save improved model tf.train.Saver().save(sess, 'models/improved_model_bernoulli') else: no_improvement_iters+=1 print('no_improvement_iters',no_improvement_iters,'best ppx',best_print_ana_ppx) if no_improvement_iters>=early_stopping_iters: #if model has not improved for 30 iterations, stop training ###########STOP TRAINING############ stopped=True print('stop training after',epoch,'iterations,no_improvement_iters',no_improvement_iters) ###########LOAD BEST MODEL########## print('load stored model') tf.train.Saver().restore(sess,'models/improved_model_bernoulli') print('| Epoch dev: {:d} |'.format(epoch+1), '| Perplexity: {:.9f}'.format(print_ppx), '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc), '| KLD: {:.5}'.format(print_kld) , '| Loss: {:.5}'.format(print_loss)) #------------------------------- # test #if epoch%10==0 or epoch==training_epochs-1: if FLAGS.test: #if epoch==training_epochs-1: if stopped: #only do it once in the end coherence=utils.topic_coherence(test_set,phi, lexicon) print('topic coherence',str(coherence)) loss_sum = 0.0 kld_sum = 0.0 ppx_sum = 0.0 word_count = 0 doc_count = 0 recon_sum = 0.0 ana_loss_sum = 0.0 ana_kld_sum = 0.0 for idx_batch in test_batches: data_batch, count_batch, mask = utils.fetch_data( test_set, test_count, idx_batch, vocab_size) input_feed = {model.x.name: data_batch, model.mask.name: mask,model.keep_prob.name: 1.0,model.warm_up.name: 1.0,model.min_alpha.name:min_alpha,model.B.name: B} loss, recon,kld,ana_loss,ana_kld = sess.run([model.objective, model.recons_loss,model.kld,model.analytical_objective,model.analytical_kld], input_feed) loss_sum += np.sum(loss) kld_sum += np.sum(kld)/np.sum(mask) ana_loss_sum += np.sum(ana_loss) ana_kld_sum += np.sum(ana_kld) / np.sum(mask) word_count += np.sum(count_batch) count_batch = np.add(count_batch, 1e-12) ppx_sum += np.sum(np.divide(loss, count_batch)) doc_count += np.sum(mask) recon_sum+=np.sum(recon) print_loss = recon_sum/len(test_batches) print_ppx = np.exp(loss_sum / word_count) print_ppx_perdoc = np.exp(ppx_sum / doc_count) print_kld = kld_sum/len(test_batches) print_ana_ppx = np.exp(ana_loss_sum / word_count) print_ana_kld = ana_kld_sum/len(train_batches) print('| Epoch test: {:d} |'.format(epoch+1), '| Perplexity: {:.9f}'.format(print_ppx), '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc), '| KLD: {:.5}'.format(print_kld), '| Loss: {:.5}'.format(print_loss), '| ppx anal.: {:.5f}'.format(print_ana_ppx), '|KLD anal.: {:.5f}'.format(print_ana_kld))
def train(sess, model, train_url, test_url, batch_size, FLAGS, train_csv_filename, dev_csv_filename, test_csv_filename, training_epochs=1000, alternate_epochs=10, is_restore=False): """train nvdm model.""" train_set, train_count = utils.data_set(train_url) test_set, test_count = utils.data_set(test_url) # hold-out development dataset dev_set = test_set[:50] dev_count = test_count[:50] dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False) test_batches = utils.create_batches(len(test_set), batch_size, shuffle=False) #save model saver = tf.train.Saver() if is_restore: saver.restore(sess, "./checkpoints/model.ckpt") for epoch in range(training_epochs): train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True) #------------------------------- # train for switch in xrange(0, 2): if switch == 0: optim = model.optim_dec print_mode = 'updating decoder' else: optim = model.optim_enc print_mode = 'updating encoder' for i in xrange(alternate_epochs): loss_sum = 0.0 ppx_sum = 0.0 kld_sum = 0.0 word_count = 0 doc_count = 0 for idx_batch in train_batches: data_batch, count_batch, mask = utils.fetch_data( train_set, train_count, idx_batch, FLAGS.vocab_size) input_feed = { model.x.name: data_batch, model.mask.name: mask } _, (loss, kld) = sess.run( (optim, [model.objective, model.kld]), input_feed) loss_sum += np.sum(loss) kld_sum += np.sum(kld) / np.sum(mask) word_count += np.sum(count_batch) # to avoid nan error count_batch = np.add(count_batch, 1e-12) # per document loss ppx_sum += np.sum(np.divide(loss, count_batch)) doc_count += np.sum(mask) print_ppx = np.exp(loss_sum / word_count) print_ppx_perdoc = np.exp(ppx_sum / doc_count) print_kld = kld_sum / len(train_batches) with open(train_csv_filename, 'a') as train_csv: train_writer = csv.writer(train_csv, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) train_writer.writerow([ epoch + 1, print_mode, i, print_ppx, print_ppx_perdoc, print_kld ]) print( '| Epoch train: {:d} |'.format(epoch + 1), print_mode, '{:d}'.format(i), '| Corpus ppx: {:.5f}'.format( print_ppx), # perplexity for all docs '| Per doc ppx: {:.5f}'.format( print_ppx_perdoc), # perplexity for per doc '| KLD: {:.5}'.format(print_kld)) #------------------------------- # dev loss_sum = 0.0 kld_sum = 0.0 ppx_sum = 0.0 word_count = 0 doc_count = 0 for idx_batch in dev_batches: data_batch, count_batch, mask = utils.fetch_data( dev_set, dev_count, idx_batch, FLAGS.vocab_size) input_feed = {model.x.name: data_batch, model.mask.name: mask} loss, kld = sess.run([model.objective, model.kld], input_feed) loss_sum += np.sum(loss) kld_sum += np.sum(kld) / np.sum(mask) word_count += np.sum(count_batch) count_batch = np.add(count_batch, 1e-12) ppx_sum += np.sum(np.divide(loss, count_batch)) doc_count += np.sum(mask) print_ppx = np.exp(loss_sum / word_count) print_ppx_perdoc = np.exp(ppx_sum / doc_count) print_kld = kld_sum / len(dev_batches) with open(dev_csv_filename, 'a') as dev_csv: dev_writer = csv.writer(dev_csv, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) dev_writer.writerow( [epoch + 1, print_ppx, print_ppx_perdoc, print_kld]) print('| Epoch dev: {:d} |'.format(epoch + 1), '| Perplexity: {:.9f}'.format(print_ppx), '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc), '| KLD: {:.5}'.format(print_kld)) #------------------------------- # test if FLAGS.test: loss_sum = 0.0 kld_sum = 0.0 ppx_sum = 0.0 word_count = 0 doc_count = 0 for idx_batch in test_batches: data_batch, count_batch, mask = utils.fetch_data( test_set, test_count, idx_batch, 
FLAGS.vocab_size) input_feed = {model.x.name: data_batch, model.mask.name: mask} loss, kld = sess.run([model.objective, model.kld], input_feed) loss_sum += np.sum(loss) kld_sum += np.sum(kld) / np.sum(mask) word_count += np.sum(count_batch) count_batch = np.add(count_batch, 1e-12) ppx_sum += np.sum(np.divide(loss, count_batch)) doc_count += np.sum(mask) print_ppx = np.exp(loss_sum / word_count) print_ppx_perdoc = np.exp(ppx_sum / doc_count) print_kld = kld_sum / len(test_batches) with open(test_csv_filename, 'a') as test_csv: test_writer = csv.writer(test_csv, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) test_writer.writerow( [epoch + 1, print_ppx, print_ppx_perdoc, print_kld]) print('| Epoch test: {:d} |'.format(epoch + 1), '| Perplexity: {:.9f}'.format(print_ppx), '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc), '| KLD: {:.5}'.format(print_kld))
def train(sess, model, train_url, test_url, batch_size,
          training_epochs=1000, alternate_epochs=10):
    """train gsm model."""
    # train_set: one bag-of-words vector of dimension vocab_size per document,
    #            each entry holding the count of the corresponding word;
    # train_count: total number of words in the training set
    train_set, train_count = utils.data_set(train_url)
    test_set, test_count = utils.data_set(test_url)
    # hold-out development dataset: the first 50 documents of the test set
    dev_set = test_set[:50]
    dev_count = test_count[:50]

    dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False)
    test_batches = utils.create_batches(len(test_set), batch_size, shuffle=False)

    for epoch in range(training_epochs):
        # create batches of size batch_size
        train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True)
        # -------------------------------
        # train
        for switch in range(0, 2):
            if switch == 0:
                optimize = model.optimize_dec
                print_mode = 'updating decoder'
            elif switch == 1:
                optimize = model.optimize_enc
                print_mode = 'updating encoder'
            for i in range(alternate_epochs):
                loss_sum = 0.0
                ppx_sum = 0.0
                kld_sum = 0.0
                word_count = 0
                doc_count = 0
                # train on each batch
                for idx_batch in train_batches:
                    '''
                    data_batch: word-count vectors of the current batch, batch_size * vocab_size
                    count_batch: number of words in each document of the current batch
                    train_set: training set
                    train_count: word counts of the training set
                    idx_batch: current batch
                    mask: pads the batch when it contains fewer documents than batch_size
                    '''
                    data_batch, count_batch, mask = utils.fetch_data(
                        train_set, train_count, idx_batch, FLAGS.vocab_size)
                    # input: x = data_batch, mask = mask
                    input_feed = {model.x.name: data_batch, model.mask.name: mask}
                    # return: loss = objective, kld = kld, optimizer = optimize;
                    # input_feed maps the model's tensors to concrete values
                    _, (loss, kld) = sess.run(
                        (optimize, [model.objective, model.kld]), input_feed)
                    loss_sum += np.sum(loss)
                    kld_sum += np.sum(kld) / np.sum(mask)
                    # total word count
                    word_count += np.sum(count_batch)
                    # to avoid nan error (zero denominator)
                    count_batch = np.add(count_batch, 1e-12)
                    # per document loss
                    ppx_sum += np.sum(np.divide(loss, count_batch))
                    doc_count += np.sum(mask)
                print_ppx = np.exp(loss_sum / word_count)
                print_ppx_perdoc = np.exp(ppx_sum / doc_count)
                print_kld = kld_sum / len(train_batches)
                print('| Epoch train: {:d} |'.format(epoch + 1),
                      print_mode, '{:d}'.format(i + 1),
                      '| Corpus ppx: {:.5f}'.format(print_ppx),  # perplexity for all docs
                      '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),  # perplexity for per doc
                      '| KLD: {:.5}'.format(print_kld))
        # -------------------------------
        # dev
        loss_sum = 0.0
        kld_sum = 0.0
        ppx_sum = 0.0
        word_count = 0
        doc_count = 0
        for idx_batch in dev_batches:
            data_batch, count_batch, mask = utils.fetch_data(
                dev_set, dev_count, idx_batch, FLAGS.vocab_size)
            input_feed = {model.x.name: data_batch, model.mask.name: mask}
            loss, kld = sess.run([model.objective, model.kld], input_feed)
            loss_sum += np.sum(loss)
            kld_sum += np.sum(kld) / np.sum(mask)
            word_count += np.sum(count_batch)
            count_batch = np.add(count_batch, 1e-12)
            ppx_sum += np.sum(np.divide(loss, count_batch))
            doc_count += np.sum(mask)
        print_ppx = np.exp(loss_sum / word_count)
        print_ppx_perdoc = np.exp(ppx_sum / doc_count)
        print_kld = kld_sum / len(dev_batches)
        print('| Epoch dev: {:d} |'.format(epoch + 1),
              '| Perplexity: {:.9f}'.format(print_ppx),
              '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
              '| KLD: {:.5}'.format(print_kld))
        # -------------------------------
        # test
        if FLAGS.test:
            loss_sum = 0.0
            kld_sum = 0.0
            ppx_sum = 0.0
            word_count = 0
            doc_count = 0
            for idx_batch in test_batches:
                data_batch, count_batch, mask = utils.fetch_data(
                    test_set, test_count, idx_batch, FLAGS.vocab_size)
                input_feed = {model.x.name: data_batch, model.mask.name: mask}
                loss, kld = sess.run([model.objective, model.kld], input_feed)
                loss_sum += np.sum(loss)
                kld_sum += np.sum(kld) / np.sum(mask)
                word_count += np.sum(count_batch)
                count_batch = np.add(count_batch, 1e-12)
                ppx_sum += np.sum(np.divide(loss, count_batch))
                doc_count += np.sum(mask)
            print_ppx = np.exp(loss_sum / word_count)
            print_ppx_perdoc = np.exp(ppx_sum / doc_count)
            print_kld = kld_sum / len(test_batches)
            print('| Epoch test: {:d} |'.format(epoch + 1),
                  '| Perplexity: {:.9f}'.format(print_ppx),
                  '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
                  '| KLD: {:.5}'.format(print_kld))
def read_input_data_tweets(data_dir, path_x, path_y):
    data_url = os.path.join(data_dir, path_x)
    data_url_y = os.path.join(data_dir, path_y)
    data_set_y = utils.data_set_y(data_url_y)
    data_set, data_count = utils.data_set(data_url)
    return data_set_y, data_set, data_count
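# Nearly every loader and training loop in this section goes through
# utils.data_set, utils.create_batches, and utils.fetch_data, none of which are
# shown here (and some snippets use variants that also return an alignment or a
# label list). The sketch below is a minimal stand-in for that interface so the
# loops above can be read end to end; the `label id:count ...` line format and
# the pad-with--1 batch convention are assumptions, not the project's actual
# utils module.
import random
import numpy as np

def data_set(url):
    """Read a `.feat`-style bag-of-words file, one document per line."""
    docs, counts = [], []
    with open(url) as f:
        for line in f:
            items = line.strip().split()
            if not items:
                continue
            doc = {}
            for tok in items[1:]:          # items[0] is assumed to be the label
                wid, cnt = tok.split(':')
                doc[int(wid)] = float(cnt)
            docs.append(doc)
            counts.append(sum(doc.values()))
    return docs, counts

def create_batches(n_docs, batch_size, shuffle=False):
    idx = list(range(n_docs))
    if shuffle:
        random.shuffle(idx)
    # pad the last batch with -1 so every batch has exactly batch_size entries
    batches = []
    for start in range(0, n_docs, batch_size):
        batch = idx[start:start + batch_size]
        batch += [-1] * (batch_size - len(batch))
        batches.append(batch)
    return batches

def fetch_data(docs, counts, idx_batch, vocab_size):
    """Densify one batch; mask is 0 for padded slots (word ids assumed < vocab_size)."""
    batch_size = len(idx_batch)
    data = np.zeros((batch_size, vocab_size), dtype=np.float32)
    count = np.zeros(batch_size, dtype=np.float32)
    mask = np.zeros(batch_size, dtype=np.float32)
    for row, doc_id in enumerate(idx_batch):
        if doc_id == -1:
            continue
        for wid, cnt in docs[doc_id].items():
            data[row, wid] = cnt
        count[row] = counts[doc_id]
        mask[row] = 1.0
    return data, count, mask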
def serve(): train_test_url = opj(args.data_dir, 'train_test.feat') entity_map_url = opj(args.data_dir, 'entity.map') feat_map_url = opj(args.data_dir, 'vocab.new') entity_sent_url = opj(args.data_dir, 'entities.sentences') guid2name = {} guid2id = {} id2guid = {} guid2sent = {} # The train_test.feat file contains some entities such as number 1997 # that has no features. Its feature line is blank. # These entities were removed while training the neural network architecture. # Therefore to map the embeddings in NVGE back to the KB we need to use this # alignment information. This information is not necessary for BS because BS # can easily handle the fact that some entities have no features (ie. the ) # document is empty. data_set, data_count, alignment = utils.data_set(train_test_url) for idx, row in enumerate( codecs.open(entity_map_url, 'r', 'utf-8').read().split('\n')): if row == '': continue dbid, canonical = row.split('\t') guid2name[dbid] = canonical if idx in alignment: guid2id[dbid] = alignment[idx] id2guid[alignment[idx]] = dbid GUID2SENT_PKL_FILE = opj(args.data_dir, os.path.pardir, 'guid2sent.pkl') try: print 'Loading', GUID2SENT_PKL_FILE guid2sent = pkl.load(open(GUID2SENT_PKL_FILE)) except: print 'Could not find', GUID2SENT_PKL_FILE concrete_entity_files = os.listdir(args.concrete_entity_dir) for commidx, filename in enumerate(concrete_entity_files): print '%-5d\r' % ((commidx * 100) / len(concrete_entity_files)), comm = read_communication_from_file( opj(args.concrete_entity_dir, filename)) guid = comm.id for sent in comm.sectionList[0].sentenceList: uuid = sent.uuid.uuidString tokens = [ e.text for e in sent.tokenization.tokenList.tokenList ] try: guid2sent[guid].append((uuid, tokens)) except KeyError: guid2sent[guid] = [(uuid, tokens)] with open(GUID2SENT_PKL_FILE, 'wb') as gpf: print 'Dumping', GUID2SENT_PKL_FILE pkl.dump(guid2sent, gpf) # for row in codecs.open(entity_sent_url, 'r', 'utf-8').read().split('\n'): # row = row.split(' ||| ') # guid = row[0] # for sent in row[1:]: # tokens = sent.split() # try: # guid2sent[guid].append(tokens) # except KeyError: # guid2sent[guid] = [tokens] id2feat_data = codecs.open(feat_map_url, 'r', 'utf-8').read().split('\n') id2feat = dict((((sum(1 for e in id2feat_data if e != '') - 1) if idx == 0 else (idx - 1)), row.split()[0]) for idx, row in enumerate(id2feat_data) if row != '') print('Checking feature size =', len(data_set[guid2id[":Entity_ENG_EDL_0092354"]]), 'for', guid2name[":Entity_ENG_EDL_0092354"], 'max(id2feat.values())', max(id2feat.keys())) def load(args): import cPickle as pkl with open(opj(args.data_dir, args.model_pkl), 'rb') as f: nnp = pkl.load(f) return nnp handler = EntitySearchProvider( args.language, NVBS(data_set=data_set, nnp=load(args), method=getattr(NVBSALGO, args.algorithm), opts=args, id2guid=id2guid, guid2id=guid2id, guid2name=guid2name, guid2sent=guid2sent, id2feat=id2feat), args.k_query, args.k_rationale) server = SearchServiceWrapper(handler) if args.serve: print('Starting NVBS Server') server.serve(args.host, args.port) else: return handler.index
def train( train_url, test_url, model_url, vocab_url, non_linearity, embedding_url, training_epochs, alternate_epochs, vocab_size, embedding_size, n_hidden, n_topic, n_sample, learning_rate, batch_size, is_training, mix_num, ): """train crntm model.""" train_set, train_count = utils.data_set(train_url) test_set, test_count = utils.data_set(test_url) vocab = utils.get_vocab(vocab_url) embedding_table = utils.load_embedding( embedding_url, embedding_size, vocab, FLAGS.data_dir + '/vocab_embedding-{}.pkl'.format(embedding_size)) # hold-out development dataset dev_count = test_count[:50] dev_onehot_set = test_set[:50] dev_batches = utils.create_batches(len(dev_onehot_set), batch_size, shuffle=False) test_batches = utils.create_batches(len(test_set), batch_size, shuffle=False) # create model crntm = CRNTM(vocab_size=vocab_size, embedding_size=embedding_size, n_hidden=n_hidden, n_topic=n_topic, n_sample=n_sample, learning_rate=learning_rate, batch_size=batch_size, non_linearity=non_linearity, embedding_table=embedding_table, is_training=is_training, mix_num=mix_num) crntm.construct_model() sess = tf.Session() init = tf.initialize_all_variables() sess.run(init) model = crntm saver = tf.train.Saver() # # if RESTORE: # return embedding_table[1:] for epoch in range(training_epochs): train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True) #------------------------------- # train for switch in range(0, 2): if switch == 0: optim = model.optim_dec print_mode = 'updating decoder' else: optim = model.optim_enc print_mode = 'updating encoder' for i in range(alternate_epochs): loss_sum = 0.0 ppx_sum = 0.0 kld_sum = 0.0 word_count = 0 doc_count = 0 res_sum = 0 log_sum = 0 r_sum = 0 log_s = None r_loss = None g_loss = None for bn, idx_batch in enumerate(train_batches): data_onehot_batch, count_batch, mask = utils.fetch_data( train_set, train_count, idx_batch, FLAGS.vocab_size) input_feed = { model.x_onehot.name: data_onehot_batch, model.mask.name: mask } _, (loss, kld, rec_loss, log_s, r_loss, g_loss) = sess.run( (optim, [ model.objective, model.kld, model.recons_loss, model.logits, model.doc_vec, model.topic_word_prob ]), input_feed) # if switch==0: # # # print(bn, len(train_batches), mask.sum(), r_loss.shape) # print('ptheta', log_s) # print('doc_Vec', r_loss) # print('topic_prob', g_loss) res_sum += np.sum(rec_loss) log_sum += np.sum(log_s) loss_sum += np.sum(loss) r_sum += np.sum(r_loss) kld_sum += np.sum(kld) / np.sum(mask) word_count += np.sum(count_batch) # to avoid nan error count_batch = np.add(count_batch, 1e-12) # per document loss ppx_sum += np.sum(np.divide(loss, count_batch)) # print(np.sum(np.divide(loss, count_batch))) doc_count += np.sum(mask) # if doc_count>11264: # print('debug:: ', doc_count, rec_loss, kld, loss[-1], count_batch[-1]) print_ppx = np.exp(loss_sum / word_count) print_ppx_perdoc = np.exp(ppx_sum / doc_count) print_kld = kld_sum / len(train_batches) print_res = res_sum / len(train_batches) print_log = log_sum / len(train_batches) print_mean = r_sum / len(train_batches) message = '| Epoch train: {:d} | {} {:d} | Corpus ppx: {:.5f}::{} | Per doc ppx: {:.5f}::{} | KLD: {:.5} | res_loss: {:5} | log_loss: {:5} | r_loss: {:5}'.format( epoch + 1, print_mode, i, print_ppx, word_count, print_ppx_perdoc, doc_count, print_kld, print_res, print_log, print_mean, ) print(message) write_result(message) TopicWords(sess, vocab_url, embedding_table[1:]) #------------------------------- # dev loss_sum = 0.0 ppx_sum = 0.0 kld_sum = 0.0 word_count = 0 doc_count = 0 res_sum = 
0 log_sum = 0 mean_sum = 0 r_sum = 0 for idx_batch in dev_batches: data_onehot_batch, count_batch, mask = utils.fetch_data( dev_onehot_set, dev_count, idx_batch, FLAGS.vocab_size) input_feed = { model.x_onehot.name: data_onehot_batch, model.mask.name: mask } loss, kld, rec_loss, log_s, r_loss = sess.run([ model.objective, model.kld, model.recons_loss, model.embedding_loss, model.res_loss ], input_feed) res_sum += np.sum(rec_loss) log_sum += np.sum(log_s) loss_sum += np.sum(loss) r_sum += np.sum(r_loss) kld_sum += np.sum(kld) / np.sum(mask) word_count += np.sum(count_batch) # to avoid nan error count_batch = np.add(count_batch, 1e-12) # per document loss ppx_sum += np.sum(np.divide(loss, count_batch)) # print(np.sum(np.divide(loss, count_batch))) doc_count += np.sum(mask) # if doc_count>11264: # print('debug:: ', doc_count, rec_loss, kld, loss[-1], count_batch[-1]) print_ppx = np.exp(loss_sum / word_count) print_ppx_perdoc = np.exp(ppx_sum / doc_count) # print_ppx_perdoc = ppx_sum / doc_count # print(loss_sum, word_count) print_kld = kld_sum / len(train_batches) print_res = res_sum / len(train_batches) print_log = log_sum / len(train_batches) print_mean = r_sum / len(train_batches) message = '| Epoch dev: {:d} | Corpus ppx: {:.5f}::{} | Per doc ppx: {:.5f}::{} | KLD: {:.5} | res_loss: {:5} | log_loss: {:5} | r_loss: {:5}'.format( epoch + 1, print_ppx, word_count, print_ppx_perdoc, doc_count, print_kld, print_res, print_log, print_mean, ) print(message) write_result(message) # test if FLAGS.test: loss_sum = 0.0 kld_sum = 0.0 ppx_sum = 0.0 word_count = 0 doc_count = 0 for idx_batch in test_batches: data_onehot_batch, count_batch, mask = utils.fetch_data( test_set, test_count, idx_batch, FLAGS.vocab_size) input_feed = { model.x_onehot.name: data_onehot_batch, model.mask.name: mask } loss, kld = sess.run([model.objective, model.kld], input_feed) loss_sum += np.sum(loss) kld_sum += np.sum(kld) / np.sum(mask) word_count += np.sum(count_batch) count_batch = np.add(count_batch, 1e-12) ppx_sum += np.sum(np.divide(loss, count_batch)) doc_count += np.sum(mask) print_ppx = np.exp(loss_sum / word_count) print_ppx_perdoc = np.exp(ppx_sum / doc_count) print_kld = kld_sum / len(test_batches) message = '| Epoch test: {:d} | Corpus ppx: {:.5f} | Per doc ppx: {:.5f} | KLD: {:.5} '.format( epoch + 1, print_ppx, print_ppx_perdoc, print_kld, ) print(message) write_result(message) saver.save(sess, model_url)
def train(sess, model, train_url, test_url, batch_size,
          training_epochs=1000, alternate_epochs=10):
    """train nvdm model."""
    train_set, train_count = utils.data_set(train_url)
    test_set, test_count = utils.data_set(test_url)
    # hold-out development dataset
    dev_set = test_set[:50]
    dev_count = test_count[:50]

    dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False)
    test_batches = utils.create_batches(len(test_set), batch_size, shuffle=False)

    for epoch in range(training_epochs):
        train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True)
        #-------------------------------
        # train
        for switch in range(0, 2):
            if switch == 0:
                optim = model.optim_dec
                print_mode = 'updating decoder'
            else:
                optim = model.optim_enc
                print_mode = 'updating encoder'
            for i in range(alternate_epochs):
                loss_sum = 0.0
                ppx_sum = 0.0
                kld_sum = 0.0
                word_count = 0
                doc_count = 0
                for idx_batch in train_batches:
                    data_batch, count_batch, mask = utils.fetch_data(
                        train_set, train_count, idx_batch, FLAGS.vocab_size)
                    input_feed = {model.x.name: data_batch, model.mask.name: mask}
                    _, (loss, kld) = sess.run(
                        (optim, [model.objective, model.kld]), input_feed)
                    loss_sum += np.sum(loss)
                    kld_sum += np.sum(kld) / np.sum(mask)
                    word_count += np.sum(count_batch)
                    # to avoid nan error
                    count_batch = np.add(count_batch, 1e-12)
                    # per document loss
                    ppx_sum += np.sum(np.divide(loss, count_batch))
                    doc_count += np.sum(mask)
                print_ppx = np.exp(loss_sum / word_count)
                print_ppx_perdoc = np.exp(ppx_sum / doc_count)
                print_kld = kld_sum / len(train_batches)
                print('| Epoch train: {:d} |'.format(epoch + 1),
                      print_mode, '{:d}'.format(i),
                      '| Corpus ppx: {:.5f}'.format(print_ppx),  # perplexity for all docs
                      '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),  # perplexity for per doc
                      '| KLD: {:.5}'.format(print_kld))
        #-------------------------------
        # dev
        loss_sum = 0.0
        kld_sum = 0.0
        ppx_sum = 0.0
        word_count = 0
        doc_count = 0
        for idx_batch in dev_batches:
            data_batch, count_batch, mask = utils.fetch_data(
                dev_set, dev_count, idx_batch, FLAGS.vocab_size)
            input_feed = {model.x.name: data_batch, model.mask.name: mask}
            loss, kld = sess.run([model.objective, model.kld], input_feed)
            loss_sum += np.sum(loss)
            kld_sum += np.sum(kld) / np.sum(mask)
            word_count += np.sum(count_batch)
            count_batch = np.add(count_batch, 1e-12)
            ppx_sum += np.sum(np.divide(loss, count_batch))
            doc_count += np.sum(mask)
        print_ppx = np.exp(loss_sum / word_count)
        print_ppx_perdoc = np.exp(ppx_sum / doc_count)
        print_kld = kld_sum / len(dev_batches)
        print('| Epoch dev: {:d} |'.format(epoch + 1),
              '| Perplexity: {:.9f}'.format(print_ppx),
              '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
              '| KLD: {:.5}'.format(print_kld))
        #-------------------------------
        # test
        if FLAGS.test:
            loss_sum = 0.0
            kld_sum = 0.0
            ppx_sum = 0.0
            word_count = 0
            doc_count = 0
            for idx_batch in test_batches:
                data_batch, count_batch, mask = utils.fetch_data(
                    test_set, test_count, idx_batch, FLAGS.vocab_size)
                input_feed = {model.x.name: data_batch, model.mask.name: mask}
                loss, kld = sess.run([model.objective, model.kld], input_feed)
                loss_sum += np.sum(loss)
                kld_sum += np.sum(kld) / np.sum(mask)
                word_count += np.sum(count_batch)
                count_batch = np.add(count_batch, 1e-12)
                ppx_sum += np.sum(np.divide(loss, count_batch))
                doc_count += np.sum(mask)
            print_ppx = np.exp(loss_sum / word_count)
            print_ppx_perdoc = np.exp(ppx_sum / doc_count)
            print_kld = kld_sum / len(test_batches)
            print('| Epoch test: {:d} |'.format(epoch + 1),
                  '| Perplexity: {:.9f}'.format(print_ppx),
                  '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
                  '| KLD: {:.5}'.format(print_kld))
def train(sess, model, train_url, test_url, dev_url, batch_size, training_epochs=1000, alternate_epochs=1): """train gsm model.""" train_set, train_count = utils.data_set(train_url) test_set, test_count = utils.data_set(test_url) dev_set, dev_count = utils.data_set(dev_url) dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False) test_batches = utils.create_batches(len(test_set), batch_size, shuffle=False) kld_list = [] var_list = [] train_theta = [] train_beta = [] test_theta = [] test_beta = [] for epoch in range(training_epochs): train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True) # ------------------------------- # train for switch in range(0, 2): if switch == 0: optimize = model.optimize_dec print_mode = 'updating decoder' elif switch == 1: optimize = model.optimize_enc print_mode = 'updating encoder' for i in range(alternate_epochs): loss_sum = 0.0 ppx_sum = 0.0 kld_sum = 0.0 word_count = 0 doc_count = 0 var_sum = 0 for idx_batch in train_batches: data_batch, count_batch, mask = utils.fetch_data( train_set, train_count, idx_batch, FLAGS.vocab_size) input_feed = {model.x.name: data_batch, model.mask.name: mask, model.is_training.name: True, model.gamma.name: epoch/training_epochs} _, (loss, kld, v, theta, beta) =\ sess.run((optimize, [model.reconstruction_loss, model.kld, model.variance, model.topic_dist, model.beta]), input_feed) loss_sum += np.sum(loss) kld_sum += np.sum(kld) / np.sum(mask) var_sum += np.sum(v) / np.sum(mask) # print([np.max(theta[i]) for i in range(batch_size)]) # print([np.argmax(theta[i]) for i in range(batch_size)]) word_count += np.sum(count_batch) # to avoid nan error count_batch = np.add(count_batch, 1e-12) # per document loss ppx_sum += np.sum(np.divide(loss, count_batch)) doc_count += np.sum(mask) if epoch == training_epochs - 1 and switch == 1 and i == alternate_epochs - 1: train_theta.extend(theta) train_beta.extend(beta) print_ppx = np.exp(loss_sum / word_count) print_ppx_perdoc = np.exp(ppx_sum / doc_count) print_kld = kld_sum / len(train_batches) print_var = var_sum / len(train_batches) kld_list.append(print_kld) var_list.append(print_var) print('| Epoch train: {:d}'.format(epoch + 1), print_mode, '{:d}'.format(i + 1), '| Corpus ppx: {:.5f}'.format(print_ppx), # perplexity for all docs '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc), # perplexity for per doc '| KLD: {:.5}'.format(print_kld), '| stddev {:.5}'.format(print_var)) with codecs.open('./gsm_train_theta', 'wb') as fp: pickle.dump(np.array(train_theta), fp) fp.close() if (epoch + 1) % 50 == 0 and switch == 1 and i == alternate_epochs - 1: with codecs.open('./gsm_train_beta', 'wb') as fp: pickle.dump(beta, fp) fp.close() npmi.print_coherence('gsm', FLAGS.data_dir + '/train.feat', FLAGS.vocab_size) # ------------------------------- # dev loss_sum = 0.0 kld_sum = 0.0 ppx_sum = 0.0 word_count = 0 doc_count = 0 var_sum = 0 for idx_batch in dev_batches: data_batch, count_batch, mask = utils.fetch_data(dev_set, dev_count, idx_batch, FLAGS.vocab_size) input_feed = {model.x.name: data_batch, model.mask.name: mask, model.is_training.name: False, model.gamma.name: 0} loss, kld, v = sess.run([model.objective, model.kld, model.variance], input_feed) loss_sum += np.sum(loss) kld_sum += np.sum(kld) / np.sum(mask) word_count += np.sum(count_batch) count_batch = np.add(count_batch, 1e-12) ppx_sum += np.sum(np.divide(loss, count_batch)) var_sum += np.sum(v) / np.sum(mask) doc_count += np.sum(mask) print_ppx = np.exp(loss_sum / word_count) print_ppx_perdoc = 
np.exp(ppx_sum / doc_count) print_kld = kld_sum / len(dev_batches) print_var = var_sum / len(train_batches) print('\n| Epoch dev: {:d}'.format(epoch + 1), '| Perplexity: {:.9f}'.format(print_ppx), '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc), '| KLD: {:.5}'.format(print_kld), '| stddev: {:.5}'.format(print_var)) # test if FLAGS.test: loss_sum = 0.0 kld_sum = 0.0 ppx_sum = 0.0 word_count = 0 doc_count = 0 for idx, idx_batch in enumerate(test_batches): data_batch, count_batch, mask = utils.fetch_data( test_set, test_count, idx_batch, FLAGS.vocab_size) input_feed = {model.x.name: data_batch, model.mask.name: mask, model.is_training.name: False, model.gamma.name: 0} loss, kld, theta, beta, v = sess.run([model.objective, model.kld, model.topic_dist, model.beta, model.variance], input_feed) loss_sum += np.sum(loss) kld_sum += np.sum(kld) / np.sum(mask) word_count += np.sum(count_batch) count_batch = np.add(count_batch, 1e-12) ppx_sum += np.sum(np.divide(loss, count_batch)) doc_count += np.sum(mask) test_theta.extend(theta) if idx == len(test_batches) - 1: test_beta.extend(beta) print_ppx = np.exp(loss_sum / word_count) print_ppx_perdoc = np.exp(ppx_sum / doc_count) print_kld = kld_sum / len(test_batches) print('| Epoch test: {:d}'.format(epoch + 1), '| Perplexity: {:.9f}'.format(print_ppx), '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc), '| KLD: {:.5}'.format(print_kld), '| stddev: {:.5}\n'.format(print_var)) npmi.print_coherence('gsm', FLAGS.data_dir + '/train.feat', FLAGS.vocab_size) with codecs.open('./test_theta', 'wb') as fp: pickle.dump(test_theta, fp) fp.close() with codecs.open('./test_beta', 'wb') as fp: pickle.dump(test_beta, fp) fp.close() with codecs.open('./kld.txt', 'w', 'utf-8') as fp: for idx, kld in enumerate(kld_list): if idx < len(kld_list) - 1: fp.write(str(kld) + ', ') else: fp.write(str(kld)) fp.close() with codecs.open('./var.txt', 'w', 'utf-8') as fp: for idx, var in enumerate(var_list): if idx < len(var_list) - 1: fp.write(str(var) + ', ') else: fp.write(str(var)) fp.close()
def main(): # select model: NB or GNB if model == 'NBNTM': net = NBNTM.NBNTM(device, vocab_num, hidden_num, topic_num, shape_prior, scale_prior) else: net = GNBNTM.GNBNTM(device, vocab_num, hidden_num, topic_num, shape_prior, scale_prior) net = net.to(device) optimizer = optim.Adam(net.parameters(), lr=learning_rate) # load data data_dir = 'data/' + data_name + '/' train_list, train_mat, train_count = utils.data_set( data_dir + 'train.feat', vocab_num) test_list, test_mat, test_count = utils.data_set(data_dir + 'test.feat', vocab_num) # auxiliary dir setting if not os.path.exists('./result'): os.mkdir('./result') os.mkdir('./result/NBNTM') os.mkdir('./result/GNBNTM') if not os.path.exists('./checkpoint'): os.mkdir('./checkpoint') os.mkdir('./checkpoint/NBNTM') os.mkdir('./checkpoint/GNBNTM') flag_str = (data_name + '_shape_' + str(shape_prior) + '_scale_' + str(scale_prior) + '_K_' + str(topic_num) + '_V_' + str(vocab_num) + '_H_' + str(hidden_num) + '_batch_' + str(batch_size) + '_lr_' + str(learning_rate) + '_epoch_' + str(epochs)) result_dir = './result/' + model + '/' + flag_str if not os.path.exists(result_dir): os.mkdir(result_dir) # record in file train_ppl_time = [] best_train_ppl = 1e12 best_coherence = -1 start_time = time.time() addition_time = 0 for epoch in range(epochs): # train perplexity, kld = run(net, optimizer, train_list, train_count, True) current_time_cost = time.time() - start_time train_ppl_time.append([perplexity.detach().item(), current_time_cost]) print_result(epoch, 'train', perplexity, kld) temp_time = time.time() # prepare for test if epoch % 10 == 9: if perplexity < best_train_ppl: best_train_ppl = perplexity # save model state = { 'net': net.state_dict(), 'optimizer': optimizer.state_dict(), 'epochs': epoch } torch.save( state, './checkpoint/' + model + '/' + flag_str + '_best_ppl') # coherence coherence = evaluate_coherence(net, train_mat, [5]) print('train coherence = ', coherence) if coherence > best_coherence: best_coherence = coherence # save model state = { 'net': net.state_dict(), 'optimizer': optimizer.state_dict(), 'epochs': epoch } torch.save( state, './checkpoint/' + model + '/' + flag_str + '_best_coherence') addition_time += time.time() - temp_time end_time = time.time() print(f'time cost:{end_time - start_time - addition_time}') record_result(result_dir + './train_ppl_time_record', train_ppl_time) # test perplexity checkpoint = torch.load('./checkpoint/' + model + '/' + flag_str + '_best_ppl', map_location='cuda:0') net.load_state_dict(checkpoint['net']) optimizer.load_state_dict(checkpoint['optimizer']) epoch = checkpoint['epochs'] perplexity, kld = run(net, optimizer, test_list, test_count, False) print_result(epoch, 'test', perplexity, kld) # test coherence checkpoint = torch.load('./checkpoint/' + model + '/' + flag_str + '_best_coherence', map_location='cuda:0') net.load_state_dict(checkpoint['net']) optimizer.load_state_dict(checkpoint['optimizer']) print( 'whole coherence = ', evaluate_coherence(net, np.concatenate((train_mat, test_mat)), [5, 10, 15])) # save topic words utils.print_topic_word('data/' + data_name + '/' + data_name + '.vocab', model + '_topic_words.txt', net.out_fc.weight.detach().cpu().t(), 15)
def evaluate(model, training_data, training_count, session, step, train_loss=None, epoch=None, summaries=None, writer=None, saver=None): #Get theta for the H1. data_url = os.path.join(FLAGS.data_dir, 'valid_h1.feat' if step != 'test' else 'test_h1.feat') dataset, dataset_count = utils.data_set(data_url) data_batches = utils.create_batches(len(dataset), FLAGS.batch_size, shuffle=False) theta = [] for idx_batch in data_batches: data_batch, count_batch, mask = utils.fetch_data(dataset, dataset_count, idx_batch, FLAGS.vocab_size) input_feed = {model.x.name: data_batch, model.mask.name: mask} logit_theta = session.run(model.doc_vec, input_feed) theta.append(softmax(logit_theta, axis=1)) theta = np.concatenate(theta, axis=0) weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='decoder/projection/Matrix:0')[0].eval(session) bias = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='decoder/projection/Bias:0')[0].eval(session) beta = softmax(weights + bias, axis=1) #H2 to calculate perplexity. data_url = os.path.join(FLAGS.data_dir, 'valid_h2.feat' if step != 'test' else 'test_h2.feat') dataset, dataset_count = utils.data_set(data_url) data_batches = utils.create_batches(len(dataset), FLAGS.batch_size, shuffle=False) test_data = [utils.fetch_data(dataset, dataset_count, idx_batch, FLAGS.vocab_size)[0] for idx_batch in data_batches] test_data = np.concatenate(test_data, axis=0) perplexity = get_perplexity(test_data, theta, beta) coherence = get_topic_coherence(beta, training_data, 'nvdm') if step == 'test' else np.nan diversity = get_topic_diversity(beta, 'nvdm') if step == 'test' else np.nan if step == 'val': #tloss = tf.get_default_graph().get_tensor_by_name('tloss:0') #vppl = tf.get_default_graph().get_tensor_by_name('vppl:0') #weight_summaries = session.run(summaries, feed_dict={tloss: train_loss, vppl: perplexity}) #weight_summaries = summaries.eval(session=session) #writer.add_summary(weight_summaries, epoch) save_path = saver.save(session, os.path.join(ckpt, 'model.ckpt')) print("Model saved in path: %s" % ckpt) print('| Epoch dev: {:d} |'.format(epoch+1)) else: ## get most used topics cnt = 0 thetaWeightedAvg = np.zeros((1, FLAGS.n_topic)) data_batches = utils.create_batches(len(training_data), FLAGS.batch_size, shuffle=False) for idx_batch in data_batches: batch, count_batch, mask = utils.fetch_data(training_data, training_count, idx_batch, FLAGS.vocab_size) sums = batch.sum(axis=1) cnt += sums.sum(axis=0) input_feed = {model.x.name: batch, model.mask.name: mask} logit_theta = session.run(model.doc_vec, input_feed) theta = softmax(logit_theta, axis=1) weighed_theta = (theta.T * sums).T thetaWeightedAvg += weighed_theta.sum(axis=0) thetaWeightedAvg = thetaWeightedAvg.squeeze() / cnt print('\nThe 10 most used topics are {}'.format(thetaWeightedAvg.argsort()[::-1][:10])) with open(FLAGS.data_dir + '/vocab.new', 'rb') as f: vocab = pkl.load(f) topic_indices = list(np.random.choice(FLAGS.n_topic, 10)) # 10 random topics print('\n') with open(ckpt + '/topics.txt', 'w') as f: for k in range(FLAGS.n_topic): gamma = beta[k] top_words = list(gamma.argsort()[-FLAGS.n_words+1:][::-1]) topic_words = [vocab[a] for a in top_words] f.write(str(k) + ' ' + str(topic_words) + '\n') print('Topic {}: {}'.format(k, topic_words)) with open(ckpt + '/' + step + '_scores.csv', 'a') as handle: handle.write(str(perplexity) + ',' + str(coherence) + ',' + str(diversity) + '\n')