def write_netfile(wtoi, data_set, word_file_path, idx_file_path):
    # Count symmetric co-occurrence edges between the unique words of each document.
    d = {}
    for doc in data_set:
        words = list(set(doc))
        for i in range(len(words) - 1):
            for j in range(i + 1, len(words)):
                pair_1 = (words[i], words[j])
                pair_2 = (words[j], words[i])
                if pair_1 in d:
                    d[pair_1] += 1
                    d[pair_2] += 1
                else:
                    d[pair_1] = 1
                    d[pair_2] = 1
    trace('{} pairs (edges) caught.'.format(len(d)), file=config.log_file)
    # Keep only edges seen at least 5 times; write word pairs and index pairs.
    with open(word_file_path, 'w') as writer1, open(idx_file_path, 'w') as writer2:
        for (w1, w2), value in d.items():
            if value < 5:
                continue
            writer1.write('{} {} {}\n'.format(w1, w2, value))
            writer2.write('{} {} {}\n'.format(wtoi[w1], wtoi[w2], value))
    # Both files are closed by the 'with' block; no explicit close() is needed.
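# --- Hedged usage sketch (not part of the original code) ---
# Illustrates the co-occurrence counting that write_netfile performs: every
# unordered pair of distinct words in a document is an edge, counted in both
# directions, and edges seen fewer than 5 times are dropped. The helper name,
# the itertools-based rewrite, and the toy data are illustrative assumptions.
from itertools import combinations

def _demo_edge_counts(data_set):
    counts = {}
    for doc in data_set:
        for w1, w2 in combinations(sorted(set(doc)), 2):
            counts[(w1, w2)] = counts.get((w1, w2), 0) + 1
            counts[(w2, w1)] = counts.get((w2, w1), 0) + 1
    # keep only edges that occur at least 5 times, as write_netfile does
    return {pair: c for pair, c in counts.items() if c >= 5}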
def train_model(model, optimizer, loss_func, train_data_iter, valid_data_iter, config):
    trainer = Trainer(model, loss_func, optimizer, config)
    for epoch in range(1, config.epochs + 1):
        train_iter = iter(train_data_iter)
        valid_iter = iter(valid_data_iter)
        # train
        train_stats = trainer.train(train_iter, epoch, train_data_iter.num_batches)
        print('')
        trace('Epoch %d, Train acc: %g, ppl: %g' %
              (epoch, train_stats.accuracy(), train_stats.ppl()))
        # validate
        valid_stats = trainer.validate(valid_iter)
        trace('Epoch %d, Valid acc: %g, ppl: %g' %
              (epoch, valid_stats.accuracy(), valid_stats.ppl()))
        # # log
        # train_stats.log("train", config.model_name, optimizer.lr)
        # valid_stats.log("valid", config.model_name, optimizer.lr)
        # update the learning rate
        trainer.lr_step(valid_stats.ppl(), epoch)
        # dump a checkpoint if needed.
        trainer.dump_checkpoint(epoch, config, train_stats)
def make_vocab(self):
    # Build index<->word mappings from the current counter.
    if self.cur_counter is not None:
        for i, w in enumerate(self.cur_counter.keys()):
            self.itow[i] = w
            self.wtoi[w] = i
        trace('made vocab size {}'.format(len(self.itow)), self.config.log_file)
def update_dataset(self):
    # Filter each document against the vocabulary and drop documents that
    # become too short.
    new_times = []
    new_docs = []
    old_c = len(self.times)
    for time, doc in zip(self.times, self.docs):
        new_doc = [word for word in doc.split() if word in self.vocab]
        if len(new_doc) < self.config.min_num_words:
            continue
        new_times.append(time)
        new_docs.append(new_doc)
    self.times = new_times
    self.docs = new_docs
    new_c = len(self.times)
    trace('update dataset from {} to {}'.format(old_c, new_c), self.config.log_file)
def build_twnp(assignments, centroids, nodes_i2w, word_emb, wtoi):
    # Build a topic-word score matrix: each assigned word gets the negative
    # distance to its cluster centroid; all other entries stay at -99.
    num_topics = len(centroids)
    num_words = len(wtoi)
    trace('tw_np shape: {} x {}, non-zero values (word_emb): {}'.format(
        num_topics, num_words, len(word_emb)), file=config.log_file)
    tw_np = np.zeros((num_topics, num_words))
    tw_np = neg_init(tw_np, -99)
    print('negated np')
    for ass_idx, node_indices in assignments.items():
        for node_idx in node_indices:
            word = nodes_i2w[node_idx]
            word_idx = wtoi[word]
            dist = distance(centroids[ass_idx], word_emb[word])
            tw_np[ass_idx][word_idx] = -dist
    return tw_np
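# --- Hedged illustration (not part of the original code) ---
# A self-contained numpy version of the idea behind build_twnp: start every
# topic-word score at -99, then overwrite the scores of assigned words with the
# negative Euclidean distance to their cluster centroid. The toy vocabulary,
# embeddings, and centroids below are illustrative assumptions.
import numpy as np

def _demo_twnp():
    wtoi = {'cat': 0, 'dog': 1, 'car': 2}
    nodes_i2w = {0: 'cat', 1: 'dog', 2: 'car'}
    word_emb = {'cat': np.array([0.0, 1.0]), 'dog': np.array([0.1, 0.9]),
                'car': np.array([1.0, 0.0])}
    centroids = {0: np.array([0.0, 1.0]), 1: np.array([1.0, 0.0])}
    assignments = {0: [0, 1], 1: [2]}  # topic index -> node indices
    tw = np.full((len(centroids), len(wtoi)), -99.0)
    for topic, node_indices in assignments.items():
        for node_idx in node_indices:
            word = nodes_i2w[node_idx]
            tw[topic, wtoi[word]] = -np.linalg.norm(centroids[topic] - word_emb[word])
    return tw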
def update_lr(self, ppl, epoch):
    """
    Decay learning rate if val perf does not improve
    or we hit the start_decay_at limit.
    """
    if self.start_decay_at is not None and epoch >= self.start_decay_at:
        self.start_decay = True
    # if self.last_ppl is not None and ppl > self.last_ppl:
    #     self.start_decay = True
    if self.start_decay:
        self.lr = self.lr * self.lr_decay_rate
        trace("Decaying learning rate to %g" % self.lr)
    self.last_ppl = ppl
    if self.method != 'sparseadam':
        self.optimizer.param_groups[0]['lr'] = self.lr
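# --- Hedged usage sketch (not part of the original code) ---
# Illustrates the schedule implemented in update_lr above: once
# epoch >= start_decay_at, the learning rate is multiplied by lr_decay_rate on
# every subsequent epoch. The helper name and values below are illustrative.
def _demo_lr_decay(lr=1e-3, lr_decay_rate=0.5, start_decay_at=8, epochs=12):
    history = []
    for epoch in range(1, epochs + 1):
        if epoch >= start_decay_at:
            lr *= lr_decay_rate
        history.append((epoch, lr))
    # epochs 1-7 keep lr at 1e-3; from epoch 8 on it is halved each epoch
    return history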
def main():
    # Load checkpoint if we resume from a previous training.
    args, parser = parse_args("train")
    config = read_config(args, parser, args.config)
    trace(format_config(config))

    train_data_iter = DataBatchIterator(
        config=config, is_train=True, dataset="train",
        batch_size=config.batch_size, shuffle=True)
    train_data_iter.load()

    src_vocab = train_data_iter.src_vocab
    trg_vocab = train_data_iter.trg_vocab
    check_save_path(config.save_vocab)
    torch.save(src_vocab, config.save_vocab + "." + config.src_lang)
    torch.save(trg_vocab, config.save_vocab + "." + config.trg_lang)

    valid_data_iter = DataBatchIterator(
        config=config, is_train=True, dataset="dev",
        batch_size=config.valid_batch_size)
    valid_data_iter.set_vocab(src_vocab, trg_vocab)
    valid_data_iter.load()

    # Build model.
    model = model_factory(config, src_vocab, trg_vocab)
    # if len(config.gpu_ids) > 1:
    #     trace('Multi gpu training: ', config.gpu_ids)
    #     model = nn.DataParallel(model, device_ids=config.gpu_ids, dim=1)
    trace(model)

    # Build optimizer.
    optimizer = build_optimizer(model, config)

    padding_idx = trg_vocab.stoi[PAD_WORD]
    # Build loss functions for training set and validation set.
    loss_func = NMTLoss(config, padding_idx)

    # Do training.
    train_model(model, optimizer, loss_func, train_data_iter,
                valid_data_iter, config)
def seg(self, unit=None, time_period=None):
    # Segment documents into time slices according to the config's unit and period.
    d = {}
    for t_i, time in enumerate(self.times):
        new_datetime = trans_datetime(cur_date=time, unit=self.config.unit,
                                      time_period=self.config.time_period,
                                      start_date=self.start_date)
        if new_datetime in d:
            d[new_datetime].append(t_i)
        else:
            d[new_datetime] = [t_i]
    # d = list(filter(lambda x: len(x[1]) > self.config.min_num_docs, d.items()))
    d = sorted(d.items(), key=lambda x: x[0])
    self.sorted_idx = []
    self.time_slices = []
    self.times_tag = []
    # all but the last slice: keep only slices with enough documents
    for i in range(len(d) - 1):
        if len(d[i][1]) < self.config.min_num_docs:
            continue
        self.time_slices.append(len(d[i][1]))
        self.sorted_idx += d[i][1]
        self.times_tag.append(str(d[i][0]))
    # cut the last slice for the test set
    i = -1
    cut = 2000
    if len(d[i][1]) < cut:
        self.time_slices.append(len(d[i][1]))
        self.sorted_idx += d[i][1]
        self.times_tag.append(str(d[i][0]) + '_test')
    else:
        self.time_slices.append(len(d[i][1][:-cut]))
        self.sorted_idx += d[i][1][:-cut]
        self.times_tag.append(str(d[i][0]))
        self.time_slices.append(len(d[i][1][-cut:]))
        self.sorted_idx += d[i][1][-cut:]
        self.times_tag.append(str(d[i][0]) + '_test')
    trace('seg documents({}) to {} slices [{}] based on [{}]'.format(
        sum(self.time_slices), len(self.time_slices),
        ','.join([str(i) for i in self.time_slices]),
        ','.join(self.times_tag)), self.config.log_file)
def read_config(args, args_parser, config_file=None):
    # Parse command-line arguments, optionally overriding argparse defaults
    # with values read from an INI configuration file.
    if config_file is None:
        return args_parser.parse_args()
    if not os.path.isfile(config_file):
        trace("# Cannot find the configuration file. "
              "{} does not exist! Please check.".format(config_file))
        sys.exit(1)
    config = SafeConfigParser()
    config.read(config_file)
    for section in config.sections():
        default = get_correct_args(config, config.items(section), section)
        # Only override options that the parser actually defines.
        args_parser.set_defaults(**{
            k: v for k, v in filter(lambda x: hasattr(args, x[0]), default.items())
        })
    args = args_parser.parse_args()
    return args
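# --- Hedged usage sketch (not part of the original code) ---
# Shows the pattern read_config relies on: values from an INI section are
# pushed into the parser via set_defaults(), and parse_args() still lets
# explicit command-line flags win. The section and option names here are
# illustrative assumptions, not the project's actual configuration keys.
import argparse
import configparser

def _demo_config_override(ini_text="[train]\nbatch_size = 64\n"):
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=32)
    cfg = configparser.ConfigParser()
    cfg.read_string(ini_text)
    for section in cfg.sections():
        parser.set_defaults(**{k: int(v) for k, v in cfg.items(section)})
    return parser.parse_args([])  # -> Namespace(batch_size=64)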
def main():
    trace('---train topics---', config.log_file)
    model = DtmModel(dtm_path, corpus=gensim_data.corpus,
                     id2word=gensim_data.dictionary,
                     time_slices=train_set.time_slices[:-1],
                     num_topics=config.z_dim,
                     lda_sequence_min_iter=50,
                     lda_sequence_max_iter=config.epochs)
    trace('---model trained---', config.log_file)

    # sample one topic for a sanity check
    sample_topic = model.dtm_coherence(time=0, num_words=10)
    trace('sample topic is like: {}'.format(' '.join(sample_topic[0])), config.log_file)

    # topic-word distributions for every time slice
    tw_nps = model.show_topics(num_topics=config.z_dim, times=-1,
                               num_words=train_set.vocab_size(), formatted=False)
    for t in range(T):
        # topics in time t
        tw_np = tw_nps[t * config.z_dim:(t + 1) * config.z_dim]
        tw_np = get_topic_np(tw_np, config.z_dim, gensim_data.dictionary.token2id)
        tw_tensor = torch.from_numpy(tw_np)
        tw_list_t = get_tw_list(tw_tensor, gensim_data.dictionary)
        # coherence and perplexity
        cohs_t = get_cohs(tw_list_t)
        p = ppl(gensim_data.test, tw_tensor)
        TWmatrix.append(tw_np)
        TWlist.append(tw_list_t)
        COHs.append(cohs_t)
        PPLs.append(p)
        avg_COHs.append(sum(cohs_t) / len(cohs_t))
        seg = '---------- topics in time {}/{} ----------'.format(t + 1, T)
        display_topics(tw_list=tw_list_t, cohs=cohs_t, head='topics', seg=seg,
                       file=config.topic_file)
    trace('topic result (coherence) written.', file=config.log_file)

    p_file = os.path.join(config.output_path, 'ppl.jpg')
    draw_ppl(PPLs, title='perplexities over time', file=p_file)
    a_file = os.path.join(config.output_path, 'avg_coh.jpg')
    draw_ppl(avg_COHs, title='avg coherence over time', file=a_file)
def log(self, sent_number):
    """
    Log translation to stdout.
    """
    output = '\nINPUT {}: {}\n'.format(sent_number, " ".join(self.src_sent))

    best_pred = self.pred_sents[0]
    best_score = self.pred_scores[0]
    pred_sent = ' '.join(best_pred)
    output += 'PRED {}: {}\n'.format(sent_number, pred_sent)
    trace("PRED SCORE: {:.4f}".format(best_score))

    if self.gold_sent is not None:
        trg_sent = ' '.join(self.gold_sent)
        output += 'GOLD {}: {}\n'.format(sent_number, trg_sent)
        # output += ("GOLD SCORE: {:.4f}".format(self.gold_score))
        trace("GOLD SCORE: {:.4f}".format(self.gold_score))
    if len(self.pred_sents) > 1:
        trace('\nBEST HYP:')
        for score, sent in zip(self.pred_scores, self.pred_sents):
            output += "[{:.4f}] {}\n".format(score, sent)

    return output
def main(): args, parser = parse_args("translate") config = read_config(args, parser, args.config) config.batch_size = 1 test_data_iter = DataBatchIterator(config=config, is_train=False, dataset="test", batch_size=config.batch_size) src_vocab = torch.load(config.save_vocab + "." + config.src_lang) trg_vocab = torch.load(config.save_vocab + "." + config.trg_lang) test_data_iter.set_vocab(src_vocab, trg_vocab) test_data_iter.load() checkpoint = torch.load(config.save_model + ".pt") # Load the model. model = model_factory(config, src_vocab, trg_vocab, train_mode=False, checkpoint=checkpoint) if config.verbose: trace(model) # File to write sentences to. pred_file = codecs.open(config.output + ".pred.txt", 'w', 'utf-8') ref_file = codecs.open(config.output + ".ref.txt", 'w', 'utf-8') src_file = codecs.open(config.output + ".src.txt", 'w', 'utf-8') # Sort batch by decreasing lengths of sentence required by pytorch. # sort=False means "Use dataset's sortkey instead of iterator's". # Translator scorer = GNMTGlobalScorer(config.alpha, config.beta, config.coverage_penalty, config.length_penalty) translator = BatchTranslator(model, config, trg_vocab, global_scorer=scorer) data_iter = iter(test_data_iter) builder = TranslationBuilder(src_vocab, trg_vocab, config) # Statistics counter = count(1) pred_score_total, pred_words_total = 0, 0 gold_score_total, gold_words_total = 0, 0 pred_list = [] gold_list = [] for batch in data_iter: outputs = translator.translate_batch(batch) batch_trans = builder.from_batch_translator_output(outputs) for trans in batch_trans: pred_score_total += trans.pred_scores[0] pred_words_total += len(trans.pred_sents[0]) pred_list.append(trans.pred_sents[0]) gold_score_total += trans.gold_score gold_words_total += len(trans.gold_sent) + 1 gold_list.append(trans.gold_sent) k_best_preds = [ " ".join(pred) for pred in trans.pred_sents[:config.k_best] ] #print(" ".join(trans.gold_sent) pred_file.write('\n'.join(k_best_preds) + "\n") ref_file.write(" ".join(trans.gold_sent) + '\n') src_file.write(" ".join(trans.src_sent) + '\n') if config.verbose: sent_number = next(counter) output = trans.log(sent_number) os.write(1, output.encode('utf-8')) report_score('PRED', pred_score_total, pred_words_total) report_score('GOLD', gold_score_total, gold_words_total) if config.plot_attn: plot_attn(trans.src_sent, trans.pred_sents[0], trans.attns[0].cpu()) #break #break report_bleu(gold_list, pred_list) report_rouge(gold_list, pred_list)
    # PPLs.append(p)
    # seg = '---------- topics in time {}/{} ----------'.format(t+1, T)
    # display_topics(tw_list=tw_list_t, cohs=None,
    #                coh_name='(ppl: {})'.format('%.4f' % p),
    #                head='topics', seg=seg,
    #                file=config.topic_file)


if __name__ == '__main__':
    # configuration
    args, parser = parse_args()
    config_file = 'config/tryconfig.ini'
    global config
    config = read_config(args, parser, config_file)
    s = 'Start running m_DTM.py \n {}\n'.format(str(config))
    trace(s, file=config.log_file, write='w')

    global dtm_path
    dtm_path = os.path.join(project_path, 'dtm/dtm/main')

    # make dataset
    train_set = Dataset(config)
    global T
    T = train_set.T()

    global TWmatrix, TWlist, COHs, PPLs, avg_COHs
    TWmatrix = []
    TWlist = []
    COHs = []
    PPLs = []
    avg_COHs = []
def main(t, T, train_data, widget, whole_wtoi, whole_itow):
    # Create dict of distribution when opening file
    # edge_dist_dict, node_dist_dict, weights, nodedegrees, maxindex = makeDist(
    #     config.graph_path, config.negativepower)
    # edges_alias_sampler = VoseAlias(edge_dist_dict)
    # nodes_alias_sampler = VoseAlias(node_dist_dict)

    # choose graph type
    # model = choose_graph(config)
    model = config.graph
    print('model: ', model)

    # build input for graph embedding
    if model == 'TopicMap':
        graph_file = os.path.join(config.output_path, 'temp_graph_file_{}.txt'.format(t))
    elif model in ('LINEs', 'PyGCN', 'MyGCN'):
        graph_file = 'temp_graph_file_{}.txt'.format(t)
    geinput = build_geinput(model, train_data, whole_wtoi, graph_file)

    if model == 'TopicMap':
        centroids = 0
        git_path = os.path.abspath(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))
        topicmap_path = os.path.join(git_path, 'topicmapping/bin/topicmap')
        emb_dir = os.path.join(config.output_path, 'result_{}'.format(t))
        cmd = '{} -f {} -t 10 -o {}'.format(topicmap_path, graph_file, emb_dir)
        print(cmd)
        subprocess.call(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        # centroids = subp.stdout.readlines()
        write_topicmap(config.topic_file, config.output_path, T)
    elif model == 'LINEs' or model == 'PyGCN':
        # get embedding
        emb_file = 'temp_emb_file.txt'
        run_cmd(config, model, graph_file, emb_file, geinput)
        nodes, nodes_i2w, word_emb = read_emb(emb_file)
        # clustering: reuse previous centroids only after the first time slice
        if not FLAG:
            widget = None
        centroids = cluster_ge(t, nodes, nodes_i2w, word_emb, emb_file,
                               wtoi=whole_wtoi, itow=whole_itow, init_c=widget)
        os.remove(emb_file)
    elif model == 'MyGCN':
        A_matrix, X_matrix, cur_wtoi, cur_itow = geinput
        if FLAG:
            print('new shape of X_matrix is: {}'.format(X_matrix.shape))
            # initialize the first GCN layer from the embeddings remembered so far
            init_weight_11 = np.random.random((len(cur_wtoi), config.h_dim))
            a = 0
            b = 0
            for word, cur_w_i in cur_wtoi.items():
                memory_w_v = MemoryVD[whole_wtoi[word]]
                if np.all(memory_w_v == 0):
                    b += 1
                else:
                    a += 1
                    init_weight_11[cur_w_i] = memory_w_v
            s = "inherit {} word embeddings and random {}".format(a, b)
            trace(s, file=config.log_file)
            w12, w21 = widget
            init_weight = (init_weight_11, w12, w21)
        else:
            # init_weight = np.random.random((len(whole_wtoi), config.h_dim))
            init_weight = None
        N_, F_ = X_matrix.shape
        D_ = config.h_dim
        print(N_, F_, D_)
        model_gcn = GCN_model(config, n_dim=N_, d_dim=D_, f_dim=F_, init_weight_np=init_weight)
        model_gcn.to(config.device)
        optimizer = torch.optim.Adam(model_gcn.parameters(), lr=config.lr)

        model_gcn.train()
        for epoch in range(config.epochs):
            # train on sampled sub-matrices, then on the full matrices
            b = 5
            k = math.ceil(N_ / b)
            for batch in range(2 * b):
                cur_a, chosen_idx = sample_from_matrix(A_matrix, k=k)
                cur_x, _ = sample_from_matrix(X_matrix, k=k, chosen_idx=chosen_idx)
                optimizer.zero_grad()
                cur_inputs = (torch.tensor(cur_a).to(config.device),
                              torch.tensor(cur_x).to(config.device))
                rec, loss = model_gcn(cur_inputs)
                loss.backward()
                optimizer.step()
            cur_inputs = (torch.tensor(A_matrix).to(config.device),
                          torch.tensor(X_matrix).to(config.device))
            optimizer.zero_grad()
            rec, loss = model_gcn(cur_inputs)
            loss.backward()
            optimizer.step()
            if epoch % 10 == 0:
                s = "epoch:{}, loss:{}".format(epoch, loss)
                trace(s, file=config.log_file)

        # update memory VD (node/word embedding)
        nd_matrix, w12, w21 = model_gcn.get_widget()
        # copy the newly learned node vectors into the global memory
        c = 0
        d = 0
        for cur_w_i, word in cur_itow.items():
            memory_w_i = whole_wtoi[word]
            cur_v = nd_matrix[cur_w_i]
            if np.all(cur_v == 0):
                c += 1
            else:
                MemoryVD[memory_w_i] = cur_v
                d += 1
        trace('update {}/{} words from cur to memory'.format(d, c))

        # write emb_file: header line with counts, then one word and its vector per line
        emb_file = 'temp_emb_file.txt'
        with open(emb_file, 'w') as writer:
            writer.write('{} {}\n'.format(N_, config.h_dim))
            for row_i, row in enumerate(nd_matrix):
                word = cur_itow[row_i]
                vector_str = [str(item) for item in row]
                writer.write("{} {}\n".format(word, ' '.join(vector_str)))

        # clustering
        nodes, nodes_i2w, word_emb = read_emb(emb_file)
        centroids = cluster_ge(t, nodes, nodes_i2w, word_emb, emb_file,
                               wtoi=whole_wtoi, itow=whole_itow, init_c=widget)
        return (w12, w21)

    # os.remove(graph_file)
    return centroids
if __name__ == '__main__':
    # configuration
    args, parser = parse_args()
    config_file = 'config/tryconfig.ini'
    global config
    config = read_config(args, parser, config_file)
    s = 'Start running ges.py \n {}\n'.format(str(config))
    trace(s, file=config.log_file, write='w')

    global T
    train_set = Dataset(config)
    T = train_set.T()

    global TWmatrix, TWlist, COHs, PPLs, avg_COHs
    TWmatrix = []
    TWlist = []
    COHs = []
    PPLs = []
    avg_COHs = []

    name = Name(flag=config.flag, config=config, model_name=config.graph,
                data_name=config.train_file,
                time_slices=train_set.time_slices[:-1])
    result = Result(info=name, TWmatrix=TWmatrix, itow=train_set.itow,
                    twlist=TWlist, COHs=COHs, PPLs=PPLs)
    result_file = os.path.join(config.output_path, 'result')

    if config.graph == 'MyGCN':
        global MemoryVD, V
def read(self, file, dataname='arxiv', prepcs=False):
    # Read a tab-separated corpus file, keeping documents whose timestamp falls
    # inside [set_start, set_end] and that are long enough after cleaning.
    if dataname == 'arxiv':
        with open(file, 'r') as reader:
            if prepcs:
                pass
            else:
                line_counter = 0
                valid_counter = 0
                for line in reader:
                    line_counter += 1
                    tokens = line.strip().split('\t')
                    time = tokens[0]
                    text = tokens[-1]
                    time = to_datetime(time, dataname='arxiv')
                    if time is None:
                        continue
                    if time < self.set_start or time > self.set_end:
                        continue
                    text, wc = cleanStr(text, self.config.deli)
                    if wc < self.config.min_num_words:
                        continue
                    self.times.append(time)
                    self.docs.append(text)
                    self.counter.update(text.strip().split())
                    valid_counter += 1
                    if time < self.start_date:
                        self.start_date = time
                    elif time > self.end_date:
                        self.end_date = time
        info = 'extract {}/{} lines from {}'.format(valid_counter, line_counter, file)
        trace(info, self.config.log_file)
    elif dataname == 'care':
        with open(file, 'r') as reader:
            if prepcs:
                pass
            else:
                line_counter = 0
                valid_counter = 0
                for line in reader:
                    line_counter += 1
                    tokens = line.strip().split('\t')
                    time = tokens[0]
                    if len(tokens) == 2:
                        text = tokens[-1]
                    elif len(tokens) == 3:
                        text = tokens[1] + ' ' + tokens[2]
                    else:
                        continue
                    time = to_datetime(time, dataname='care')
                    if time is None:
                        continue
                    if time < self.set_start or time > self.set_end:
                        continue
                    text, wc = cleanStr(text)
                    if wc < self.config.min_num_words:
                        continue
                    self.times.append(time)
                    self.docs.append(text)
                    self.counter.update(text.strip().split())
                    valid_counter += 1
                    if time < self.start_date:
                        self.start_date = time
                    elif time > self.end_date:
                        self.end_date = time
        info = 'extract {}/{} lines from {}'.format(valid_counter, line_counter, file)
        trace(info, self.config.log_file)
    else:
        info = 'unrecognized argument dataname'
        trace(info, self.config.log_file)