def _get_data(file, classes, is_file_content=False):
    if not is_file_content:
        with open(file) as f:
            file_content = f.read()
    else:
        file_content = file
    data = [line.strip() for line in file_content.split('\n') if line.strip()]

    rel_labels = {}
    for line in data:
        rel_str = line.rsplit('(', 1)[0]
        entity_str = line.rsplit('(', 1)[-1][:-1]
        ent0_str, ent1_str, *reverse = entity_str.split(',')
        doc_id = ent0_str.split('.', 1)[0]
        ent0 = int(ent0_str.split('.', 1)[-1]) - 1
        ent1 = int(ent1_str.split('.', 1)[-1]) - 1
        if reverse:
            key = (doc_id, (ent1, ent0))
        else:
            key = (doc_id, (ent0, ent1))

        if rel_str not in classes:
            from efficiency.log import show_var
            show_var(['file', 'data'])
            print('[Warn] rel_str({}) is not among pre-defined classes({})'
                  .format(line, classes))
        else:
            rel_labels[key] = rel_str
    # assert len(rel_labels) == len(data), "# lines does not match with # data"
    return rel_labels

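# A minimal usage sketch for _get_data(), assuming the hypothetical relation
# names and doc ids below; they only illustrate the line format the parser
# expects: "RelName(<doc_id>.<idx0>,<doc_id>.<idx1>[,REVERSE])" with 1-based
# word indices.
def _get_data_example():
    content = 'Cause-Effect(d1.3,d1.7)\nPart-Whole(d2.1,d2.4,REVERSE)'
    labels = _get_data(content, classes={'Cause-Effect', 'Part-Whole'},
                       is_file_content=True)
    # indices become 0-based, and the pair is swapped when a REVERSE flag is
    # present: {('d1', (2, 6)): 'Cause-Effect', ('d2', (3, 0)): 'Part-Whole'}
    return labels
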
def rename_files(self, prefix='My_mp3_'):
    for f in self.files:
        dir = dirname(f)
        fname = basename(f)
        new_fname = prefix + fname
        new_f = path.join(dir, new_fname)

        cmd = 'mv "{f}" "{new_f}"'.format(f=f, new_f=new_f)
        show_var(['cmd'])
        shell(cmd)

def count_authors(self):
    authors = [i['authors'].split(', ') for i in self.papers]

    from efficiency.function import flatten_list
    authors = flatten_list(authors)

    from collections import Counter
    cnt = Counter(authors)

    from efficiency.log import show_var
    show_var(['cnt'])

    import pdb
    pdb.set_trace()

def get_total_num_lines_in_large_files(file_list, verbose=True):
    from efficiency.function import shell

    num_lines = []
    for f in sorted(file_list):
        cmd = "wc -l {} | cut -d ' ' -f 1".format(f)
        stdout, stderr = shell(cmd)
        num_line = int(stdout)
        num_lines.append(num_line)
        if verbose:
            from efficiency.log import show_var
            show_var(['cmd', 'num_line'], joiner='\t')
    return sum(num_lines)

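# Hedged usage sketch for get_total_num_lines_in_large_files(): the glob
# pattern 'data/*.txt' is an assumed location, not part of the original code.
# The point is to count lines via `wc -l` without loading the files in Python.
def _count_corpus_lines_example():
    from glob import glob
    files = glob('data/*.txt')  # hypothetical paths; replace with real files
    total = get_total_num_lines_in_large_files(files, verbose=False)
    print('total lines:', total)
    return total
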
def _get_rnn_enc2(self, encoding, length, mask, hx, show_net=False):
    if show_net:
        print("<")
        print("[Net] _get_rnn_enc2")
        show_var(["self.dropout_rnn2_in"])

    # prepare packed_sequence
    encoding = self.dropout_rnn2_in(encoding)
    if length is not None:
        seq_input, hx, rev_order, mask = utils.prepare_rnn_seq(
            encoding, length, hx=hx, masks=mask, batch_first=True)
        seq_output, hn = self.rnn2(seq_input, hx=hx)
        output, hn = utils.recover_rnn_seq(seq_output, rev_order, hx=hn,
                                           batch_first=True)
        if show_net:
            print("utils.prepare_rnn_seq()")
            show_var(["self.rnn2"])
    else:
        # output from rnn_out [batch, length, hidden_size]
        output, hn = self.rnn2(encoding, hx=hx)
        if show_net:
            show_var(["self.rnn2"])

    output = self.dropout_rnn2_out(output)
    if show_net:
        show_var(["self.dropout_rnn2_out"])
    return output, hn

def check():
    import json

    file = '/home/ubuntu/proj/1908_clickbait/bitly/bitly.json'
    with open(file) as f:
        data = json.load(f)
    show_var(['len(data)', 'list(data.items())[99]'])

    titles = []
    for item in data.values():
        titles.append(item['title'])
    get_most_common_in_list(titles, most_common_n=10)

    bad_titles = {
        None, '金沙澳门官方网址_', 'Featured Content on Myspace', 'Google Maps',
        'Games | SYFY WIRE', 'Trending on Offbeat', 'Page Not Found', 'YouTube',
        'QuickSnapper.com domain is for sale | Buy with Epik.com',
        'Twitter / Account Suspended', 'Jason, 443 AI (@JasonInAI) | Twitter',
        'interactive investor: low cost online trading & investment platform',
        'Yahoo', 'TechRapidly- Blog Provide Tech and Business Tips & Solutions',
        'Tech & Startup Events In New York - GarysGuide (#1 Resource for NYC Tech)',
        'The Marmoset Song | quietube', 'Login on Twitter',
        'Prepare your taste buds...', 'Good night, Posterous',
        'MSN | Outlook, Office, Skype, Bing, Breaking News, and Latest Videos',
        'Venture Capitalists Need Money, Too – Gigaom', '502 Bad Gateway',
        '2008Q4a Home Tour Survey', 'Are you human, bot or alien? | mobile9',
        'Twitpic', 'When Robot Programmers get bored - YouTube',
        'Account Suspended', 'Free Web Hosting - Your Website need to be migrated',
        '404 Not Found - Web Partner',
        'Resort | Free Parking | Trump Las Vegas, NV - Booking.com',
        'TVShowsOnDVD.com - Goodbye',
        'Get Satisfaction - Customer Communities For Social Support, Social Marketing & Customer Feedback',
        'Google', 'Warning! | There might be a problem with the requested link',
        'Designer Clothes, Shoes & Bags for Women | SSENSE', 'NRKbeta',
        "Movie Review: Paul Farhi Reviews 'Yoo-Hoo, Mrs. Goldberg' - washingtonpost.com",
        'ogmaciel.com is coming soon',
        'Nico Lumma - Hamburg, Deutschland | about.me', 'Abattement Fiscal',
    }

    good_data = {k: v for k, v in data.items() if 'nytimes' in v['long_url']}
    # good_data = {k: v for k, v in data.items() if v['title'] not in bad_titles}
    show_var(['len(good_data)'])

    import pdb; pdb.set_trace()

def _get_rnn_enc(self, input, length, mask, hx, show_net=False):
    if show_net:
        print('[Net] _get_rnn_enc')
        show_var(["self.dropout_rnn_in"])

    # apply dropout rnn input
    input = self.dropout_rnn_in(input)

    # use lstm or cnn to encode the sentence at token level
    if self.encoder_mode == 'lstm':
        # prepare packed_sequence
        if length is not None:
            seq_input, hx, rev_order, mask = utils.prepare_rnn_seq(
                input, length, hx=hx, masks=mask, batch_first=True)
            seq_output, hn = self.sent_rnn(seq_input, hx=hx)
            output, hn = utils.recover_rnn_seq(seq_output, rev_order, hx=hn,
                                               batch_first=True)
            if show_net:
                print("utils.prepare_rnn_seq()")
                show_var(["self.sent_rnn"])
        else:
            output, hn = self.sent_rnn(input, hx=hx)
            if show_net:
                show_var(["self.sent_rnn"])
    else:
        _, _, _, mask = utils.prepare_rnn_seq(input, length, hx=hx, masks=mask,
                                              batch_first=True)
        if length is not None:
            max_len = length.max()
            input = input[:, :max_len, :]
        # first transpose to [batch, hidden_size, length]
        input = input.transpose(1, 2)
        # then send into the first cnn layer
        output = torch.relu(self.sent_conv1d_layer1(input))
        # then second cnn layer
        output = torch.relu(self.sent_conv1d_layer2(output))
        # transpose to [batch, length, hidden_size]
        output = output.transpose(1, 2)
        # output = torch.cat([input.transpose(1, 2), output], dim=2)
        hn = None

    # apply dropout for the output of rnn
    output = self.dropout_rnn_out(output)
    if show_net:
        show_var(["self.dropout_rnn_out"])
    return output, hn

def main():
    import os
    import json
    from efficiency.log import fwrite

    data = {}
    dir = '/home/ubuntu/proj/1908_clickbait/bitly'
    file_filter = lambda f: f.startswith('bitly_') and f.endswith('.json')
    fm = FileManager(dir=dir, file_filter=file_filter)
    print(json.dumps(fm.files, indent=4))

    for file in fm.files:
        with open(file) as f:
            content = json.load(f)
        data.update(content)
        show_var(
            ["file", "len(content)", "len(data)", "list(content.keys())[:3]"])

    data = dict(sorted(data.items()))
    fwrite(json.dumps(data, indent=4), os.path.join(dir, 'bitly.json'))

def _get_rnn_output(self, input_word_orig, input_word, input_char, mask=None,
                    length=None, hx=None, show_net=False):
    input, length = self._get_word_enc(
        input_word_orig, input_word, input_char, mask=mask, length=length,
        show_net=show_net)

    output, hn = self._get_rnn_enc(input, length, mask, hx, show_net=show_net)

    if self.tag_space:
        # [batch, length, tag_space]
        output = self.dropout_tag(F.elu(self.lstm_to_tag_space(output)))
        if show_net:
            print("[Net] to_tag")
            show_var(["self.lstm_to_tag_space"])
            show_var(["F.elu"])
            show_var(["self.dropout_tag"])

    return output, hn, mask, length

def example():
    soup = BeautifulSoup(features="html.parser")

    ls = []
    ls += ["Hello world "]
    tag = soup_tag(soup, "red", 'font', style="border:2px solid Tomato;")
    ls += [tag]
    tag = soup_tag(soup, "maggie", "sub")
    ls += [tag]
    tag = soup_tag(soup, "important things", "b")
    ls += [tag]
    ls += [soup_tag(soup, None, "br")]
    ls += [" are transient."]
    ls += [soup.new_tag("br")]
    ls += ["end."]

    soup = soupls2html(soup, ls)
    show_var(["soup"])  # , "soup.prettify()"
    return soup

def forward(self, q, k, v, attn_mask=None, show_net=False):
    assert len(q.size()) == 3

    if self.ff_layers == 1:
        q_out, k_out = self._fc(self.linear1, q, k, use_elu=False)
        if show_net:
            show_var(["self.linear1"])
    elif self.ff_layers == 2:
        q_out, k_out = self._fc(self.linear1, q, k)
        q_out, k_out = self._fc(self.linear2, q_out, k_out, use_elu=False)
        if show_net:
            show_var(["self.linear1", "self.linear2"])
    if show_net:
        print("bmm --> self.dropout")
        show_var(["self.comb_att_n_init_adj"])

    attn = torch.bmm(q_out, k_out.transpose(1, 2))  # / self.temper  # cos sim needs normalization

    if attn_mask is not None:
        assert attn_mask.size() == attn.size(), \
            'Attention mask shape {} mismatch ' \
            'with Attention logit tensor shape ' \
            '{}.'.format(attn_mask.size(), attn.size())
        attn.masked_fill_(attn_mask, -float('inf'))

    if attn_mask is not None:
        attn.data.masked_fill_(attn_mask, 0)  # convert NaN to 0

    attn = self.dropout(attn)
    import pdb; pdb.set_trace()
    output = self.comb_att_n_init_adj(attn, v)

    return output, attn

cnt = counter.values()
radius = np.array(list(cnt)) / 2
area = radius ** 2 * np.pi

# Fixing random state for reproducibility
np.random.seed(19680801)
colors = np.random.rand(len(counter))

plt.scatter(x, y, s=area, c=colors, alpha=0.5)
plt.xlabel('# Triples of the Input Graph in WebNLG 2017')
plt.ylabel('# Variations of Generated Text')
plt.title("Text Diversity Generated by CycleCVAE")

x2argmax_y = {}
for (x, y), cnt in counter.items():
    if cnt < 10:
        continue
    if x in x2argmax_y:
        prev_y, prev_cnt = x2argmax_y[x]
        if cnt < prev_cnt:
            continue
    x2argmax_y[x] = (y, cnt)
x = list(x2argmax_y.keys())
y, cnt = zip(*x2argmax_y.values())
show_var(['x', 'y'])

import pdb
pdb.set_trace()
plt.plot(x, y)
plt.show()

    stdout, stderr = shell(cmd)
    stdout = stdout.strip()
    mem = int(stdout) if stdout != b'' else None
    return mem


def debug(what_to_debug=''):
    if what_to_debug:
        print("[Info] start debugging {}".format(what_to_debug))

    import pdb
    pdb.set_trace()
    # you can use "c" for continue, "p variable", "p locals()", "n" for next
    # you can use "!a += 1" to change variables
    # you can use "import code; code.interact(local=locals())" to drop into an
    # interactive console with all local variables


if __name__ == "__main__":
    a = "something"
    b = 1
    show_var(["a", "b"], joiner=', ')

    debug("show_var")
    show_var(["c"])

def main():
    # Arguments parser
    parser = argparse.ArgumentParser(
        description='Tuning with DNN Model for NER')
    # Model Hyperparameters
    parser.add_argument('--mode', choices=['RNN', 'LSTM', 'GRU'],
                        help='architecture of rnn', default='LSTM')
    parser.add_argument('--encoder_mode', choices=['cnn', 'lstm'],
                        help='Encoder type for sentence encoding',
                        default='lstm')
    parser.add_argument('--char_method', choices=['cnn', 'lstm'],
                        help='Method to create character-level embeddings',
                        required=True)
    parser.add_argument('--hidden_size', type=int, default=128,
                        help='Number of hidden units in RNN for sentence level')
    parser.add_argument('--char_hidden_size', type=int, default=30,
                        help='Output character-level embeddings size')
    parser.add_argument('--char_dim', type=int, default=30,
                        help='Dimension of Character embeddings')
    parser.add_argument('--tag_space', type=int, default=0,
                        help='Dimension of tag space')
    parser.add_argument('--num_layers', type=int, default=1,
                        help='Number of layers of RNN')
    parser.add_argument('--dropout', choices=['std', 'gcn'],
                        help='Dropout method', default='gcn')
    parser.add_argument('--p_em', type=float, default=0.33,
                        help='dropout rate for input embeddings')
    parser.add_argument('--p_in', type=float, default=0.33,
                        help='dropout rate for input of RNN model')
    parser.add_argument('--p_rnn', nargs=3, type=float, required=True,
                        help='dropout rate for RNN')
    parser.add_argument('--p_tag', type=float, default=0.33,
                        help='dropout rate for output layer')
    parser.add_argument('--bigram', action='store_true',
                        help='bi-gram parameter for CRF')
    parser.add_argument('--adj_attn',
                        choices=['cossim', 'flex_cossim', 'flex_cossim2',
                                 'concat', '', 'multihead'],
                        default='')

    # Data loading and storing params
    parser.add_argument('--embedding_dict', help='path for embedding dict')
    parser.add_argument('--dataset_name', type=str, default='alexa',
                        help='Which dataset to use')
    parser.add_argument('--train', type=str, required=True,
                        help='Path of train set')
    parser.add_argument('--dev', type=str, required=True,
                        help='Path of dev set')
    parser.add_argument('--test', type=str, required=True,
                        help='Path of test set')
    parser.add_argument('--results_folder', type=str, default='results',
                        help='The folder to store results')
    parser.add_argument('--alphabets_folder', type=str,
                        default='data/alphabets',
                        help='The folder to store alphabets files')

    # Training parameters
    parser.add_argument('--cuda', action='store_true',
                        help='whether using GPU')
    parser.add_argument('--num_epochs', type=int, default=100,
                        help='Number of training epochs')
    parser.add_argument('--batch_size', type=int, default=16,
                        help='Number of sentences in each batch')
    parser.add_argument('--learning_rate', type=float, default=0.001,
                        help='Base learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.95,
                        help='Decay rate of learning rate')
    parser.add_argument('--schedule', type=int, default=3,
                        help='schedule for learning rate decay')
    parser.add_argument('--gamma', type=float, default=0.0,
                        help='weight for l2 regularization')
    parser.add_argument('--max_norm', type=float, default=1.,
                        help='Max norm for gradients')
    parser.add_argument('--gpu_id', type=int, nargs='+', required=True,
                        help='which gpu to use for training')
    parser.add_argument('--learning_rate_gcn', type=float, default=5e-4,
                        help='Base learning rate for the GCN part')
    parser.add_argument('--gcn_warmup', type=int, default=200,
                        help='Warm-up steps for the GCN learning rate')
    parser.add_argument('--pretrain_lstm', type=float, default=10,
                        help='Number of epochs to pretrain the LSTM before the GCN')
    parser.add_argument('--adj_loss_lambda', type=float,
                        default=0.)
    parser.add_argument('--lambda1', type=float, default=1.)
    parser.add_argument('--lambda2', type=float, default=0.)
    parser.add_argument('--seed', type=int, default=None)

    # Misc
    parser.add_argument('--embedding', choices=['glove', 'senna', 'alexa'],
                        help='Embedding for words', required=True)
    parser.add_argument('--restore', action='store_true',
                        help='whether restore from stored parameters')
    parser.add_argument('--save_checkpoint', type=str, default='',
                        help='the path to save the model')
    parser.add_argument('--o_tag', type=str, default='O',
                        help='The default tag for outside tag')
    parser.add_argument('--unk_replace', type=float, default=0.,
                        help='The rate to replace a singleton word with UNK')
    parser.add_argument('--evaluate_raw_format', action='store_true',
                        help='The tagging format for evaluation')
    parser.add_argument('--eval_type', type=str, default="micro_f1",
                        choices=['micro_f1', 'acc'])
    parser.add_argument('--show_network', action='store_true',
                        help='whether to display the network structure')
    parser.add_argument('--smooth', action='store_true',
                        help='whether to skip all pdb break points')
    parser.add_argument('--uid', type=str, default='temp')
    parser.add_argument('--misc', type=str, default='')

    args = parser.parse_args()
    show_var(['args'])

    uid = args.uid
    results_folder = args.results_folder
    dataset_name = args.dataset_name
    use_tensorboard = True
    save_dset_dir = '{}../dset/{}/graph'.format(results_folder, dataset_name)
    result_file_path = '{}/{dataset}_{uid}_result'.format(
        results_folder, dataset=dataset_name, uid=uid)
    save_loss_path = '{}/{dataset}_{uid}_loss'.format(
        results_folder, dataset=dataset_name, uid=uid)
    save_lr_path = '{}/{dataset}_{uid}_lr'.format(
        results_folder, dataset=dataset_name, uid='temp')
    save_tb_path = '{}/tensorboard/'.format(results_folder)

    logger = get_logger("NERCRF")
    loss_recorder = LossRecorder(uid=uid)
    record = TensorboardLossRecord(use_tensorboard, save_tb_path, uid=uid)

    # rename the parameters
    mode = args.mode
    encoder_mode = args.encoder_mode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    char_hidden_size = args.char_hidden_size
    char_method = args.char_method
    learning_rate = args.learning_rate
    momentum = 0.9
    decay_rate = args.decay_rate
    gamma = args.gamma
    max_norm = args.max_norm
    schedule = args.schedule
    dropout = args.dropout
    p_em = args.p_em
    p_rnn = tuple(args.p_rnn)
    p_in = args.p_in
    p_tag = args.p_tag
    unk_replace = args.unk_replace
    bigram = args.bigram
    embedding = args.embedding
    embedding_path = args.embedding_dict
    evaluate_raw_format = args.evaluate_raw_format
    o_tag = args.o_tag
    restore = args.restore
    save_checkpoint = args.save_checkpoint
    alphabets_folder = args.alphabets_folder
    use_elmo = False
    p_em_vec = 0.
    graph_model = 'gnn'
    coref_edge_filt = ''
    learning_rate_gcn = args.learning_rate_gcn
    gcn_warmup = args.gcn_warmup
    pretrain_lstm = args.pretrain_lstm
    adj_loss_lambda = args.adj_loss_lambda
    lambda1 = args.lambda1
    lambda2 = args.lambda2

    if args.smooth:
        import pdb
        pdb.set_trace = lambda: None

    misc = "{}".format(str(args.misc))

    score_file = "{}/{dataset}_{uid}_score".format(
        results_folder, dataset=dataset_name, uid=uid)

    for folder in [results_folder, alphabets_folder, save_dset_dir]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    def set_seed(seed):
        if not seed:
            seed = int(show_time())
        print("[Info] seed set to: {}".format(seed))
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True

    set_seed(args.seed)

    embedd_dict, embedd_dim = utils.load_embedding_dict(embedding,
                                                        embedding_path)

    logger.info("Creating Alphabets")
    word_alphabet, char_alphabet, ner_alphabet = conll03_data.create_alphabets(
        "{}/{}/".format(alphabets_folder, dataset_name), train_path,
        data_paths=[dev_path, test_path], embedd_dict=embedd_dict,
        max_vocabulary_size=50000)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())

    logger.info("Reading Data")
    device = torch.device('cuda') if args.cuda else torch.device('cpu')
    print(device)

    data_train = conll03_data.read_data(
        train_path, word_alphabet, char_alphabet, ner_alphabet, graph_model,
        batch_size, ori_order=False,
        total_batch="{}x".format(num_epochs + 1),
        unk_replace=unk_replace, device=device,
        save_path=save_dset_dir + '/train',
        coref_edge_filt=coref_edge_filt)  # , shuffle=True,

    num_data = data_train.data_len
    num_labels = ner_alphabet.size()
    graph_types = data_train.meta_info['graph_types']

    data_dev = conll03_data.read_data(
        dev_path, word_alphabet, char_alphabet, ner_alphabet, graph_model,
        batch_size, ori_order=True, unk_replace=unk_replace, device=device,
        save_path=save_dset_dir + '/dev', coref_edge_filt=coref_edge_filt)

    data_test = conll03_data.read_data(
        test_path, word_alphabet, char_alphabet, ner_alphabet, graph_model,
        batch_size, ori_order=True, unk_replace=unk_replace, device=device,
        save_path=save_dset_dir + '/test', coref_edge_filt=coref_edge_filt)

    writer = CoNLL03Writer(word_alphabet, char_alphabet, ner_alphabet)

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[conll03_data.UNK_ID, :] = np.random.uniform(
            -scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if word in embedd_dict:
                embedding = embedd_dict[word]
            elif word.lower() in embedd_dict:
                embedding = embedd_dict[word.lower()]
            else:
                embedding = np.random.uniform(
                    -scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    logger.info("constructing network...")

    char_dim = args.char_dim
    window = 3
    num_layers = args.num_layers
    tag_space = args.tag_space
    initializer = nn.init.xavier_uniform_

    p_gcn = [0.5, 0.5]
    d_graph = 256
    d_out = 256
    d_inner_hid = 128
    d_k = 32
    d_v = 32
    n_head = 4
    n_gcn_layer = 1
    p_rnn2 = [0.0, 0.5, 0.5]
    adj_attn = args.adj_attn
    mask_singles = True
    post_lstm = 1
    position_enc_mode = 'none'
    adj_memory = False

    if dropout == 'gcn':
        network = BiRecurrentConvGraphCRF(
            embedd_dim, word_alphabet.size(), char_dim,
            char_alphabet.size(), char_hidden_size, window, mode, encoder_mode,
            hidden_size, num_layers, num_labels, graph_model, n_head, d_graph,
            d_inner_hid, d_k, d_v, p_gcn, n_gcn_layer, d_out,
            post_lstm=post_lstm, mask_singles=mask_singles,
            position_enc_mode=position_enc_mode, adj_attn=adj_attn,
            adj_loss_lambda=adj_loss_lambda, tag_space=tag_space,
            embedd_word=word_table, use_elmo=use_elmo, p_em_vec=p_em_vec,
            p_em=p_em, p_in=p_in, p_tag=p_tag, p_rnn=p_rnn, p_rnn2=p_rnn2,
            bigram=bigram, initializer=initializer)
    elif dropout == 'std':
        network = BiRecurrentConvCRF(
            embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(),
            char_hidden_size, window, mode, encoder_mode, hidden_size,
            num_layers, num_labels, tag_space=tag_space,
            embedd_word=word_table, use_elmo=use_elmo, p_em_vec=p_em_vec,
            p_em=p_em, p_in=p_in, p_tag=p_tag, p_rnn=p_rnn, bigram=bigram,
            initializer=initializer)

    # whether restore from trained model
    if restore:
        network.load_state_dict(
            torch.load(save_checkpoint + '_best.pth'))  # load trained model

    logger.info("cuda()ing network...")
    network = network.to(device)

    if dataset_name == 'conll03' and data_dev.data_len > 26:
        sample = data_dev.pad_batch(data_dev.dataset[25:26])
    else:
        sample = data_dev.pad_batch(data_dev.dataset[:1])
    plot_att_change(sample, network, record, save_tb_path + 'att/',
                    uid='temp', epoch=0, device=device,
                    word_alphabet=word_alphabet, show_net=args.show_network,
                    graph_types=data_train.meta_info['graph_types'])

    logger.info("finished cuda()ing network...")

    lr = learning_rate
    lr_gcn = learning_rate_gcn
    optim = Optimizer('sgd', 'adam', network, dropout, lr=learning_rate,
                      lr_gcn=learning_rate_gcn, wd=0., wd_gcn=0.,
                      momentum=momentum, lr_decay=decay_rate,
                      schedule=schedule, gcn_warmup=gcn_warmup,
                      pretrain_lstm=pretrain_lstm)
    nn.utils.clip_grad_norm_(network.parameters(), max_norm)

    logger.info(
        "Network: %s, encoder_mode=%s, num_layer=%d, hidden=%d, char_hidden_size=%d, char_method=%s, tag_space=%d, crf=%s" %
        (mode, encoder_mode, num_layers, hidden_size, char_hidden_size,
         char_method, tag_space, 'bigram' if bigram else 'unigram'))
    logger.info(
        "training: l2: %f, (#training data: %d, batch: %d, unk replace: %.2f)" %
        (gamma, num_data, batch_size, unk_replace))
    logger.info("dropout(in, out, rnn): (%.2f, %.2f, %s)" %
                (p_in, p_tag, p_rnn))

    num_batches = num_data // batch_size + 1
    dev_f1 = 0.0
    dev_acc = 0.0
    dev_precision = 0.0
    dev_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0
    best_test_f1 = 0.0
    best_test_acc = 0.0
    best_test_precision = 0.0
    best_test_recall = 0.0
    best_test_epoch = 0.0

    loss_recorder.start(save_loss_path, mode='w', misc=misc)
    fwrite('', save_lr_path)
    fwrite(json.dumps(vars(args)) + '\n', result_file_path)

    for epoch in range(1, num_epochs + 1):
        show_var(['misc'])
        lr_state = 'Epoch %d (uid=%s, lr=%.2E, lr_gcn=%.2E, decay rate=%.4f): ' % (
            epoch, uid, Decimal(optim.curr_lr), Decimal(optim.curr_lr_gcn),
            decay_rate)
        print(lr_state)
        fwrite(lr_state[:-2] + '\n', save_lr_path, mode='a')

        train_err = 0.
        train_err2 = 0.
        train_total = 0.
        start_time = time.time()
        num_back = 0
        network.train()
        for batch_i in range(1, num_batches + 1):
            batch_doc = data_train.next()
            char, word, posi, labels, feats, adjs, words_en = [
                batch_doc[i] for i in
                ["chars", "word_ids", "posi", "ner_ids", "feat_ids", "adjs",
                 "words_en"]]
            sent_word, sent_char, sent_labels, sent_mask, sent_length, _ = \
                network._doc2sent(word, char, labels)

            optim.zero_grad()
            adjs_into_model = adjs if adj_memory else adjs.clone()

            loss, (ner_loss, adj_loss) = network.loss(
                None, word, char, adjs_into_model, labels,
                graph_types=graph_types, lambda1=lambda1, lambda2=lambda2)
            # loss = network.loss(_, sent_word, sent_char, sent_labels, mask=sent_mask)
            loss.backward()
            optim.step()

            with torch.no_grad():
                num_inst = sent_mask.size(0)
                train_err += ner_loss * num_inst
                train_err2 += adj_loss * num_inst
                train_total += num_inst

            time_ave = (time.time() - start_time) / batch_i
            time_left = (num_batches - batch_i) * time_ave

            # update log
            if batch_i % 20 == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss1: %.4f, loss2: %.4f, time left (estimated): %.2fs' % (
                    batch_i, num_batches, train_err / train_total,
                    train_err2 / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

            optim.update(epoch, batch_i, num_batches, network)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print('train: %d loss: %.4f, loss2: %.4f, time: %.2fs' % (
            num_batches, train_err / train_total, train_err2 / train_total,
            time.time() - start_time))

        # evaluate performance on dev data
        with torch.no_grad():
            network.eval()
            tmp_filename = "{}/{dataset}_{uid}_output_dev".format(
                results_folder, dataset=dataset_name, uid=uid)
            writer.start(tmp_filename)

            for batch in data_dev:
                char, word, posi, labels, feats, adjs, words_en = [
                    batch[i] for i in
                    ["chars", "word_ids", "posi", "ner_ids", "feat_ids",
                     "adjs", "words_en"]]
                sent_word, sent_char, sent_labels, sent_mask, sent_length, _ = \
                    network._doc2sent(word, char, labels)

                preds, _ = network.decode(
                    None, word, char, adjs.clone(), target=labels,
                    leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS,
                    graph_types=graph_types)
                # preds, _ = network.decode(_, sent_word, sent_char, target=sent_labels, mask=sent_mask,
                #                           leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
                writer.write(sent_word.cpu().numpy(), preds.cpu().numpy(),
                             sent_labels.cpu().numpy(),
                             sent_length.cpu().numpy())
            writer.close()

            if args.eval_type == "acc":
                acc, precision, recall, f1 = evaluate_tokenacc(tmp_filename)
                f1 = acc
            else:
                acc, precision, recall, f1 = evaluate(
                    tmp_filename, score_file, evaluate_raw_format, o_tag)
            print('dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                  % (acc, precision, recall, f1))

            # plot loss and attention
            record.plot_loss(epoch, train_err / train_total, f1)
            plot_att_change(sample, network, record, save_tb_path + 'att/',
                            uid="{}_{:03d}".format(uid, epoch), epoch=epoch,
                            device=device, word_alphabet=word_alphabet,
                            show_net=False, graph_types=graph_types)

            if dev_f1 < f1:
                dev_f1 = f1
                dev_acc = acc
                dev_precision = precision
                dev_recall = recall
                best_epoch = epoch

                # evaluate on test data when better performance detected
                tmp_filename = "{}/{dataset}_{uid}_output_test".format(
                    results_folder, dataset=dataset_name, uid=uid)
                writer.start(tmp_filename)

                for batch in data_test:
                    char, word, posi, labels, feats, adjs, words_en = [
                        batch[i] for i in
                        ["chars", "word_ids", "posi", "ner_ids", "feat_ids",
                         "adjs", "words_en"]]
                    sent_word, sent_char, sent_labels, sent_mask, sent_length, _ = \
                        network._doc2sent(word, char, labels)

                    preds, _ = network.decode(
                        None, word, char, adjs.clone(), target=labels,
                        leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS,
                        graph_types=graph_types)
                    # preds, _ = network.decode(_, sent_word, sent_char, target=sent_labels, mask=sent_mask,
                    #                           leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
                    writer.write(sent_word.cpu().numpy(), preds.cpu().numpy(),
                                 sent_labels.cpu().numpy(),
                                 sent_length.cpu().numpy())
                writer.close()

                if args.eval_type == "acc":
                    test_acc, test_precision, test_recall, test_f1 = \
                        evaluate_tokenacc(tmp_filename)
                    test_f1 = test_acc
                else:
                    test_acc, test_precision, test_recall, test_f1 = evaluate(
                        tmp_filename, score_file, evaluate_raw_format, o_tag)

                if best_test_f1 < test_f1:
                    best_test_acc, best_test_precision, best_test_recall, best_test_f1 = \
                        test_acc, test_precision, test_recall, test_f1
                    best_test_epoch = epoch

                # save the model parameters
                if save_checkpoint:
                    torch.save(network.state_dict(),
                               save_checkpoint + '_best.pth')

            print("best dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (
                dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
            print("best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (
                test_acc, test_precision, test_recall, test_f1, best_epoch))
            print("overall best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (
                best_test_acc, best_test_precision, best_test_recall,
                best_test_f1, best_test_epoch))

            # optim.update(epoch, 1, num_batches, network)
            loss_recorder.write(epoch, train_err / train_total,
                                train_err2 / train_total,
                                Decimal(optim.curr_lr),
                                Decimal(optim.curr_lr_gcn),
                                f1, best_test_f1, test_f1)

        with open(result_file_path, 'a') as ofile:
            ofile.write("best dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)\n" % (
                dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
            ofile.write("best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)\n" % (
                test_acc, test_precision, test_recall, test_f1, best_epoch))
            ofile.write("overall best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)\n\n" % (
                best_test_acc, best_test_precision, best_test_recall,
                best_test_f1, best_test_epoch))

    record.close()
    print('Training finished!')

def _get_gcn_output(self, input_word_orig, input_word, input_char, adjs,
                    target=None, mask=None, length=None, hx=None,
                    leading_symbolic=0, return_edge=False, show_net=False,
                    graph_types=['coref']):
    if "wonderful" in graph_types:
        gold_adj = adjs[:, -1, :].clone()
        gnn_adjs = adjs[:, :-1, :]
    mask_singles = self.mask_singles

    assert len(input_word.size()) == 3, "the input is not document level"

    # input_word is the packed sents [n_sent, sent_len]
    input_word, input_char, target, sent_mask, length, doc_n_sent = \
        self._doc2sent(input_word, input_char, target, show_net=show_net)

    # input: [n_sent, sent_len, enc_dim]
    input, length = self._get_word_enc(
        input_word_orig, input_word, input_char, mask=sent_mask,
        length=length, show_net=show_net)

    # output from rnn [n_sent, sent_len, enc_dim]
    sent_output, hn = self._get_rnn_enc(input, length, sent_mask, hx,
                                        show_net=show_net)

    # flatten sents to words [batch, n_word, dim]
    # mask for packed_doc [batch, n_word]
    output, doc_word_mask = self._sent2word(sent_output, sent_mask,
                                            doc_n_sent, show_net=show_net)

    # enc for non-repetitive words
    if mask_singles:
        if show_net:
            print("[Net] Block singles from here.")
        coref_ix = 0
        # single is 1, repetitive word is 0
        single_mask = gnn_adjs[:, coref_ix].sum(-1, keepdim=True).eq(0).float()
        sent_single_mask = self._word2sent(single_mask, doc_word_mask, length,
                                           sent_mask, show_net=show_net)
        singles = sent_output * sent_single_mask.expand_as(sent_output)
        if self.tag_space:
            # [batch, length, tag_space]
            singles = self.dropout_tag(F.elu(self.lstm_to_tag_space(singles)))
            if show_net:
                print("singles -> self.lstm_to_tag_space")
        singles = singles * sent_single_mask.expand_as(singles)

        # [batch, n_word, d_graph]
        output = output * (1 - single_mask).expand_as(output)

    # go thru gcn [batch, n_word, d_graph]
    h_gcn, *_ = self.gcn(output, gnn_adjs, doc_word_mask,
                         return_edge=return_edge, show_net=show_net)
    output = self._word2sent(h_gcn, doc_word_mask, length, sent_mask,
                             show_net=show_net)

    if self.post_lstm:
        # output from rnn [n_sent, sent_len, enc_dim]
        output, hn = self._get_rnn_enc2(output, length, sent_mask, hx,
                                        show_net=show_net)

    # output from rnn_out [batch, length, tag_space]
    output = self.dropout_tag(F.elu(self.to_tag_space(output)))
    if show_net:
        print("<")
        print("[Net] to_tag")
        show_var(["self.to_tag_space"])
        show_var(["F.elu"])
        show_var(["self.dropout_tag"])
        print(">")

    if mask_singles:
        output = output * (1 - sent_single_mask).expand_as(output)
        output = output + singles  # repetitive word enc + single word enc
        if show_net:
            print("[Net] output + singles")

    if length is not None:
        max_len = length.max()
        target = target[:, :max_len]

    adj_loss = self._adj_loss(gnn_adjs[:, 0, :], gold_adj) \
        if "wonderful" in graph_types else 0

    return output, target, sent_mask, length, adj_loss

def _get_word_enc(self, input_word_orig, input_word, input_char, mask=None,
                  length=None, show_net=False):
    # hack length from mask
    # we do not hack mask from length for special reasons.
    # Thus, always provide mask if it is necessary.
    if length is None and mask is not None:
        length = mask.sum(dim=1).long()

    if self.p_em_vec:
        word = embedded_dropout(
            self.word_embedd, input_word,
            dropout=self.p_em_vec if self.training else 0)
        char = embedded_dropout(
            self.char_embedd, input_char,
            dropout=self.p_em_vec if self.training else 0)
    else:
        # [batch, length, word_dim]
        word = self.word_embedd(input_word)
        # [batch, length, char_length, char_dim]
        char = self.char_embedd(input_char)

    char_size = char.size()
    if self.char_method == 'cnn':
        # first transform to [batch * length, char_length, char_dim]
        # then transpose to [batch * length, char_dim, char_length]
        char = char.view(char_size[0] * char_size[1], char_size[2],
                         char_size[3]).transpose(1, 2)
        # put into cnn [batch * length, char_filters, char_length]
        # then put into maxpooling [batch * length, char_filters]
        char, _ = self.char_conv1d(char).max(dim=2)
        # reshape to [batch, length, char_filters]
        char = torch.tanh(char).view(char_size[0], char_size[1], -1)
    else:
        # first transform to [batch * length, char_length, char_dim]
        char = char.view(char_size[0] * char_size[1], char_size[2],
                         char_size[3])
        # put into rnn module and get the last hidden state
        _, (char, _) = self.char_rnn(char)
        # reshape to [batch, length, char_hidden_size]
        char = char.view(char_size[0], char_size[1], -1)

    # apply dropout word on input
    word = self.dropout_em(word)
    char = self.dropout_em(char)

    # concatenate word and char [batch, length, word_dim+char_filter]
    # choose whether to concatenate the ELMO embeddings
    if self.elmo:
        elmo_embeddings = self.elmo(batch_to_ids(input_word_orig))
        # TODO: the coefficient for elmo needs to be tuned
        input = torch.cat([word, char, 0.1 * elmo_embeddings], dim=2)
    else:
        input = torch.cat([word, char], dim=2)

    if show_net:
        print("[Net] _get_word_enc: torch.cat([word {}, char {}])".format(
            word.shape[-1], char.shape[-1]))
        show_var(["self.dropout_em"])

    return input, length

radius = np.array(list(cnt)) / 2
area = radius ** 2 * np.pi

# Fixing random state for reproducibility
np.random.seed(19680801)
colors = np.random.rand(len(counter))

plt.scatter(x, y, s=area, c=colors, alpha=0.5)
plt.xlabel("# Triples of the Input Graph in WebNLG 2017")
plt.ylabel("# Variations of Generated Text")
plt.title("Text Diversity Generated by CycleCVAE")

x2argmax_y = {}
for (x, y), cnt in counter.items():
    if cnt < 10:
        continue
    if x in x2argmax_y:
        prev_y, prev_cnt = x2argmax_y[x]
        if cnt < prev_cnt:
            continue
    x2argmax_y[x] = (y, cnt)
x = list(x2argmax_y.keys())
y, cnt = zip(*x2argmax_y.values())
show_var(["x", "y"])

import pdb
pdb.set_trace()
plt.plot(x, y)
plt.show()

def main():
    raw_text = 'Hello, world. Here are two people with M.A. degrees from UT Austin. This is Mr. Mike.'
    nlp = NLP()
    sentences = nlp.sent_tokenize(raw_text)
    words = nlp.word_tokenize(sentences[0], lower=True)
    show_var(['sentences', 'words'])

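# Hedged note on what main() above is expected to show: the exact split depends
# on the tokenizer backend behind NLP(), so the output below is indicative only.
# The point of the example is that abbreviations such as "M.A." and "Mr." should
# not end a sentence.
# sentences -> ['Hello, world.',
#               'Here are two people with M.A. degrees from UT Austin.',
#               'This is Mr. Mike.']
# words     -> ['hello', ',', 'world', '.']
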
def forward(self, h_gcn, adjs, doc_word_mask, return_edge=False,
            show_net=False):
    gcn_model = self.gcn_model

    posi = [sent.nonzero().view(-1) for sent in doc_word_mask]
    posi = torch.stack(posi) + 1

    batch_size, n_node = h_gcn.size()[:2]
    slf_attn_pad_mask = get_attn_padding_mask(doc_word_mask, doc_word_mask)

    if len(self.gcn_layers) > 0:
        if show_net:
            print("<")
            print("[Net_gcn] gcn prep:")
            show_var(["self.to_graph", "F.elu", "self.dropout_gcn_in",
                      "self.position_enc"])
        h_gcn = self.dropout_gcn_in(F.elu(self.to_graph(h_gcn)))
        # h_gcn = torch.cat((init_enc, pos_enc), dim=2)
        h_gcn = h_gcn + self.position_enc(posi, h_gcn)

    if self.adj_attn_type:
        if show_net:
            show_var(["self.adj_attn_type"])
        old_adjs = adjs.clone()
        coref_adj = old_adjs[:, 0]
        new_adj, attns = self.adj_attn(h_gcn, h_gcn, coref_adj,
                                       show_net=show_net)
        adjs[:, 0] = new_adj

    if return_edge:
        edge_weights = []

    adjs = adjs if gcn_model in ['gnn', 'gnn1'] else adjs.squeeze(1)
    slf_attn_adj_mask = get_attn_adj_mask(adjs)

    opts = (adjs, doc_word_mask) if gcn_model in ['gnn', 'gnn1'] \
        else (adjs,) if gcn_model == 'gnnattn' \
        else (slf_attn_pad_mask,) if gcn_model == 'transformer' \
        else (slf_attn_adj_mask,) if gcn_model == 'transformer_graph' \
        else None

    for layer_i, layer in enumerate(self.gcn_layers):
        if show_net:
            print("gcn [{m}] layer {i}: {l}".format(m=gcn_model, i=layer_i,
                                                    l=layer))
            print(">gcn")
        h_gcn, edge_weig = layer(h_gcn, *opts)
        if return_edge:
            edge_weights += [edge_weig]

    if self.ga_heads > 0:
        h_gcn = self.ga_layer(h_gcn, doc_word_mask)
        # if return_edge:
        #     edge_weights += [edge_weig]

    h_gcn = h_gcn.view(batch_size, n_node, -1)

    if return_edge:
        edge_weights = torch.stack(edge_weights)
        return h_gcn, edge_weights
    else:
        return h_gcn,