def main(argv=None):
    restore_param = util.load_from_dump(os.path.join(FLAGS.train_dir, 'flags.cPickle'))
    restore_param['train_dir'] = FLAGS.train_dir

    if restore_param.get('contextwise'):
        source_path = os.path.join(restore_param['data_dir'], "ids")
        target_path = os.path.join(restore_param['data_dir'], "target.txt")
        _, data = util.read_data_contextwise(source_path, target_path,
                                             restore_param['sent_len'],
                                             train_size=restore_param['train_size'])
    else:
        source_path = os.path.join(restore_param['data_dir'], "ids.txt")
        target_path = os.path.join(restore_param['data_dir'], "target.txt")
        _, data = util.read_data(source_path, target_path,
                                 restore_param['sent_len'],
                                 train_size=restore_param['train_size'])

    pre, rec = evaluate(data, restore_param)
    util.dump_to_file(os.path.join(FLAGS.train_dir, 'results.cPickle'),
                      {'precision': pre, 'recall': rec})

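# The pickle helpers used throughout these scripts are not shown. A minimal
# sketch, assuming util.dump_to_file / util.load_from_dump are thin pickle
# wrappers with the path-first signature used above (an assumption
# reconstructed from call sites, not the original implementation):
try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle  # Python 3

def dump_to_file(path, obj):
    # Serialize obj to path (assumed helper, matching the call sites above).
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def load_from_dump(path):
    # Inverse of dump_to_file: load and return the pickled object.
    with open(path, 'rb') as f:
        return pickle.load(f)
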
def resolve(exit_desc):
    """
    Resolve exit relay-specific domain.
    """
    exit_url = util.exiturl(exit_desc.fingerprint)

    # Prepend the exit relay's fingerprint so we know which relay issued the
    # DNS request.
    fingerprint = exit_desc.fingerprint.encode("ascii", "ignore")
    domain = "%s.%s.%s" % (fingerprint, time.strftime("%Y-%m-%d-%H"), TARGET_DOMAIN)

    sock = torsocks.torsocket()
    sock.settimeout(10)

    # Resolve the domain using Tor's SOCKS extension.
    log.debug("Resolving %s over %s." % (domain, exit_url))
    try:
        ipv4 = sock.resolve(domain)
    except error.SOCKSv5Error as err:
        # This is expected because all domains resolve to 127.0.0.1.
        log.warning("SOCKSv5 error while resolving domain: %s" % err)
        ipv4 = "0.0.0.0"
    except socket.timeout as err:
        log.debug("Socket over exit relay %s timed out: %s" % (exit_url, err))
        return

    log.info("Successfully resolved domain over %s to %s." % (exit_url, ipv4))

    # Log a CSV line including timestamp, exit fingerprint, exit IP address,
    # and the address the domain resolved to.
    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S_%z")
    content = "%s, %s, %s, %s\n" % (timestamp, fingerprint, exit_desc.address, ipv4)
    util.dump_to_file(content, fingerprint)

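# util.exiturl is not shown here. A plausible sketch, assuming it only
# builds a human-readable relay-search URL from the fingerprint for log
# messages; the exact URL format below is a guess, not from the source:
def exiturl(fingerprint):
    # Hypothetical helper: map a relay fingerprint to a clickable URL.
    return "https://metrics.torproject.org/rs.html#details/%s" % fingerprint
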
def main(argv=None):
    restore_param = util.load_from_dump(os.path.join(FLAGS.train_dir, 'flags.cPickle'))
    restore_param['train_dir'] = FLAGS.train_dir
    source_path = os.path.join(restore_param['data_dir'], 'test_cs_unlabeled_data_combined.txt')
    target_path = os.path.join(restore_param['data_dir'], 'test_cs_labels_combined.txt')
    vocab_path = os.path.join(restore_param['data_dir'], 'test_cs_vocab_combined')

    _, data = util.read_data(source_path, target_path, restore_param['sent_len'],
                             train_size=restore_param['train_size'],
                             hide_key_phrases=restore_param.get('hide_key_phrases', False))

    pre, rec, x_input, expected_output, actual_output = evaluate(data, restore_param)

    actual_output_exp = np.exp(actual_output)
    actual_output_softmax = actual_output_exp / np.sum(actual_output_exp, axis=1, keepdims=True)
    output_difference = np.sum(np.abs(actual_output_softmax - expected_output), axis=1)

    sentence_indices_input = x_input[:, :-2]
    _, rev_vocab = preprocessing_util.initialize_vocabulary(vocab_path)
    sentence_input = preprocessing_util.indices_to_sentences(sentence_indices_input, rev_vocab)

    kp_indices_input = x_input[:, -2:]

    print('Diff\tType\tSentence\t\tExpected Score (A is-a B, B is-a A, Neither)\tActual Score')
    for sentence_i, sentence in enumerate(sentence_input):
        # Label the key phrases of interest in the current sentence with *.
        sentence[kp_indices_input[sentence_i, 1]] += '*'
        sentence[kp_indices_input[sentence_i, 0]] += '*'
        current_type = 'Neither'
        if expected_output[sentence_i, 0] == 1:
            current_type = 'A is-a B'
        elif expected_output[sentence_i, 1] == 1:
            current_type = 'B is-a A'
        print('%.3f\t%s\t%s\t\t%s\t%s\t'
              % (output_difference[sentence_i], current_type, ' '.join(sentence),
                 str(expected_output[sentence_i]), str(actual_output_softmax[sentence_i])))

    util.dump_to_file(os.path.join(FLAGS.train_dir, 'results.cPickle'),
                      {'precision': pre, 'recall': rec})

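# Note on the softmax above: np.exp on raw logits can overflow for large
# scores. A numerically stable variant (a sketch, not part of the original
# main()) subtracts the per-row max first; the result is mathematically
# identical:
def stable_softmax(logits):
    # Subtracting the row max leaves softmax unchanged but bounds exp's input.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=1, keepdims=True)
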
def train(train_data, test_data):
    # train_dir
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(FLAGS.train_dir, timestamp))

    # save flags
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    FLAGS._parse_flags()
    config = dict(FLAGS.__flags.items())

    # Window_size must not be larger than the sent_len
    if config['sent_len'] < config['max_window']:
        config['max_window'] = config['sent_len']

    # flag to restore the contextwise model
    config['split'] = True

    # save flags
    config['train_dir'] = out_dir
    util.dump_to_file(os.path.join(out_dir, 'flags.cPickle'), config)

    # display parameter settings
    print 'Parameters:'
    for k, v in config.iteritems():
        print '\t' + k + '=' + str(v)

    # max number of steps
    num_batches_per_epoch = int(np.ceil(float(len(train_data)) / FLAGS.batch_size))
    max_steps = num_batches_per_epoch * FLAGS.num_epochs

    with tf.Graph().as_default():
        with tf.variable_scope('cnn', reuse=None):
            m = cnn_split.Model(config, is_train=True)
        with tf.variable_scope('cnn', reuse=True):
            mtest = cnn_split.Model(config, is_train=False)

        # checkpoint
        saver = tf.train.Saver(tf.all_variables())
        save_path = os.path.join(out_dir, 'model.ckpt')
        summary_op = tf.merge_all_summaries()

        # session
        sess = tf.Session(config=tf.ConfigProto(log_device_placement=FLAGS.log_device_placement))
        with sess.as_default():
            train_summary_writer = tf.train.SummaryWriter(os.path.join(out_dir, "train"), graph=sess.graph)
            dev_summary_writer = tf.train.SummaryWriter(os.path.join(out_dir, "dev"), graph=sess.graph)
            sess.run(tf.initialize_all_variables())

            # assign pretrained embeddings
            if FLAGS.use_pretrain:
                print "Initializing model with pretrained embeddings ..."
                pretrained_embedding = np.load(os.path.join(FLAGS.data_dir, 'emb.npy'))
                m.assign_embedding(sess, pretrained_embedding)

            # initialize parameters
            current_lr = FLAGS.init_lr
            lowest_loss_value = float("inf")
            decay_step_counter = 0
            global_step = 0

            # evaluate on dev set
            def dev_step(mtest, sess):
                dev_loss = []
                dev_auc = []
                dev_f1_score = []

                # create batch
                test_batches = util.batch_iter(test_data, batch_size=FLAGS.batch_size,
                                               num_epochs=1, shuffle=False)
                for batch in test_batches:
                    left_batch, right_batch, y_batch, n_batch = zip(*batch)
                    feed = {mtest.left: np.array(left_batch),
                            mtest.right: np.array(right_batch),
                            mtest.labels: np.array(y_batch)}
                    if FLAGS.negative:
                        feed[mtest.negative] = np.array(n_batch)
                    loss_value, eval_value = sess.run([mtest.total_loss, mtest.eval_op],
                                                      feed_dict=feed)
                    dev_loss.append(loss_value)
                    pre, rec = zip(*eval_value)
                    dev_auc.append(util.calc_auc_pr(pre, rec))
                    dev_f1_score.append((2.0 * pre[5] * rec[5]) / (pre[5] + rec[5]))  # threshold = 0.5

                return (np.mean(dev_loss), np.mean(dev_auc), np.mean(dev_f1_score))

            # train loop
            print "\nStart training (save checkpoints in %s)\n" % out_dir
            train_loss = []
            train_auc = []
            train_f1_score = []
            train_batches = util.batch_iter(train_data, batch_size=FLAGS.batch_size,
                                            num_epochs=FLAGS.num_epochs)
            for batch in train_batches:
                batch_size = len(batch)
                m.assign_lr(sess, current_lr)
                global_step += 1

                left_batch, right_batch, y_batch, n_batch = zip(*batch)
                feed = {m.left: np.array(left_batch),
                        m.right: np.array(right_batch),
                        m.labels: np.array(y_batch)}
                if FLAGS.negative:
                    feed[m.negative] = np.array(n_batch)
                start_time = time.time()
                _, loss_value, eval_value = sess.run([m.train_op, m.total_loss, m.eval_op],
                                                     feed_dict=feed)
                proc_duration = time.time() - start_time
                train_loss.append(loss_value)
                pre, rec = zip(*eval_value)
                auc = util.calc_auc_pr(pre, rec)
                f1 = (2.0 * pre[5] * rec[5]) / (pre[5] + rec[5])  # threshold = 0.5
                train_auc.append(auc)
                train_f1_score.append(f1)
                assert not np.isnan(loss_value), "Model loss is NaN."

                # print log
                if global_step % FLAGS.log_step == 0:
                    examples_per_sec = batch_size / proc_duration
                    format_str = '%s: step %d/%d, f1 = %.4f, auc = %.4f, loss = %.4f ' + \
                                 '(%.1f examples/sec; %.3f sec/batch), lr: %.6f'
                    print format_str % (datetime.now(), global_step, max_steps, f1, auc,
                                        loss_value, examples_per_sec, proc_duration, current_lr)

                # write summary
                if global_step % FLAGS.summary_step == 0:
                    summary_str = sess.run(summary_op)
                    train_summary_writer.add_summary(summary_str, global_step)
                    dev_summary_writer.add_summary(summary_str, global_step)

                    # summary loss, auc, f1
                    train_summary_writer.add_summary(
                        _summary_for_scalar('loss', np.mean(train_loss)), global_step=global_step)
                    train_summary_writer.add_summary(
                        _summary_for_scalar('auc', np.mean(train_auc)), global_step=global_step)
                    train_summary_writer.add_summary(
                        _summary_for_scalar('f1', np.mean(train_f1_score)), global_step=global_step)

                    dev_loss, dev_auc, dev_f1 = dev_step(mtest, sess)
                    dev_summary_writer.add_summary(
                        _summary_for_scalar('loss', dev_loss), global_step=global_step)
                    dev_summary_writer.add_summary(
                        _summary_for_scalar('auc', dev_auc), global_step=global_step)
                    dev_summary_writer.add_summary(
                        _summary_for_scalar('f1', dev_f1), global_step=global_step)

                    print "\n===== write summary ====="
                    print "%s: step %d/%d: train_loss = %.6f, train_auc = %.4f train_f1 = %.4f" \
                        % (datetime.now(), global_step, max_steps,
                           np.mean(train_loss), np.mean(train_auc), np.mean(train_f1_score))
                    print "%s: step %d/%d: dev_loss = %.6f, dev_auc = %.4f dev_f1 = %.4f\n" \
                        % (datetime.now(), global_step, max_steps, dev_loss, dev_auc, dev_f1)

                    # reset containers
                    train_loss = []
                    train_auc = []
                    train_f1_score = []

                # decay learning rate if necessary
                if loss_value < lowest_loss_value:
                    lowest_loss_value = loss_value
                    decay_step_counter = 0
                else:
                    decay_step_counter += 1
                    if decay_step_counter >= FLAGS.tolerance_step:
                        current_lr *= FLAGS.lr_decay
                        print '%s: step %d/%d, Learning rate decays to %.5f' % \
                            (datetime.now(), global_step, max_steps, current_lr)
                        decay_step_counter = 0

                # stop learning if learning rate is too low
                if current_lr < 1e-5:
                    break

                # save checkpoint
                if global_step % FLAGS.checkpoint_step == 0:
                    saver.save(sess, save_path, global_step=global_step)
            saver.save(sess, save_path, global_step=global_step)

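# The summary helper _summary_for_scalar is referenced but not shown. A
# minimal sketch consistent with how train() calls it (it must return a
# tf.Summary proto accepted by add_summary); assumed, not the original:
def _summary_for_scalar(name, value):
    # Wrap a single Python scalar (e.g. a mean loss) in a tf.Summary proto.
    return tf.Summary(value=[tf.Summary.Value(tag=name, simple_value=float(value))])
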
def train(train_data, test_data):
    # train_dir
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(FLAGS.train_dir, timestamp))

    # save flags
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    FLAGS._parse_flags()
    config = dict(FLAGS.__flags.items())

    # Window_size must not be larger than the sent_len
    if config['sent_len'] < config['max_window']:
        config['max_window'] = config['sent_len']

    util.dump_to_file(os.path.join(out_dir, 'flags.cPickle'), config)

    print("Parameters:")
    for k, v in config.items():
        print('%20s %r' % (k, v))

    num_batches_per_epoch = int(np.ceil(float(len(train_data)) / FLAGS.batch_size))
    max_steps = num_batches_per_epoch * FLAGS.num_epochs

    with tf.Graph().as_default():
        with tf.variable_scope('cnn', reuse=None):
            m = cnn.Model(config, is_train=True)
        with tf.variable_scope('cnn', reuse=True):
            mtest = cnn.Model(config, is_train=False)

        # checkpoint
        saver = tf.train.Saver(tf.global_variables())
        save_path = os.path.join(out_dir, 'model.ckpt')
        summary_op = tf.summary.merge_all()

        # session
        with tf.Session().as_default() as sess:
            # embedding projector for TensorBoard
            proj_config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig()
            embedding = proj_config.embeddings.add()
            embedding.tensor_name = m.W_emb.name
            embedding.metadata_path = os.path.join(FLAGS.data_dir, 'vocab.txt')

            train_summary_writer = tf.summary.FileWriter(os.path.join(out_dir, "train"), graph=sess.graph)
            dev_summary_writer = tf.summary.FileWriter(os.path.join(out_dir, "dev"), graph=sess.graph)
            tf.contrib.tensorboard.plugins.projector.visualize_embeddings(train_summary_writer, proj_config)
            tf.contrib.tensorboard.plugins.projector.visualize_embeddings(dev_summary_writer, proj_config)

            sess.run(tf.global_variables_initializer())

            # assign pretrained embeddings
            if FLAGS.use_pretrain:
                print("Initialize model with pretrained embeddings...")
                pretrained_embedding = np.load(os.path.join(FLAGS.data_dir, 'emb.npy'))
                m.assign_embedding(sess, pretrained_embedding)

            # initialize parameters
            current_lr = FLAGS.init_lr
            lowest_loss_value = float("inf")
            decay_step_counter = 0
            global_step = 0

            # evaluate on dev set
            def dev_step(mtest, sess):
                dev_loss = []
                dev_auc = []
                dev_f1_score = []

                # create batch
                test_batches = util.batch_iter(test_data, batch_size=FLAGS.batch_size,
                                               num_epochs=1, shuffle=False)
                for batch in test_batches:
                    x_batch, y_batch, _ = zip(*batch)
                    loss_value, eval_value = sess.run(
                        [mtest.total_loss, mtest.eval_op],
                        feed_dict={mtest.inputs: np.array(x_batch),
                                   mtest.labels: np.array(y_batch)})
                    dev_loss.append(loss_value)
                    pre, rec = zip(*eval_value)
                    # look at the 5th index, which corresponds to a threshold = 0.5
                    threshold = 5
                    dev_auc.append(util.calc_auc_pr(pre, rec, threshold))
                    dev_f1_score.append((2.0 * pre[threshold] * rec[threshold])
                                        / (pre[threshold] + rec[threshold]))

                return np.mean(dev_loss), np.mean(dev_auc), np.mean(dev_f1_score)

            # train loop
            print("\nStart training (save checkpoints in %s)\n" % out_dir)
            train_loss = []
            train_auc = []
            train_f1_score = []
            train_batches = util.batch_iter(train_data, batch_size=FLAGS.batch_size,
                                            num_epochs=FLAGS.num_epochs)
            for batch in train_batches:
                batch_size = len(batch)
                m.assign_lr(sess, current_lr)
                global_step += 1

                x_batch, y_batch, a_batch = zip(*batch)
                feed = {m.inputs: np.array(x_batch), m.labels: np.array(y_batch)}
                if FLAGS.attention:
                    feed[m.attention] = np.array(a_batch)
                start_time = time.time()
                _, loss_value, eval_value = sess.run([m.train_op, m.total_loss, m.eval_op],
                                                     feed_dict=feed)
                proc_duration = time.time() - start_time
                train_loss.append(loss_value)
                pre, rec = zip(*eval_value)
                # look at the 5th index, which corresponds to a threshold = 0.5
                threshold = 5
                auc = util.calc_auc_pr(pre, rec, threshold)
                f1 = (2.0 * pre[threshold] * rec[threshold]) / (pre[threshold] + rec[threshold])
                train_auc.append(auc)
                train_f1_score.append(f1)
                assert not np.isnan(loss_value), "Model loss is NaN."

                # print log
                if global_step % FLAGS.log_step == 0:
                    examples_per_sec = batch_size / proc_duration
                    format_str = '%s: step %d/%d, f1 = %.4f, auc = %.4f, loss = %.4f ' + \
                                 '(%.1f examples/sec; %.3f sec/batch), lr: %.6f'
                    print(format_str % (datetime.now(), global_step, max_steps, f1, auc,
                                        loss_value, examples_per_sec, proc_duration, current_lr))

                # write summary
                if global_step % FLAGS.summary_step == 0:
                    summary_str = sess.run(summary_op)
                    train_summary_writer.add_summary(summary_str, global_step)
                    dev_summary_writer.add_summary(summary_str, global_step)

                    # summary loss, auc, f1
                    train_summary_writer.add_summary(
                        _summary_for_scalar('loss', np.mean(train_loss)), global_step=global_step)
                    train_summary_writer.add_summary(
                        _summary_for_scalar('auc', np.mean(train_auc)), global_step=global_step)
                    train_summary_writer.add_summary(
                        _summary_for_scalar('f1', np.mean(train_f1_score)), global_step=global_step)

                    dev_loss, dev_auc, dev_f1 = dev_step(mtest, sess)
                    dev_summary_writer.add_summary(
                        _summary_for_scalar('loss', dev_loss), global_step=global_step)
                    dev_summary_writer.add_summary(
                        _summary_for_scalar('auc', dev_auc), global_step=global_step)
                    dev_summary_writer.add_summary(
                        _summary_for_scalar('f1', dev_f1), global_step=global_step)

                    print("\n===== write summary =====")
                    print("%s: step %d/%d: train_loss = %.6f, train_auc = %.4f, train_f1 = %.4f"
                          % (datetime.now(), global_step, max_steps,
                             np.mean(train_loss), np.mean(train_auc), np.mean(train_f1_score)))
                    print("%s: step %d/%d: dev_loss = %.6f, dev_auc = %.4f, dev_f1 = %.4f\n"
                          % (datetime.now(), global_step, max_steps, dev_loss, dev_auc, dev_f1))

                    # reset containers
                    train_loss = []
                    train_auc = []
                    train_f1_score = []

                # decay learning rate if necessary
                if loss_value < lowest_loss_value:
                    lowest_loss_value = loss_value
                    decay_step_counter = 0
                else:
                    decay_step_counter += 1
                    if decay_step_counter >= FLAGS.tolerance_step:
                        current_lr *= FLAGS.lr_decay
                        print('%s: step %d/%d, Learning rate decays to %.5f'
                              % (datetime.now(), global_step, max_steps, current_lr))
                        decay_step_counter = 0

                # stop learning if learning rate is too low
                if current_lr < 1e-5:
                    break

                # save checkpoint
                if global_step % FLAGS.checkpoint_step == 0:
                    saver.save(sess, save_path, global_step=global_step)
            saver.save(sess, save_path, global_step=global_step)

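# util.calc_auc_pr is not shown in these listings. A plausible sketch,
# reconstructed from the call sites, where `pre` and `rec` are parallel
# precision/recall sequences sampled at increasing decision thresholds:
import numpy as np

def calc_auc_pr(precision, recall, threshold=None):
    # Area under the precision-recall curve via trapezoidal integration.
    # The `threshold` argument is accepted only to match the newer call
    # signature above; what the original does with it is not shown, so
    # this sketch ignores it.
    precision = np.asarray(precision)
    recall = np.asarray(recall)
    order = np.argsort(recall)  # integrate left to right along recall
    return np.trapz(precision[order], recall[order])
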
def __save_values(self):
    values = [self._ball_thresholds, self._contrasts,
              self._brightnesses, self._gray_thresholds]
    util.dump_to_file(values, self._path_values)

def __save_pitch_size(self):
    util.dump_to_file((self._crop_rect, self._coord_rect), self._path_pitch_size)

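# Hypothetical load counterparts to the two save helpers above (a sketch;
# the original only shows the save side). Assumes util.load_from_dump takes
# the same path and returns the value that was dumped:
def __load_values(self):
    (self._ball_thresholds, self._contrasts,
     self._brightnesses, self._gray_thresholds) = util.load_from_dump(self._path_values)

def __load_pitch_size(self):
    self._crop_rect, self._coord_rect = util.load_from_dump(self._path_pitch_size)
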
def loop(step, doc_id, limit, entities, relations, counter):
    """Distant Supervision Loop"""
    # Download wiki articles
    print '[1/4] Downloading wiki articles ...'
    docs = download_wiki_articles(doc_id, limit)
    if docs is None:
        return None

    # Named Entity Recognition
    print '[2/4] Performing named entity recognition ...'
    exec_ner(docs)
    wiki_data = read_ner_output(docs)
    path = os.path.join(data_dir, 'candidates%d.tsv' % step)
    wiki_data.to_csv(path, sep='\t', encoding='utf-8')
    doc_id.extend([int(s) for s in wiki_data.doc_id.unique()])

    # Prepare Containers
    unique_entities = set([])
    unique_entity_pairs = set([])
    for idx, row in wiki_data.iterrows():
        unique_entities.add((row['subj'], row['subj_tag']))
        unique_entities.add((row['obj'], row['obj_tag']))
        unique_entity_pairs.add((row['subj'], row['obj']))

    # Entity Linkage
    print '[3/4] Linking entities ...'
    for name, tag in unique_entities:
        if name not in entities and tag in tag_map.keys():
            e = name2qid(name, tag, alias=False)
            if e is None:
                e = name2qid(name, tag, alias=True)
            entities[name] = e
    util.dump_to_file(os.path.join(data_dir, "entities.cPickle"), entities)

    # Predicate Linkage
    print '[4/4] Linking predicates ...'
    for subj, obj in unique_entity_pairs:
        if (subj, obj) not in relations:
            if entities[subj] is not None and entities[obj] is not None:
                if (entities[subj][0] != entities[obj][0]) or (subj != obj):
                    arg1 = entities[subj][0]
                    arg2 = entities[obj][0]
                    relations[(subj, obj)] = search_property(arg1, arg2)
                #TODO: alternative name relation
                #elif (entities[subj][0] == entities[obj][0]) and (subj != obj):
                #    relations[(subj, obj)] = 'P'
    util.dump_to_file(os.path.join(data_dir, "relations.cPickle"), relations)

    # Assign relation
    wiki_data['rel'] = pd.Series(index=wiki_data.index, dtype=str)
    for idx, row in wiki_data.iterrows():
        entity_pair = (row['subj'], row['obj'])
        if entity_pair in relations:
            rel = relations[entity_pair]
            if rel is not None and len(rel) > 0:
                counter += 1
                wiki_data.set_value(idx, 'rel', ', '.join(set([s[0] for s in rel])))

    # Save
    path = os.path.join(data_dir, 'candidates%d.tsv' % step)
    wiki_data.to_csv(path, sep='\t', encoding='utf-8')

    # Cleanup
    for f in glob.glob(os.path.join(orig_dir, '*')):
        os.remove(f)
    for f in glob.glob(os.path.join(ner_dir, '*')):
        os.remove(f)

    return doc_id, entities, relations, counter

def loop(step, doc_id, limit, entities, relations, counter):
    """Distant Supervision Loop"""
    # Download wiki articles
    print('[1/4] Downloading wiki articles ...')
    # docs = download_wiki_articles(doc_id, limit)
    docs = os.listdir('./data/orig')
    if docs is None:
        return None

    # Named Entity Recognition
    print('[2/4] Performing named entity recognition ...')
    # exec_ner(docs)
    # wiki_data = read_ner_output(docs)
    path = os.path.join(data_dir, 'candidates%d.tsv' % step)
    if not os.path.isfile(path):
        wiki_data = read_ner_spacy(docs)
        wiki_data.to_csv(path, sep='\t', encoding='utf-8', index=False)
    else:
        wiki_data = pd.read_csv(path, sep='\t', encoding='utf-8')
    doc_id.extend([int(s) for s in wiki_data.doc_id.unique()])

    # Prepare Containers (groupby is much faster than iterating row by row)
    unique_entities = set(wiki_data.groupby(['subj', 'subj_tag']).count().index.tolist())
    unique_entities.update(set(wiki_data.groupby(['obj', 'obj_tag']).count().index.tolist()))
    unique_entity_pairs = set(wiki_data.groupby(['subj', 'obj']).count().index.tolist())
    # for idx, row in wiki_data.iterrows():
    #     unique_entities.add((row['subj'], row['subj_tag']))
    #     unique_entities.add((row['obj'], row['obj_tag']))
    #     unique_entity_pairs.add((row['subj'], row['obj']))

    # Entity Linkage
    print('[3/4] Linking entities ...')
    entities_filename = os.path.join(data_dir, "entities.pickle")
    if os.path.isfile(entities_filename):
        entities = util.load_from_dump(entities_filename)
    else:
        for name, tag in unique_entities:
            if name not in entities and tag in tag_map.keys():
                e = name2qid(name, tag, alias=False)
                if e is None:
                    e = name2qid(name, tag, alias=True)
                entities[name] = e
        util.dump_to_file(entities_filename, entities)

    # Predicate Linkage
    print('[4/4] Linking predicates ...')
    predicates_filename = os.path.join(data_dir, "relations.pickle")
    if os.path.isfile(predicates_filename):
        relations = util.load_from_dump(predicates_filename)
    else:
        for subj, obj in unique_entity_pairs:
            if (subj, obj) not in relations:
                if entities.get(subj) is not None and entities.get(obj) is not None:
                    if (entities[subj][0] != entities[obj][0]) or (subj != obj):
                        arg1 = entities[subj][0]
                        arg2 = entities[obj][0]
                        relations[(subj, obj)] = search_property(arg1, arg2)
                    #TODO: alternative name relation
                    #elif (entities[subj][0] == entities[obj][0]) and (subj != obj):
                    #    relations[(subj, obj)] = 'P'
        util.dump_to_file(predicates_filename, relations)

    # Assign relation
    # i.e. extract the 'class' name for this relationship
    wiki_data['rel'] = pd.Series(index=wiki_data.index, dtype=str)
    # Guard against pairs for which search_property found nothing (None or
    # empty), which would otherwise crash the join.
    rel = list(map(lambda x: ', '.join(set([s[0] for s in x])) if x else '',
                   relations.values()))
    for i, r in enumerate(relations):
        if len(rel[i]) > 0:
            # counter += 1
            idx = (wiki_data['subj'] == r[0]) & (wiki_data['obj'] == r[1])
            wiki_data.loc[idx, 'rel'] = rel[i]

    # Save
    path = os.path.join(data_dir, 'candidates%d.tsv' % step)
    wiki_data.to_csv(path, sep='\t', encoding='utf-8', index=False)

    # Cleanup
    # for f in glob.glob(os.path.join(orig_dir, '*')):
    #     os.remove(f)
    #
    # for f in glob.glob(os.path.join(ner_dir, '*')):
    #     os.remove(f)

    return doc_id, entities, relations, counter

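# Toy illustration of the relation-string construction above. Values are
# lists of tuples whose first element is a property id, mirroring how the
# code uses search_property's result (the data below is made up; None marks
# a pair with no matched property):
toy_relations = {
    ('Paris', 'France'): [('P17', 'country'), ('P17', 'country')],
    ('Paris', 'Texas'): None,
}
toy_rel = [', '.join(set(s[0] for s in x)) if x else ''
           for x in toy_relations.values()]
print(toy_rel)  # ['P17', '']
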
def train(train_data, test_data, FLAGS=tf.app.flags.FLAGS):
    # # train_dir
    # timestamp = str(int(time.time()))
    # out_dir = os.path.abspath(os.path.join(FLAGS.train_dir, timestamp))
    #
    # # save flags
    # if not os.path.exists(out_dir):
    #     os.mkdir(out_dir)
    # FLAGS._parse_flags()
    # config = dict(FLAGS.__flags.items())
    #
    # # Window_size must not be larger than the sent_len
    # if config['sent_len'] < config['max_window']:
    #     config['max_window'] = config['sent_len']
    #
    # util.dump_to_file(os.path.join(out_dir, 'flags.cPickle'), config)

    train_x = get_key_phrases(train_data)
    _, train_y, _ = zip(*train_data)
    test_x = get_key_phrases(test_data)
    _, test_y, _ = zip(*test_data)

    # # assign pretrained embeddings
    # if FLAGS.use_pretrain:
    print "Initialize model with pretrained embeddings..."
    print("Please don't forget to change the vocab size to the corresponding one in the embedding.")
    pretrained_embedding = np.load(os.path.join(FLAGS.data_dir, 'emb.npy'))
    train_x = key_phrase_indices_to_embedding(train_x, pretrained_embedding)
    test_x = key_phrase_indices_to_embedding(test_x, pretrained_embedding)

    # Use SVM. But SVM does not output a probability.
    # train_y = np.argmax(train_y, axis=1)
    # test_y = np.argmax(test_y, axis=1)
    # clf = svm.SVC(class_weight='balanced')
    # clf.fit(train_x, train_y)
    # predicted_test_y = clf.predict(test_x)

    # Use a fully connected multilayer NN.
    # train_dir
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(FLAGS.train_dir, timestamp))

    # save flags
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    FLAGS._parse_flags()
    config = dict(FLAGS.__flags.items())
    util.dump_to_file(os.path.join(out_dir, 'flags.cPickle'), config)

    num_batches_per_epoch = int(np.ceil(float(len(train_data)) / FLAGS.batch_size))
    max_steps = num_batches_per_epoch * FLAGS.num_epochs

    with tf.Graph().as_default():
        with tf.variable_scope('multilayer', reuse=None):
            m = multilayer.Model(config, is_train=True)
        with tf.variable_scope('multilayer', reuse=True):
            mtest = multilayer.Model(config, is_train=False)

        # checkpoint
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
        save_path = os.path.join(out_dir, 'model.ckpt')
        try:
            summary_op = tf.summary.merge_all()
        except:
            summary_op = tf.merge_all_summaries()

        # session (named sess_config so it does not shadow the flags dict above)
        sess_config = tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)
        if FLAGS.gpu_percentage > 0:
            sess_config.gpu_options.per_process_gpu_memory_fraction = FLAGS.gpu_percentage
        else:
            sess_config = tf.ConfigProto(
                log_device_placement=FLAGS.log_device_placement,
                device_count={'GPU': 0})
        sess = tf.Session(config=sess_config)

        with sess.as_default():
            train_summary_writer = tf.train.SummaryWriter(os.path.join(out_dir, "train"), graph=sess.graph)
            dev_summary_writer = tf.train.SummaryWriter(os.path.join(out_dir, "dev"), graph=sess.graph)

            try:
                sess.run(tf.global_variables_initializer())
            except:
                sess.run(tf.initialize_all_variables())

            # # assign pretrained embeddings
            # if FLAGS.use_pretrain:
            #     print "Initialize model with pretrained embeddings..."
            #     print("Please don't forget to change the vocab size to the corresponding one in the embedding.")
            #     pretrained_embedding = np.load(os.path.join(FLAGS.data_dir, 'emb.npy'))
            #     m.assign_embedding(sess, pretrained_embedding)

            # initialize parameters
            current_lr = FLAGS.init_lr
            lowest_loss_value = float("inf")
            decay_step_counter = 0
            global_step = 0

            # evaluate on dev set
            def dev_step(mtest, sess):
                dev_loss = []
                dev_auc = []
                dev_f1_score = []

                # create batch
                test_batches = util.batch_iter(zip(test_x, test_y), batch_size=FLAGS.batch_size,
                                               num_epochs=1, shuffle=False)
                for batch in test_batches:
                    x_batch, y_batch = zip(*batch)
                    # a_batch = np.ones((len(batch), 1), dtype=np.float32) / len(batch)  # average
                    loss_value, eval_value = sess.run(
                        [mtest.total_loss, mtest.eval_op],
                        feed_dict={mtest.inputs: np.array(x_batch),
                                   mtest.labels: np.array(y_batch)})
                    dev_loss.append(loss_value)
                    pre, rec = zip(*eval_value)
                    dev_auc.append(util.calc_auc_pr(pre, rec))
                    dev_f1_score.append((2.0 * pre[5] * rec[5]) / (pre[5] + rec[5]))  # threshold = 0.5

                return (np.mean(dev_loss), np.mean(dev_auc), np.mean(dev_f1_score))

            # train loop
            print "\nStart training (save checkpoints in %s)\n" % out_dir
            train_loss = []
            train_auc = []
            train_f1_score = []
            train_batches = util.batch_iter(zip(train_x, train_y), batch_size=FLAGS.batch_size,
                                            num_epochs=FLAGS.num_epochs)
            for batch in train_batches:
                batch_size = len(batch)
                m.assign_lr(sess, current_lr)
                global_step += 1

                x_batch, y_batch = zip(*batch)
                feed = {m.inputs: np.array(x_batch), m.labels: np.array(y_batch)}
                start_time = time.time()
                _, loss_value, eval_value = sess.run([m.train_op, m.total_loss, m.eval_op],
                                                     feed_dict=feed)
                proc_duration = time.time() - start_time
                train_loss.append(loss_value)
                pre, rec = zip(*eval_value)
                auc = util.calc_auc_pr(pre, rec)
                f1 = (2.0 * pre[5] * rec[5]) / (pre[5] + rec[5])  # threshold = 0.5
                train_auc.append(auc)
                train_f1_score.append(f1)
                assert not np.isnan(loss_value), "Model loss is NaN."

                # print log
                if global_step % FLAGS.log_step == 0:
                    examples_per_sec = batch_size / proc_duration
                    format_str = '%s: step %d/%d, f1 = %.4f, auc = %.4f, loss = %.4f ' + \
                                 '(%.1f examples/sec; %.3f sec/batch), lr: %.6f'
                    print format_str % (datetime.now(), global_step, max_steps, f1, auc,
                                        loss_value, examples_per_sec, proc_duration, current_lr)

                # write summary
                if global_step % FLAGS.summary_step == 0:
                    summary_str = sess.run(summary_op)
                    train_summary_writer.add_summary(summary_str, global_step)
                    dev_summary_writer.add_summary(summary_str, global_step)

                    # summary loss, auc, f1
                    train_summary_writer.add_summary(
                        _summary_for_scalar('loss', np.mean(train_loss)), global_step=global_step)
                    train_summary_writer.add_summary(
                        _summary_for_scalar('auc', np.mean(train_auc)), global_step=global_step)
                    train_summary_writer.add_summary(
                        _summary_for_scalar('f1', np.mean(train_f1_score)), global_step=global_step)

                    dev_loss, dev_auc, dev_f1 = dev_step(mtest, sess)
                    dev_summary_writer.add_summary(
                        _summary_for_scalar('loss', dev_loss), global_step=global_step)
                    dev_summary_writer.add_summary(
                        _summary_for_scalar('auc', dev_auc), global_step=global_step)
                    dev_summary_writer.add_summary(
                        _summary_for_scalar('f1', dev_f1), global_step=global_step)

                    print "\n===== write summary ====="
                    print "%s: step %d/%d: train_loss = %.6f, train_auc = %.4f, train_f1 = %.4f" \
                        % (datetime.now(), global_step, max_steps,
                           np.mean(train_loss), np.mean(train_auc), np.mean(train_f1_score))
                    print "%s: step %d/%d: dev_loss = %.6f, dev_auc = %.4f, dev_f1 = %.4f\n" \
                        % (datetime.now(), global_step, max_steps, dev_loss, dev_auc, dev_f1)

                    # reset containers
                    train_loss = []
                    train_auc = []
                    train_f1_score = []

                # decay learning rate if necessary
                if loss_value < lowest_loss_value:
                    lowest_loss_value = loss_value
                    decay_step_counter = 0
                else:
                    decay_step_counter += 1
                    if decay_step_counter >= FLAGS.tolerance_step:
                        current_lr *= FLAGS.lr_decay
                        print '%s: step %d/%d, Learning rate decays to %.5f' % \
                            (datetime.now(), global_step, max_steps, current_lr)
                        decay_step_counter = 0

                # stop learning if learning rate is too low
                if current_lr < 1e-5:
                    break

                # save checkpoint
                if global_step % FLAGS.checkpoint_step == 0:
                    saver.save(sess, save_path, global_step=global_step)
            saver.save(sess, save_path, global_step=global_step)

            # Lastly evaluate the test set.
            loss_value, predicted_test_y_logits = sess.run(
                [mtest.total_loss, mtest.scores],
                feed_dict={mtest.inputs: np.array(test_x),
                           mtest.labels: np.array(test_y)})
            predicted_test_y = np.argmax(predicted_test_y_logits, axis=1)
            test_y = np.argmax(test_y, axis=1)
            result = (predicted_test_y == test_y)
            accuracy = np.sum(result.astype(np.int32)) / float(result.shape[0])
            print("Overall %f%% answers were correct." % float(accuracy * 100))

            epsilon = 0.00000001
            num_categories = 3
            true_positive_per_category = [
                np.bitwise_and(test_y == category_i, predicted_test_y == category_i)
                for category_i in range(num_categories)]
            false_positive_per_category = [
                np.bitwise_and(test_y != category_i, predicted_test_y == category_i)
                for category_i in range(num_categories)]
            true_negative_per_category = [
                np.bitwise_and(test_y != category_i, predicted_test_y != category_i)
                for category_i in range(num_categories)]
            false_negative_per_category = [
                np.bitwise_and(test_y == category_i, predicted_test_y != category_i)
                for category_i in range(num_categories)]

            precision_per_category = [
                np.sum(true_positive_per_category[category_i].astype(np.int32))
                / float(np.sum(true_positive_per_category[category_i].astype(np.int32))
                        + np.sum(false_positive_per_category[category_i].astype(np.int32))
                        + epsilon)
                for category_i in range(num_categories)]
            recall_per_category = [
                np.sum(true_positive_per_category[category_i].astype(np.int32))
                / float(np.sum(true_positive_per_category[category_i].astype(np.int32))
                        + np.sum(false_negative_per_category[category_i].astype(np.int32))
                        + epsilon)
                for category_i in range(num_categories)]
            f1_per_category = [
                2 / (1 / (precision_per_category[category_i] + epsilon)
                     + 1 / (recall_per_category[category_i] + epsilon))
                for category_i in range(num_categories)]
            for category_i in range(num_categories):
                print("Category %d has f1 score: %f, precision: %f, and recall %f"
                      % (category_i, f1_per_category[category_i],
                         precision_per_category[category_i], recall_per_category[category_i]))

    return test_x, predicted_test_y_logits

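# The per-category precision/recall/F1 above can be cross-checked with
# scikit-learn; a minimal sketch on toy labels (assumes scikit-learn is
# available, which the original code does not state):
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

toy_true = np.array([0, 1, 2, 2, 1, 0])
toy_pred = np.array([0, 2, 2, 2, 1, 1])
precision, recall, f1, support = precision_recall_fscore_support(
    toy_true, toy_pred, labels=range(3))
for category_i in range(3):
    print("Category %d has f1 score: %f, precision: %f, and recall %f"
          % (category_i, f1[category_i], precision[category_i], recall[category_i]))
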
def train():
    # train_dir
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(FLAGS.train_dir, timestamp))

    # save flags
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    FLAGS._parse_flags()
    config = dict(FLAGS.__flags.items())
    util.dump_to_file(os.path.join(out_dir, 'flags.cPickle'), config)

    print "Parameters:"
    for k, v in config.iteritems():
        print '%20s %r' % (k, v)

    # load data
    print "Preparing train data ..."
    train_loader = util.DataLoader(FLAGS.data_dir, 'train.cPickle', batch_size=FLAGS.batch_size)
    print "Preparing test data ..."
    dev_loader = util.DataLoader(FLAGS.data_dir, 'test.cPickle', batch_size=FLAGS.batch_size)
    max_steps = train_loader.num_batch * FLAGS.num_epoch
    config['num_classes'] = train_loader.num_classes
    config['sent_len'] = train_loader.sent_len

    with tf.Graph().as_default():
        with tf.variable_scope('cnn', reuse=None):
            m = cnn.Model(config, is_train=True)
        with tf.variable_scope('cnn', reuse=True):
            mtest = cnn.Model(config, is_train=False)

        # checkpoint
        saver = tf.train.Saver(tf.global_variables())
        save_path = os.path.join(out_dir, 'model.ckpt')
        summary_op = tf.summary.merge_all()

        # session
        sess = tf.Session()

        # summary writer with an embedding projector attached
        proj_config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig()
        embedding = proj_config.embeddings.add()
        embedding.tensor_name = m.W_emb.name
        embedding.metadata_path = os.path.join(FLAGS.data_dir, 'metadata.tsv')
        summary_dir = os.path.join(out_dir, "summaries")
        summary_writer = tf.summary.FileWriter(summary_dir, graph=sess.graph)
        tf.contrib.tensorboard.plugins.projector.visualize_embeddings(summary_writer, proj_config)

        sess.run(tf.global_variables_initializer())

        # assign pretrained embeddings
        if FLAGS.use_pretrain:
            print "Use pretrained embeddings to initialize model ..."
            emb_file = os.path.join(FLAGS.data_dir, 'emb.txt')
            vocab_file = os.path.join(FLAGS.data_dir, 'vocab.txt')
            pretrained_embedding = util.load_embedding(emb_file, vocab_file, FLAGS.vocab_size)
            m.assign_embedding(sess, pretrained_embedding)

        # initialize parameters
        current_lr = FLAGS.init_lr
        lowest_loss_value = float("inf")
        decay_step_counter = 0
        global_step = 0

        # evaluate on dev set
        def dev_step(mtest, sess, data_loader):
            dev_loss = 0.0
            dev_accuracy = 0.0
            for _ in range(data_loader.num_batch):
                x_batch_dev, y_batch_dev = data_loader.next_batch()
                dev_loss_value, dev_true_count = sess.run(
                    [mtest.total_loss, mtest.true_count_op],
                    feed_dict={mtest.inputs: x_batch_dev, mtest.labels: y_batch_dev})
                dev_loss += dev_loss_value
                dev_accuracy += dev_true_count
            dev_loss /= data_loader.num_batch
            dev_accuracy /= float(data_loader.num_batch * FLAGS.batch_size)
            data_loader.reset_pointer()
            return dev_loss, dev_accuracy

        # train loop
        print '\nStart training, %d batches needed, with %d examples per batch.' % \
            (train_loader.num_batch, FLAGS.batch_size)
        for epoch in range(FLAGS.num_epoch):
            train_loss = []
            train_accuracy = []
            train_loader.reset_pointer()
            for _ in range(train_loader.num_batch):
                m.assign_lr(sess, current_lr)
                global_step += 1

                start_time = time.time()
                x_batch, y_batch = train_loader.next_batch()
                feed = {m.inputs: x_batch, m.labels: y_batch}
                run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                _, loss_value, true_count = sess.run(
                    [m.train_op, m.total_loss, m.true_count_op],
                    feed_dict=feed, options=run_options, run_metadata=run_metadata)
                proc_duration = time.time() - start_time
                train_loss.append(loss_value)
                train_accuracy.append(true_count)
                assert not np.isnan(loss_value), "Model loss is NaN."

                # print log
                if global_step % FLAGS.log_step == 0:
                    examples_per_sec = FLAGS.batch_size / proc_duration
                    accuracy = float(true_count) / FLAGS.batch_size
                    format_str = '%s: step %d/%d (epoch %d/%d), acc = %.2f, loss = %.2f ' + \
                                 '(%.1f examples/sec; %.3f sec/batch), lr: %.6f'
                    print format_str % (datetime.now(), global_step, max_steps,
                                        epoch + 1, FLAGS.num_epoch, accuracy, loss_value,
                                        examples_per_sec, proc_duration, current_lr)

                # write summary
                if global_step % FLAGS.summary_step == 0:
                    summary_str = sess.run(summary_op)
                    summary_writer.add_run_metadata(run_metadata, 'step%04d' % global_step)
                    summary_writer.add_summary(summary_str, global_step)

                    # summary loss/accuracy
                    train_loss_mean = sum(train_loss) / float(len(train_loss))
                    train_accuracy_mean = sum(train_accuracy) / float(len(train_accuracy) * FLAGS.batch_size)
                    summary_writer.add_summary(_summary('train/loss', train_loss_mean),
                                               global_step=global_step)
                    summary_writer.add_summary(_summary('train/accuracy', train_accuracy_mean),
                                               global_step=global_step)

                    test_loss, test_accuracy = dev_step(mtest, sess, dev_loader)
                    summary_writer.add_summary(_summary('dev/loss', test_loss),
                                               global_step=global_step)
                    summary_writer.add_summary(_summary('dev/accuracy', test_accuracy),
                                               global_step=global_step)

                    print "\nStep %d: train_loss = %.6f, train_accuracy = %.3f" % \
                        (global_step, train_loss_mean, train_accuracy_mean)
                    print "Step %d: test_loss = %.6f, test_accuracy = %.3f\n" % \
                        (global_step, test_loss, test_accuracy)

                # decay learning rate if necessary
                if loss_value < lowest_loss_value:
                    lowest_loss_value = loss_value
                    decay_step_counter = 0
                else:
                    decay_step_counter += 1
                    if decay_step_counter >= FLAGS.tolerance_step:
                        current_lr *= FLAGS.lr_decay
                        print '%s: step %d/%d (epoch %d/%d), Learning rate decays to %.5f' % \
                            (datetime.now(), global_step, max_steps,
                             epoch + 1, FLAGS.num_epoch, current_lr)
                        decay_step_counter = 0

                # stop learning if learning rate is too low
                if current_lr < 1e-5:
                    break

                # save checkpoint
                if global_step % FLAGS.checkpoint_step == 0:
                    saver.save(sess, save_path, global_step=global_step)
        saver.save(sess, save_path, global_step=global_step)

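# The _summary helper used above is not shown; by analogy with
# _summary_for_scalar in the earlier scripts, a minimal sketch (assumed,
# reconstructed from the call sites) would be:
def _summary(tag, value):
    # Wrap a Python scalar in a tf.Summary proto so the summary_writer can
    # record it alongside graph summaries.
    return tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=float(value))])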