Example #1
def main(argv=None):
    restore_param = util.load_from_dump(
        os.path.join(FLAGS.train_dir, 'flags.cPickle'))
    restore_param['train_dir'] = FLAGS.train_dir

    if restore_param.get('contextwise'):
        source_path = os.path.join(restore_param['data_dir'], "ids")
        target_path = os.path.join(restore_param['data_dir'], "target.txt")
        _, data = util.read_data_contextwise(
            source_path,
            target_path,
            restore_param['sent_len'],
            train_size=restore_param['train_size'])
    else:
        source_path = os.path.join(restore_param['data_dir'], "ids.txt")
        target_path = os.path.join(restore_param['data_dir'], "target.txt")
        _, data = util.read_data(source_path,
                                 target_path,
                                 restore_param['sent_len'],
                                 train_size=restore_param['train_size'])

    pre, rec = evaluate(data, restore_param)
    util.dump_to_file(os.path.join(FLAGS.train_dir, 'results.cPickle'), {
        'precision': pre,
        'recall': rec
    })
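These snippets show only the call sites; the `util` module itself is not part of the listing. Below is a minimal sketch of what `dump_to_file` and `load_from_dump` presumably look like, assuming they are plain pickle wrappers with the `(path, obj)` argument order used in most examples here (Examples #6 and #7 pass the object first, so they evidently come from a different `util` module):

import pickle

def dump_to_file(path, obj):
    # Assumed helper: pickle obj to path; the .cPickle/.pickle filenames
    # throughout the examples suggest a pickle-based format.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_from_dump(path):
    # Assumed counterpart used by main() above to restore the flags dict.
    with open(path, 'rb') as f:
        return pickle.load(f)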
Example #2
def resolve(exit_desc):
    """
    Resolve exit relay-specific domain.
    """

    exit_url = util.exiturl(exit_desc.fingerprint)

    # Prepend the exit relay's fingerprint so we know which relay issued the
    # DNS request.

    fingerprint = exit_desc.fingerprint.encode("ascii", "ignore")
    domain = "%s.%s.%s" % (fingerprint,
                           time.strftime("%Y-%m-%d-%H"),
                           TARGET_DOMAIN)

    log.debug("Resolving %s over %s." % (domain, exit_url))

    sock = torsocks.torsocket()
    sock.settimeout(10)

    # Resolve the domain using Tor's SOCKS extension.

    log.debug("Resolving %s over %s." % (domain, exit_url))
    try:
        ipv4 = sock.resolve(domain)
    except error.SOCKSv5Error as err:

        # This is expected because all domains resolve to 127.0.0.1.

        log.warning("SOCKSv5 error while resolving domain: %s" % err)
        ipv4 = "0.0.0.0"
    except socket.timeout as err:
        log.debug("Socket over exit relay %s timed out: %s" % (exit_url, err))
        return

    log.info("Successfully resolved domain over %s to %s." % (exit_url, ipv4))

    # Log a CSV including timestamp, exit fingerprint, exit IP address, and the
    # domain we resolved.

    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S_%z")
    content = "%s, %s, %s, %s\n" % (timestamp,
                                    fingerprint,
                                    exit_desc.address,
                                    ipv4)
    util.dump_to_file(content, fingerprint)
Example #3
def main(argv=None):
    restore_param = util.load_from_dump(os.path.join(FLAGS.train_dir, 'flags.cPickle'))
    restore_param['train_dir'] = FLAGS.train_dir

    source_path = os.path.join(restore_param['data_dir'], 'test_cs_unlabeled_data_combined.txt')
    target_path = os.path.join(restore_param['data_dir'], 'test_cs_labels_combined.txt')
    vocab_path = os.path.join(restore_param['data_dir'], 'test_cs_vocab_combined')
    _, data = util.read_data(source_path, target_path, restore_param['sent_len'],
                             train_size=restore_param['train_size'], hide_key_phrases=restore_param.get('hide_key_phrases', False))

    pre, rec, x_input, expected_output, actual_output = evaluate(data, restore_param)

    actual_output_exp = np.exp(actual_output)
    actual_output_softmax = actual_output_exp / np.sum(actual_output_exp, axis=1, keepdims=True)

    output_difference = np.sum(np.abs(actual_output_softmax - expected_output), axis=1)

    sentence_indices_input = x_input[:, :-2]
    _, rev_vocab = preprocessing_util.initialize_vocabulary(vocab_path)
    sentence_input = preprocessing_util.indices_to_sentences(sentence_indices_input, rev_vocab)

    kp_indices_input = x_input[:, -2:]

    print('Diff\tType\tSentence\t\tExpected Score (A is-a B, B is-a A, Neither)\tActual Score')
    for sentence_i, sentence in enumerate(sentence_input):
        # Label the key phrases of interest in the current sentence with *.
        sentence[kp_indices_input[sentence_i, 1]] += '*'
        sentence[kp_indices_input[sentence_i, 0]] += '*'
        current_type = 'Neither'
        if expected_output[sentence_i, 0] == 1:
            current_type = 'A is-a B'
        elif expected_output[sentence_i, 1] == 1:
            current_type = 'B is-a A'

        print('%.3f\t%s\t%s\t\t%s\t%s\t'
              % (output_difference[sentence_i], current_type, ' '.join(sentence), str(expected_output[sentence_i]), str(actual_output_softmax[sentence_i])))

    util.dump_to_file(os.path.join(FLAGS.train_dir, 'results.cPickle'), {'precision': pre, 'recall': rec})
Example #4
def train(train_data, test_data):
    # train_dir
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(FLAGS.train_dir, timestamp))

    # save flags
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    FLAGS._parse_flags()
    config = dict(FLAGS.__flags.items())

    # Window_size must not be larger than the sent_len
    if config['sent_len'] < config['max_window']:
        config['max_window'] = config['sent_len']

    # flag to restore the contextwise model
    config['split'] = True

    # save flags
    config['train_dir'] = out_dir
    util.dump_to_file(os.path.join(out_dir, 'flags.cPickle'), config)

    # display parameter settings
    print 'Parameters:'
    for k, v in config.iteritems():
        print '\t' + k + '=' + str(v)

    # max number of steps
    num_batches_per_epoch = int(
        np.ceil(float(len(train_data)) / FLAGS.batch_size))
    max_steps = num_batches_per_epoch * FLAGS.num_epochs

    with tf.Graph().as_default():
        with tf.variable_scope('cnn', reuse=None):
            m = cnn_split.Model(config, is_train=True)
        with tf.variable_scope('cnn', reuse=True):
            mtest = cnn_split.Model(config, is_train=False)

        # checkpoint
        saver = tf.train.Saver(tf.all_variables())
        save_path = os.path.join(out_dir, 'model.ckpt')
        summary_op = tf.merge_all_summaries()

        # session
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        with sess.as_default():
            train_summary_writer = tf.train.SummaryWriter(
                os.path.join(out_dir, "train"), graph=sess.graph)
            dev_summary_writer = tf.train.SummaryWriter(
                os.path.join(out_dir, "dev"), graph=sess.graph)
            sess.run(tf.initialize_all_variables())

            # assign pretrained embeddings
            if FLAGS.use_pretrain:
                print "Initializing model with pretrained embeddings ..."
                pretrained_embedding = np.load(
                    os.path.join(FLAGS.data_dir, 'emb.npy'))
                m.assign_embedding(sess, pretrained_embedding)

            # initialize parameters
            current_lr = FLAGS.init_lr
            lowest_loss_value = float("inf")
            decay_step_counter = 0
            global_step = 0

            # evaluate on dev set
            def dev_step(mtest, sess):
                dev_loss = []
                dev_auc = []
                dev_f1_score = []

                # create batch
                test_batches = util.batch_iter(test_data,
                                               batch_size=FLAGS.batch_size,
                                               num_epochs=1,
                                               shuffle=False)
                for batch in test_batches:
                    left_batch, right_batch, y_batch, n_batch = zip(*batch)
                    feed = {
                        mtest.left: np.array(left_batch),
                        mtest.right: np.array(right_batch),
                        mtest.labels: np.array(y_batch)
                    }
                    if FLAGS.negative:
                        feed[mtest.negative] = np.array(n_batch)
                    loss_value, eval_value = sess.run(
                        [mtest.total_loss, mtest.eval_op], feed_dict=feed)
                    dev_loss.append(loss_value)
                    pre, rec = zip(*eval_value)
                    dev_auc.append(util.calc_auc_pr(pre, rec))
                    dev_f1_score.append((2.0 * pre[5] * rec[5]) /
                                        (pre[5] + rec[5]))  # threshold = 0.5

                return (np.mean(dev_loss), np.mean(dev_auc),
                        np.mean(dev_f1_score))

            # train loop
            print "\nStart training (save checkpoints in %s)\n" % out_dir
            train_loss = []
            train_auc = []
            train_f1_score = []
            train_batches = util.batch_iter(train_data,
                                            batch_size=FLAGS.batch_size,
                                            num_epochs=FLAGS.num_epochs)
            for batch in train_batches:
                batch_size = len(batch)

                m.assign_lr(sess, current_lr)
                global_step += 1

                left_batch, right_batch, y_batch, n_batch = zip(*batch)
                feed = {
                    m.left: np.array(left_batch),
                    m.right: np.array(right_batch),
                    m.labels: np.array(y_batch)
                }
                if FLAGS.negative:
                    feed[m.negative] = np.array(n_batch)
                start_time = time.time()
                _, loss_value, eval_value = sess.run(
                    [m.train_op, m.total_loss, m.eval_op], feed_dict=feed)
                proc_duration = time.time() - start_time
                train_loss.append(loss_value)
                pre, rec = zip(*eval_value)
                auc = util.calc_auc_pr(pre, rec)
                f1 = (2.0 * pre[5] * rec[5]) / (pre[5] + rec[5])  # threshold = 0.5
                train_auc.append(auc)
                train_f1_score.append(f1)

                assert not np.isnan(loss_value), "Model loss is NaN."

                # print log
                if global_step % FLAGS.log_step == 0:
                    examples_per_sec = batch_size / proc_duration
                    format_str = '%s: step %d/%d, f1 = %.4f, auc = %.4f, loss = %.4f ' + \
                                 '(%.1f examples/sec; %.3f sec/batch), lr: %.6f'
                    print format_str % (datetime.now(), global_step, max_steps,
                                        f1, auc, loss_value, examples_per_sec,
                                        proc_duration, current_lr)

                # write summary
                if global_step % FLAGS.summary_step == 0:
                    summary_str = sess.run(summary_op)
                    train_summary_writer.add_summary(summary_str, global_step)
                    dev_summary_writer.add_summary(summary_str, global_step)

                    # summary loss, f1
                    train_summary_writer.add_summary(
                        _summary_for_scalar('loss', np.mean(train_loss)),
                        global_step=global_step)
                    train_summary_writer.add_summary(
                        _summary_for_scalar('auc', np.mean(train_auc)),
                        global_step=global_step)
                    train_summary_writer.add_summary(
                        _summary_for_scalar('f1', np.mean(train_f1_score)),
                        global_step=global_step)

                    dev_loss, dev_auc, dev_f1 = dev_step(mtest, sess)
                    dev_summary_writer.add_summary(
                        _summary_for_scalar('loss', dev_loss),
                        global_step=global_step)
                    dev_summary_writer.add_summary(
                        _summary_for_scalar('auc', dev_auc),
                        global_step=global_step)
                    dev_summary_writer.add_summary(
                        _summary_for_scalar('f1', dev_f1),
                        global_step=global_step)

                    print "\n===== write summary ====="
                    print "%s: step %d/%d: train_loss = %.6f, train_auc = %.4f train_f1 = %.4f" \
                          % (datetime.now(), global_step, max_steps,
                             np.mean(train_loss), np.mean(train_auc), np.mean(train_f1_score))
                    print "%s: step %d/%d:   dev_loss = %.6f,   dev_auc = %.4f   dev_f1 = %.4f\n" \
                          % (datetime.now(), global_step, max_steps, dev_loss, dev_auc, dev_f1)

                    # reset container
                    train_loss = []
                    train_auc = []
                    train_f1_score = []

                # decay learning rate if necessary
                if loss_value < lowest_loss_value:
                    lowest_loss_value = loss_value
                    decay_step_counter = 0
                else:
                    decay_step_counter += 1
                if decay_step_counter >= FLAGS.tolerance_step:
                    current_lr *= FLAGS.lr_decay
                    print '%s: step %d/%d, Learning rate decays to %.5f' % \
                          (datetime.now(), global_step, max_steps, current_lr)
                    decay_step_counter = 0

                # stop learning if learning rate is too low
                if current_lr < 1e-5:
                    break

                # save checkpoint
                if global_step % FLAGS.checkpoint_step == 0:
                    saver.save(sess, save_path, global_step=global_step)
            saver.save(sess, save_path, global_step=global_step)
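`_summary_for_scalar` is called throughout these training loops but never defined in the listing. A minimal sketch, assuming the TF 1.x summary protos the surrounding code already uses (Example #11 relies on an analogous `_summary` helper):

def _summary_for_scalar(tag, value):
    # Assumed helper: wrap a Python float in a tf.Summary proto so it can be
    # written by a summary writer outside the graph.
    return tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=float(value))])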
Example #5
def train(train_data, test_data):
    # train_dir
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(FLAGS.train_dir, timestamp))

    # save flags
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    FLAGS._parse_flags()
    config = dict(FLAGS.__flags.items())

    # Window_size must not be larger than the sent_len
    if config['sent_len'] < config['max_window']:
        config['max_window'] = config['sent_len']

    util.dump_to_file(os.path.join(out_dir, 'flags.cPickle'), config)
    print("Parameters:")
    for k, v in config.items():
        print('%20s %r' % (k, v))

    num_batches_per_epoch = int(
        np.ceil(float(len(train_data)) / FLAGS.batch_size))
    max_steps = num_batches_per_epoch * FLAGS.num_epochs

    with tf.Graph().as_default():
        with tf.variable_scope('cnn', reuse=None):
            m = cnn.Model(config, is_train=True)
        with tf.variable_scope('cnn', reuse=True):
            mtest = cnn.Model(config, is_train=False)

        # checkpoint
        saver = tf.train.Saver(tf.global_variables())
        save_path = os.path.join(out_dir, 'model.ckpt')
        summary_op = tf.summary.merge_all()

        # session
        with tf.Session().as_default() as sess:
            proj_config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig()
            embedding = proj_config.embeddings.add()
            embedding.tensor_name = m.W_emb.name
            embedding.metadata_path = os.path.join(FLAGS.data_dir, 'vocab.txt')

            train_summary_writer = tf.summary.FileWriter(
                os.path.join(out_dir, "train"), graph=sess.graph)
            dev_summary_writer = tf.summary.FileWriter(
                os.path.join(out_dir, "dev"), graph=sess.graph)
            tf.contrib.tensorboard.plugins.projector.visualize_embeddings(
                train_summary_writer, proj_config)
            tf.contrib.tensorboard.plugins.projector.visualize_embeddings(
                dev_summary_writer, proj_config)

            sess.run(tf.global_variables_initializer())

            # assign pretrained embeddings
            if FLAGS.use_pretrain:
                print("Initialize model with pretrained embeddings...")
                pretrained_embedding = np.load(
                    os.path.join(FLAGS.data_dir, 'emb.npy'))
                m.assign_embedding(sess, pretrained_embedding)

            # initialize parameters
            current_lr = FLAGS.init_lr
            lowest_loss_value = float("inf")
            decay_step_counter = 0
            global_step = 0

            # evaluate on dev set
            def dev_step(mtest, sess):
                dev_loss = []
                dev_auc = []
                dev_f1_score = []

                # create batch
                test_batches = util.batch_iter(test_data,
                                               batch_size=FLAGS.batch_size,
                                               num_epochs=1,
                                               shuffle=False)
                for batch in test_batches:
                    x_batch, y_batch, _ = zip(*batch)
                    loss_value, eval_value = sess.run(
                        [mtest.total_loss, mtest.eval_op],
                        feed_dict={
                            mtest.inputs: np.array(x_batch),
                            mtest.labels: np.array(y_batch)
                        })
                    dev_loss.append(loss_value)
                    pre, rec = zip(*eval_value)
                    # look at the 5th index, which corresponds to a threshold = 0.5
                    threshold = 5
                    dev_auc.append(util.calc_auc_pr(pre, rec, threshold))
                    dev_f1_score.append(
                        (2.0 * pre[threshold] * rec[threshold]) /
                        (pre[threshold] + rec[threshold]))

                return np.mean(dev_loss), np.mean(dev_auc), np.mean(dev_f1_score)

            # train loop
            print("\nStart training (save checkpoints in %s)\n" % out_dir)
            train_loss = []
            train_auc = []
            train_f1_score = []
            train_batches = util.batch_iter(train_data,
                                            batch_size=FLAGS.batch_size,
                                            num_epochs=FLAGS.num_epochs)
            for batch in train_batches:
                batch_size = len(batch)

                m.assign_lr(sess, current_lr)
                global_step += 1

                x_batch, y_batch, a_batch = zip(*batch)
                feed = {
                    m.inputs: np.array(x_batch),
                    m.labels: np.array(y_batch)
                }
                if FLAGS.attention:
                    feed[m.attention] = np.array(a_batch)
                start_time = time.time()
                _, loss_value, eval_value = sess.run(
                    [m.train_op, m.total_loss, m.eval_op], feed_dict=feed)
                proc_duration = time.time() - start_time
                train_loss.append(loss_value)
                pre, rec = zip(*eval_value)
                # look at the 5th index, which corresponds to a threshold = 0.5
                threshold = 5
                auc = util.calc_auc_pr(pre, rec, threshold)
                f1 = (2.0 * pre[threshold] * rec[threshold]) / (pre[threshold] + rec[threshold])
                train_auc.append(auc)
                train_f1_score.append(f1)

                assert not np.isnan(loss_value), "Model loss is NaN."

                # print log
                if global_step % FLAGS.log_step == 0:
                    examples_per_sec = batch_size / proc_duration
                    format_str = '%s: step %d/%d, f1 = %.4f, auc = %.4f, loss = %.4f ' + \
                                 '(%.1f examples/sec; %.3f sec/batch), lr: %.6f'
                    print(format_str % (datetime.now(), global_step, max_steps,
                                        f1, auc, loss_value, examples_per_sec,
                                        proc_duration, current_lr))

                # write summary
                if global_step % FLAGS.summary_step == 0:
                    summary_str = sess.run(summary_op)
                    train_summary_writer.add_summary(summary_str, global_step)
                    dev_summary_writer.add_summary(summary_str, global_step)

                    # summary loss, f1
                    train_summary_writer.add_summary(
                        _summary_for_scalar('loss', np.mean(train_loss)),
                        global_step=global_step)
                    train_summary_writer.add_summary(
                        _summary_for_scalar('auc', np.mean(train_auc)),
                        global_step=global_step)
                    train_summary_writer.add_summary(
                        _summary_for_scalar('f1', np.mean(train_f1_score)),
                        global_step=global_step)

                    dev_loss, dev_auc, dev_f1 = dev_step(mtest, sess)
                    dev_summary_writer.add_summary(
                        _summary_for_scalar('loss', dev_loss),
                        global_step=global_step)
                    dev_summary_writer.add_summary(
                        _summary_for_scalar('auc', dev_auc),
                        global_step=global_step)
                    dev_summary_writer.add_summary(
                        _summary_for_scalar('f1', dev_f1),
                        global_step=global_step)

                    print("\n===== write summary =====")
                    print("%s: step %d/%d: train_loss = %.6f, train_auc = %.4f, train_f1 = %.4f" \
                          % (datetime.now(), global_step, max_steps,
                             np.mean(train_loss), np.mean(train_auc), np.mean(train_f1_score)))
                    print("%s: step %d/%d:   dev_loss = %.6f,   dev_auc = %.4f,   dev_f1 = %.4f\n" \
                          % (datetime.now(), global_step, max_steps, dev_loss, dev_auc, dev_f1))

                    # reset container
                    train_loss = []
                    train_auc = []
                    train_f1_score = []

                # decay learning rate if necessary
                if loss_value < lowest_loss_value:
                    lowest_loss_value = loss_value
                    decay_step_counter = 0
                else:
                    decay_step_counter += 1
                if decay_step_counter >= FLAGS.tolerance_step:
                    current_lr *= FLAGS.lr_decay
                    print('%s: step %d/%d, Learning rate decays to %.5f' % \
                          (datetime.now(), global_step, max_steps, current_lr))
                    decay_step_counter = 0

                # stop learning if learning rate is too low
                if current_lr < 1e-5:
                    break

                # save checkpoint
                if global_step % FLAGS.checkpoint_step == 0:
                    saver.save(sess, save_path, global_step=global_step)
            saver.save(sess, save_path, global_step=global_step)
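`util.calc_auc_pr` is also undefined here. Judging from the call sites, `pre` and `rec` are per-threshold precision/recall sequences, so a plausible sketch is the trapezoidal area under the precision-recall points. This is an assumption, not the original implementation, and the extra `threshold` argument seen in Example #5 is accepted but ignored:

import numpy as np

def calc_auc_pr(precision, recall, threshold=None):
    # Assumed helper: approximate the area under the PR curve by sorting the
    # per-threshold (precision, recall) pairs by recall and integrating.
    precision, recall = np.asarray(precision), np.asarray(recall)
    order = np.argsort(recall)
    return np.trapz(precision[order], recall[order])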
Example #6
def __save_values(self):
    values = [self._ball_thresholds, self._contrasts, self._brightnesses, self._gray_thresholds]
    util.dump_to_file(values, self._path_values)
Example #7
def __save_pitch_size(self):
    util.dump_to_file((self._crop_rect, self._coord_rect), self._path_pitch_size)
Example #8
def loop(step, doc_id, limit, entities, relations, counter):
    """Distant Supervision Loop"""
    # Download wiki articles
    print '[1/4] Downloading wiki articles ...'
    docs = download_wiki_articles(doc_id, limit)
    if docs is None:
        return None

    # Named Entity Recognition
    print '[2/4] Performing named entity recognition ...'
    exec_ner(docs)
    wiki_data = read_ner_output(docs)
    path = os.path.join(data_dir, 'candidates%d.tsv' % step)
    wiki_data.to_csv(path, sep='\t', encoding='utf-8')
    doc_id.extend([int(s) for s in wiki_data.doc_id.unique()])

    # Prepare Containers
    unique_entities = set()
    unique_entity_pairs = set()
    for idx, row in wiki_data.iterrows():
        unique_entities.add((row['subj'], row['subj_tag']))
        unique_entities.add((row['obj'], row['obj_tag']))
        unique_entity_pairs.add((row['subj'], row['obj']))

    # Entity Linkage
    print '[3/4] Linking entities ...'
    for name, tag in unique_entities:
        if name not in entities and tag in tag_map:
            e = name2qid(name, tag, alias=False)
            if e is None:
                e = name2qid(name, tag, alias=True)
            entities[name] = e
    util.dump_to_file(os.path.join(data_dir, "entities.cPickle"), entities)

    # Predicate Linkage
    print '[4/4] Linking predicates ...'
    for subj, obj in unique_entity_pairs:
        if (subj, obj) not in relations:
            if entities[subj] is not None and entities[obj] is not None:
                if (entities[subj][0] != entities[obj][0]) or (subj != obj):
                    arg1 = entities[subj][0]
                    arg2 = entities[obj][0]
                    relations[(subj, obj)] = search_property(arg1, arg2)
                    #TODO: alternative name relation
                    #elif (entities[subj][0] == entities[obj][0]) and (subj != obj):
                    #    relations[(subj, obj)] = 'P'
    util.dump_to_file(os.path.join(data_dir, "relations.cPickle"), relations)

    # Assign relation
    wiki_data['rel'] = pd.Series(index=wiki_data.index, dtype=str)
    for idx, row in wiki_data.iterrows():
        entity_pair = (row['subj'], row['obj'])

        if entity_pair in relations:
            rel = relations[entity_pair]
            if rel is not None and len(rel) > 0:
                counter += 1
                wiki_data.set_value(idx, 'rel',
                                    ', '.join(set([s[0] for s in rel])))
    # Save
    path = os.path.join(data_dir, 'candidates%d.tsv' % step)
    wiki_data.to_csv(path, sep='\t', encoding='utf-8')

    # Cleanup
    for f in glob.glob(os.path.join(orig_dir, '*')):
        os.remove(f)
    for f in glob.glob(os.path.join(ner_dir, '*')):
        os.remove(f)

    return doc_id, entities, relations, counter
Example #9
def loop(step, doc_id, limit, entities, relations, counter):
    """Distant Supervision Loop"""
    # Download wiki articles
    print('[1/4] Downloading wiki articles ...')
    # docs = download_wiki_articles(doc_id, limit)
    docs = os.listdir('./data/orig')
    if docs is None:
        return None

    # Named Entity Recognition
    print('[2/4] Performing named entity recognition ...')
    # exec_ner(docs)
    # wiki_data = read_ner_output(docs)
    path = os.path.join(data_dir, 'candidates%d.tsv' % step)
    if not os.path.isfile(path):
        wiki_data = read_ner_spacy(docs)
        wiki_data.to_csv(path, sep='\t', encoding='utf-8', index=False)
    else:
        wiki_data = pd.read_csv(path, sep='\t', encoding='utf-8')

    doc_id.extend([int(s) for s in wiki_data.doc_id.unique()])

    # Prepare Containers
    unique_entities = set(
        wiki_data.groupby(['subj', 'subj_tag']).count().index.tolist())
    unique_entities.update(
        set(wiki_data.groupby(['obj', 'obj_tag']).count().index.tolist()))
    unique_entity_pairs = set(
        wiki_data.groupby(['subj', 'obj']).count().index.tolist())
    # for idx, row in wiki_data.iterrows():
    #     unique_entities.add((row['subj'], row['subj_tag']))
    #     unique_entities.add((row['obj'], row['obj_tag']))
    #     unique_entity_pairs.add((row['subj'], row['obj']))

    # Entity Linkage
    print('[3/4] Linking entities ...')
    entities_filename = os.path.join(data_dir, "entities.pickle")
    if os.path.isfile(entities_filename):
        entities = util.load_from_dump(entities_filename)
    else:
        for name, tag in unique_entities:
            if name not in entities and tag in tag_map:
                e = name2qid(name, tag, alias=False)
                if e is None:
                    e = name2qid(name, tag, alias=True)
                entities[name] = e
        util.dump_to_file(entities_filename, entities)

    # Predicate Linkage
    print('[4/4] Linking predicates ...')
    predicates_filename = os.path.join(data_dir, "relations.pickle")
    if os.path.isfile(predicates_filename):
        relations = util.load_from_dump(predicates_filename)
    else:
        for subj, obj in unique_entity_pairs:
            if (subj, obj) not in relations:
                if entities.get(subj) is not None and entities.get(obj) is not None:
                    if (entities[subj][0] != entities[obj][0]) or (subj != obj):
                        arg1 = entities[subj][0]
                        arg2 = entities[obj][0]
                        relations[(subj, obj)] = search_property(arg1, arg2)
                        #TODO: alternative name relation
                        #elif (entities[subj][0] == entities[obj][0]) and (subj != obj):
                        #    relations[(subj, obj)] = 'P'
        util.dump_to_file(predicates_filename, relations)

    # Assign relation
    # i.e. extract the 'class' name for this relationship
    wiki_data['rel'] = pd.Series(index=wiki_data.index, dtype=str)
    rel = list(
        map(lambda x: ', '.join(set([s[0] for s in x])), relations.values()))
    for i, r in enumerate(relations):
        if len(rel[i]) > 0:
            # counter += 1
            idx = (wiki_data['subj'] == r[0]) & (wiki_data['obj'] == r[1])
            wiki_data.loc[idx, 'rel'] = rel[i]

    # Save
    path = os.path.join(data_dir, 'candidates%d.tsv' % step)
    wiki_data.to_csv(path, sep='\t', encoding='utf-8', index=False)

    # Cleanup
    # for f in glob.glob(os.path.join(orig_dir, '*')):
    #     os.remove(f)
    #
    # for f in glob.glob(os.path.join(ner_dir, '*')):
    #     os.remove(f)

    return doc_id, entities, relations, counter
Example #10
def train(train_data, test_data, FLAGS=tf.app.flags.FLAGS):
    # # train_dir
    # timestamp = str(int(time.time()))
    # out_dir = os.path.abspath(os.path.join(FLAGS.train_dir, timestamp))
    #
    # # save flags
    # if not os.path.exists(out_dir):
    #     os.mkdir(out_dir)
    # FLAGS._parse_flags()
    # config = dict(FLAGS.__flags.items())
    #
    # # Window_size must not be larger than the sent_len
    # if config['sent_len'] < config['max_window']:
    #     config['max_window'] = config['sent_len']
    #
    # util.dump_to_file(os.path.join(out_dir, 'flags.cPickle'), config)


    train_x = get_key_phrases(train_data)
    _, train_y, _ = zip(*train_data)

    test_x = get_key_phrases(test_data)
    _, test_y, _ = zip(*test_data)

    # # assign pretrained embeddings
    # if FLAGS.use_pretrain:
    print "Initialize model with pretrained embeddings..."
    print("Please don't forget to change the vocab size to the corresponding on in the embedding.")
    pretrained_embedding = np.load(os.path.join(FLAGS.data_dir, 'emb.npy'))
    train_x = key_phrase_indices_to_embedding(train_x, pretrained_embedding)
    test_x = key_phrase_indices_to_embedding(test_x, pretrained_embedding)

    # Use SVM. But SVM does not output a probability
    # train_y = np.argmax(train_y, axis=1)
    # test_y = np.argmax(test_y, axis=1)
    # clf = svm.SVC(class_weight='balanced')
    # clf.fit(train_x, train_y)
    # predicted_test_y = clf.predict(test_x)

    # Use fully connected multilayer nn.
    # train_dir
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(FLAGS.train_dir, timestamp))

    # save flags
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    FLAGS._parse_flags()
    config = dict(FLAGS.__flags.items())

    util.dump_to_file(os.path.join(out_dir, 'flags.cPickle'), config)


    num_batches_per_epoch = int(np.ceil(float(len(train_data))/FLAGS.batch_size))
    max_steps = num_batches_per_epoch * FLAGS.num_epochs

    with tf.Graph().as_default():
        with tf.variable_scope('multilayer', reuse=None):
            m = multilayer.Model(config, is_train=True)
        with tf.variable_scope('multilayer', reuse=True):
            mtest = multilayer.Model(config, is_train=False)
        # checkpoint
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
        save_path = os.path.join(out_dir, 'model.ckpt')
        try:
            summary_op = tf.summary.merge_all()
        except AttributeError:
            summary_op = tf.merge_all_summaries()

        # session
        sess_config = tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)
        if FLAGS.gpu_percentage > 0:
            sess_config.gpu_options.per_process_gpu_memory_fraction = FLAGS.gpu_percentage
        else:
            sess_config = tf.ConfigProto(
                log_device_placement=FLAGS.log_device_placement,
                device_count={'GPU': 0}
            )
        sess = tf.Session(config=sess_config)
        with sess.as_default():
            train_summary_writer = tf.train.SummaryWriter(os.path.join(out_dir, "train"), graph=sess.graph)
            dev_summary_writer = tf.train.SummaryWriter(os.path.join(out_dir, "dev"), graph=sess.graph)
            try:
                sess.run(tf.global_variables_initializer())
            except AttributeError:
                sess.run(tf.initialize_all_variables())

            # # assign pretrained embeddings
            # if FLAGS.use_pretrain:
            #     print "Initialize model with pretrained embeddings..."
            #     print("Please don't forget to change the vocab size to the corresponding on in the embedding.")
            #     pretrained_embedding = np.load(os.path.join(FLAGS.data_dir, 'emb.npy'))
            #     m.assign_embedding(sess, pretrained_embedding)

            # initialize parameters
            current_lr = FLAGS.init_lr
            lowest_loss_value = float("inf")
            decay_step_counter = 0
            global_step = 0

            # evaluate on dev set
            def dev_step(mtest, sess):
                dev_loss = []
                dev_auc = []
                dev_f1_score = []

                # create batch
                test_batches = util.batch_iter(zip(test_x, test_y), batch_size=FLAGS.batch_size, num_epochs=1, shuffle=False)
                for batch in test_batches:
                    x_batch, y_batch = zip(*batch)
                    # a_batch = np.ones((len(batch), 1), dtype=np.float32) / len(batch) # average
                    loss_value, eval_value = sess.run([mtest.total_loss, mtest.eval_op],
                                                      feed_dict={mtest.inputs: np.array(x_batch),
                                                                 mtest.labels: np.array(y_batch)})
                    dev_loss.append(loss_value)
                    pre, rec = zip(*eval_value)
                    dev_auc.append(util.calc_auc_pr(pre, rec))
                    dev_f1_score.append((2.0 * pre[5] * rec[5]) / (pre[5] + rec[5]))  # threshold = 0.5

                return (np.mean(dev_loss), np.mean(dev_auc), np.mean(dev_f1_score))

            # train loop
            print "\nStart training (save checkpoints in %s)\n" % out_dir
            train_loss = []
            train_auc = []
            train_f1_score = []
            train_batches = util.batch_iter(zip(train_x, train_y), batch_size=FLAGS.batch_size, num_epochs=FLAGS.num_epochs)
            for batch in train_batches:
                batch_size = len(batch)

                m.assign_lr(sess, current_lr)
                global_step += 1

                x_batch, y_batch = zip(*batch)
                feed = {m.inputs: np.array(x_batch), m.labels: np.array(y_batch)}
                start_time = time.time()
                _, loss_value, eval_value = sess.run([m.train_op, m.total_loss, m.eval_op], feed_dict=feed)
                proc_duration = time.time() - start_time
                train_loss.append(loss_value)
                pre, rec = zip(*eval_value)
                auc = util.calc_auc_pr(pre, rec)
                f1 = (2.0 * pre[5] * rec[5]) / (pre[5] + rec[5])  # threshold = 0.5
                train_auc.append(auc)
                train_f1_score.append(f1)

                assert not np.isnan(loss_value), "Model loss is NaN."

                # print log
                if global_step % FLAGS.log_step == 0:
                    examples_per_sec = batch_size / proc_duration
                    format_str = '%s: step %d/%d, f1 = %.4f, auc = %.4f, loss = %.4f ' + \
                                 '(%.1f examples/sec; %.3f sec/batch), lr: %.6f'
                    print format_str % (datetime.now(), global_step, max_steps, f1, auc, loss_value,
                                        examples_per_sec, proc_duration, current_lr)

                # write summary
                if global_step % FLAGS.summary_step == 0:
                    summary_str = sess.run(summary_op)
                    train_summary_writer.add_summary(summary_str, global_step)
                    dev_summary_writer.add_summary(summary_str, global_step)

                    # summary loss, f1
                    train_summary_writer.add_summary(
                        _summary_for_scalar('loss', np.mean(train_loss)), global_step=global_step)
                    train_summary_writer.add_summary(
                        _summary_for_scalar('auc', np.mean(train_auc)), global_step=global_step)
                    train_summary_writer.add_summary(
                        _summary_for_scalar('f1', np.mean(train_f1_score)), global_step=global_step)

                    dev_loss, dev_auc, dev_f1 = dev_step(mtest, sess)
                    dev_summary_writer.add_summary(
                        _summary_for_scalar('loss', dev_loss), global_step=global_step)
                    dev_summary_writer.add_summary(
                        _summary_for_scalar('auc', dev_auc), global_step=global_step)
                    dev_summary_writer.add_summary(
                        _summary_for_scalar('f1', dev_f1), global_step=global_step)

                    print "\n===== write summary ====="
                    print "%s: step %d/%d: train_loss = %.6f, train_auc = %.4f, train_f1 = %.4f" \
                          % (datetime.now(), global_step, max_steps,
                             np.mean(train_loss), np.mean(train_auc), np.mean(train_f1_score))
                    print "%s: step %d/%d:   dev_loss = %.6f,   dev_auc = %.4f,   dev_f1 = %.4f\n" \
                          % (datetime.now(), global_step, max_steps, dev_loss, dev_auc, dev_f1)

                    # reset container
                    train_loss = []
                    train_auc = []
                    train_f1_score = []

                # decay learning rate if necessary
                if loss_value < lowest_loss_value:
                    lowest_loss_value = loss_value
                    decay_step_counter = 0
                else:
                    decay_step_counter += 1
                if decay_step_counter >= FLAGS.tolerance_step:
                    current_lr *= FLAGS.lr_decay
                    print '%s: step %d/%d, Learning rate decays to %.5f' % \
                          (datetime.now(), global_step, max_steps, current_lr)
                    decay_step_counter = 0

                # stop learning if learning rate is too low
                if current_lr < 1e-5:
                    break

                # save checkpoint
                if global_step % FLAGS.checkpoint_step == 0:
                    saver.save(sess, save_path, global_step=global_step)
            saver.save(sess, save_path, global_step=global_step)

            # Lastly evaluate the test set
            loss_value, predicted_test_y_logits = sess.run([mtest.total_loss, mtest.scores],
                                              feed_dict={mtest.inputs: np.array(test_x),
                                                         mtest.labels: np.array(test_y)})

            predicted_test_y = np.argmax(predicted_test_y_logits, axis=1)
            test_y = np.argmax(test_y, axis=1)

    result = (predicted_test_y == test_y)
    accuracy = np.sum(result.astype(np.int32)) / float(result.shape[0])

    print("Overall %f%% answers were correct. " %(float(accuracy * 100)))

    epsilon = 0.00000001
    num_categories = 3
    true_positive_per_category = [np.bitwise_and(test_y==category_i, predicted_test_y==category_i) for category_i in range(num_categories)]
    false_positive_per_category = [np.bitwise_and(test_y!=category_i, predicted_test_y==category_i) for category_i in range(num_categories)]
    true_negative_per_category = [np.bitwise_and(test_y!=category_i, predicted_test_y!=category_i) for category_i in range(num_categories)]
    false_negative_per_category = [np.bitwise_and(test_y==category_i, predicted_test_y!=category_i) for category_i in range(num_categories)]
    precision_per_category = [np.sum(true_positive_per_category[category_i].astype(np.int32)) /
                              float(np.sum(true_positive_per_category[category_i].astype(np.int32)) +
                               np.sum(false_positive_per_category[category_i].astype(np.int32)) + epsilon)
                              for category_i in range(num_categories)]

    recall_per_category = [np.sum(true_positive_per_category[category_i].astype(np.int32)) /
                              float(np.sum(true_positive_per_category[category_i].astype(np.int32)) +
                               np.sum(false_negative_per_category[category_i].astype(np.int32)) + epsilon)
                              for category_i in range(num_categories)]

    f1_per_category = [2 / (1 / (precision_per_category[category_i] + epsilon) +
                            1 / (recall_per_category[category_i] + epsilon)) for category_i in range(num_categories)]


    for category_i in range(num_categories):
        print("Category %d has f1 score: %f, precision: %f, and recall: %f"
              % (category_i, f1_per_category[category_i], precision_per_category[category_i], recall_per_category[category_i]))

    return test_x, predicted_test_y_logits
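`util.batch_iter` drives all of the training loops above. A rough sketch of its assumed behavior, yielding lists of examples that the loops unpack with `zip(*batch)` and reshuffling at each epoch:

import numpy as np

def batch_iter(data, batch_size, num_epochs, shuffle=True):
    # Assumed helper: iterate over the data num_epochs times in batches of
    # batch_size, optionally reshuffling at the start of each epoch.
    data = list(data)
    num_batches = int(np.ceil(float(len(data)) / batch_size))
    for _ in range(num_epochs):
        order = np.random.permutation(len(data)) if shuffle else np.arange(len(data))
        for i in range(num_batches):
            yield [data[j] for j in order[i * batch_size:(i + 1) * batch_size]]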
Example #11
def train():
    # train_dir
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(FLAGS.train_dir, timestamp))

    # save flags
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    FLAGS._parse_flags()
    config = dict(FLAGS.__flags.items())
    util.dump_to_file(os.path.join(out_dir, 'flags.cPickle'), config)
    print "Parameters:"
    for k, v in config.iteritems():
        print '%20s %r' % (k, v)

    # load data
    print "Preparing train data ..."
    train_loader = util.DataLoader(FLAGS.data_dir,
                                   'train.cPickle',
                                   batch_size=FLAGS.batch_size)
    print "Preparing test data ..."
    dev_loader = util.DataLoader(FLAGS.data_dir,
                                 'test.cPickle',
                                 batch_size=FLAGS.batch_size)
    max_steps = train_loader.num_batch * FLAGS.num_epoch
    config['num_classes'] = train_loader.num_classes
    config['sent_len'] = train_loader.sent_len

    with tf.Graph().as_default():
        with tf.variable_scope('cnn', reuse=None):
            m = cnn.Model(config, is_train=True)
        with tf.variable_scope('cnn', reuse=True):
            mtest = cnn.Model(config, is_train=False)

        # checkpoint
        saver = tf.train.Saver(tf.global_variables())
        save_path = os.path.join(out_dir, 'model.ckpt')
        summary_op = tf.summary.merge_all()

        # session
        sess = tf.Session()

        # summary writer
        proj_config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig()
        embedding = proj_config.embeddings.add()
        embedding.tensor_name = m.W_emb.name
        embedding.metadata_path = os.path.join(FLAGS.data_dir, 'metadata.tsv')
        summary_dir = os.path.join(out_dir, "summaries")
        summary_writer = tf.summary.FileWriter(summary_dir, graph=sess.graph)
        tf.contrib.tensorboard.plugins.projector.visualize_embeddings(
            summary_writer, proj_config)

        sess.run(tf.global_variables_initializer())

        # assign pretrained embeddings
        if FLAGS.use_pretrain:
            print "Use pretrained embeddings to initialize model ..."
            emb_file = os.path.join(FLAGS.data_dir, 'emb.txt')
            vocab_file = os.path.join(FLAGS.data_dir, 'vocab.txt')
            pretrained_embedding = util.load_embedding(emb_file, vocab_file,
                                                       FLAGS.vocab_size)
            m.assign_embedding(sess, pretrained_embedding)

        # initialize parameters
        current_lr = FLAGS.init_lr
        lowest_loss_value = float("inf")
        decay_step_counter = 0
        global_step = 0

        # evaluate on dev set
        def dev_step(mtest, sess, data_loader):
            dev_loss = 0.0
            dev_accuracy = 0.0
            for _ in range(data_loader.num_batch):
                x_batch_dev, y_batch_dev = data_loader.next_batch()
                dev_loss_value, dev_true_count = sess.run(
                    [mtest.total_loss, mtest.true_count_op],
                    feed_dict={
                        mtest.inputs: x_batch_dev,
                        mtest.labels: y_batch_dev
                    })
                dev_loss += dev_loss_value
                dev_accuracy += dev_true_count
            dev_loss /= data_loader.num_batch
            dev_accuracy /= float(data_loader.num_batch * FLAGS.batch_size)
            data_loader.reset_pointer()
            return dev_loss, dev_accuracy

        # train loop
        print '\nStart training, %d batches needed, with %d examples per batch.' % (
            train_loader.num_batch, FLAGS.batch_size)
        for epoch in range(FLAGS.num_epoch):
            train_loss = []
            train_accuracy = []
            train_loader.reset_pointer()
            for _ in range(train_loader.num_batch):
                m.assign_lr(sess, current_lr)
                global_step += 1

                start_time = time.time()
                x_batch, y_batch = train_loader.next_batch()

                feed = {m.inputs: x_batch, m.labels: y_batch}
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                _, loss_value, true_count = sess.run(
                    [m.train_op, m.total_loss, m.true_count_op],
                    feed_dict=feed,
                    options=run_options,
                    run_metadata=run_metadata)
                proc_duration = time.time() - start_time
                train_loss.append(loss_value)
                train_accuracy.append(true_count)

                assert not np.isnan(loss_value), "Model loss is NaN."

                if global_step % FLAGS.log_step == 0:
                    examples_per_sec = FLAGS.batch_size / proc_duration
                    accuracy = float(true_count) / FLAGS.batch_size
                    format_str = '%s: step %d/%d (epoch %d/%d), acc = %.2f, loss = %.2f ' + \
                                 '(%.1f examples/sec; %.3f sec/batch), lr: %.6f'
                    print format_str % (datetime.now(), global_step, max_steps,
                                        epoch + 1, FLAGS.num_epoch, accuracy,
                                        loss_value, examples_per_sec,
                                        proc_duration, current_lr)

                # write summary
                if global_step % FLAGS.summary_step == 0:
                    summary_str = sess.run(summary_op)
                    summary_writer.add_run_metadata(run_metadata,
                                                    'step%04d' % global_step)
                    summary_writer.add_summary(summary_str, global_step)

                    # summary loss/accuracy
                    train_loss_mean = sum(train_loss) / float(len(train_loss))
                    train_accuracy_mean = sum(train_accuracy) / float(
                        len(train_accuracy) * FLAGS.batch_size)
                    summary_writer.add_summary(
                        _summary('train/loss', train_loss_mean),
                        global_step=global_step)
                    summary_writer.add_summary(
                        _summary('train/accuracy', train_accuracy_mean),
                        global_step=global_step)

                    test_loss, test_accuracy = dev_step(mtest, sess, dev_loader)
                    summary_writer.add_summary(
                        _summary('dev/loss', test_loss),
                        global_step=global_step)
                    summary_writer.add_summary(
                        _summary('dev/accuracy', test_accuracy),
                        global_step=global_step)

                    print "\nStep %d: train_loss = %.6f, train_accuracy = %.3f" % (
                        global_step, train_loss_mean, train_accuracy_mean)
                    print "Step %d:  test_loss = %.6f,  test_accuracy = %.3f\n" % (
                        global_step, test_loss, test_accuracy)

                # decay learning rate if necessary
                if loss_value < lowest_loss_value:
                    lowest_loss_value = loss_value
                    decay_step_counter = 0
                else:
                    decay_step_counter += 1
                if decay_step_counter >= FLAGS.tolerance_step:
                    current_lr *= FLAGS.lr_decay
                    print '%s: step %d/%d (epoch %d/%d), Learning rate decays to %.5f' % \
                          (datetime.now(), global_step, max_steps, epoch+1, FLAGS.num_epoch, current_lr)
                    decay_step_counter = 0

                # stop learning if learning rate is too low
                if current_lr < 1e-5:
                    break

                # save checkpoint
                if global_step % FLAGS.checkpoint_step == 0:
                    saver.save(sess, save_path, global_step=global_step)
        saver.save(sess, save_path, global_step=global_step)