Example 1
def run():
    if len(sys.argv) < 3:
        print("** Usage: python3 " + sys.argv[0] +
              " <<Model Directory>> <<Test Set>>")
        sys.exit(1)

    np.random.seed(42)
    model_dir = sys.argv[1]
    config = Config.load(
        ['./default.conf',
         os.path.join(model_dir, 'model.conf')])
    model = create_model(config)
    test_data = load_data(sys.argv[2], config.dictionary, config.grammar,
                          config.max_length)
    print("unknown", unknown_tokens)

    with tf.Graph().as_default():
        tf.set_random_seed(1234)
        with tf.device('/cpu:0'):
            model.build()

            test_eval = Seq2SeqEvaluator(model,
                                         config.grammar,
                                         test_data,
                                         'test',
                                         config.reverse_dictionary,
                                         beam_size=config.beam_size,
                                         batch_size=config.batch_size)
            loader = tf.train.Saver()

            with tf.Session() as sess:
                loader.restore(sess, os.path.join(model_dir, 'best'))
                test_eval.eval(sess, save_to_file=True)
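Across these examples, Config.load accepts either a single path or a list of paths, with files later in the list overriding values from earlier ones (model.conf over default.conf). A minimal sketch of that contract, assuming an INI-style format read with configparser; the real class in this codebase may differ:

import configparser

class Config:
    def __init__(self):
        self._parser = configparser.ConfigParser()

    @classmethod
    def load(cls, paths):
        # Accept one path or a list; later files override keys set by
        # earlier ones, and missing files are skipped silently.
        config = cls()
        config._parser.read([paths] if isinstance(paths, str) else paths)
        return config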
Example 2
def config(self):
    if not hasattr(self, '_config'):
        path = config_path()
        if os.path.exists(path):
            self._config = Config.load(path)
        else:
            self._config = Config()
            self._config.path = path
    return self._config
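Example 2 is the standard lazy-initialization pattern: the configuration is read from disk at most once, on first access, and then cached on the instance. A self-contained sketch of the same caching behavior (the dict is a stand-in for the real Config object):

class App:
    @property
    def config(self):
        # The body runs only on the first access; afterwards the
        # cached attribute is returned directly.
        if not hasattr(self, '_config'):
            self._config = {'loaded': True}  # stand-in for Config.load(...)
        return self._config

app = App()
assert app.config is app.config  # same cached object on every access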
Example 3
def run():
    if len(sys.argv) < 4:
        print("** Usage: python3 " + sys.argv[0] +
              " <<Model Directory>> <<Everything Set>> <<Test Set>>")
        sys.exit(1)

    np.random.seed(42)
    model_dir = sys.argv[1]
    config = Config.load(
        ['./default.conf',
         os.path.join(model_dir, 'model.conf')])
    model = create_model(config)

    everything_labels, everything_label_lengths = load_programs(
        config, sys.argv[2])
    test_labels, test_label_lengths = load_programs(config, sys.argv[3])
    #test_labels, test_label_lengths = sample(config.grammar, test_labels, test_label_lengths)
    print("unknown", unknown_tokens)

    with tf.Graph().as_default():
        tf.set_random_seed(1234)
        model.build()
        loader = tf.train.Saver()

        train_bag_of_tokens = bag_of_tokens(config, everything_labels,
                                            everything_label_lengths)
        V, mean = pca_fit(train_bag_of_tokens, n_components=2)

        eval_bag_of_tokens = bag_of_tokens(config, test_labels,
                                           test_label_lengths)
        transformed = pca_transform(eval_bag_of_tokens, V, mean)

        with tf.Session() as sess:
            loader.restore(sess, os.path.join(model_dir, 'best'))
            transformed = transformed.eval(session=sess)

        programs = reconstruct_programs(test_labels, test_label_lengths,
                                        config.grammar.tokens)
        show_pca(transformed, programs)
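pca_fit and pca_transform are not shown here; since transformed still needs .eval(session=sess), they evidently build TensorFlow ops rather than computing in NumPy. A plausible reconstruction under that assumption (TF 1.x; compare the inlined version in Example 6 below), not necessarily the repository's exact code:

import tensorflow as tf

def pca_fit(X, n_components=2):
    # Center the data, then keep the top right singular vectors.
    # tf.svd returns v with the components as *columns* (unlike
    # np.linalg.svd, whose vh has them as rows).
    mean = tf.reduce_mean(X, axis=0)
    _, _, v = tf.svd(X - mean)
    return v[:, :n_components], mean

def pca_transform(X, V, mean):
    # Project centered data onto the principal directions.
    return tf.matmul(X - mean, V)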
Example 4
def load_language(app, tokenizer_service, tag, model_dir):
    config = Config.load([
        './default.conf', './default.' + tag + '.conf',
        os.path.join(model_dir, 'model.conf')
    ])
    model = create_model(config)

    graph = tf.Graph()
    session = tf.Session(graph=graph)
    with graph.as_default():
        # Force everything to run on the CPU; we run on single inputs,
        # so there is not much point in going through the GPU.
        with tf.device('/cpu:0'):
            model.build()
            loader = tf.train.Saver()

        with session.as_default():
            loader.restore(session, os.path.join(model_dir, 'best'))
    tokenizer = Tokenizer(tokenizer_service, tag)
    app.add_language(tag,
                     LanguageContext(tag, tokenizer, session, config, model))
    print('Loaded language ' + tag)
Example 5
def run():
    if len(sys.argv) < 4:
        print("** Usage: python3 " + sys.argv[0] +
              " <<Model Directory>> <<Test Set>> <<Predictions File>>")
        sys.exit(1)

    np.random.seed(42)
    model_dir = sys.argv[1]
    config = Config.load(
        ['./default.conf',
         os.path.join(model_dir, 'model.conf')])
    model = create_model(config)

    test_data = load_data(sys.argv[2], config.dictionary, config.grammar,
                          config.max_length)

    with tf.Graph().as_default():
        tf.set_random_seed(1234)
        model.build()
        loader = tf.train.Saver()

        inputs, input_lengths, parses, labels, label_lengths = test_data

        final_encoder_state = tf.concat(
            nest.flatten(model.final_encoder_state), axis=1)
        final_encoder_size = final_encoder_state.get_shape()[1]

        final_states = OrderedDict()
        with tf.Session() as sess:
            loader.restore(sess, os.path.join(model_dir, 'best'))

            # capture all the final encoder states
            for input_batch, input_length_batch, parse_batch, label_batch, label_length_batch in get_minibatches(
                [inputs, input_lengths, parses, labels, label_lengths],
                    config.batch_size):
                feed_dict = model.create_feed_dict(input_batch,
                                                   input_length_batch,
                                                   parse_batch)
                state_array = sess.run(final_encoder_state,
                                       feed_dict=feed_dict)
                # print(state_array.shape)

                for state, input, input_length, label, length in zip(
                        state_array, input_batch, input_length_batch,
                        label_batch, label_length_batch):
                    label = label[:length]
                    program = ' '.join(config.grammar.tokens[x]
                                       for x in label)
                    # optionally filter: if is_function(config.grammar.tokens[x])
                    if program not in final_states:
                        final_states[program] = [(state, input[:input_length])]
                    else:
                        final_states[program].append(
                            (state, input[:input_length]))

        prog_array = [prog for prog in final_states]
        # optionally filter: if len(final_states[prog]) > 1
        prog_index = dict()
        num_programs = len(prog_array)
        print('num programs', num_programs)
        centers = np.zeros((num_programs, final_encoder_size),
                           dtype=np.float32)
        for i, program in enumerate(prog_array):
            prog_index[program] = i
            centers[i] = np.mean([x[0] for x in final_states[program]], axis=0)

        eval_data = []
        with open(sys.argv[3]) as fp:  # TSV rows: sentence, gold, predicted, ...
            for line in fp:
                sentence, gold, predicted, _ = line.strip().split('\t')
                if gold == predicted:
                    continue
                gold += ' <<EOS>>'
                predicted += ' <<EOS>>'
                if gold in prog_index and predicted in prog_index:
                    sentence_vector, sentence_length = vectorize(
                        sentence, config.dictionary, config.max_length)
                    gold_index = prog_index[gold]
                    gold_center = centers[gold_index]
                    predicted_index = prog_index[predicted]
                    predicted_center = centers[predicted_index]
                    eval_data.append(
                        (gold, predicted, gold_center, predicted_center,
                         sentence_vector, sentence_length))
                    #print(np.linalg.norm(gold_center-predicted_center), gold, predicted, sentence, sep='\t')
                elif gold not in prog_index:
                    #print('no gold', gold, file=sys.stderr)
                    pass
                elif predicted not in prog_index:
                    #print('no predicted', file=sys.stderr)
                    pass

        with tf.Session() as sess:
            loader.restore(sess, os.path.join(model_dir, 'best'))

            def flip(list_of_tuples):
                # Transpose a list of tuples into a list of lists
                # (one list per tuple position).
                inner_length = len(list_of_tuples[0])
                tuple_of_lists = [[x[i] for x in list_of_tuples]
                                  for i in range(inner_length)]
                return tuple_of_lists

            with open('./eval.tsv', 'w') as out:
                for gold_batch, predicted_batch, gold_center_batch, predicted_center_batch, input_batch, input_length_batch in get_minibatches(
                        flip(eval_data), config.batch_size):
                    parse_batch = np.zeros(
                        (len(input_batch), 2 * config.max_length - 1),
                        dtype=np.bool)
                    feed_dict = model.create_feed_dict(input_batch,
                                                       input_length_batch,
                                                       parse_batch)
                    state_array = sess.run(final_encoder_state,
                                           feed_dict=feed_dict)

                    assert len(state_array) == len(gold_batch)
                    for state, input, input_length, gold, predicted, gold_center, predicted_center in zip(
                            state_array, input_batch, input_length_batch,
                            gold_batch, predicted_batch, gold_center_batch,
                            predicted_center_batch):
                        gold_predicted_dist = np.linalg.norm(
                            gold_center - predicted_center)
                        sentence_gold_dist = np.linalg.norm(state - gold_center)
                        sentence_predicted_dist = np.linalg.norm(
                            state - predicted_center)
                        sentence = ' '.join(config.reverse_dictionary[x]
                                            for x in input[:input_length])
                        print(gold_predicted_dist,
                              sentence_gold_dist,
                              sentence_predicted_dist,
                              gold,
                              predicted,
                              sentence,
                              sep='\t',
                              file=out)
        print('wrote eval.tsv')

        num_good_sentences = np.zeros((num_programs, ), dtype=np.int32)
        sum_good_distance = np.zeros((num_programs, ), dtype=np.float32)
        num_bad_sentences = np.zeros((num_programs, ), dtype=np.int32)
        sum_bad_distance = np.zeros((num_programs, ), dtype=np.float32)
        for i, program in enumerate(prog_array):
            num_good_sentences[i] = len(final_states[program])

            for encoding, sentence in final_states[program]:
                dist = np.linalg.norm(encoding - centers[i])
                sum_good_distance[i] += dist

            # negative examples
            for negative in np.random.choice(prog_array,
                                             size=(10, ),
                                             replace=False):
                if negative == program:
                    continue
                num_bad_sentences[i] += len(final_states[negative])
                for negative_enc, negative_sentence in final_states[negative]:
                    dist = np.linalg.norm(negative_enc - centers[i])
                    sum_bad_distance[i] += dist

        avg_good_distance = sum_good_distance / num_good_sentences
        avg_bad_distance = sum_bad_distance / num_bad_sentences

        with open('./encoded.csv', 'w') as fp:
            writer = csv.writer(fp)
            writer.writerows(
                zip(num_good_sentences, num_bad_sentences, avg_good_distance,
                    avg_bad_distance, sum_good_distance, sum_bad_distance))
Example 6
def run():
    if len(sys.argv) < 4:
        print("** Usage: python3 " + sys.argv[0] +
              " <<Model Directory>> <<Train Set>> <<Test Set>>")
        sys.exit(1)

    np.random.seed(42)
    model_dir = sys.argv[1]
    config = Config.load(
        ['./default.conf',
         os.path.join(model_dir, 'model.conf')])
    model = create_model(config)
    train_data = load_data(sys.argv[2], config.dictionary, config.grammar,
                           config.max_length)
    pca_data = load_data(sys.argv[3], config.dictionary, config.grammar,
                         config.max_length)
    print("unknown", unknown_tokens)

    with tf.Graph().as_default():
        model.build()
        loader = tf.train.Saver()

        with tf.Session() as sess:
            loader.restore(sess, os.path.join(model_dir, 'best'))

            inputs, input_lengths, parses, _, _ = train_data

            final_encoder_state = tf.concat(
                nest.flatten(model.final_encoder_state), axis=1)
            final_encoder_size = final_encoder_state.get_shape()[1]

            final_states_arrays = []
            # capture all the final encoder states
            for input_batch, input_length_batch, parse_batch in get_minibatches(
                [inputs, input_lengths, parses], config.batch_size):
                feed_dict = model.create_feed_dict(input_batch,
                                                   input_length_batch,
                                                   parse_batch)
                state_array = sess.run(final_encoder_state,
                                       feed_dict=feed_dict)
                # print(state_array.shape)
                final_states_arrays.append(state_array)

            X = np.concatenate(final_states_arrays, axis=0)
            assert X.shape == (len(inputs), final_encoder_size)
            X = tf.constant(X)

            mean = tf.reduce_mean(X, axis=0)
            centered_X = X - mean
            S, U, V = tf.svd(centered_X)

            # take only the top 2 components; tf.svd returns the right
            # singular vectors as the *columns* of v, so slice columns
            # and transpose to get the components as rows
            V = tf.transpose(V[:, :2])
            V_array, mean_array = sess.run([V, mean])

            inputs, input_lengths, parses, labels, label_lengths = pca_data

            X = final_encoder_state
            centered_X = X - tf.constant(mean_array)
            transformed_X = tf.matmul(centered_X, tf.constant(V_array.T))

            feed_dict = model.create_feed_dict(inputs, input_lengths, parses)
            X_pca = sess.run(transformed_X, feed_dict=feed_dict)

            # Toggle: label each point with the input sentence instead of
            # the reconstructed program.
            if False:
                sentences = reconstruct_sentences(inputs, input_lengths,
                                                  config.reverse_dictionary)
            else:
                sentences = reconstruct_sentences(labels, label_lengths,
                                                  config.grammar.tokens)
            show_pca(X_pca, sentences)
Example 7
def run():
    if len(sys.argv) < 3:
        print("** Usage: python3 " + sys.argv[0] +
              " <<Model Directory>> <<Train Set>> [<<Dev Set>>]")
        sys.exit(1)

    np.random.seed(42)

    model_dir = sys.argv[1]
    model_conf = os.path.join(model_dir, 'model.conf')
    config = Config.load(['./default.conf', model_conf])
    model = create_model(config)
    train_data = load_data(sys.argv[2], config.dictionary, config.grammar,
                           config.max_length)
    if len(sys.argv) > 3:
        dev_data = load_data(sys.argv[3], config.dictionary, config.grammar,
                             config.max_length)
    else:
        dev_data = None
    print("unknown", unknown_tokens)
    try:
        os.mkdir(model_dir)
    except OSError:
        pass
    if not os.path.exists(model_conf):
        config.save(model_conf)

    with tf.Graph().as_default():
        tf.set_random_seed(1234)
        model.build()
        init = tf.global_variables_initializer()

        saver = tf.train.Saver(max_to_keep=config.n_epochs)

        train_eval = Seq2SeqEvaluator(model,
                                      config.grammar,
                                      train_data,
                                      'train',
                                      config.reverse_dictionary,
                                      beam_size=config.beam_size,
                                      batch_size=config.batch_size)
        dev_eval = Seq2SeqEvaluator(model,
                                    config.grammar,
                                    dev_data,
                                    'dev',
                                    config.reverse_dictionary,
                                    beam_size=config.beam_size,
                                    batch_size=config.batch_size)
        trainer = Trainer(model,
                          train_data,
                          train_eval,
                          dev_eval,
                          saver,
                          model_dir=model_dir,
                          max_length=config.max_length,
                          batch_size=config.batch_size,
                          n_epochs=config.n_epochs,
                          dropout=config.dropout)

        with tf.Session() as sess:
            # Run the Op to initialize the variables.
            sess.run(init)
            #sess = tf_debug.LocalCLIDebugWrapperSession(sess)
            #sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)

            # Fit the model
            best_dev, best_train = trainer.fit(sess)

            print("best train", best_train)
            print("best dev", best_dev)
Example 8
def app_config():
    schema = Config()
    # Use a context manager so the file handle is closed promptly.
    with open('league_config.json') as fp:
        config_data = json.load(fp)
    return schema.load(config_data)
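Note that the Config here differs from the file-based one in the earlier examples: it appears to act as a deserialization schema in the marshmallow style, where load validates a plain dict rather than reading .conf files. A hypothetical call site under that assumption:

settings = app_config()  # parsed and validated contents of league_config.json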