Code Example #1
File: config.py Project: seanie12/QA_SQuAD_2.0
 def __init__(self):
     self.vocab_file = "data/vocab"
     self.train_file = "data/train.txt"
     self.dev_file = "data/dev.txt"
     self.dict_file = "data/dict.p"
     self.max_vocab_size = 5e4
     self.debug = True
     self.num_epochs = 20
     self.batch_size = 16
     self.dropout = 0.1
     self.vocab_size = 5e4
     self.embedding_size = 300
     self.lr = 1e-3
     self.lstm_size = 128
     self.filter_size = 96
     self.attention_size = 128
     self.grad_clip = 5
     self.alpha = 1e-1
     self.beta = 1e-1
     self.l2_lambda = 3e-7
     self.num_heads = 8
     self.ans_limit = 20
     self.embeddings = load_glove("data/glove.npz")
     self.dir_output = "results/save/"
     self.dir_model = self.dir_output + "model.weights/"
     if not os.path.exists(self.dir_output):
         os.makedirs(self.dir_output)
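The example above relies on an os import and a project-level load_glove helper that are not shown. As a rough sketch only (not the project's actual code), a minimal loader for a saved .npz GloVe matrix might look like this, assuming the archive stores the embedding matrix under a single array key:

import numpy as np

def load_glove(path):
    # Minimal sketch, not the project's helper: read the pre-trained GloVe
    # matrix from an .npz archive assumed to hold a single array.
    with np.load(path) as archive:
        key = archive.files[0]  # assumed: one key holding the embedding matrix
        return archive[key].astype(np.float32)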
Code Example #2
 def _build_vars(self):
     with tf.variable_scope(self._name):
         vocab_path = "vocab.txt"
         vocab = Vocab(vocab_path, self._vocab_size)
         if self.word_init:
             word2vec = utils.load_glove(self._embedding_size)
             self.embeddings = utils.create_embedding(
                 self, word2vec, vocab.id_to_word, self._embedding_size)
         else:
             self.embeddings = np.random.uniform(
                 -0.5, 0.5, (len(vocab.id_to_word), self._embedding_size))
         self.embeddings = tf.Variable(self.embeddings.astype(np.float32),
                                       name="Embedding")
Code Example #3
min_score = min(resolved_scores)
max_score = max(resolved_scores)  # assumed default so max_score is defined for essay sets other than 7 and 8
if essay_set_id == 7:
    min_score, max_score = 0, 30
elif essay_set_id == 8:
    min_score, max_score = 0, 60
print 'max_score is {} \t min_score is {}\n'.format(max_score, min_score)
with open(out_dir+'/params', 'a') as f:
    f.write('max_score is {} \t min_score is {} \n'.format(max_score, min_score))

# include max score
score_range = range(min_score, max_score+1)

#word_idx, _ = data_utils.build_vocab(essay_list, vocab_limit)

# load glove
word_idx, word2vec = data_utils.load_glove(num_tokens, embedding_size)
vocab_size = len(word_idx) + 1
# stat info on data set

sent_size_list = map(len, essay_list)
max_sent_size = max(sent_size_list)
mean_sent_size = int(np.mean(sent_size_list))

print 'max sentence size: {} \nmean sentence size: {}\n'.format(max_sent_size, mean_sent_size)
with open(out_dir+'/params', 'a') as f:
    f.write('max sentence size: {} \nmean sentence size: {}\n'.format(max_sent_size, mean_sent_size))

print 'The length of score range is {}'.format(len(score_range))
E = data_utils.vectorize_data(essay_list, word_idx, max_sent_size)

labeled_data = zip(E, resolved_scores, sent_size_list)
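Here data_utils.load_glove(num_tokens, embedding_size) is assumed to return a word-to-index mapping together with a word-to-vector lookup, which is why vocab_size reserves one extra slot (index 0 for padding). A hedged sketch of such a loader over a plain-text GloVe file is shown below; the path argument and the num_tokens truncation are illustrative assumptions, not the project's actual implementation:

import numpy as np

def load_glove(num_tokens, embedding_size, path='glove.6B.300d.txt'):
    # Illustrative sketch only: build {word: index} and {word: vector}
    # from the first num_tokens lines of a whitespace-separated GloVe file.
    word_idx, word2vec = {}, {}
    with open(path) as f:
        for i, line in enumerate(f):
            if i >= num_tokens:
                break
            parts = line.rstrip().split(' ')
            word, vec = parts[0], np.asarray(parts[1:], dtype=np.float32)
            if vec.shape[0] != embedding_size:
                continue  # skip malformed lines
            word_idx[word] = len(word_idx) + 1  # index 0 left for padding
            word2vec[word] = vec
    return word_idx, word2vec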
Code Example #4
    def __init__(self,
                 data_dir,
                 model_dir,
                 task_id,
                 isInteractive=True,
                 OOV=False,
                 memory_size=50,
                 random_state=None,
                 batch_size=32,
                 learning_rate=0.001,
                 epsilon=1e-8,
                 max_grad_norm=40.0,
                 evaluation_interval=10,
                 hops=3,
                 epochs=200,
                 embedding_size=100):
        self.data_dir = data_dir
        self.task_id = task_id
        self.model_dir = model_dir
        # self.isTrain=isTrain
        self.isInteractive = isInteractive
        self.OOV = OOV
        self.memory_size = memory_size
        self.random_state = random_state
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.max_grad_norm = max_grad_norm
        self.evaluation_interval = evaluation_interval
        self.hops = hops
        self.epochs = epochs
        self.embedding_size = embedding_size
        self.vocab = {}
        self.ivocab = {}
        self.word2vec = {}
        self.word2vec_init = True

        if self.word2vec_init:
            # assert config.embed_size == 100
            self.word2vec = load_glove(self.embedding_size)

        process_word(word="<eos>",
                     word2vec=self.word2vec,
                     vocab=self.vocab,
                     ivocab=self.ivocab,
                     word_vector_size=self.embedding_size,
                     to_return="index")

        # Define uncertain or unknown word index and vec for use later for training out-of-context data
        self.uncertain_word_index = process_word(
            word="sdfsssdf",
            word2vec=self.word2vec,
            vocab=self.vocab,
            ivocab=self.ivocab,
            word_vector_size=self.embedding_size,
            to_return="index")

        candidates, self.candid2indx = load_candidates(self.data_dir,
                                                       self.task_id)
        self.n_cand = len(candidates)
        print("Candidate Size", self.n_cand)
        self.indx2candid = dict(
            (self.candid2indx[key], key) for key in self.candid2indx)
        # task data
        self.trainData, self.testData, self.valData = load_dialog_task(
            self.data_dir, self.task_id, self.candid2indx, self.OOV)
        data = self.trainData + self.testData + self.valData

        self.build_vocab(data, candidates)
        self.set_max_sentence_length()
        # self.candidates_vec=vectorize_candidates_sparse(candidates,self.word_idx)
        self.trainS, self.trainQ, self.trainA = vectorize_data_match(
            self.trainData,
            self.word2vec,
            self.max_sentence_size,
            self.batch_size,
            self.n_cand,
            self.memory_size,
            self.vocab,
            self.ivocab,
            self.embedding_size,
            uncertain=self.uncertain_word_index)
        self.valS, self.valQ, self.valA = vectorize_data_match(
            self.valData,
            self.word2vec,
            self.max_sentence_size,
            self.batch_size,
            self.n_cand,
            self.memory_size,
            self.vocab,
            self.ivocab,
            self.embedding_size,
            uncertain_word=True,
            uncertain=self.uncertain_word_index)

        self.candidates_vec = vectorize_candidates(
            candidates, self.word2vec, self.candidate_sentence_size,
            self.vocab, self.ivocab, self.embedding_size)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                           epsilon=self.epsilon)
        self.sess = tf.Session()
        # Set max sentence vector size
        self.build_vocab(data, candidates)

        answer_n_hot = np.zeros((self.vocab_size, len(self.candid2indx)))
        for ans_it in range(len(self.indx2candid)):
            ans = self.indx2candid[ans_it]
            n_hot = np.zeros((self.vocab_size, ))
            for w in tokenize(ans):
                assert w in self.word_idx
                n_hot[self.word_idx[w]] = 1
            answer_n_hot[:, ans_it] = n_hot

        # Need to understand more about sentence size. Model failing because sentence size > candidate_sentence_size? Answers longer than queries?
        self.model = MemN2NDialogHybridMatch(self.batch_size,
                                             self.vocab_size,
                                             self.max_sentence_size,
                                             self.memory_size,
                                             self.embedding_size,
                                             answer_n_hot,
                                             match=FLAGS.match,
                                             session=self.sess,
                                             hops=self.hops,
                                             max_grad_norm=self.max_grad_norm,
                                             optimizer=optimizer,
                                             task_id=self.task_id)
        # self.model = MemN2NDialogHybrid(self.batch_size, self.vocab_size, self.n_cand, self.max_sentence_size, self.embedding_size, self.candidates_vec, session=self.sess,
        #                           hops=self.hops, max_grad_norm=self.max_grad_norm, optimizer=optimizer, task_id=task_id)
        self.saver = tf.train.Saver(max_to_keep=50)

        self.summary_writer = tf.summary.FileWriter(
            self.model.root_dir, self.model.graph_output.graph)

        self.kb = parse_kb(FLAGS.kb_file)
Code Example #5
def train(restore=False):
    config = Config()

    data, vocab = utils.load_sentiment_treebank(SST_DIR, config.fine_grained)
    train_set, dev_set, test_set = data['train'], data['dev'], data['test']

    num_emb = len(vocab)
    num_labels = 5 if config.fine_grained else 3
    for _, dataset in data.items():
        labels = [label for _, label in dataset]
        assert set(labels) <= set(xrange(num_labels)), set(labels)

    config.num_emb = num_emb
    config.output_dim = num_labels

    config.maxseqlen = utils.get_max_len_data(data)
    config.maxnodesize = utils.get_max_node_size(data)

    if config.fine_grained:
        classify_type = "fine_grained"
    else:
        classify_type = "binary"

    random.seed()
    np.random.seed()

    with tf.Graph().as_default():

        #model = seq_att.tf_seqLSTM(config)
        #model = seq_att.tf_seqLSTMAtt(config)

        #model = seq_att.tf_seqbiLSTM(config)
        model = seq_att.tf_seqbiLSTMAtt(config)

        model_name = model.__class__.__name__
        print 'The model is running now:', model_name, classify_type

        ckpt_base = os.path.join(ckpt_dir, model_name, classify_type)
        summary_base = os.path.join(summaries_dir, model_name, classify_type)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as sess:
            tf.summary.FileWriter(summary_base, sess.graph)

            sess.run(init)

            if restore:
                f = os.path.join(ckpt_base, 'lstm_weights')
                saver.restore(sess, f)

                test_score = model.evaluate(test_set, sess, isDev=False)
                print 'test_score:', test_score

                if config.use_attention:
                    visualize_attention(model, test_set, vocab, sess,
                                        ckpt_base)

            else:

                if config.emb_dim == 300:
                    glove_file = os.path.join(GLOVE_DIR, 'glove.840B.300d.txt')
                else:
                    tmp = 'glove.twitter.27B.' + str(config.emb_dim) + 'd.txt'
                    glove_file = os.path.join(GLOVE_DIR, tmp)

                glove_embeddings = utils.load_glove(glove_file, vocab,
                                                    config.emb_dim)
                sess.run(model.embedding_init,
                         feed_dict={model.initial_emb: glove_embeddings})

                avg_loss = model.train(train_set, test_set, sess, saver)
                print 'avg loss', avg_loss
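In this example utils.load_glove(glove_file, vocab, config.emb_dim) is expected to produce a matrix with one row per vocabulary entry, which is then fed to model.embedding_init. A minimal sketch under that assumption (vocab maps words to row indices; words missing from the GloVe file keep a small random vector) is given below; it is not the project's actual implementation:

import numpy as np

def load_glove(glove_file, vocab, emb_dim):
    # Sketch only: return a [len(vocab), emb_dim] float32 matrix whose rows
    # follow the word -> id mapping in vocab.
    embeddings = np.random.uniform(-0.05, 0.05, (len(vocab), emb_dim)).astype(np.float32)
    with open(glove_file) as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word = parts[0]
            if word in vocab and len(parts) - 1 == emb_dim:
                embeddings[vocab[word]] = np.asarray(parts[1:], dtype=np.float32)
    return embeddings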
Code Example #6
# Network Parameters
n_input = FLAGS.embedding_size
#n_steps = ?  # timesteps = number of words in an article; cannot be defined here
n_hidden = FLAGS.num_hidden_nodes  # hidden layer num of features
n_classes = 2  # total classes (binary labels 0 and 1)

#deal with input data
training_path = '200_clean_graded_data.csv'  #put name of training file here
essay_list, label, problem_id, count_one, question_list = data_utils.load_open_response_data(
    training_path)

# majority class

print('majority class accuracy is: \n', count_one / len(label))
# load glove
word_idx, word2vec = data_utils.load_glove(n_input)

vocab_size = len(word_idx) + 1
# stat info on data set

sent_size_list = [len(essay) for essay in essay_list]
max_sent_size = max(sent_size_list)
mean_sent_size = int(np.mean(sent_size_list))

question_sent_size_list = [len(question) for question in question_list]
question_max_sent_size = max(question_sent_size_list)
question_mean_sent_size = int(np.mean(question_sent_size_list))

print('max sentence size: {} \nmean sentence size: {}\n'.format(
    max_sent_size, mean_sent_size))
Code Example #7
    def __init__(self):
        # load word embedding
        glove = data_utils.load_glove(FLAGS.glove_file)
        word2vec = data_utils.load_word2vec(FLAGS.word2vec_file)
        merged_embed, self.vocab_size = self.merge_glove_word2vec(
            glove, word2vec)
        dim = len(merged_embed[0])
        merged_embed.append([0. for _ in xrange(dim)])

        # load doc embedding
        self.doc_embedding, doc_dim = data_utils.load_fastText_embed(\
          FLAGS.fastText_doc_file, FLAGS.fastText_vector_file)
        self.zero_doc_key = self.doc_key([self.vocab_size], [self.vocab_size])
        self.doc_embedding[self.zero_doc_key] = [0. for _ in xrange(doc_dim)]

        FLAGS.fc_units = map(int, FLAGS.fc_units.split(','))

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.session = tf.Session(config=config)
        ''' graph '''
        print 'Initializing model graph...'
        with tf.variable_scope('inputs'):
            self.training = tf.placeholder(tf.bool, name='training')

            self.title = tf.placeholder(
                tf.int32, shape=[None, None],
                name='title')  # [batch size, sequence length]
            self.content = tf.placeholder(tf.int32,
                                          shape=[None, None],
                                          name='content')
            self.title_length = tf.placeholder(tf.int32, shape=[None], \
              name='title_length')
            self.content_length = tf.placeholder(tf.int32, shape=[None],\
              name='content_length')

            self.prices = tf.placeholder(tf.float32, name='prices', \
              shape=[None, None, 7])
            self.price_length = tf.placeholder(tf.int32, shape=[None], \
              name='price_length')

            self.docs = tf.placeholder(tf.float32, name='docs', \
              shape=[None, None, doc_dim])
            self.doc_length = tf.placeholder(tf.int32, shape=[None], \
              name='doc_length')

            self.label = tf.placeholder(tf.int32,
                                        shape=[None, 2],
                                        name='label')

        with tf.variable_scope('birnn_embed'):
            self.word_embedding = tf.Variable(merged_embed,
                                              dtype=tf.float32,
                                              name='word_embedding_matrix')
            title_embed = self.embed_birnn(FLAGS.title_units,
                                           FLAGS.title_layers,
                                           self.title,
                                           self.title_length,
                                           scope='title_embed_birnn')
            content_embed = self.embed_birnn(FLAGS.content_units,
                                             FLAGS.content_layers,
                                             self.content,
                                             self.content_length,
                                             scope='content_embed_birnn')
            price_embed = self.birnn(FLAGS.price_units,
                                     FLAGS.price_layers,
                                     self.prices,
                                     self.price_length,
                                     scope='price_birnn')
            doc_embed = self.birnn(FLAGS.doc_units,
                                   FLAGS.doc_layers,
                                   self.docs,
                                   self.doc_length,
                                   scope='doc_birnn')
            final_embed = tf.concat(
                [title_embed, content_embed, doc_embed, price_embed], 1)

        with tf.variable_scope('full_connect'):
            fc_inputs = final_embed
            for i in range(FLAGS.fc_layers):
                with tf.variable_scope('full_connect_layer_%d' % i):
                    fc_outputs = tf.contrib.layers.legacy_fully_connected(
                        fc_inputs,
                        FLAGS.fc_units[i],
                        activation_fn=tf.nn.relu,
                        weight_regularizer=tf.contrib.layers.l2_regularizer(
                            FLAGS.l2_coef))
                    fc_inputs = fc_outputs

        with tf.variable_scope('dropout'):
            dropout = tf.layers.dropout(fc_outputs, training=self.training)

        with tf.variable_scope('output'):
            W = tf.get_variable('W',
                                shape=[FLAGS.fc_units[-1], 2],
                                initializer=tf.truncated_normal_initializer())
            biases = tf.get_variable(
                'biases',
                shape=[2],
                initializer=tf.random_normal_initializer())
            logits = tf.matmul(dropout, W) + biases
            self.result = tf.nn.softmax(logits)

        with tf.variable_scope('train'):
            self.cross_entropy = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=self.label,
                                                        logits=logits))

            self.learning_rate = tf.Variable(FLAGS.init_lr,
                                             trainable=False,
                                             name="learning_rate")
            self.lr_decay_op = self.learning_rate.assign(self.learning_rate *
                                                         FLAGS.lr_decay)

            self.global_step = tf.Variable(0,
                                           trainable=False,
                                           name='global_step')
            self.train_op = tf.train.AdamOptimizer(FLAGS.init_lr) \
                .minimize(self.cross_entropy, self.global_step)

        with tf.variable_scope('logs'):
            self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)
            self.log_writer = tf.summary.FileWriter(
                os.path.join(FLAGS.train_dir, 'logs/'), self.session.graph)
            self.summary = tf.Summary()