Example #1
    torch.manual_seed(123)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    '''
    1. Data preparation
    '''
    data_dir = os.path.join(os.path.dirname(__file__), 'data')

    en_train_path = os.path.join(data_dir, 'train.en')
    en_val_path = os.path.join(data_dir, 'dev.en')
    en_test_path = os.path.join(data_dir, 'test.en')

    ja_train_path = os.path.join(data_dir, 'train.ja')
    ja_val_path = os.path.join(data_dir, 'dev.ja')
    ja_test_path = os.path.join(data_dir, 'test.ja')

    en_vocab = Vocab()
    ja_vocab = Vocab()

    en_vocab.fit(en_train_path)
    ja_vocab.fit(ja_train_path)
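    # Vocabularies are fit on the training split only; dev and test are encoded with the same mappings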

    x_train = en_vocab.transform(en_train_path)
    x_val = en_vocab.transform(en_val_path)
    x_test = en_vocab.transform(en_test_path)

    t_train = ja_vocab.transform(ja_train_path, eos=True)
    t_val = ja_vocab.transform(ja_val_path, eos=True)
    t_test = ja_vocab.transform(ja_test_path, eos=True)
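    # eos=True appends an end-of-sequence marker to each target (Japanese) sequence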

    def sort(x, t):
        lens = [len(i) for i in x]
Example #2
def main(args):

    print(args)

    ts = datetime.datetime.now().timestamp()

    logger = SummaryWriter(
        os.path.join('exp/qgen/', '{}_{}'.format(args.exp_name, ts)))
    logger.add_text('exp_name', args.exp_name)
    logger.add_text('args', str(args))

    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    vocab = Vocab(os.path.join(args.data_dir, 'vocab.csv'), args.min_occ)
    category_vocab = CategoryVocab(
        os.path.join(args.data_dir, 'categories.csv'))
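    # Word vocabulary from vocab.csv (filtered by the args.min_occ cutoff) and a category vocabulary from categories.csv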

    load_vgg_features, load_resnet_features = False, False
    if args.visual_representation == 'vgg':
        load_vgg_features = True
    elif args.visual_representation == 'resnet-mlb':
        load_resnet_features = True

    data_loader = OrderedDict()
    splits = ['train', 'valid']

    for split in splits:
        file = os.path.join(args.data_dir, 'guesswhat.' + split + '.jsonl.gz')
        data_loader[split] = DataLoader(
            dataset=QuestionerDataset(
                split,
                file,
                vocab,
                category_vocab,
                True,
                load_vgg_features=load_vgg_features,
                load_resnet_features=load_resnet_features),
            batch_size=args.batch_size,
            shuffle=split == 'train',
            #collate_fn=QuestionerDataset.get_collate_fn(device),
            collate_fn=QuestionerDataset.collate_fn)

    model = QGen(len(vocab),
                 args.word_embedding_dim,
                 args.num_visual_features,
                 args.visual_embedding_dim,
                 args.hidden_size,
                 visual_representation=args.visual_representation,
                 query_tokens=vocab.answer_tokens).to(device)
    print(model)

    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=0)
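    # ignore_index=0 excludes targets with id 0 (presumably the padding token) from the loss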
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    forward_kwargs_mapping = {
        'dialogue': 'source_dialogue',
        'dialogue_lengths': 'dialogue_lengths'
    }
    if load_vgg_features:
        forward_kwargs_mapping['visual_features'] = 'vgg_features'
    if load_resnet_features:
        forward_kwargs_mapping['visual_features'] = 'resnet_features'

    target_kwarg = 'target_dialogue'

    best_val_loss = 1e9
    for epoch in range(args.epochs):
        train_loss, _ = eval_epoch(model, data_loader['train'],
                                   forward_kwargs_mapping, target_kwarg,
                                   loss_fn, optimizer)

        valid_loss, _ = eval_epoch(model, data_loader['valid'],
                                   forward_kwargs_mapping, target_kwarg,
                                   loss_fn)

        if valid_loss < best_val_loss:
            best_val_loss = valid_loss
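            # Checkpoint whenever the validation loss improves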
            model.save(
                os.path.join('bin', 'qgen_{}_{}.pt'.format(args.exp_name, ts)))

        logger.add_scalar('train_loss', train_loss, epoch)
        logger.add_scalar('valid_loss', valid_loss, epoch)

        print(("Epoch {:2d}/{:2d} Train Loss {:07.4f} Vaild Loss {:07.4f}"
               ).format(epoch, args.epochs, train_loss, valid_loss))
Example #3
def preprocess(field='body'):
    load_boilerpipe()
    binary = args.binary_html
    data_dir = args.data_dir
    max_vocab_size = args.max_vocab_size
    docs_dir = os.path.join(data_dir, 'docs')
    query_filepath = os.path.join(data_dir, 'query')
    train_filepath = os.path.join(data_dir, 'train.pointwise')
    test_filepath = os.path.join(data_dir, 'test.pointwise')
    vocab = Vocab(max_size=max_vocab_size)
    train_query_ids, train_doc_ids = get_query_doc_ids(train_filepath)
    test_query_ids, test_doc_ids = get_query_doc_ids(test_filepath)
    query_ids = train_query_ids | test_query_ids
    doc_ids = train_doc_ids | test_doc_ids
    print('total query: {}, total doc: {}'.format(len(query_ids),
                                                  len(doc_ids)))
    query_dict = load_from_query_file(query_filepath)
    doc_dict = {}
    for qid in sorted(train_query_ids):
        for term in query_dict[qid].split():
            vocab.add(term)
    count = 0
    for docid in sorted(train_doc_ids):
        count += 1
        if count % 10000 == 0:
            print('processed {} docs'.format(count))
        loaded_html = load_from_html_cascade(
            os.path.join(docs_dir, docid + '.html'),
            binary=binary, field=[field])
        doc_dict[docid] = loaded_html[field]
        #print(docid)
        #print(' '.join(doc_dict[docid]))
        #input()
        for term in doc_dict[docid]:
            vocab.add(term)
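    # Finalize the vocabulary and write it to disk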
    vocab.build()
    vocab.save_to_file(os.path.join(data_dir, 'vocab'))
    empty_qid, empty_docid = set(), set()
    with open(os.path.join(data_dir, 'query.prep'), 'w') as fp:
        for qid in sorted(query_ids):
            qt = query_dict[qid].split()
            if len(qt) == 0:
                empty_qid.add(qid)
                continue
            fp.write('{}\t{}\n'.format(
                qid, ' '.join(map(str, vocab.encode(qt)))))
    with open(os.path.join(data_dir, 'docs.prep'), 'w') as fp:
        for docid in sorted(doc_ids):
            if docid in doc_dict:
                doc_text = doc_dict[docid]
            else:
                doc_text = load_from_html_cascade(
                    os.path.join(docs_dir, docid + '.html'),
                    binary=binary, field=[field])[field]
            if len(doc_text) == 0:
                empty_docid.add(docid)
                continue
            fp.write('{}\t{}\n'.format(
                docid, ' '.join(map(str, vocab.encode(doc_text)))))
    print('have {} empty query, have {} empty doc'.format(
        len(empty_qid), len(empty_docid)))
    filter_samples(train_filepath,
                   '{}.prep.{}'.format(*train_filepath.rsplit('.', 1)),
                   empty_qid, empty_docid)
    filter_samples(test_filepath,
                   '{}.prep.{}'.format(*test_filepath.rsplit('.', 1)),
                   empty_qid, empty_docid)
Example #4
parser.add_argument("--inter_alpha",
                    type=float,
                    default=0.1,
                    help="adjust the penalty on intermediate labels")
parser.add_argument("--corpus", type=str, default='raw', help="acd|raw")
parser.add_argument("--mode", type=str, default='lstm', help="rnn|lstm")
params, _ = parser.parse_known_args()

if __name__ == '__main__':
    data = params.corpus
    print(data)
    assert data in ('acd_trees_128d', 'acd_trees_512d', 'acd_trees_512d_rand', 'raw')
    train_data, dev_data, test_data = tr.simplified_data(0, 0, 0, data)
    print(len(train_data), len(dev_data), len(test_data))
    print(train_data[0])
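    # Build the vocabulary from all words appearing in the training trees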
    vocab = Vocab()
    train_sents = [t.get_words() for t in train_data]
    vocab.construct(list(itertools.chain.from_iterable(train_sents)))
    if params.mode == 'lstm':
        model = RNN_LSTM_Model(vocab, embed_size=embed_size).cuda()
    else:
        model = RNN_Model(vocab, embed_size=embed_size).cuda()

    loss_history = []
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=lr,
                                momentum=0.9,
                                dampening=0.0)

    for epoch in range(max_epochs):
        print("epoch = ", epoch)
Example #5
    max_conv_len = args.max_conversation_length
    max_vocab_size = args.max_vocab_size
    min_freq = args.min_vocab_frequency

    print("Loading conversations...")
    train = load_conversations(datasets_dir.joinpath("train.txt"))
    valid = load_conversations(datasets_dir.joinpath("dev.txt"))
    test = load_conversations(datasets_dir.joinpath("test.txt"))

    print("#train=%d, #val=%d, #test=%d" % (len(train), len(valid), len(test)))

    def to_pickle(obj, path):
        with open(path, 'wb') as f:
            pickle.dump(obj, f)

    vocab = Vocab(lang="zh")
    for split_type, conversations in [('train', train), ('valid', valid),
                                      ('test', test)]:
        print(f'Processing {split_type} dataset...')
        split_data_dir = datasets_dir.joinpath(split_type)
        split_data_dir.mkdir(exist_ok=True)
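        # Conversation lengths are capped at max_conv_len; sentences are padded by pad_sentences below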
        conversation_length = [
            min(len(conv), max_conv_len) for conv in conversations
        ]

        sentences, sentence_length = pad_sentences(
            conversations,
            max_sentence_length=max_sent_len,
            max_conversation_length=max_conv_len)

        print('Saving preprocessed data at', split_data_dir)
Example #6
def test(args):
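    # Load three vocabularies: surface words, two-character word suffixes, and the POS tag set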
    vocab = Vocab()
    vocab.load(args.vocab, args.lowercase)
    vocab.add_special_token()

    sufvocab = Vocab()
    sufvocab.load(args.sufvocab, args.lowercase)
    sufvocab.add_special_token(['s>', '<UNK>'])

    pos2id = Vocab()
    pos2id.load(args.poslist)

    if args.gpu > -1:
        cuda.get_device(args.gpu).use()
        xp = cuda.cupy
    else:
        xp = np

    model = WordnnTagger.load(args.model)

    out_path = making_data(args.test_path, model.window)

    if args.gpu > -1:
        model.to_gpu()
    model.make_oov_vector(args.gpu > -1)

    # start evaluation
    n_data = 0
    n_correct = 0
    sum_loss = xp.zeros((), dtype=xp.float32)
    start = time.time()
    for tags, contexts in line_iter(out_path, args.minibatch, False):
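        # Assemble the tag, capitalization, word-id and suffix-id arrays for this minibatch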
        batch_ts = xp.array([pos2id[tag] for tag in tags], dtype=xp.int32)
        batch_caps = xp.array([[get_capf(word) for word in context]
                               for context in contexts],
                              dtype=xp.int32)
        if args.lowercase:
            contexts = [[word.lower() for word in context]
                        for context in contexts]
        batch_xs = xp.array(
            [[vocab[word] for word in vocab.check_words(context)]
             for context in contexts],
            dtype=xp.int32)
        # maybe inefficient...
        batch_sufs = [[word[-2:] for word in context] for context in contexts]
        batch_sufs = xp.array(
            [[sufvocab[suf] for suf in sufvocab.check_words(sufs)]
             for sufs in batch_sufs],
            dtype=xp.int32)
        batch_features = [batch_xs, batch_sufs, batch_caps]
        cur_batch_size = batch_ts.shape[0]
        ys, loss = model(batch_features, batch_ts)
        sum_loss += loss.data * cur_batch_size
        pred_labels = ys.data.argmax(1)
        n_correct += sum(1 for j in range(cur_batch_size)
                         if pred_labels[j] == batch_ts[j])
        n_data += cur_batch_size
    end = time.time()
    accuracy = float(n_correct / n_data)
    print('test loss : {}'.format(sum_loss))
    print('test accuracy : {}'.format(accuracy))
    print('(time to run : {})'.format(end - start))
Example #7
def main(argv=()):
    del argv  # Unused.

    vocab = Vocab()

    shp_p = tf.placeholder(tf.int32, shape=(2, ))
    sen_batch_p = tf.placeholder(tf.int32, shape=(FLAGS.batch_size, None))
    mask_batch_p = tf.placeholder(tf.int32, shape=(FLAGS.batch_size, None))
    labels_batch_p = tf.placeholder(tf.int32, shape=(FLAGS.batch_size, ))

    max_sampling = (FLAGS.sampling_mode == 'max')
    decoded_samples = model_sample(sen_batch_p,
                                   mask_batch_p,
                                   shp_p,
                                   labels_batch_p,
                                   max_sampling=max_sampling)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        coord = tf.train.Coordinator()

        saver.restore(sess, FLAGS.restore_ckpt_path)

        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        for label in range(FLAGS.Nlabels):
            if FLAGS.flip_label:
                flip_label = 1 - label
            else:
                flip_label = label
            input_file = FLAGS.input_file.split(',')[label]
            input_sents = open(input_file, 'r').readlines()

            input_sents = [sent.strip() for sent in input_sents]

            samples = []
            for it in range(int(len(input_sents) / FLAGS.batch_size) + 1):

                labels_batch = np.array([0] * FLAGS.batch_size)
                sents = input_sents[it * FLAGS.batch_size:(it + 1) *
                                    FLAGS.batch_size]
                num_sents = len(sents)
                while len(sents) < FLAGS.batch_size:
                    sents.extend(sents[:FLAGS.batch_size - len(sents)])
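                # construct_batch converts the (padded-out) sentence list into id and mask matrices plus their shape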
                sen_batch, mask_batch, shp = vocab.construct_batch(sents)

                out = sess.run(decoded_samples,
                               feed_dict={
                                   sen_batch_p: sen_batch,
                                   mask_batch_p: mask_batch,
                                   shp_p: shp,
                                   labels_batch_p: labels_batch
                               })

                for k in range(FLAGS.batch_size):
                    if k >= num_sents:
                        break
                    samples.append(vocab.convert_to_str(out[flip_label][k]))

            fname = '{}/{}_sample_{}.txt'.format(
                FLAGS.samples_dir, FLAGS.mdl_name, flip_label)
            with open(fname, 'w') as results_file:
                results_file.write('\n'.join(samples))

        coord.request_stop()
        coord.join(threads)
Example #8
        sentences, sentence_length = pad_sentences(
            conv_sentences,
            max_sentence_length=max_sent_len,
            max_conversation_length=max_conv_len)

        for sentence_len, label in zip(conversation_length, conv_labels):
            assert sentence_len == len(label)

        print('Saving preprocessed data at', split_data_dir)
        to_pickle(conversation_length, split_data_dir.joinpath(
            'conversation_length.pkl'))
        to_pickle(sentences, split_data_dir.joinpath('sentences.pkl'))
        to_pickle(conv_labels, split_data_dir.joinpath('labels.pkl'))
        to_pickle(sentence_length, split_data_dir.joinpath(
            'sentence_length.pkl'))
        to_pickle(iemocap.vids[split_type], split_data_dir.joinpath('video_id.pkl'))

        if split_type == 'train':

            print('Save Vocabulary...')
            vocab = Vocab(tokenizer)
            vocab.add_dataframe(conv_sentences)

            assert GLOVE_DIR != ""
            vocab.update(GLOVE_DIR, max_size=max_vocab_size, min_freq=min_freq)

            print('Vocabulary size: ', len(vocab))
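            # Persist word2id, id2word and the word embedding matrix for later runs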
            vocab.pickle(iemocap_dir.joinpath('word2id.pkl'),
                         iemocap_dir.joinpath('id2word.pkl'),
                         iemocap_dir.joinpath('word_emb.pkl'))
Example #9
args = parse_args()

logging.basicConfig(level=args.log_level)

train_data = "snli_1.0/snli_1.0_train.jsonl"
dev_data = "snli_1.0/snli_1.0_dev.jsonl"
test_data = "snli_1.0/snli_1.0_test.jsonl"

if args.no_cache or not os.path.exists("cache"):
    logging.info("Cache not found, reprocessing data.")
    train_sentence1, train_sentence2, train_labels = read(train_data, 550152)
    dev_sentence1, dev_sentence2, dev_labels = read(dev_data, 10000)
    test_sentence1, test_sentence2, test_labels = read(test_data, 10000)
    raw_data = chain(*train_sentence1, *train_sentence2)
    vocab = Vocab(raw_data)
    if not os.path.exists("cache"):
        os.makedirs("cache")
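    # Cache the vocabulary and tokenized splits so later runs can load them instead of reprocessing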
    pickle.dump(vocab, open("cache/vocab.p", "wb"))
    pickle.dump([train_sentence1, train_sentence2, train_labels],
                open("cache/train.p", "wb"))
    pickle.dump([test_sentence1, test_sentence2, test_labels],
                open("cache/test.p", "wb"))
    pickle.dump([dev_sentence1, dev_sentence2, dev_labels],
                open("cache/dev.p", "wb"))
else:
    logging.info("Loading data from the cache.")
    vocab = pickle.load(open("cache/vocab.p", "rb"))
    train_sentence1, train_sentence2, train_labels = pickle.load(
        open("cache/train.p", "rb"))
    dev_sentence1, dev_sentence2, dev_labels = pickle.load(