Example #1
def _process_vocab(args, questions) -> Dict:
    """If input_vocab_json is provided, then use (or expand) it, o.w. build vocab from train files"""
    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        logger.info('Building vocab')
        answer_token_to_idx = None  # stays None when the questions carry no answers
        if 'answer' in questions[0]:
            answer_token_to_idx = preprocess_utils.build_vocab(
                (q['answer'] for q in questions))
        question_token_to_idx = preprocess_utils.build_vocab(
            (q['question'] for q in questions),
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','],
            punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q: continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = preprocess_utils.build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        logger.info('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json) as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    logger.info('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            logger.info('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        utils.mkdirs(os.path.dirname(args.output_vocab_json))
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    return vocab
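None of these examples show build_vocab itself. Under the assumption that it maps tokens to contiguous indices, reserves a few special tokens, and applies a frequency threshold plus punctuation handling, a minimal sketch compatible with the calls above could look like the following (build_vocab_sketch and SPECIAL_TOKENS are hypothetical names, not the actual preprocess_utils API):
from collections import Counter
from typing import Dict, Iterable, List, Optional

SPECIAL_TOKENS = ['<NULL>', '<START>', '<END>', '<UNK>']

def build_vocab_sketch(sequences: Iterable[str],
                       min_token_count: int = 1,
                       punct_to_keep: Optional[List[str]] = None,
                       punct_to_remove: Optional[List[str]] = None) -> Dict[str, int]:
    counts = Counter()
    for seq in sequences:
        if punct_to_keep:
            for p in punct_to_keep:
                seq = seq.replace(p, ' %s ' % p)  # keep punctuation as its own token
        if punct_to_remove:
            for p in punct_to_remove:
                seq = seq.replace(p, '')
        counts.update(seq.split())
    token_to_idx = {tok: i for i, tok in enumerate(SPECIAL_TOKENS)}
    for token in sorted(counts):
        if counts[token] >= min_token_count:
            token_to_idx[token] = len(token_to_idx)
    return token_to_idx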
Example #2
def test():
    _, _, _, sentence_size, vocab_size = build_corpus()
    v2i, _ = build_vocab()
    _, i2l = build_label()
    origin_questions = ['今天 天气 不错', '介绍 贵金属 产品']
    questions = [q.split() for q in origin_questions]
    questions = [[v2i[vocab] for vocab in ques if vocab in v2i]
                 for ques in questions]

    config = tf.ConfigProto()
    with tf.Session(config=config) as sess:
        model = Model(sentence_size, vocab_size, FLAGS.embed_size,
                      FLAGS.class_num, FLAGS.learning_rate, FLAGS.decay_step,
                      FLAGS.decay_rate, FLAGS.layer_size,
                      FLAGS.multi_channel_size)

        saver = tf.train.Saver()
        saver.restore(sess, tf.train.latest_checkpoint(FLAGS.check_point))

        questions = pad_sequences(questions, maxlen=sentence_size, value=0)
        feed_dict = {
            model.encoder_input: questions,
            model.batch_size: FLAGS.batch_size
        }

        p = sess.run([model.predict], feed_dict=feed_dict)
        p = p[0].tolist()
    for index in range(len(questions)):
        print(f'{origin_questions[index]} is_business: {i2l[p[index]]}')
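Here build_vocab() and build_label() are assumed to return vocab-to-index (v2i) and index-to-label (i2l) mappings. A minimal sketch under that assumption; the function names and file paths are hypothetical:
def build_vocab_from_file_sketch(vocab_file='vocab.txt'):
    # one token per line -> (token -> index, index -> token)
    with open(vocab_file, encoding='utf-8') as f:
        tokens = [line.strip() for line in f if line.strip()]
    v2i = {tok: i for i, tok in enumerate(tokens)}
    i2v = {i: tok for tok, i in v2i.items()}
    return v2i, i2v

def build_label_from_file_sketch(label_file='labels.txt'):
    # one label per line -> (label -> index, index -> label)
    with open(label_file, encoding='utf-8') as f:
        labels = [line.strip() for line in f if line.strip()]
    l2i = {lab: i for i, lab in enumerate(labels)}
    i2l = {i: lab for lab, i in l2i.items()}
    return l2i, i2l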
Example #3
def process_dataset():
    train_dataset = dp.load_dataset(args.data,
                                    is_english=True,
                                    has_label=True,
                                    use_target='word',
                                    use_first_target=False)
    # Y = [ins['label'] for ins in train_dataset]
    # class_distri = class_weight.compute_class_weight('balanced', np.unique(Y), Y)
    # print('Set class weights: {}'.format(class_distri))
    val_dataset = None
    if args.dev:
        val_dataset = dp.load_dataset(args.dev,
                                      is_english=True,
                                      has_label=True,
                                      use_target='word',
                                      use_first_target=False)
        full_dataset = train_dataset + val_dataset
    else:
        full_dataset = train_dataset
    args.text_seq_len = max([len(ins['text_words']) for ins in full_dataset])
    args.tar_seq_len = max([len(ins['tar_words']) for ins in full_dataset])
    print('text seq len: ', args.text_seq_len)
    print('tar seq len: ', args.tar_seq_len)
    args.vocab_size, token2id, args.embeddings, args.emb_dim = dp.build_vocab(
        args.vocab,
        data=[ins['text_words'] for ins in full_dataset],
        embedding=args.emb,
        tf_limit=args.tf_limit)
    # list of tuple (encoded_text, encoded_target, encoded_idx, encoded_label)
    train_encoded = dp.build_dataset(train_dataset, token2id, args.cat2id,
                                     args.text_seq_len, args.tar_seq_len)
    np.random.shuffle(train_encoded)
    if args.dev:
        val_encoded = dp.build_dataset(val_dataset, token2id, args.cat2id,
                                       args.text_seq_len, args.tar_seq_len)
    else:
        train_encoded, val_encoded = train_test_split(train_encoded,
                                                      test_size=0.2,
                                                      random_state=1314)
    if pad_dataset:
        train_encoded = dp.pad_dataset(train_encoded, args.batch_size)
    train_text, train_target, train_tar_idx, train_label = map(
        np.array, zip(*train_encoded))
    val_text, val_target, val_tar_idx, val_label = map(
        np.array, zip(*val_encoded))

    # load test dataset
    test_dataset = dp.load_dataset(args.test,
                                   is_english=True,
                                   has_label=True,
                                   use_target='word',
                                   use_first_target=False)
    test_encoded = dp.build_dataset(test_dataset, token2id, args.cat2id,
                                    args.text_seq_len, args.tar_seq_len)
    test_text, test_target, test_tar_idx, test_label = map(
        np.array, zip(*test_encoded))

    return args, (train_text, train_target, train_tar_idx, train_label), (val_text, val_target, val_tar_idx, val_label), \
           (test_text, test_target, test_tar_idx, test_label)
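dp.pad_dataset is used so that the training data splits evenly into batches, but its implementation isn't shown. One plausible behaviour, sketched here as an assumption rather than the actual dp code, is to top the list up with repeated samples:
def pad_dataset_sketch(encoded, batch_size):
    # repeat leading samples until len(encoded) is a multiple of batch_size
    remainder = len(encoded) % batch_size
    if remainder == 0:
        return encoded
    return encoded + encoded[:batch_size - remainder]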
Example #4
def process_dataset():
    train_dataset = dp.load_dataset(args.data,
                                    is_english=True,
                                    has_label=True,
                                    use_target='word',
                                    use_first_target=False)
    val_dataset = None
    if args.dev:
        val_dataset = dp.load_dataset(args.dev,
                                      is_english=True,
                                      has_label=True,
                                      use_target='word',
                                      use_first_target=False)
        full_dataset = train_dataset + val_dataset
    else:
        full_dataset = train_dataset
    args.text_seq_len = max([len(ins['text_words']) for ins in full_dataset])
    args.tar_seq_len = max([len(ins['tar_words']) for ins in full_dataset])
    print('text seq len: ', args.text_seq_len)
    print('tar seq len: ', args.tar_seq_len)
    args.vocab_size, token2id, args.embeddings, args.emb_dim = dp.build_vocab(
        args.vocab,
        data=[ins['text_words'] for ins in full_dataset],
        embedding=args.emb,
        tf_limit=args.tf_limit)

    # list of tuple (encoded_text, encoded_target, encoded_idx, encoded_label)
    train_encoded = dp.build_dataset(train_dataset, token2id, args.cat2id,
                                     args.text_seq_len, args.tar_seq_len)
    np.random.shuffle(train_encoded)
    if args.dev:
        val_encoded = dp.build_dataset(val_dataset, token2id, args.cat2id,
                                       args.text_seq_len, args.tar_seq_len)
    else:
        train_encoded, val_encoded = train_test_split(train_encoded,
                                                      test_size=0.2,
                                                      random_state=1234)
    if pad_dataset:
        train_encoded = dp.pad_dataset(train_encoded, args.batch_size)
    train_text, train_target, train_tar_idx, train_label = map(
        np.array, zip(*train_encoded))
    val_text, val_target, val_tar_idx, val_label = map(
        np.array, zip(*val_encoded))

    return args, (train_text, train_target, train_tar_idx,
                  train_label), (val_text, val_target, val_tar_idx, val_label)
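The splits returned by process_dataset are plain NumPy arrays; an illustrative way for a caller to iterate them in mini-batches (not part of the original code):
def iterate_batches(text, target, tar_idx, label, batch_size=32):
    # yield aligned mini-batches over the arrays returned by process_dataset()
    for start in range(0, len(text), batch_size):
        end = start + batch_size
        yield text[start:end], target[start:end], tar_idx[start:end], label[start:end]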
Example #5
def load_vocabulary():
    if os.path.exists(conf['vocab_path']) and not conf['first_run?']:

        # Trick for allow_pickle issue in np.load
        np_load_old = np.load
        # modify the default parameters of np.load
        np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)

        # need to use .item() to access a class object
        vocab = np.load(conf['vocab_path']).item()
        # restore np.load for future normal usage
        np.load = np_load_old
        print('Loaded vocabulary.')
    else:
        # build a single vocab for both the languages
        print('Building vocabulary...')
        vocab = preprocess.build_vocab(conf)
        print('Built vocabulary.')
    return vocab
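The monkey-patch above exists because newer NumPy releases default np.load to allow_pickle=False; passing the flag directly is equivalent and simpler:
import numpy as np

def load_vocab_directly(vocab_path):
    # same effect as the workaround above, without patching np.load
    return np.load(vocab_path, allow_pickle=True).item()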
Example #6
def process_dataset_cv():
    train_dataset = dp.load_dataset(args.data, is_english=True, has_label=True, use_target='word', use_first_target=False)
    if args.test:
        test_dataset = dp.load_dataset(args.test, is_english=True, has_label=True, use_target='word', use_first_target=False) 
        full_dataset = train_dataset + test_dataset
    else:
        full_dataset = train_dataset
    args.text_seq_len = max([len(ins['text_words']) for ins in full_dataset])
    args.tar_seq_len = max([len(ins['tar_words']) for ins in full_dataset])
    print('text seq len: ', args.text_seq_len)
    print('tar seq len: ', args.tar_seq_len)
    args.vocab_size, token2id, args.embeddings, args.emb_dim = dp.build_vocab(args.vocab, data=[ins['text_words'] for ins in full_dataset],
                                                                              embedding=args.emb, tf_limit=args.tf_limit)
    # list of tuple (encoded_text, encoded_target, encoded_idx, encoded_label)
    data_encoded = dp.build_dataset(full_dataset, token2id, args.cat2id, args.text_seq_len, args.tar_seq_len)
    text, target, tar_idx, label = map(np.array, zip(*data_encoded))
    Y = [ins['label'] for ins in full_dataset]
    fold = list(StratifiedKFold(n_splits=args.kfold, shuffle=True, random_state=1234).split(text, Y))
    return args, fold, (text, target, tar_idx), label
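fold is a list of (train_idx, test_idx) index arrays produced by StratifiedKFold; a typical consumer loops over it like this (illustrative only, not part of the original code):
def run_cv_sketch(fold, text, label):
    # illustrative k-fold loop over the index splits returned by process_dataset_cv()
    for k, (train_idx, test_idx) in enumerate(fold):
        x_train, y_train = text[train_idx], label[train_idx]
        x_test, y_test = text[test_idx], label[test_idx]
        print('fold %d: %d train / %d test' % (k, len(train_idx), len(test_idx)))
        # ... fit and evaluate a model on this fold ...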
Example #7
def test():
    v2i, _ = build_vocab()
    _, i2l = build_label()
    origin_questions = ['今天 天气 不错', '介绍 贵金属 产品']
    questions = [q.split() for q in origin_questions]
    questions = [[v2i[vocab] for vocab in ques if vocab in v2i] for ques in questions]

    with tf.Session() as sess:
        saver = tf.train.import_meta_graph(checkpoint_path + model_name)
        saver.restore(sess, tf.train.latest_checkpoint(checkpoint_path))

        model = tf.get_default_graph()
        x = model.get_tensor_by_name("x:0")
        predict = model.get_tensor_by_name("predictions:0")

        questions = pad_sequences(questions, maxlen=x.shape[1], value=0)
        feed_dict = {x: questions}

        p = sess.run([predict], feed_dict=feed_dict)
        p = p[0].tolist()
    for index in range(len(questions)):
        print(f'{origin_questions[index]} is_business: {i2l[p[index]]}')
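Looking tensors up by name ("x:0", "predictions:0") only works if the training graph created them with those names; in TF1 that implies something like the following at model-build time (a sketch under that assumption, not the original model code):
import tensorflow as tf

def build_inference_graph_sketch(sentence_size, vocab_size, embed_size, class_num):
    # the names must match what test() looks up: "x:0" and "predictions:0"
    x = tf.placeholder(tf.int32, shape=[None, sentence_size], name='x')
    embedding = tf.get_variable('embedding', shape=[vocab_size, embed_size])
    pooled = tf.reduce_mean(tf.nn.embedding_lookup(embedding, x), axis=1)
    logits = tf.layers.dense(pooled, class_num)
    predictions = tf.argmax(logits, axis=1, name='predictions')
    return x, predictions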
Example #8
def get_data(args):
    path = Path(args.data_path)
    f_train = path / 'train.json'
    f_test = path / 'test.json'
    f_val = path / 'val.json'

    tokenizer = Tokenizer()
    vocab = build_vocab([f_train, f_test, f_val],
                        tokenizer=tokenizer.tokenize,
                        min_freq=2,
                        max_size=50000)

    train_ds = ClassificationDataset(fname=f_train,
                                     tokenizer=tokenizer.tokenize,
                                     vocab=vocab)
    test_ds = ClassificationDataset(fname=f_test,
                                    tokenizer=tokenizer.tokenize,
                                    vocab=vocab)
    val_ds = ClassificationDataset(fname=f_val,
                                   tokenizer=tokenizer.tokenize,
                                   vocab=vocab)

    collator = ClfPadCollator(args.max_seq_length)
    train_iter = DataLoader(train_ds,
                            batch_size=args.batch_size,
                            shuffle=True,
                            collate_fn=collator.collate)
    test_iter = DataLoader(test_ds,
                           batch_size=args.batch_size,
                           shuffle=False,
                           collate_fn=collator.collate)
    val_iter = DataLoader(val_ds,
                          batch_size=args.batch_size,
                          shuffle=False,
                          collate_fn=collator.collate)
    return train_iter, val_iter, test_iter, vocab
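ClfPadCollator isn't shown here; assuming it truncates or pads each token-id sequence to max_seq_length and stacks the labels, a minimal PyTorch collate helper could look like this (class name and behaviour are assumptions):
import torch

class PadCollatorSketch:
    def __init__(self, max_seq_length, pad_idx=0):
        self.max_seq_length = max_seq_length
        self.pad_idx = pad_idx

    def collate(self, batch):
        # batch: list of (token_ids, label) pairs produced by the dataset
        token_ids, labels = zip(*batch)
        padded = torch.full((len(batch), self.max_seq_length),
                            self.pad_idx, dtype=torch.long)
        for i, ids in enumerate(token_ids):
            ids = list(ids)[:self.max_seq_length]
            padded[i, :len(ids)] = torch.tensor(ids, dtype=torch.long)
        return padded, torch.tensor(labels, dtype=torch.long)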
Example #9
def main(args):
    print('Loading captions')
    with open(args.input_captions_json, 'r') as f:
        captions = json.load(f)
    with open(args.input_neg_captions_json, 'r') as f:
        neg_captions = json.load(f)
    with open(args.split_json, 'r') as f:
        splits = json.load(f)
    all_imgs = sorted(os.listdir(args.input_image_dir))
    captioned_imgs = list(captions.keys())
    all_captions = []
    for img, caps in captions.items():
        all_captions.extend(caps)
    all_neg_captions = []
    for img, caps in neg_captions.items():
        all_neg_captions.extend(caps)

    # Extract train data points
    train_split = splits['train']
    train_imgs = [all_imgs[idx] for idx in train_split]
    train_captions = []
    train_neg_captions = []
    for img in train_imgs:
        cap = captions[img]
        neg_cap = neg_captions[img]
        train_captions.extend(cap)
        train_neg_captions.extend(neg_cap)

    N = len(all_imgs)
    N_captioned = len(captions)
    M = len(all_captions)
    M_neg = len(all_neg_captions)
    print('Total images: %d' % N)
    print('Total captioned images: %d' % N_captioned)
    print('Total captions: %d' % M)
    print('Total negative captions: %d' % M_neg)
    print('Total train images: %d' % len(train_imgs))
    print('Total train captions: %d' % len(train_captions))
    print('Total train neg captions: %d' % len(train_neg_captions))

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '':
        print('Building vocab')
        word_to_idx = build_vocab(train_captions + train_neg_captions,
                                  min_token_count=args.word_count_threshold,
                                  punct_to_keep=[';', ','],
                                  punct_to_remove=['?', '.'])
    else:
        print('Loading vocab')
        with open(args.input_vocab_json, 'r') as f:
            word_to_idx = json.load(f)
    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(word_to_idx, f)

    # Encode all captions
    # First, figure out max length of captions
    all_cap_tokens = []
    max_length = -1
    cap_keys = sorted(list(captions.keys()))
    for img in cap_keys:
        caps = captions[img]
        n = len(caps)
        assert n > 0, 'error: some image has no caption'
        tokens_list = []
        for cap in caps:
            cap_tokens = tokenize(cap,
                                  add_start_token=True,
                                  add_end_token=False,
                                  punct_to_keep=[';', ','],
                                  punct_to_remove=['?', '.'])
            tokens_list.append(cap_tokens)
            max_length = max(max_length, len(cap_tokens))
        all_cap_tokens.append((img, tokens_list))

    all_neg_cap_tokens = []
    cap_keys = sorted(list(captions.keys()))
    for img in cap_keys:
        neg_caps = neg_captions[img]
        neg_n = len(neg_caps)
        assert neg_n > 0, 'error: some image has no caption'
        neg_tokens_list = []
        for neg_cap in neg_caps:
            neg_cap_tokens = tokenize(neg_cap,
                                      add_start_token=True,
                                      add_end_token=False,
                                      punct_to_keep=[';', ','],
                                      punct_to_remove=['?', '.'])
            neg_tokens_list.append(neg_cap_tokens)
        all_neg_cap_tokens.append((img, neg_tokens_list))

    print('Encoding captions')
    label_arrays = []
    label_start_idx = -np.ones(N, dtype=np.int64)  # np.int was removed in NumPy 1.24; use an explicit integer dtype
    label_end_idx = -np.ones(N, dtype=np.int64)
    label_length = np.zeros(M, dtype=np.int64)
    caption_counter = 0
    counter = 0

    # Then encode
    for img, tokens_list in all_cap_tokens:
        i = int(img.split('.')[0].split('_')[-1])
        n = len(tokens_list)
        Li = np.zeros((n, max_length), dtype=np.int64)
        for j, tokens in enumerate(tokens_list):
            label_length[caption_counter] = len(tokens)
            caption_counter += 1
            tokens_encoded = encode(tokens,
                                    word_to_idx,
                                    allow_unk=args.allow_unk == 1)
            for k, w in enumerate(tokens_encoded):
                Li[j, k] = w
        # captions are padded with zeros
        label_arrays.append(Li)
        label_start_idx[i] = counter
        label_end_idx[i] = counter + n - 1

        counter += n

    L = np.concatenate(label_arrays, axis=0)  # put all labels together
    assert L.shape[0] == M, "lengths don't match?"
    assert np.all(label_length > 0), 'error: some captions have no word?'

    print('Encoding negative captions')
    neg_label_arrays = []
    neg_label_start_idx = -np.ones(N, dtype=np.int64)
    neg_label_end_idx = -np.ones(N, dtype=np.int64)
    neg_label_length = np.zeros(M_neg, dtype=np.int64)
    neg_caption_counter = 0
    neg_counter = 0

    # Then encode
    for img, tokens_list in all_neg_cap_tokens:
        i = int(img.split('.')[0].split('_')[-1])
        n = len(tokens_list)
        Li = np.zeros((n, max_length), dtype=np.int64)
        for j, tokens in enumerate(tokens_list):
            neg_label_length[neg_caption_counter] = len(tokens)
            neg_caption_counter += 1
            tokens_encoded = encode(tokens,
                                    word_to_idx,
                                    allow_unk=args.allow_unk == 1)
            for k, w in enumerate(tokens_encoded):
                Li[j, k] = w
        # captions are padded with zeros
        neg_label_arrays.append(Li)
        neg_label_start_idx[i] = neg_counter
        neg_label_end_idx[i] = neg_counter + n - 1

        neg_counter += n

    neg_L = np.concatenate(neg_label_arrays, axis=0)  # put all labels together
    assert neg_L.shape[0] == M_neg, "lengths don't match?"
    assert np.all(neg_label_length > 0), 'error: some captions have no word?'

    # Create h5 file
    print('Writing output')
    print('Encoded captions array size: ', L.shape)
    print('Encoded negative captions array size: ', neg_L.shape)
    with h5py.File(args.output_h5, 'w') as f:
        f.create_dataset('labels', data=L)
        f.create_dataset('label_start_idx', data=label_start_idx)
        f.create_dataset('label_end_idx', data=label_end_idx)
        f.create_dataset('label_length', data=label_length)
        f.create_dataset('neg_labels', data=neg_L)
        f.create_dataset('neg_label_start_idx', data=neg_label_start_idx)
        f.create_dataset('neg_label_end_idx', data=neg_label_end_idx)
        f.create_dataset('neg_label_length', data=neg_label_length)
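For reference, the datasets written above can be read back with h5py like this (illustrative, not part of the original script):
import h5py

def inspect_caption_h5(path):
    # quick look at the datasets written by main()
    with h5py.File(path, 'r') as f:
        labels = f['labels'][...]              # (M, max_length) encoded captions
        start_idx = f['label_start_idx'][...]  # first caption row per image (-1 if none)
        print('captions:', labels.shape,
              'captioned images:', int((start_idx >= 0).sum()))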
Example #10
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        answer_token_to_idx = None  # stays None when the questions carry no answers
        if 'answer' in questions[0]:
            answer_token_to_idx = preprocess_utils.build_vocab(
                (q['answer'] for q in questions))
        question_token_to_idx = preprocess_utils.build_vocab(
            (q['question'] for q in questions),
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','],
            punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q: continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = preprocess_utils.build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        utils.mkdirs(os.path.dirname(args.output_vocab_json))
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    for orig_idx, q in enumerate(questions):
        question = q['question']

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = preprocess_utils.tokenize(question,
                                                    punct_to_keep=[';', ','],
                                                    punct_to_remove=['?', '.'])
        question_encoded = preprocess_utils.encode(
            question_tokens,
            vocab['question_token_to_idx'],
            allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = preprocess_utils.tokenize(program_str)
            program_encoded = preprocess_utils.encode(
                program_tokens, vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    utils.mkdirs(os.path.dirname(args.output_h5_file))
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))

        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
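A quick sanity check on the output: invert question_token_to_idx and decode one stored question, dropping the '<NULL>' padding used above (illustrative, not part of the original script):
import h5py

def decode_first_question(h5_path, vocab):
    # map indices back to tokens for the first stored question
    idx_to_token = {idx: tok for tok, idx in vocab['question_token_to_idx'].items()}
    with h5py.File(h5_path, 'r') as f:
        row = f['questions'][0]
    return [idx_to_token[int(i)] for i in row if idx_to_token[int(i)] != '<NULL>']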