Example #1
def main():
    pos_train = './data/train_pos_full.txt'
    neg_train = './data/train_neg_full.txt'
    vocab = './data/vocab.dat'
    inv_vocab = './data/inv_vocab.dat'

    build_vocab([pos_train, neg_train], DataSet.PAD_WORD, vocab, inv_vocab)
Example #2
def _process_keras(
    features_train: List[str], labels: List[int], path_embeddings: str
) -> Tuple[Dict[str, int], Optional[Word2Vec], Dict[str, int]]:
    """
    This method is used to process the sentences with respect to a keras model
    :param features_train: input train
    :param labels: label set.
    :param path_embeddings: path of pre-trained embeddings
    :return: vocabulary, word2vec model, word2vec vocabulary
    """
    print("Loading pre-trained embeddings...")
    # load the pre-trained word2vec vectors with gensim
    w2v = gensim.models.KeyedVectors.load_word2vec_format(path_embeddings, binary=True)
    # drop empty sentences and their labels (deleting while iterating would skip entries)
    keep = [i for i, t in enumerate(features_train) if t]
    features_train[:] = [features_train[i] for i in keep]
    labels[:] = [labels[i] for i in keep]
    # build the vocab from the w2v model
    w2v_vocab = preprocess.vocab_from_w2v(w2v)
    print("Word2Vec model vocab len:", len(w2v_vocab))
    # build vocab from the dataset
    data_vocab = preprocess.build_vocab([features_train])
    # filter pretrained w2v with words from the dataset
    w2v = utils.restrict_w2v(w2v, set(data_vocab.keys()))
    w2v_vocab = preprocess.vocab_from_w2v(w2v)
    utils.write_dictionary(config.TRAIN_VOCAB, w2v_vocab)
    print("Cleaned vocab len:", len(w2v_vocab))
    # idx2word = {v: k for k, v in vocab.items()}
    return data_vocab, w2v, w2v_vocab
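Example #2's filtering step relies on project helpers (utils.restrict_w2v, preprocess.vocab_from_w2v). A minimal sketch of what restrict_w2v could look like, assuming gensim >= 4.0 (only index_to_key, vector_size, add_vectors, and item lookup are used) and not the project's actual implementation:

from gensim.models import KeyedVectors

def restrict_w2v(w2v: KeyedVectors, allowed_words: set) -> KeyedVectors:
    # keep only the pretrained vectors for words that also occur in the dataset
    kept = [w for w in w2v.index_to_key if w in allowed_words]
    restricted = KeyedVectors(vector_size=w2v.vector_size)
    restricted.add_vectors(kept, [w2v[w] for w in kept])
    return restricted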
Example #3
def get_data():
    # Load and preprocess data
    sentences, labels = load_data_and_labels()
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)

    vocab_size = len(vocabulary)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # split train/dev set
    # there are a total of 10662 labeled examples to train on
    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]

    sentence_size = x_train.shape[1]

    print('Train/Dev split: %d/%d' % (len(y_train), len(y_dev)))
    print('train shape:', x_train.shape)
    print('dev shape:', x_dev.shape)
    print('vocab_size', vocab_size)
    print('sentence max words', sentence_size)

    return Data(x_train, y_train, x_dev, y_dev, vocab_size, sentence_size)
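Example #3 assumes a build_vocab that returns both a word-to-index map and its inverse list. A minimal sketch under that assumption (a common pattern in CNN text-classification tutorials, not necessarily this project's exact code):

from collections import Counter
from itertools import chain

def build_vocab(sentences):
    # count every token across the padded sentences
    word_counts = Counter(chain.from_iterable(sentences))
    # index -> word, most frequent first
    vocabulary_inv = [word for word, _ in word_counts.most_common()]
    # word -> index
    vocabulary = {word: i for i, word in enumerate(vocabulary_inv)}
    return vocabulary, vocabulary_inv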
Example #4
def main(args):

    data = json.load(open(args.input_refexps_json, 'r'))
    max_length = 0
    all_refexps = []
    for keys in data:
        for ref_id in data[keys]:
            all_refexps.append(data[keys][ref_id])

    for r in all_refexps:
        t = tokenize(
            r,
            punct_to_keep=[',', ';'],
            punct_to_remove=['?', '.']
        )
        if len(t) > max_length:
            max_length = len(t)

    refexp_token_to_idx = build_vocab(
        all_refexps,
        punct_to_keep=[',', ';'],
        punct_to_remove=['?', '.']
    )

    with open(args.output_vocab_json, 'w') as f:
        json.dump(refexp_token_to_idx, f)

    with h5py.File(args.output_refexps_h5df, 'w') as f:
        for keys in data:
            one_image_refexps = []
            # img_name = keys.split('.')[0]
            one_image_refexps_to_idx = []
            img_all_refexps = data[keys]

            for ref_id in img_all_refexps:
                # refexp = img_all_refexps[ref_id]
                # one_image_refexps.append(refexp)
                refexp = img_all_refexps[ref_id]
                one_image_refexps.append(refexp)

            for refexps in one_image_refexps:
                tokens = tokenize(refexps, punct_to_remove=['?', '.'], punct_to_keep=[';', ','])
                refexps_idx = encode(tokens, refexp_token_to_idx)
                one_image_refexps_to_idx.append(refexps_idx)

            for refexp_ in one_image_refexps_to_idx:
                num_null = max_length - len(refexp_)
                if num_null > 0:
                    refexp_ += [refexp_token_to_idx['<NULL>']]*num_null

            one_image_refexps_to_idx_numpy = np.asarray(one_image_refexps_to_idx, dtype=np.int32)

            f.create_dataset(keys, data=one_image_refexps_to_idx_numpy)
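The encode helper assumed in Example #4 maps tokens to vocabulary indices before the <NULL> padding step. A hypothetical sketch with an optional <UNK> fallback:

def encode(tokens, token_to_idx, allow_unk=False):
    idxs = []
    for token in tokens:
        if token not in token_to_idx:
            if not allow_unk:
                raise KeyError("token %r is not in the vocabulary" % token)
            token = "<UNK>"
        idxs.append(token_to_idx[token])
    return idxs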
Example #5
def main():
    pos_file = './data/train_pos.txt'
    neg_file = './data/train_neg.txt'
    validation = './data/test_data.txt'
    stopwords = './data/stopwords.txt'

    vocab_file = 'vocab.dat'
    inv_vocab_file = 'inv_vocab.dat'

    cooc_file = 'cooc.dat'

    embeddings_file = 'embeddings.dat'

    label_file = 'labels.dat'

    submission_file = 'submission.csv'

    glove_seed = 1234
    kmeans_seed = 4321
    xgb_seed = 1337
    sampler_seed = 7331

    build_vocab([pos_file, neg_file],
                stopwords,
                vocab_file,
                inv_vocab_file,
                cutoff=5)

    vocab = load_pickled(vocab_file)
    inv_vocab = load_pickled(inv_vocab_file)

    build_cooc([pos_file, neg_file], vocab, cooc_file)

    train_glove(cooc_file, embeddings_file, glove_seed)

    train_kmeans(embeddings_file, label_file, kmeans_seed)

    train_xgb(vocab_file, pos_file, neg_file, label_file, validation,
              submission_file, xgb_seed, sampler_seed)
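Example #5's build_vocab reads raw text files, drops stopwords, applies a frequency cutoff, and pickles both the vocabulary and its inverse (implied by the load_pickled calls above). A rough sketch under those assumptions:

import pickle
from collections import Counter

def build_vocab(input_files, stopword_file, vocab_file, inv_vocab_file, cutoff=5):
    with open(stopword_file, encoding="utf-8") as f:
        stopwords = set(f.read().split())
    counts = Counter()
    for path in input_files:
        with open(path, encoding="utf-8") as f:
            for line in f:
                counts.update(w for w in line.split() if w not in stopwords)
    # keep words that occur at least `cutoff` times
    words = [w for w, c in counts.most_common() if c >= cutoff]
    vocab = {w: i for i, w in enumerate(words)}
    inv_vocab = {i: w for w, i in vocab.items()}
    with open(vocab_file, "wb") as f:
        pickle.dump(vocab, f)
    with open(inv_vocab_file, "wb") as f:
        pickle.dump(inv_vocab, f)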
Example #6
def main():
    NAME_IDX = 5
    data_file = '../data/the_office_scripts.csv'

    length = get_num_lines(data_file)
    data_names = get_data(data_file, NAME_IDX, length)
    vocab = build_vocab(data_names)
    data_ids = convert_to_id(vocab, data_names)
    train_data, test_data = split_data(data_ids, length)

    num_tokens = len(vocab)
    model = RNN_WORD_Model(num_tokens)

    for i in range(40):
        train(model, train_data)

    test(model, test_data)
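A minimal sketch of the convert_to_id step in Example #6, assuming data_names is a flat list of tokens and out-of-vocabulary words fall back to an <UNK> id (both are assumptions):

def convert_to_id(vocab, tokens):
    unk_id = vocab.get("<UNK>", 0)
    return [vocab.get(token, unk_id) for token in tokens]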
Example #7
    for row in val_data[tags_predicted].iterrows():
        val_targets.append(list(row[1].values))
    
    train_sample_num = len(train_targets)
    if params['loss_weight_on']:
        loss_weights = {}
        for task_id in range(len(tags_predicted)):
            loss_weights[task_id] = torch.Tensor([(train_sample_num-pos_num_tags[task_id])/pos_num_tags[task_id]]).to(device)
    else:
        loss_weights = None
    train_X = train_data[steps_token]
    val_X = val_data[steps_token]
    test_X = test_data[steps_token]
    all_train_tokens = all_tokens_list(train_X)
    max_vocab_size = len(list(set(all_train_tokens)))
    token2id, id2token = build_vocab(all_train_tokens, max_vocab_size)
    emb_weight = build_emb_weight(words_emb_dict, id2token)
    train_data_indices = token2index_dataset(train_X, token2id)
    val_data_indices = token2index_dataset(val_X, token2id)
    test_data_indices = token2index_dataset(test_X, token2id)

    # batchify datasets: 
    batch_size = params['batch_size']
    max_sent_len = np.array([94, 86, 87, 90, 98, 91])
    train_loader, val_loader, test_loader = create_dataset_obj(train_data_indices, val_data_indices,
                                                           test_data_indices, train_targets,
                                                           val_targets, test_targets,
                                                           batch_size, max_sent_len, 
                                                           collate_func)
    
    val_auc, val_acc, model_to_test = train_model(params, emb_weight, train_loader, val_loader, test_loader, loss_weights)
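The build_emb_weight call in Example #7 turns a word-to-vector dictionary plus the id2token list into an embedding matrix for the model. A hypothetical sketch (the embedding dimension, random init, and zeroed padding row are assumptions):

import numpy as np

def build_emb_weight(words_emb_dict, id2token, emb_dim=300):
    # random init for out-of-vocabulary tokens, pretrained vectors where available
    weight = np.random.normal(0.0, 0.1, size=(len(id2token), emb_dim)).astype(np.float32)
    weight[0] = 0.0  # assume index 0 is the padding token
    for idx, token in enumerate(id2token):
        vec = words_emb_dict.get(token)
        if vec is not None:
            weight[idx] = vec
    return weight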
Example #8
n_epochs = 10000
embedding_dim = 500
hidden_dim = 126
layer_dim = 2
output_dim = 1
seq_dim = embedding_dim

configure(tensor_board_log_dir)


# %%
# Pre-process the data ========================================================
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

word_2_int, int_2_word = preprocess.build_vocab(train, test)
max_sent_len = preprocess.longest_sentence_length(train, test)

print('Longest sentence: {}'.format(max_sent_len))

train = preprocess.MSRPWordVectorsDataSet(train, word_2_int, max_sent_len, GPU)
test = preprocess.MSRPWordVectorsDataSet(test, word_2_int, max_sent_len, GPU)


train_loader = torch.utils.data.DataLoader(dataset=train,
                                           batch_size=batch_size,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test,
                                          batch_size=batch_size,
                                          shuffle=False)
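A hypothetical sketch of preprocess.build_vocab(train, test) in Example #8: collect every token from the sentence columns of both MSRP DataFrames and build forward and inverse index maps (the column names and the reserved padding index are assumptions):

import pandas as pd

def build_vocab(train: pd.DataFrame, test: pd.DataFrame,
                sentence_cols=("#1 String", "#2 String")):
    words = set()
    for frame in (train, test):
        for col in sentence_cols:
            for sentence in frame[col].astype(str):
                words.update(sentence.lower().split())
    word_2_int = {w: i + 1 for i, w in enumerate(sorted(words))}  # 0 reserved for padding
    int_2_word = {i: w for w, i in word_2_int.items()}
    return word_2_int, int_2_word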
Example #9
def main(stopword_file, infiles):
    build_vocab(infiles, stopword_file, 'vocab.dat', 'inv_vocab.dat', cutoff=5)
Example #10
# Data Preparation
# ==================================================
# Load data
print("Loading text data...")
qq, ll, cc, aa = preprocess.train_data()
assert len(qq) == len(ll) == len(cc) == len(aa)
# Build vocabulary using same preprocessor
print("Building vocabularies...")
c = [y for x in cc for y in x]  # single sents from context
max_len = max([len(x) for x in c])
vocab = learn.preprocessing.VocabularyProcessor(max_len)
vocab.fit(c)
vocab.save('save/vocab.pkl')

print("Convert text to data...")
train_q = preprocess.build_vocab(qq, vocab)
train_c = [preprocess.build_vocab(x, vocab) for x in cc]
train_l = [preprocess.build_vocab(x, vocab) for x in ll]
train_a = [train_l[i][x - 1] for i, x in enumerate(aa)]
assert len(train_q) == len(train_c) == len(train_l) == len(train_a)


# Shuffle data
def shuf_data(data):
    ''' Return shuf_data, [context, question, right_ans, wrong_ans]'''
    print("Shuffle data...")
    shuf_idx = np.random.permutation(np.arange(len(data)))
    data_shuf = [data[i] for i in shuf_idx]
    print("Done shuffle.")
    return data_shuf
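In Example #10, preprocess.build_vocab is reused as a thin wrapper that runs the fitted VocabularyProcessor (TensorFlow 1.x tf.contrib.learn) over a list of sentences. A plausible sketch, not the project's confirmed implementation:

import numpy as np

def build_vocab(sentences, vocab_processor):
    # transform() yields id sequences padded/truncated to max_len
    return np.array(list(vocab_processor.transform(sentences)))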
Example #11
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print("Must give one of --input_vocab_json or --output_vocab_json")
        return

    print("Loading questions...")
    with open(args.input_questions, 'r') as f:
        questions = f.read()
    questions = questions.split("\n")
    questions = questions[:-1]

    print("Loading answers...")
    with open(args.input_answers, 'r') as f:
        answers = f.read()
    answers = answers.split("\n")
    answers = answers[:-1]

    answer_token_to_idx = None
    # Either create the vocab or load it from disk
    if args.input_vocab_json == "" or args.expand_vocab == 1:
        print("Building vocab...")

        # Convert the answer tokens to unique id
        answer_token_to_idx = build_vocab([answer for answer in answers])

        # convert the tokens in all questions to unique id
        question_token_to_idx = build_vocab(
            [question for question in questions],
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','],
            punct_to_remove=['?', '.'])

        answer_idx_to_token = {}
        question_idx_to_token = {}

        # create a reverse dictionary for answer idx to token mapping
        for key, value in answer_token_to_idx.items():
            answer_idx_to_token[value] = key

        # create a reverse dictionary for question idx to token mapping
        for key, value in question_token_to_idx.items():
            question_idx_to_token[value] = key

        # dump all the dictionaries as a single JSON file
        vocab = {
            "question_token_to_idx": question_token_to_idx,
            "answer_token_to_idx": answer_token_to_idx,
            "question_idx_to_token": question_idx_to_token,
            "answer_idx_to_token": answer_idx_to_token
        }

    if args.input_vocab_json != "":
        print("Loading vocab...")
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)

        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab["question_token_to_idx"]:
                if word not in vocab["question_token_to_idx"]:
                    print("Found new word %s" % word)
                    idx = len(vocab["question_token_to_idx"])
                    vocab["question_token_to_idx"][word] = idx
                    num_new_words += 1
                print("Found %d new words" % num_new_words)

    if args.output_vocab_json != "":
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    # This converts question strings to integers
    print("Encoding data")
    questions_encoded = []
    _answers = []

    for question, answer in zip(questions, answers):
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab["question_token_to_idx"],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)
        _answers.append(vocab["answer_token_to_idx"][answer])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab["question_token_to_idx"]["<NULL>"])

    # Create h5 dataset file
    print("Writing output")
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    print("Questions encoded shape is {}".format(questions_encoded.shape))
    print("Length of answer tokens is {}".format(len(answer_token_to_idx)))

    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset("questions", data=questions_encoded)

        if len(_answers) > 0:
            f.create_dataset("answers", data=np.asarray(_answers))
Example #12
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data...')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    '''Either create the vocab or load it from disk'''
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab...')
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab((str(q['answer'])
                                               for q in questions))
        question_token_to_idx = build_vocab(
            [q['question'] for q in questions],
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','],
            punct_to_remove=['?', '.']
        )

        all_program_strs = []
        for q in questions:
            if 'program' not in q.keys():
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab...')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)

        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    '''Encode all questions and programs'''
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []

    for orig_idx, q in enumerate(questions):
        question = q['question']

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = tokenize(question, punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens,
                                     vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][str(q['answer'])])

    '''Pad encoded questions and programs'''
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    '''Create h5 file'''
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print('Questions encoded shape is {}'.format(questions_encoded.shape))
    print('Programs encoded shape is {}'.format(programs_encoded.shape))

    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))

        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
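Examples #11 and #12 above follow CLEVR-style preprocessing, where build_vocab tokenizes each sequence, counts tokens, and assigns indices to tokens above a frequency threshold after reserving special symbols. A sketch along those lines, reusing the tokenize helper shown in the examples (the exact special tokens and their ordering are assumptions):

SPECIAL_TOKENS = ['<NULL>', '<START>', '<END>', '<UNK>']

def build_vocab(sequences, min_token_count=1,
                punct_to_keep=None, punct_to_remove=None):
    token_counts = {}
    for seq in sequences:
        tokens = tokenize(seq, punct_to_keep=punct_to_keep,
                          punct_to_remove=punct_to_remove)
        for token in tokens:
            token_counts[token] = token_counts.get(token, 0) + 1

    token_to_idx = {t: i for i, t in enumerate(SPECIAL_TOKENS)}
    for token, count in sorted(token_counts.items()):
        if count >= min_token_count:
            token_to_idx[token] = len(token_to_idx)
    return token_to_idx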
Example #13
        metrics.classification_report(y_test_cls,
                                      y_pred_cls,
                                      target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_cnn.py [train / test]""")

    print('Configuring CNN model...')
    config = TCNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = TextCNN(config)
    # train()

    if sys.argv[1] == 'train':
        train()
    else:
        test()