def get_model(max_words, sequence_length, rnn_size, rnn_layers):
    """Build and compile a stacked-LSTM language model over frozen embeddings.

    Args:
        max_words: vocabulary cap handed to the indexing helper.
        sequence_length: length of each input token sequence.
        rnn_size: number of units in every LSTM layer.
        rnn_layers: how many LSTM layers to stack.

    Returns:
        A compiled Keras ``Sequential`` model (softmax over the corpus,
        categorical cross-entropy, Adam at lr=0.01).
    """
    print('Gathering data to scaffold model…')
    embeddings = data.get_embeddings()
    text = data.get_data()
    idx_word, word_idx, sequence = data.get_indices_and_sequence(
        text, max_words)
    embedding_matrix = data.get_embedding_matrix(idx_word, embeddings)
    corp_size = len(idx_word)

    model = Sequential()
    # Pretrained, non-trainable embedding lookup (index 0 reserved, hence +1).
    model.add(
        Embedding(corp_size + 1,
                  EMBED_DIM,
                  weights=[embedding_matrix],
                  input_length=sequence_length,
                  trainable=False))

    # All but the last LSTM must return full sequences so the next layer
    # receives per-timestep inputs.
    for layer_idx in range(rnn_layers):
        model.add(
            LSTM(rnn_size,
                 input_shape=(sequence_length, EMBED_DIM),
                 return_sequences=layer_idx < rnn_layers - 1))

    model.add(Dense(corp_size))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.01))

    return model
Beispiel #2
0
    # Word-level pretrained embeddings (body of an `if params.feature == ...`
    # whose opening line lies above this chunk).
    WORD_EMBEDDING_PATH = "mojing/word_embed.txt"
elif params.feature == "chars":
    # Character-level pretrained embeddings.
    WORD_EMBEDDING_PATH = "mojing/char_embed.txt"
else:
    raise Exception("Unknown feature: %s" % (params.feature))
"""
SEED
"""
# Seed NumPy and torch (CPU + current CUDA device) for reproducibility.
np.random.seed(params.seed)
torch.manual_seed(params.seed)
torch.cuda.manual_seed(params.seed)
"""
DATA
"""
# Question lookup plus train/test splits, then the pretrained vectors.
questions_dict, train, test = get_data(params.datapath)
word_vec = get_embeddings(WORD_EMBEDDING_PATH)

# Hard-coded embedding width — presumably matches the 300-d vectors in the
# embedding files above; TODO confirm.
params.word_emb_dim = 300

# Comma-separated CLI string -> list of per-layer encoder LSTM sizes.
params.enc_lstm_dims = [int(d) for d in params.enc_lstm_dims.split(",")]

if not os.path.exists(params.save_dir):
    os.mkdir(params.save_dir)
"""
MODEL
"""
# model config
# NOTE(review): dict literal is truncated by the chunk boundary below.
config_mojing_model = {
    'n_words': len(word_vec),
    'word_emb_dim': params.word_emb_dim,
    'enc_lstm_dim': params.enc_lstm_dim,
Beispiel #3
0
# 3. Test data
test_tokens = test['tokens']
test_counts = test['counts']
args.num_docs_test = len(test_tokens)
# tokens_1/tokens_2 look like two halves of each test document — presumably
# for document-completion evaluation; confirm against the caller.
test_1_tokens = test['tokens_1']
test_1_counts = test['counts_1']
args.num_docs_test_1 = len(test_1_tokens)
test_2_tokens = test['tokens_2']
test_2_counts = test['counts_2']
args.num_docs_test_2 = len(test_2_tokens)

# 4. Labels
can_classify = True
if can_classify:
    # Only the middle element of each 3-tuple (the test-split labels and
    # embeddings) is kept; the other splits are discarded.
    _, labels_ts, _ = data.get_labels(args.data_path)
    _, embed_ts, _ = data.get_embeddings(args.data_path)

embeddings = None
if not args.train_embeddings:
    # Load pretrained word vectors from disk, keeping only in-vocab words.
    emb_path = args.emb_path
    vect_path = os.path.join(args.data_path.split('/')[0], 'embeddings.pkl')
    vectors = {}
    with open(emb_path, 'rb') as f:
        for l in f:
            # Each line: a word followed by its vector components.
            line = l.decode().split()
            word = line[0]
            if word in vocab:
                # NOTE(review): np.float is removed in NumPy 1.24+; this
                # should be plain float (or np.float64).
                vect = np.array(line[1:]).astype(np.float)
                vectors[word] = vect
    # Chunk is truncated here — the matrix is presumably filled from
    # `vectors` in lines beyond this view.
    embeddings = np.zeros((vocab_size, args.emb_size))
    words_found = 0
Beispiel #4
0
def main():
    """Evaluate a saved Mojing model on a held-out dev split and print accuracy.

    Loads the pickled network onto the selected GPU, carves the last 9000
    training rows off as a dev set, runs batched forward passes, and prints
    the accuracy as a percentage rounded to 4 decimals.
    """
    parser = argparse.ArgumentParser(description='Mojing inference')
    # paths
    parser.add_argument("--datapath",
                        type=str,
                        default='mojing/',
                        help="mojing data path")
    parser.add_argument("--modelpath",
                        type=str,
                        default='savedir/model.pickle',
                        help="inference model path")
    parser.add_argument("--batch_size", type=int, default=512)

    parser.add_argument("--feature",
                        type=str,
                        default='words',
                        help="words or chars")

    # gpu
    parser.add_argument("--gpu_id", type=int, default=0, help="GPU ID")
    parser.add_argument("--seed", type=int, default=1234, help="seed")

    params, _ = parser.parse_known_args()

    # set gpu device
    torch.cuda.set_device(params.gpu_id)

    # Seed every RNG source for reproducibility.
    np.random.seed(params.seed)
    torch.manual_seed(params.seed)
    torch.cuda.manual_seed(params.seed)

    # Choose the embedding table matching the requested feature granularity.
    if params.feature == "words":
        WORD_EMBEDDING_PATH = "mojing/word_embed.txt"
    elif params.feature == "chars":
        WORD_EMBEDDING_PATH = "mojing/char_embed.txt"
    else:
        raise Exception("Unknown feature: %s" % (params.feature))

    questions_dict, train, test = get_data(params.datapath)

    # Hold out the last 9000 training rows as the dev split.
    dev_size = 9000
    dev = train.iloc[-dev_size:]
    dev = dev.reset_index(drop=True)
    train = train.iloc[0:-dev_size]
    train = train.reset_index(drop=True)
    dev = dev.values

    word_vec = get_embeddings(WORD_EMBEDDING_PATH)

    # NOTE: torch.load unpickles arbitrary objects — only load trusted files.
    mojing_net = torch.load(params.modelpath)

    # cuda by default
    mojing_net.cuda()

    mojing_net.eval()
    correct = 0.

    for i in range(0, len(dev), params.batch_size):
        # prepare batch
        label_batch, q1_batch, q1_len, q2_batch, q2_len = get_batch(
            questions_dict,
            dev[i:i + params.batch_size],
            word_vec,
            random_flip=False,
            feature=params.feature)

        q1_batch, q2_batch = Variable(q1_batch).cuda(), Variable(
            q2_batch).cuda()
        tgt_batch = Variable(torch.FloatTensor(label_batch)).cuda()

        # model forward
        output = mojing_net((q1_batch, q1_len), (q2_batch, q2_len))

        # Threshold raw scores at 0 and count matches against the labels.
        pred = output.data > 0
        correct += pred.long().eq(tgt_batch.data.long()).cpu().sum().numpy()

    # Report dev accuracy.
    eval_acc = round(100 * correct / len(dev), 4)
    # BUG FIX: original used the Python 2 statement `print eval_acc`, a
    # SyntaxError under Python 3 (the rest of the file uses print()).
    print(eval_acc)
Beispiel #5
0
def main():
    """Score every test pair with a saved model and write a submission file.

    Parses CLI options, restores the pickled network onto the chosen GPU,
    runs batched forward passes over the test set, and hands the resulting
    probabilities to make_submission().
    """
    parser = argparse.ArgumentParser(description='Mojing inference')
    # paths
    parser.add_argument("--datapath",
                        type=str,
                        default='mojing/',
                        help="mojing data path")
    parser.add_argument("--modelpath",
                        type=str,
                        default='savedir/model.pickle',
                        help="inference model path")
    parser.add_argument("--output", type=str, default='output')
    parser.add_argument("--batch_size", type=int, default=1024)

    parser.add_argument("--feature",
                        type=str,
                        default='words',
                        help="words or chars")

    # gpu
    parser.add_argument("--gpu_id", type=int, default=0, help="GPU ID")
    parser.add_argument("--seed", type=int, default=1234, help="seed")

    params, _ = parser.parse_known_args()

    # set gpu device
    torch.cuda.set_device(params.gpu_id)

    # print parameters passed, and all parameters
    print('\ntogrep : {0}\n'.format(sys.argv[1:]))
    print(params)

    # SEED: make NumPy and torch (CPU + GPU) deterministic.
    np.random.seed(params.seed)
    torch.manual_seed(params.seed)
    torch.cuda.manual_seed(params.seed)

    # DATA: pick the embedding table for the requested feature granularity.
    path_by_feature = {
        "words": "mojing/word_embed.txt",
        "chars": "mojing/char_embed.txt",
    }
    if params.feature not in path_by_feature:
        raise Exception("Unknown feature: %s" % (params.feature))
    WORD_EMBEDDING_PATH = path_by_feature[params.feature]

    questions_dict, train, test = get_data(params.datapath)
    word_vec = get_embeddings(WORD_EMBEDDING_PATH)

    test = test.values

    mojing_net = torch.load(params.modelpath)
    print(mojing_net)

    # cuda by default
    mojing_net.cuda()

    def inference():
        """Return a flat list of predicted probabilities over the test set."""
        mojing_net.eval()
        predictions = []
        for start in tqdm(range(0, len(test), params.batch_size)):
            # prepare batch
            q1_batch, q1_len, q2_batch, q2_len = get_test_batch(
                questions_dict,
                test[start:start + params.batch_size],
                word_vec,
                feature=params.feature)

            q1_batch = Variable(q1_batch).cuda()
            q2_batch = Variable(q2_batch).cuda()

            # model forward
            probs = mojing_net.predict_prob((q1_batch, q1_len),
                                            (q2_batch, q2_len))
            predictions.extend(probs.data.cpu().numpy().reshape((-1)))
        return predictions

    make_submission(inference(), params.output)