Example #1
def main(_):
    '''
    count = [['<eos>', 20], ['word1', 24]...]
    '''
    count = [] 
    '''
    word2idx = {'<eos>':0, 'word1':1, ...}
    '''
    word2idx = {}
    if not os.path.exists(FLAGS.checkpoint_dir):
      os.makedirs(FLAGS.checkpoint_dir)

    train_data = read_data('%s/%s.train.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)
    valid_data = read_data('%s/%s.valid.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)
    test_data = read_data('%s/%s.test.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)

    '''
    idx2word = {0:'<eos>', 1:'word1'}
    '''
    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
    # number of words
    FLAGS.nwords = len(word2idx)

    pp.pprint(flags.FLAGS.__flags)

    with tf.Session() as sess:
        model = MemN2N(FLAGS, sess)
        model.build_model()

        if FLAGS.is_test:
            model.run(valid_data, test_data)
        else:
            model.run(train_data, valid_data)
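
The inline comments in Example #1 spell out the structures read_data is expected to fill: count collects [word, frequency] pairs with '<eos>' first, word2idx assigns each word an integer id, and idx2word is the inverse mapping. A minimal sketch of such a reader follows; it is an illustration only, not the project's actual read_data, and the whitespace tokenization plus the newline-to-'<eos>' convention are assumptions.

def read_data_sketch(fname, count, word2idx):
    """Illustrative stand-in for read_data: updates count and word2idx in
    place and returns the file as a flat list of word ids."""
    with open(fname) as f:
        # assumption: whitespace tokens, every newline marks end of sentence
        words = f.read().replace('\n', ' <eos> ').split()

    entries = {entry[0]: entry for entry in count}
    for w in ['<eos>'] + words:          # register '<eos>' first so it gets id 0
        if w not in word2idx:
            word2idx[w] = len(word2idx)
            entry = [w, 0]
            count.append(entry)
            entries[w] = entry
    for w in words:
        entries[w][1] += 1

    return [word2idx[w] for w in words]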
Example #2
def main(_):
    count = []
    word2idx = {}

    train_data = read_data('data/ptb.train.txt', count, word2idx)
    valid_data = read_data('data/ptb.valid.txt', count, word2idx)
    test_data = read_data('data/ptb.test.txt', count, word2idx)

    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
    FLAGS.nwords = len(word2idx)

    with tf.Session() as sess:
        model = MemN2N(FLAGS, sess)
        model.build_model()
        model.run(train_data, test_data)
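
The dict(zip(...)) inversion used for idx2word throughout these examples can also be written as a dict comprehension; a tiny equivalent snippet (the sample vocabulary is made up):

word2idx = {'<eos>': 0, 'word1': 1}

idx2word = {idx: word for word, idx in word2idx.items()}
assert idx2word == dict(zip(word2idx.values(), word2idx.keys()))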
Example #3
def simulate_skin(steps=5, max_iter=100, learning_rate=0.1):
    """Simulate learning skin data set."""
    data = read_data('Skin_NonSkin.txt')
    train_data, test_data = split_list(data, 0.75)

    start = len(train_data)/steps  # First step training set size.
    end = len(train_data)  # Final step training set size.

    sizes = []  # Training data set sizes.
    success = []  # Success rates according to training data set sizes.
    for i in xrange(steps):
        # Increase training data size according to iteration.
        size = start + i*end/steps
        current_train_data = train_data[:size]

        w = train(current_train_data, max_iter=max_iter, r=learning_rate)
        error = test(test_data, w)

        status(current_train_data, test_data, error)
        print

        # Record size-success statistics.
        sizes.append(size)
        success.append(100 - error)

    plot_success_per_size(sizes, success)
    show()
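
simulate_skin relies on a split_list helper that is not shown in this example. A minimal sketch of a 75/25 splitter consistent with that call is given below; whether the real helper shuffles before splitting is an assumption.

import random

def split_list(items, ratio):
    """Hypothetical helper: split items into two lists, the first holding
    roughly `ratio` of the (shuffled) elements."""
    items = list(items)
    random.shuffle(items)        # assumption: the original may split in order instead
    cut = int(len(items) * ratio)
    return items[:cut], items[cut:]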
Example #4
def main(_):
    count = []
    word2idx = {}

    train_data = read_data('%s/%s.train.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)
    valid_data = read_data('%s/%s.valid.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)
    test_data = read_data('%s/%s.test.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)

    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
    FLAGS.nwords = len(word2idx)

    pp.pprint(tf.app.flags.FLAGS.__flags)

    with tf.Session() as sess:
        model = MemN2N(FLAGS, sess)
        model.build_model()
        model.run(train_data, valid_data)
Example #5
def main():
    dataFile = "/home/laga/uni/kurse/2013/ss/dist_semantik/hausarbeit/nmr_relations_5class.data"
    # only nouns
    words = data.read_data(dataFile)
    # build vocabulary
    vocab = set()
    concepts = set()
    for word in words:
        vocab.update(wn.get_hypernyms(word["firstWordSynset"], 7))
        concepts.add(word["firstWordSynset"])
        concepts.add(word["secondWordSynset"])
        vocab.update(wn.get_hypernyms(word["secondWordSynset"], 7))
    s = sorted(vocab)
    print s 
    indices = {}
    for x in range(0, len(s)):
        indices[s[x]] = x
    print indices
    vocabSize = len(indices)
    print "WN vocab size: %s" % len(indices)

    failed = set()
    # download image for concepts
    for w in concepts:
        ret = imgnet.download_image_urls(w, "/home/laga/uni/kurse/2013/ss/dist_semantik/hausarbeit/imgnet-data/")
        #time.sleep(0.3)
        if not ret:
            failed.add(w)

    print "Failed to download img for synsets: %s" % failed

    rep = {}
    for word in words:
        vector = [0 for x in xrange(0, 2 * vocabSize)]
        w1 = word["firstWordSynset"]
        w2 = word["secondWordSynset"]
        if w1 in failed or w2 in failed:
            print "Skipping current word, one of synsets does not have image: %s" % word
            continue
        rep[(w1,w2)] = vector
        for w in wn.get_hypernyms(w1, 7):
            idx = indices[w]
            vector[idx] = 1
        for w in wn.get_hypernyms(w2, 7):
            idx = indices[w]
            vector[idx] = 1
        vector.append(word["relation"])
    fh = open("wordnet.data", "w")
    for (key,val) in rep.iteritems():
        fh.write(key[0].name)
        fh.write(",")
        fh.write(key[1].name)
        fh.write(",")
        for f in val:
            fh.write(str(f))
            fh.write(",")
        fh.write("\n")
    fh.close()
Example #6
def main(_):
    count = []
    word2idx = {}

    train_data = read_data('%s/%s.train.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)
    valid_data = read_data('%s/%s.valid.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)
    test_data = read_data('%s/%s.test.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)

    FLAGS.nwords = len(word2idx)

    pp.pprint(flags.FLAGS.__flags)

    with tf.Session() as sess:
        model = MemN2N(sess, **(flags.FLAGS.__flags))
        model.build_model()

        if FLAGS.is_test:
            model.run(valid_data, test_data)
        else:
            model.run(train_data, valid_data)
Example #7
def data_processing_model():
    init_data = read_data("../data/raw/heart.csv")
    train_params = read_training_pipeline_params("../configs/config_lr.yml")
    classifier = Classifier(train_params.classifier_params,
                            train_params.model_type)
    pipeline = DataProcessingPipeline(
        train_params.feature_params.categorical_features,
        train_params.feature_params.numerical_features)
    pipeline.fit(init_data)
    transformed_data = pipeline.transform(init_data)
    classifier.fit(transformed_data, init_data['target'].values)
    return pipeline, classifier
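
A sketch of how the fitted pipeline and classifier returned above might be used to score new rows. The file name is hypothetical, and it assumes the Classifier wrapper exposes a scikit-learn-style predict method.

pipeline, classifier = data_processing_model()

new_data = read_data("../data/raw/heart_new.csv")   # hypothetical file
features = pipeline.transform(new_data)
predictions = classifier.predict(features)          # assumes a predict() on the wrapper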
Example #8
def get_k_day_return(code,
                     days=5,
                     start_date="2012-01-01",
                     end_date="2017-01-01"):
    """
    Get the k-day return, computed here via pct_change as (today's close - close k days ago) / close k days ago.
    """
    fname = data.get_filename(code)
    df = data.read_data(fname)
    df = df[df.index >= start_date]
    factor_name = "%d-day return" % (days)
    df[factor_name] = df['close'].pct_change(periods=days)
    return df[factor_name]
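
pct_change(periods=days) divides by the close from k days earlier, i.e. (close_t - close_{t-k}) / close_{t-k}. A tiny self-contained check with made-up prices:

import pandas as pd

close = pd.Series([10.0, 10.5, 11.0, 12.0, 12.5, 13.0])
k_day_return = close.pct_change(periods=5)
# last value: (13.0 - 10.0) / 10.0 = 0.3
assert abs(k_day_return.iloc[-1] - 0.3) < 1e-12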
Example #9
def main(verbose, path):
    ##
    # Function that read all data
    ##
    # read all arguments
    if verbose:
        print('###\tStart reading data')
    start = timer()
    train, validation, test = dt.read_data(path)
    end = timer()
    if verbose:
        print('###\tTime for reading data: %.2f sec' % (end - start))
    return train, validation, test, path
Example #10
def main(_):
    word2idx = {}
    max_words = 0
    max_sentences = 0

    if not os.path.exists(FLAGS.checkpoint_dir):
        os.makedirs(FLAGS.checkpoint_dir)

    train_stories, train_questions, max_words, max_sentences = read_data(
        '{}/qa{}_train.txt'.format(FLAGS.data_dir, FLAGS.babi_task), word2idx,
        max_words, max_sentences)
    valid_stories, valid_questions, max_words, max_sentences = read_data(
        '{}/qa{}_valid.txt'.format(FLAGS.data_dir, FLAGS.babi_task), word2idx,
        max_words, max_sentences)
    test_stories, test_questions, max_words, max_sentences = read_data(
        '{}/qa{}_test.txt'.format(FLAGS.data_dir, FLAGS.babi_task), word2idx,
        max_words, max_sentences)

    pad_data(train_stories, train_questions, max_words, max_sentences)
    pad_data(valid_stories, valid_questions, max_words, max_sentences)
    pad_data(test_stories, test_questions, max_words, max_sentences)

    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
    FLAGS.nwords = len(word2idx)
    FLAGS.max_words = max_words
    FLAGS.max_sentences = max_sentences

    pp.pprint(flags.FLAGS.__flags)

    with tf.Session() as sess:
        model = MemN2N(FLAGS, sess)
        model.build_model()

        if FLAGS.is_test:
            model.run(valid_stories, valid_questions, test_stories,
                      test_questions)
        else:
            model.run(train_stories, train_questions, valid_stories,
                      valid_questions)
Example #11
def main():

    #tweets,labels=data.read_data("../data/dataset_terremoto_iquique_2014.csv")
    tweets, labels = data.read_data(
        "../data/tweets-iquique-2014-tipo-informacion.csv")
    processed_tweets = process_tweet.process_tweets(tweets)
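    # NOTE: processed_tweets is built here but not used below; the split
    # operates on the raw tweets.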

    X_train, X_test, y_train, y_test = train_test_split(tweets,
                                                        labels,
                                                        test_size=0.30,
                                                        random_state=42)

    classifier(X_train, y_train, X_test, y_test)
Example #12
def prepare_data():
    '''
    Prepare Data:
    1. read in audio meta data
    2. set data loaders for training
    3. save categorical look up tabels
    '''

    train_data, valid_data = data.read_data(data.path)
    train_loader, valid_loader = data.gen_data_loader(train_data, valid_data)
    utils.save_cat_idx(train_data, 'models/idx2cat.pkl')

    return train_loader, valid_loader
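
A hedged usage sketch of the loaders returned above, assuming they are standard PyTorch DataLoaders yielding (inputs, labels) batches; that batch structure is a guess about this project.

train_loader, valid_loader = prepare_data()

for batch in train_loader:
    inputs, labels = batch             # assumed batch structure
    # forward pass, loss and optimizer step would go here
    break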
Example #13
def do_retrieval_experiments(
        descriptions='air/problem_descriptions',
        solutions='air/solutions',
        graph_types=['co-occurrence', 'dependency', 'random'],
        use_frequency=True):
    """
    Experiment used for comparative evaluation of different network
    representations on the retrieval task.

    Toggle comparison with frequency-based methods using *use_frequency*.
    """
    results = {
        '_solutions': solutions,
        '_descriptions': descriptions,
        '_evaluation': 'retrieval'
    }

    print '> Evaluation type: retrieval'
    print '> Reading cases..'
    descriptions_path = '../data/' + descriptions
    descriptiondata = data.read_data(descriptions_path, graph_types)

    solutions_path = '../data/' + solutions + '_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Evaluating..'
    for gtype in graph_types:
        print '   ', gtype
        docs, labels = descriptiondata[gtype]
        graphs = graph_representation.create_graphs(docs, gtype)
        results[gtype] = {}
        for metric in graph_representation.get_metrics():
            print '    -', metric
            vectors = graph_representation.graphs_to_vectors(graphs, metric)
            results[gtype][metric] = evaluation.evaluate_retrieval(
                vectors, solution_vectors)
    if use_frequency:
        print '    frequency'
        results['freq'] = {}
        for metric in freq_representation.get_metrics():
            print '    -', metric
            docs, labels = data.read_files(descriptions_path + '_preprocessed')
            vectors = freq_representation.text_to_vector(docs, metric)
            results['freq'][metric] = evaluation.evaluate_retrieval(
                vectors, solution_vectors)

    print
    pp.pprint(results)
    return results
Example #14
def main(_):
    count = []
    word2idx = {}

    if not os.path.exists(FLAGS.checkpoint_dir):
        os.makedirs(FLAGS.checkpoint_dir)

    if FLAGS.hdfs:
        write_data_to_local(FLAGS.pai_data_dir, FLAGS.data_dir,
                            FLAGS.data_name + '.train.txt')
        write_data_to_local(FLAGS.pai_data_dir, FLAGS.data_dir,
                            FLAGS.data_name + '.valid.txt')
        write_data_to_local(FLAGS.pai_data_dir, FLAGS.data_dir,
                            FLAGS.data_name + '.test.txt')

    train_data = read_data(
        os.path.join(FLAGS.data_dir, FLAGS.data_name + '.train.txt'), count,
        word2idx, FLAGS.hdfs)
    valid_data = read_data(
        os.path.join(FLAGS.data_dir, FLAGS.data_name + '.valid.txt'), count,
        word2idx, FLAGS.hdfs)
    test_data = read_data(
        os.path.join(FLAGS.data_dir, FLAGS.data_name + '.test.txt'), count,
        word2idx, FLAGS.hdfs)

    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
    FLAGS.nwords = len(word2idx)

    pp.pprint(flags.FLAGS.__flags)

    with tf.Session() as sess:
        model = MemN2N(FLAGS, sess)
        model.build_model()

        if FLAGS.is_test:
            model.run(valid_data, test_data)
        else:
            model.run(train_data, valid_data)
Example #15
def main(_):
    source_count = []
    source_word2idx = {}

    train_data = read_data(FLAGS.train_data, source_count, source_word2idx)
    test_data = read_data(FLAGS.test_data, source_count, source_word2idx)

    FLAGS.pad_idx = source_word2idx['<pad>']
    FLAGS.nwords = len(source_word2idx)
    FLAGS.mem_size = train_data[4] if train_data[4] > test_data[4] else test_data[4]

    pp.pprint(flags.FLAGS.__flags)

    print('loading pre-trained word vectors...')
    FLAGS.pre_trained_context_wt = init_word_embeddings(source_word2idx)
    # pad idx has to be 0
    FLAGS.pre_trained_context_wt[FLAGS.pad_idx, :] = 0
    # FLAGS.pre_trained_target_wt = init_word_embeddings(target_word2idx)

    with tf.Session() as sess:
        model = MemN2N(FLAGS, sess)
        model.build_model()
        model.run(train_data, test_data)
Example #16
def get_k_day_average_return(code,
                             days=5,
                             start_date="2012-01-01",
                             end_date="2017-01-01"):
    """
    Get the k-day average return.
    """
    fname = data.get_filename(code)
    df = data.read_data(fname)
    df = df[df.index >= start_date]
    factor_name = "%d-day average return" % (days)
    df["return"] = df['close'].pct_change()
    df[factor_name] = df["return"].rolling(window=days).mean()
    return df[factor_name]
Example #17
def get_k_day_volatility(code,
                         days,
                         start_date,
                         end_date):
    """
    Get the k-day volatility.
    """
    fname = data.get_filename(code)
    df = data.read_data(fname)
    df = df[df.index >= start_date]
    factor_name = "%d-day volatility"%(days)
    df["return"] = df["close"].pct_change()
    df[factor_name] = df["return"].rolling(window=days).std() * np.sqrt(243)
    return df[factor_name]
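
The np.sqrt(243) factor annualizes the daily standard deviation, 243 being (presumably) the number of trading days per year in this market. A quick worked check: a 1% daily volatility annualizes to roughly 15.6%.

import numpy as np

daily_vol = 0.01
annualized_vol = daily_vol * np.sqrt(243)
print(round(annualized_vol, 4))    # 0.1559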
Example #18
def main(_):
    source_count = []
    source_word2idx = {}

    if os.path.isfile("abc") and os.path.isfile("def"):
        train_data = pickle.load(open("abc", "rb"))
        test_data = pickle.load(open("def", "rb"))
    else:
        train_data = read_data(FLAGS.train_data, source_count, source_word2idx)
        test_data = read_data(FLAGS.test_data, source_count, source_word2idx)
        pickle.dump(train_data, open("abc", "wb"))
        pickle.dump(test_data, open("def", "wb"))

    # test_data = read_data("./data/cust_sent.xml", source_count, source_word2idx)

    source_word2idx = train_data[5]

    FLAGS.pad_idx = source_word2idx['<pad>']
    FLAGS.nwords = len(source_word2idx)
    FLAGS.mem_size = train_data[
        4] if train_data[4] > test_data[4] else test_data[4]

    pp.pprint(flags.FLAGS.__flags)

    print('loading pre-trained word vectors...')
    FLAGS.pre_trained_context_wt = init_word_embeddings(source_word2idx)
    # pad idx has to be 0
    FLAGS.pre_trained_context_wt[FLAGS.pad_idx, :] = 0
    # FLAGS.pre_trained_target_wt = init_word_embeddings(target_word2idx)
    # for i in range(15):
    #     print " Source_data: {}, target_data: {}, Label: {}, Og_source_data: {}, og_target_data:{}".format(test_data[0][i],test_data[2][i],test_data[3][i],test_data[6][i],test_data[7][i])
    with tf.Session() as sess:
        model = MemN2N(FLAGS, sess)
        model.build_model()
        # print(np.array(train_data).shape)

        model.run(train_data, test_data)
Example #19
def train_and_predict_by_self_realization(train_data_path, valid_data_path,
                                          vocab_path):
    """
    FastText implemented from scratch in PyTorch. Reaches its best result at epoch 14, with a test-set accuracy of 0.723.
    :param train_data_path: path to the training set
    :param valid_data_path: path to the validation (test) set
    :param vocab_path: path to the vocabulary file
    """
    tokenizer, label_map, data = read_data(train_data_path, valid_data_path,
                                           vocab_path)
    model = self_fasttext(vocab_size=tokenizer.get_vocab_size(),
                          embedding_size=32,
                          label_size=len(label_map))
    learner = Learner(data, model)
    learner.fit(epochs=20, init_lr=0.001, opt_fn=optim.Adam)
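
A usage example with placeholder arguments; the file paths below are hypothetical, not taken from the project.

train_and_predict_by_self_realization(
    train_data_path='data/train.txt',   # hypothetical paths
    valid_data_path='data/valid.txt',
    vocab_path='data/vocab.txt')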
Example #20
def main():
    df = data.read_data('Fri', 'subset-100').set_index('Timestamp')
    time = pd.Timestamp(datetime(2014, 6, 6, 12))
    df2 = get_positions_at(df, time)

    im = data.read_image('Grey')
    fig, ax = plt.subplots()
    plt.imshow(im, extent=[0, 100, 0, 100])
    plt.plot()
    plt.show()
    ani = animation.FuncAnimation(fig,
                                  animate,
                                  blit=False,
                                  interval=10,
                                  repeat=False)
Example #21
def main(_):
    count = []
    word2idx = {}

    if not os.path.exists(FLAGS.checkpoint_dir): # if the checkpoint directory does not exist,
        os.makedirs(FLAGS.checkpoint_dir) # create it

    train_data = read_data('%s/%s.train.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)
    valid_data = read_data('%s/%s.valid.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)
    test_data = read_data('%s/%s.test.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)

    idx2word = dict(zip(word2idx.values(), word2idx.keys())) # swapping word2idx's keys and values gives the idx2word dict!
    FLAGS.nwords = len(word2idx)

    pp.pprint(flags.FLAGS.__flags)

    with tf.Session() as sess:
        model = MemN2N(FLAGS, sess)
        model.build_model()

        if FLAGS.is_test:
            model.run(valid_data, test_data)
        else:
            model.run(train_data, valid_data)
Example #22
def main(_):
    count = []
    word2idx = {}

    train_data = read_data(
        '%s/%s.train.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)
    valid_data = read_data(
        '%s/%s.valid.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)
    test_data = read_data('%s/%s.test.txt' % (FLAGS.data_dir, FLAGS.data_name),
                          count, word2idx)

    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
    FLAGS.nwords = len(word2idx)

    pp.pprint(flags.FLAGS.__flags)

    with tf.Session() as sess:
        model = MemN2N(FLAGS, sess)
        model.build_model()

        if FLAGS.is_test:
            model.run(valid_data, test_data)
        else:
            model.run(train_data, valid_data)
Example #23
def update_frequency_factor_data(asset_df, frequency='m'):
    """
    Extract the factor data at the given frequency and save it, to speed up later reads.
    """
    codes = asset_df.index
    for code in codes:
        fname = data.get_factor_filename(code, frequency='d')
        temp = data.read_data(fname)
        if frequency == "m":
            temp = temp.resample('M').last()
        if frequency == "y":
            temp = temp.resample('A').last()

        fname = data.get_factor_filename(code, frequency=frequency)
        temp.to_excel(fname)
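
resample('M').last() and resample('A').last() keep the last daily row of each month and year respectively; a small self-contained illustration with synthetic data:

import pandas as pd

idx = pd.date_range('2020-01-01', '2020-03-31', freq='D')
daily = pd.DataFrame({'close': range(len(idx))}, index=idx)

monthly = daily.resample('M').last()   # one row per month-end
print(monthly)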
Example #24
def main():
    """Entrypoint"""
    dataset = read_data('input.weight.txt')
    tasks = [
        calc(data[0], [(data[i], data[i + 1]) for i in range(1, len(data), 2)])
        for data in dataset
    ]
    width = max(len(x) for x, _ in tasks)
    collect = [
        pad(x, (0, width - len(x)), mode='constant', constant_values=nan)
        for x, _ in tasks
    ]
    print('')
    print('Output:')
    print(mat(collect))
Example #25
def main():
    """Entrypoint"""
    raw = mat(read_data('input.super.txt'))
    count, diff = 0, 1
    data = raw
    while count < 100 and diff > 0.5 * (10**-4):
        new = data * raw
        count += 1
        diff = max([
            row_diff([col for col in row if col != 0]) for row in new.tolist()
        ])  # pylint: disable=E1101
        if isinf(diff):
            break
        data = new
    print(data)
Example #26
def get_asset_factor_data(asset_df, factors, frequency='m'):
    """
    Get the asset's price and factor data.

    frequency: m/y
    """
    print("getting asset factor data...")
    codes = asset_df.index
    dic = {}
    for code in codes:
        fname = data.get_factor_filename(code, frequency=frequency)
        temp = data.read_data(fname)
        temp = temp[factors + ["close"]]
        dic[code] = temp
    pnl = pd.Panel(dic)
    return pnl
Example #27
def main(args):
    logging.info('Running trainer with {}'.format(args))
    language = led_parser.propositional_language()
    parser = data.Parser(language)
    n_ops = len(language.symbols)

    # construct a pwn using a treenn with a sat3 cell
    sat3 = csat.Sat3Cell(n_ops, args.num_units, args.batch_size, args.n_worlds)
    nn = treenn.TreeNN(sat3, parser, args.batch_size)
    possibleworldsnet = pwn.PossibleWorlds(nn, args.n_worlds, args.num_units)

    logging.info('N variables = {}'.format(
        np.sum([np.prod(var.shape) for var in possibleworldsnet.variables])))
    opt = tf.train.AdamOptimizer()

    checkpoint = tf.contrib.eager.Checkpoint(
        **{var.name: var
           for var in possibleworldsnet.variables})
    writer = tf.contrib.summary.create_file_writer(args.logdir)
    writer.set_as_default()

    for e in range(args.epochs):
        # Train
        for A, B, E in data.batch_data(
                data.read_data(os.path.join(args.datadir, 'train.txt')),
                args.batch_size):
            loss, grads, p = compute_step(possibleworldsnet, A, B, E)
            gnvs = zip(grads, possibleworldsnet.variables)
            step = tf.train.get_or_create_global_step()
            opt.apply_gradients(gnvs, global_step=step)

            logging.info('step: {} loss: {}'.format(step.numpy(),
                                                    tf.reduce_mean(loss)))

            with tf.contrib.summary.record_summaries_every_n_global_steps(10):
                tf.contrib.summary.scalar('loss', loss)
                tf.contrib.summary.scalar('acc', accuracy(p, E))

        # Evaluate
        for test_name, test_set in data.fetch_test_sets(
                args.datadir, args.batch_size):
            logging.info('Eval: {}'.format(test_name))
            acc = np.mean(
                [accuracy(possibleworldsnet(A, B), E) for A, B, E in test_set])
            tf.contrib.summary.scalar(test_name, acc)

        checkpoint.save(os.path.join(args.logdir, 'ckpt{}'.format(e)))
Example #28
def run():
    """
    Run the experiment.
    """
    is_ptr = False
    np.random.seed(RANDOM_SEED)
    max_val, max_length, pairs = read_data(name="test")
    np.random.shuffle(pairs)
    training_pairs = [tensors_from_pair(pair) for pair in pairs]

    data_dim = max_val + 1
    hidden_dim = embedding_dim = 256

    encoder = Encoder(input_dim=data_dim,
                      embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim).to(device)
    if is_ptr:
        decoder = PtrDecoder(output_dim=data_dim,
                             embedding_dim=embedding_dim,
                             hidden_dim=hidden_dim).to(device)
    else:
        decoder = AttnDecoder(output_dim=data_dim,
                              embedding_dim=embedding_dim,
                              hidden_dim=hidden_dim).to(device)

    checkpoint = load_checkpoint("ptr" if is_ptr else "vanilla")
    if checkpoint:
        encoder.load_state_dict(checkpoint["encoder"])
        decoder.load_state_dict(checkpoint["decoder"])
    else:
        print("Count not find checkpoint file.")

    permutation_count, nondecreasing_count = 0, 0
    for i in range(len(training_pairs)):
        input_tensor, target_tensor = training_pairs[i]
        output_tensor = evaluate(encoder=encoder,
                                 decoder=decoder,
                                 input_tensor=training_pairs[i][0],
                                 is_ptr=is_ptr)
        target, output = list(np.asarray(
            input_tensor.data).squeeze()), output_tensor[:-1]
        if is_permutation(target, output):
            permutation_count += 1
        if nondecreasing(output) == 0:
            nondecreasing_count += 1
    print("Permutation: %s" % (permutation_count / len(training_pairs)))
    print("Nondecreasing: %s" % (nondecreasing_count / len(training_pairs)))
Example #29
def train_and_predict_by_textcnn(train_data_path, valid_data_path, vocab_path):
    """
    TextCNN model implemented in PyTorch. After 14 epochs the test-set accuracy is 0.702.
    :param train_data_path: path to the training set
    :param valid_data_path: path to the validation (test) set
    :param vocab_path: path to the vocabulary file
    """
    tokenizer, label_map, data = read_data(train_data_path, valid_data_path, vocab_path)
    model = TextCNN(vocab_size=tokenizer.get_vocab_size(),
                    embedding_size=32,
                    max_seq_length=48,
                    kernel_num=32,
                    kernel_sizes=(2,3,4,5),
                    dropout=0.3,
                    label_size=len(label_map))
    learner = Learner(data, model)
    learner.fit(epochs=20, init_lr=0.001, opt_fn=optim.Adam)
Example #30
def main():
    dataset = 'abalone'
    data_path = '../data'
    out_path = '../out/rf/models'
    model_path = f'{out_path}/{dataset}.pkl'
    seed = 2019

    np.random.seed(seed)

    model = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    data_dict = data.read_data(data_path, dataset, validation=False)
    trn_x = data_dict['trn_x']
    trn_y = data_dict['trn_y']
    model.fit(trn_x, trn_y)

    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    joblib.dump(model, model_path)
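
A sketch of loading the saved forest back and scoring it on held-out data. The 'tst_x'/'tst_y' key names are an assumption about what data.read_data returns; only 'trn_x'/'trn_y' appear in the code above.

import joblib

import data  # the project's data module, as used above

model = joblib.load('../out/rf/models/abalone.pkl')
data_dict = data.read_data('../data', 'abalone', validation=False)
tst_x, tst_y = data_dict['tst_x'], data_dict['tst_y']   # assumed key names
print('test accuracy: %.4f' % model.score(tst_x, tst_y))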
Example #31
def main():
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    df = data.read_data('Fri', 'subset-300')
    # df = df[(df.id % 3) == 0]
    p_id = 391338
    path = df[df['id'] == p_id]
    # plot_trajectories(ax=ax, df=path, show_map=True, colors='#2FB787', line_kw=dict(lw=2))
    plot_trajectories(ax=ax, df=path, show_map=True, line_kw=dict(lw=2))

    # for angle in range(0, 360, 10):
    #     print(angle)
    #     ax.view_init(30, angle)
    #     plt.savefig('Data61 angle {}.png'.format(angle), bbox_inches='tight')

    plt.show()
Example #32
def do_retrieval_experiments(descriptions='air/problem_descriptions',
                                solutions='air/solutions',
                                graph_types=['co-occurrence','dependency','random'],
                                use_frequency=True):
    """
    Experiment used for comparative evaluation of different network
    representations on the retrieval task.

    Toggle comparison with frequency-based methods using *use_frequency*.
    """
    results = {'_solutions':solutions,
                '_descriptions':descriptions,
                '_evaluation':'retrieval'}

    print '> Evaluation type: retrieval'
    print '> Reading cases..'
    descriptions_path = '../data/'+descriptions
    descriptiondata = data.read_data(descriptions_path, graph_types)

    solutions_path = '../data/'+solutions+'_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Evaluating..'
    for gtype in graph_types:
        print '   ',gtype
        docs, labels = descriptiondata[gtype]
        graphs = graph_representation.create_graphs(docs, gtype)
        results[gtype] = {}
        for metric in graph_representation.get_metrics():
            print '    -', metric
            vectors = graph_representation.graphs_to_vectors(graphs, metric)
            results[gtype][metric] = evaluation.evaluate_retrieval(vectors, solution_vectors)
    if use_frequency:
        print '    frequency'
        results['freq'] = {}
        for metric in freq_representation.get_metrics():
            print '    -', metric
            docs, labels = data.read_files(descriptions_path+'_preprocessed')
            vectors = freq_representation.text_to_vector(docs, metric)
            results['freq'][metric] = evaluation.evaluate_retrieval(vectors, solution_vectors)

    print
    pp.pprint(results)
    return results
Example #33
def predict_pipeline(params: PredictionPipelineParams):
    logger.info(f"Start prediction.")

    data = read_data(params.data_path)
    logger.info(f"Data loaded. Raw data shape: {data.shape}")

    pipeline = load_transformer(params.transformer_path)
    logger.info(f"Transformer loaded: {pipeline}")

    model = load_model(params.model_path)
    logger.info(f"Model loaded: {model}")

    train_features = make_features(pipeline, data)
    logger.info(f"Test features shape: {train_features.shape}")

    predictions = predict_model(train_features, model)
    predictions_path = save_prediction(predictions, params.output_path)
    logger.info(f"Predictions saved in {predictions_path}")
Example #34
def main():
    path = os.getcwd()
    path_save, path_source = os.path.split(path)
    path_save = path_save + "/Results"
    if not os.path.exists(path_save):
        os.makedirs(path_save)

    #tweets,labels=data.read_data("../data/dataset_terremoto_iquique_2014.csv")
    #tweets,labels = data.read_data("../data/tweets-iquique-2014-tipo-informacion.csv")
    tweets_data = data.read_data("../data/results-random-inf.csv")
    #print(tweets_data[13])

    #group = 0
    for tweet_info in tweets_data:
        processed_tweets = process_tweet.process_tweets(
            tweets_data[tweet_info]["tweets"])

    # THE "ACTIVE LEARNING" PROCESS STARTS HERE

    # group 0 is trained on its own first; then, inside the loop, the remaining accumulated groups are trained
    print("entrenando grupo 0")
    best_score_svm, best_score_dt = data_split.train_cv_grid(tweets_data[0]["tweets"], tweets_data[0]["labels"], \
                path_save, 0)

    tweets_data_new = {}
    dict_new = {}

    for group in tweets_data:
        for key in tweets_data[0].keys():
            if group == 0:
                dict_new[key] = tweets_data[group][key] + tweets_data[group +
                                                                      1][key]
                tweets_data_new[group] = dict_new
            else:
                if (group + 1) in tweets_data:
                    dict_new[key] = dict_new[key] + tweets_data[group + 1][key]
                    tweets_data_new[group] = dict_new
        #print(dict_new)
        #X_train, X_test, y_train, y_test = train_test_split(dict_new["tweets"],dict_new["labels"], stratify = dict_new["labels"], \
        #                                                         test_size=0.20, random_state=199993)
        if group > 0:
            print("entrenando con grupo: ", group)
            best_score_svm, best_score_dt = data_split.train_cv_grid(dict_new["tweets"], dict_new["labels"], \
               path_save, group)
Example #35
def run():
    """
    Run the experiment.
    """
    name = "train"
    is_ptr = True
    hidden_dim = embedding_dim = 256
    n_epochs = 1
    grad_clip = 2
    teacher_force_ratio = 0.5
    optimizer = optim.Adam
    optimizer_params = {}

    max_val, max_length, pairs = read_data(name)

    set_max_length(max_length)
    training_pairs = [tensors_from_pair(pair) for pair in pairs]

    data_dim = max_val + 1
    encoder = Encoder(input_dim=data_dim,
                      embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim).to(device)
    if is_ptr:
        decoder = PtrDecoder(output_dim=data_dim,
                             embedding_dim=embedding_dim,
                             hidden_dim=hidden_dim).to(device)
    else:
        decoder = AttnDecoder(output_dim=data_dim,
                              embedding_dim=embedding_dim,
                              hidden_dim=hidden_dim).to(device)

    train(encoder=encoder,
          decoder=decoder,
          optim=optimizer,
          optim_params=optimizer_params,
          weight_init=weight_init,
          grad_clip=grad_clip,
          is_ptr=True,
          training_pairs=training_pairs,
          n_epochs=n_epochs,
          teacher_force_ratio=teacher_force_ratio,
          print_every=50,
          plot_every=50,
          save_every=100)
Example #36
def main() -> None:
    """
    main entry point for program
    """
    strategy = initialize()

    # read in the data (tokenized with tf-idf)
    x_train_padded, x_valid_padded, y_train, y_valid, max_len, word_indexes = read_data()

    # run base models
    # simple RNN model
    simple_rnn(strategy, x_train_padded, x_valid_padded, y_train,
               y_valid, max_len, len(word_indexes) + 1, EMBEDDING_SIZE_Y, EPOCHS_BASE)
    # create embeddings layer
    embeddings_output = build_embeddings(EMBEDDING_SIZE_Y, word_indexes)
    # train and run LSTM model
    run_lstm(strategy, x_train_padded, x_valid_padded, y_train, y_valid,
             max_len, len(word_indexes) + 1, EMBEDDING_SIZE_Y, embeddings_output, EPOCHS_BASE)
    # train and run GRU model
    run_gru(strategy, x_train_padded, x_valid_padded, y_train, y_valid,
            max_len, len(word_indexes) + 1, EMBEDDING_SIZE_Y, embeddings_output, EPOCHS_BASE)
    # train and run bidirectional LSTM model
    run_rnn(strategy, x_train_padded, x_valid_padded, y_train, y_valid,
            max_len, len(word_indexes) + 1, EMBEDDING_SIZE_Y, embeddings_output, EPOCHS_BASE)

    # max length for transformers models (different from base models above)
    attention_max_len = 192

    # read in attention data (into tensorflow datasets)
    x_train, x_valid, y_train, y_valid, train_dataset, \
        valid_dataset, test_dataset, batch_size = read_data_attention(
            strategy, attention_max_len)
    # build, train and test distilBERT model
    run_distilibert(strategy, x_train, x_valid, y_train, y_valid,
                    train_dataset, valid_dataset, test_dataset, attention_max_len, EPOCHS_TRANSFORMERS, batch_size)

    # read second dataset (tokenized with xlm roberta tokenizer)
    x_train, x_valid, y_train, y_valid, train_dataset, \
        valid_dataset, test_dataset, batch_size = read_data_attention(
            strategy, attention_max_len)
    # build, train, and run xlm roberta
    run_roberta(strategy, x_train, x_valid, y_train, y_valid,
                train_dataset, valid_dataset, test_dataset, attention_max_len, EPOCHS_TRANSFORMERS, batch_size)
Example #37
def doit():
    args = command_parser()
    print(args)
    states = args.states
    variables = args.vars
    datfilename = args.data[0] + ".csv"
    startdate = "2020-09-01"
    dim = [4, 2.5]
    if args.data:
        print("covid: saving data in " + datfilename)
        df = data.read_data(startdate, states, variables)
        df.to_csv(datfilename)
    if args.graph:
        graphfilename = args.graph[0]
        print("covid: generating graph from " + datfilename + " into " +
              graphfilename)
        df = (pd.read_csv(datfilename,
                          parse_dates=['date']).query("state in @states"))
        graph.graph_b(df, states, variables, graphfilename, dim)
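
doit expects command_parser to return args with states, vars, data and graph attributes. Below is a hedged argparse sketch consistent with those accesses; flag names, nargs choices and defaults are assumptions.

import argparse

def command_parser():
    """Hypothetical CLI parser matching the attribute accesses in doit()."""
    parser = argparse.ArgumentParser(description='COVID data/graph tool')
    parser.add_argument('--states', nargs='+', default=[], help='state codes to include')
    parser.add_argument('--vars', nargs='+', default=[], help='variables to fetch/plot')
    parser.add_argument('--data', nargs=1, help='basename for the CSV to write')
    parser.add_argument('--graph', nargs=1, help='output graph file name')
    return parser.parse_args()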
Example #38
def test_batch_image():
    data = read_data(gt_path='E:\All_My_Files\All_My_DataSets\derain_datasets\Derain_ML_Proj\\final_testset\ground_truth\\',
              rain_path='E:\All_My_Files\All_My_DataSets\derain_datasets\Derain_ML_Proj\\final_testset\\rainy_image\\',
              num_channel=3,
              size_input=128,
              batch_size=1
              )
    rain_image, gt_image = data
    out1 = torch.squeeze(rain_image)
    out1 = norm_range(out1, None)
    out1 = out1.mul(255).clamp(0, 255).byte().permute(1, 2, 0).numpy()  # W H C
    imag1 = Image.fromarray(out1)  # W H C

    out2 = torch.squeeze(gt_image)
    out2 = norm_range(out2, None)
    out2 = out2.mul(255).clamp(0, 255).byte().permute(1, 2, 0).numpy()  # W H C
    imag2 = Image.fromarray(out2)  # W H C
    imag1.save('C:\\Users\Administrator\Desktop\Derain_ML_proj_v2.5\\test\output\\rain.jpg')
    imag2.save('C:\\Users\Administrator\Desktop\Derain_ML_proj_v2.5\\test\output\\gt.jpg')
Example #39
File: reflex.py Project: clu8/2048
def train(feature_transform):
    inputs, labels = data.read_data()

    # shuffle data
    random.seed(1234)
    shuffled = zip(inputs, labels)
    random.shuffle(shuffled)
    inputs = numpy.asarray([example[0] for example in shuffled])
    labels = [numpy.asarray([1 if example[1] == label else 0 for example in shuffled]) for label in range(4)]

    if feature_transform:
        inputs = numpy.log2(inputs)

    training_steps = 20000

    x = [T.matrix('x{}'.format(label)) for label in range(4)]
    y = [T.vector('y{}'.format(label)) for label in range(4)]
    w = [theano.shared(numpy.random.randn(16), name='w{}'.format(label)) for label in range(4)]
    b = [theano.shared(0., name='b{}'.format(label)) for label in range(4)]

    # Construct Theano expression graph
    p_1 = [1 / (1 + T.exp(-T.dot(x[l], w[l]) - b[l])) for l in range(4)]   # Probability that target = 1
    xent = [-y[l] * T.log(p_1[l]) - (1-y[l]) * T.log(1-p_1[l]) for l in range(4)] # Cross-entropy loss function
    cost = [xent[l].mean() + 0.01 * (w[l] ** 2).sum() for l in range(4)] # The cost to minimize
    gwb = [T.grad(cost[l], [w[l], b[l]]) for l in range(4)]    # Compute the gradient of the cost

    train = [theano.function(
          inputs=[x[l],y[l]],
          outputs=[p_1[l], xent[l]],
          updates=((w[l], w[l] - 0.1 * gwb[l][0]), (b[l], b[l] - 0.1 * gwb[l][1]))) for l in range(4)]

    predict = [theano.function(inputs=[x[l]], outputs=p_1[l]) for l in range(4)]

    # Train
    for l in range(4):
        print('Training for label {}'.format(l))
        for i in range(training_steps):
            pred, err = train[l](inputs, labels[l])

    return predict
Example #40
def do_classification_experiments(dataset='tasa/TASA900',
                                    graph_types = ['co-occurrence','dependency','random'],
                                    use_frequency = True):
    """
    Experiment used for comparative evaluation of different network
    representations on classification.

    Toggle comparison with frequency-based methods using *use_frequency*.
    """
    results = {'_dataset':dataset,
                '_evaluation':'classification'}
    print '> Evaluation type: classification'
    print '> Reading data..', dataset
    corpus_path = '../data/'+dataset
    docdata = data.read_data(corpus_path, graph_types)

    print '> Evaluating..'
    for gtype in graph_types:
        print '   ',gtype
        documents, labels = docdata[gtype]
        graphs = graph_representation.create_graphs(documents, gtype)
        results[gtype] = {}
        for metric in graph_representation.get_metrics():
            print '    -', metric
            vectors = graph_representation.graphs_to_vectors(graphs, metric)
            results[gtype][metric] = evaluation.evaluate_classification(vectors, labels)
    if use_frequency:
        print '    frequency'
        results['freq'] = {}
        for metric in freq_representation.get_metrics():
            print '    -', metric
            documents, labels = data.read_files(corpus_path+'_preprocessed')
            vectors = freq_representation.text_to_vector(documents, metric)
            results['freq'][metric] = evaluation.evaluate_classification(vectors, labels)

    print
    pp.pprint(results)
    return results
Example #41
def main():
    name, print_names = parse_options(argv)
    results, features, feature_names = read_data(name)

    features = np.hstack((features, results.reshape((len(results), 1))))

    correlation = find_correlation(features)

    correlationorder = sorted(correlation,
                              key=lambda x: abs(correlation.get(x)[0]),
                              reverse=True)

    #print("Feature numbers")
    #print()
    #print("number\tname")
    #print("------\t----")
    #for i in range(len(feature_names)):
        #print("{}\t{}".format(i, feature_names[i]))

    #print()
    #print()
    #print("Correlations")
    #print()
    #print("ft1\tft2\tpearson")
    #print("---\t---\t-------")
    #for c in correlationorder:
        #print("{}\t{}\t{}".format(c[0], c[1], correlation[c][0]))

    for c in correlationorder:
        if print_names:
            print('"{}","{}",{}'.format(feature_names[c[0]],
                                        feature_names[c[1]],
                                        correlation[c][0]))
        else:
            print('{},{},{}'.format(c[0],
                                    c[1],
                                    correlation[c][0]))
Example #42
def main():
    relieff_file = open(RELIEFF_FILENAME, "r")
    relieff_features = parse_relieff_list(relieff_file.readlines())
    relieff_features.sort(key=lambda x: x[0], reverse=True)

    results, features, feature_names = read_data(FEATURE_FILENAME)

    #features = np.hstack((features, results.reshape((len(results), 1))))

    correlations = find_correlation(features)
    #print("\n".join("{}: {}".format(x, correlations[x]) for x in correlations))
    #return

    for threshold in frange(1.0, 0.0, -0.1):
        selected = select_features(features, relieff_features, correlations, threshold)

        print("============================================================")
        print("THRESHOLD =", threshold)
        print("Count:", len(selected))
        print("\n".join(["{} ({})".format(x, feature_names[x]) for x in selected]))

        print()
        print()
        print()
Example #43
File: reflex.py Project: clu8/2048
def train_with_val(learning_rate=0.13, n_epochs=3, batch_size=600):
    inputs, labels = data.read_data()

    # shuffle data
    random.seed(1234)
    shuffled = zip(inputs, labels)
    random.shuffle(shuffled)
    inputs = [example[0] for example in shuffled]
    labels = [example[1] for example in shuffled]

    def shared_dataset(data_x, data_y):
        shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX), borrow=True)
        shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX), borrow=True)
        return (shared_x, T.cast(shared_y, 'int32'))

    train_set_x, train_set_y = shared_dataset(inputs[:int(0.8 * len(inputs))], labels[:int(0.8 * len(inputs))])
    valid_set_x, valid_set_y = shared_dataset(inputs[int(0.8 * len(labels)):int(0.9 * len(inputs))], labels[int(0.8 * len(labels)):int(0.9 * len(inputs))])
    test_set_x, test_set_y = shared_dataset(inputs[int(0.9 * len(inputs)):], labels[int(0.9 * len(inputs)):])

    # train_set_x = numpy.asarray(inputs[:int(0.8 * len(inputs))])
    # train_set_y = numpy.asarray(inputs[:int(0.8 * len(labels))])
    # valid_set_x = numpy.asarray(inputs[int(0.8 * len(labels)):int(0.9 * len(labels))])
    # valid_set_y = numpy.asarray(inputs[int(0.8 * len(labels)):int(0.9 * len(labels))])
    # test_set_x = numpy.asarray(inputs[int(0.9 * len(inputs)):])
    # test_set_y = numpy.asarray(inputs[int(0.9 * len(labels)):])

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a
    # minibatch)
    x = T.matrix('x')  # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # compute the gradient of cost with respect to theta = (W,b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost, but in
    # the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    ###############
    # TRAIN MODEL #
    ###############
    print '... training the model'
    # early-stopping parameters
    patience = 5000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                                  # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                  # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    # test it on the test set

                    test_losses = [test_model(i)
                                   for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(
                        (
                            '     epoch %i, minibatch %i/%i, test error of'
                            ' best model %f %%'
                        ) %
                        (
                            epoch,
                            minibatch_index + 1,
                            n_train_batches,
                            test_score * 100.
                        )
                    )

                    # save the best model
                    with open(REFLEX_MODEL_PICKLE, 'w') as f:
                        cPickle.dump(classifier, f)

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(
        (
            'Optimization complete with best validation score of %f %%,'
            'with test performance %f %%'
        )
        % (best_validation_loss * 100., test_score * 100.)
    )
Example #44
import tensorflow as tf

from data import read_data
from model import MemN2N

count = []
word2idx = {}

train_data = read_data('data/ptb.train.txt', count, word2idx)
valid_data = read_data('data/ptb.valid.txt', count, word2idx)
test_data = read_data('data/ptb.test.txt', count, word2idx)

idx2word = dict(zip(word2idx.values(), word2idx.keys()))

params = {
    'show': True,
    'nhop': 3,
    'edim': 150,
    'lindim': 75,
    'mem_size': 100,
    'batch_size': 125,
    'max_grad_norm': 50,
    'init_hid': 0.01,
    'nwords': 10000,
}

if __name__ == '__main__':
    with tf.Session() as sess:
        model = MemN2N(params, sess)
        model.build_model()
        model.run(train_data, test_data, 100)
Example #45
                              (query),
                              sent.answer))

    return d


if __name__ == '__main__':
    import data
    vocab = collections.defaultdict(lambda: len(vocab))
#    data_dir = '/home/unno/qa/tasks_1-20_v1-2'
    data_dir = '../../data/tasks_1-20_v1-2'
    data_type = 'en'
    for data_id in range(1, 21):

        train_data = data.read_data(
            vocab,
            glob.glob('%s/%s/qa%d_*train.txt' % (data_dir, data_type, data_id))[0])
        test_data = data.read_data(
            vocab,
            glob.glob('%s/%s/qa%d_*test.txt' % (data_dir, data_type, data_id))[0])
        print('Training data: %d' % len(train_data))

        gpu = 0
        train_data = convert_data(train_data, gpu)
        test_data = convert_data(test_data, gpu)

        model = MemNN(20, len(vocab), 50)
        opt = optimizers.Adam()
        #opt = optimizers.SGD(lr=0.01)
        #opt.add_hook(chainer.optimizer.GradientClipping(40))
        batch_size = 100