Esempio n. 1
0
def data_gen(input_dir, output_dir, batch_size=16, embedding_dim=256,
             units=1024, test_size=0.2):
    """Prepare the shuffled, batched training dataset and its hyperparameters.

    Merges the data files with ``merging_datafile``, loads the tensors and
    language objects with ``load_dataset``, splits them into training and
    validation parts, prints a short summary, and builds a ``tf.data``
    pipeline over the training part. The validation part is only used for
    the printed summary; it is not returned.

    Args:
        input_dir: Forwarded to ``merging_datafile``.
        output_dir: Forwarded to ``merging_datafile``.
        batch_size: Number of examples per batch. Defaults to 16, the value
            that used to be hard-coded.
        embedding_dim: Returned unchanged for the caller. Defaults to 256.
        units: Returned unchanged for the caller. Defaults to 1024.
        test_size: Forwarded to ``train_test_split``. Defaults to 0.2.

    Returns:
        A 13-tuple, in this order: ``(dataset, vocab_inp_size,
        vocab_tar_size, embedding_dim, units, batch_size,
        example_input_batch, steps_per_epoch, targ_lang, max_length_targ,
        max_length_inp, inp_lang, targ_lang)``. ``targ_lang`` appears twice;
        the duplicate is kept so existing callers that unpack 13 values keep
        working.
    """
    merged_path = merging_datafile(input_dir, output_dir)

    input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(
        merged_path)

    # Maximum sequence lengths: the second dimension of each tensor.
    max_length_targ = target_tensor.shape[1]
    max_length_inp = input_tensor.shape[1]

    (input_tensor_train, input_tensor_val,
     target_tensor_train, target_tensor_val) = train_test_split(
         input_tensor, target_tensor, test_size=test_size)

    # Show the sizes of the four splits.
    print(len(input_tensor_train), len(target_tensor_train),
          len(input_tensor_val), len(target_tensor_val))

    print("Input Language; index to word mapping")
    convert(inp_lang, input_tensor_train[0])
    print()
    print("Target Language; index to word mapping")
    convert(targ_lang, target_tensor_train[0])

    # Shuffle buffer spans the entire training set.
    buffer_size = len(input_tensor_train)
    steps_per_epoch = len(input_tensor_train) // batch_size
    # +1 on top of the word_index size.
    # NOTE(review): presumably reserves index 0 for padding -- confirm.
    vocab_inp_size = len(inp_lang.word_index) + 1
    vocab_tar_size = len(targ_lang.word_index) + 1

    dataset = tf.data.Dataset.from_tensor_slices(
        (input_tensor_train, target_tensor_train)).shuffle(buffer_size)
    # drop_remainder=True discards a final batch smaller than batch_size.
    dataset = dataset.batch(batch_size, drop_remainder=True)
    # Only the input half of the first batch is handed back to the caller.
    example_input_batch, _ = next(iter(dataset))

    return (dataset, vocab_inp_size, vocab_tar_size, embedding_dim, units,
            batch_size, example_input_batch, steps_per_epoch, targ_lang,
            max_length_targ, max_length_inp, inp_lang, targ_lang)
Esempio n. 2
0
                    os.path.basename(train_tarball), n_units)))
    else:
        model.intersect_word2vec_format(pre_trained_path, binary=True)
        model.train(sentences,
                    total_words=None,
                    epochs=model.iter,
                    total_examples=model.corpus_count,
                    queue_factor=2,
                    report_delay=report_delay)
        model.save(
            os.path.join(
                out_folder, 'finetuned_gigaword_{}_{}.model'.format(
                    os.path.basename(train_tarball), n_units)))

    if bins:
        vocab, pos2id, n_classes, n_participants, train, val, test = pd.load_dataset(
            model.wv.vocab, gensim=True, bins=True)
    else:
        vocab, pos2id, train, val, test, mean, std = pd.load_dataset(
            model.wv.vocab, gensim=True)

    print('Data samples eyetracking: %d' % len(train))

    train_iter = EyetrackingBatchIterator(train,
                                          window_eyetracking,
                                          batchsize_eyetracking,
                                          repeat=True,
                                          shuffle=True,
                                          wlen=wlen,
                                          pos=pos,
                                          prev_fix=prev_fix,
                                          freq=freq,
Esempio n. 3
0
                elif incorrect_visualized_counter < visualize_incorrect and corrects_vector[i] == False:
                    visualize.visualize_with_correct(batch[inputs_placeholder][i], predictions[i], true_label,
                                                     name + "_incorrect")
                    incorrect_visualized_counter += 1

    precision = correct_num / number_of_examples
    summary = tf.Summary()
    summary.value.add(tag='Accuracy_' + name, simple_value=precision)
    summary_writer.add_summary(summary, learning_step)
    print("Accuracy %.3f" % precision)


if __name__ == '__main__':
    # Script entry point: load the three dataset splits, then wire the model
    # inside a fresh TensorFlow graph (placeholders -> logits -> loss ->
    # training op).

    # Load dataset
    # NOTE(review): the meaning of the second argument (False) is defined by
    # load_dataset, which is not visible here -- confirm.
    train = load_dataset("train.p", False)
    validation = load_dataset("validation.p", False)
    test = load_dataset("test.p", False)

    with tf.Graph().as_default():
        # Wiring
        # Exactly one model class is active; the commented-out lines are the
        # alternatives that can be swapped in.
        # model = Sequence()
        # model = SequenceBiggerOutput()
        # model = SequenceReshapedConvolution()
        model = SequenceReshapedConvolutionBatchnorm()
        # model = SequenceReshapedConvolutionDeeper()

        # The model supplies its own placeholders; they are fed to inference().
        inputs_placeholder, labels_placeholder, keep_prob_placeholder, is_training_placeholder = model.input_placeholders()
        logits = model.inference(inputs_placeholder, keep_prob_placeholder, is_training_placeholder)
        loss = model.loss(logits, labels_placeholder)
        # NOTE(review): 0.0001 is presumably the learning rate -- confirm
        # against model.training's signature.
        training = model.training(loss, 0.0001)
Esempio n. 4
0
    num_workers = args.num_workers

    print('dataset:    ', dataset_name)
    print('model:      ', model_name)
    print('pretrained: ', pretrained)
    print('epochs:     ', epochs)
    print('batch_size: ', batch_size)
    print('lr:         ', lr)
    print('seed:       ', seed)
    print('num_workers: ', num_workers)
    print('-' * 30)

    torch.manual_seed(seed)

    # load dataset
    train_data, test_data, in_channels, num_classes = load_dataset(
        dataset_name)
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=num_workers)
    test_loader = torch.utils.data.DataLoader(test_data,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              num_workers=num_workers)

    # load model
    model = getattr(target_models, model_name)(in_channels, num_classes)
    model.cuda()

    optimizer = optim.SGD(model.parameters(),
                          lr=lr,
Esempio n. 5
0
        chainer.cuda.get_device_from_id(args.gpu).use()
        cuda.check_cuda_available()

    print('GPU: {}'.format(args.gpu))
    print('# unit: {}'.format(args.unit))
    #print('Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('Output type: {}'.format(args.out_type))

    assert (args.surprisal_order in range(2, 6))

    if args.gpu >= 0:
        cuda.get_device_from_id(args.gpu).use()

    if args.bins:
        vocab, pos2id, n_classes, n_participants, train, val, test = pd.load_dataset(
            bins=True, surprisal_order=args.surprisal_order)
    else:
        vocab, pos2id, train, val, test, mean, std = pd.load_dataset(
            surprisal_order=args.surprisal_order)

        print('')
        print('Mean dataset times: {}'.format(mean))
        print('Std_dev dataset times: {}'.format(std))

    index2word = {v: k for k, v in vocab.items()}

    n_vocab = len(vocab)
    n_pos = len(pos2id)
    print('n_vocab: %d' % n_vocab)
    print('data length: %d' % len(train))
    print('n_pos: %d' % n_pos)
Esempio n. 6
0
    pos = True
    prev_fix = True
    freq = True
    surprisal = True
    n_pos_units = 50
    out = F.tanh
    n_hidden = 200

    configs = [(False, 0, 0), (False, 1, 0), (False, 0, 2),
                (True, 0, 0), (True, 1, 0), (True, 0, 2)]

    for config in configs:
        bins, window, n_layers = config

        if bins:
            vocab, pos2id, n_classes, n_participants, train, val, test = pd.load_dataset(bins=True)
        else:
            vocab, pos2id, train, val, test, mean, std = pd.load_dataset()
        
        n_vocab = len(vocab)
        n_pos = len(pos2id)

        if bins:
            loss_func = F.softmax_cross_entropy
        else:
            loss_func = F.mean_squared_error

        if bins:
            model = EyetrackingClassifier(n_vocab, n_units, n_participants, n_classes, loss_func, out, n_hidden=n_hidden, window=window, n_layers=n_layers, wlen=wlen, pos=pos, prev_fix=prev_fix, freq=freq, surprisal=surprisal, n_pos=n_pos, n_pos_units=50)
        else:
            model = EyetrackingLinreg(n_vocab, n_units, loss_func, out, n_hidden=n_hidden, window=window, n_layers=n_layers, wlen=wlen, pos=pos, prev_fix=prev_fix, freq=freq, surprisal=surprisal, n_pos=n_pos, n_pos_units=50)
Esempio n. 7
0
        '--output', dest='output', metavar='outputDirectory', help='dataset directory', required=True)
    requiredNamed.add_argument(
            '--batch-size', dest='BatchSize', help='Input Batch Size for dataset according to data size', required=True)
    requiredNamed.add_argument(
            '--epochs', dest='Epochs', help='Input string for translation', required=True)
    args = parser.parse_args()
    input_dir = args.input
    output_dir = args.output
    BATCH_SIZE = int(args.BatchSize)
    Epoc=int(args.Epochs)
    output_direc = merging_datafile(input_dir,output_dir)
    pic_dir=input_dir+'/pickle_objects'
    os.mkdir(pic_dir)

    num_examples = None
    input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(output_direc, num_examples)
    max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]
    input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

    BUFFER_SIZE = len(input_tensor_train)
    steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
    embedding_dim = 256
    units = 1024
    vocab_inp_size = len(inp_lang.word_index)+1
    vocab_tar_size = len(targ_lang.word_index)+1

    dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
    example_input_batch, example_target_batch = next(iter(dataset))

    encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
Esempio n. 8
0
gpu = -1

# Map the configured output-type name onto the matching function from `F`.
if out_type_eyetracking == 'tanh':
    out_eyetracking = F.tanh
elif out_type_eyetracking == 'relu':
    out_eyetracking = F.relu
elif out_type_eyetracking == 'sigmoid':
    out_eyetracking = F.sigmoid
elif out_type_eyetracking == 'id':
    out_eyetracking = F.identity
else:
    # Report the variable that was actually tested. The message previously
    # formatted `out_type`, a different name from the one checked above, so
    # this branch could fail with NameError or print an unrelated value.
    # ValueError is a subclass of Exception, so `except Exception` handlers
    # still catch it.
    raise ValueError(
        'Unknown output type: {}'.format(out_type_eyetracking))

# Dataset loading depends on whether a saved gensim model exists at
# "<argv[1]>.model".
if not os.path.exists(sys.argv[1] + '.model'):
    # No saved model: load the dataset with its defaults and size the
    # vocabulary from the dataset's own vocab.
    (vocab, pos2id, train, val, test,
     mean, std) = pd.load_dataset()
    n_vocab = len(vocab)
else:
    # Saved model found: hand its vocabulary to the dataset loader and size
    # the vocabulary from the model.
    model = gensim.models.Word2Vec.load(sys.argv[1] + '.model')
    (vocab, pos2id, train, val, test,
     mean, std) = pd.load_dataset(model.wv.vocab, gensim=True)
    n_vocab = len(model.wv.vocab)

# Identical in both branches, so computed once after the conditional.
n_pos = len(pos2id)

loss_func = F.mean_squared_error

model_eyetracking = EyetrackingLinreg(n_vocab,
                                      n_units,
                                      loss_func,
                                      out_eyetracking,
                                      window=window_eyetracking,
                                      n_layers=n_layers,