def grade3():
    marks = 0

    try:
        X, Y = utils.load_data2('data4.csv')
        X, Y = utils.preprocess(X, Y)
        X = X[:, 2:]
        r = np.ones((X.shape[0], ))
        W = multi_var_reg.weighted_regression(X, Y, r)
        R = np.diag(r * r)

        W_act = (np.linalg.inv(X.T @ R @ X)) @ (X.T @ R @ Y)

        assert np.allclose(W, W_act)
        marks += 0.5
    except:
        print('Q3 identity incorrect', file=stderr)

    try:
        X, Y = utils.load_data2('data4.csv')
        X, Y = utils.preprocess(X, Y)
        r = X[:, 1].reshape((X.shape[0], ))
        X = X[:, 2:]
        W = multi_var_reg.weighted_regression(X, Y, r)
        R = np.diag(r * r)
        W_act = (np.linalg.inv(X.T @ R @ X)) @ (X.T @ R @ Y)

        assert np.allclose(W, W_act)
        marks += 0.5
    except:
        print('Q3 incorrect', file=stderr)

    return marks
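
# For reference, a minimal sketch of the closed form that grade3() checks:
# weighted least squares W = (X^T R X)^{-1} X^T R Y with R = diag(r^2).
# The name and body below are illustrative only; the graded
# multi_var_reg.weighted_regression() may be implemented differently
# (e.g. without forming the dense diagonal matrix R).
import numpy as np

def weighted_regression_sketch(X, Y, r):
    """Closed-form weighted least squares with per-sample weights r."""
    R = np.diag(r * r)  # R = diag(r_i^2)
    return np.linalg.inv(X.T @ R @ X) @ (X.T @ R @ Y)
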
def grade1():
    print("=" * 20 + "Grading Problem 1" + "=" * 20)
    marks = 0.0
    try:
        X, Y = utils.load_data2('data2.csv')
        X, Y = utils.preprocess(X, Y)
        X_train, Y_train, X_test, Y_test = utils.split_data(X, Y)
        W, train_mses, test_mses = p1.ista(X_train,
                                           Y_train,
                                           X_test,
                                           Y_test,
                                           _lambda=0.1)
        assert train_mses[-1] < 0.2

        marks += 1.5
    except:
        print('Train Error is large')

    try:
        assert test_mses[-1] < 0.25
        marks += 1.5
    except:
        print('Test Error is large')
    print("Marks obtained in Problem 1: ", marks)
    return marks
Example #3
def grade2():
	marks = 0

	try:
		X = np.arange(10).reshape(10, 1)
		assert np.allclose(utils.normalize(X).T, [[-1.5666989,-1.21854359,-0.87038828,-0.52223297,-0.17407766,0.17407766,0.52223297,0.87038828,1.21854359,1.5666989]])
		marks += 0.5

	except:
		print('Q2 normalize() incorrect')
		return marks

	try:
		X = np.arange(6).reshape(3,2).astype(float)
		Y = np.arange(3).reshape(3,1).astype(float)
		X_stud, Y_stud = utils.preprocess(X, Y)
		X_act, Y_act =  [[ 1., -1.22474487], [ 1., 0.], [ 1.,1.22474487]], [[-1.22474487], [0.], [1.22474487]]

		assert np.allclose(X_act, X_stud) and np.allclose(Y_act, Y_stud)
		marks += 1
	except:
		print('Q2 preprocess() incorrect')
		return marks

	try:
		X, Y = utils.load_data2('data2.csv')
		X, Y = utils.preprocess(X, Y)
		X_train, Y_train, X_test, Y_test = utils.split_data(X, Y)
		W, train_mses, test_mses = multi_var_reg.ordinary_least_squares(X_train, Y_train, X_test, Y_test)
		assert train_mses[-1] < 0.23
		assert test_mses[-1] < 0.48
		for i in range(len(train_mses)-1):
			assert train_mses[i] >= train_mses[i+1]
		marks += 1.5
	except:
		print('Q2 ordinary_least_squares() incorrect')

	try:
		reg = 10
		W_act = np.linalg.inv(X_train.T @ X_train + 2 * reg * X_train.shape[0] * np.eye(X_train.shape[1])) @ X_train.T @ Y_train
		W, train_mses, test_mses = multi_var_reg.ridge_regression(X_train, Y_train, X_test, Y_test, reg)
		assert train_mses[-1] < 0.3
		assert test_mses[-1] < 0.35
		assert ([email protected])[0][0] < 1e-7
		assert np.linalg.norm(W - W_act) < 0.5
		for i in range(len(train_mses)-1):
			assert train_mses[i] >= train_mses[i+1]
		marks += 1.5
	except:
		print('Q2 ridge_regression() incorrect')
	
	return marks
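
# A minimal sketch, assuming utils.normalize() is a column-wise z-score using
# the population standard deviation (ddof=0); this reproduces the reference
# values asserted in the first grade2() check. The graded utils module may do
# more (preprocess() apparently also prepends a bias column of ones, as the
# [[1., ...]] rows of X_act above suggest).
import numpy as np

def normalize_sketch(X):
    """Z-score each column: (x - column mean) / column std (ddof=0)."""
    return (X - X.mean(axis=0)) / X.std(axis=0)

# e.g. normalize_sketch(np.arange(10).reshape(10, 1))[0, 0] is approx. -1.5666989
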
def plotImages(rseed, shift1=0, shift2=0, aug=True):
    datasets = load_data2(theano_shared=False)
    test_set_x, test_set_y = datasets[2]
    X = test_set_x[0:501].reshape(501, 3, 32, 32)
    if aug:
        X_new = augmentImages(X, shift1=shift1, shift2=shift2)
    else:
        X_new = X
    f, axarr = plt.subplots(4, 4)
    c = 0
    for i in range(4):
        for j in range(4):
            plt.axes(axarr[i, j])
            plt.imshow(X_new[rseed[c]].transpose(1, 2, 0))
            c += 1
    f.savefig(
        '/home/siddharth/workspace/StrivingForSimplicity/augmented_images_{0}.png'
        .format(int(aug)))
    plt.close(f)
Example #5
def load_trainval(yearmonth, n_clients='max'):
    """

    Method to load train/validation datasets

    X = [Processed Features](yearmonth) +
        [Processed Targets](yearmonth - 1) +
        [Processed Targets](yearmonth - 1, Jan) +
        [Processed Targets](yearmonth - 1, yearmonth - 100)

    Y = [Targets](yearmonth) +
        [Targets](yearmonth - 1) +
        [Diff targets](yearmonth, yearmonth - 1)

    :param yearmonth: year-month data on which targets to train
    :param n_clients: integer > 0 or 'max'
    :return: X, Y dataframes
    """

    def _get_XY(df):
        X = df[['ncodpers', 'fecha_dato', 'fecha_alta'] +
               FEATURES_NAMES +
               PROCESSED_TARGETS(1) +
               PROCESSED_TARGETS(2) +
               PROCESSED_TARGETS(3) +
               PROCESSED_TARGETS(4) +
               DIFF_TARGETS(1, 2) +
               DIFF_TARGETS(1, 3) +
               DIFF_TARGETS(1, 4)
               ]

        Y = df[['targets_str', 'last_targets_str', 'added_targets_str', 'added_targets_dec'] +
               TARGET_LABELS + LAST_TARGET_LABELS.tolist() + ADDED_TARGET_LABELS.tolist()
        ]
        return X, Y

    filename = "trainval_%s__%s.csv" % (str(yearmonth), str(n_clients))
    filepath = '../data/generated/' + filename
    if os.path.exists(filepath) and os.path.isfile(filepath):
        logging.info("- Found already generated file, load it")
        df = pd.read_csv('../data/generated/' + filename)
        X, Y = _get_XY(df)
        return X, Y
    # else:

    assert yearmonth < 201606, "Yearmonth should be less than 201606"

    fname = TRAIN_FILE_PATH

    # load main month and the previous one:
    yearmonths_list = [yearmonth, _get_prev_ym(yearmonth)]
    logging.info("- Load file : %s, yearmonth=%i, n_clients=%s" % (fname, yearmonth, str(n_clients)))
    df = load_data2(fname, yearmonths_list, n_clients)
    minimal_clean_data_inplace(df)
    preprocess_data_inplace(df)

    # Separate data into [main month] and [previous month]
    months_ym_map = _get_yearmonth_map(df)
    df = df.sort_values(['ncodpers', 'fecha_dato'])
    mask0 = df['fecha_dato'] == months_ym_map[yearmonths_list[0]]
    mask1 = df['fecha_dato'] == months_ym_map[yearmonths_list[1]]
    df1 = df[mask1]
    df = df[mask0]
    df1.index = df.index
    assert (df['ncodpers'] == df1['ncodpers']).all(), "Clients are not alignable"

    # Transform main month:
    process_features(df)
    add_targets_str(df)

    # Append products from the previous month:
    append_columns(df, df1[TARGET_LABELS], LAST_TARGET_LABELS)
    add_targets_str(df, 'last_targets_str', target_labels=LAST_TARGET_LABELS)

    # Compute added products from previous month
    compute_added_products(df)
    add_targets_str(df, 'added_targets_str', target_labels=ADDED_TARGET_LABELS)

    # Process targets of yearmonth - 1
    process_targets(df1, label_index=1)
    append_columns(df, df1[PROCESSED_TARGETS(1)])

    assert not df.isnull().any().all(), "Some nan values appeared"

    # Load supplementary data
    ref_clients = df['ncodpers'].unique()
    supp_yearmonths_list = [_get_prev_ym(yearmonths_list[1]), _get_year_january(yearmonth), yearmonth - 100]
    #ll = len(ref_clients)
    ll = 'max'
    index_offset = 2
    for i, ym in enumerate(supp_yearmonths_list):
        logging.info("- Add a supplementary data : %i" % ym)
        df_ym = load_data2(fname, [ym], ll)
        minimal_clean_data_inplace(df_ym)
        preprocess_data_inplace(df_ym)
        #process_features(df_ym)

        df_ym = add_zero_missing_clients(df_ym, ym, df, yearmonth, ref_clients)

        df_ym = df_ym[df_ym['ncodpers'].isin(ref_clients)].sort_values(['ncodpers'])
        df_ym.index = df.index
        assert (df['ncodpers'] == df_ym['ncodpers']).all(), "Clients are not alignable"

        process_targets(df_ym, label_index=i+index_offset)
        append_columns(df, df_ym[PROCESSED_TARGETS(i+index_offset)])

        fn = 'diff_targets_dec_%i%i' % (1, i+index_offset)
        df.loc[:, fn] = compute_targets_diff(df1[TARGET_LABELS], df_ym[TARGET_LABELS])

        res = compute_targets_group_diff(df1[TARGET_GROUPS_DEC(1)],
                                         df_ym[TARGET_GROUPS_DEC(i+index_offset)])
        append_columns(df, res, DIFF_TARGET_GROUPS_DEC(1, i+index_offset))

    logging.info("Store computed data as file : %s" % filepath)
    df.to_csv(filepath, index=False, index_label=False)
    X, Y = _get_XY(df)
    return X, Y
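
# Hypothetical usage, following the YYYYMM convention implied by the
# `yearmonth < 201606` assertion above: build train/validation frames for
# May 2016 over the full client set.
#
#   X, Y = load_trainval(201605, n_clients='max')
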
def test_AllCNN_Models_DA_BN(use_bn=False, model='c', learning_rate=0.05, n_epochs=350, batch_size=200, L2_reg=0.001, input_ndo_p=0.8, layer_ndo_p=0.5, save_model=True, save_freq=50, s1=5, s2=5):
    """
    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer
    
    :type batch_size: int
    :param batch_size: the number of training examples per batch
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data2(theano_shared=False)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    
    train_set_x = train_set_x.reshape(len(train_set_x),3,32,32)
    valid_set_x = valid_set_x.reshape(len(valid_set_x),3,32,32)
    test_set_x = test_set_x.reshape(len(test_set_x),3,32,32)
    
    train_set_x = numpy.asarray(train_set_x, dtype=theano.config.floatX)
    valid_set_x = numpy.asarray(valid_set_x, dtype=theano.config.floatX)
    test_set_x = numpy.asarray(test_set_x, dtype=theano.config.floatX)
    
    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.shape[0]
    n_valid_batches = valid_set_x.shape[0]
    n_test_batches = test_set_x.shape[0]
    n_train_batches //= batch_size
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    print 'n_train_batches: ', n_train_batches
    print 'n_valid_batches: ', n_valid_batches
    print 'n_test_batches: ', n_test_batches
    
    learning_rate = numpy.asarray(learning_rate, dtype=numpy.float32)
    print 'learning_rate: ', learning_rate

    # allocate symbolic variables for the data
    #index = T.lscalar()  # index to a [mini]batch
    lr = T.fscalar()
    training_enabled = T.iscalar('training_enabled')

    # start-snippet-1
    x = T.tensor4('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    #layer0_input = x.reshape((batch_size, 3, 32, 32))

    # drop the input only while training, don't drop while testing
    #dropout_input = T.switch(T.neq(training_enabled, 0), drop(layer0_input, p=input_ndo_p), input_ndo_p * layer0_input)
    dropout_input = T.switch(T.neq(training_enabled, 0), drop(x, p=input_ndo_p), input_ndo_p * x)

    classifier = None
    Model_Name = None
    if use_bn:
        if model == 'a':
            Model_Name = ModelA_AllCNN_BN
        elif model == 'b':
            Model_Name = ModelB_AllCNN_BN
        elif model == 'c':
            Model_Name = ModelC_AllCNN_BN
        else:
            raise RuntimeError('Invalid model parameter!')
    else:
        if model == 'a':
            Model_Name = ModelA_AllCNN
        elif model == 'b':
            Model_Name = ModelB_AllCNN
        elif model == 'c':
            Model_Name = ModelC_AllCNN
        else:
            raise RuntimeError('Invalid model parameter!')

    classifier = Model_Name(rng, 
                           dropout_input, 
                           y, 
                           batch_size, 
                           training_enabled, 
                           layer_ndo_p, 
                           L2_reg
                           )
        
    print 'Training Model: ', classifier.__class__.__name__
    
    test_model = theano.function(
        [x, y],
        classifier.errors,
        givens={
            training_enabled: numpy.cast['int32'](0)
        }
    )
    
    validate_model = theano.function(
        [x, y],
        classifier.errors,
        givens={
            training_enabled: numpy.cast['int32'](0)
        }
    )

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.

    momentum =theano.shared(numpy.cast[theano.config.floatX](0.9), name='momentum')
    updates = []
    for param in  classifier.params:
        param_update = theano.shared(param.get_value()*numpy.cast[theano.config.floatX](0.))    
        updates.append((param, param - lr * param_update))
        updates.append((param_update, momentum*param_update + (numpy.cast[theano.config.floatX](1.) - momentum)*T.grad(classifier.cost, param)))

    
    train_model = theano.function(
        [x, y, lr],
        classifier.cost,
        updates=updates,
        givens={
            training_enabled: numpy.cast['int32'](1)
        }
    )
        
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # early-stopping parameters
#     patience = 10000  # look as this many examples regardless
#     patience_increase = 2  # wait this much longer when a new best is found
    
#     improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    
#    validation_frequency = min(n_train_batches, patience // 2)
    
    validation_frequency = n_train_batches // 2
    
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False
    if use_bn:
        updateLRAfter = 100
    else:
        updateLRAfter = 200    

    while (epoch < n_epochs) and (not done_looping):
        
        # shuffle data before starting the epoch
        
        epoch = epoch + 1
        if(epoch > updateLRAfter):
            learning_rate *= 0.1
            updateLRAfter += 50

        for minibatch_index in range(n_train_batches):
            #print 'epoch: {0}, minibatch: {1}'.format(epoch, minibatch_index)
            iter = (epoch - 1) * n_train_batches + minibatch_index
#             if iter % 50 == 0:
#                 print('training @ iter = ', iter)

            train_x = augmentImages(train_set_x[minibatch_index * batch_size: (minibatch_index + 1) * batch_size], shift1=s1, shift2=s2)
            train_y = train_set_y[minibatch_index* batch_size: (minibatch_index + 1) * batch_size]
            cost_ij = train_model(train_x, train_y, learning_rate)
            
            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(valid_set_x[ii * batch_size: (ii + 1) * batch_size], valid_set_y[ii * batch_size: (ii + 1) * batch_size]) for ii
                                     in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
#                     if this_validation_loss < best_validation_loss *  \
#                        improvement_threshold:
#                         patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(test_set_x[ii * batch_size: (ii + 1) * batch_size], test_set_y[ii * batch_size: (ii + 1) * batch_size])
                        for ii in range(n_test_batches)
                    ]
                    
                    test_score= numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

#             if patience <= iter:
#                 done_looping = True
#                 break

        if save_model and epoch % save_freq == 0:
            # add model name to the file to differentiate different models
            with gzip.open('parameters_epoch_{0}.pklz'.format(epoch), 'wb') as fp:                
                cPickle.dump([param.get_value() for param in classifier.params], fp, protocol=2)
        
    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
Example #7
                    type=float,
                    default=0.2,
                    help='Alpha for the leaky_relu.')
parser.add_argument('--patience', type=int, default=100, help='Patience')

args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

# Load data
adj, features, labels, idx_train, idx_val, idx_test = load_data2('cora')

# Model and optimizer
if args.sparse:
    model = SpGAT(nfeat=features.shape[1],
                  nhid=args.hidden,
                  nclass=int(labels.max()) + 1,
                  dropout=args.dropout,
                  nheads=args.nb_heads,
                  alpha=args.alpha)
else:
    model = GAT(nfeat=features.shape[1],
                nhid=args.hidden,
                nclass=int(labels.max()) + 1,
                dropout=args.dropout,
                nheads=args.nb_heads,
                alpha=args.alpha)
def test_ModelC_AllCNN(learning_rate=0.05,
                       n_epochs=350,
                       batch_size=200,
                       L2_reg=0.001,
                       input_ndo_p=0.8,
                       layer_ndo_p=0.5,
                       save_model=True,
                       save_freq=50):
    """
    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer
    
    :type batch_size: int
    :param batch_size: the number of training examples per batch
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data2()

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]

    n_train_batches //= batch_size
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    print 'n_train_batches: ', n_train_batches
    print 'n_valid_batches: ', n_valid_batches
    print 'n_test_batches: ', n_test_batches

    learning_rate = numpy.asarray(learning_rate, dtype=numpy.float32)
    print 'learning_rate: ', learning_rate

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    lr = T.fscalar()
    training_enabled = T.iscalar('training_enabled')

    # start-snippet-1
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    layer0_input = x.reshape((batch_size, 3, 32, 32))

    # drop the input only while training, don't drop while testing
    dropout_input = T.switch(T.neq(training_enabled, 0),
                             drop(layer0_input, p=input_ndo_p),
                             input_ndo_p * layer0_input)

    layer0 = myConvLayer(rng,
                         is_train=training_enabled,
                         input_data=dropout_input,
                         filter_shape=(96, 3, 3, 3),
                         image_shape=(batch_size, 3, 32, 32),
                         ssample=(1, 1),
                         bordermode='half',
                         p=1.0)

    layer1 = myConvLayer(rng,
                         is_train=training_enabled,
                         input_data=layer0.output,
                         filter_shape=(96, 96, 3, 3),
                         image_shape=(batch_size, 96, 32, 32),
                         ssample=(1, 1),
                         bordermode='half',
                         p=1.0)

    layer2 = myConvLayer(rng,
                         is_train=training_enabled,
                         input_data=layer1.output,
                         filter_shape=(96, 96, 3, 3),
                         image_shape=(batch_size, 96, 32, 32),
                         ssample=(2, 2),
                         bordermode='half',
                         p=layer_ndo_p)

    layer3 = myConvLayer(rng,
                         is_train=training_enabled,
                         input_data=layer2.output,
                         filter_shape=(192, 96, 3, 3),
                         image_shape=(batch_size, 96, 16, 16),
                         ssample=(1, 1),
                         bordermode='half',
                         p=1.0)

    layer4 = myConvLayer(rng,
                         is_train=training_enabled,
                         input_data=layer3.output,
                         filter_shape=(192, 192, 3, 3),
                         image_shape=(batch_size, 192, 16, 16),
                         ssample=(1, 1),
                         bordermode='half',
                         p=1.0)

    layer5 = myConvLayer(rng,
                         is_train=training_enabled,
                         input_data=layer4.output,
                         filter_shape=(192, 192, 3, 3),
                         image_shape=(batch_size, 192, 16, 16),
                         ssample=(2, 2),
                         bordermode='half',
                         p=layer_ndo_p)

    layer6 = myConvLayer(rng,
                         is_train=training_enabled,
                         input_data=layer5.output,
                         filter_shape=(192, 192, 3, 3),
                         image_shape=(batch_size, 192, 8, 8),
                         ssample=(1, 1),
                         bordermode='half',
                         p=1.0)

    layer7 = myConvLayer(rng,
                         is_train=training_enabled,
                         input_data=layer6.output,
                         filter_shape=(192, 192, 1, 1),
                         image_shape=(batch_size, 192, 8, 8),
                         ssample=(1, 1),
                         bordermode='half',
                         p=1.0)

    layer8 = myConvLayer(rng,
                         is_train=training_enabled,
                         input_data=layer7.output,
                         filter_shape=(10, 192, 1, 1),
                         image_shape=(batch_size, 192, 8, 8),
                         ssample=(1, 1),
                         bordermode='half',
                         p=1.0)

    # make sure this is what global averaging does
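    # (it is: the mean over axes (2, 3) averages each of the 10 class feature
    # maps over its 8x8 spatial grid, i.e. global average pooling)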
    global_average = layer8.output.mean(axis=(2, 3))

    softmax_layer = SoftmaxWrapper(input_data=global_average,
                                   n_in=10,
                                   n_out=10)

    L2_sqr = ((layer0.W**2).sum() + (layer1.W**2).sum() + (layer2.W**2).sum() +
              (layer3.W**2).sum() + (layer4.W**2).sum() + (layer5.W**2).sum() +
              (layer6.W**2).sum() + (layer7.W**2).sum() + (layer8.W**2).sum())

    # the cost we minimize during training is the NLL of the model
    cost = (softmax_layer.negative_log_likelihood(y) + L2_reg * L2_sqr)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        softmax_layer.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size],
            training_enabled: numpy.cast['int32'](0)
        })

    validate_model = theano.function(
        [index],
        softmax_layer.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size],
            training_enabled: numpy.cast['int32'](0)
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer8.params + layer7.params + layer6.params + layer5.params + layer4.params + layer3.params + layer2.params + layer1.params + layer0.params

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.

    momentum = theano.shared(numpy.cast[theano.config.floatX](0.9),
                             name='momentum')
    updates = []
    for param in params:
        param_update = theano.shared(param.get_value() *
                                     numpy.cast[theano.config.floatX](0.))
        updates.append((param, param - lr * param_update))
        updates.append((param_update, momentum * param_update +
                        (numpy.cast[theano.config.floatX](1.) - momentum) *
                        T.grad(cost, param)))

    train_model = theano.function(
        [index, lr],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size],
            training_enabled: numpy.cast['int32'](1)
        })
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # early-stopping parameters
    #     patience = 10000  # look as this many examples regardless
    #     patience_increase = 2  # wait this much longer when a new best is found

    #     improvement_threshold = 0.995  # a relative improvement of this much is considered significant

    #    validation_frequency = min(n_train_batches, patience // 2)

    validation_frequency = n_train_batches // 2

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False
    updateLRAfter = 200

    while (epoch < n_epochs) and (not done_looping):

        # shuffle data before starting the epoch

        epoch = epoch + 1
        if (epoch > updateLRAfter):
            learning_rate *= 0.1
            updateLRAfter += 50
            print 'epoch: ', epoch
            print 'updateLRAfter: ', updateLRAfter
            print 'learning_rate: ', learning_rate

        for minibatch_index in range(n_train_batches):
            #print 'epoch: {0}, minibatch: {1}'.format(epoch, minibatch_index)

            iter = (epoch - 1) * n_train_batches + minibatch_index
            if iter % 50 == 0:
                print('training @ iter = ', iter)
            cost_ij = train_model(minibatch_index, learning_rate)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    #                     if this_validation_loss < best_validation_loss *  \
                    #                        improvement_threshold:
                    #                         patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))


#             if patience <= iter:
#                 done_looping = True
#                 break

        if save_model and epoch % save_freq == 0:
            # add model name to the file to differentiate different models
            with gzip.open('parameters_epoch_{0}.pklz'.format(epoch),
                           'wb') as fp:
                cPickle.dump([param.get_value() for param in params],
                             fp,
                             protocol=2)

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print(
        'Best validation score of %f %% obtained at iteration %i, '
        'with test performance %f %%' %
        (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(('The code for file ' + os.path.split(__file__)[1] +
           ' ran for %.2fm' % ((end_time - start_time) / 60.)), sys.stderr)
    for i in range(max_iter):
        # TODO: Compute train and test MSE

        # END TODO

        train_mses.append(train_mse)
        test_mses.append(test_mse)

        # TODO: Update w and b using a single step of ISTA. You are not allowed to use loops here.

        # END TODO

        # TODO: Stop the algorithm if the norm between previous W and current W falls below 1e-4

        # End TODO

    return W, train_mses, test_mses


if __name__ == '__main__':
    # Load and split data
    X, Y = load_data2('data2.csv')
    X, Y = preprocess(X, Y)
    X_train, Y_train, X_test, Y_test = split_data(X, Y)

    W, train_mses_ista, test_mses_ista = ista(X_train, Y_train, X_test, Y_test)

    # TODO: Your code for plots required in Problem 1.2(b) and 1.2(c)

    # End TODO
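
# A minimal sketch of one ISTA iteration for the TODO blocks above, assuming
# the usual lasso objective (1/n) * ||X W - Y||^2 + _lambda * ||W||_1 and a
# fixed step size lr. The names ista_step/mse and the exact scaling are
# illustrative; the graded p1.ista() may use different conventions.
import numpy as np

def mse(X, Y, W):
    return np.mean((X @ W - Y) ** 2)

def ista_step(X, Y, W, _lambda, lr):
    """One proximal-gradient step: gradient step on the MSE term, then
    soft-thresholding (the proximal operator of the L1 penalty)."""
    n = X.shape[0]
    grad = (2.0 / n) * X.T @ (X @ W - Y)  # gradient of the squared-error term
    W_new = W - lr * grad                 # plain gradient step
    return np.sign(W_new) * np.maximum(np.abs(W_new) - lr * _lambda, 0.0)
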
Example #10
def _load(_yearmonths_list, nb_clients, filepath=TRAIN_FILE_PATH):
    logging.info("- Load data : {}".format(_yearmonths_list))
    df = load_data2(filepath, _yearmonths_list, nb_clients)
    minimal_clean_data_inplace(df)
    preprocess_data_inplace(df)
    return df
torch.manual_seed(args.seed)

PAD = '<PAD>'
id_to_word, label_to_ans, label_to_ans_text = load_vocabulary(
    'insuranceQA/V2/vocabulary',
    'insuranceQA/V2/InsuranceQA.label2answer.token.encoded')
w2i = {w: i for i, w in enumerate(id_to_word.values(), 1)}
w2i[PAD] = 0
vocab_size = len(w2i)
print('vocab_size:', vocab_size)

train_data = load_data(
    'insuranceQA/V2/InsuranceQA.question.anslabel.token.500.pool.solr.train.encoded',
    id_to_word, label_to_ans_text)
test_data = load_data2(
    'insuranceQA/V2/InsuranceQA.question.anslabel.token.500.pool.solr.test.encoded',
    id_to_word, label_to_ans_text)
print('n_train:', len(train_data))
print('n_test:', len(test_data))

args.vocab_size = vocab_size
args.pre_embd = None

print('loading a word2vec binary...')
model_path = 'insuranceQA/V2//GoogleNews-vectors-negative300.bin'
word2vec = KeyedVectors.load_word2vec_format(model_path, binary=True)
print('loaded!')
pre_embd = load_embd_weights(word2vec, vocab_size, args.embd_size, w2i)
#save_pickle(pre_embd, 'pre_embd.pickle')
args.pre_embd = pre_embd
def test_Large_AllCNN_Model(use_bn=False, learning_rate=0.05, n_epochs=350, batch_size=200, L2_reg=0.001, 
                            input_ndo_p=0.8, layer_ndo_p=0.5, lrelu_alpha=0.181, save_model=True, save_freq=50, 
                            s1=5, s2=5):
    """
    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)
    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer
    
    :type batch_size: int
    :param batch_size: the number of training examples per batch
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data2(theano_shared=False)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    
    train_set_x = train_set_x.reshape(len(train_set_x),3,32,32)
    valid_set_x = valid_set_x.reshape(len(valid_set_x),3,32,32)
    test_set_x = test_set_x.reshape(len(test_set_x),3,32,32)
    
    train_set_x = numpy.asarray(train_set_x, dtype=theano.config.floatX)
    valid_set_x = numpy.asarray(valid_set_x, dtype=theano.config.floatX)
    test_set_x = numpy.asarray(test_set_x, dtype=theano.config.floatX)

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.shape[0]
    n_valid_batches = valid_set_x.shape[0]
    n_test_batches = test_set_x.shape[0]
    n_train_batches //= batch_size
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    print 'n_train_batches: ', n_train_batches
    print 'n_valid_batches: ', n_valid_batches
    print 'n_test_batches: ', n_test_batches
    
    learning_rate = numpy.asarray(learning_rate, dtype=numpy.float32)
    print 'learning_rate: ', learning_rate

#     datasets = load_data2()
# 
#     train_set_x, train_set_y = datasets[0]
#     valid_set_x, valid_set_y = datasets[1]
#     test_set_x, test_set_y = datasets[2]
# 
#     # compute number of minibatches for training, validation and testing
#     n_train_batches = train_set_x.get_value(borrow=True).shape[0]
#     n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
#     n_test_batches = test_set_x.get_value(borrow=True).shape[0]
#     
#     
#     n_train_batches //= batch_size
#     n_valid_batches //= batch_size
#     n_test_batches //= batch_size
# 
#     print 'n_train_batches: ', n_train_batches
#     print 'n_valid_batches: ', n_valid_batches
#     print 'n_test_batches: ', n_test_batches
#     
#     learning_rate = numpy.asarray(learning_rate, dtype=numpy.float32)
#     print 'learning_rate: ', learning_rate

    # allocate symbolic variables for the data
    #index = T.lscalar()  # index to a [mini]batch
    lr = T.fscalar()
    training_enabled = T.iscalar('training_enabled')

    # start-snippet-1
    x = T.tensor4('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    #layer0_input = x.reshape((batch_size, 3, 32, 32))

    # drop the input only while training, don't drop while testing
    #dropout_input = T.switch(T.neq(training_enabled, 0), drop(x, p=input_ndo_p), input_ndo_p * x)

        
    ##input of 126x126 with color and spatial augmentation and no dropout

#     layer0 = myConvLayer(
#     rng,
#     is_train=training_enabled,
#     input_data=dropout_input, ## extreme augmented data input
#     filter_shape=(320, 3, 2, 2),
#     image_shape=(batch_size, 3, 126, 126),
#     ssample=(1,1),
#     bordermode='valid',
#     p=1.0,
#     alpha=0.5  ##leaky relu
#     )
# 
#     layer1 = myConvLayer(
#     rng,
#     is_train=training_enabled,
#     input_data=layer0.output,
#     filter_shape=(320, 320, 2, 2),
#     image_shape=(batch_size, 320, 125, 125),
#     ssample=(1,1),
#     bordermode='valid',
#     p=1.0,
#     alpha=0.5  ##leaky relu
# )
# 
#     layer2 = myConvLayer(
#     rng,
#     is_train=training_enabled,
#     input_data=layer1.output,
#     filter_shape=(320, 320, 2, 2),
#     image_shape=(batch_size, 320, 124, 124),
#     ssample=(2,2),
#     bordermode='valid',
#     p=1.0,
#     alpha=0.5  ##leaky relu
# )
# 
#     layer3 = myConvLayer(
#     rng,
#     is_train=training_enabled,
#     input_data=layer2.output,
#     filter_shape=(640, 320, 2, 2),
#     image_shape=(batch_size, 320, 62, 62),
#     ssample=(1,1),
#     bordermode='valid',
#     p=0.9,
#     alpha=0.5  ##leaky relu
# )
# 
#     layer4 = myConvLayer(
#     rng,
#     is_train=training_enabled,
#     input_data=layer3.output,
#     filter_shape=(640, 640, 2, 2),
#     image_shape=(batch_size, 640, 61, 61),
#     ssample=(1,1),
#     bordermode='valid',
#     p=0.9,
#     alpha=0.5  ##leaky relu
# )
# 
#     layer5 = myConvLayer(
#     rng,
#     is_train=training_enabled,
#     input_data=layer4.output,
#     filter_shape=(640, 640, 2, 2),
#     image_shape=(batch_size, 640, 60, 60),
#     ssample=(2,2),
#     bordermode='valid',
#     p=1.0,
#     alpha=0.5  ##leaky relu
# )
# 
#     layer6 = myConvLayer(
#     rng,
#     is_train=training_enabled,
#     input_data=layer5.output,
#     filter_shape=(960, 640, 2, 2),
#     image_shape=(batch_size, 640, 30, 30),
#     ssample=(1,1),
#     bordermode='valid',
#     p=0.8,
#     alpha=0.5  ##leaky relu
# )
# 
#     layer7 = myConvLayer(
#     rng,
#     is_train=training_enabled,
#     input_data=layer6.output,
#     filter_shape=(960, 960, 2, 2),
#     image_shape=(batch_size, 960, 29, 29),
#     ssample=(1,1),
#     bordermode='valid',
#     p=0.8,
#     alpha=0.5  ##leaky relu
# )
# 
#     layer8 = myConvLayer(
#     rng,
#     is_train=training_enabled,
#     input_data=layer7.output,
#     filter_shape=(960, 960, 2, 2),
#     image_shape=(batch_size, 960, 28, 28),
#     ssample=(2,2),
#     bordermode='valid',
#     p=1.0,
#     alpha=0.5  ##leaky relu
# )
# 
#     layer9 = myConvLayer(
#     rng,
#     is_train=training_enabled,
#     input_data=layer8.output,
#     filter_shape=(1280, 960, 2, 2),
#     image_shape=(batch_size, 960, 14, 14),
#     ssample=(1,1),
#     bordermode='valid',
#     p=0.7,
#     alpha=0.5  ##leaky relu
# )
# 
#     layer10 = myConvLayer(
#     rng,
#     is_train=training_enabled,
#     input_data=layer9.output,
#     filter_shape=(1280, 1280, 2, 2),
#     image_shape=(batch_size, 1280, 13, 13),
#     ssample=(1,1),
#     bordermode='valid',
#     p=0.7,
#     alpha=0.5  ##leaky relu
# )
# 
#     layer11 = myConvLayer(
#     rng,
#     is_train=training_enabled,
#     input_data=layer10.output,
#     filter_shape=(1280, 1280, 2, 2),
#     image_shape=(batch_size, 1280, 12, 12),
#     ssample=(2,2),
#     bordermode='valid',
#     p=1.0,
#     alpha=0.5  ##leaky relu
# )
# 
#     layer12 = myConvLayer(
#     rng,
#     is_train=training_enabled,
#     input_data=layer11.output,
#     filter_shape=(1600, 1280, 2, 2),
#     image_shape=(batch_size, 1280, 6, 6),
#     ssample=(1,1),
#     bordermode='valid',
#     p=0.6,
#     alpha=0.5  ##leaky relu
# )
# 
#     layer13 = myConvLayer(
#     rng,
#     is_train=training_enabled,
#     input_data=layer12.output,
#     filter_shape=(1600, 1600, 2, 2),
#     image_shape=(batch_size, 1600, 5, 5),
#     ssample=(1,1),
#     bordermode='valid',
#     p=0.6,
#     alpha=0.5  ##leaky relu
# )
# 
#     layer14 = myConvLayer(
#     rng,
#     is_train=training_enabled,
#     input_data=layer13.output,
#     filter_shape=(1600, 1600, 2, 2),
#     image_shape=(batch_size, 1600, 4, 4),
#     ssample=(2,2),
#     bordermode='valid',
#     p=1.0,
#     alpha=0.5  ##leaky relu
# )
# 
#     layer15 = myConvLayer(
#     rng,
#     is_train=training_enabled,
#     input_data=layer14.output,
#     filter_shape=(1920, 1600, 2, 2),
#     image_shape=(batch_size, 1600, 2, 2),
#     ssample=(1,1)
#     bordermode='valid',
#     p=0.5,
#     alpha=0.5  ##leaky relu
# )
# 
#     layer16 = myConvLayer(
#     rng,
#     is_train=training_enabled,
#     input_data=layer15.output,
#     filter_shape=(1920, 1920, 1, 1),
#     image_shape=(batch_size, 1920, 1, 1),
#     ssample=(1,1),
#     bordermode='valid',
#     p=0.5,
#     alpha=0.5  ##leaky relu
# )
# 
#     layer17 = myConvLayer(
#         rng,
#         is_train=training_enabled,
#         input_data=layer16.output,
#         filter_shape=(10,1920,1,1),
#         image_shape=(batch_size,1920,1,1),
#         ssample=(1,1),
#         bordermode='valid',
#         p=1.0,
#         alpha=0.5 ##leaky relu
# 
#     )
# 
# # make sure this is what global averaging does
# ## no global_average=layer8.output.mean(axis=(2,3))
# ## directly softmax layer
# 
#     # make sure this is what global averaging does
#     # global_average=layer8.output.mean(axis=(2,3))
# 
#     softmax_layer=SoftmaxWrapper(input_data=layer17.output, n_in=10, n_out=10)
# 
#     L2_sqr = (
#                 (layer0.W ** 2).sum()
#                 +(layer1.W**2).sum()
#                 +(layer2.W**2).sum()
#                 +(layer3.W**2).sum()
#                 +(layer4.W**2).sum()
#                 +(layer5.W**2).sum()
#                 +(layer6.W**2).sum()
#                 +(layer7.W**2).sum()
#                 +(layer8.W**2).sum()
#                 +(layer9.W**2).sum()
#                 +(layer10.W**2).sum()
#                 +(layer11.W**2).sum()
#                 +(layer12.W**2).sum()
#                 +(layer13.W**2).sum()
#                 +(layer14.W**2).sum()
#                 +(layer15.W**2).sum()
#                 +(layer16.W**2).sum()
#                 +(layer17.W**2).sum()
# 
#     )
# 
#     # the cost we minimize during training is the NLL of the model
#     cost = (softmax_layer.negative_log_likelihood(y) + L2_reg*L2_sqr)

    classifier = None
    
    if use_bn:
        classifier = Large_AllCNN_BN(rng, 
                           x, 
                           y,
                           batch_size, 
                           training_enabled, 
                           layer_ndo_p, 
                           L2_reg,
                           lrelu_alpha
                           )
    else:
        classifier = Large_AllCNN(rng, 
                           x, 
                           y,
                           batch_size, 
                           training_enabled, 
                           layer_ndo_p, 
                           L2_reg,
                           lrelu_alpha
                           )


    # create a function to compute the mistakes that are made by the model
#     test_model = theano.function(
#         [index],
#         classifier.errors,
#         givens={
#             x: test_set_x[index * batch_size: (index + 1) * batch_size],
#             y: test_set_y[index * batch_size: (index + 1) * batch_size],
#             training_enabled: numpy.cast['int32'](0)
#         }
#     )

    test_model = theano.function(
        [x, y],
        classifier.errors,
        givens={
            training_enabled: numpy.cast['int32'](0)
        }
    )

#     validate_model = theano.function(
#         [index],
#         classifier.errors,
#         givens={
#             x: valid_set_x[index * batch_size: (index + 1) * batch_size],
#             y: valid_set_y[index * batch_size: (index + 1) * batch_size],
#             training_enabled: numpy.cast['int32'](0)
#         }
#     )

    validate_model = theano.function(
        [x, y],
        classifier.errors,
        givens={
            training_enabled: numpy.cast['int32'](0)
        }
    )
    
    # create a list of all model parameters to be fit by gradient descent
    #params = layer17.params + layer16.params + layer15.params + layer14.params + layer13.params + layer12.params + layer11.params + layer10.params + layer9.params + layer8.params + layer7.params + layer6.params + layer5.params + layer4.params + layer3.params + layer2.params + layer1.params + layer0.params

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.

    momentum =theano.shared(numpy.cast[theano.config.floatX](0.9), name='momentum')
    updates = []
    for param in classifier.params:
        param_update = theano.shared(param.get_value()*numpy.cast[theano.config.floatX](0.))    
        updates.append((param, param - lr * param_update))
        updates.append((param_update, momentum*param_update + (numpy.cast[theano.config.floatX](1.) - momentum)*T.grad(classifier.cost, param)))

#     train_model = theano.function(
#         [index, lr],
#         classifier.cost,
#         updates=updates,
#         givens={
#             x: train_set_x[index * batch_size: (index + 1) * batch_size],
#             y: train_set_y[index * batch_size: (index + 1) * batch_size],
#             training_enabled: numpy.cast['int32'](1)
#         }
#     )

    train_model = theano.function(
        [x, y, lr],
        classifier.cost,
        updates=updates,
        givens={
            training_enabled: numpy.cast['int32'](1)
        }
    )

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # early-stopping parameters
#     patience = 10000  # look as this many examples regardless
#     patience_increase = 2  # wait this much longer when a new best is found
    
#     improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    
#    validation_frequency = min(n_train_batches, patience // 2)

    validation_frequency = n_train_batches // 2
    
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False
    if use_bn:
        updateLRAfter = 100
    else:
        updateLRAfter = 200

    print 'Using Batch Normalization: ', use_bn
    print 'updateLRAfter: ', updateLRAfter
    
    while (epoch < n_epochs) and (not done_looping):
        
        # shuffle data before starting the epoch
        
        epoch = epoch + 1
        if(epoch > updateLRAfter):
            learning_rate *= 0.1
            updateLRAfter += 50
            print 'epoch: ', epoch
            print 'updateLRAfter: ', updateLRAfter
            print 'learning_rate: ', learning_rate

        for minibatch_index in range(n_train_batches):
            #print 'epoch: {0}, minibatch: {1}'.format(epoch, minibatch_index)
                        
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if iter % 50 == 0:
                print('training @ iter = ', iter)

            train_x = augmentImages(train_set_x[minibatch_index * batch_size: (minibatch_index + 1) * batch_size], shift1=s1, shift2=s2, enlarge=True)
            train_y = train_set_y[minibatch_index* batch_size: (minibatch_index + 1) * batch_size]
            cost_ij = train_model(train_x, train_y, learning_rate)
            
            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(enlargeMiniBatch(valid_set_x[ii * batch_size: (ii + 1) * batch_size]), valid_set_y[ii * batch_size: (ii + 1) * batch_size]) for ii
                                     in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
#                     if this_validation_loss < best_validation_loss *  \
#                        improvement_threshold:
#                         patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(enlargeMiniBatch(test_set_x[ii * batch_size: (ii + 1) * batch_size]), test_set_y[ii * batch_size: (ii + 1) * batch_size])
                        for ii in range(n_test_batches)
                    ]
                    
                    test_score= numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

#             if patience <= iter:
#                 done_looping = True
#                 break

        if save_model and epoch % save_freq == 0:
            # add model name to the file to differentiate different models
            with gzip.open('parameters_epoch_{0}.pklz'.format(epoch), 'wb') as fp:                
                cPickle.dump([param.get_value() for param in classifier.params], fp, protocol=2)
        
    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(('The code for file ' +
           os.path.split(__file__)[1] +
           ' ran for %.2fm' % ((end_time - start_time) / 60.)), sys.stderr)
Example #13
parser.add_argument('--dropout',
                    type=float,
                    default=0.5,
                    help='Dropout rate (1 - keep probability).')

args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

# Load data
# adj, features, labels, idx_train, idx_val, idx_test = load_data()
adj, features, labels, idx_train, idx_val, idx_test = load_data2("Liar_400")

# Model and optimizer
model = GCN(nfeat=features.shape[1],
            nhid=args.hidden,
            nclass=labels.max().item() + 1,
            dropout=args.dropout)
optimizer = optim.Adam(model.parameters(),
                       lr=args.lr,
                       weight_decay=args.weight_decay)

if args.cuda:
    model.cuda()
    features = features.cuda()
    adj = adj.cuda()
    labels = labels.cuda()
Example #14
def load_test():
    """

    :return: test X, Y (only LAST_TARGET_LABELS and last_targets_str) dataframe
    """
    def _get_XY(df):
        X = df[['ncodpers', 'fecha_dato', 'fecha_alta'] +
               FEATURES_NAMES +
               PROCESSED_TARGETS(1) +
               PROCESSED_TARGETS(2) +
               PROCESSED_TARGETS(3) +
               PROCESSED_TARGETS(4) +
               DIFF_TARGETS(1, 2) +
               DIFF_TARGETS(1, 3) +
               DIFF_TARGETS(1, 4)
               ]

        Y = df[['last_targets_str'] + LAST_TARGET_LABELS.tolist()]
        return X, Y

    filename = "test.csv"
    filepath = '../data/generated/' + filename
    if os.path.exists(filepath) and os.path.isfile(filepath):
        logging.info("- Found already generated file, load it")
        df = pd.read_csv('../data/generated/' + filename)
        X, Y = _get_XY(df)
        return X, Y
    # else:

    # load all test data:
    fname = TEST_FILE_PATH
    yearmonths_list = []
    logging.info("- Load file : %s" % (fname))
    df = load_data2(fname, [])
    minimal_clean_data_inplace(df)
    preprocess_data_inplace(df)
    ref_clients = df['ncodpers'].unique()

    # load data from train dataset
    fname = TRAIN_FILE_PATH
    yearmonth = 201606
    yearmonths_list = [_get_prev_ym(yearmonth)]
    logging.info("- Load file : %s, yearmonth=%i" % (fname, yearmonths_list[0]))
    df1 = load_data2(fname, yearmonths_list)
    minimal_clean_data_inplace(df1)
    preprocess_data_inplace(df1)
    df1 = df1[df1['ncodpers'].isin(ref_clients)]

    df = df.sort_values(['ncodpers', 'fecha_dato'])
    df1 = df1.sort_values(['ncodpers', 'fecha_dato'])
    df1.index = df.index
    assert (df['ncodpers'] == df1['ncodpers']).all(), "Clients are not alignable"

    # Transform main month:
    process_features(df)

    # Append products from the previous month:
    append_columns(df, df1[TARGET_LABELS], LAST_TARGET_LABELS)
    add_targets_str(df, 'last_targets_str', target_labels=LAST_TARGET_LABELS)

    # Process targets of yearmonth - 1
    process_targets(df1, label_index=1)
    append_columns(df, df1[PROCESSED_TARGETS(1)])

    assert not df.isnull().any().all(), "Some nan values appeared"

    # Load supplementary data
    supp_yearmonths_list = [_get_prev_ym(yearmonths_list[0]), _get_year_january(yearmonth), yearmonth - 100]
    ll = 'max'
    index_offset = 2
    for i, ym in enumerate(supp_yearmonths_list):
        logging.info("- Add a supplementary data : %i" % ym)
        df_ym = load_data2(fname, [ym], ll)
        minimal_clean_data_inplace(df_ym)
        preprocess_data_inplace(df_ym)
        #process_features(df_ym)

        df_ym = add_zero_missing_clients(df_ym, ym, df, yearmonth, ref_clients)

        df_ym = df_ym[df_ym['ncodpers'].isin(ref_clients)].sort_values(['ncodpers'])
        df_ym.index = df.index
        assert (df['ncodpers'] == df_ym['ncodpers']).all(), "Clients are not alignable"

        process_targets(df_ym, label_index=i+index_offset)
        append_columns(df, df_ym[PROCESSED_TARGETS(i+index_offset)])

        fn = 'diff_targets_dec_%i%i' % (1, i+index_offset)
        df.loc[:, fn] = compute_targets_diff(df1[TARGET_LABELS], df_ym[TARGET_LABELS])

        res = compute_targets_group_diff(df1[TARGET_GROUPS_DEC(1)],
                                         df_ym[TARGET_GROUPS_DEC(i+index_offset)])
        append_columns(df, res, DIFF_TARGET_GROUPS_DEC(1, i+index_offset))

    logging.info("Store computed data as file : %s" % filepath)
    df.to_csv(filepath, index=False, index_label=False)
    X, Y = _get_XY(df)
    return X, Y
Example #15
    loss_test = F.nll_loss(output[idx_test], labels[idx_test])
    acc_test = accuracy(output[idx_test], labels[idx_test])
    print("Test results:", "loss= {:.4f}".format(loss_test),
          "accuracy= {:.4f}".format(acc_test.item()))

    return acc_test.cpu().numpy()


for dataset in ['cora', 'citeseer', 'pubmed']:
    # np.random.seed(args.seed)
    # torch.manual_seed(args.seed)
    # if args.cuda:
    #     torch.cuda.manual_seed(args.seed)

    # Load data
    adj, features, labels, idx_train, idx_val, idx_test = load_data2(dataset)

    # num of nodes
    num_nodes = adj.shape[0]
    acc = []
    for _ in range(100):

        # Model and optimizer
        model = GCN(nfeat=features.shape[1],
                    nhid=args.hidden,
                    nclass=labels.max().item() + 1,
                    dropout=args.dropout)
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.weight_decay)
Example #16
import numpy as np
from utils import load_data2, split_data, preprocess, normalize
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Problem 4')
    parser.add_argument('--data',
                        type=str,
                        default='data4.csv',
                        help='Path to csv file of dataset')
    args = parser.parse_args()

    X, Y = load_data2(args.data)
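    # Adding Gaussian noise to the float-cast features makes the columns of X
    # linearly independent almost surely (given more rows than columns), so the
    # normal-equation solve below does not hit a singular X.T @ X.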
    X = X.astype('f') + np.random.normal(size=(X.shape))

    assert X.shape[1] >= 3

    W = np.linalg.inv(X.T @ X) @ (X.T @ Y)
    print("Task 4 Complete")
Example #17
parser.add_argument('--dropout',
                    type=float,
                    default=0.5,
                    help='Dropout rate (1 - keep probability).')

args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

# Load data
# adj, features, labels, idx_train, idx_val, idx_test = load_data()
adj, features, labels, idx_train, idx_val, idx_test = load_data2("BlogCatalog")

# Model and optimizer
model = GCN(nfeat=features.shape[1],
            nhid=args.hidden,
            nclass=labels.max().item() + 1,
            dropout=args.dropout)
optimizer = optim.Adam(model.parameters(),
                       lr=args.lr,
                       weight_decay=args.weight_decay)

if args.cuda:
    model.cuda()
    features = features.cuda()
    adj = adj.cuda()
    labels = labels.cuda()