Example #1
def clean_household(cnxn):
    sqlhh="select distinct HouseholdID,CountyCode,FTract,BlockGroup,City,UPPER(replace(State,'[^\W ]','')) as State,PostalCode from Household_20200501"
    print("Query to get the hh data in progress....")
    hhdf=load_data(cnxn,sqlhh)
    #hhdf['State']=hhdf['State'].str.upper() --Done above
    print("Distinct hh:", len(set(hhdf['HouseholdID'].values)))
    #- Now map full State to State Code
    states=map_state_to_Code(revert=True)
    hhdf=hhdf.replace({"State":states}) #- slow can refactor and later merge
    #- Hardcoding these three Households to change state from OH to KY
    """
    38663830 21   21015     21015980100           Cincinnati OH 45275
    49727069 21   21015     21015980100           Cincinnati OH 45275
    52218051 21   21015     21015980100           Cincinnati OH 45275
    """
    print("Before substitute: ",hhdf.loc[hhdf['HouseholdID'].isin([38663830,49727069,52218051])])
    hhdf.loc[hhdf['HouseholdID'].isin([38663830,49727069,52218051]),'State']='KY'
    print("After substitute: ", hhdf.loc[hhdf['HouseholdID'].isin([38663830,49727069,52218051])])
    #- now update the fips
    sqlfips="select Fipsstatecode,State from FipsstateMap"
    fips=load_data(cnxn,sqlfips)
    #- join fips
    print("Joining hh with fipsstatecode")
    hhfips=pd.merge(hhdf,fips,left_on='State',right_on='State',how='left')
    print("Distinct hhfips hh:", len(set(hhfips['HouseholdID'].values)))
    #hhfips.drop('state',axis=1,inplace=True) #- Slow
    return hhfips
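
Every example on this page calls a load_data helper that is not shown. For the SQL-backed snippets such as the one above, a minimal sketch of what that helper could look like, assuming a pyodbc-style connection and pandas (the real project's version may differ):

import pandas as pd

def load_data(cnxn, sql):
    # hypothetical sketch: run the query on the open connection and return a DataFrame
    return pd.read_sql(sql, cnxn)
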
Example #2
def comb_hhdata_org(cnxn,addgenre=False,addsec_no=False):
   sqltrg="select * from TRGMap"
   trgmap=load_data(cnxn,sqltrg)
   print("TRG MAP orgids: ", len(set(trgmap['OrgID'].values)))
   sqlOrg="SELECT OrgID,AnnualRevenue,AnnualRevenueYear,PostalCode from Organization"
   organization=load_data(cnxn,sqlOrg)
   print("organization orgids: ", len(set(organization['OrgID'].values)))
   trg_org=pd.merge(trgmap,organization,left_on='OrgID',right_on='OrgID',how='inner')
   print("trg_org orgids: ", len(set(trg_org['OrgID'].values)))
   #- combining Orggenre 
   if addgenre:
       print("add Org Genre")
       sqlGen="select distinct OrgID, first_value(Genre) over(partition by OrgId order by Genre ASC) as TRG_Genre from OrgGenre"
       orggen=load_data(cnxn,sqlGen)
       print("Genre available for Orgs: ", len(set(orggen['OrgID'].values)))
       trg_org=pd.merge(trg_org,orggen,left_on='OrgID',right_on='OrgID', how='left')
   if addsec_no:
       print("add sec_no from Orgmap")
       sqlorgmap="select distinct cast(TRGI as int) as TRGI, sec_no from orgmap where TRGI is not NULL"
       orgmap=load_data(cnxn,sqlorgmap)
       print("TRGI sec_no given for ", len(set(orgmap['TRGI'].values)))
       trg_org=pd.merge(trg_org,orgmap,left_on='OrgID',right_on='TRGI', how='left')
       trg_org.drop('TRGI',axis=1,inplace=True)
   print("Shape of final TRG ORG integ: ",trg_org.shape)
   return trg_org
Example #3
def model_train_validation(ins_file, oos_file, classifier, var_list_filename,
                           result_dir, output_suffix):
    """
    Train the model, then evaluate its performance on the train and
    validation data.
    """
    #################### Load train and validation data ####################
    print 'Loading data for modeling starts ...'
    t0 = time.time()
    target_name = 'target'
    X, y = load_data(ins_file, var_list_filename, target_name)
    Xv, yv = load_data(oos_file, var_list_filename, target_name)
    print "Loading data done, taking ", time.time() - t0, "secs"

    # Train Model
    print '\nModel training starts...'
    t0 = time.time()
    model = classifier
    model.fit(X, y)
    print "Model training done, taking ", time.time() - t0, "secs"
    pickle.dump(model, open(result_dir + "model.p",
                            'wb'))  # save model to disk

    # Predict Train
    y_pred = model.predict(X)
    p_pred = model.predict_proba(X)
    p_pred = p_pred[:, 1]

    # Predict Validation
    yv_pred = model.predict(Xv)
    pv_pred = model.predict_proba(Xv)
    pv_pred = pv_pred[:, 1]

    # Performance Evaluation: Train and Validation
    performance_eval_train_validation(y, p_pred, yv, pv_pred, result_dir,
                                      output_suffix)

    #################### Random Forest Feature Importance ######################
    try:
        varlist_file = open(var_list_filename, 'rU')
        varlist_csv = csv.reader(varlist_file)
        var_list = []
        for row in varlist_csv:
            var_list.append(row[0])
        out_feat_import = open(
            result_dir + 'feature_import_' + str(output_suffix) + '.csv', 'wb')
        feat_import_csv = csv.writer(out_feat_import)
        var_import = zip(range(len(var_list)), var_list,
                         model.feature_importances_)
        feat_import_csv.writerow(['var seq num', 'var name', 'importance'])
        print "RandomForest classifier, var importance was output"
        for row in var_import:
            feat_import_csv.writerow(row)
    except:
        print "Not RandomForest classifier, var importance not created"
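
A hedged usage sketch for model_train_validation, assuming scikit-learn's RandomForestClassifier and purely illustrative file paths:

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=200, n_jobs=-1)
model_train_validation('train.csv', 'validation.csv', clf,
                       'var_list.csv', './results/', 'rf_v1')
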
def main(user_location, destination):
    """
    Returns polyline of safest route
    :param user_location: list or tuple of latitude, longitude coordinates (lat, lng)
    :param destination: list or tuple of latitude, longitude coordinates(lat, lng)
    :return:
    """
    utcrime = load_data()
    crime_weights = generate_crime_weights(utcrime.df)
    subregion_weights = generate_subregion_weights(utcrime, crime_weights)
    api_key = 'AIzaSyDmKbjLrlWQowWVzzTy_AAWsFQO4Hdbeko'
    cli = client.Client(key=api_key)
    gmap_routes = route_generator(user_location, destination, cli)
    point_routes = convert_to_point_routes(gmap_routes)
    scores = score_routes(point_routes, gmap_routes, subregion_weights)
    safe_route = safest_route(scores)
    route = gmap_routes[safe_route]
    if type(route) is dict:
        # alternative route, indexes differently than waypoint route
        polyline = route['overview_polyline']['points']
    else:
        # waypoint route, indexes differently than alternative route
        polyline = route[0]['overview_polyline']['points']

    return polyline
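
If the caller needs coordinates rather than the encoded string, the returned polyline can be decoded with the third-party polyline package; this is an assumption, the project may rely on the Google Maps client's own utilities instead, and the coordinates below are placeholders:

import polyline

encoded = main((30.285, -97.735), (30.265, -97.745))  # placeholder (lat, lng) pairs
points = polyline.decode(encoded)                     # list of (lat, lng) tuples
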
Example #5
def my_predict():

    model = load_model("model.h5")

    test_generator = load_data(test_dir, 1)

    for test_image, test_labels in test_generator:
        prediction = model.predict(test_image)
        max_index = np.argmax(prediction)

        # determine the predicted class
        if max_index == 0:
            label = '%.2f%% ' % (prediction[0][0] * 100) + 'is a ' + str(
                my_labels[0]) + '.'
        elif max_index == 1:
            label = '%.2f%% ' % (prediction[0][1] * 100) + 'is a ' + str(
                my_labels[1]) + '.'
        elif max_index == 2:
            label = '%.2f%% ' % (prediction[0][2] * 100) + 'is a ' + str(
                my_labels[2]) + '.'
        elif max_index == 3:
            label = '%.2f%% ' % (prediction[0][3] * 100) + 'is a ' + str(
                my_labels[3]) + '.'
        elif max_index == 4:
            label = '%.2f%% ' % (prediction[0][4] * 100) + 'is a ' + str(
                my_labels[4]) + '.'

        plt.imshow(test_image[0])
        plt.title(label)
        plt.show()
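
The five-way if/elif chain above can be collapsed by indexing directly with max_index; a hedged one-line equivalent, assuming my_labels is an indexable sequence covering every class the model can predict:

label = '%.2f%% ' % (prediction[0][max_index] * 100) + 'is a ' + str(my_labels[max_index]) + '.'
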
Example #6
def load_test_dataset(dataset_dir, tokenizer):
    test_dataset = load_data(dataset_dir)
    # test_dataset = pd.read_csv(dataset_dir, delimiter='\t')
    test_label = test_dataset['label'].values
    # tokenizing dataset
    tokenized_test = tokenized_dataset(test_dataset, tokenizer)
    return tokenized_test, test_label
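
A hedged usage sketch for the loader above, assuming the Hugging Face AutoTokenizer used in the neighbouring examples and a placeholder dataset path:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
tokenized_test, test_label = load_test_dataset("../input/data/test/test.tsv", tokenizer)
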
Example #7
def train():
	parser = argparse.ArgumentParser()
	parser.add_argument('--model_name', type=str, default="bert-base-multilingual-cased")
	args = parser.parse_args()
	# load model and tokenizer
	MODEL_NAME = args.model_name  # e.g. "distilbert-base-multilingual-cased"
	tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

	# load dataset
	train_dataset = load_data("../input/data/train/train.tsv")
	#dev_dataset = load_data("./dataset/train/dev.tsv")
	train_label = train_dataset['label'].values
	#dev_label = dev_dataset['label'].values

	# tokenizing dataset
	tokenized_train = tokenized_dataset(train_dataset, tokenizer)
	#tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

	# make dataset for pytorch.
	RE_train_dataset = RE_Dataset(tokenized_train, train_label)
	#RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

	device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

	# setting model hyperparameter
	bert_config = BertConfig.from_pretrained(MODEL_NAME)
	bert_config.num_labels = 42
	model = BertForSequenceClassification(bert_config) 
	model.parameters
	model.to(device)
	# Besides the options used here, TrainingArguments supports many more options.
	# See https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments for details.
	training_args = TrainingArguments(
		output_dir=f'./results/{MODEL_NAME}',          # output directory
		save_total_limit=3,              # maximum number of checkpoints to keep
		save_steps=500,                 # model saving step.
		# num_train_epochs=4,              # total number of training epochs
		num_train_epochs=5,              # total number of training epochs
		learning_rate=5e-5,               # learning_rate
		per_device_train_batch_size=16,  # batch size per device during training
		#per_device_eval_batch_size=16,   # batch size for evaluation
		warmup_steps=500,                # number of warmup steps for learning rate scheduler
		weight_decay=0.01,               # strength of weight decay
		logging_dir='./logs',            # directory for storing logs
		logging_steps=100,              # log saving step.
		#evaluation_strategy='steps', # evaluation strategy to adopt during training
									# `no`: No evaluation during training.
									# `steps`: Evaluate every `eval_steps`.
									# `epoch`: Evaluate every end of epoch.
		#eval_steps = 500,            # evaluation step.
		#load_best_model_at_end = True, # When set to True, the parameters save_strategy and save_steps will be ignored and the model will be saved after each evaluation.
		)
	trainer = Trainer(
		model=model,                         # the instantiated 🤗 Transformers model to be trained
		args=training_args,                  # training arguments, defined above
		train_dataset=RE_train_dataset,         # training dataset
		#eval_dataset=RE_dev_dataset,             # evaluation dataset
		#compute_metrics=compute_metrics         # define metrics function
	)
	# train model
	trainer.train()
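
The example above assumes an RE_Dataset wrapper around the tokenized encodings. A minimal sketch of such a torch Dataset, offered as an assumption rather than the project's exact class:

import torch

class RE_Dataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_dataset, labels):
        self.tokenized_dataset = tokenized_dataset  # dict of lists/tensors from the tokenizer
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.as_tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
        item['labels'] = torch.as_tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
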
def test_dA(learning_rate=0.1, training_epochs=5,
            batch_size=1, output_folder='dA_plots'):

    datasets = load_data()
    train_set_c, train_set_x = datasets[0]
    n_train_batches = train_set_c.get_value(borrow=True).shape[0] / batch_size

    index = T.lscalar()
    x = T.matrix('x')
    c = T.matrix('c')

    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)

    rng = numpy.random.RandomState(123)

    da = denoising_layer(
        numpy_rng=rng,
        corrupted_input=c,
        input=x,
        n_visible=800 * 600,
        n_hidden=200
    )

    cost, updates = da.get_cost_updates(learning_rate)

    train_da = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            c: train_set_c[index * batch_size: (index + 1) * batch_size],
            x: train_set_x[index * batch_size: (index + 1) * batch_size]
        }
    )

    start_time = timeit.default_timer()

    for epoch in xrange(training_epochs):
        c = []
        for batch_index in xrange(n_train_batches):
            j = train_da(batch_index)
            c.append(j)

        print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

    end_time = timeit.default_timer()

    training_time = (end_time - start_time)

    print "training time: " + str(training_time)

    image = Image.fromarray(tile_raster_images(
        X=da.W.get_value(borrow=True).T,
        img_shape=(600, 800), tile_shape=(10, 10),
        tile_spacing=(1, 1)))
    image.save('filters')

    os.chdir('../')
Example #9
def run_ML_leave_one_subject_out(config, filename, question, clf, cols, return_arr=None, return_index=-1):
    working_directory = config['DATA_DIRECTORY']
    data_X, data_y = load_data(working_directory, filename, cols, question)
    data = leave_one_subject_out(data_X, data_y, 'User')
    score = 0
    score_dummy_mf = 0
    score_dummy_sf = 0
    dummy_clf_mf = DummyClassifier(strategy='most_frequent')
    dummy_clf_sf = DummyClassifier(strategy='stratified')
    for (training_X, training_y), (testing_X, testing_y) in data:
        clf.fit(training_X, training_y)
        dummy_clf_mf.fit(training_X, training_y)
        dummy_clf_sf.fit(training_X, training_y)

        single_score = clf.score(testing_X, testing_y)
        single_score_dummy_mf = dummy_clf_mf.score(testing_X, testing_y)
        single_score_dummy_sf = dummy_clf_sf.score(testing_X, testing_y)
        #print 'Single run score: ' + ("%0.2f" % single_score.mean())
        #print 'Single run score (dummy most frequent): ' + ("%0.2f" % single_score_dummy_mf.mean())
        #print 'Single run score (dummy stratified): ' + ("%0.2f" % single_score_dummy_sf.mean())

        score = score + single_score.mean()
        score_dummy_mf = score_dummy_mf + single_score_dummy_mf.mean()
        score_dummy_sf = score_dummy_sf + single_score_dummy_sf.mean()
    score = round(float(score / len(data)), 2)
    score_dummy_mf = round(float(score_dummy_mf / len(data)), 2)
    score_dummy_sf = round(float(score_dummy_sf / len(data)), 2)
    #print 'Total score: ' + str(score)
    #print 'Total score (dummy most frequent): ' + str(score_dummy_mf)
    #print 'Total score (dummy stratified): ' + str(score_dummy_sf)
    if return_index == -1:
        return score, score_dummy_mf, score_dummy_sf
    else:
        return_arr[return_index] = (score, score_dummy_mf, score_dummy_sf)
Example #10
def run(args):
    if args.config is not None:
        with open(args.config, 'r') as stream:
            hyper_params = load(stream, Loader=yaml.FullLoader)
    else:
        hyper_params = {}

    if not os.path.exists(args.output):
        os.makedirs(args.output)

    # to be done as soon as possible otherwise mlflow will not log with the proper exp. name
    if 'exp_name' in hyper_params:
        mlflow.set_experiment(hyper_params['exp_name'])

    # __TODO__ change the hparam that are used from the training algorithm
    # (and NOT the model - these will be specified in the model itself)
    check_and_log_hp(
        ['batch_size', 'optimizer', 'patience', 'architecture', 'max_epoch',
         'exp_name'],
        hyper_params)

    train_loader, dev_loader = load_data(args, hyper_params)
    model = load_model(hyper_params)
    optimizer = load_optimizer(hyper_params, model)
    loss_fun = load_loss(hyper_params)

    train(model, optimizer, loss_fun, train_loader, dev_loader, hyper_params['patience'],
          args.output, max_epoch=hyper_params['max_epoch'],
          use_progress_bar=not args.disable_progressbar, start_from_scratch=args.start_from_scratch)
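
A hedged sketch of the hyper-parameter dictionary this run() expects once the YAML config is loaded; the key names come from the check_and_log_hp call above, the values are placeholders:

hyper_params = {
    'exp_name': 'demo_experiment',
    'batch_size': 32,
    'optimizer': 'adam',
    'patience': 5,
    'architecture': 'simple_mlp',
    'max_epoch': 10,
}
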
Example #11
def callingBatchGD(player):
    '''
    Function for loading data, splitting it into train and test sets, running batch gradient descent
    on the training data, making predictions, and returning the RMSE.

    This code appeared in several places, so it was factored out into a function to avoid redundancy.

    :param player: string name of player, input from keyboard
    :return: RMSE metrics for given player
    '''

    train_data, test_data = load_data('dataset/' + player + '.csv')

    x, y = collect_attributes(train_data)

    newB, cost_history_retval = batch_gradient_descent(
        x, y, B, recommended_alpha, recommended_iteration_number)

    x_test, y_test = collect_attributes(test_data)
    y_pre = x_test.dot(newB)

    rmse = calculate_rmse(np.array(y_pre), y_test)

    print("\nRMSE for player " + player + " is: " + str(rmse) + "\n")

    return rmse
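
calculate_rmse is not shown on this page; a minimal sketch consistent with the call above, assuming y_pred and y_true are NumPy arrays of equal length:

import numpy as np

def calculate_rmse(y_pred, y_true):
    # root-mean-square error between predictions and ground truth
    return float(np.sqrt(np.mean((y_pred - y_true) ** 2)))
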
Example #12
def my_predict():
    use_gpu = torch.cuda.is_available()

    test_data = load_data(data_dir_test, image_size=image_size, batch_size=batch_size)
    X_test, y_test = next(iter(test_data))

    model = torch.load('model.pt')

    if use_gpu:
        model = model.cuda()

    if use_gpu:
        images = Variable(X_test.cuda())
    else:
        images = Variable(X_test)
    outputs = model(images)
    _, predicted = torch.max(outputs.data, 1)

    print("Predict Label is: ", predicted.data)
    print("Real Label is :", y_test.data)

    img = torchvision.utils.make_grid(X_test)
    img = img.numpy().transpose([1, 2, 0])   # convert to NumPy, then transpose
    plt.imshow(img)
    plt.show()
Example #13
def load_test_dataset(root, tokenizer):
    test_dataset = load_data(root + "/input/data/test/test.tsv", root)
    # test_dataset = load_data(root+"/input/data/test/ner_test_ver2.tsv", root)
    test_label = test_dataset['label'].values
    # tokenizing dataset
    tokenized_test = tokenized_dataset(test_dataset, tokenizer)
    return tokenized_test, test_label
Example #14
def predict():
    # load the saved model
    classifier = cPickle.load(open('best_model.pkl'))

    # compile a predictor function
    predict_model = theano.function(
        inputs=[classifier.input],
        outputs=classifier.y_pred
        )

    # We can test it on some examples from the test set
    dataset='mnist.pkl.gz'
    datasets = load_data(dataset)
    test_set_x, test_set_y = datasets[2]
    test_set_x = test_set_x.get_value()
    # test_set_y = test_set_y.get_value()

    predicted_values = predict_model(test_set_x[:1000])
    print ("Predicted values for the examples in test set:")
    print predicted_values

    error_num = 0
    for x in range(1000):
        if test_set_y.eval()[x] != predicted_values[x]:
            error_num += 1
            # print '%d: %d & %d' %(x, test_set_y.eval()[x], predicted_values[x])

    print 'error num: %d, test precision: %f %%' %(error_num, (1.0*error_num/1000)*100)
Example #15
def train(starting_epoch=0):
  """This module sets all hyper-parameters of the model and optimisers,
  creates an instance of the Keras Model class and partially trains it using
  the trainer module.

  Args:
    starting_epoch: Specifies at which epoch to start. (Integer)

  Returns:
    None
  """
  global JUMP

  model = create_model() # Creates an object of Model class

  if starting_epoch: # In case starting_epoch is Non-zero
    model = load_model_weight(model, 'model_weights.pkl')
  
  (x_train, y_train, x_valid, y_valid, x_test, y_test) = load_data()
  print ("Training Data Shape: ", x_train.shape)
  print ("Testing Data Shape: ", x_test.shape)

  for i in range(starting_epoch, 300000, JUMP): # The paper trained to 300000 
    model = trainer(model,
                    x_train,
                    y_train,
                    x_valid,
                    y_valid,
                    initial_epoch=i)
    #try:
    #  save_model_weight(model, 'model_weights.pkl')
    #except:
    #  print ("Cannot save the model")
    evaluate(model=model, x_test=x_test, y_test=y_test)
Example #16
def run(args, hyper_params):
    """Setup and run the dataloaders, training loops, etc.

    Args:
        args (list): arguments passed from the cli
        hyper_params (dict): hyper parameters from the config file
    """
    log_exp_details(os.path.realpath(__file__), args)

    if not os.path.exists(args.output):
        os.makedirs(args.output)

    # __TODO__ change the hparam that are used from the training algorithm
    # (and NOT the model - these will be specified in the model itself)
    logger.info('List of hyper-parameters:')
    check_and_log_hp(
        ['batch_size', 'optimizer', 'patience', 'architecture', 'max_epoch',
         'exp_name'],
        hyper_params)

    train_loader, dev_loader = load_data(args, hyper_params)
    model = load_model(hyper_params)
    optimizer = load_optimizer(hyper_params, model)
    loss_fun = load_loss(hyper_params)

    train(model, optimizer, loss_fun, train_loader, dev_loader, hyper_params['patience'],
          args.output, max_epoch=hyper_params['max_epoch'],
          use_progress_bar=not args.disable_progressbar, start_from_scratch=args.start_from_scratch)
def random_forest_cv(folds):

    X, Y = load_data()

    # Create train and test data
    train_x, test_x = get_train_test_data(X)
    print(
        'Train and test data for X matrix created with dimensions {} and {} respectively'
        .format(train_x.shape, test_x.shape))

    train_y, test_y = get_train_test_data(Y)
    print(
        'Train and test data for Y matrix created with dimensions {} and {} respectively'
        .format(train_y.shape, test_y.shape))

    # The fit method of the estimator expects a 1d array, not a column-vector, so reshape train_y to (n_samples,)
    train_y = reshape_label_matrix(train_y)

    # Get the start time
    get_start_time()

    clf = RandomForestClassifier(n_estimators=500)

    # Run cross-validation
    for k in folds:
        print('Performing cross validation with {} folds'.format(k))
        scores = cross_val_score(clf, train_x, train_y, cv=k)
        print('The final accuracy scores are {}'.format(scores))
        print('Mean accuracy score for {} folds is {}'.format(
            k, scores.mean()))

    # Get the stop time
    get_stop_time()
def main():
    csv_file = "TY_climate_2017_2018.csv"
    tensorboard_call_back = TensorBoard(log_dir="./log", histogram_freq=1, write_grads=True)

    train_data, test_data, column_name = load_data(csv_file)  # column_name: TT-Avg(℃), MT-Avg(g)
    # train_data, test_data, _ = data_preprocessing(train_data, test_data)
    train_data, _ = data_preprocessing(train_data)
    test_data, _ = data_preprocessing(test_data)

    # load data
    x_train, y_train = create_dataset(train_data)
    # x_test, y_test = create_dataset(test_data)

    x_train = x_train.reshape(x_train.shape[0], 1, 1)
    # x_test = x_test.reshape(x_test.shape[0], 1, 1)

    # reshape data
    y_train = y_train.reshape(y_train.shape[0], 1, 1)
    # y_test = y_test.reshape(y_test.shape[0], 1, 1)

    # load model
    lstm_model = training_model()
    print(lstm_model.summary())

    # start training
    lstm_model.compile(loss="mean_squared_error", optimizer="adam")
    lstm_model.fit(x_train, y_train, epochs=50, batch_size=32, callbacks=[tensorboard_call_back])

    # save model
    if column_name == "TT-Avg(℃)":
        print(column_name)
        lstm_model.save(f"saved_models_tt_avg/{build_name(column_name)}")
    elif column_name == "MT-Avg(g)":
        print(column_name)
        lstm_model.save(f"saved_models_mt_avg/{build_name(column_name)}")
def Sa_train_test(model):
    # process data
    path = download()
    data = load_data(path)
    train_x, train_y, test_x, test_y = split(
        data, 2950, 3000
    )  # shape = (, 10, 64, 64); (2950, 3000) is just for a quick test -- use about 10,000 samples in practice, e.g. (9800, 10000)
    # build model
    model = Sa_build_model()

    epochs = 80  # shape(, 10, 64, 64), just for test. should be close to 10000 in practice, such as (9800, 10000)
    model.fit(
        train_x,
        train_y,
        batch_size=8,
        epochs=epochs,
        verbose=2,
        validation_split=0.1,
    )
    # save trained weight
    model.save_weights('sa_saved_weight/')
    # make prediction
    prediction = model.predict(test_x)
    # squeeze the [64, 64, 1] images down to [64, 64]; otherwise plotting may raise an error
    prediction = np.squeeze(prediction, 4)  # shape = [batch_size, 10, 64, 64]
    # save the result as images
    save_as_image(prediction, -1)
    # save the standard result, i.e. test_y, as images
    # standard = np.squeeze(test_y, 4)
    # save_as_image(standard, 1)
    return model
Example #20
def load_test_dataset(dataset_dir, tokenizer):
    test_dataset = load_data(dataset_dir)
    test_label = test_dataset['label'].values
    
    # tokenize dataset
    tokenized_test = tokenized_dataset(test_dataset, tokenizer)
    return tokenized_test, test_label
Example #21
def run_knn():
    data = load_data()

    X = data[0]
    Y = data[1]

    # Create train and test data
    train_x, test_x = get_train_test_data(X)
    print(
        'Train and test data for X matrix created with dimensions {} and {} respectively'
        .format(train_x.shape, test_x.shape))

    train_y, test_y = get_train_test_data(Y)
    print(
        'Train and test data for Y matrix created with dimensions {} and {} respectively'
        .format(train_y.shape, test_y.shape))

    # The fit method of the estimator expects a 1d array, not a column-vector, so reshape train_y to (n_samples,)
    train_y = reshape_label_matrix(train_y)

    # Get the start time
    get_start_time()

    knn = neighbors.KNeighborsClassifier(n_neighbors=2)
    knn.fit(train_x, train_y)
    pred = knn.predict(test_x)
    cm = confusion_matrix(test_y, pred)
    print('Confusion matrix : \n {}'.format(cm))

    # Get the stop time
    get_stop_time()
Example #22
def generate_encodings(dataset):
    train_path = TRAIN_PATHS[dataset]
    test_path = TEST_PATHS[dataset]
    data = load_data(train_path, test_path)

    for config in GENSIM_PRETRAINED_MODELS:
        create_encodings(dataset, config, data)
Example #23
    def __init__(self, source_vocab_size, target_vocab_size, SIGMA, LAMBDA, is_training):
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.SIGMA = SIGMA
        self.LAMBDA = LAMBDA
        self.is_training = is_training

        if self.is_training:
            X, Y, _, _ = load_data(hp.train_file, hp.maxlen)

            
            # calc total batch count
            self.num_batch = len(X) // hp.batch_size
            
            # Convert to tensor
            X = tf.convert_to_tensor(X, tf.int32)
            Y = tf.convert_to_tensor(Y, tf.int32)
            
            # Create Queues
            input_queues = tf.train.slice_input_producer([X, Y])
                    
            # create batch queues
            self.x, self.y = tf.train.shuffle_batch(input_queues,
                                        num_threads=8,
                                        batch_size=hp.batch_size, 
                                        capacity=hp.batch_size*64,   
                                        min_after_dequeue=hp.batch_size*32, 
                                        allow_smaller_final_batch=False)
        else: # inference
            self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

        self._creat_model()
Example #24
def main(argv):
    if len(argv) < 1:
        print("Please mention a dataset (movielens or jester)")
        return

    k = 5
    num_perturbs = 20
    dataset_name = argv[0]

    mask = False
    if dataset_name == "movielens" or dataset_name == "jester" or dataset_name == "modcloth":
        mask = True

    adj_matrix = load_data(dataset_name)
    perturbed_matrix = perturb_matrix(adj_matrix, num_perturbs, k, mask)

    omega_c = 0
    if mask == True:
        omega_c = np.count_nonzero(np.isnan(adj_matrix))
    else:
        omega_c = np.count_nonzero(adj_matrix == 0)

    orig_svd = svd(adj_matrix, k, mask)
    perturbed_svd = svd(perturbed_matrix, k, mask)

    error = evaluate_error(orig_svd, perturbed_svd, adj_matrix, omega_c)
    print("RMSE Error:", error)
Example #25
def test_dA(learning_rate=0.1,
            training_epochs=5,
            batch_size=1,
            output_folder='dA_plots'):

    datasets = load_data()
    train_set_c, train_set_x = datasets[0]
    n_train_batches = train_set_c.get_value(borrow=True).shape[0] / batch_size

    index = T.lscalar()
    x = T.matrix('x')
    c = T.matrix('c')

    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)

    rng = numpy.random.RandomState(123)

    da = denoising_layer(numpy_rng=rng,
                         corrupted_input=c,
                         input=x,
                         n_visible=800 * 600,
                         n_hidden=200)

    cost, updates = da.get_cost_updates(learning_rate)

    train_da = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            c: train_set_c[index * batch_size:(index + 1) * batch_size],
            x: train_set_x[index * batch_size:(index + 1) * batch_size]
        })

    start_time = timeit.default_timer()

    for epoch in xrange(training_epochs):
        c = []
        for batch_index in xrange(n_train_batches):
            j = train_da(batch_index)
            c.append(j)

        print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

    end_time = timeit.default_timer()

    training_time = (end_time - start_time)

    print "training time: " + str(training_time)

    image = Image.fromarray(
        tile_raster_images(X=da.W.get_value(borrow=True).T,
                           img_shape=(600, 800),
                           tile_shape=(10, 10),
                           tile_spacing=(1, 1)))
    image.save('filters')

    os.chdir('../')
Example #26
def main():
    model = create_model()
    model = load_model_weight(model, "model_weights.pkl")
    x_train, y_train, _, _, x_test, y_test = load_data()
    print(x_train.shape)
    print(x_test.shape)

    evaluate(model, x_test, y_test)
	def __init__(self, quote):
		'''
		hand in quote as a string
		'''
		self.neg_data, self.pos_data, self.pos_prob, self.neg_prob = load_data(quote)
		self.news_data = []
		self.news_features = []
		self.quote = quote
		self.date = ''
def load_test_dataset(dataset_dir, tokenizer, model_arc='Electra'):
    test_dataset = load_data(dataset_dir)
    test_label = test_dataset['label'].values
    # tokenizing dataset
    if model_arc == 'Electra':
        tokenized_test = tokenized_dataset(test_dataset, tokenizer)
    else:
        tokenized_test = roberta_tokenized_dataset(test_dataset, tokenizer)
    return tokenized_test, test_label
Example #29
def prep_data(file_path, percentage_train):
    train_data, test_data = load_data(file_path, percentage_train)
    glove_vectors = load_word_vectors("../data/glove.6B.100d.txt", GLOVE_DIMENSION)
    train_X, train_y = process_rows(train_data, glove_vectors)
    test_X, test_y = process_rows(test_data, glove_vectors)

    print(train_X, len(train_X), len(train_X[0]))
    print(train_y, len(train_y), len(train_y[0]))

    return train_X, test_X, train_y, test_y
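
load_word_vectors is also not shown; a hedged sketch of a GloVe loader consistent with the call above, assuming the usual GloVe text format of one token followed by GLOVE_DIMENSION floats per line:

import numpy as np

def load_word_vectors(path, dim):
    # hypothetical loader: map each word to its dim-dimensional GloVe vector
    vectors = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            vec = np.asarray(parts[1:], dtype=np.float32)
            if vec.shape[0] == dim:
                vectors[parts[0]] = vec
    return vectors
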
Example #30
 def __init__(self):
     tic = time.time()
     filename = 'Data/DQHI Data Scientist Exercise Data.xlsx'
     self.ld = load_data(filename)
     self.inap_sno = []
     self.inap_orderno = []
     self.run_analysis()
     self.ldata, self.sdata, self.cdata = [], [], []
     toc = time.time() - tic
     print("Running time : " + str(toc))
Example #31
def main():
    print('convLSTM training')
    path = download()
    data = load_data(path)
    train_x, train_y, test_x, test_y = split(
        data, 2950, 3000
    )  # shape = (, 10, 64, 64); (2950, 3000) is just for a quick test -- use about 10,000 samples in practice, e.g. (9800, 10000)
    model = build_model()
    model = train(model, train_x, train_y)
    predict(model, test_x, test_y)
Example #32
def load_test_dataset(dataset_dir, tokenizer):
    test_dataset = load_data(dataset_dir)
    test_label = test_dataset['label'].values

    # pororo ner
    ner = Pororo(task="ner", lang="ko")

    # tokenizing dataset
    tokenized_test = tokenized_dataset(test_dataset, tokenizer, ner)
    return tokenized_test, test_label
def load_test_dataset(dataset_dir, tokenizer):
    test_dataset = load_data(dataset_dir, dev=False)
    test_label = test_dataset['label'].values

    # tokenizing dataset
    tokenized_test = tokenized_dataset(test_dataset, tokenizer)

    print(tokenizer.convert_ids_to_tokens(tokenized_test['input_ids'][2]))

    return tokenized_test, test_label
Example #34
def callingSVR(player):
    train_data, test_data = load_data('dataset/' + player + '.csv')
    x, y = collect_attributes(train_data)
    x_test, y_test = collect_attributes(test_data)

    predicted = svr(x, y, x_test)
    print(predicted)
    rmse = calculate_rmse(np.array(predicted), y_test)
    print("\n[SVR] RMSE for player " + player + " is: " + str(rmse) + "\n")
    return rmse
Example #35
def dt(working_directory, filename, columns, question, name, max_depth=None, cross_validation_folds=5, render_tree=False, classify_maybe_as=None):
    data_X, data_y = load_data(working_directory, filename, columns, question, classify_maybe_as=classify_maybe_as)
    if max_depth:
        clf = tree.DecisionTreeClassifier(max_depth=max_depth)
    else:
        clf = tree.DecisionTreeClassifier()
    clf = clf.fit(data_X, data_y)
    scores = cross_validation(clf, data_X, data_y, cross_validation_folds, name)
    if render_tree:
        plot_tree(clf, name)
    return scores
    def askopenfilename(self):
        self.filename = tkinter.filedialog.askopenfilename()
        print ("open filename : %s" %self.filename)
        # clean all data
        del orig_data[:]
        del group_data[:]
        del rule[:]
        for i in self.boy_tree.get_children():
            self.boy_tree.delete(i)
        for i in self.girl_tree.get_children():
            self.girl_tree.delete(i)
        for i in self.teacher_tree.get_children():
            self.teacher_tree.delete(i)
        # clean all data
        load_data(self.filename, orig_data, rule)
        tmp = self.filename.replace(".xlsx", "")
        schoolname = re.search(r'.*\d+(.*)$', tmp).group(1)
        self.grouping_status.set("讀取"+ schoolname +"新生資料")
        self._status_school_update()
        self._load_boy_data()
        self._load_girl_data()
        self._load_teacher_data()
def train():
  (train, y_train, test, y_test) = load_data()

  pipe = pipeline.Pipeline(
    [('csp', CSP()), ('chan_var', ChanVar()), ('svm', svm.SVC(kernel='linear'))])

  # train model
  pipe.fit(train, y_train)

  # make predictions on unseen test data
  y_pred = pipe.predict(test)

  print metrics.classification_report(y_test, y_pred)
Example #38
def svc(working_directory, filename, columns, question, name, cross_validation_folds=5, C=1, kernel='rbf', classify_maybe_as=None):
    data_X, data_y = load_data(working_directory, filename, columns, question, classify_maybe_as=classify_maybe_as)
    clf = svm.SVC(C=C, kernel=kernel)
    clf = clf.fit(data_X, data_y)
    scores = cross_validation(clf, data_X, data_y, cross_validation_folds, name)
    return scores
Example #39
    def load_data(self, batch_size):
        datasets = load_data()
        self.train_set_x, self.train_set_y = datasets[0]
        self.valid_set_x, self.valid_set_y = datasets[1]
        #self.test_set_x, self.test_set_y = datasets[2]

        self.n_train_batches = self.train_set_x.get_value(borrow=True).shape[0] / batch_size
        self.n_valid_batches = self.valid_set_x.get_value(borrow=True).shape[0] / batch_size
        #self.n_test_batches = self.test_set_x.get_value(borrow=True).shape[0] / batch_size
        self.batch_size = batch_size

        print 'train_x: ', self.train_set_x.get_value(borrow=True).shape
        print 'train_y: ', self.train_set_y.shape.eval()
        print 'valid_x: ', self.valid_set_x.get_value(borrow=True).shape
        print 'valid_y: ', self.valid_set_y.shape.eval()
Example #40
from load_data import *

import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.cluster import MeanShift, estimate_bandwidth
from itertools import cycle

###############################################################################
# Preprocessing data

raw = load_data()
data = scale(raw)

reduced_data = PCA(n_components=2).fit_transform(data)
reduced_data = scale(reduced_data)

###############################################################################
# Compute clustering with MeanShift

# The following bandwidth can be automatically detected using estimate_bandwidth
bandwidth = estimate_bandwidth(reduced_data)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(reduced_data)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
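
The snippet stops after collecting the unique labels; a short continuation in the spirit of the scikit-learn MeanShift example (reporting only, no plotting):

n_clusters_ = len(labels_unique)
print("number of estimated clusters : %d" % n_clusters_)
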
Example #41
def rf(working_directory, filename, columns, question, name, cross_validation_folds=5, n_estimators=10, max_depth=5, min_samples_split=2, random_state=0, classify_maybe_as=None):
    data_X, data_y = load_data(working_directory, filename, columns, question, classify_maybe_as=classify_maybe_as)
    clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, random_state=random_state)
    clf.fit(data_X, data_y)
    scores = cross_validation(clf, data_X, data_y, cross_validation_folds, name)
    return scores
Example #42
import numpy as np
from load_data import *
from combined_classifier import combined_classifier
from sklearn.cross_validation import train_test_split
from sklearn import svm
from KernelRidge import *
from kNN import kNN
#from sklearn.kernel_ridge import KernelRidge
#from sklearn.grid_search import GridSearchCV

xtr, ytr = load_data()
xte = load_data_test()

xte = flatten(xte)
xtr = flatten(xtr)



### parameter tuning (gaussian kernel + svm)
#xtr1, xtr2, ytr1, ytr2 = train_test_split(xtr, ytr, test_size=0.2)
###find the best lmd and sigma with xtr1 and ytr1
#for i in [0.3,0.5,0.7]:
#	for j in [0.9,1.0,1.1,1.2]:
#		clf = KernelRidge(lmb=i, kernel = 'rbf', sigma=j)
#		clf_combined = combined_classifier(clf)
#		x_train, x_test, y_train, y_test = train_test_split(xtr1, ytr1, test_size=0.2)
#		clf_combined.fit(x_train,y_train)
#		scores = clf_combined.score(x_test,y_test)
#		print 'lmd:',i
#		print 'sigma:',j
#		print scores
def test_SdA(finetune_lr=0.1, pretraining_epochs=1,
             pretrain_lr=0.001, training_epochs=1, 
             b_patch_filename = 'b_10_Training_patches_norm.npy', b_groundtruth_filename = 'b_Training_labels_norm.npy',
             b_valid_filename = 'b_10_Validation_patches_norm.npy', b_validtruth_filename = 'b_Validation_labels_norm.npy',
             u_patch_filename = 'u_10_Training_patches_norm.npy', u_groundtruth_filename = 'u_Training_labels_norm.npy',
             u_valid_filename = 'u_10_Validation_patches_norm.npy', u_validtruth_filename = 'u_Validation_labels_norm.npy',
             batch_size=100, n_ins = 605, n_outs = 2, hidden_layers_sizes = [1000,1000,1000],prefix = '11_11_3_G4_', corruption_levels=[0.2,0.2,0.2] ):
                 
    """
    Demonstrates how to train and test a stochastic denoising autoencoder.

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used in the finetune stage
    (factor for the stochastic gradient)

    :type pretraining_epochs: int
    :param pretraining_epochs: number of epoch to do pretraining

    :type pretrain_lr: float
    :param pretrain_lr: learning rate to be used during pre-training

    :type n_iter: int
    :param n_iter: maximal number of iterations to run the optimizer

    :type dataset: string
    :param dataset: path the the pickled dataset

    """
   
    print '###########################'
    print 'Pretraining epochs: ', pretraining_epochs
    print 'Finetuning epochs: ', training_epochs
    print '###########################'
    
    W = []
    b = []
    
    #########################################################
    #########################################################
   
    resumeTraining = False
    
    #@@@@@@@@ Needs to be worked on @@@@@@@@@@@@@@@@@
    # Snippet to resume training if the program crashes halfway through #
    opts, arg = getopt.getopt(sys.argv[1:],"rp:")
    for opt, arg in opts:
        if opt == '-r':
            resumeTraining = True                               # make this true to resume training from saved model    
        elif opt == '-p':
            prefix = arg
            
    flagValue = 1    
    
    if(resumeTraining):
        
        flagFile = file(prefix+'flag.pkl','rb')
        
        try:
            flagValue = cPickle.load(flagFile)
        except:
            pass
        
        savedModel_preTraining = file(prefix+'pre_training.pkl','rb')
        genVariables_preTraining = cPickle.load(savedModel_preTraining)
        layer_number, epochs_done_preTraining, mean_cost , pretrain_lr = genVariables_preTraining
        epoch_flag = 1
        print 'Inside resumeTraining!!!!!!!!!!!!!!!!!!'
        no_of_layers = len(hidden_layers_sizes) + 1
        
        for i in xrange(no_of_layers):
            try:
                W.append(cPickle.load(savedModel_preTraining))
                b.append(cPickle.load(savedModel_preTraining))
            except:
                W.append(None)
                b.append(None)
                    
        if flagValue == 2:
            epochFlag_fineTuning = 1
            iterFlag = 1
            savedModel_fineTuning = file(prefix+'fine_tuning.pkl','rb')
            hidden_layers_sizes = cPickle.load(savedModel_fineTuning)
            genVariables_fineTuning = cPickle.load(savedModel_fineTuning)
            epochs_done_fineTuning,best_validation_loss,finetune_lr,patience,iters_done = genVariables_fineTuning
    
   
    else:
        
        layer_number, epochs_done, mean_cost, pretrain_lr = [0,0,0,pretrain_lr]
        epoch_flag = 0
        epochFlag_fineTuning = 0
        iterFlag = 0
        W = None
        b = None
                
    ##############################################################
    ##############################################################

                    
    datasets = load_data(b_patch_filename,b_groundtruth_filename,b_valid_filename,b_validtruth_filename)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size

    # numpy random generator
    # start-snippet-3
    numpy_rng = numpy.random.RandomState(89677)
    print '... building the model'
    
#    print 'W: ', W
#    print 'b: ', b
    
    ################################################################
    ################CONSTRUCTION OF SdA CLASS#######################
    sda = SdA(
        numpy_rng=numpy_rng,
        n_ins=n_ins,
        hidden_layers_sizes=hidden_layers_sizes,
        n_outs=n_outs, W = W, b=b)
        
    print 'SdA constructed'
    ################################################################
    ################################################################
    if flagValue == 1:
    ################################################################
    # end-snippet-3 start-snippet-4
    #########################
    # PRETRAINING THE MODEL #
    #########################
    
        flag = open(prefix+'flag.pkl','wb')
        cPickle.dump(1,flag, protocol = cPickle.HIGHEST_PROTOCOL)
        flag.close()
            
        print '... getting the pretraining functions'
        pretraining_fns = sda.pretraining_functions(train_set_x=train_set_x,batch_size=batch_size)
        print 'Length of pretraining function: ', len(pretraining_fns)

        print '... pre-training the model'
        start_time = time.clock()
        ## Pre-train layer-wise
        log_pretrain_cost = []
        #corruption_levels = [.001, .001, .001]
        for i in xrange(sda.n_layers):
        
            if i < layer_number:
                i = layer_number
                #print i
                # go through pretraining epochs
        
            for epoch in xrange(pretraining_epochs):
                ##########################################            
                if epoch_flag == 1 and epoch < epochs_done_preTraining:
                    epoch = epochs_done_preTraining
                    epoch_flag = 0
                    ##########################################
                    # go through the training set
                c = []
                for batch_index in xrange(n_train_batches):
                    #sprint batch_index
                    c.append(pretraining_fns[i](index=batch_index,
                         corruption=corruption_levels[i],
                         lr=pretrain_lr))
                print 'Pre-training layer %i, epoch %d, cost ' % (i, epoch),
                print numpy.mean(c)
                log_pretrain_cost.append(numpy.mean(c))

            

                save_valid = open(prefix+'pre_training.pkl', 'wb')
                #print 'YO! i=',i,' epoch=',epoch,' cost=',numpy.mean(c) 
                #print pretrain_lr
                genVariables = [i, epoch, numpy.mean(c), pretrain_lr]
                cPickle.dump(genVariables,save_valid,protocol = cPickle.HIGHEST_PROTOCOL)
                for j in xrange(len(sda.params)):
                    cPickle.dump(sda.params[j].get_value(borrow=True), save_valid, protocol = cPickle.HIGHEST_PROTOCOL)
                save_valid.close()
        
        
        pretrain_log_file = open(prefix + 'log_pretrain_cost.txt', "a")
        for l in log_pretrain_cost:
            pretrain_log_file.write("%f\n"%l)
        pretrain_log_file.close()



        #print sda.params[0]
        end_time = time.clock()

        print >> sys.stderr, ('The pretraining code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
                          
                          
                          
    # end-snippet-4
    ########################
    # FINETUNING THE MODEL #
    ########################

    # get the training, validation and testing function for the model
    
    
    datasets = load_data(u_patch_filename,u_groundtruth_filename,u_valid_filename,u_validtruth_filename)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    print '... getting the finetuning functions'
    train_fn, validate_model, test_model = sda.build_finetune_functions(datasets=datasets,batch_size=100,learning_rate=0.1)

    print '... finetuning the model'
    # early-stopping parameters
    patience = 10 * n_train_batches  # look as this many examples regardless
    patience_increase = 2.  # wait this much longer when a new best is
                            # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0
    flag = open(prefix+'flag.pkl','wb')
    cPickle.dump(2,flag, protocol = cPickle.HIGHEST_PROTOCOL)
    flag.close()
    
    log_valid_cost=[]

    while (epoch < training_epochs) and (not done_looping):
        
        if epochFlag_fineTuning == 1 and epoch < epochs_done_fineTuning:
            epoch = epochs_done_fineTuning
            epochFlag_fineTuning = 0
            
        epoch = epoch + 1
        
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_fn(minibatch_index)
            
            if iterFlag == 1 and iter < iters_done:
                iter = iters_done
                iterFlag = 0
                
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                validation_losses = validate_model()
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))
                log_valid_cost.append(this_validation_loss)

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if (
                        this_validation_loss < best_validation_loss *
                        improvement_threshold
                    ):
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    
                    
                    print 'Saving the best validation network'
                    genVariables = [epoch,best_validation_loss,finetune_lr,patience,iter]
                    save_file = open(prefix+'fine_tuning.pkl','wb')
                    cPickle.dump(hidden_layers_sizes, save_file)
                    cPickle.dump(genVariables, save_file)
                    for j in xrange(len(sda.params)):
                        cPickle.dump(sda.params[j].get_value(borrow=True), save_file, protocol = cPickle.HIGHEST_PROTOCOL)
                    save_file.close()
                    valid_file = open('log_valid_cost.txt', "a")
                    for l in log_valid_cost:
                        valid_file.write("%f\n"%l)
                    valid_file.close()
                    log_valid_cost=[]
                    
                
                    # test it on the test set
                    test_losses = test_model()
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

                else :
                    print 'validation loss not decreasing, hence reducing lr'
                    finetune_lr=0.8*finetune_lr


            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print(
        (
            'Optimization complete with best validation score of %f %%, '
            'on iteration %i, '
            'with test performance %f %%'
        )
        % (best_validation_loss * 100., best_iter + 1, test_score * 100.)
    )
    print >> sys.stderr, ('The training code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet3(learning_rate=0.1, n_epochs=200, dataset='mnist.pkl.gz', nkerns=[20,50], batch_size=500):
    '''
    layer0: convpool layer
    layer1: convpool layer
    layer2: hidden layer
    layer3: logistic layer
    '''

    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size

    index = T.lscalar()
    x = T.matrix('x')
    y = T.ivector('y')

    image_shape = (batch_size, 1, 28, 28)
    rng = numpy.random.RandomState(1234)

    print 'building the model ...'

    layer0_input = x.reshape(image_shape)
    layer0 = LeNetConvPoolLayer(rng, 
            input        = layer0_input,
            image_shape  = image_shape,
            filter_shape = (nkerns[0], 1, 5, 5),
            poolsize     = (2, 2),
			activation	 = relu)
    layer1 = LeNetConvPoolLayer(rng,
            input        = layer0.output,
            image_shape  = (batch_size, nkerns[0], 12, 12),
            filter_shape = (nkerns[1],  nkerns[0], 5, 5),
            poolsize     = (2,2),
			activation	 = relu)

    layer2_input = layer1.output.flatten(2)
    layer2 = HiddenLayer(rng, 
            input   = layer2_input, 
            n_in    = nkerns[1] * 4 * 4,
            n_out   = 500, 
            activation = relu)
    layer3 = LogisticRegression(
            input   = layer2.output,
            n_in    = 500,
            n_out   = 10)

    cost = layer3.negative_log_likelihood(y)

    test_valid_model = theano.function(inputs=[index],
            outputs=layer3.errors(y),
            givens = {
                x: valid_set_x[index * batch_size : (index+1) * batch_size],
                y: valid_set_y[index * batch_size : (index+1) * batch_size]}
            )

    test_train_model = theano.function(inputs=[index],
            outputs=layer3.errors(y),
            givens = {
                x: train_set_x[index * batch_size : (index+1) * batch_size],
                y: train_set_y[index * batch_size : (index+1) * batch_size]}
            )

    params = layer3.params + layer2.params + layer1.params + layer0.params
    gparams = []
    for param in params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)
    updates = []
    for param, gparam in zip(params, gparams):
        updates.append((param, param - learning_rate * gparam))

    train_model = theano.function(inputs=[index],
            outputs=cost,
            updates=updates,
            givens = {
                x: train_set_x[index * batch_size : (index+1) * batch_size],
                y: train_set_y[index * batch_size : (index+1) * batch_size]}
            )
    print 'Train the model ...'
    train_sample_num = train_set_x.get_value(borrow=True).shape[0]
    valid_sample_num = valid_set_x.get_value(borrow=True).shape[0]

    epoch = 0
    while epoch < n_epochs:
        epoch += 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_cost = train_model(minibatch_index)
            print '\tepoch %i, minibatch_index %i/%i, minibatch_cost %f' % (epoch, minibatch_index, n_train_batches, minibatch_cost)
        train_losses = [test_train_model(i) for i in xrange(n_train_batches)]
        valid_losses = [test_valid_model(i) for i in xrange(n_valid_batches)]

        '''
        train_score  = numpy.sum(train_losses)
        valid_score  = numpy.sum(valid_losses)
        print 'epoch %i, train_score %f, valid_score %f' % (epoch, float(train_score) / train_sample_num, float(valid_score) / valid_sample_num)
        '''
        train_score  = numpy.mean(train_losses)
        valid_score  = numpy.mean(valid_losses)
        print 'epoch %i, train_score %f, valid_score %f' % (epoch, train_score, valid_score)
def main():
    #load data 
    X_train,Y_train,X_valid,Y_valid,X_test=load_data(training_dir,valid_dir,test_dir,labels,sample)
    #preprocess data by mean subtraction and normalization 
    X_train,X_valid,X_test=preprocess(X_train,X_valid,X_test)
    #del X_train
    #del X_test

    #or load pre-processed data from a previously saved hdf5 file:
    '''
    data=h5py.File('imagenet.transpose.individually.augment.hdf5','r')
    X_train=np.asarray(data['X_train']) 
    Y_train=np.asarray(data['Y_train']) 
    X_valid=np.asarray(data['X_valid']) 
    Y_valid=np.asarray(data['Y_valid']) 
    X_test=np.asarray(data['X_test']) 
    '''
    #print "loaded data from pickle" 
    #OPTIONAL: save loaded/pre-processed data to a pickle to save time in the future
    
    #print "saving preprocessed data to hdf5 file" 

    f=h5py.File('imagenet.transpose.individually.augment.contrast.tint.hdf5','w')
    dset_xtrain=f.create_dataset("X_train",data=X_train)
    dset_ytrain=f.create_dataset("Y_train",data=Y_train) 
    dset_xvalid=f.create_dataset("X_valid",data=X_valid) 
    dset_yvalid=f.create_dataset("Y_valid",data=Y_valid) 
    dset_xtest=f.create_dataset("X_test",data=X_test) 
    f.flush() 
    f.close() 

    #print "done saving pre-processed data to hdf5 file!" 
    pretrained_model = pretrained('pretrained_model.h5',False)
    sgd = SGD(lr=1e-3, decay=1e-6, momentum=0.9, nesterov=True)
    pretrained_model.compile(optimizer=sgd, loss='categorical_crossentropy',trainLayersIndividually=0)
    #do some training! 
    print "compilation finished, fitting model" 

    print "pretrained_model.trainLayersIndividually:"+str(pretrained_model.trainLayersIndividually) 
    if pretrained_model.trainLayersIndividually==1: 
        train_epochs=5 
    else: 
        train_epochs=5     
    history=pretrained_model.fit(X_train, Y_train, 128,train_epochs,validation_data=tuple([X_valid,Y_valid]),verbose=1,show_accuracy=True)
    pretrained_model.save_weights("assignment3_weights_nodropout_noregularization_augmenteddata.3epochs.contrast.tint.hdf5",overwrite=True) 
    class_predictions=pretrained_model.predict_classes(X_test) 
    np.savetxt('assignment3_class_predictions_nodropout_noregularization_augmenteddata.3epochs.contrast.tint.txt',class_predictions,fmt='%i',delimiter='\t') 
    train_scores=pretrained_evaluate(pretrained_model,X_train,Y_train)
    print "pretrained model training scores:"+str(train_scores) 
    valid_scores=pretrained_evaluate(pretrained_model,X_valid,Y_valid)
    print "pretrained validation scores:"+str(valid_scores)


    print "writing out the predictions file" 
    predictions=open('assignment3_class_predictions_nodropout_noregularization_augmenteddata.3epochs.contrast.tint.txt','r').read().split('\n') 
    while '' in predictions: 
        predictions.remove('') 
        
    wnids=open(labels,'r').read().split('\n') 
    while '' in wnids: 
        wnids.remove('') 

    cur_dir=test_dir+"images/"
    onlyfiles = [f for f in listdir(cur_dir) if isfile(join(cur_dir, f))]
    entries=10000
    outf=open('assignment3_class_predictions_nodropout_noregularization_augmenteddata.formatted.3epochs.contrast.tint.txt','w') 

    for i in range(entries): 
        image_name=onlyfiles[i] 
        predict_index=int(predictions[i])
        wnid1=wnids[predict_index]
        outf.write(image_name+'\t'+str(wnid1)+'\n')
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, 
        n_epochs=1000, dataset='mnist.pkl.gz', batch_size=20, n_hidden=500):
    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size

    index = T.lscalar()
    x = T.matrix('x')
    y = T.ivector('y')

    rng = numpy.random.RandomState(1234)

    classifier = MLP(rng=rng, input=x, n_in=28*28,
                     n_hidden=n_hidden, n_out=10)
    cost = classifier.negative_log_likelihood(y) + L1_reg * classifier.L1 + L2_reg * classifier.L2_sqr

    test_valid_model = theano.function(inputs=[index],
            outputs=classifier.errors(y),
            givens = {
                x: valid_set_x[index * batch_size : (index+1) * batch_size],
                y: valid_set_y[index * batch_size : (index+1) * batch_size]}
            )

    test_train_model = theano.function(inputs=[index],
            outputs=classifier.errors(y),
            givens = {
                x: train_set_x[index * batch_size : (index+1) * batch_size],
                y: train_set_y[index * batch_size : (index+1) * batch_size]}
            )

    gparams = []
    for param in classifier.params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)
    updates = []
    for param, gparam in zip(classifier.params, gparams):
        updates.append((param, param - learning_rate * gparam))

    train_model = theano.function(inputs=[index],
            outputs=cost,
            updates=updates,
            givens = {
                x: train_set_x[index * batch_size : (index+1) * batch_size],
                y: train_set_y[index * batch_size : (index+1) * batch_size]}
            )
    print 'Train the model ...'
    train_sample_num = train_set_x.get_value(borrow=True).shape[0]
    valid_sample_num = valid_set_x.get_value(borrow=True).shape[0]

    epoch = 0
    while epoch < n_epochs:
        epoch += 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_cost = train_model(minibatch_index)
        train_losses = [test_train_model(i) for i in xrange(n_train_batches)]
        valid_losses = [test_valid_model(i) for i in xrange(n_valid_batches)]

        '''
        train_score  = numpy.sum(train_losses)
        valid_score  = numpy.sum(valid_losses)
        print 'epoch %i, train_score %f, valid_score %f' % (epoch, float(train_score) / train_sample_num, float(valid_score) / valid_sample_num)
        '''
        train_score  = numpy.mean(train_losses)
        valid_score  = numpy.mean(valid_losses)
        print 'epoch %i, train_score %f, valid_score %f' % (epoch, train_score, valid_score)

"""'Load merged data and do build new features such as types of stores, months with high sales, ..."""

train, test = rawProcess()

all_data = {"rawtrain": train, "rawtest": test}
print "Saving dataset."
pickle.dump(all_data, gzip.open("dataMerged00.pickle.gz", "w"), protocol=pickle.HIGHEST_PROTOCOL)


if __name__ == "__main__":
    t0 = time.clock()
    seed = 2014

    rawtrain, rawtest = load_data("dataMerged00.pickle.gz", shuffle_train=seed)

    rawtrain = rawtrain.set_index("Id")
    rawtest = rawtest.set_index("Id")
    rawtrain["predWeekly_Sales"] = 0.0
    rawtest["Weekly_Sales"] = 0.0

    w = rawtrain["IsHoliday"].values

    globalDept_Weight_Dict, globalDept_std_Dict = globalDeptWeight(rawtrain)
    global_dept_month_WeightDict, global_dept_month_stdDict = globalDeptWeight_by_Month(rawtrain)

    for dept in np.sort(rawtest["Dept"].unique()):
        finetrain, finetest = fineProcess(
            rawtrain,
            rawtest,
Exemple #48
0
  method         = 'pca'   # ['pca', 'lsh', 'itq']
  aver_neighbors = 50      # number of neighbors used to build the ground truth
  manhattan_hash = True    # whether to use Manhattan hashing (see the quantization sketch at the end of this example)
  manhattan_bit  = 2       # map each dimension to `manhattan_bit` bits

  print 'Parameters:'
  print '=========='
  print 'database          :', db
  print 'nbits             :', nbits
  print 'method            :', method
  print 'use manhattan hash:', 'Yes' if manhattan_hash else 'No'
  if manhattan_hash:
    print 'manhattan bit     :', manhattan_bit
  print

  [feats, train, test] = load_data(db, f_feats, f_train, f_test);

  rdm = random.sample(range(len(feats)), len(feats))
  # Get test data
  test_idx = rdm[0:ntest]
  # x_test: ntest x feature_dim; for the GIST descriptor the feature dimension is 512
  x_test = []
  for idx in test_idx:
    x_test.append(feats[idx - 1][:])

  # Get train data
  train_idx = rdm[ntest:]
  x_train = []
  for idx in train_idx:
    x_train.append(feats[idx - 1][:])
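  # A minimal, illustrative sketch (not part of the original script) of what
  # "map each dimension to `manhattan_bit` bits" means in Manhattan hashing:
  # each projected dimension is quantized into 2**manhattan_bit levels and the
  # integer codes are compared with the Manhattan (L1) distance. All names below
  # are hypothetical.
  import numpy

  def manhattan_quantize(proj, nbits=2):
    # proj: n_samples x n_dims array of projected (e.g. PCA) features
    n_levels = 2 ** nbits
    qs = numpy.linspace(0, 100, n_levels + 1)[1:-1]        # quantile cut points
    thresholds = numpy.percentile(proj, qs, axis=0)        # shape (n_levels-1, n_dims)
    codes = numpy.zeros(proj.shape, dtype=numpy.int32)
    for t in thresholds:                                   # count thresholds below each value
      codes += (proj > t).astype(numpy.int32)
    return codes                                           # integer codes in [0, n_levels-1]

  def manhattan_code_distance(code_a, code_b):
    return numpy.abs(code_a - code_b).sum()                # L1 distance between code vectors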
Exemple #49
0
def test_mlp(
        initial_learning_rate,
        learning_rate_decay,
        squared_filter_length_limit,
        n_epochs,
        batch_size,
        mom_params,
        activations,
        dropout,
        dropout_rates,
        layer_sizes,
        dataset,
        use_bias,
        W = None,
        b = None,
        random_seed=1234,
        prefix = ''):
    """
    The dataset is the one from the mlp demo on deeplearning.net.  This training
    function is lifted from there almost exactly.

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz


    """
    print len(layer_sizes)
    print len(dropout_rates)
    assert len(layer_sizes) - 1 == len(dropout_rates)
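    # e.g. layer_sizes = [784, 800, 800, 10] requires len(dropout_rates) == 3
    # (typically one rate for the input layer plus one per hidden layer)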
    
    # extract the params for momentum
    # mom_start = mom_params["start"]
    # mom_end = mom_params["end"]
    # mom_epoch_interval = mom_params["interval"]
    
    # train_patch = '/media/brain/1A34723D34721BC7/BRATS/varghese/Recon_2013_data/BRATS_training_patches/u_trainpatch_2D_11x11_costpenalty_.npy'
    # train_label = '/media/brain/1A34723D34721BC7/BRATS/varghese/Recon_2013_data/BRATS_training_patches/u_trainlabel_2D_11x11_costpenalty_.npy'
    # valid_patch = '/media/brain/1A34723D34721BC7/BRATS/varghese/Recon_2013_data/BRATS_validation_patches/u_validpatch_2D_11x11_costpenalty_.npy'
    # valid_label = '/media/brain/1A34723D34721BC7/BRATS/varghese/Recon_2013_data/BRATS_validation_patches/u_validlabel_2D_11x11_costpenalty_.npy'

    train_patch, train_label, valid_patch, valid_label = dataset
    
    datasets = load_data(train_patch,train_label,valid_patch,valid_label)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################

    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    epoch = T.scalar()
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    learning_rate = T.scalar('lr')
    # learning_rate = theano.shared(np.asarray(initial_learning_rate,
    #     dtype=theano.config.floatX))

    p1 = T.sum(T.eq(train_set_y, 1)).eval() / float(train_set_y.shape[0].eval())
    p2 = T.sum(T.eq(train_set_y, 2)).eval() / float(train_set_y.shape[0].eval())
    p3 = T.sum(T.eq(train_set_y, 3)).eval() / float(train_set_y.shape[0].eval())
    p4 = T.sum(T.eq(train_set_y, 4)).eval() / float(train_set_y.shape[0].eval())

    # print 'Probability 1: ',p1
    # print 'Probability 2: ',p2
    # print 'Probability 3: ',p3
    # print 'Probability 4: ',p4

    rng = np.random.RandomState(random_seed)

    # construct the MLP class
    classifier = MLP(rng=rng, input=x,
                     layer_sizes=layer_sizes,
                     dropout_rates=dropout_rates,
                     activations=activations,
                     W = W,
                     b = b,
                     use_bias=use_bias)

    print '#############################'
    print classifier.params
    print '#############################'

    # Build the expression for the cost function.
    cost = classifier.negative_log_likelihood(y) + 0.0001 * classifier.L2_sqr + 0.0001*classifier.L1_sqr   # added today
    dropout_cost = classifier.dropout_negative_log_likelihood(y) + 0.0001 * classifier.L2_sqr + 0.0001* classifier.L1_sqr   # added today
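    # i.e. cost = mean NLL + 1e-4 * L2_sqr + 1e-4 * L1_sqr (weight-penalty terms);
    # dropout_cost is the same expression with the NLL computed through the dropout layers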

    # Compile theano function for testing.
    test_model = theano.function(inputs=[index],
            outputs=classifier.errors(y),
            givens={
                x: test_set_x[index * batch_size:(index + 1) * batch_size],
                y: test_set_y[index * batch_size:(index + 1) * batch_size]})
    #theano.printing.pydotprint(test_model, outfile="test_file.png",
    #        var_with_name_simple=True)

    # Compile theano function for validation.
    validate_model = theano.function(inputs=[index],
            outputs=classifier.errors(y),
            givens={
                x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

    def valid_score():
        return [validate_model(i) for i in xrange(n_valid_batches)]

        # Create a function that scans the entire test set
    def test_score():
        return [test_model(i) for i in xrange(n_test_batches)]

    def get_prediction(train_set_x, batch_size):
        prediction = theano.function(inputs = [index], outputs = classifier.pred,
                  givens={x: train_set_x[index * batch_size: (index + 1) * batch_size]})
        return prediction




    #theano.printing.pydotprint(validate_model, outfile="validate_file.png",
    #        var_with_name_simple=True)

    # Compute gradients of the model wrt parameters
    # gparams = []
    # for param in classifier.params:
    #     # Use the right cost function here to train with or without dropout.
    #     gparam = T.grad(dropout_cost if dropout else cost, param)
    #     gparams.append(gparam)

    # # ... and allocate memory for momentum'd versions of the gradient
    # gparams_mom = []
    # for param in classifier.params:
    #     gparam_mom = theano.shared(np.zeros(param.get_value(borrow=True).shape,
    #         dtype=theano.config.floatX))
    #     gparams_mom.append(gparam_mom)

    # # Compute momentum for the current epoch
    # mom = ifelse(epoch < mom_epoch_interval,
    #         mom_start*(1.0 - epoch/mom_epoch_interval) + mom_end*(epoch/mom_epoch_interval),
    #         mom_end)

    # # Update the step direction using momentum
    # updates = OrderedDict()
    # for gparam_mom, gparam in zip(gparams_mom, gparams):
    #     # Misha Denil's original version
    #     #updates[gparam_mom] = mom * gparam_mom + (1. - mom) * gparam
      
    #     # change the update rule to match Hinton's dropout paper
    #     updates[gparam_mom] = mom * gparam_mom - (1. - mom) * learning_rate * gparam

    # # ... and take a step along that direction
    # for param, gparam_mom in zip(classifier.params, gparams_mom):
    #     # Misha Denil's original version
    #     #stepped_param = param - learning_rate * updates[gparam_mom]
        
    #     # since we have included learning_rate in gparam_mom, we don't need it
    #     # here
    #     stepped_param = param + updates[gparam_mom]

    #     # This is a silly hack to constrain the norms of the rows of the weight
    #     # matrices.  This just checks if there are two dimensions to the
    #     # parameter and constrains it if so... maybe this is a bit silly but it
    #     # should work for now.
    #     if param.get_value(borrow=True).ndim == 2:
    #         #squared_norms = T.sum(stepped_param**2, axis=1).reshape((stepped_param.shape[0],1))
    #         #scale = T.clip(T.sqrt(squared_filter_length_limit / squared_norms), 0., 1.)
    #         #updates[param] = stepped_param * scale
            
    #         # constrain the norms of the COLUMNs of the weight, according to
    #         # https://github.com/BVLC/caffe/issues/109
    #         col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
    #         desired_norms = T.clip(col_norms, 0, T.sqrt(squared_filter_length_limit))
    #         scale = desired_norms / (1e-7 + col_norms)
    #         updates[param] = stepped_param * scale
    #     else:
    #         updates[param] = stepped_param

    updates = sgd(dropout_cost if dropout else cost, classifier.params, learning_rate = learning_rate)

    # Compile theano function for training.  This returns the training cost and
    # updates the model parameters.
    output = dropout_cost if dropout else cost
    train_model = theano.function(inputs=[epoch, index, theano.Param(learning_rate, default=0.1)], outputs=output,
            updates=updates,
            givens={
                x: train_set_x[index * batch_size:(index + 1) * batch_size],
                y: train_set_y[index * batch_size:(index + 1) * batch_size]},
                on_unused_input = 'ignore')
    #theano.printing.pydotprint(train_model, outfile="train_file.png",
    #        var_with_name_simple=True)

    # Theano function to decay the learning rate, this is separate from the
    # training function because we only want to do this once each epoch instead
    # of after each minibatch.
    # decay_learning_rate = theano.function(inputs=[], outputs=learning_rate,
    #         updates={learning_rate: learning_rate * learning_rate_decay})
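    # (in this version the learning rate is instead passed in explicitly as `adaptive_lr`
    # and decayed once per epoch near the bottom of the training loop)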

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    ########################confusion matrix Block 1##########################    
    prediction = get_prediction(train_set_x,batch_size)
    y_truth = train_set_y.eval()
    y_truth = y_truth[0:(len(y_truth)-(len(y_truth)%batch_size))]
    cnf_freq = 1
    cnf_freq_v=5
    #################################
    prediction_v = get_prediction(valid_set_x,batch_size)
    y_truth_v = valid_set_y.eval()
    y_truth_v = y_truth_v[0:(len(y_truth_v)-(len(y_truth_v)%batch_size))]
    #######Added to see the confusion matrix of the validation data#############################



    patience = 40 * n_train_batches  # look at this many examples regardless
    patience_increase = 10.  # wait this much longer when a new best is
                            # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch
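                                  # (with patience = 40 * n_train_batches,
                                  #  validation_frequency equals n_train_batches,
                                  #  so validation runs once per epoch)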


    



    best_params = None
    best_validation_errors = np.inf
    best_validation_loss = np.inf
    best_iter = 0
    test_scores = 0.
    epoch_counter = 0
    start_time = time.clock()

    adapt_counter = 0
    log_valid_cost = []

    shapeimg = [(42,42),(50,50), (25,40), (50,10)]

    # results_file = open(results_file_name, 'wb')

    adaptive_lr = initial_learning_rate


    while epoch_counter < n_epochs:
        # Train this epoch
        epoch_counter = epoch_counter + 1

        ################################confusion matrix block 2#################
        if epoch_counter%cnf_freq==0:
            pred_c = numpy.array([])
            for minibatch_index in xrange(n_train_batches):
                pred_c = numpy.concatenate([pred_c,numpy.array(prediction(minibatch_index))])
        
            cnf_matrix = confusion_matrix(y_truth, pred_c)
            print 'Training confusion matrix'
            print 
            print cnf_matrix
            print 
            ##########################################################################

        if epoch_counter%cnf_freq_v==0:
            pred_v = numpy.array([])
            for minibatch_index_v in xrange(n_valid_batches):
                pred_v = numpy.concatenate([pred_v,numpy.array(prediction_v(minibatch_index_v))])
        
            cnf_matrix_v = confusion_matrix(y_truth_v, pred_v)
            print 'Validation confusion matrix'
            print 
            print cnf_matrix_v  
            print           

        c = []
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(epoch_counter, minibatch_index, adaptive_lr)
            c.append(minibatch_avg_cost)

            ###################################################################################

            iter = (epoch_counter - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                validation_losses = valid_score()
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch_counter, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))
                log_valid_cost.append(this_validation_loss)
                # Added on 13 Oct: the validation-data confusion matrix is printed every
                # cnf_freq_v epochs at the start of the epoch (confusion matrix block 2 above).
                
                print 'Training cost: ', np.mean(c)

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if (
                        this_validation_loss < best_validation_loss *
                        improvement_threshold
                    ):
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    
                    
                    print 'Saving the best validation network'
                    genVariables = 'gen'
                    if dropout:
                        save_file = open(prefix + 'dropout_fine_tuning.pkl','wb')
                    else:
                        save_file = open(prefix + 'fine_tuning.pkl','wb')
                    cPickle.dump([1000,1000,1000], save_file)
                    cPickle.dump(genVariables, save_file)
                    for j in xrange(len(classifier.params)):
                        cPickle.dump(classifier.params[j].get_value(borrow=True), save_file, protocol = cPickle.HIGHEST_PROTOCOL)
                    save_file.close()
                    
                    
                
                    # test it on the test set
                    test_losses = test_score()
                    test_scores = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch_counter, minibatch_index + 1, n_train_batches,
                           test_scores * 100.))
                    

                else :
                    adapt_counter = adapt_counter+1
                # if adapt_counter>20:
                #     adapt_counter=0
                #     adaptive_lr=0.8*adaptive_lr
                #     print 'Reducing learning rate! ', adaptive_lr

        adaptive_lr = initial_learning_rate / ( 1 + 0.01 * epoch_counter)
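        # "1/t" (inverse-time) learning-rate decay, applied once per epoch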
        if epoch_counter %5 ==0:
            print 'current learning rate:-',adaptive_lr 

        # adaptive_lr=initial_learning_rate  # changed since we are using adadelta!!!

#        if epoch%1==0:
#            
#            prediction1 = prediction()
#            print prediction1[0]
                    
            #if patience <= iter:
            #    done_looping = True
            #    break

        if (epoch_counter % 10 == 0 and epoch_counter != 0) or epoch_counter == 399 or epoch_counter == 199:
            for i in xrange(len(classifier.params)/2 - 1):
                image = Image.fromarray(tile_raster_images(
                    X=classifier.params[2*i].get_value(borrow=True).T,
                    img_shape=shapeimg[i], tile_shape=(40,layer_sizes[i+1]/20),
                    tile_spacing=(1, 1)))
                image.save(prefix+str(i) + '_' + str(epoch_counter)+'.png')

        save_file = open(prefix + 'latest_fine_tuning2.pkl','wb')
        cPickle.dump([1000,1000,1000], save_file)
        cPickle.dump(genVariables, save_file)
        for j in xrange(len(classifier.params)):
            cPickle.dump(classifier.params[j].get_value(borrow=True), save_file, protocol = cPickle.HIGHEST_PROTOCOL)
        save_file.close()

    end_time = time.clock()
    print(
        (
            'Optimization complete with best validation score of %f %%, '
            'on iteration %i, '
            'with test performance %f %%'
        )
        % (best_validation_loss * 100., best_iter + 1, test_scores * 100.)
    )
    print >> sys.stderr, ('The training code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Exemple #50
0
def evaluate_lenet5(dataset='mnist.pkl.gz',
                    nkerns=[20, 50], batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with our ConvPoolLayer
    # (28, 28) is the size of MNIST images.
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
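    # In general, a 'valid' convolution with an f x f filter maps an n x n map to
    # (n-f+1) x (n-f+1), and non-overlapping (2, 2) max-pooling halves each dimension.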
    layer0 = ConvPoolLayer(
        rng,
        input=layer0_input,
        input_shape=(batch_size, 1, 28, 28),
        filter_shape=(nkerns[0], 1, 5, 5),
        poolsize=(2, 2)
    )

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = ConvPoolLayer(
        rng,
        input=layer0.output,
        input_shape=(batch_size, nkerns[0], 12, 12),
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=(2, 2)
    )

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(
        rng,
        input=layer2_input,
        n_in=nkerns[1] * 4 * 4,
        n_out=500,
        activation=T.tanh
    )

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

    print '... loading saved model params'

    saved_params = joblib.load('model/lenet5_params.pkl')
    layer0.load_params(saved_params['conv1'])
    layer1.load_params(saved_params['conv2'])
    layer2.load_params(saved_params['fc1'])
    layer3.load_params(saved_params['log1'])

    # the cost we minimize during training is the NLL of the model
    # cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    print '... testing'
    test_losses = [
        test_model(i)
        for i in xrange(n_test_batches)
    ]
    test_score = numpy.mean(test_losses)
    print 'test error: ', test_score * 100, '%'
import sys
from group import *
from load_data import *
from write_data import *

orig_data = [] 
group_data = [] 
rule = [] 
if len(sys.argv) != 5:

    print("%d\n" %len(sys.argv))
else:
    total_class = int(sys.argv[1])
    boy_class = int(sys.argv[2])
    girl_class = int(sys.argv[3])
    filename = sys.argv[4]
    load_data(filename, orig_data, rule)
    group_data = grouping(orig_data, total_class, boy_class, girl_class, rule)
    data = copy.deepcopy(group_data)
    writefile(data, filename, total_class, boy_class, girl_class)
from keras.layers import containers
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, AutoEncoder
from keras.optimizers import SGD
import numpy as np
from scipy import misc
import os
from load_data import load_data

encoder = containers.Sequential([Dense(540*420, 270*210), Dense(270*210, 135*105)])
decoder = containers.Sequential([Dense(135*105, 270*210), Dense(270*210, 540*420)])

autoencoder = Sequential()
autoencoder.add(AutoEncoder(encoder=encoder, decoder=decoder, output_reconstruction=True))

sgd = SGD(lr=0.1, decay=1e-6, momentum=0.0, nesterov=True)
autoencoder.compile(loss='categorical_crossentropy', optimizer=sgd)

batch_size = 12
nb_epoch = 20

data, X_test = load_data()
X_train, Y_train = data[0][:140,:], data[1][:140,:]
X_test, Y_test = data[0][141:142,:], data[1][141:142,:]  # keep 2-D so predict_classes gets a (1, n) batch

autoencoder.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch)

Y_test = autoencoder.predict_classes(X_test, batch_size=1, verbose=True)
Y_test = Y_test.reshape((420,540))
print Y_test.tolist()

misc.imsave('nudie.png', Y_test)
def sgd_optimization_mnist(learning_rate=0.13, 
        n_epochs=1000, dataset='mnist.pkl.gz', batch_size=300):
    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size

    index = T.lscalar()
    x = T.matrix('x')
    y = T.ivector('y')
    step_rate = T.dscalar()

    classifier = LogisticRegression(input=x, n_in=28*28, n_out=10)
    cost = classifier.negative_log_likelihood(y)

    test_valid_model = theano.function(inputs=[index],
            outputs=classifier.errors(y),
            givens = {
                x: valid_set_x[index * batch_size : (index+1) * batch_size],
                y: valid_set_y[index * batch_size : (index+1) * batch_size]}
            )

    test_train_model = theano.function(inputs=[index],
            outputs=classifier.errors(y),
            givens = {
                x: train_set_x[index * batch_size : (index+1) * batch_size],
                y: train_set_y[index * batch_size : (index+1) * batch_size]}
            )

    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)
    updates = [(classifier.W, classifier.W - step_rate * g_W),
               (classifier.b, classifier.b - step_rate * g_b)]
    train_model = theano.function(inputs=[index, step_rate],
            outputs=cost,
            updates=updates,
            givens = {
                x: train_set_x[index * batch_size : (index+1) * batch_size],
                y: train_set_y[index * batch_size : (index+1) * batch_size]}
            )
    print 'Train the model ...'
    train_sample_num = train_set_x.get_value(borrow=True).shape[0]
    valid_sample_num = valid_set_x.get_value(borrow=True).shape[0]

    epoch = 0
    while epoch < n_epochs:
        epoch += 1
        if epoch > 50:
            learning_rate = 0.1
        for minibatch_index in xrange(n_train_batches):
            minibatch_cost = train_model(minibatch_index, learning_rate)
        train_losses = [test_train_model(i) for i in xrange(n_train_batches)]
        valid_losses = [test_valid_model(i) for i in xrange(n_valid_batches)]

        '''
        train_score  = numpy.sum(train_losses)
        valid_score  = numpy.sum(valid_losses)
        print 'epoch %i, train_score %f, valid_score %f' % (epoch, float(train_score) / train_sample_num, float(valid_score) / valid_sample_num)
        '''
        train_score  = numpy.mean(train_losses)
        valid_score  = numpy.mean(valid_losses)
        print 'epoch %i, train_score %f, valid_score %f' % (epoch, train_score, valid_score)
Exemple #54
0
num_estimators = 10  # estimators inside the random forest 
num_iters = 1000  # num of training iterations
num_users = sys.maxint  # max num of users(=trajectories) (use sys.maxint for unlimited case)
num_users_ratio = 0.7  # % of users(=trajectories) for training
demography = False
target_action = "Q235"  # every trajectory should end with this action
# target_action = "Q315"  # every trajectory should end with this action
feat_path = "../../data/lectures+demography/feats.csv"
result_dir = "../results/demo"+("1" if demography else '0')+"-d"+str(discount)+("-un"+str(num_users) if num_users != sys.maxint else "")+("-ur"+str(num_users_ratio) if num_users_ratio != 1 else "")+"-e"+str(num_estimators)+"-i"+str(num_iters)+"-t"+target_action
approximator_path = result_dir + "/approximator/random_forest_regressor.model"  # path to save the trained approximator
debug_action_cnt = False
debug_q0 = True


# Load data
cur_states, actions, rewards, next_states, users, action_index, user_index, valid_feats = load_data(feat_path, target_action, num_users=num_users, num_users_ratio=num_users_ratio, demography=demography)
print cur_states.shape, next_states.shape, actions.shape, rewards.shape, users.shape, len(action_index), len(user_index), sum(valid_feats)
# dim: cur_states,next_states = num_instances x num_features 
# dim: actions,rewards = num_instances

num_feats = cur_states.shape[1]
num_actions = len(action_index)
action_list = [ 0 for x in range(num_actions) ]
for a,i in action_index.iteritems(): action_list[i] = a
 
 
s0 = np.zeros((1,num_feats))
approximator = RandomForest(num_estimators=num_estimators, num_actions=num_actions)
approximator.train(cur_states, actions, rewards)
for iter in range(num_iters):
    print "---------------------------------------------------------------\nIteration", iter
Exemple #55
0
def evaluate_srcnn(learning_rate=0.1, n_epochs=200,
                    nkerns=[20, 50], batch_size=500):
    
    rng = numpy.random.RandomState(123)

    datasets = load_data()

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    index = T.lscalar()

    x = T.matrix('x')
    y = T.matrix('y')

    print '... building the model'

    layer0_input = x.reshape((batch_size, 1, 800, 600))

    layer0 = conv_layer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, 800, 600),
        filter_shape=(nkerns[0], 1, 121, 1)
    )

    layer1 = conv_layer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 680, 600),
        filter_shape=(nkerns[1], nkerns[0], 1, 121)
    )

    layer2 = conv_layer(
        rng,
        input=layer1.output,
        image_shape=(batch_size, nkerns[1], 680, 480),
        filter_shape=(nkerns[1], nkerns[1], 16, 16)
    )

    layer3 = conv_layer(
        rng,
        input=layer2.output,
        image_shape=(batch_size, nkerns[1], 665, 465),
        filter_shape=(nkerns[1], nkerns[1], 8, 8)
    )

    cost = T.sum((layer3.output - layer0_input) ** 2)
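    # NOTE: layer3.output is spatially smaller than layer0_input (each 'valid'
    # convolution shrinks the feature maps), so in practice the target would need
    # to be cropped, or the convolutions padded, before this squared-error
    # reconstruction cost can be evaluated.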

    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    params = layer3.params + layer2.params + layer1.params + layer0.params

    grads = T.grad(cost, params)

    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    print '... training'

    patience = 10000
    patience_increase = 2

    improvement_threshold = 0.995

    validation_frequency = min(n_train_batches, patience / 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i)
                        for i in xrange(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Exemple #56
0
from load_data import *

import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

data = load_data()
# data = scale(raw)

pca = PCA(n_components=9)
pca.fit(data)
print type(pca.explained_variance_ratio_)

cumulative_evr = np.zeros(8)
cumulative_evr[0] = pca.explained_variance_ratio_[0]
for i in range(1, 8):
    cumulative_evr[i] = cumulative_evr[i - 1] + pca.explained_variance_ratio_[i]
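# equivalently: cumulative_evr = np.cumsum(pca.explained_variance_ratio_[:8])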

plt.figure(1)
index = np.arange(9) + 1
plt.bar(index, pca.explained_variance_ratio_, 0.35, color="g")
plt.ylim(0, 1)
plt.xlim(1, 9)
plt.title("Bar Plot for tge Explained Variance Ratio of Each Dimension")
plt.xlabel("dimension")
plt.ylabel("EVR")

plt.show()
Exemple #57
0
def test_SdA(finetune_lr=0.1, pretraining_epochs=1,
             pretrain_lr=0.001, training_epochs=1, 
             b_patch_filename = 'b_Training_patches_norm.npy', b_groundtruth_filename = 'b_Training_labels_norm.npy',
             b_valid_filename = 'b_Validation_patches_norm.npy', b_validtruth_filename = 'b_Validation_labels_norm.npy',
             u_patch_filename = 'u_Training_patches_norm.npy', u_groundtruth_filename = 'u_Training_labels_norm.npy',
             u_valid_filename = 'u_Validation_patches_norm.npy', u_validtruth_filename = 'u_Validation_labels_norm.npy',
             batch_size=100, n_ins = 605, n_outs = 5, hidden_layers_sizes = [1000,1000,1000],prefix = '11_11_3_G4_', corruption_levels=[0.2,0.2,0.2], resumeTraining = False, StopAtPretraining = False):
                 
    """
    Demonstrates how to train and test a stacked denoising autoencoder (SdA).

    The original deeplearning.net tutorial demonstrates this on MNIST; here the
    data come from the pre-extracted patch/label .npy files passed in above.

    :type finetune_lr: float
    :param finetune_lr: learning rate used in the finetune stage
    (factor for the stochastic gradient)

    :type pretraining_epochs: int
    :param pretraining_epochs: number of epochs to do pretraining

    :type pretrain_lr: float
    :param pretrain_lr: learning rate to be used during pre-training

    :type training_epochs: int
    :param training_epochs: maximal number of epochs to run the fine-tuning optimizer

    :type b_patch_filename etc.: string
    :param b_patch_filename etc.: paths to the pre-extracted .npy patch and label files

    """
   
    print '###########################'
    print 'Pretraining epochs: ', pretraining_epochs
    print 'Finetuning epochs: ', training_epochs
    print '###########################'
    
    W = []
    b = []
    
    #########################################################
    #########################################################
    
    #@@@@@@@@ Needs to be worked on @@@@@@@@@@@@@@@@@
    # Snippet to resume training if the program crashes halfway through #
    opts, arg = getopt.getopt(sys.argv[1:],"rp:")
    for opt, arg in opts:
        if opt == '-r':
            resumeTraining = True                               # make this true to resume training from saved model    
        elif opt == '-p':
            prefix = arg
            
    flag = 0
    
    if(resumeTraining):
        
        flag = 1
        
        path = '/media/brain/1A34723D34721BC7/BRATS/codes/results/test_255_9x9x3/9x9x3pre_training.pkl'
                
        savedModel_preTraining = file(path,'rb')
        genVariables_preTraining = cPickle.load(savedModel_preTraining)
        layer_number, epochs_done_preTraining, mean_cost , pretrain_lr = genVariables_preTraining
        epoch_flag = 1
        print 'Inside resumeTraining!!!!!!!!!!!!!!!!!!'
        no_of_layers = len(hidden_layers_sizes) + 1
        
        for i in xrange(no_of_layers):
            W.append(cPickle.load(savedModel_preTraining))
            b.append(cPickle.load(savedModel_preTraining))    
   
              
    ##############################################################
    ##############################################################

    if flag == 0:
                
        datasets = load_data(b_patch_filename,b_groundtruth_filename,b_valid_filename,b_validtruth_filename)
    
        train_set_x, train_set_y = datasets[0]
        valid_set_x, valid_set_y = datasets[1]
        test_set_x, test_set_y = datasets[2]
        
    
        # compute number of minibatches for training, validation and testing
        n_train_batches = train_set_x.get_value(borrow=True).shape[0]
        n_train_batches /= batch_size
    
        # numpy random generator
        # start-snippet-3
        numpy_rng = numpy.random.RandomState(89677)
        print '... building the model'
        
    #    print 'W: ', W
    #    print 'b: ', b
        
        ################################################################
        ################CONSTRUCTION OF SdA CLASS#######################
        sda = SdA(
            numpy_rng=numpy_rng,
            n_ins=n_ins,
            hidden_layers_sizes=hidden_layers_sizes,
            n_outs=n_outs)
            
        print 'SdA constructed'
        ################################################################
        ################################################################
        
        ################################################################
        # end-snippet-3 start-snippet-4
        #########################
        # PRETRAINING THE MODEL #
        #########################
    
        flag = open(prefix+'flag.pkl','wb')
        cPickle.dump(1,flag, protocol = cPickle.HIGHEST_PROTOCOL)
        flag.close()
            
        print '... getting the pretraining functions'
        pretraining_fns = sda.pretraining_functions(train_set_x=train_set_x,batch_size=batch_size)
        print 'Length of pretraining function: ', len(pretraining_fns)

        print '... pre-training the model'
        start_time = time.clock()
        ## Pre-train layer-wise
        log_pretrain_cost = []

        

        shapeimg = [(33,44),(50,60), (25,40), (50,10)]

        #corruption_levels = [.001, .001, .001]
        for i in xrange(sda.n_layers):
            
            # if i < layer_number:
            #     i = layer_number
                #print i
                # go through pretraining epochs
            best_cost = numpy.inf
            adapt_counter = 0
            learning_rate = pretrain_lr

            if i==0:
                num_of_epochs = pretraining_epochs
            else:
                num_of_epochs = pretraining_epochs
            for epoch in xrange(num_of_epochs):


                ##########################################            
                # if epoch_flag is 1 and epoch < epochs_done_preTraining:
                #     epoch = epochs_done_preTraining
                #     epoch_flag = 0
                    ##########################################
                    # go through the training set
                c = []
                for batch_index in xrange(n_train_batches):
                    #sprint batch_index
                    c.append(pretraining_fns[i](index=batch_index,
                         corruption=corruption_levels[i],
                         lr=learning_rate))
                print 'Pre-training layer %i, epoch %d, cost ' % (i, epoch),
                print numpy.mean(c)
                current_cost = numpy.mean(c)
                log_pretrain_cost.append(numpy.mean(c))
                if current_cost < best_cost:
                    best_cost = current_cost
                if current_cost > best_cost :
                    adapt_counter = adapt_counter+1
                # if adapt_counter>25:
                itr = epoch + 1
                learning_rate = learning_rate / ( 1 + itr * 5e-05)
                # print 'Reducing learning rate', learning_rate
                adapt_counter = 0

            
                previous_cost = current_cost
                

                if epoch%50 == 0 and epoch!=0 or epoch == 399 or epoch == 199:
                    image = Image.fromarray(tile_raster_images(
                        X=sda.params[2*i].get_value(borrow=True).T,
                        img_shape=shapeimg[i], tile_shape=(40,hidden_layers_sizes[i]/20),
                        tile_spacing=(1, 1)))
                    image.save(prefix+str(i) + '_' + str(epoch)+'.png')
        

            save_valid = open(prefix+'pre_training.pkl', 'wb')
            genVariables = ['gen']
            cPickle.dump(genVariables,save_valid,protocol = cPickle.HIGHEST_PROTOCOL)
            for j in xrange(len(sda.params)):
                cPickle.dump(sda.params[j].get_value(borrow=True), save_valid, protocol = cPickle.HIGHEST_PROTOCOL)
            save_valid.close()


        pretrain_log_file = open(prefix + 'log_pretrain_cost.txt', "a")
        for l in log_pretrain_cost:
            pretrain_log_file.write("%f\n"%l)
        pretrain_log_file.close()
        
        # for k in [0,2,4,6]:
        #     print k
        #     image = Image.fromarray(tile_raster_images(
        #        X=sda.params[k].get_value(borrow=True).T,
        #        img_shape=shapeimg[k/2], tile_shape=(40,hidden_layers_sizes[k/2]/20),
        #        tile_spacing=(1, 1)))
        #     image.save(prefix+str(k/2)+'.png')


        #print sda.params[0]
        end_time = time.clock()

        print >> sys.stderr, ('The pretraining code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
                          
                     
    print '###################'
    # end-snippet-4
    ########################
    # FINETUNING THE MODEL #
    ########################

    # get the training, validation and testing function for the model   


    if flag == 1:
    
        datasets = load_data(u_patch_filename,u_groundtruth_filename,u_valid_filename,u_validtruth_filename)
        train_set_x, train_set_y = datasets[0]
        valid_set_x, valid_set_y = datasets[1]
        test_set_x, test_set_y = datasets[2]
        n_train_batches = train_set_x.get_value(borrow=True).shape[0]
        
        n_train_batches /= batch_size
        
        numpy_rng = numpy.random.RandomState(89677)
        print '... building the model'
        
    #    print 'W: ', W
    #    print 'b: ', b
        
        ################################################################
        ################CONSTRUCTION OF SdA CLASS#######################
        sda = SdA(
            numpy_rng=numpy_rng,
            n_ins=n_ins,
            hidden_layers_sizes=hidden_layers_sizes,
            n_outs=n_outs, W = W, b = b)
        
        print 'SdA constructed'
        
    if StopAtPretraining == False:  
        
        print '... getting the finetuning functions'
        train_fn, validate_model, test_model = sda.build_finetune_functions(datasets=datasets,batch_size=batch_size)
        print batch_size

        print '... finetuning the model'
        ########################confusion matrix Block 1##########################    
        prediction = sda.get_prediction(train_set_x,batch_size)
        y_truth = np.load(u_groundtruth_filename)
        y_truth = y_truth[0:(len(y_truth)-(len(y_truth)%batch_size))]
        cnf_freq = 1
        ##################################################################  
        # early-stopping parameters
        patience = 40 * n_train_batches  # look at this many examples regardless
        patience_increase = 10.  # wait this much longer when a new best is
                                # found
        improvement_threshold = 0.995  # a relative improvement of this much is
                                       # considered significant
        validation_frequency = min(n_train_batches, patience / 2)
                                      # go through this many
                                      # minibatches before checking the network
                                      # on the validation set; in this case we
                                      # check every epoch

        best_validation_loss = numpy.inf
        test_score = 0.
        start_time = time.clock()

        finetune_lr_initial = finetune_lr

        done_looping = False
        epoch = 0
        flag = open(prefix+'flag.pkl','wb')
        cPickle.dump(2,flag, protocol = cPickle.HIGHEST_PROTOCOL)
        flag.close()
        
        log_valid_cost=[]
        adapt_counter = 0
        while (epoch < training_epochs) and (not done_looping):
            
    #        if epochFlag_fineTuning is 1 and epoch < epochs_done_fineTuning:
    #            epoch = epochs_done_fineTuning
    #            epochFlag_fineTuning = 0
                
            epoch = epoch + 1
            ################################confusion matrix block 2#################
            if epoch%cnf_freq==0:
                pred_c = np.array([])
                for minibatch_index in xrange(n_train_batches):
                    pred_c = np.concatenate([pred_c,np.array(prediction(minibatch_index))])
            
                cnf_matrix = confusion_matrix(y_truth, pred_c)
                print cnf_matrix
            ##########################################################################
            c = []
            for minibatch_index in xrange(n_train_batches):
                minibatch_avg_cost = train_fn(index=minibatch_index,lr=finetune_lr)
                c.append(minibatch_avg_cost)
    #            if iterFlag is 1 and iter < iters_done:
    #                iter = iters_done
    #                iterFlag = 0
                        
                iter = (epoch - 1) * n_train_batches + minibatch_index

                if (iter + 1) % validation_frequency == 0:
                    validation_losses = validate_model()
                    this_validation_loss = numpy.mean(validation_losses)
                    print('epoch %i, minibatch %i/%i, validation error %f %%' %
                          (epoch, minibatch_index + 1, n_train_batches,
                           this_validation_loss * 100.))
                    log_valid_cost.append(this_validation_loss)

                    # if we got the best validation score until now
                    if this_validation_loss < best_validation_loss:

                        #improve patience if loss improvement is good enough
                        if (
                            this_validation_loss < best_validation_loss *
                            improvement_threshold
                        ):
                            patience = max(patience, iter * patience_increase)

                        # save best validation score and iteration number
                        best_validation_loss = this_validation_loss
                        best_iter = iter
                        
                        
                        print 'Saving the best validation network'
                        genVariables = [epoch,best_validation_loss,finetune_lr,patience,iter]
                        save_file = open(prefix+'fine_tuning.pkl','wb')
                        cPickle.dump(hidden_layers_sizes, save_file)
                        cPickle.dump(genVariables, save_file)
                        for j in xrange(len(sda.params)):
                            cPickle.dump(sda.params[j].get_value(borrow=True), save_file, protocol = cPickle.HIGHEST_PROTOCOL)
                        save_file.close()
                        
                        
                    
                        # test it on the test set
                        test_losses = test_model()
                        test_score = numpy.mean(test_losses)
                        print(('     epoch %i, minibatch %i/%i, test error of '
                               'best model %f %%') %
                              (epoch, minibatch_index + 1, n_train_batches,
                               test_score * 100.))
                               
                        print 'Training cost: ', np.mean(c)
                    else:
                        adapt_counter = adapt_counter+1
                    if adapt_counter>20:
                        adapt_counter=0
                        finetune_lr = 0.8*finetune_lr
                        print 'Reduced learning rate : ', finetune_lr

                    else:
                        finetune_lr = finetune_lr_initial / (1 + epoch * 5e-05)
                        
                #if patience <= iter:
                #    done_looping = True
                #    break

        end_time = time.clock()
        print(
            (
                'Optimization complete with best validation score of %f %%, '
                'on iteration %i, '
                'with test performance %f %%'
            )
            % (best_validation_loss * 100., best_iter + 1, test_score * 100.)
        )
        print >> sys.stderr, ('The training code for file ' +
                              os.path.split(__file__)[1] +
                              ' ran for %.2fm' % ((end_time - start_time) / 60.))

        valid_file = open(prefix+'log_valid_error.txt', 'w')
        valid_file.write('Best validation error: '+str(best_validation_loss*100))
        valid_file.write('\nBest test error: '+str(test_score*100))
        valid_file.close()
        finetune_log_file = open(prefix + 'log_finetune_cost.txt', "a")
        for l in log_valid_cost:
            finetune_log_file.write("%f\n"%l)
        finetune_log_file.close()