# Assumes scikit-learn's LogisticRegression; load_train_data, reformat_dataset,
# accuracy, print_accuracy, load_test_data and label_matrices_to_csv are
# module-level helpers defined elsewhere in the repository.
from sklearn.linear_model import LogisticRegression


def run_logistic_regression(train_subset=45000, valid_size=5000, test=False):
  train_dataset, train_labels = load_train_data()
  # Flatten the 4-D image array into a 2-D (num_examples, num_features) matrix
  train_dataset = reformat_dataset(train_dataset)

  # Hold out the first valid_size examples for validation
  valid_dataset = train_dataset[:valid_size, :]
  valid_labels = train_labels[:valid_size]
  train_dataset = train_dataset[valid_size:valid_size + train_subset, :]
  train_labels = train_labels[valid_size:valid_size + train_subset]
  print('Training set size:', train_dataset.shape, train_labels.shape)
  print('Validation set size:', valid_dataset.shape, valid_labels.shape)

  print('Training...')
  logreg = LogisticRegression()
  logreg.fit(train_dataset, train_labels)

  train_predict = logreg.predict(train_dataset)
  valid_predict = logreg.predict(valid_dataset)

  train_accuracy = accuracy(train_predict, train_labels)
  valid_accuracy = accuracy(valid_predict, valid_labels)
  print_accuracy(train_accuracy, valid_accuracy)

  # Only predict on the test data when explicitly requested
  if not test:
    return

  print('Predicting test dataset...')
  test_dataset = load_test_data()
  # Flatten the 4-D test array the same way as the training data
  test_dataset = test_dataset.reshape(test_dataset.shape[0], -1)

  test_predict = logreg.predict(test_dataset)
  label_matrices_to_csv(test_predict, 'submission.csv')
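
The helpers used above (load_train_data, reformat_dataset, accuracy, print_accuracy, label_matrices_to_csv) live in the surrounding repository and are not shown here. Below is a minimal, hypothetical sketch of two of them, inferred only from how they are called; the real implementations may differ.

import numpy as np

def reformat_dataset(dataset):
  # Assumption: flattens a 4-D (num_examples, width, height, channels) image
  # array into a 2-D (num_examples, num_features) matrix, mirroring the
  # reshape applied to the test data above.
  return dataset.reshape(dataset.shape[0], -1)

def accuracy(predictions, labels):
  # Assumption: fraction of predicted class labels that match the true
  # labels, expressed as a percentage.
  return 100.0 * np.mean(predictions == labels)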
Example #2
# Assumes module-level helpers load_train_data, reformat_dataset and
# reformat_labels defined elsewhere in the repository.
import numpy as np


def get_train_valid_data(train_subset=45000,
                         valid_size=5000,
                         reformat_data=True,
                         reformat_label=True):
    """
  Get dataset from cifar10_train.pickle file, convert the data type of numpy.float32, and 
  separate the dataset into training set and validation set. 

  Take note that train_subset + valid_size cannot be more than 50000.

  Keyword arguments:
    train_subset -- the number of training set
    valid_size -- the number of validation set
    reformat_data -- if True, reformat the dataset to 2 dimension matrix. Else, keep the dataset 
    as 4 dimension matrix
    reformat_label -- if True, reformat the labels to (n X num_labels) dimension matrix. Else, 
    keep the labels as 2 dimension matrix
  """
    if train_subset + valid_size > 50000:
        raise ValueError('train_subset + valid_size cannot be more than 50000')

    train_dataset, train_labels = load_train_data()

    if reformat_data:
        # Flatten the 4-D image array into a 2-D (num_examples, num_features) matrix
        train_dataset = reformat_dataset(train_dataset)

    if reformat_label:
        # Expand the labels into an (n, num_labels) matrix
        train_labels = reformat_labels(train_labels)

    train_dataset = train_dataset.astype(np.float32)
    train_labels = train_labels.astype(np.float32)

    # Create a validation dataset
    valid_dataset = train_dataset[:valid_size]
    valid_labels = train_labels[:valid_size]
    train_dataset = train_dataset[valid_size:valid_size + train_subset]
    train_labels = train_labels[valid_size:valid_size + train_subset]
    print('Training set size:', train_dataset.shape, train_labels.shape)
    print('Validation set size:', valid_dataset.shape, valid_labels.shape)

    return train_dataset, train_labels, valid_dataset, valid_labels
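
A short usage sketch for the function above. The expected shapes assume CIFAR-10 (50000 training images of 32x32x3 pixels, 10 classes) and that reformat_dataset flattens images while reformat_labels produces an (n, num_labels) matrix, as the docstring describes; the helper behavior is otherwise repo-specific.

train_X, train_y, valid_X, valid_y = get_train_valid_data(train_subset=45000,
                                                          valid_size=5000)
# With the defaults this should print something like:
#   Training set size: (45000, 3072) (45000, 10)
#   Validation set size: (5000, 3072) (5000, 10)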