def __init__(self, scaled_features, labels, num_samples, scikit_balancing):
        """
    scaled_features: Must contain the features, all scaled to the same range.
    labels: Labels corresponding to scaled_features.
    num_samples: How many random data points to sample from scaled_features for
    training the feature selector models.
    """
        # select a smaller sample for feature selection
        indices = numpy.random.choice(scaled_features.shape[0],
                                      num_samples,
                                      replace=False)
        l1_svm_features = scaled_features[indices, :]
        l1_svm_labels = labels[indices]

        # Manually balance the data. Do this on the whole data set, because we are training
        # the feature selection on all of the data.
        self.features, self.labels, self.penalty_weights = utils.prepare_train_data(
            l1_svm_features, l1_svm_labels, scikit_balancing, -1)

        # a dictionary from svm cost to trained selector model
        self.feature_selector_models = dict()

        # Keeps track of the latest model trained for feature selection. All calls for data
        # transformation or feature coefficients are done based on this trained model.
        self.current_model = None
        self.current_transformer = None
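
A minimal usage sketch of this selector, assuming it is exposed as feature_selection.feature_selector and driven the way the later examples drive it (the variable names and the num_samples value here are illustrative):

scaled_train, scaled_test = utils.scale_data(train_features, test_features, 'minmax')
# num_samples must not exceed the number of rows, since sampling is without replacement
selector = feature_selection.feature_selector(scaled_train, train_labels,
                                              num_samples=5000, scikit_balancing=False)
selector.select_optimal_set(num_jobs=4)
train_features = selector.transform(train_features)
test_features = selector.transform(test_features)
print("Kept %d features" % len(selector.get_selected_features()))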
Example #2
def train(model, config):
    input_, label_ = prepare_train_data(config)

    model.train_op = tf.train.AdamOptimizer(
        learning_rate=config.learning_rate).minimize(model.loss)
    tf.global_variables_initializer().run()

    counter = 0
    time_ = time.time()

    model.load("checkpoint")

    print("Starting to train on {} images".format(input_.shape))
    for ep in range(config.epoch):
        batch_i = len(input_) // config.batch_size
        for idx in range(0, batch_i):
            batch_images = input_[idx * config.batch_size:(idx + 1) *
                                  config.batch_size]
            batch_labels = label_[idx * config.batch_size:(idx + 1) *
                                  config.batch_size]
            counter += 1
            _, err = model.sess.run([model.train_op, model.loss],
                                    feed_dict={
                                        model.images: batch_images,
                                        model.labels: batch_labels
                                    })
            if counter % 100 == 0:
                print(
                    "Epoch: [%2d], step: [%2d], time: [%4.4f], loss: [%.8f]" %
                    ((ep + 1), counter, time.time() - time_, err))
            if counter % 1000 == 0:
                model.save("checkpoint", counter)
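
To make the batching arithmetic above concrete, here is a tiny self-contained sketch of the same slicing in plain NumPy (no TensorFlow); note that trailing samples that do not fill a whole batch are dropped by the integer division:

import numpy as np

data = np.arange(10).reshape(10, 1)     # 10 samples
batch_size = 3
num_batches = len(data) // batch_size   # 3 full batches; the 10th sample is skipped
for idx in range(num_batches):
    batch = data[idx * batch_size:(idx + 1) * batch_size]
    print(idx, batch.ravel())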
Example #3
def train_logistic(train_features, train_labels, test_features,
                   scikit_balancing, train_size, skip_feature_selection,
                   skip_grid_search, penalty, cost, dual, tol, num_jobs):
    """
  Performs all the data transformations on test data and returns the trained model and the
  transformed test data
  """
    # balance the train data set and create requested train size.
    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, train_size)

    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN",
                      strategy='mean',
                      axis=0,
                      copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    if not skip_feature_selection:
        # feature selector expects scaled features
        (scaled_train_features,
         scaled_test_features) = utils.scale_data(train_features,
                                                  test_features, 'minmax')
        feature_selector_obj = feature_selection.feature_selector(
            scaled_train_features, train_labels, len(train_labels),
            scikit_balancing)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    # Grid search was requested: find the best parameters that achieve the highest average recall.
    if not skip_grid_search:
        algorithm = "logistic"
        clf = grid_search.grid_search("macro-recall", train_features,
                                      train_labels, scikit_balancing,
                                      algorithm, num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {} ".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        penalty = params['penalty']
        cost = params['C']

    # Now perform the training on the full train data and check on the test data.
    model = LogisticRegression(penalty=penalty,
                               dual=dual,
                               C=cost,
                               tol=tol,
                               max_iter=5000,
                               class_weight=penalty_weights)
    model = model.fit(train_features, train_labels)

    return (model, train_features, train_labels, test_features)
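
A hedged usage sketch of train_logistic; the argument values are illustrative only, and the evaluation step mirrors the other examples in this file:

model, train_features, train_labels, test_features = train_logistic(
    train_features, train_labels, test_features,
    scikit_balancing=False, train_size=-1,
    skip_feature_selection=False, skip_grid_search=False,
    penalty='l2', cost=1.0, dual=False, tol=0.0005, num_jobs=4)
predicted_labels = model.predict(test_features)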
Example #4
def train_and_save_model():
    print("reading data..")
    train_x, train_y, test_x, test_y = prepare_train_data()
    assert train_x.shape[0] == train_y.shape[0]
    assert test_x.shape[0] == test_y.shape[0]

    print("start classifying..")
    model = train_model(train_x, train_y, test_x, test_y)
    model.save(MODEL_DIR, save_format='h5')
Example #5
def perform_single_svm(input_data):
    """ Perform a single trial of svm with selected features
  """
    # extract inputs from input tuple
    features = input_data[0]
    labels = input_data[1]
    svm_kernel = input_data[2]
    svm_gamma = input_data[3]
    svm_cost = input_data[4]
    svm_degree = input_data[5]
    scikit_balancing = input_data[6]
    test_size = input_data[7]

    tolerance = 0.005
    cache_size = 6000

    # VERY IMPORTANT: Provide a random state, since it seems like multiple workers split the
    # data in the same way
    random.seed()
    train_features, test_features, train_labels, test_labels = (
        model_selection.train_test_split(features,
                                         labels,
                                         test_size=test_size,
                                         random_state=random.randint(
                                             1, 99999999)))

    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, -1)

    model = svm.SVC(tol=tolerance,
                    cache_size=cache_size,
                    class_weight=penalty_weights,
                    kernel=svm_kernel,
                    gamma=svm_gamma,
                    C=svm_cost,
                    degree=svm_degree)
    model = model.fit(train_features, train_labels)
    predicted_labels = model.predict(test_features)
    label_values = [0, 1]
    trial_metrics = compute_evaluation_metrics(predicted_labels, test_labels,
                                               label_values)
    return trial_metrics
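
Because perform_single_svm takes a single positional tuple, it pairs naturally with multiprocessing.Pool, as the later examples do; a minimal dispatch sketch with illustrative parameter values:

import multiprocessing

# (features, labels, kernel, gamma, cost, degree, scikit_balancing, test_size)
trial_args = [(features, labels, 'rbf', 0.01, 10.0, 3, False, 0.2)
              for _ in range(8)]  # 8 independent trials
pool = multiprocessing.Pool(processes=4)
all_trial_metrics = pool.map(perform_single_svm, trial_args)
pool.close()
pool.join()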
Example #6
def perform_single_random_forest(input_data):
    """ Perform a single trial of random forest with selected features
  """
    # extract inputs from input tuple
    features = input_data[0]
    labels = input_data[1]
    rf_num_trees = input_data[2]
    rf_criterion = input_data[3]
    rf_max_features = input_data[4]
    rf_min_samples_split = input_data[5]
    rf_min_samples_leaf = input_data[6]
    scikit_balancing = input_data[7]
    test_size = input_data[8]

    # VERY IMPORTANT: Provide a random state, since it seems like multiple workers split the
    # data in the same way
    random.seed()
    train_features, test_features, train_labels, test_labels = (
        model_selection.train_test_split(features,
                                         labels,
                                         test_size=test_size,
                                         random_state=random.randint(
                                             1, 99999999)))

    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, -1)

    model = RandomForestClassifier(n_estimators=rf_num_trees,
                                   n_jobs=-1,
                                   criterion=rf_criterion,
                                   max_features=rf_max_features,
                                   min_samples_split=rf_min_samples_split,
                                   min_samples_leaf=rf_min_samples_leaf)
    model = model.fit(train_features,
                      train_labels,
                      sample_weight=penalty_weights)
    predicted_labels = model.predict(test_features)
    label_values = [0, 1]
    trial_metrics = compute_evaluation_metrics(predicted_labels, test_labels,
                                               label_values)
    return trial_metrics
Example #7
def perform_single_logistic(input_data):
    """ Perform a single trial of logistic regression with selected features
  """
    # extract inputs from input tuple
    features = input_data[0]
    labels = input_data[1]
    logistic_penalty = input_data[2]
    logistic_cost = input_data[3]
    scikit_balancing = input_data[4]
    test_size = input_data[5]

    tolerance = 0.0005
    max_iterations = 10000

    # VERY IMPORTANT: Provide a random state, since it seems like multiple workers split the
    # data in the same way
    random.seed()
    train_features, test_features, train_labels, test_labels = (
        model_selection.train_test_split(features,
                                         labels,
                                         test_size=test_size,
                                         random_state=random.randint(
                                             1, 99999999)))

    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, -1)

    model = LogisticRegression(penalty=logistic_penalty,
                               C=logistic_cost,
                               tol=tolerance,
                               max_iter=max_iterations,
                               class_weight=penalty_weights)
    model = model.fit(train_features, train_labels)
    predicted_labels = model.predict(test_features)
    label_values = [0, 1]
    trial_metrics = compute_evaluation_metrics(predicted_labels, test_labels,
                                               label_values)
    return trial_metrics
Example #8
def main(_):
    all_sentence, all_tags, all_intent, vocab, dictionary, tags_list, tags_dict, intent_list, intent_dict = prepare_train_data(
        FLAGS.train_data_file, FLAGS.vocab_size)
    train_data, dev_data = split_data(all_sentence, all_tags, all_intent)
    # train_sentence, train_tags, train_intent = train_data
    # dev_sentence, dev_tags, dev_intent = dev_data

    output_path = os.path.join(sys.path[0], 'runs', str(int(time.time())))
    checkpoint_dir = os.path.join(output_path, 'checkpoints')
    os.makedirs(checkpoint_dir, mode=0o755, exist_ok=True)

    save_vocabulary(os.path.join(output_path, 'sentence_vocab'), vocab)
    save_vocabulary(os.path.join(output_path, 'tag_vocab'), tags_list)
    save_vocabulary(os.path.join(output_path, 'intent_vocab'), intent_list)

    model = RNNModel(hidden_size=FLAGS.hidden_size,
                     embed_size=FLAGS.embedding_size,
                     source_vocab_size=len(vocab),
                     tag_vocab_size=len(tags_list),
                     intent_vocab_size=len(intent_list))

    with tf.Session(graph=model.graph) as sess:
        sess.run(tf.global_variables_initializer())

        step = 1
        avg_tag_loss = 0
        avg_intent_loss = 0
        for epoch in range(FLAGS.num_epoch):
            batch_gen = batch_generator(*train_data)
            for sentence_batch, length_batch, tags_batch, intent_batch in batch_gen:
                _, tag_loss, intent_loss = sess.run(
                    [model.train_op, model.tag_loss, model.intent_loss],
                    feed_dict={
                        model.input_x: sentence_batch,
                        model.input_len: length_batch,
                        model.input_tag: tags_batch,
                        model.input_intent: intent_batch,
                        model.keep_prob: FLAGS.dropout_keep_prob
                    })
                avg_tag_loss += tag_loss
                avg_intent_loss += intent_loss
                if step % 20 == 0:
                    avg_tag_loss /= 20
                    avg_intent_loss /= 20
                    print('Step', step, 'Avg tag loss', avg_tag_loss,
                          'Avg intent loss', avg_intent_loss)
                    avg_tag_loss = 0
                    avg_intent_loss = 0
                step += 1

            correct_tag, total_tag = 0, 0
            correct_intent, total_intent = 0, 0
            for sentence, tags, intent in zip(*dev_data):
                predict_tags, predict_intent = sess.run(
                    [model.output_tag, model.output_intent],
                    feed_dict={
                        model.input_x: [sentence],
                        model.input_len: [len(sentence)],
                        model.keep_prob: 1.0
                    })
                for tag1, tag2 in zip(tags, predict_tags[0]):
                    if tag1 == tag2:
                        correct_tag += 1
                    total_tag += 1
                if intent == predict_intent[0]:
                    correct_intent += 1
                total_intent += 1
            tag_accuracy = correct_tag / total_tag
            intent_accuracy = correct_intent / total_intent
            print('[Validation]', 'tag acc =', tag_accuracy, ', intent acc =',
                  intent_accuracy, '\n')
            model.saver.save(
                sess,
                os.path.join(
                    checkpoint_dir,
                    '{}_{:.4f}_{:.4f}.ckpt'.format(epoch, tag_accuracy,
                                                   intent_accuracy)))
Example #9
def main():
  df = pandas.read_csv(args.input_filename, index_col=False, header=0)
  data = df.values
  column_names = df.columns.values.tolist()
  
  # Extract features/labels and their names from raw data
  features = data[:, 0:args.label_column]
  labels = data[:, args.label_column].astype(int)
  feature_names = column_names[0:args.label_column]
  label_name = column_names[args.label_column]
  
  # We specify absolute train sizes so we can compare across different data sets with
  # different overall sizes. Need to do a dummy split into train and test, so we can
  # figure out the max possible train size after balancing.
  dummy_train_features, dummy_test_features, dummy_train_labels, dummy_test_labels = (
    model_selection.train_test_split(features, labels, test_size=args.test_size))
  dummy_train_features, dummy_train_labels, penalty_weights = utils.prepare_train_data(
      dummy_train_features, dummy_train_labels, args.scikit_balancing, -1)
  max_possible_train_size = dummy_train_features.shape[0]
  train_sizes = list(range(400, 15000, 100))
  train_sizes.extend(range(15000, min(max_possible_train_size, 30001), 500))
 
  metric_names = ["train_size",
                  "test_size", "test_female_size", "test_male_size",
                  "test_true_female", "test_false_female",
                  "test_true_male", "test_false_male",
                  "test_accuracy",
                  "test_female_precision", "test_male_precision",
                  "test_female_recall", "test_male_recall"]
  # mapping from train size to any of "accuracy", "precision"... to list of values, each
  # value corresponding to the result from one trial
  results = defaultdict(lambda: defaultdict(list))
  finished_trials = 0
  while finished_trials < args.num_trials:
    # Figure out how many parallel processes we should launch to satisfy number of trials.
    num_processes = min(args.num_processes, args.num_trials - finished_trials)
    replicated_data = list()
    for n in range(0, num_processes):
      # VERY IMPORTANT: Provide a random state, since it seems like multiple workers split
      # the data in the same way due to an identical initial random state
      random_seed = random.randint(1, 999999999)
      replicated_data.append((features, labels, train_sizes, random_seed))

    pool = multiprocessing.Pool(processes = num_processes)
    trials_metrics = pool.map(compute_trial_metrics, replicated_data)
    pool.close()
    finished_trials += num_processes

    # Add trial metrics to results by looping over different trials in a list
    for trial_metrics in trials_metrics:
      # loop over different train size in dict
      for train_size in train_sizes:
        metric_values = trial_metrics[train_size]
        # loop over different metrics
        for metric in metric_names:
          results[train_size][metric].append(metric_values[metric])
          
    print("\nFinished %d trials\n" % finished_trials)

  
  # generate output file and header
  output_file = open(args.output_filename, "w")
  output_file_writer = csv.writer(output_file)
  output_file_writer.writerow(metric_names)

  for train_size in train_sizes:
    output_file_writer.writerow([train_size,
                                 int(mean(results[train_size]["test_size"])),
                                 int(mean(results[train_size]["test_female_size"])),
                                 int(mean(results[train_size]["test_male_size"])),
                                 int(mean(results[train_size]["test_true_female"])),
                                 int(mean(results[train_size]["test_false_female"])),
                                 int(mean(results[train_size]["test_true_male"])),
                                 int(mean(results[train_size]["test_false_male"])),
                                 mean(results[train_size]["test_accuracy"]),
                                 mean(results[train_size]["test_female_precision"]),
                                 mean(results[train_size]["test_male_precision"]),
                                 mean(results[train_size]["test_female_recall"]),
                                 mean(results[train_size]["test_male_recall"])
                                ])
  output_file.close() 
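
For clarity, the nested defaultdict above collects one value per trial under each (train_size, metric) pair, and the rows written to the CSV are per-metric means; a small self-contained sketch with made-up numbers:

from collections import defaultdict
from statistics import mean

results = defaultdict(lambda: defaultdict(list))
results[400]["test_accuracy"].append(0.81)   # trial 1
results[400]["test_accuracy"].append(0.79)   # trial 2
print(mean(results[400]["test_accuracy"]))   # 0.8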
Example #10
def grid_search(score, features, labels, scikit_balancing, algorithm,
                num_jobs):
    """
  expects the features to be scaled!
  """
    # Now balance the train data set and create requested train size.
    features, labels, penalty_weights = utils.prepare_train_data(
        features, labels, scikit_balancing, -1)

    # Set the parameters for grid search and the model based on the algorithm choice
    if algorithm == 'kernel-svm':
        tuned_parameters = [{
            'kernel': ['rbf'],
            'gamma': [0.1, 0.01, 0.001, 0.0001],
            'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
        }, {
            'kernel': ['sigmoid'],
            'gamma': [0.1, 0.01, 0.001, 0.0001],
            'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
        }, {
            'kernel': ['poly'],
            'degree': [2, 3, 4],
            'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
        }]
        model = svm.SVC(tol=0.005,
                        cache_size=6000,
                        class_weight=penalty_weights)
    elif algorithm == 'linear-svm':
        tuned_parameters = [{
            'loss': ['hinge', 'squared_hinge'],
            'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
        }]
        model = svm.LinearSVC(tol=0.005,
                              max_iter=5000,
                              class_weight=penalty_weights)
    elif algorithm == 'logistic':
        tuned_parameters = [{
            'penalty': ['l1', 'l2'],
            'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
        }]
        model = LogisticRegression(tol=0.0005,
                                   max_iter=1000,
                                   class_weight=penalty_weights)
    elif algorithm == 'random-forest':
        tuned_parameters = [{
            'n_estimators': [100],
            'criterion': ['gini', 'entropy'],
            'max_features': ['sqrt', 'log2', 0.5, 0.8],
            'min_samples_split': [2],
            'min_samples_leaf': [1]
        }, {
            'n_estimators': [100],
            'criterion': ['gini', 'entropy'],
            'max_features': ['sqrt', 'log2', 0.5, 0.8],
            'min_samples_split': [5],
            'min_samples_leaf': [1, 2]
        }, {
            'n_estimators': [100],
            'criterion': ['gini', 'entropy'],
            'max_features': ['sqrt', 'log2', 0.5, 0.8],
            'min_samples_split': [10],
            'min_samples_leaf': [2, 5]
        }, {
            'n_estimators': [100],
            'criterion': ['gini', 'entropy'],
            'max_features': ['sqrt', 'log2', 0.5, 0.8],
            'min_samples_split': [20],
            'min_samples_leaf': [5, 10]
        }, {
            'n_estimators': [100],
            'criterion': ['gini', 'entropy'],
            'max_features': ['sqrt', 'log2', 0.5, 0.8],
            'min_samples_split': [50],
            'min_samples_leaf': [5, 15, 25]
        }]
        model = RandomForestClassifier(class_weight=penalty_weights)
    elif algorithm == 'knn':
        tuned_parameters = [{
            'n_neighbors':
            [1, 2, 3, 4, 5, 10, 15, 20, 30, 50, 70, 100, 150, 200],
            'metric': ['euclidean', 'manhattan', 'chebyshev'],
            'algorithm': ['ball_tree', 'kd_tree'],
            'weights': ['uniform', 'distance']
        }]
        model = KNeighborsClassifier()
    else:
        sys.exit('Invalid algorithm: ' + algorithm + ' provided')

    scorer = create_scorer(score)
    skf = StratifiedKFold(n_splits=5, shuffle=True)
    # Don't pre-dispatch all jobs at once, only dispatch the ones you are running so memory
    # usage does not blow up
    clf = GridSearchCV(estimator=model,
                       param_grid=tuned_parameters,
                       n_jobs=num_jobs,
                       pre_dispatch="n_jobs",
                       cv=skf,
                       scoring=scorer)

    clf.fit(features, labels)
    return clf
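
A hedged usage sketch of grid_search; the score name "macro-recall" matches the calls made elsewhere in this file, while the other argument values are illustrative:

clf = grid_search("macro-recall", train_features, train_labels,
                  scikit_balancing=False, algorithm='logistic', num_jobs=4)
print("Best params:", clf.best_params_)
print("Best CV score:", clf.cv_results_['mean_test_score'][clf.best_index_])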
Example #11
def train_knn(train_features, train_labels, test_features, imbalanced_data,
              train_size, scaling_method, minmax_min, minmax_max,
              skip_feature_selection, skip_grid_search, n_neighbors, weights,
              algorithm, metric, num_jobs):
    """
  Performs all the data transformations on test data and returns the trained model and the
  transformed test data
  """
    # balance the train data set and create the requested train size. Here, instead of
    # scikit balancing, we use the imbalanced_data flag and discard the last output since
    # it is irrelevant to knn. In order not to balance the data, the third argument should
    # be true (simulating scikit balancing); so we use the imbalanced_data flag in place of
    # scikit_balancing.
    train_features, train_labels, dummy = utils.prepare_train_data(
        train_features, train_labels, imbalanced_data, train_size)

    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN",
                      strategy='mean',
                      axis=0,
                      copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    # now that we have limited the data to requested train size, scale data
    (train_features,
     test_features) = utils.scale_data(train_features, test_features,
                                       scaling_method, minmax_min, minmax_max)

    if not skip_feature_selection:
        feature_selector_obj = feature_selection.feature_selector(
            train_features, train_labels, len(train_labels), imbalanced_data)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    # Grid search was requested: find the best parameters that achieve the highest average recall.
    if not skip_grid_search:
        algorithm = "knn"
        clf = grid_search.grid_search("macro-recall", train_features,
                                      train_labels, imbalanced_data, algorithm,
                                      num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {} ".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        n_neighbors = params['n_neighbors']
        weights = params['weights']
        algorithm = params['algorithm']
        metric = params['metric']

    # Now perform the training on the full train data and check on the test data.
    model = KNeighborsClassifier(n_neighbors=n_neighbors,
                                 weights=weights,
                                 algorithm=algorithm,
                                 metric=metric)
    model = model.fit(train_features, train_labels)

    return (model, train_features, train_labels, test_features)
Example #12
def train_svm(train_features, train_labels, test_features, scikit_balancing,
              train_size, scaling_method, minmax_min, minmax_max,
              skip_feature_selection, skip_grid_search, kernel, gamma, cost,
              degree, num_jobs):
    """ Balances, extracts the requested train size, imputes, scales and finally performs
  feature selection on the train data. Then it performs a grid search and trains a model
  using the best parameters.

  Performs all the data transformations on test data and returns the trained model and the
  transformed test data
  """
    # balance the train data set and create requested train size.
    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, train_size)

    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN",
                      strategy='mean',
                      axis=0,
                      copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    # now that we have limited the data to requested train size, scale data
    (train_features,
     test_features) = utils.scale_data(train_features, test_features,
                                       scaling_method, minmax_min, minmax_max)

    if not skip_feature_selection:
        feature_selector_obj = feature_selection.feature_selector(
            train_features, train_labels, len(train_labels), scikit_balancing)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    # Grid search was requested: find the best parameters that achieve the highest average recall.
    if not skip_grid_search:
        algorithm = "linear-svm" if kernel == "linear" else "kernel-svm"
        clf = grid_search.grid_search("macro-recall", train_features,
                                      train_labels, scikit_balancing,
                                      algorithm, num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {} ".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        if 'kernel' in params:
            kernel = params['kernel']
        if 'gamma' in params:
            gamma = params['gamma']
        if 'C' in params:
            cost = params['C']
        if 'degree' in params:
            degree = params['degree']

    # Now perform the training on the full train data and check on the test data.
    # We enable probability estimates, so that we can identify the top samples.
    model = svm.SVC(tol=0.05,
                    cache_size=6000,
                    class_weight=penalty_weights,
                    kernel=kernel,
                    gamma=gamma,
                    C=cost,
                    degree=degree,
                    probability=True)
    model = model.fit(train_features, train_labels)

    return (model, train_features, train_labels, test_features)
Example #13
def train_random_forest(train_features, train_labels, test_features,
                        scikit_balancing, train_size, skip_feature_selection,
                        skip_grid_search, max_features, n_estimators,
                        criterion, min_samples_split, min_samples_leaf,
                        num_jobs):
    """
  Performs all the data transformations on test data and returns the trained model and the
  transformed test data
  """
    # balance the train data set and create requested train size.
    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, train_size)

    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN",
                      strategy='mean',
                      axis=0,
                      copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    if not skip_feature_selection:
        # feature selector expects scaled features
        (scaled_train_features,
         scaled_test_features) = utils.scale_data(train_features,
                                                  test_features, 'minmax')
        feature_selector_obj = feature_selection.feature_selector(
            scaled_train_features, train_labels, len(train_labels),
            scikit_balancing)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    max_features = utils.extract_max_features(max_features)
    # Grid search was requested: find the best parameters that achieve the highest average recall.
    if not skip_grid_search:
        algorithm = "random-forest"
        clf = grid_search.grid_search("macro-recall", train_features,
                                      train_labels, scikit_balancing,
                                      algorithm, num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {} ".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        n_estimators = max(params['n_estimators'], n_estimators)
        criterion = params['criterion']
        max_features = params['max_features']
        min_samples_split = params['min_samples_split']
        min_samples_leaf = params['min_samples_leaf']

    # Now perform the training on the full train data and check on the test data.
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   n_jobs=num_jobs,
                                   criterion=criterion,
                                   max_features=max_features,
                                   min_samples_split=min_samples_split,
                                   min_samples_leaf=min_samples_leaf,
                                   class_weight=penalty_weights)
    model = model.fit(train_features, train_labels)

    return (model, train_features, train_labels, test_features)
Example #14
tf.flags.DEFINE_integer(
    "evaluate_every", 100,
    "Evaluate model on dev set after this many steps (default: 100)")
tf.flags.DEFINE_integer("checkpoint_every", 100,
                        "Save model after this many steps (default: 100)")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()

print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{} = {}".format(attr, value))
print("")

print("Dataset preparing...")
train_X, train_y = prepare_train_data(FLAGS.pos_data_file, FLAGS.neg_data_file)
train_X, test_X, train_y, test_y = train_test_split(train_X,
                                                    train_y,
                                                    test_size=FLAGS.test_perc)

model = SentimentPredictior(
    MAX_SENT_LEN,
    MAX_WORD_LEN,
    LETTERS_COUNT,
    filters=FLAGS.num_filters,
    lr=FLAGS.learning_rate,
    words_take=[int(i) for i in FLAGS.words_take.split(",")],
    dropout=FLAGS.dropout)

print("Start training...")
model.fit(train_X, train_y)
Example #15
def main():
    df = pandas.read_csv(args.input_filename, index_col=False, header=0)

    # figure out the name of the filtering column
    column_names = df.columns.values.tolist()
    filtering_columns = [
        'number_of_contacts__allweek__allday__call__mean',
        'number_of_contacts__call__mean'
    ]
    filtering_columns = [x for x in filtering_columns if x in column_names]
    if len(filtering_columns) == 2:
        sys.exit('Both columns ' + str(filtering_columns) +
                 ' are present in data.')
    if len(filtering_columns) == 0:
        sys.exit('None of columns ' + str(filtering_columns) +
                 ' are present in data.')
    filtering_column = filtering_columns[0]

    # figure out filtering thresholds that satisfy the requested train and test sizes. Need
    # to do a dummy split into train and test, so we can figure out the max possible train
    # size after balancing
    filtering_thresholds = list()
    for filtering_threshold in arange(0, 7, 0.5):
        data = df[df[filtering_column] >= filtering_threshold].values
        features = data[:, 0:args.label_column]
        labels = data[:, args.label_column].astype(int)
        dummy_train_features, dummy_test_features, dummy_train_labels, dummy_test_labels = (
            model_selection.train_test_split(features,
                                             labels,
                                             test_size=args.test_size))
        dummy_train_features, dummy_train_labels, penalty_weights = utils.prepare_train_data(
            dummy_train_features, dummy_train_labels, args.scikit_balancing,
            -1)
        # This is a good filtering threshold if the number of data points satisfying the
        # threshold exceeds the requested train size
        if dummy_train_features.shape[0] >= args.train_size:
            filtering_thresholds.append(filtering_threshold)
        else:
            break

    metric_names = [
        "min_active_days", "percentage_data", "train_size",
        "train_female_size", "train_male_size", "test_size",
        "test_female_size", "test_male_size", "test_true_female",
        "test_false_female", "test_true_male", "test_false_male",
        "test_accuracy", "test_AUC", "test_average_precision",
        "test_female_precision", "test_male_precision", "test_average_recall",
        "test_female_recall", "test_male_recall", "test_average_f1score",
        "test_female_f1score", "test_male_f1score"
    ]
    # mapping from filtering threshold to any of "accuracy", "precision"... to list of
    # values, each value corresponding to the result from one trial
    results = defaultdict(lambda: defaultdict(list))
    for trial in range(args.num_trials):
        random_seed = random.randint(1, 999999999)
        trial_metrics = compute_trial_metrics(df, filtering_thresholds,
                                              filtering_column, random_seed)

        # loop over different filtering thresholds in dict
        for filtering_threshold in filtering_thresholds:
            metric_values = trial_metrics[filtering_threshold]
            # loop over different metrics
            for metric in metric_names:
                results[filtering_threshold][metric].append(
                    metric_values[metric])

        print("\nFinished %d trials\n" % (trial + 1))

    # generate output file and header
    output_file = open(args.output_filename, "w")
    output_file_writer = csv.writer(output_file)
    output_file_writer.writerow(metric_names)

    for filtering_threshold in filtering_thresholds:
        output_file_writer.writerow([
            filtering_threshold,
            mean(results[filtering_threshold]["percentage_data"]),
            int(mean(results[filtering_threshold]["train_size"])),
            int(mean(results[filtering_threshold]["train_female_size"])),
            int(mean(results[filtering_threshold]["train_male_size"])),
            int(mean(results[filtering_threshold]["test_size"])),
            int(mean(results[filtering_threshold]["test_female_size"])),
            int(mean(results[filtering_threshold]["test_male_size"])),
            int(mean(results[filtering_threshold]["test_true_female"])),
            int(mean(results[filtering_threshold]["test_false_female"])),
            int(mean(results[filtering_threshold]["test_true_male"])),
            int(mean(results[filtering_threshold]["test_false_male"])),
            mean(results[filtering_threshold]["test_accuracy"]),
            mean(results[filtering_threshold]["test_AUC"]),
            mean(results[filtering_threshold]["test_average_precision"]),
            mean(results[filtering_threshold]["test_female_precision"]),
            mean(results[filtering_threshold]["test_male_precision"]),
            mean(results[filtering_threshold]["test_average_recall"]),
            mean(results[filtering_threshold]["test_female_recall"]),
            mean(results[filtering_threshold]["test_male_recall"]),
            mean(results[filtering_threshold]["test_average_f1score"]),
            mean(results[filtering_threshold]["test_female_f1score"]),
            mean(results[filtering_threshold]["test_male_f1score"])
        ])
    output_file.close()