Code example #1
def get_values(data_input):
    """
    For each given depth, calculate the maximum depth
    and accuracies of the unpruned and pruned tree
    """
    data = data_input.copy()
    np.random.shuffle(data)
    t_split = 0.8  # 80% train, 10% validation, 10% test
    v_split = 0.9
    train = data[:int(len(data) * t_split)]
    validation = data[int(len(data) * t_split):(int(len(data) * v_split))]
    test = data[int(len(data) * v_split):]
    depths = []
    pruned_depths = []
    accuracies = []
    pruned_accuracies = []

    for i in range(1, 20):
        model = binarySearchTree(train, limit=i)
        depth = model.get_max_depth()
        y_pred = [x.item() for x in model.predict(test[:, :-1])]
        y_true = [int(val.item()) for val in test[:, -1]]
        depths.append(depth)
        cm = ev.confusion_matrix(y_true, y_pred)
        accuracies.append(ev.get_class_rate(cm))

        model.prune_tree(validation)
        depth2 = model.get_max_depth()
        y_pred2 = [x.item() for x in model.predict(test[:, :-1])]
        y_true2 = [int(val.item()) for val in test[:, -1]]
        pruned_depths.append(depth2)
        cm2 = ev.confusion_matrix(y_true2, y_pred2)
        pruned_accuracies.append(ev.get_class_rate(cm2))
    return depths, accuracies, pruned_depths, pruned_accuracies
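
A minimal usage sketch (not from the original project), assuming `dataset` is a NumPy array with labels in the last column, as get_values expects, and that matplotlib is available:

import matplotlib.pyplot as plt

# `dataset` is hypothetical: features plus a final label column
depths, accs, pruned_depths, pruned_accs = get_values(dataset)
plt.plot(depths, accs, 'o-', label='unpruned')
plt.plot(pruned_depths, pruned_accs, 'x-', label='pruned')
plt.xlabel('maximum tree depth')
plt.ylabel('classification rate')
plt.legend()
plt.show()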
Code example #2
File: cross_validation.py Project: monikajot/ML_cw1
def grow_binary_trees(data_input, stratified=False, pruning=False):
    '''
    Splits the data set into 10 folds and uses each fold as a test set once.
    Calculates metrics for each fold so that standard errors can be computed.
    Data split - stratified or fully random. #TODO (stratification not yet implemented)
    '''

    # Create 10 folds: shuffle, then group rows into 10 equal folds
    # (assumes the number of rows is divisible by 10 and there are 8 columns)
    data = data_input.copy()
    np.random.shuffle(data)
    data = data.reshape((10, -1, 8))

    if pruning:
        results, results_pruned = {}, {}
        for i, test_fold in enumerate(data):
            results[i] = np.zeros((4, 4))
            results_pruned[i] = np.zeros((4, 4))

            train_val_data = np.delete(data, i, axis=0)

            for j, val_fold in enumerate(train_val_data):

                train_data = np.vstack(np.delete(train_val_data, j, axis=0))

                tree = binarySearchTree(train_data)

                results[i] += ev.confusion_matrix(test_fold[:, -1].astype(int),
                                                  tree.predict(test_fold),
                                                  normalised=True)

                tree.prune_tree(val_fold)

                results_pruned[i] += ev.confusion_matrix(
                    test_fold[:, -1].astype(int),
                    tree.predict(test_fold),
                    normalised=True)
            results[i] /= 9
            results_pruned[i] /= 9

        return results, results_pruned

    else:
        results = {}
        for i, test_fold in enumerate(data):

            train_data = np.vstack(np.delete(data, i, axis=0))

            tree = binarySearchTree(train_data)

            results[i] = ev.confusion_matrix(test_fold[:, -1].astype(int),
                                             tree.predict(test_fold),
                                             normalised=True)

        return results
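
One way to summarise the per-fold results returned above into a mean accuracy with a standard error; a sketch that assumes, as in the function, each value is a confusion matrix whose trace over its sum is the fold's accuracy (`data` is a hypothetical, 10-fold-able array):

import numpy as np

results = grow_binary_trees(data)
fold_accs = [np.trace(cm) / np.sum(cm) for cm in results.values()]
mean_acc = np.mean(fold_accs)
std_err = np.std(fold_accs, ddof=1) / np.sqrt(len(fold_accs))
print('accuracy: {:.3f} +/- {:.3f}'.format(mean_acc, std_err))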
Code example #3
File: trees.py Project: monikajot/ML_cw1
def get_f1(self, data):
    """
    Calculates the F1 score
    """
    pred = self.predict(data[:, :-1])
    cm = ev.confusion_matrix(data[:, -1], pred)
    return ev.get_f1_scores(cm)
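
ev.get_f1_scores itself is not shown here; for reference, per-class F1 can be derived from a confusion matrix as in this sketch (names are illustrative, not the project's API):

import numpy as np

def f1_from_confusion(cm):
    # cm[i, j] counts items of true class i predicted as class j
    cm = np.asarray(cm, dtype=float)
    precision = np.diag(cm) / np.maximum(cm.sum(axis=0), 1e-12)
    recall = np.diag(cm) / np.maximum(cm.sum(axis=1), 1e-12)
    return 2 * precision * recall / np.maximum(precision + recall, 1e-12)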
Code example #4
    def predict(self):
        correct_labels = []
        predicted_labels = []

        # Perform MAP classification
        with open(DATA_DIR + self.test_file) as f:
            for line in f:
                doc = line.split()
                correct_labels.append(self.classes[doc[0]])
                map_classifier = np.zeros(self.num_classes)
                # Calculate decision function on test doc features for each class
                for model_class in range(self.num_classes):
                    map_classifier[model_class] = math.log(
                        self.doc_counts[model_class] / np.sum(self.doc_counts))
                    if self.runmode == 'multinomial':
                        for word in doc[1:]:
                            word = word.split(':')
                            if word[0] in self.model:
                                map_classifier[model_class] += math.log(self.model[word[0]][model_class])
                    elif self.runmode == 'bernoulli':
                        words = [word.split(':')[0] for word in doc[1:]]
                        for word in self.model:
                            if word in words:
                                map_classifier[model_class] += math.log(self.model[word][model_class])
                            else:
                                map_classifier[model_class] += math.log(1. - self.model[word][model_class])
                # Get best classified class
                predicted_labels.append(np.argmax(map_classifier))

        # Get total accuracy
        correct_labels = np.array(correct_labels)
        predicted_labels = np.array(predicted_labels)
        accuracy = calc_accuracy(correct_labels, predicted_labels)
        logger.info('NB model is {0:.2f}% accurate on the {1} data with k = {2}.'
                    .format(accuracy, self.runmode, self.k))

        # Get confusion matrix with class accuracies
        cm = confusion_matrix(correct_labels, predicted_labels, self.num_classes)
        class_accuracies = [cm[n][n] for n in range(self.num_classes)]
        for n, x in enumerate(class_accuracies):
            logger.info('Class {0} has an accuracy of {1:.2f}%'.format(self.class_names[n], 100 * x))

        # Plot confusion matrix
        plt.figure(figsize=(30, 30))
        plt.imshow(cm, cmap=plt.get_cmap('Greens'), interpolation='nearest')
        plt.title('Confusion Matrix')
        plt.xticks(np.arange(self.num_classes), self.class_names, fontsize=8)
        plt.yticks(np.arange(self.num_classes), self.class_names, fontsize=10)
        plt.xlabel('Predictions')
        plt.ylabel('Truths')
        plt.colorbar()
        plt.show()
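
The loop above implements standard MAP classification: log prior plus summed log likelihoods, then an argmax. The multinomial branch reduced to a self-contained sketch (all names here are illustrative, not the class's attributes):

import numpy as np

def map_predict(doc_words, log_priors, log_likelihood):
    # log_priors[c] is log P(c); log_likelihood[word][c] is log P(word | c)
    scores = np.asarray(log_priors, dtype=float).copy()
    for word in doc_words:
        if word in log_likelihood:
            scores = scores + np.asarray(log_likelihood[word], dtype=float)
    return int(np.argmax(scores))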
Code example #5
def task_3_logistic(x, y, x_test, y_test, args):
    accuracies = []
    sizes = np.linspace(10, 200, num=20)
    N = y.shape[0]
    for size in sizes:
        acc = 0
        for i in range(50):
            rand = np.random.randint(int(N), size=int(size))
            m = LogisticRegression(x[rand], y[rand])
            m.fit(lr=args[0], eps=args[1], regularization=args[2])
            pred = m.predict(x_test)
            cm = evaluation.confusion_matrix(y_test, pred)
            acc += evaluation.accuracy(cm)

        accuracies.append(acc/50)

    return accuracies, sizes
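
A hypothetical call and learning-curve plot for the function above; `x_train`, `y_train`, `x_test`, `y_test` are placeholders, and args packs the hyperparameters in the order the function unpacks them (learning rate, tolerance, regularisation), with illustrative values:

import matplotlib.pyplot as plt

accs, sizes = task_3_logistic(x_train, y_train, x_test, y_test,
                              args=(0.01, 1e-4, 0.0))
plt.plot(sizes, accs, 'o-')
plt.xlabel('training set size')
plt.ylabel('mean accuracy over 50 subsamples')
plt.show()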
Code example #6
def evaluate_link(class_match_set, class_nonmatch_set, true_match_set,
                  all_comparisons):
    # Linkage evaluation
    linkage_result = evaluation.confusion_matrix(class_match_set,
                                                 class_nonmatch_set,
                                                 true_match_set,
                                                 all_comparisons)

    accuracy = evaluation.accuracy(linkage_result)
    precision = evaluation.precision(linkage_result)
    recall = evaluation.recall(linkage_result)
    fmeasure = evaluation.fmeasure(linkage_result)

    print('Linkage evaluation:')
    print('  Accuracy:    %.6f' % (accuracy))
    print('  Precision:   %.6f' % (precision))
    print('  Recall:      %.6f' % (recall))
    print('  F-measure:   %.6f' % (fmeasure))
    print('')
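
evaluation.confusion_matrix is not shown; under the usual record-linkage definitions, the four counts it would need can be derived from the classified sets and the total comparison count, as in this sketch (class_nonmatch_set is implied by the other arguments here):

def linkage_confusion(class_match_set, class_nonmatch_set, true_match_set,
                      all_comparisons):
    # Pairs classified as matches that are / are not true matches
    tp = len(class_match_set & true_match_set)
    fp = len(class_match_set) - tp
    # True matches the classifier missed
    fn = len(true_match_set) - tp
    # Everything else in the full comparison space
    tn = all_comparisons - tp - fp - fn
    return tp, fp, fn, tn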
Code example #7
def evaluate(dataset, args, verbose=True):
    """Evaluate the audio tagging predictions and record the results.

    Args:
        dataset (Dataset): Information about the dataset.
        args: Named tuple of configuration arguments.
        verbose (bool): Whether to print the results.

    Returns:
        pd.DataFrame: The results of the evaluation.
    """
    import evaluation

    # Load ground-truth data and predictions
    path = os.path.join(args.prediction_path, args.training_id,
                        f'{dataset.name}.csv')
    y_pred = pd.read_csv(path, index_col=0)
    df_true = pd.read_csv(dataset.metadata_path, index_col=0)
    y_true = pd.get_dummies(df_true.loc[y_pred.index].label).values
    y_pred = y_pred.values

    # Evaluate audio tagging performance
    scores = evaluation.evaluate(y_true, y_pred)
    C = evaluation.confusion_matrix(y_true, y_pred)

    # Ensure the output directory exists
    result_path = os.path.join(args.result_path, args.training_id)
    os.makedirs(result_path, exist_ok=True)

    # Write results to disk
    output_path = os.path.join(result_path, '%s_{}.csv' % dataset.name)
    scores.to_csv(output_path.format('scores'))
    C.to_csv(output_path.format('cmatrix'))

    # Print results (optional)
    if verbose:
        print('Confusion Matrix:\n', C.values, '\n')

        pd.options.display.float_format = '{:,.3f}'.format
        print(str(scores))

    return scores
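
A hypothetical invocation; `args` only needs the three attributes the function reads, the paths are placeholders, and `dataset` must expose .name and .metadata_path:

from collections import namedtuple

Args = namedtuple('Args', ['prediction_path', 'training_id', 'result_path'])
args = Args(prediction_path='predictions',  # placeholder paths
            training_id='run_01',
            result_path='results')
scores = evaluate(dataset, args, verbose=True)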
Code example #8
def task_3_naive(df, test_df, label, cont=[], cat=[], bin=[]):
    accuracies = []
    sizes = np.linspace(10, 200, num=20)
    N = df.shape[0]
    for size in sizes:
        acc = 0
        for i in range(25):
            print(size, i)
            rand = np.random.randint(int(N), size=int(size))
            m = NaiveBayes(df.loc[rand], label, continuous=cont, categorical=cat, binary=bin)
            pred = test_df.apply(m.predict, axis=1)

            cm = evaluation.confusion_matrix(test_df[label].to_numpy(), pred.to_numpy())
            acc += evaluation.accuracy(cm)

        accuracies.append(acc/25)

    return accuracies, sizes
Code example #9
def new_model(data1, data2):
    """
    Training and testing a tree on the noisy dataset without those observations above
    Results in a 97% accuracy
    """
    diff_obs = check_signals_only(data1, data2)
    df2 = pd.DataFrame(data2)
    # Appending diff_obs twice makes each of those rows a duplicate, so
    # drop_duplicates(keep=False) removes them from df2 as well
    clean_removed = pd.concat([df2, diff_obs,
                               diff_obs]).drop_duplicates(keep=False)
    clean_removed_dataset = clean_removed.to_numpy()

    np.random.shuffle(clean_removed_dataset)
    split = 0.7
    train = clean_removed_dataset[:int(len(clean_removed_dataset) * split)]
    test = clean_removed_dataset[int(len(clean_removed_dataset) * split):]

    model = trees.binarySearchTree(train)
    print('Max depth is', model.get_max_depth())
    y_pred = model.predict(test[:, :-1])
    cm = ev.confusion_matrix(test[:, -1], y_pred)
    i = ev.get_metrics(cm, printout=True)
    ev.plot_conf_matrix(cm)
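
The concat-and-drop idiom used above, shown in isolation: appending the rows to be removed twice turns each of them into a duplicate, so drop_duplicates(keep=False) deletes them from the original frame as well:

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]})
to_remove = pd.DataFrame({'a': [2]})
kept = pd.concat([df, to_remove, to_remove]).drop_duplicates(keep=False)
print(kept)  # only the rows with a == 1 and a == 3 remain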
Code example #10
    def train(self, epochs=10):
        if self.__xtrain and self.__ytrain and self.__xtest and self.__ytest:
            pass
        else:
            self.__load_dataset()

        # Open a writer to write summaries.
        self.__writer = tf.summary.FileWriter(self.__TMP_DIR,
                                              self.__session.graph)

        for epoch in range(epochs):
            #learning_rate = self.__session.run(self.__lr)
            #print('Learning rate', learning_rate)

            average_loss = 0
            num_steps = len(self.__flow)

            for step in tqdm.tqdm(range(num_steps),
                                  desc='Epoch ' +
                                  str(epoch + 1 + self.__GLOBAL_EPOCH) + '/' +
                                  str(epochs + self.__GLOBAL_EPOCH)):

                batch, label = self.__flow.next()

                run_metadata = tf.RunMetadata()
                _, l = self.__session.run([self.__train_op, self.__loss],
                                          feed_dict={
                                              self.__images: batch,
                                              self.__labels: label
                                          },
                                          run_metadata=run_metadata)

                average_loss += l

                # Print loss and accuracy on the test set at the end of each epoch
                if step == num_steps - 1:

                    y_true = []
                    y_pred = []

                    for i in range(len(self.__xtest)):
                        prediction = self.__session.run(
                            self.__labels_predicted,
                            feed_dict={self.__images: [self.__xtest[i]]},
                            run_metadata=run_metadata)

                        y_true.append(self.__ytest[i])
                        y_pred.append(prediction[0])

                    accuracy = ev.accuracy(y_true, y_pred)

                    print('Loss:', str(average_loss / step), '\tAccuracy:',
                          accuracy)

                    with open(self.__TMP_DIR + '/log.txt',
                              'a',
                              encoding='utf8') as f:
                        f.write(
                            str(accuracy) + ' ' + str(average_loss / step) +
                            '\n')

                if step == (num_steps - 1) and epoch + 1 == epochs:
                    s = self.__session.run(self.__global_step)
                    self.__writer.add_run_metadata(run_metadata,
                                                   'step%d' % step,
                                                   global_step=s)

        self.__saver.save(self.__session,
                          os.path.join(self.__TMP_DIR, 'model.ckpt'))
        dp.global_epoch(self.__TMP_DIR + '/epoch.txt',
                        update=self.__GLOBAL_EPOCH + epochs)

        self.__writer.close()

        pg.generate_accuracy_plot(data_dir=self.__TMP_DIR)
        pg.generate_loss_plot(data_dir=self.__TMP_DIR)

        conf_mat = ev.confusion_matrix(y_true, y_pred, len(self.__SENTIMENTS))
        pg.generate_confusion_matrix_plot(conf_mat,
                                          self.__SENTIMENTS,
                                          data_dir=self.__TMP_DIR)
        pg.generate_confusion_matrix_plot(conf_mat,
                                          self.__SENTIMENTS,
                                          normalize=True,
                                          data_dir=self.__TMP_DIR)
Code example #11
estimator.fit(X_train_normalized, y_train_row)

#print(estimator.cv_results_)
print(estimator.best_score_)
print(estimator.best_params_)

### Plot the best number of components ###
pca.fit(X_train_normalized)
plt.figure(1, figsize=(4, 3))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance')
plt.axvline(estimator.best_estimator_.named_steps['pca'].n_components,
            linestyle=':',
            label='n_components chosen')
plt.legend(prop=dict(size=12))
plt.show()

### Evaluation ###
prediction = estimator.predict(X_train_normalized)
evaluation.visualize_errors_by_genre(y_train_row, prediction, y_label_names)
evaluation.confusion_matrix(y_train_row, prediction, y_label_names)

### Visualize grid search ###
results = estimator.cv_results_
evaluation.visualize_gridsearch(results, 'logistic__C', 'C')
#evaluation.visualize_gridsearch(results,'pca__n_components', 'n_components')
Code example #12
File: recordLinkage.py Project: Alvin2580du/alvin_py
# Blocking evaluation
#
rr = evaluation.reduction_ratio(num_comparisons, all_comparisons)
pc = evaluation.pairs_completeness(cand_rec_id_pair_list, true_match_set)
pq = evaluation.pairs_quality(cand_rec_id_pair_list, true_match_set)

print('Blocking evaluation:')
print('  Reduction ratio:    %.3f' % (rr))
print('  Pairs completeness: %.3f' % (pc))
print('  Pairs quality:      %.3f' % (pq))
print('')

# Linkage evaluation
#
linkage_result = evaluation.confusion_matrix(class_match_set,
                                             class_nonmatch_set,
                                             true_match_set, all_comparisons)

accuracy = evaluation.accuracy(linkage_result)
precision = evaluation.precision(linkage_result)
recall = evaluation.recall(linkage_result)
fmeasure = evaluation.fmeasure(linkage_result)

print('Linkage evaluation:')
print('  Accuracy:    %.3f' % (accuracy))
print('  Precision:   %.3f' % (precision))
print('  Recall:      %.3f' % (recall))
print('  F-measure:   %.3f' % (fmeasure))
print('')

linkage_time = loading_time + blocking_time + comparison_time + \
               classification_time
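
The three blocking metrics have simple set-based definitions; a sketch consistent with the calls above (not the project's actual implementation):

def reduction_ratio(num_comparisons, all_comparisons):
    # Fraction of the full comparison space removed by blocking
    return 1.0 - num_comparisons / all_comparisons

def pairs_completeness(cand_rec_id_pair_list, true_match_set):
    # Share of true matches that survive blocking
    return len(set(cand_rec_id_pair_list) & true_match_set) / len(true_match_set)

def pairs_quality(cand_rec_id_pair_list, true_match_set):
    # Share of candidate pairs that are true matches
    cand = set(cand_rec_id_pair_list)
    return len(cand & true_match_set) / len(cand)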
Code example #13
def main(blocking_fn,
         classification_fn,
         threshold,
         minthresh,
         weightvec,
         blocking_attrs,
         func_list,
         save=False):

    # ******** In lab 3, explore different attribute sets for blocking ************

    # The list of attributes to use for blocking (all must occur in the above
    # attribute lists)
    blocking_attrA_list = blocking_attrs
    blocking_attrB_list = blocking_attrs

    # ******** In lab 4, explore different comparison functions for different  ****
    # ********           attributes                                            ****

    # The list of tuples (comparison function, attribute number in record A,
    # attribute number in record B)
    #
    exact_comp_funct_list = [
        (comparison.exact_comp, 1, 1),  # First name
        (comparison.exact_comp, 2, 2),  # Middle name
        (comparison.exact_comp, 3, 3),  # Last name
        (comparison.exact_comp, 8, 8),  # Suburb
        (comparison.exact_comp, 10, 10),  # State
    ]

    approx_comp_funct_list = [
        (func_list[0], 1, 1),  # First name
        (func_list[1], 2, 2),  # Middle name
        (func_list[2], 3, 3),  # Last name
        (func_list[3], 7, 7),  # Address
        (func_list[4], 8, 8),  # Suburb
        (func_list[5], 10, 10),  # State
    ]

    # =============================================================================
    #
    # Step 1: Load the two datasets from CSV files

    start_time = time.time()

    recA_dict = loadDataset.load_data_set(datasetA_name, rec_idA_col, \
                                          attrA_list, headerA_line)
    recB_dict = loadDataset.load_data_set(datasetB_name, rec_idB_col, \
                                          attrB_list, headerB_line)

    # Load data set of true matching pairs
    #
    true_match_set = loadDataset.load_truth_data(truthfile_name)

    loading_time = time.time() - start_time

    # -----------------------------------------------------------------------------
    # Step 2: Block the datasets

    def genericBlock(block_function='none',
                     recA_dict=recA_dict,
                     recB_dict=recB_dict,
                     blocking_attrA_list=blocking_attrA_list,
                     blocking_attrB_list=blocking_attrB_list):

        start_time = time.time()

        # Select one blocking technique
        if block_function == 'none':
            # No blocking (all records in one block)
            #
            resultA = blocking.noBlocking(recA_dict)
            resultB = blocking.noBlocking(recB_dict)

        if block_function == 'attr':
            # Simple attribute-based blocking
            #
            resultA = blocking.simpleBlocking(recA_dict, blocking_attrA_list)
            resultB = blocking.simpleBlocking(recB_dict, blocking_attrB_list)

        if block_function == 'soundex':
            # Phonetic (Soundex) based blocking
            #
            resultA = blocking.phoneticBlocking(recA_dict, blocking_attrA_list)
            resultB = blocking.phoneticBlocking(recB_dict, blocking_attrB_list)

        if block_function == 'slk':
            # Statistical linkage key (SLK-581) based blocking
            #
            fam_name_attr_ind = 3
            giv_name_attr_ind = 1
            dob_attr_ind = 6
            gender_attr_ind = 4

            resultA = blocking.slkBlocking(recA_dict, fam_name_attr_ind, \
                                              giv_name_attr_ind, dob_attr_ind, \
                                              gender_attr_ind)
            resultB = blocking.slkBlocking(recB_dict, fam_name_attr_ind, \
                                              giv_name_attr_ind, dob_attr_ind, \
                                              gender_attr_ind)

        block_time = time.time() - start_time

        # Print blocking statistics
        #
        # blocking.printBlockStatistics(resultA, resultB)

        return resultA, resultB, block_time

    blockA_dict, blockB_dict, blocking_time = genericBlock(
        block_function=blocking_fn)
    # -----------------------------------------------------------------------------
    # Step 3: Compare the candidate pairs

    start_time = time.time()

    sim_vec_dict = comparison.compareBlocks(blockA_dict, blockB_dict, \
                                            recA_dict, recB_dict, \
                                            approx_comp_funct_list)

    comparison_time = time.time() - start_time

    # -----------------------------------------------------------------------------
    # Step 4: Classify the candidate pairs

    def genericClassification(classification_function='exact',
                              sim_vec_dict=sim_vec_dict,
                              sim_threshold=threshold,
                              min_sim_threshold=minthresh,
                              weight_vec=weightvec,
                              true_match_set=true_match_set):
        start_time = time.time()

        if classification_function == 'exact':
            # Exact matching based classification
            class_match_set1, class_nonmatch_set1 = \
                         classification.exactClassify(sim_vec_dict)

        if classification_function == 'simthresh':
            # Similarity threshold based classification
            #
            class_match_set1, class_nonmatch_set1 = \
                        classification.thresholdClassify(sim_vec_dict, sim_threshold)

        if classification_function == 'minsim':
            # Minimum similarity threshold based classification
            #
            class_match_set1, class_nonmatch_set1 = \
                        classification.minThresholdClassify(sim_vec_dict,
                                                            min_sim_threshold)

        if classification_function == 'weightsim':
            # Weighted similarity threshold based classification
            #
            # weight_vec = [1.0] * len(approx_comp_funct_list)

            # Lower weights for middle name and state
            #
            # weight_vec = [2.0, 1.0, 2.0, 2.0, 2.0, 1.0]

            class_match_set1, class_nonmatch_set1 = \
                        classification.weightedSimilarityClassify(sim_vec_dict,
                                                                  weight_vec,
                                                                  sim_threshold)

        if classification_function == 'dt':
            # A supervised decision tree classifier
            #
            class_match_set1, class_nonmatch_set1 = \
                      classification.supervisedMLClassify(sim_vec_dict, true_match_set)

        class_time = time.time() - start_time

        return class_match_set1, class_nonmatch_set1, class_time

    threshold = minthresh

    class_match_set, class_nonmatch_set, classification_time = genericClassification(
        classification_fn)

    # -----------------------------------------------------------------------------
    # Step 5: Evaluate the classification

    # Initialise dictionary of results
    results_dict = {}

    # Get the number of record pairs compared
    #
    num_comparisons = len(sim_vec_dict)

    # Get the total number of record pairs that would be compared with no blocking
    #
    all_comparisons = len(recA_dict) * len(recB_dict)

    # Get the list of identifiers of the compared record pairs
    #
    cand_rec_id_pair_list = sim_vec_dict.keys()

    # Blocking evaluation
    #
    rr = evaluation.reduction_ratio(num_comparisons, all_comparisons)
    pc = evaluation.pairs_completeness(cand_rec_id_pair_list, true_match_set)
    pq = evaluation.pairs_quality(cand_rec_id_pair_list, true_match_set)

    # Linkage evaluation
    #
    linkage_result = evaluation.confusion_matrix(class_match_set,
                                                 class_nonmatch_set,
                                                 true_match_set,
                                                 all_comparisons)

    accuracy = evaluation.accuracy(linkage_result)
    precision = evaluation.precision(linkage_result)
    recall = evaluation.recall(linkage_result)
    fmeasure = evaluation.fmeasure(linkage_result)

    # print('Linkage evaluation:')
    # print('  Accuracy:    %.3f' % (accuracy))
    # print('  Precision:   %.3f' % (precision))
    # print('  Recall:      %.3f' % (recall))
    # print('  F-measure:   %.3f' % (fmeasure))
    # print('')

    linkage_time = loading_time + blocking_time + comparison_time + \
                   classification_time
    # print('Total runtime required for linkage: %.3f sec' % (linkage_time))

    # Export blocking metrics
    results_dict['blocking_fn'] = blocking_fn
    results_dict['classification_fn'] = classification_fn
    results_dict['threshold'] = threshold
    results_dict['min_thresh'] = minthresh
    results_dict['weight_vec'] = weightvec
    results_dict['blocking_attrs'] = blocking_attrs
    results_dict['comp_funcs'] = func_list
    results_dict['num_comparisons'] = num_comparisons
    results_dict['all_comparisons'] = all_comparisons
    # results_dict['cand_rec_id_pair_list'] = cand_rec_id_pair_list
    results_dict['rr'] = rr
    results_dict['pc'] = pc
    results_dict['pq'] = pq
    results_dict['blocking_time'] = blocking_time
    # results_dict['linkage_result'] = linkage_result
    results_dict['accuracy'] = accuracy
    results_dict['precision'] = precision
    results_dict['recall'] = recall
    results_dict['fmeasure'] = fmeasure
    results_dict['linkage_time'] = linkage_time

    # Save results
    if save:
        saveLinkResult.save_linkage_set('final_results.txt', class_match_set)

    # Return results
    return results_dict
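
A hypothetical call of the pipeline above, assuming the module-level dataset configuration (datasetA_name, attribute lists, truthfile_name) the function relies on; every argument value is illustrative, and only comparison.exact_comp, which appears earlier in this example, is reused for the comparison functions:

result = main(blocking_fn='attr',
              classification_fn='simthresh',
              threshold=0.8,
              minthresh=0.6,
              weightvec=[2.0, 1.0, 2.0, 2.0, 2.0, 1.0],
              blocking_attrs=[1, 3],
              func_list=[comparison.exact_comp] * 6,
              save=False)
print('F-measure: %.3f' % result['fmeasure'])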
Code example #14
            )
            if limit == '':
                print('No limit entered')
                limit = None
            else:
                limit = int(limit)

            np.random.shuffle(data)
            train = data[:int(len(data) * split)]
            test = data[int(len(data) * split):]

            model = binarySearchTree(train, limit=limit)
            print('Max depth of tree is', model.get_max_depth())

            y_pred = model.predict(test[:, :-1])
            cm = ev.confusion_matrix(test[:, -1], y_pred)
            i = ev.get_metrics(cm, printout=True)
            print('To continue, you may need to close the plot windows first')
            ev.plot_conf_matrix(cm)
            print('Visualising the pruned trees')
            model.visualise_tree()

            input('\nTo restart, hit enter\n')

        if model == '2':
            # Re-prompt until a valid split value is entered
            while True:
                split = float(input('Enter training data split value, eg 0.7\n'))
                if split < 0 or split > 1:
                    print('Invalid split entered')
                else:
                    break
Code example #15
# 1. Compare accuracy of naive bayes and logistic regression

# Get cross validation accuracy for 5-fold cv
print("Ionosphere validation accuracy (default parameters):")
evaluation.cross_validation(5, ionosphere_train_features, ionosphere_train_labels, model=LogisticRegression)

# Grid search for optimal hyperparameters
print("Ionosphere grid search hyperparameters:")
ionosphere_max_val_acc, ionosphere_arg_max = evaluation.grid_search(learning_rates=lrs, epsilons=eps, lambdas=lamdas, x=ionosphere_train_features, y=ionosphere_train_labels, model=LogisticRegression)

# Accuracy on test split - train with best hyperparameters
print("Ionosphere test accuracy:")
logistic_ionosphere = LogisticRegression(ionosphere_train_features, ionosphere_train_labels)
logistic_ionosphere.fit(lr=ionosphere_arg_max[0], eps=ionosphere_arg_max[1], regularization=ionosphere_arg_max[2])
ionosphere_prediction = logistic_ionosphere.predict(ionosphere_test_features)
cm_ionosphere = evaluation.confusion_matrix(ionosphere_test_labels, ionosphere_prediction)
print("Accuracy:", evaluation.accuracy(cm_ionosphere), "Precision:", evaluation.precision(cm_ionosphere), "Recall:", evaluation.true_positive(cm_ionosphere), "F1:", evaluation.f_score(cm_ionosphere))

# 5-fold CV for naive bayes
print("Ionosphere validation accuracy (naive bayes):")
evaluation.cross_validation_naive(5, ionosphere_dataset.train_data, NaiveBayes, ionosphere_dataset.label_column, ionosphere_dataset.feature_columns)

naive_ionosphere = NaiveBayes(ionosphere_dataset.train_data, ionosphere_dataset.label_column, continuous=ionosphere_dataset.feature_columns)

print("Ionosphere test accuracy (naive bayes):")

ionosphere_pred_naive = ionosphere_dataset.test_data.apply(naive_ionosphere.predict, axis=1)
cm_ionosphere_naive = evaluation.confusion_matrix(ionosphere_test_labels, ionosphere_pred_naive.to_numpy())
print("Accuracy:", evaluation.accuracy(cm_ionosphere_naive), "Precision:", evaluation.precision(cm_ionosphere_naive), "Recall:", evaluation.true_positive(cm_ionosphere_naive), "F1:", evaluation.f_score(cm_ionosphere_naive))

Code example #16
    def predict(self, info=True):
        if self.runmode == 'digits':
            test_label_path = DATA_DIR + '/testlabels'
            test_images_path = DATA_DIR + '/testimages'
        elif self.runmode == 'faces':
            test_label_path = DATA_DIR + '/facedatatestlabels'
            test_images_path = DATA_DIR + '/facedatatest'

        correct_labels = []

        with open(test_label_path) as f:
            for line in f:
                correct_labels.append(int(line))

        num_images = len(correct_labels)
        # Using python list instead of np since np chararrays replace spaces with empty string
        test_images = [[None for _ in range(self.row)] for _ in range(num_images)]

        with open(test_images_path) as f:
            for n in range(num_images):
                for y in range(self.row):
                    test_images[n][y] = list(f.readline().rstrip('\n'))

        predicted_labels = []
        for n in range(num_images):
            map_classifier = np.zeros(self.num_classes)
            for num in range(self.num_classes):
                map_classifier[num] = math.log(self.num_counts[num] / np.sum(self.num_counts))
                for y in range(self.row):
                    for x in range(self.col):
                        if test_images[n][y][x] == ' ':
                            map_classifier[num] += math.log(self.model[num][y][x][0])
                        if self.num_features == 3:
                            if test_images[n][y][x] == '+':
                                map_classifier[num] += math.log(self.model[num][y][x][1])
                            elif test_images[n][y][x] == '#':
                                map_classifier[num] += math.log(self.model[num][y][x][2])
                        else:
                            if test_images[n][y][x] in ['+', '#']:
                                map_classifier[num] += math.log(self.model[num][y][x][1])
            predicted_label = np.argmax(map_classifier)
            predicted_labels.append((predicted_label, map_classifier[predicted_label], n))

        truths = np.array(correct_labels)
        predictions = np.array([x[0] for x in predicted_labels])
        accuracy = calc_accuracy(truths, predictions)
        logger.info('NB model is {0:.2f}% accurate on the {1} data with k = {2}.'.format(accuracy, self.runmode, self.k))

        if info:
            cm = confusion_matrix(truths, predictions, self.num_classes)
            class_accuracies = [cm[n][n] for n in range(self.num_classes)]
            # Class accuracies
            for n, x in enumerate(class_accuracies):
                logger.info('Class {0} has an accuracy of {1:.2f}%'.format(n, 100 * x))

            # Confusion matrix
            plt.figure()
            plt.imshow(cm, cmap=plt.get_cmap('Greens'), interpolation='nearest')
            plt.title('Confusion Matrix')
            plt.xticks(np.arange(self.num_classes))
            plt.yticks(np.arange(self.num_classes))
            plt.xlabel('Predictions')
            plt.ylabel('Truths')

            # Test images with the highest and lowest posterior probability
            # Sorts from lowest to highest by class, then by posterior probability
            sorted_predictions = sorted(predicted_labels)
            class_indices = []
            for x in range(len(sorted_predictions)):
                if sorted_predictions[x][0] != sorted_predictions[x-1][0]:
                    class_indices.append(x)

            for x in range(len(class_indices)):
                curr_class = sorted_predictions[class_indices[x]][0]
                lowest_idx = sorted_predictions[class_indices[x]][2]
                try:
                    highest_idx = sorted_predictions[class_indices[x+1]-1][2]
                except IndexError:
                    highest_idx = sorted_predictions[len(sorted_predictions)-1][2]
                best_test_image = [[0 if x in ['#', '+'] else 1 for x in y] for y in test_images[highest_idx]]
                worst_test_image = [[0 if x in ['#', '+'] else 1 for x in y] for y in test_images[lowest_idx]]
                plt.figure()
                plt.suptitle('Class {0}'.format(curr_class))
                plt.subplot(1, 2, 1)
                plt.imshow(best_test_image, cmap=plt.get_cmap('Greys_r'))
                plt.title('Highest')
                plt.xticks([])
                plt.yticks([])
                plt.subplot(1, 2, 2)
                plt.title('Lowest')
                plt.xticks([])
                plt.yticks([])
                plt.imshow(worst_test_image, cmap=plt.get_cmap('Greys_r'))

            # Odds ratio for the four worst classes
            cm_ravel = np.ravel(cm)
            least_accurate_pairs = cm_ravel.argsort()[:4]
            least_accurate_pairs = [(x % self.num_classes, math.floor(x / self.num_classes)) for x in least_accurate_pairs]

            if self.num_features == 2 and self.runmode == 'digits':
                for i, j in least_accurate_pairs:
                    log_likelihood_one = np.zeros((self.col, self.row))
                    log_likelihood_two = np.zeros((self.col, self.row))
                    odds_ratio = np.zeros((self.col, self.row))
                    for y in range(self.row):
                        for x in range(self.col):
                            log_likelihood_one[y][x] = math.log(self.model[i][y][x][1])
                            log_likelihood_two[y][x] = math.log(self.model[j][y][x][1])
                            odds_ratio[y][x] = math.log(self.model[i][y][x][1] / self.model[j][y][x][1])

                    plt.figure()
                    plt.subplot(1, 3, 1)
                    plt.imshow(log_likelihood_one, interpolation='nearest')
                    plt.title('Likelihood of {0}'.format(i))
                    plt.xticks([])
                    plt.yticks([])
                    cbar = plt.colorbar(shrink=0.35)
                    cbar.set_ticks(np.arange(np.amin(log_likelihood_one), np.amax(log_likelihood_one), step=2, dtype=np.int8))
                    for t in cbar.ax.get_yticklabels():
                        t.set_horizontalalignment('right')
                        t.set_x(4)
                    plt.subplot(1, 3, 2)
                    plt.imshow(log_likelihood_two, interpolation='nearest')
                    plt.title('Likelihood of {0}'.format(j))
                    plt.xticks([])
                    plt.yticks([])
                    cbar = plt.colorbar(shrink=0.35)
                    cbar.set_ticks(np.arange(np.amin(log_likelihood_two), np.amax(log_likelihood_two), step=2, dtype=np.int8))
                    for t in cbar.ax.get_yticklabels():
                        t.set_horizontalalignment('right')
                        t.set_x(4)
                    plt.subplot(1, 3, 3)
                    plt.imshow(odds_ratio, interpolation='nearest')
                    plt.title('Odds ratio')
                    plt.xticks([])
                    plt.yticks([])
                    cbar = plt.colorbar(shrink=0.35)
                    cbar.set_ticks(np.arange(np.amin(odds_ratio), np.amax(odds_ratio), step=2, dtype=np.int8))
                    for t in cbar.ax.get_yticklabels():
                        t.set_horizontalalignment('right')
                        t.set_x(4)

            plt.show()
Code example #17
    def predict(self, info=True):
        test_label_path = DATA_DIR + '/testlabels'
        test_images_path = DATA_DIR + '/testimages'

        correct_labels = []

        with open(test_label_path) as f:
            for line in f:
                correct_labels.append(int(line))

        num_images = len(correct_labels)

        test_images = [[None for _ in range(self.row)] for _ in range(num_images)]
        with open(test_images_path) as f:
            for n in range(num_images):
                for y in range(self.row):
                    test_images[n][y] = list(f.readline().rstrip('\n'))

        predicted_labels = []

        for n in range(num_images):
            model = np.zeros((self.row, self.col))
            for y in range(self.row):
                for x in range(self.col):
                    if test_images[n][y][x] in ['+', '#']:
                        model[y][x] = 1
            decision = self.train_decision(model)
            predicted_labels.append(decision)

        truths = np.array(correct_labels)
        predictions = np.array(predicted_labels)
        accuracy = calc_accuracy(truths, predictions)
        logger.info('NB model is {0:.2f}% accurate on the digit data'.format(accuracy))

        X = np.array(range(self.col))
        Y = np.fliplr(np.atleast_2d(np.array(range(self.row))))[0]
        if info:
            confm = confusion_matrix(truths, predictions, self.num_classes)
            class_accuracies = [confm[n][n] for n in range(self.num_classes)]
            # Class accuracies
            for n, x in enumerate(class_accuracies):
                logger.info('Class {0} has an accuracy of {1:.2f}%'.format(n, 100 * x))

            # Confusion matrix
            plt.figure()
            plt.imshow(confm, cmap=plt.get_cmap('Greens'), interpolation='nearest')
            plt.title('Confusion Matrix')
            plt.xticks(np.arange(self.num_classes))
            plt.yticks(np.arange(self.num_classes))
            plt.xlabel('Predictions')
            plt.ylabel('Truths')

            X, Y = np.meshgrid(range(self.col), range(self.row))
            Y = Y[::-1]
            for i in range(self.num_classes):
                hf = plt.figure()
                ha = hf.add_subplot(projection='3d')

                ha.plot_surface(X, Y, self.feature_weight_vectors[i], rstride=1, cstride=1,
                                linewidth=0, cmap=cm.coolwarm, antialiased=False)
                ha.set_xlabel('X')
                ha.set_ylabel('Y')
                ha.set_zlabel('weight')
            plt.show()
if __name__=="__main__":
    args = parse_args()

    print "Reading data..."
    titles, bodies, tags_sets, _ = da.read_data(args.data, args.maxRows)
    tags = [list(t)[0] for t in tags_sets]

    X_train, X_test, y_train, y_test = evaluation.cross_validation(list(zip(titles, bodies)), tags)
    X_train_t, X_train_b = zip(*X_train)

    print "Generating features..."
    if args.feat == "bow":
        X, extractor = fe.bag_of_words(X_train_t, X_train_b)
    elif args.feat == "tfidf":
        X, extractor = fe.tfidf(X_train_t, X_train_b)
    elif args.feat == "bigram":
        X, extractor = fe.ngrams(X_train_t, X_train_b, n_upper=2)
    else:
        X, extractor = fe.ngrams(X_train_t, X_train_b, n_upper=3)

    print "Train..."
    if args.classifier == "naive":
        classifier = MultinomialNB()

    classifier.fit(X, y_train)

    print "Test..."
    predictions = [classifier.predict(extractor.transform(t, b))[0] for t,b in X_test]

    evaluation.confusion_matrix(y_test, predictions)
Code example #19
File: utils.py Project: alexdesiqueira/fcn_microct
def measure_all_coefficients(data_test,
                             data_gt,
                             calc_coef={
                                 'matthews': True,
                                 'dice': True
                             },
                             save_coef=True,
                             filename='coefficients.csv'):
    """Measures all comparison coefficients between two input data.

    Parameters
    ----------
    data_test : (M, N, P) array
        The input test data.
    data_gt : (M, N, P) array
        The gold standard data.
    calc_coef : dict, optional (default : {'matthews': True, 'dice': True})
        Determines what coefficients to calculate.
    save_coef : boolean, optional (default : True)
        If True, saves the coefficients to a file in the disk.
    filename : str, optional (default : 'coefficients.csv')
        If save_coef is True, where to save the calculated coefficients.

    Returns
    -------
    all_matthews : array
        Array containing all Matthews coefficient values for the input data.
    all_dice : array
        Array containing all Dice coefficient values for the input data.

    Example
    -------
    >>> from skimage import io
    >>> data_bin = io.ImageCollection(load_pattern='res_figures/binary/Test_TIRR_0_1p5_B0p2_*_bin.png',
    ...                               plugin=None)[1000:2000]
    >>> data_gt = io.ImageCollection(load_pattern='gt_figures/19_Gray_*.tif',
    ...                              plugin=None)
    >>> all_matthews, all_dice = measure_all_coefficients(data_bin,
    ...                                                   data_gt,
    ...                                                   save_coef=True)
    """
    _assert_same_length(data_test, data_gt)

    all_matthews, all_dice = ['matthews'], ['dice']

    for idx, img_test in enumerate(data_test):
        aux_gt = process_goldstd_images(data_gt[idx])

        _assert_compatible(img_test, aux_gt)

        aux_confusion = evaluation.confusion_matrix(aux_gt, img_test)
        if calc_coef['matthews']:
            all_matthews.append(evaluation.measure_matthews(aux_confusion))
        if calc_coef['dice']:
            all_dice.append(evaluation.measure_dice(aux_confusion))

    if save_coef:
        if calc_coef['matthews']:
            with open(filename, 'a+') as file_coef:
                coef_writer = csv.writer(file_coef, delimiter=',')
                coef_writer.writerow(all_matthews)
        if calc_coef['dice']:
            with open(filename, 'a+') as file_coef:
                coef_writer = csv.writer(file_coef, delimiter=',')
                coef_writer.writerow(all_dice)

    return all_matthews, all_dice
Code example #20
    X_train, X_test, y_train, y_test = evaluation.cross_validation(list(zip(titles, bodies)), tags)
    X_train_t, X_train_b = zip(*X_train)

    print "Generating features..."
    if args.feat == "bow":
        X, extractor = fe.bag_of_words(X_train_t, X_train_b)
    elif args.feat == "tfidf":
        X, extractor = fe.tfidf(X_train_t, X_train_b)
    elif args.feat == "bigram":
        X, extractor = fe.ngrams(X_train_t, X_train_b, n_upper=2)
    else:
        X, extractor = fe.ngrams(X_train_t, X_train_b, n_upper=3)

    print "Train..."
    if args.classifier == "knn":
        classifier = KNeighborsClassifier(n_neighbors=3)
    elif args.classifier == "log-reg":
        classifier = LogisticRegression(C=1e5)
    elif args.classifier == "dec-tree":
        classifier = DecisionTreeClassifier()
    else:
        classifier = svm.SVC()

    classifier.fit(X, y_train)

    print "Test..."
    predictions = [classifier.predict(extractor.transform(t, b))[0] for t,b in X_test]

    evaluation.confusion_matrix(y_test, predictions)
Code example #21
def main():
    model = "pca_svc"

    ## Read data sets
    header = pd.read_csv('Datasets/header.txt', delimiter=",", header=None)
    X_train = pd.read_csv('Datasets/train_data.csv',
                          delimiter=",",
                          header=None)
    X_train.columns = list(header.values)
    X_test = pd.read_csv('Datasets/test_data.csv', delimiter=",", header=None)
    X_test.columns = list(header.values)
    y_train = pd.read_csv('Datasets/train_labels.csv',
                          delimiter=",",
                          header=None)
    y_label_names = [
        'Pop_Rock', 'Electronic', 'Rap', 'Jazz', 'Latin', 'RnB',
        'International', 'Country', 'Reggae', 'Blues'
    ]

    labels = y_train.values
    c, r = labels.shape
    labels = labels.reshape(c, )

    ## Feature normalization
    X_train = preprocessing.scale(X_train)
    X_test = preprocessing.scale(X_test)

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_score
    clf = RandomForestClassifier()
    scores = cross_val_score(clf, X_train, labels)
    print("Random forest baseline CV accuracy:", scores.mean())

    clf = None

    if (model == "pca_svc"):
        ## Defining pipeline with PCA and SVC rbf hyperparameter tuning

        svm = SVC(kernel="rbf")
        pca = decomposition.PCA()
        pipe = Pipeline(steps=[('pca', pca), ('svc', svm)])

        n_components = [30]
        C_range = np.logspace(2, 6, 5)
        gamma_range = np.logspace(-9, -1, 5)

        svc_pipe = GridSearchCV(pipe,
                                dict(pca__n_components=n_components,
                                     svc__C=C_range,
                                     svc__gamma=gamma_range),
                                verbose=10,
                                n_jobs=-1)
        svc_pipe.fit(X_train, labels)
        print(svc_pipe.best_score_)
        print(svc_pipe.best_params_)

        clf = svc_pipe.best_estimator_

    if (model == "rfe_log"):
        log = linear_model.LogisticRegressionCV(multi_class='multinomial',
                                                class_weight=None,
                                                cv=3)
        rfecv = RFECV(estimator=log,
                      step=5,
                      cv=2,
                      scoring='accuracy',
                      verbose=100,
                      n_jobs=-1)
        rfecv.fit(X_train, labels)

        print("Optimal number of features : %d" % rfecv.n_features_)

        X_train = rfecv.transform(X_train)

        # Plot number of features VS. cross-validation scores
        plt.figure()
        plt.xlabel("Number of features selected")
        plt.ylabel("Cross validation score (nb of correct classifications)")
        plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
        plt.show()

        log_reg = linear_model.LogisticRegressionCV(multi_class='multinomial',
                                                    class_weight=None,
                                                    cv=3)
        log_reg.fit(X_train, labels)

        X_test = rfecv.transform(X_test)
        clf = log_reg

    ### Evaluation ###

    accuracy = clf.score(X_train, labels)
    print("Accuracy: {0}", format(accuracy))

    prediction_training = clf.predict(X_train)
    precision = evaluation.precision(prediction_training, labels)
    recall = evaluation.recall(prediction_training, labels)
    print("Precision: {0}", format(precision))
    print("Recall: {0}", format(recall))

    evaluation.confusion_matrix(labels, prediction_training, y_label_names)

    ### Output ###

    prediction_test = clf.predict(X_test)
    output.accuracy(prediction_test)
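
For cross-checking, the same training-set evaluation can be reproduced with scikit-learn's metrics; this is a swap-in for the project's evaluation module, not its API, and assumes `labels` and `prediction_training` as computed inside main above:

from sklearn.metrics import confusion_matrix, precision_score, recall_score

print("Precision (macro):", precision_score(labels, prediction_training, average='macro'))
print("Recall (macro):", recall_score(labels, prediction_training, average='macro'))
print(confusion_matrix(labels, prediction_training))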