import numpy
# Deprecated scikit-learn APIs (pre-0.20): cross_validation and lda were
# removed in later releases.
from sklearn import cross_validation, lda

import score_calculation  # project-local helper providing loglossKFold


def validate_feature_linear(features, labels, classes, n_folds=5,
                            print_folds=True, print_absolute=True,
                            print_logloss=True):
    kfold = cross_validation.LabelKFold(labels, n_folds=n_folds)
    model = lda.LDA()
    if print_absolute:
        # Only compute the absolute scores when they will be printed.
        score = cross_validation.cross_val_score(model, features, classes,
                                                 cv=kfold)
        print("absolute scores")
        if print_folds:
            print("\tfolds:", score)
        print("\tmean:", score.mean(), "std:", numpy.std(score))
    scores = score_calculation.loglossKFold(features, classes, model, kfold,
                                            given_kfold=True)
    if print_logloss:
        print("logloss scores")
        if print_folds:
            print("\tfolds:", scores)
        print("\tmean:", numpy.mean(scores), "std:", numpy.std(scores))
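# On scikit-learn >= 0.20 the APIs above no longer exist: LabelKFold became
# model_selection.GroupKFold and lda.LDA became LinearDiscriminantAnalysis.
# A minimal sketch of the same validation on the modern API follows;
# `features`, `classes`, and `labels` stand in for the caller's arrays.
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GroupKFold, cross_val_score


def validate_feature_linear_modern(features, classes, labels, n_folds=5):
    model = LinearDiscriminantAnalysis()
    kfold = GroupKFold(n_splits=n_folds)
    # GroupKFold takes the grouping labels at split time, not at construction.
    score = cross_val_score(model, features, classes,
                            cv=kfold.split(features, classes, groups=labels))
    print("absolute scores")
    print("\tfolds:", score)
    print("\tmean:", score.mean(), "std:", np.std(score))
    # scoring="neg_log_loss" returns the negated log loss per fold.
    neg_ll = cross_val_score(model, features, classes, scoring="neg_log_loss",
                             cv=kfold.split(features, classes, groups=labels))
    print("logloss scores")
    print("\tmean:", -neg_ll.mean(), "std:", np.std(neg_ll))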
def split_and_write(count_df, experiment_proto, save_dir):
  """Splits into folds and writes each test fold to an SSTable.

  Only the test folds are written out, so the train table for any fold is
  the union of all the other SSTables. This avoids re-writing the data
  multiple times.

  Args:
    count_df: dataframe where the index is the sequence and the rest of the
      columns are counts in each round of selection.
    experiment_proto: selection_pb2.Experiment proto describing the
      experimental configuration and results.
    save_dir: string base name for the directory in which to save output.
  """
  experiment_proto = copy.deepcopy(experiment_proto)
  # These FASTQ files describe all the experiment data, not the split train /
  # test data.
  _remove_fastq_paths(experiment_proto)

  label_kfold = cross_validation.LabelKFold(count_df.cluster, n_folds=5)
  for i, (train, test) in enumerate(label_kfold):
    logging.info("Fold %d has %d train and %d test", i, len(train), len(test))
    test_counts = count_df.iloc[test]
    train_counts = count_df.iloc[train]

    for split_name, subcounts in [("test", test_counts),
                                  ("train", train_counts)]:
      update_experiment_read_counts(experiment_proto, subcounts)
      path = os.path.join(save_dir,
                          "experiment_fold_%d_%s.pbtxt" % (i, split_name))
      with gfile.GFile(path, "w") as f:
        f.write(text_format.MessageToString(experiment_proto))

    # HDF5 can be quickly read and written from Python.
    logging.info("Saving count table as HDF5.")
    path = os.path.join(save_dir, "table_fold_%d.h5" % i)
    io_utils.write_dataframe_to_hdf5(test_counts, path)

    # We use the SSTable of examples for TensorFlow.
    logging.info("Saving SSTable of TensorFlow example protos.")
    path = os.path.join(save_dir, "examples_fold_%d.sstable" % i)
    write_sstable(test_counts, path, experiment_proto.forward_primer,
                  experiment_proto.reverse_primer)
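# Per the docstring, the train table for fold k is the union of every other
# fold's test table. A hedged sketch of reassembling it from the saved HDF5
# files, assuming io_utils.write_dataframe_to_hdf5 produced single-key,
# pandas-readable files (the key layout is an assumption here):
import os

import pandas as pd


def load_train_counts(save_dir, fold, n_folds=5):
    # Concatenate every test fold except the held-out one.
    parts = [pd.read_hdf(os.path.join(save_dir, "table_fold_%d.h5" % i))
             for i in range(n_folds) if i != fold]
    return pd.concat(parts)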
def test_label_kfold():
    rng = np.random.RandomState(0)

    # Parameters of the test
    n_labels = 15
    n_samples = 1000
    n_folds = 5

    # Construct the test data
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    labels = rng.randint(0, n_labels, n_samples)
    folds = cval.LabelKFold(labels, n_folds=n_folds).idxs
    ideal_n_labels_per_fold = n_samples // n_folds

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(labels))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_labels_per_fold))

    # Check that each label appears only in 1 fold
    for label in np.unique(labels):
        assert_equal(len(np.unique(folds[labels == label])), 1)

    # Check that no label is on both sides of the split
    labels = np.asarray(labels, dtype=object)
    for train, test in cval.LabelKFold(labels, n_folds=n_folds):
        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)

    # Construct the test data
    labels = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', 'Francis',
              'Robert', 'Michel', 'Rachel', 'Lois', 'Michelle', 'Bernard',
              'Marion', 'Laura', 'Jean', 'Rachel', 'Franck', 'John', 'Gael',
              'Anna', 'Alix', 'Robert', 'Marion', 'David', 'Tony', 'Abel',
              'Becky', 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David',
              'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia']
    labels = np.asarray(labels, dtype=object)

    n_labels = len(np.unique(labels))
    n_samples = len(labels)
    n_folds = 5
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    folds = cval.LabelKFold(labels, n_folds=n_folds).idxs
    ideal_n_labels_per_fold = n_samples // n_folds

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(labels))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_labels_per_fold))

    # Check that each label appears only in 1 fold
    for label in np.unique(labels):
        assert_equal(len(np.unique(folds[labels == label])), 1)

    # Check that no label is on both sides of the split
    for train, test in cval.LabelKFold(labels, n_folds=n_folds):
        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)

    # Should fail if there are more folds than labels
    labels = np.array([1, 1, 1, 2, 2])
    assert_raises(ValueError, cval.LabelKFold, labels, n_folds=3)
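# The central invariant this test checks — no label on both sides of a
# split — reads the same way against the modern GroupKFold API. A minimal
# standalone sketch:
import numpy as np
from sklearn.model_selection import GroupKFold

groups = np.random.RandomState(0).randint(0, 15, 1000)
X = np.zeros((len(groups), 1))  # dummy features; GroupKFold only needs shapes
for train, test in GroupKFold(n_splits=5).split(X, groups=groups):
    # Each group must land entirely in train or entirely in test.
    assert len(np.intersect1d(groups[train], groups[test])) == 0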
# Initializing the classifiers (all are tree-based classifiers)
dt = DecisionTreeClassifier()
rf = RandomForestClassifier(n_estimators=51)
extree = ExtraTreeClassifier()
classifier_list = [dt, rf, extree]
classifier_name_list = ["Decision Tree", "Random Forests", "Extra Trees"]

data = dataFrame.values

# Initializing cross-validation models
kf = cross_validation.KFold(len(labels), n_folds=5)
stratifiedkf = cross_validation.StratifiedKFold(labels, n_folds=4)
labeledkf = cross_validation.LabelKFold(labels, n_folds=4)
leavePout = cross_validation.LeavePOut(len(labels), p=100)
cross_validation_model_list = [kf, stratifiedkf, labeledkf, leavePout]
cross_validation_model_names = ["K-Fold", "Stratified K-Fold",
                                "Labeled K-Fold", "Leave P Out"]

# Cross-validating each given classifier
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    scores = cross_validation.cross_val_score(classifier, data, labels, cv=10)
    print("-------- For Classifier : ", classifier_name, " ---------------")
    print("Score Array : ", scores)
    print("Mean Score : ", scores.mean())
    print("Standard Deviation : ", scores.std())
    print("------------------------------------------------------")
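# Note that the splitter objects defined above are never actually passed to
# cross_val_score, which uses a plain cv=10. A sketch of scoring each
# classifier under each splitter, should that be the intent:
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    for cv_model, cv_name in zip(cross_validation_model_list,
                                 cross_validation_model_names):
        # Caution: LeavePOut with p=100 enumerates combinatorially many
        # splits and is impractical to run on a dataset of any real size.
        scores = cross_validation.cross_val_score(classifier, data, labels,
                                                  cv=cv_model)
        print(classifier_name, "+", cv_name, ":",
              scores.mean(), "+/-", scores.std())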
def cross_validate(self, X_train, y_train, folds=3,
                   normalize_predictions=False):
    '''
    clf - sklearn classifier implementing the predict_proba function
    X_train - list of lists of pandas dataframes; each sub-list contains
        dataframes for related data (to keep in the same fold), and each
        dataframe contains index-related points for windowed calculations.
        ALL DATAFRAMES MUST HAVE THE SAME COLUMNS -- missing data is
        indicated with np.nan
    y_train - truth labels for each datapoint, organized in the same general
        structure as X_train: a list of lists of np.arrays

    Returns: AUCs for each fold
    '''
    aucs = []
    group_labels = range(len(X_train))
    pred_points = []
    pred_groups = []
    pred_subgroups = []
    # Somewhat unusual use of LabelKFold -- each group is its own fold.
    for train_indices, test_indices in cross_validation.LabelKFold(
            group_labels, folds):
        train_x = [x for i, x in enumerate(X_train) if i in train_indices]
        train_y = [y for i, y in enumerate(y_train) if i in train_indices]
        self.fit(train_x, train_y)

        test_x = [x for i, x in enumerate(X_train) if i in test_indices]
        test_y = [y for i, y in enumerate(y_train) if i in test_indices]
        predictions = self.predict_proba(test_x)

        # Option to compute AUCs on normalized predictions across
        # prediction groups.
        if normalize_predictions:
            for group in range(len(predictions)):
                for subgroup in range(len(predictions[group])):
                    predictions[group][subgroup] = normalize(
                        predictions[group][subgroup])

        # Book-keeping of how many groups and sub-groups could be
        # predicted for.
        concat_predictions = []
        group_finite = 0
        subgroup_finite = 0
        for group in predictions:
            group_has_finite = False
            for subgroup in group:
                concat_predictions.append(subgroup)
                if np.any(np.isfinite(subgroup)):
                    subgroup_finite += 1
                    group_has_finite = True
            if group_has_finite:
                group_finite += 1

        # Linearize predictions.
        concat_truths = np.hstack(itertools.chain(*test_y))
        concat_predictions = np.hstack(concat_predictions)

        # Drop truths/predictions where the prediction is NaN.
        finite = np.isfinite(concat_predictions)
        concat_truths = concat_truths[finite]
        concat_predictions = concat_predictions[finite]

        # Calculate AUC.
        aucs.append(roc_auc_score(concat_truths, concat_predictions))
        pred_groups.append(group_finite)
        pred_subgroups.append(subgroup_finite)
        # Length actually predicted, with NaNs dropped.
        pred_points.append(len(concat_predictions))

    return {
        'aucs': aucs,
        'loss': self.cv_loss(aucs),
        'pred_groups': pred_groups,
        'pred_subgroups': pred_subgroups,
        'pred_points': pred_points,
    }
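# If the intent really is one fold per group (as the comment above says),
# LeaveOneGroupOut expresses that split directly on modern scikit-learn,
# without bending LabelKFold. A minimal sketch with placeholder data:
import numpy as np
from sklearn.model_selection import LeaveOneGroupOut

groups = np.arange(4)  # one distinct label per group
X = np.zeros((4, 1))   # dummy features; only the shapes matter here
for train_idx, test_idx in LeaveOneGroupOut().split(X, groups=groups):
    print(train_idx, test_idx)  # each group appears as the test fold exactly once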
def run(inputFilename, metricFilename, cv=3, neighbors=10, threshold=1.0,
        gene_coeff=1, gene_exp=1, geo_coeff=1, geo_exp=1, feat_coeff=1,
        feat_exp=1, verbose=0, limit=-1, labeled=0, jobs=1, weighted=True,
        algorithm="knn", prefix=""):
    if verbose:
        print("Running " + repr(cv) + "-fold cross-validation:")
        if algorithm == "knn":
            print(" algorithm = knn")
            print(" neighbors = " + repr(neighbors))
            print(" distance threshold = " + repr(threshold))
            print(" gene_coeff = " + repr(gene_coeff))
            print(" gene_exp = " + repr(gene_exp))
            print(" geo_coeff = " + repr(geo_coeff))
            print(" geo_exp = " + repr(geo_exp))
            print(" feat_coeff = " + repr(feat_coeff))
            print(" feat_exp = " + repr(feat_exp))
            print(" weighted = " + repr(weighted))
        elif algorithm == "random":
            print(" algorithm = random")
        elif algorithm == "zero":
            print(" algorithm = zero")
        elif algorithm == "majority":
            print(" algorithm = majority")
        print("Loading feature dataset")

    inputDataset = Dataset()
    inputDataset.loadFromNPZ(inputFilename)
    inputFeatureList = inputDataset.getFeatureList()

    # Pick out the features of interest.
    inputFeatureList = [((l, f, s), v) for ((l, f, s), v) in inputFeatureList
                        if f.startswith(prefix)]
    if limit > 0:
        inputFeatureList = inputFeatureList[:limit]
    print("Using " + repr(len(inputFeatureList)) + " features")

    np.random.shuffle(inputFeatureList)
    inputData, inputTarget = zip(*inputFeatureList)
    inputData = np.array(inputData)
    inputTarget = np.array(inputTarget)

    if verbose:
        print("Making classifier")
    if algorithm == 'knn':
        distanceDataset = DistanceDataset()
        distanceDataset.loadFromNPZ(metricFilename)
        rc = GeoMetricKNNClassifier(distanceDataset, k=neighbors,
                                    threshold=threshold, verbose=verbose,
                                    gene_coeff=gene_coeff, gene_exp=gene_exp,
                                    geo_coeff=geo_coeff, geo_exp=geo_exp,
                                    weighted=weighted)
    elif algorithm == 'baggingKNN':
        distanceDataset = DistanceDataset()
        distanceDataset.loadFromNPZ(metricFilename)
        rc = GeoMetricKNNClassifier(distanceDataset, k=neighbors,
                                    threshold=threshold, verbose=verbose,
                                    gene_coeff=gene_coeff, gene_exp=gene_exp,
                                    geo_coeff=geo_coeff, geo_exp=geo_exp,
                                    weighted=weighted)
        rc = BaggingClassifier(rc, n_estimators=3, bootstrap=False)
    elif algorithm == 'random':
        rc = RandomClassifier()
    elif algorithm == 'majority':
        rc = AverageClassifier()
    elif algorithm == 'zero':
        rc = ZeroClassifier()
    elif algorithm == 'als':
        geocoordDataset = Dataset()
        geocoordDataset.loadFromNPZ(
            "../../../data/results/geodata/geocoord_features.npz")
        rc = MatrixFactorClassifier(algorithm='als', additionalDatasets=[])
    elif algorithm == 'svd':
        rc = MatrixFactorClassifier(algorithm='svd')
    elif algorithm == "featureKNNdivided":
        rc = DividedFeatureMetricKNNClassifier(k=neighbors,
                                               threshold=threshold,
                                               verbose=verbose,
                                               weighted=weighted)
    elif algorithm == "featureKNN":
        rc = FeatureMetricKNNClassifier(k=neighbors, threshold=threshold,
                                        verbose=verbose, weighted=weighted)
    elif algorithm == "KNNdivided":
        distanceDataset = DistanceDataset()
        distanceDataset.loadFromNPZ(metricFilename)
        rc = DividedKNNClassifier(distanceDataset, k=neighbors,
                                  threshold=threshold, verbose=verbose,
                                  gene_coeff=gene_coeff, gene_exp=gene_exp,
                                  geo_coeff=geo_coeff, geo_exp=geo_exp,
                                  feat_coeff=feat_coeff, feat_exp=feat_exp,
                                  weighted=weighted)
    elif algorithm == "ensemble":
        distanceDataset = DistanceDataset()
        distanceDataset.loadFromNPZ(metricFilename)
        rc = EnsembleClassifier()
        c = SavedMetricKNNClassifier(distanceDataset, "GENETIC", k=neighbors,
                                     threshold=threshold, verbose=verbose,
                                     weighted=weighted)
        rc.addClassifier(c)
        c = SavedMetricKNNClassifier(distanceDataset, "GEOGRAPHIC",
                                     k=neighbors, threshold=threshold,
                                     verbose=verbose, weighted=weighted)
        rc.addClassifier(c)
        c = DividedFeatureMetricKNNClassifier(k=neighbors, threshold=threshold,
                                              verbose=verbose,
                                              weighted=weighted)
        rc.addClassifier(c)
        #c = MatrixFactorClassifier(algorithm='als')
        #rc.addClassifier(c)
    else:
        sys.exit("ERROR: No algorithm named " + algorithm)

    if verbose:
        print("Cross validating")
    if labeled:
        # Group folds by language so no language spans the train/test split.
        labels = [inputDataset.languageCodes.index(l)
                  for l, _, _ in inputData]
        kf = cross_validation.LabelKFold(labels, n_folds=cv)
    else:
        kf = cross_validation.KFold(len(inputData), n_folds=cv)
    scores = cross_validation.cross_val_score(rc, inputData, inputTarget,
                                              cv=kf, scoring="accuracy",
                                              n_jobs=jobs, verbose=verbose)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean() * 100,
                                           scores.std() * 2 * 100))
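# A hypothetical invocation of run() with grouped (labeled) cross-validation,
# so that all features from one language stay in the same fold. The file
# paths here are placeholders, not files from the original project.
run("features.npz", "distances.npz", cv=5, neighbors=10,
    labeled=1,  # group folds by language code via LabelKFold
    algorithm="knn", weighted=True, verbose=1, jobs=4)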