def main():
    """Fit a random forest on the gipsy dataset and predict four chosen samples.

    The samples listed in ``test_indices`` are passed to ``Model.fit`` as
    ``exclude_indices`` and to ``Model.predict_proba`` as ``include_indices``.
    For each of them the most probable class and its probability are stored
    on a ``Result`` object, which is then written out and plotted.
    """
    myco = Myco(file_path='data/gipsy.myc')
    myco.load()

    out_path = 'data/'
    n_estimators, n_jobs = 60, 1
    result = Result(dataset=myco, out_path=out_path)
    model = Model(RandomForestClassifier, out_path,
                  n_estimators=n_estimators, verbose=True, n_jobs=n_jobs)

    # Must be bound to ``partitions``: that is the name fit() and
    # predict_proba() read below (binding it to ``partition`` raised NameError).
    partitions = model._get_partition(myco, n_cores=n_jobs)

    test_indices = [1, 23, 45, 12]
    model.fit(partitions=partitions, populations=myco.populations,
              exclude_indices=test_indices)
    probabilities = model.predict_proba(partitions=partitions,
                                        include_indices=test_indices)

    # Reduce each row of class probabilities to its best class and score.
    classes = model.classes_
    top_class_ids = np.argmax(probabilities, axis=1)
    top_probabilities = np.max(probabilities, axis=1)
    predicted_origin = [classes[class_id] for class_id in top_class_ids]

    result.set_q_pred_pops_params(predicted_origin, top_probabilities, classes,
                                  test_indices=test_indices)
    result.output_q()
    mixture_plot(result)
def main():
    """Cross-validate the gipsy dataset and plot the resulting mixtures.

    Loads ``data/gipsy.myc``, runs ``CrossValidate`` with 5 splits, a single
    partition, ``n_loci=0``, 60 estimators and one core, writing to ``data/``,
    then hands the finished cross-validation object to ``mixture_plot``.
    """
    dataset = Myco(file_path='data/gipsy.myc')
    dataset.load()

    run_options = {
        'n_partitions': 1,
        'n_loci': 0,
        'n_splits': 5,
        'n_estimators': 60,
        'n_cores': 1,
    }
    validator = CrossValidate(dataset=dataset, out_path='data/')
    validator.run(**run_options)

    mixture_plot(validator)
def main():
    """Cross-validate a Structure-format dataset and plot the result.

    Loads ``data/myco.str`` through ``Structure`` (constructed with
    ``is_str=True``), runs ``CrossValidate`` with 5 splits, a single
    partition, ``n_loci=0``, 60 estimators and one core, writing to ``data/``,
    then plots the cross-validation object.
    """
    # Only the Structure dataset is ever used. The earlier Myco(...) object
    # was overwritten before any use, so it is no longer constructed.
    myco = Structure(file_path='data/myco.str', is_str=True)
    myco.load()

    cv = CrossValidate(dataset=myco, out_path='data/')
    cv.run(n_partitions=1, n_loci=0, n_splits=5, n_estimators=60, n_cores=1)

    mixture_plot(cv)
def main():
    """Run the supervised workflow on the gipsy dataset and plot the result.

    Loads ``data/gipsy.myc``, runs ``Supervised`` with samples 1, 23, 45 and
    12 passed as ``test``, a single partition, ``n_loci=0``, 60 estimators and
    one core, writing to ``data/``, then plots the finished object.
    """
    dataset = Myco(file_path='data/gipsy.myc')
    dataset.load()

    test_samples = [1, 23, 45, 12]
    runner = Supervised(dataset=dataset, out_path='data/')
    runner.run(
        test=test_samples,
        n_partitions=1,
        n_loci=0,
        n_estimators=60,
        n_cores=1,
    )

    mixture_plot(runner)
def main():
    """Run stratified k-fold cross-validation by hand and plot the result.

    For every fold a fresh random-forest ``Model`` is fitted on the training
    indices of each partition and asked for class probabilities on the test
    indices. The probabilities are averaged over the partitions and the most
    probable class is taken as the predicted origin. Fold results are
    collected, reordered by sample index, stored on a ``Result`` object,
    written out and plotted.

    Raises:
        ValueError: If any population has fewer samples than ``n_splits``.
    """
    myco = Myco(file_path='examples/data/gipsy.myc')
    myco.load()
    populations = np.array(myco.populations)

    out_path = 'data/'
    n_splits = 5
    n_estimators, n_jobs = 60, 1  # RandomForestClassifier parameters
    n_partitions, n_loci = 1, 0  # _partition parameters

    # Check that every population can be split into @n_splits splits.
    for population, count in Counter(populations).items():
        if count < n_splits:
            # The message previously formatted the undefined name
            # ``num_splits``, which turned this error into a NameError.
            raise ValueError(
                "Population {0} has less samples ({1}) than the number of splits ({2})."
                .format(population, count, n_splits))

    predicted_origin_out, mixture_estimate_out, ordering_out = [], [], []

    # As an alternative, use the shortcut: Model(...)._get_partition(...)
    partitions = _partition(myco, out_path, n_partitions, n_loci, n_jobs)

    # Split the dataset into @n_splits shuffled folds. Each train/test set
    # is a near-balanced split of the data (label-wise).
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True)

    for train, test in kf.split(partitions[0], y=populations):
        mixture_estimates_per_partition = []
        model = Model(RandomForestClassifier, out_path,
                      n_estimators=n_estimators, verbose=True, n_jobs=n_jobs)

        for part in partitions:
            # Train
            model.fit(partitions=part, populations=populations,
                      include_indices=train)
            # Predict probabilities for observed classes
            mixture = model.predict_proba(partitions=part,
                                          include_indices=test)
            mixture_estimates_per_partition.append(mixture)

        # Average the results over the partitions and find predicted origins.
        mixture_estimate = np.mean(mixture_estimates_per_partition, axis=0)
        predicted_origin = [
            model.classes_[ind] for ind in np.argmax(mixture_estimate, axis=1)
        ]

        # Accumulate the results of this fold.
        predicted_origin_out.extend(predicted_origin)
        mixture_estimate_out.extend(mixture_estimate.tolist())
        ordering_out.extend(test.tolist())

    # Permutation that puts the accumulated rows in ascending sample-index
    # order (folds visit the samples in shuffled order).
    indices = np.argsort(ordering_out)

    # Sort the results according to that permutation.
    pred_pops = np.array(predicted_origin_out)[indices]
    q = np.array(mixture_estimate_out)[indices]
    q_pops = sorted(set(populations))

    # Save data to the Result object and output data to the @out_path.
    result = Result(dataset=myco, out_path=out_path)
    result.set_pred_pops(pred_pops)
    result.set_q(q, q_pops)
    result.output_q()
    result.output_accuracy()

    # Plot the results
    mixture_plot(result)
def supervised():
    """Command-line entry point for the supervised workflow.

    Parses the command line, optionally overrides the SplitsTree executable
    path stored in ``const``, loads the input file as a ``Myco`` or
    ``Structure`` dataset, runs ``Supervised`` with the requested settings and
    plots the result with ``predictionOnly=True``.

    Raises:
        ValueError: If the parsed format is neither ``'myco'`` nor
            ``'struct'``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-i', '--in-file', type=str, required=True,
        help='Path to the input data file.')
    cli.add_argument(
        '-o', '--out', type=str, required=True,
        help='Path to the output folder.')
    cli.add_argument(
        '-P', '--partitions', type=int, default=1,
        help='The number of partitions (default 1).')
    cli.add_argument(
        '-M', '--loci', type=int, default=0,
        help='The number of randomly selected loci (0 for all).')
    cli.add_argument(
        '-e', '--estimators', type=int, default=60,
        help='Number of trees in the Random Forest classifier (default 60).')
    cli.add_argument(
        '-c', '--cores', type=int, default=1,
        help='The number of cores (default 1).')
    cli.add_argument(
        '-x', '--splitstree', type=str, default=None,
        help='Path to the SplitsTree executable (default PATH).')
    cli.add_argument(
        '-f', '--format', type=str, default='myco',
        choices=['myco', 'struct'],
        help='Data file format (default myco).')
    cli.add_argument(
        '-t', '--type', type=str, default='snp',
        choices=['snp', 'str'],
        help='SNP or microsatellite data.')
    options = cli.parse_args()

    # Only override the configured SplitsTree location when one was given.
    if options.splitstree is not None:
        const['__SPLITSTREE_PATH__'] = options.splitstree

    microsatellite = options.type == 'str'
    if options.format == 'myco':
        dataset = Myco(file_path=options.in_file, is_str=microsatellite)
    elif options.format == 'struct':
        dataset = Structure(file_path=options.in_file, is_str=microsatellite)
    else:
        raise ValueError('Invalid format.')
    dataset.load()

    runner = Supervised(dataset=dataset, out_path=options.out)
    runner.run(
        n_partitions=options.partitions,
        n_loci=options.loci,
        n_estimators=options.estimators,
        n_cores=options.cores,
    )

    mixture_plot(runner, predictionOnly=True)