def main():
    """Train on a dataset minus a few held-out samples and predict those samples.

    Loads ``data/gipsy.myc``, builds a ``Model`` around
    ``RandomForestClassifier``, fits it while passing the held-out indices as
    ``exclude_indices``, then asks for class probabilities for the same
    indices via ``include_indices``. The most probable class per sample is
    stored on a ``Result`` object, written out with ``output_q`` and plotted.
    """
    myco = Myco(file_path='data/gipsy.myc')
    myco.load()

    out_path = 'data/'
    n_estimators, n_jobs = 60, 1

    result = Result(dataset=myco, out_path=out_path)

    model = Model(RandomForestClassifier,
                  out_path,
                  n_estimators=n_estimators,
                  verbose=True,
                  n_jobs=n_jobs)
    # Bound as `partitions` because that is the name fit() and predict_proba()
    # below receive; binding it as `partition` raised NameError.
    partitions = model._get_partition(myco, n_cores=n_jobs)
    test_indices = [1, 23, 45, 12]

    model.fit(partitions=partitions,
              populations=myco.populations,
              exclude_indices=test_indices)
    probabilities = model.predict_proba(partitions=partitions,
                                        include_indices=test_indices)

    # assumes `probabilities` is (n_samples, n_classes) with columns ordered
    # like model.classes_ -- TODO confirm against Model.predict_proba
    classes = model.classes_
    top_class_ids = np.argmax(probabilities, axis=1)
    top_probabilities = np.max(probabilities, axis=1)
    predicted_origin = [classes[class_id] for class_id in top_class_ids]

    result.set_q_pred_pops_params(predicted_origin,
                                  top_probabilities,
                                  classes,
                                  test_indices=test_indices)
    result.output_q()

    mixture_plot(result)
def main():
    """Cross-validate the gipsy dataset with CrossValidate and plot the outcome."""
    dataset = Myco(file_path='data/gipsy.myc')
    dataset.load()

    # Settings forwarded verbatim to CrossValidate.run.
    run_settings = {
        'n_partitions': 1,
        'n_loci': 0,
        'n_splits': 5,
        'n_estimators': 60,
        'n_cores': 1,
    }

    validator = CrossValidate(dataset=dataset, out_path='data/')
    validator.run(**run_settings)

    mixture_plot(validator)
Example #3
0
def main():
    """Cross-validate a Structure-format microsatellite dataset and plot it.

    Loads ``data/myco.str`` through ``Structure`` with ``is_str=True``, runs
    ``CrossValidate`` on it and passes the finished run to ``mixture_plot``.
    """
    # Only the Structure dataset is built: a Myco dataset constructed just
    # before it was rebound to this name without ever being loaded or used.
    dataset = Structure(file_path='data/myco.str', is_str=True)
    dataset.load()

    cv = CrossValidate(dataset=dataset, out_path='data/')
    cv.run(n_partitions=1, n_loci=0, n_splits=5, n_estimators=60, n_cores=1)

    mixture_plot(cv)
Example #4
0
def main():
    """Run Supervised on the gipsy dataset with four test indices and plot it."""
    dataset = Myco(file_path='data/gipsy.myc')
    dataset.load()

    # Sample indices handed to Supervised.run through its `test` argument.
    held_out = [1, 23, 45, 12]

    runner = Supervised(dataset=dataset, out_path='data/')
    runner.run(
        test=held_out,
        n_partitions=1,
        n_loci=0,
        n_estimators=60,
        n_cores=1,
    )

    mixture_plot(runner)
Example #5
0
def main():
    """Run a stratified k-fold cross-validation by hand and plot the result.

    Loads ``examples/data/gipsy.myc``, partitions it with ``_partition``, and
    for every train/test split produced by ``StratifiedKFold`` fits a fresh
    ``Model`` on each partition, averages the predicted class probabilities
    over the partitions and records the most probable population for each
    test sample. The per-fold results are put back into sample order, stored
    on a ``Result`` object, written out and plotted.

    Raises:
        ValueError: if any population has fewer samples than ``n_splits``.
    """
    myco = Myco(file_path='examples/data/gipsy.myc')
    myco.load()
    populations = np.array(myco.populations)
    out_path = 'data/'
    n_splits = 5
    n_estimators, n_jobs = 60, 1  # RandomForestClassifier parameters
    n_partitions, n_loci = 1, 0  # _partition parameters

    # Check that every population can be divided into @n_splits splits.
    for population, count in Counter(populations).items():
        if count < n_splits:
            raise ValueError(
                "Population {0} has less samples ({1}) than the number of splits ({2})."
                .format(population, count, n_splits))

    predicted_origin_out, mixture_estimate_out, ordering_out = [], [], []
    # As an alternative, use the shortcut: Model(...)._get_partition(...)
    partitions = _partition(myco, out_path, n_partitions, n_loci, n_jobs)

    # Splits the dataset into @n_splits shuffled, label-stratified folds.
    # No random_state is set, so the folds differ from run to run.
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True)

    # Each train/test set is a near-balanced split of the data (label-wise).
    for train, test in kf.split(partitions[0], y=populations):
        mixture_estimates_per_partition = []
        model = Model(RandomForestClassifier,
                      out_path,
                      n_estimators=n_estimators,
                      verbose=True,
                      n_jobs=n_jobs)
        for part in partitions:
            # Train on the training indices of this fold.
            model.fit(partitions=part,
                      populations=populations,
                      include_indices=train)
            # Predict probabilities for the observed classes on the test indices.
            mixture = model.predict_proba(partitions=part,
                                          include_indices=test)
            mixture_estimates_per_partition.append(mixture)

        # Average the results over the partitions and find predicted origins.
        mixture_estimate = np.mean(mixture_estimates_per_partition, axis=0)
        predicted_origin = [
            model.classes_[ind] for ind in np.argmax(mixture_estimate, axis=1)
        ]

        # Accumulate the results of this fold.
        predicted_origin_out.extend(predicted_origin)
        mixture_estimate_out.extend(mixture_estimate.tolist())
        ordering_out.extend(test.tolist())

    # Folds arrive in shuffled order; argsort of the collected test indices
    # gives the permutation that restores the original sample order.
    indices = np.argsort(ordering_out)

    # Sort the results according to the original sample order.
    pred_pops = np.array(predicted_origin_out)[indices]
    q = np.array(mixture_estimate_out)[indices]
    q_pops = sorted(set(populations))

    # Save data to the Result object and output data to the @out_path
    result = Result(dataset=myco, out_path=out_path)
    result.set_pred_pops(pred_pops)
    result.set_q(q, q_pops)
    result.output_q()
    result.output_accuracy()

    # Plot the results
    mixture_plot(result)
Example #6
0
def supervised():
    """Command-line entry point: load a dataset, run Supervised, plot predictions.

    Parses the process arguments, optionally overrides the SplitsTree
    executable path in ``const``, loads the input file as either a ``Myco`` or
    a ``Structure`` dataset, runs ``Supervised`` with the requested settings
    and calls ``mixture_plot`` with ``predictionOnly=True``.

    Raises:
        ValueError: if the parsed format is neither ``myco`` nor ``struct``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-i', '--in-file',
        type=str,
        required=True,
        help='Path to the input data file.',
    )
    cli.add_argument(
        '-o', '--out',
        type=str,
        required=True,
        help='Path to the output folder.',
    )
    cli.add_argument(
        '-P', '--partitions',
        type=int,
        default=1,
        help='The number of partitions (default 1).',
    )
    cli.add_argument(
        '-M', '--loci',
        type=int,
        default=0,
        help='The number of randomly selected loci (0 for all).',
    )
    cli.add_argument(
        '-e', '--estimators',
        type=int,
        default=60,
        help='Number of trees in the Random Forest classifier (default 60).',
    )
    cli.add_argument(
        '-c', '--cores',
        type=int,
        default=1,
        help='The number of cores (default 1).',
    )
    cli.add_argument(
        '-x', '--splitstree',
        type=str,
        default=None,
        help='Path to the SplitsTree executable (default PATH).',
    )
    cli.add_argument(
        '-f', '--format',
        type=str,
        default='myco',
        choices=['myco', 'struct'],
        help='Data file format (default myco).',
    )
    cli.add_argument(
        '-t', '--type',
        type=str,
        default='snp',
        choices=['snp', 'str'],
        help='SNP or microsatellite data.',
    )

    opts = cli.parse_args()

    # Only touch the configured SplitsTree path when one was given explicitly.
    if opts.splitstree is not None:
        const['__SPLITSTREE_PATH__'] = opts.splitstree

    # Pick the dataset reader that matches the declared file format.
    if opts.format == 'myco':
        dataset_cls = Myco
    elif opts.format == 'struct':
        dataset_cls = Structure
    else:
        raise ValueError('Invalid format.')

    microsatellite = opts.type == 'str'
    dataset = dataset_cls(file_path=opts.in_file, is_str=microsatellite)
    dataset.load()

    runner = Supervised(dataset=dataset, out_path=opts.out)
    runner.run(
        n_partitions=opts.partitions,
        n_loci=opts.loci,
        n_estimators=opts.estimators,
        n_cores=opts.cores,
    )

    mixture_plot(runner, predictionOnly=True)