Example #1
# Required imports (not shown in the original listing); load_metrics and
# MetricEntry are project-specific helpers assumed to come from the
# surrounding module.
import argparse
import operator
import pickle
import random
from math import floor

import numpy as np
import pandas
import yaml
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor


def main():
    """
    Main script function.
    :return:
    """
    def to_proportion(string):

        string = float(string)
        if string <= 0 or string > 100:
            raise ValueError(string)
        if 1 < string:
            string /= 100
        return string

    def to_pos(string):

        string = int(string)
        if string < 0:
            raise ValueError(string)
        return string

    parser = argparse.ArgumentParser(description=__doc__)
    subset = parser.add_mutually_exclusive_group()
    subset.add_argument("-r",
                        "--random",
                        type=int,
                        default=None,
                        help="A fixed of models to select for training.")
    subset.add_argument(
        "-p",
        "--proportion",
        type=float,
        default=None,
        help="Proportion of the models to be used for training.")
    parser.add_argument(
        "-c",
        "--conf",
        required=True,
        help=
        "File with the configuration for selecting best and worst transcripts."
    )
    parser.add_argument("--regress", action="store_true", default=False)
    # parser.add_argument("-t", "--tmap",
    #                     help="The TMAP file with the comparison results.",
    #                     required=True)
    parser.add_argument("-m",
                        "--metrics",
                        help="The metrics file.",
                        required=True)
    parser.add_argument("-o",
                        "--out",
                        help="Output file.",
                        default="forest.model")
    parser.add_argument(
        "-s",
        "--scoring",
        help="The original scoring file, to retrieve info on fragments.")
    args = parser.parse_args()

    # X should contain a matrix of features derived from the portcullis tab file
    # y should contain the labels (0 not a valid junction, 1 a valid junction).
    # Confirmed with the reference.

    # Load tab file and produce matrix
    # bed, tab = loadtab(args.input)
    # tmap_results = load_tmap(args.tmap)
    # scores = dict()
    # for tid in tmap_results:
    #     if tmap_results[tid].ccode == ("u",):
    #         continue
    #     recall = np.mean([tmap_results[tid].j_recall,
    #                       tmap_results[tid].e_recall,
    #                       tmap_results[tid].n_recall])
    #     precision = np.mean([tmap_results[tid].j_prec,
    #                          tmap_results[tid].e_prec,
    #                          tmap_results[tid].n_prec])
    #     if min(recall, precision) > 0:
    #         scores[tid] = hmean([recall, precision])
    #     else:
    #         scores[tid] = 0
    #
    # print("# TMAP results: " + str(len(tmap_results)))

    # Load reference and add labels
    # ref = bed12.loadbed(args.reference, False, False)
    # metrics = pandas.read_csv(args.metrics, delimiter="\t")

    metrics_pandas = pandas.read_csv(args.metrics, delimiter="\t")

    try:
        zeros = metrics_pandas[(
            ((metrics_pandas.exon_num == 1) &
             (metrics_pandas.combined_cds_length == 0) &
             (metrics_pandas.cdna_length < 300)) |
            ((metrics_pandas.exon_num > 1) &
             (metrics_pandas.combined_cds_intron_fraction == 0) &
             (metrics_pandas.retained_fraction > 0.5)))].tid
    except AttributeError as exc:
        raise AttributeError("\n".join(
            [str(exc),
             str("\n\t".join(list(metrics_pandas.columns)))]))
    hundreds = metrics_pandas[
        (metrics_pandas.proportion_verified_introns_inlocus == 1)
        & (metrics_pandas.snowy_blast_score > 10) &
        (metrics_pandas.retained_fraction == 0) &
        (((metrics_pandas.exon_num > 1) &
          (metrics_pandas.verified_introns_num > 2)) |
         ((metrics_pandas.exon_num == 1) & (metrics_pandas.utr_num == 2)))].tid

    metrics = load_metrics(args.metrics)

    scores = dict()

    for z in zeros:
        scores[z] = 0
    for h in hundreds:
        scores[h] = 100

    print("# metered transcripts:", len(metrics))

    if args.random is not None or args.proportion is not None:
        if args.random is not None:
            selected = random.sample(list(scores.keys()), args.random)
        else:
            selected = random.sample(
                list(scores.keys()),
                int(floor(len(scores) * args.proportion)))

        scores = dict(_ for _ in scores.items() if _[0] in selected)
        metrics = dict(_ for _ in metrics.items() if _[0] in selected)

    X = np.zeros((len(scores), len(MetricEntry.metrics)))
    y = []

    for index, (tid, score) in enumerate(scores.items()):
        X[index] = metrics[tid].matrix_row
        y.append(score)

    if args.regress is True:

        clf = RandomForestRegressor(n_estimators=int(
            len(MetricEntry.metrics) / 3),
                                    max_depth=None,
                                    n_jobs=10,
                                    random_state=0)
    else:
        clf = RandomForestClassifier(n_estimators=int(
            len(MetricEntry.metrics) / 3),
                                     max_depth=None,
                                     n_jobs=10,
                                     random_state=0)

    clf.fit(X, y)
    clf.metrics = MetricEntry.metrics
    importances = clf.feature_importances_
    # std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
    # indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")

    ordered = sorted([(MetricEntry.metrics[_], importances[_])
                      for _ in range(len(importances))],
                     key=operator.itemgetter(1),
                     reverse=True)

    for rank, couple in enumerate(ordered, start=1):
        print(rank, "feature", couple[0], couple[1] * 100)

    print("Total contribution", 100 * sum([_[1] for _ in ordered]))

    # for f in range(X.shape[1]):
    #     print("{0}, feature {1} ({2})".format(
    #         f + 1,
    #         MetricEntry.metrics[f],
    #         importances[f]
    #     ))

    # Create the dictionary
    clf = {"scoring": clf}
    with open(args.scoring) as sco:
        orig = yaml.safe_load(sco)
    clf["requirements"] = orig["requirements"]
    clf["not_fragmentary"] = orig["not_fragmentary"]

    with open(args.out, "wb") as forest:
        pickle.dump(clf, forest)
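
Note: the to_proportion and to_pos converters defined at the top of this example are never attached to any option. A minimal, hypothetical sketch of how they could be plugged into the parser through argparse's type= parameter (not part of the original script):

import argparse

def to_proportion(string):
    value = float(string)
    if value <= 0 or value > 100:
        raise ValueError(string)
    if value > 1:
        value /= 100  # accept "25" as well as "0.25"
    return value

def to_pos(string):
    value = int(string)
    if value < 0:
        raise ValueError(string)
    return value

parser = argparse.ArgumentParser()
subset = parser.add_mutually_exclusive_group()
subset.add_argument("-r", "--random", type=to_pos, default=None,
                    help="A fixed number of models to select for training.")
subset.add_argument("-p", "--proportion", type=to_proportion, default=None,
                    help="Proportion of the models to be used for training.")

args = parser.parse_args(["-p", "25"])
print(args.proportion)  # 0.25; argparse reports a clean usage error if validation fails
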
Example #2
# (Requires the same imports as Example #1.)
def main():

    """
    Main script function.
    :return:
    """

    def to_proportion(string):

        string = float(string)
        if string <= 0 or string > 100:
            raise ValueError(string)
        if 1 < string:
            string /= 100
        return string

    def to_pos(string):

        string = int(string)
        if string < 0:
            raise ValueError(string)
        return string

    parser = argparse.ArgumentParser(description=__doc__)
    subset = parser.add_mutually_exclusive_group()
    subset.add_argument("-r", "--random", type=int,
                        default=None,
                        help="A fixed of models to select for training.")
    subset.add_argument("-p", "--proportion", type=float,
                        default=None,
                        help="Proportion of the models to be used for training.")
    parser.add_argument("-c", "--conf",
                        required=True,
                        help="File with the configuration for selecting best and worst transcripts.")
    parser.add_argument("--regress", action="store_true", default=False)
    # parser.add_argument("-t", "--tmap",
    #                     help="The TMAP file with the comparison results.",
    #                     required=True)
    parser.add_argument("-m", "--metrics", help="The metrics file.", required=True)
    parser.add_argument("-o", "--out", help="Output file.", default="forest.model")
    parser.add_argument("-s", "--scoring", help="The original scoring file, to retrieve info on fragments.")
    args = parser.parse_args()

    # X should contain a matrix of features derived from the portcullis tab file
    # y should contain the labels (0 not a valid junction, 1 a valid junction).
    # Confirmed with the reference.

    # Load tab file and produce matrix
    # bed, tab = loadtab(args.input)
    # tmap_results = load_tmap(args.tmap)
    # scores = dict()
    # for tid in tmap_results:
    #     if tmap_results[tid].ccode == ("u",):
    #         continue
    #     recall = np.mean([tmap_results[tid].j_recall,
    #                       tmap_results[tid].e_recall,
    #                       tmap_results[tid].n_recall])
    #     precision = np.mean([tmap_results[tid].j_prec,
    #                          tmap_results[tid].e_prec,
    #                          tmap_results[tid].n_prec])
    #     if min(recall, precision) > 0:
    #         scores[tid] = hmean([recall, precision])
    #     else:
    #         scores[tid] = 0
    #
    # print("# TMAP results: " + str(len(tmap_results)))

    # Load reference and add labels
    # ref = bed12.loadbed(args.reference, False, False)
    # metrics = pandas.read_csv(args.metrics, delimiter="\t")

    metrics_pandas = pandas.read_csv(args.metrics, delimiter="\t")

    try:
        zeros = metrics_pandas[
            ((metrics_pandas.exon_num == 1) &
             (metrics_pandas.combined_cds_length == 0) &
             (metrics_pandas.cdna_length < 300)) |
            ((metrics_pandas.exon_num > 1) &
             (metrics_pandas.combined_cds_intron_fraction == 0) &
             (metrics_pandas.retained_fraction > 0.5))].tid
    except AttributeError as exc:
        raise AttributeError("\n".join([str(exc), str("\n\t".join(list(metrics_pandas.columns)))]))
    hundreds = metrics_pandas[
        (metrics_pandas.proportion_verified_introns_inlocus == 1) &
        (metrics_pandas.snowy_blast_score > 10) &
        (metrics_pandas.retained_fraction == 0) &
        (((metrics_pandas.exon_num > 1) &
          (metrics_pandas.verified_introns_num > 2)) |
         ((metrics_pandas.exon_num == 1) &
          (metrics_pandas.utr_num == 2)))].tid

    metrics = load_metrics(args.metrics)

    scores = dict()

    for z in zeros:
        scores[z] = 0
    for h in hundreds:
        scores[h] = 100

    print("# metered transcripts:", len(metrics))

    if args.random is not None or args.proportion is not None:
        if args.random is not None:
            selected = random.sample(list(scores.keys()), args.random)
        else:
            selected = random.sample(
                list(scores.keys()),
                int(floor(len(scores) * args.proportion)))

        scores = dict(_ for _ in scores.items() if _[0] in selected)
        metrics = dict(_ for _ in metrics.items() if _[0] in selected)

    X = np.zeros((len(scores), len(MetricEntry.metrics)))
    y = []

    for index, (tid, score) in enumerate(scores.items()):
        X[index] = metrics[tid].matrix_row
        y.append(score)

    if args.regress is True:

        clf = RandomForestRegressor(n_estimators=int(len(MetricEntry.metrics)/3),
                                    max_depth=None,
                                    n_jobs=10,
                                    random_state=0)
    else:
        clf = RandomForestClassifier(n_estimators=int(len(MetricEntry.metrics)/3),
                                     max_depth=None,
                                     n_jobs=10,
                                     random_state=0)

    clf.fit(X, y)
    clf.metrics = MetricEntry.metrics
    importances = clf.feature_importances_
    # std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
    # indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")

    ordered = sorted([(MetricEntry.metrics[_], importances[_]) for _ in range(len(importances))],
                     key=operator.itemgetter(1), reverse=True)

    for rank, couple in enumerate(ordered, start=1):
        print(rank, "feature", couple[0], couple[1] * 100)

    print("Total contribution", 100 * sum([_[1] for _ in ordered]))

    # for f in range(X.shape[1]):
    #     print("{0}, feature {1} ({2})".format(
    #         f + 1,
    #         MetricEntry.metrics[f],
    #         importances[f]
    #     ))

    # Create the dictionary
    clf = {"scoring": clf}
    with open(args.scoring) as sco:
        orig = yaml.safe_load(sco)
    clf["requirements"] = orig["requirements"]
    clf["not_fragmentary"] = orig["not_fragmentary"]

    with open(args.out, "wb") as forest:
        pickle.dump(clf, forest)
Example #3
# Required imports (not shown in the original listing); load_metrics, load_tmap
# and MetricEntry are project-specific helpers assumed to come from the
# surrounding module.
import argparse
import operator
import pickle
import random
from math import floor

import numpy as np
import yaml
from scipy.stats import hmean
from sklearn.ensemble import RandomForestRegressor


def main():
    """
    Main script function.
    :return:
    """
    def to_proportion(string):

        string = float(string)
        if string <= 0 or string > 100:
            raise ValueError(string)
        if 1 < string:
            string /= 100
        return string

    def to_pos(string):

        string = int(string)
        if string < 0:
            raise ValueError(string)
        return string

    parser = argparse.ArgumentParser(description=__doc__)
    subset = parser.add_mutually_exclusive_group()
    subset.add_argument("-r",
                        "--random",
                        type=int,
                        default=None,
                        help="A fixed of models to select for training.")
    subset.add_argument(
        "-p",
        "--proportion",
        type=float,
        default=None,
        help="Proportion of the models to be used for training.")
    parser.add_argument("-t",
                        "--tmap",
                        help="The TMAP file with the comparison results.",
                        required=True)
    parser.add_argument("-m",
                        "--metrics",
                        help="The metrics file.",
                        required=True)
    parser.add_argument("-o",
                        "--out",
                        help="Output file.",
                        default="forest.model")
    parser.add_argument(
        "-s",
        "--scoring",
        help="The original scoring file, to retrieve info on fragments.")
    args = parser.parse_args()

    # X should contain a matrix of features derived from the portcullis tab file
    # y should contain the labels (0 not a valid junction, 1 a valid junction).
    # Confirmed with the reference.

    # Load tab file and produce matrix
    # bed, tab = loadtab(args.input)
    tmap_results = load_tmap(args.tmap)
    scores = dict()
    for tid in tmap_results:
        if tmap_results[tid].ccode == ("u", ):
            continue
        recall = np.mean([
            tmap_results[tid].j_recall, tmap_results[tid].e_recall,
            tmap_results[tid].n_recall
        ])
        precision = np.mean([
            tmap_results[tid].j_prec, tmap_results[tid].e_prec,
            tmap_results[tid].n_prec
        ])
        if min(recall, precision) > 0:
            scores[tid] = hmean([recall, precision])
        else:
            scores[tid] = 0

    print("# TMAP results: " + str(len(tmap_results)))

    # Load reference and add labels
    # ref = bed12.loadbed(args.reference, False, False)
    metrics = load_metrics(args.metrics)
    print("# metered transcripts:", len(metrics))

    if args.random is not None or args.proportion is not None:
        if args.random is not None:
            selected = random.sample(list(scores.keys()), args.random)
        else:
            selected = random.sample(
                list(scores.keys()),
                int(floor(len(scores) * args.proportion)))

        scores = dict(_ for _ in scores.items() if _[0] in selected)
        metrics = dict(_ for _ in metrics.items() if _[0] in selected)

    X = np.zeros((len(scores), len(MetricEntry.metrics)))
    y = []

    for index, (tid, score) in enumerate(scores.items()):
        X[index] = metrics[tid].matrix_row
        y.append(score)

    clf = RandomForestRegressor(n_estimators=int(len(MetricEntry.metrics) / 3),
                                max_depth=None,
                                n_jobs=10,
                                random_state=0)

    clf.fit(X, y)
    clf.metrics = MetricEntry.metrics
    importances = clf.feature_importances_
    # std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
    # indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")

    ordered = sorted([(MetricEntry.metrics[_], importances[_])
                      for _ in range(len(importances))],
                     key=operator.itemgetter(1),
                     reverse=True)

    for rank, couple in enumerate(ordered, start=1):
        print(rank, "feature", couple[0], couple[1] * 100)

    print("Total contribution", 100 * sum([_[1] for _ in ordered]))

    # for f in range(X.shape[1]):
    #     print("{0}, feature {1} ({2})".format(
    #         f + 1,
    #         MetricEntry.metrics[f],
    #         importances[f]
    #     ))

    # Create the dictionary
    clf = dict(("scoring", clf))
    with open(args.scoring) as sco:
        orig = yaml.safe_load(sco)
    clf["requirements"] = orig["requirements"]
    clf["not_fragmentary"] = orig["not_fragmentary"]

    with open(args.out, "wb") as forest:
        pickle.dump(clf, forest)
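
A minimal sketch of how the pickled dictionary written by these scripts could be loaded back and applied. It assumes the default output name forest.model; building a real feature matrix from MetricEntry rows is skipped and replaced with a zero placeholder:

import pickle

import numpy as np

# Load the dictionary written by pickle.dump() above.
with open("forest.model", "rb") as forest:
    model = pickle.load(forest)

clf = model["scoring"]                        # the fitted random forest
requirements = model["requirements"]          # copied from the original scoring YAML
not_fragmentary = model["not_fragmentary"]

# One column per metric, in the same order used at training time (the list was
# attached to the estimator as clf.metrics before pickling).
X_new = np.zeros((1, len(clf.metrics)))
print(clf.predict(X_new))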