import argparse
import operator
import pickle
import random
from math import floor

import numpy as np
import pandas
import yaml
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# load_metrics and MetricEntry are provided elsewhere in this module.


def main():
    """
    Main script function.
    :return:
    """

    def to_proportion(string):
        """Convert a percentage (1, 100] or fraction (0, 1] into a proportion."""
        string = float(string)
        if string <= 0 or string > 100:
            raise ValueError(string)
        if 1 < string:
            string /= 100
        return string

    def to_pos(string):
        """Convert a string into a non-negative integer."""
        string = int(string)
        if string < 0:
            raise ValueError(string)
        return string

    parser = argparse.ArgumentParser(description=__doc__)
    subset = parser.add_mutually_exclusive_group()
    subset.add_argument("-r", "--random", type=to_pos, default=None,
                        help="A fixed number of models to select for training.")
    subset.add_argument("-p", "--proportion", type=to_proportion, default=None,
                        help="Proportion of the models to be used for training.")
    parser.add_argument("-c", "--conf", required=True,
                        help="File with the configuration for selecting best and worst transcripts.")
    parser.add_argument("--regress", action="store_true", default=False)
    # parser.add_argument("-t", "--tmap",
    #                     help="The TMAP file with the comparison results.",
    #                     required=True)
    parser.add_argument("-m", "--metrics", help="The metrics file.", required=True)
    parser.add_argument("-o", "--out", help="Output file.", default="forest.model")
    parser.add_argument("-s", "--scoring",
                        help="The original scoring file, to retrieve info on fragments.")
    args = parser.parse_args()

    # X should contain a matrix of features derived from the portcullis tab file
    # y should contain the labels (0 not a valid junction, 1 a valid junction).
    # Confirmed with the reference.
    # Load tab file and produce matrix
    # bed, tab = loadtab(args.input)
    # tmap_results = load_tmap(args.tmap)
    # scores = dict()
    # for tid in tmap_results:
    #     if tmap_results[tid].ccode == ("u",):
    #         continue
    #     recall = np.mean([tmap_results[tid].j_recall,
    #                       tmap_results[tid].e_recall,
    #                       tmap_results[tid].n_recall])
    #     precision = np.mean([tmap_results[tid].j_prec,
    #                          tmap_results[tid].e_prec,
    #                          tmap_results[tid].n_prec])
    #     if min(recall, precision) > 0:
    #         scores[tid] = hmean([recall, precision])
    #     else:
    #         scores[tid] = 0
    #
    # print("# TMAP results: " + str(len(tmap_results)))

    # Load reference and add labels
    # ref = bed12.loadbed(args.reference, False, False)

    metrics_pandas = pandas.read_csv(args.metrics, delimiter="\t")

    # Worst transcripts (score 0): short monoexonic models without a CDS, or
    # multiexonic models with no CDS introns and a high retained-intron fraction.
    try:
        zeros = metrics_pandas[
            (((metrics_pandas.exon_num == 1) &
              (metrics_pandas.combined_cds_length == 0) &
              (metrics_pandas.cdna_length < 300)) |
             ((metrics_pandas.exon_num > 1) &
              (metrics_pandas.combined_cds_intron_fraction == 0) &
              (metrics_pandas.retained_fraction > 0.5)))].tid
    except AttributeError as exc:
        raise AttributeError("\n".join(
            [str(exc), "\n\t".join(list(metrics_pandas.columns))]))

    # Best transcripts (score 100): all introns verified in the locus, good
    # BLAST support, no retained introns, and either multiple verified introns
    # (multiexonic) or both UTRs present (monoexonic).
    hundreds = metrics_pandas[
        (metrics_pandas.proportion_verified_introns_inlocus == 1) &
        (metrics_pandas.snowy_blast_score > 10) &
        (metrics_pandas.retained_fraction == 0) &
        (((metrics_pandas.exon_num > 1) &
          (metrics_pandas.verified_introns_num > 2)) |
         ((metrics_pandas.exon_num == 1) &
          (metrics_pandas.utr_num == 2)))].tid

    metrics = load_metrics(args.metrics)
    scores = dict()
    for z in zeros:
        scores[z] = 0
    for h in hundreds:
        scores[h] = 100

    print("# metered transcripts:", len(metrics))

    # Optionally subsample the labelled transcripts, either to a fixed number
    # or to a proportion of the total.
    if args.random is not None or args.proportion is not None:
        if args.random is not None:
            selected = random.sample(list(scores.keys()), args.random)
        else:
            selected = random.sample(list(scores.keys()),
                                     int(floor(len(scores) * args.proportion)))
        selected = set(selected)
        scores = dict(_ for _ in scores.items() if _[0] in selected)
        metrics = dict(_ for _ in metrics.items() if _[0] in selected)

    X = np.zeros((len(scores), len(MetricEntry.metrics)))
    y = []

    for index, (tid, score) in enumerate(scores.items()):
        X[index] = metrics[tid].matrix_row
        y.append(score)

    if args.regress is True:
        clf = RandomForestRegressor(n_estimators=int(len(MetricEntry.metrics) / 3),
                                    max_depth=None,
                                    n_jobs=10,
                                    random_state=0)
    else:
        clf = RandomForestClassifier(n_estimators=int(len(MetricEntry.metrics) / 3),
                                     max_depth=None,
                                     n_jobs=10,
                                     random_state=0)

    clf.fit(X, y)
    clf.metrics = MetricEntry.metrics

    importances = clf.feature_importances_
    # std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
    # indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")

    ordered = sorted([(MetricEntry.metrics[_], importances[_])
                      for _ in range(len(importances))],
                     key=operator.itemgetter(1), reverse=True)
    for rank, couple in enumerate(ordered, start=1):
        print(rank, "feature", couple[0], couple[1] * 100)

    print("Total contribution", 100 * sum([_[1] for _ in ordered]))

    # for f in range(X.shape[1]):
    #     print("{0}, feature {1} ({2})".format(
    #         f + 1,
    #         MetricEntry.metrics[f],
    #         importances[f]
    #     ))

    # Create the dictionary to be pickled: the fitted forest plus the
    # "requirements" and "not_fragmentary" sections of the original scoring file.
    clf = {"scoring": clf}
    with open(args.scoring) as sco:
        orig = yaml.safe_load(sco)
    clf["requirements"] = orig["requirements"]
    clf["not_fragmentary"] = orig["not_fragmentary"]

    with open(args.out, "wb") as forest:
        pickle.dump(clf, forest)
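
# A minimal usage sketch for the pickled output (assumptions: "forest.model"
# is this script's default --out value; the all-zeros row is a hypothetical
# stand-in for a real MetricEntry.matrix_row and must match clf.metrics in
# length and order):
#
#     import pickle
#     import numpy as np
#
#     with open("forest.model", "rb") as forest:
#         model = pickle.load(forest)
#     clf = model["scoring"]  # the fitted RandomForestClassifier/Regressor
#     row = np.zeros((1, len(clf.metrics)))
#     print(clf.predict(row))  # predicted label (0/100) or regressed score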
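
# Standard entry-point guard, assuming this module is meant to be executed
# directly as a script.
if __name__ == "__main__":
    main()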