def main(): parser = argparse.ArgumentParser( "Script to create the Venn Plots from BED files") parser.add_argument( "input", nargs='+', help="The directory containing BED files from pipeline") parser.add_argument("-r", "--reference", required=True, help="The reference BED file to compare against") parser.add_argument("-o", "--output", required=True, help="The output prefix") parser.add_argument( "-f", "--filter", ) args = parser.parse_args() ref_bed = bed12.loadbed(args.reference, False, False) print("Loaded Reference BED file. # junctions: " + str(len(ref_bed))) # Load all bed files bed_data = {} aligners = set() reads = set() junc_analysers = set() for bed_file in args.input: bed_base = os.path.splitext(os.path.basename(bed_file))[0] parts = bed_base.split('-') if (not parts[0] == "trinity"): aligners.add(parts[0]) reads.add(parts[1]) junc_analysers.add(parts[2]) bed_data[parts[2] + "-" + parts[0]] = bed12.loadbed( bed_file, False, False) print("Loaded: " + bed_file + "; # junctions: " + str(len(bed_data[parts[2] + "-" + parts[0]]))) print("Found these aligners: " + ', '.join(aligners)) print("Found these reads: " + ', '.join(reads)) print("Found these junction analysis tools: " + ', '.join(junc_analysers)) # Build table tab = [] for a in aligners: for j in junc_analysers: p = Performance() p.tp = len(bed_data[j + "-" + a] & ref_bed) p.fp = len(bed_data[j + "-" + a] - ref_bed) p.fn = len(ref_bed - bed_data[j + "-" + a]) tab.append(a + "\t" + j + "\t" + p.__str__()) # Output table to disk with open(args.output + "-junc_analysis.tab", "w") as tab_out: print("Aligner\tFilter\t" + Performance.shortHeader(), file=tab_out) for p in tab: print(p, file=tab_out)
def main(): parser = argparse.ArgumentParser( "Script to compare bed file against reference bed") parser.add_argument("input", nargs="+", help="The BED file to analyse") parser.add_argument("-r", "--reference", required=True, help="The reference BED file to compare against") parser.add_argument("-o", "--output", help="The output venn plot") args = parser.parse_args() ref_bed = bed12.loadbed(args.reference, False, False) print("Loaded Reference BED file. # junctions: ", len(ref_bed)) # Load all bed files print("Results:") print("File\t#junc\t", Performance.shortHeader()) recall = 0 precision = 0 f1 = 0 for bf in args.input: bed_data = bed12.loadbed(bf, False, False) # Build table tab = list() p = Performance() p.tp = len(ref_bed & bed_data) p.fp = len(bed_data - ref_bed) p.fn = len(ref_bed - bed_data) print(bf, "\t", len(bed_data), "\t", p) recall += p.recall() precision += p.precision() f1 += p.F1() if len(args.input) > 1: print("Mean recall: ", recall / len(args.input)) print("Mean precision: ", precision / len(args.input)) print("Mean f1: ", f1 / len(args.input)) if not args.output == None and len(args.input) == 1: # Create Venns plt = figure(1, figsize=(6, 6)) venn2(subsets=(p.fn, p.fp, p.tp), set_labels=(args.reference, args.input)) plt.show() plt.savefig(args.output)
def main(): parser = argparse.ArgumentParser("Script to produce a list labelling whether each entry in the portcullis tab input belong to the bed reference or not") parser.add_argument("input", nargs="+", help="The tab file produce by portcullis") parser.add_argument("-r", "--reference", required=True, help="The reference BED file to compare against") parser.add_argument("-o", "--output", default="bedref_out.labels", help="Output prefix for output files") args = parser.parse_args() # X should contain a matrix of features derived from the portcullis tab file # y should contain the labels (0 not a valid junction, 1 a valid junction). Confirmed with the reference. # Load tab file and produce matrix header = "" with open(args.input[0]) as f: # Skip header header = f.readline() f.close() # Load reference and add labels ref = bed12.loadbed(args.reference, False, False) print("# ref entries: " + str(len(ref)), file=sys.stderr) res = open(args.output, "w") nbentries = 0 nb_pos = 0 nb_neg = 0 for tf in args.input: with open(tf) as f: # Skip header h = f.readline() for line in f: cleanline = line.strip() if not cleanline == "": b = bed12.BedEntry.create_from_line(line, False, False) nbentries += 1 if b in ref: print("1", file=res) nb_pos += 1 else: print("0", file=res) nb_neg += 1 res.close() print("Found", nbentries, "bed entries in", len(args.input), "input files") print("Detected", nb_pos, "positive and", nb_neg, "negative entries")
def main(): parser = argparse.ArgumentParser( "Script to build a random forest decision tree") parser.add_argument("input", nargs="+", help="The tab file produce by portcullis") parser.add_argument("-r", "--reference", required=True, help="The reference BED file to compare against") args = parser.parse_args() inputstr = ", ".join(args.input) # X should contain a matrix of features derived from the portcullis tab file # y should contain the labels (0 not a valid junction, 1 a valid junction). Confirmed with the reference. # Load tab file and produce matrix tabdata = [] bed = [] for i in args.input: b, t = tab.loadtab(i) tabdata.extend(t) bed.extend(b) print("Loaded " + str(len(b)) + " entries from: " + i) print("# tab entries: " + str(len(tabdata)) + " from " + str(len(args.input)) + " input files") # Load reference and add labels ref = bed12.loadbed(args.reference, False, False) print("# ref entries: " + str(len(ref))) in_juncs = 0 out_juncs = 0 X = np.zeros((len(tabdata), tab.TabEntry.nbMetrics())) y = np.zeros((len(tabdata), 1)) for i in range(0, len(tabdata)): b = bed[i] X[i] = tabdata[i].makeMatrixRow() if b in ref: in_juncs += 1 y[i, 0] = 1 else: out_juncs += 1 y[i, 0] = 0 y = y[:, 0] print("In:" + str(in_juncs)) print("Out:" + str(out_juncs)) print("Running PCA") pca = PCA(n_components=10) X_r = pca.fit_transform(scale(X)) print(pca.explained_variance_ratio_) fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(27, 8)) plt.tight_layout(pad=4.0, w_pad=4.0, h_pad=3.0) for c, i, target_name in zip("gr", [1, 0], ["genuine", "invalid"]): ax1.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name) ax1.legend() ax1.set_title("PCA") ax1.set_xlabel("PC1") ax1.set_ylabel("PC2") for c, i, target_name in zip("gr", [1, 0], ["genuine", "invalid"]): ax2.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name) ax2.legend() ax2.set_title("Zoomed PCA") ax2.set_xlabel("PC1") ax2.set_ylabel("PC2") ax2.set_xlim(-5, 10) ax2.set_ylim(-2, 10) n = len(X_r) kf_5 = cross_validation.KFold(n, n_folds=5, shuffle=True, random_state=2) regr = LogisticRegression() acc = [] score = cross_validation.cross_val_score(regr, np.ones((n, 1)), y.ravel(), cv=kf_5, scoring='f1').mean() acc.append(score) for i in np.arange(1, 11): score = cross_validation.cross_val_score(regr, X_r[:, :i], y.ravel(), cv=kf_5, scoring='f1').mean() acc.append(score) print(acc) ax3.plot([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], acc[0:11], '-v') ax3.set_title('Logistic regression using 5-fold CV') ax3.set_xlabel('Number of principal components in regression') ax3.set_ylabel('Accuracy') ax4.plot([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], acc[1:11], '-v') ax4.set_title('Logistic regression excluding initial') ax4.set_xlabel('Number of principal components in regression') ax4.set_ylabel('Accuracy') ax4.set_xlim(0, 10) plt.show()
def main(): parser = argparse.ArgumentParser("Script to create the Venn Plots from BED files") parser.add_argument("input", help="The directory containing BED files from pipeline") parser.add_argument("-o", "--output", required=True, help="The output bed files prefix") parser.add_argument("-r", "--reference", required=True, help="The reference BED file to compare against") args = parser.parse_args() ref_bed = bed12.loadbed(args.reference, False, False) print("Loaded Reference BED file. # junctions: " + str(len(ref_bed))) # Load all bed files bed_data = {} aligners = set() reads = set() junc_analysers = set() for bed_file in os.listdir(args.input): if bed_file.endswith("-real-all.bed"): bed_base = os.path.splitext(bed_file)[0] bed_data[bed_base] = bed12.loadbed(args.input + "/" + bed_file, False, False) parts = bed_base.split('-') aligners.add(parts[0]) reads.add(parts[1]) junc_analysers.add(parts[2]) print("Loaded: " + bed_file + "; # junctions: " + str(len(bed_data[bed_base]))) print("Found these aligners: " + ', '.join(aligners)) print("Found these reads: " + ', '.join(reads)) print("Found these junction analysis tools: " + ', '.join(junc_analysers)) # Filtering reference new_ref = set() extra = set() for a in aligners: new_ref = new_ref.union(ref_bed.intersection(bed_data[a + "-real-all"])) extra = extra.union(bed_data[a + "-real-all"] - ref_bed) print("New reference contains " + str(len(new_ref)) + " junctions from original reference") sv_bed = set2list(new_ref, "simvar") # Output new bed file to disk with open(args.output + ".sim_var.bed", "w") as bed_sv_out: print("track name=\"junctions\"", file=bed_sv_out) for b in sv_bed: print(b, file=bed_sv_out) print("Saved: " + args.output + ".sim_var.bed") print() print("Found " + str(len(extra)) + " potential junctions outside reference") extra_special = set() for e in extra: found = 0 for a in aligners: if e in bed_data[a + "-real-all"]: found += 1 if found >= 2: extra_special.add(e) print("Found " + str(len(extra_special)) + " junctions outside reference but present in at least 2 alignments") new_ref = new_ref.union(extra_special) print("New reference contains " + str(len(new_ref)) + " junctions") new_bed = set2list(new_ref, "real") # Output new bed file to disk with open(args.output + ".real.bed", "w") as bed_out: print("track name=\"junctions\"", file=bed_out) for b in new_bed: print(b, file=bed_out) print("Saved: " + args.output + ".real.bed")
def main(): parser = argparse.ArgumentParser( "Script to produce a list labelling whether each entry in the portcullis tab input belong to the bed reference or not" ) parser.add_argument("input", nargs="+", help="The tab file produce by portcullis") parser.add_argument("-r", "--reference", required=True, help="The reference BED file to compare against") parser.add_argument("-o", "--output", default="bedref_out.labels", help="Output prefix for output files") parser.add_argument("-f", "--filter", action='store_true', default=False, help="Whether to filter tab file") args = parser.parse_args() # X should contain a matrix of features derived from the portcullis tab file # y should contain the labels (0 not a valid junction, 1 a valid junction). Confirmed with the reference. # Load tab file and produce matrix header = "" with open(args.input[0]) as f: # Skip header header = f.readline() f.close() # b, t = tab.loadtab(i) # bed.extend(b) # tabs.extend(t) # print ("Loaded " + str(len(b)) + " entries from: " + i, file=sys.stderr) # print ("# tab entries: " + str(len(tabs)) + " from " + str(len(args.input)) + " input files", file=sys.stderr) # Load reference and add labels ref = bed12.loadbed(args.reference, False, False) print("# ref entries: " + str(len(ref)), file=sys.stderr) res = open(args.output, "w") filtin = None filtout = None if args.filter: filtin = open(args.output + ".in.tab", "w") filtout = open(args.output + ".out.tab", "w") filtin.write(header) filtout.write(header) nbentries = 0 for tf in args.input: with open(tf) as f: # Skip header h = f.readline() for line in f: cleanline = line.strip() if not cleanline == "": b = bed12.BedEntry.create_from_tabline(line, False, False) nbentries += 1 if b in ref: print("1", file=res) if args.filter: print(line, file=filtin, end="") else: print("0", file=res) if args.filter: print(line, file=filtout, end="") res.close() if args.filter: filtin.close() filtout.close() print("Found ", nbentries, " tab entries in ", len(args.input), " input files")
def main(): parser = argparse.ArgumentParser( "Script to build a random forest decision tree") parser.add_argument("input", nargs="+", help="The tab file produce by portcullis") parser.add_argument("-r", "--reference", required=True, help="The reference BED file to compare against") parser.add_argument("-t", "--threads", type=int, default="1", help="The number of threads to use") parser.add_argument("--test", help="Test the classifier against this file") parser.add_argument("-o", "--output", required=True, help="The output prefix") args = parser.parse_args() # X should contain a matrix of features derived from the portcullis tab file # y should contain the labels (0 not a valid junction, 1 a valid junction). Confirmed with the reference. # Load tab file and produce matrix bed = [] tab = [] for i in args.input: b, t = tab.loadtab(i) bed.extend(b) tab.extend(t) print("Loaded " + str(len(b)) + " entries from: " + i) print("# tab entries: " + str(len(tab)) + " from " + str(len(args.input)) + " input files") # Load reference and add labels ref = bed12.loadbed(args.reference, False, False) print("# ref entries: " + str(len(ref))) in_juncs = 0 out_juncs = 0 X = np.zeros((len(tab), TabEntry.nbMetrics())) y = list() for i in range(0, len(bed)): b = bed[i] X[i] = tab[i].makeMatrixRow() if b in ref: in_juncs += 1 y.append(1) else: out_juncs += 1 y.append(0) print("In:" + str(in_juncs)) print("Out:" + str(out_juncs)) # Load test data test_b, test_t = loadtab(args.test) test_X = np.zeros((len(test_t), TabEntry.nbMetrics())) test_y = [] for i in range(0, len(test_t)): b = test_b[i] test_X[i] = test_t[i].makeMatrixRow() if b in ref: test_y.append(1) else: test_y.append(0) print("Training Random Forest classifier") clf1 = RandomForestClassifier(n_estimators=40) scores = cross_val_score(clf1, X, y, n_jobs=args.threads, scoring="f1", cv=10) print("Random Forest F1 score: " + str(scores.mean()) + " (+/- " + str(scores.std() * 2)) clf1.fit(X, y) clf1_y_pred = clf1.predict(test_X) print( classification_report(test_y, clf1_y_pred, target_names=["Invalid", "Valid"])) # print("Training SVM (with RBF) classifier") # clf2 = SVC() # scores = cross_val_score(clf2, X, y, n_jobs=args.threads, scoring="f1") # print("SVM Mean score: " + str(scores.mean())) # clf2.fit(X, y) # clf2_y_pred = clf2.predict(test_X) # print(classification_report(test_y, clf2_y_pred, target_names=["0", "1"])) importances = clf1.feature_importances_ std = np.std([tree.feature_importances_ for tree in clf1.estimators_], axis=0) indices = np.argsort(importances)[::-1] # Print the feature ranking print("Feature ranking:") for f in range(X.shape[1]): print("%d. feature %s (%f)" % (f + 1, TabEntry.featureAt(indices[f]), importances[indices[f]])) # Plot the feature importances of the forest plt.figure() plt.title("Feature importances") plt.bar(range(X.shape[1]), importances[indices], color="r", yerr=std[indices], align="center") plt.xticks(range(X.shape[1]), TabEntry.sortedFeatures(indices)) locs, labels = plt.xticks() plt.setp(labels, rotation=90) plt.xlim([-1, X.shape[1]]) plt.tight_layout() plt.savefig(args.output + ".png")
def main(): parser = argparse.ArgumentParser( "Script to create the Venn Plots from BED files") parser.add_argument("input", nargs="+", help="The BED files to analyse") parser.add_argument("-r", "--reference", required=True, help="The reference BED file to compare against") parser.add_argument("-o", "--output", required=True, help="The output prefix") args = parser.parse_args() ref_bed = bed12.loadbed(args.reference, False, False) print("Loaded Reference BED file. # junctions: " + str(len(ref_bed))) # Load all bed files bed_data = {} aligners = set() reads = set() # junc_analysers = set() for bed_path in args.input: bed_file = os.path.split(bed_path)[1] bed_base = os.path.splitext(bed_file)[0] bed_data[bed_base] = bed12.loadbed(bed_path, False, False) parts = bed_base.split('-') aligners.add(parts[0]) reads.add(parts[1]) # junc_analysers.add(parts[2]) print("Loaded: " + bed_file + "; # junctions: " + str(len(bed_data[bed_base]))) print("Found these aligners: " + ', '.join(aligners)) print("Found these reads: " + ', '.join(reads)) # print ("Found these junction analysis tools: " + ', '.join(junc_analysers)) # Build table tab = list() for a in aligners: for r in reads: p = Performance() p.aligner = a p.input = r p.tp = len(ref_bed & bed_data[a + "-" + r]) p.fp = len(bed_data[a + "-" + r] - ref_bed) p.fn = len(ref_bed - bed_data[a + "-" + r]) tab.append(r + "\t" + a + "\t" + p.__str__()) # Output table to disk with open(args.output + "-align_reads.tab", "w") as tab_out: print("Dataset\tAligner\t" + Performance.shortHeader(), file=tab_out) for p in tab: print(p, file=tab_out) # Create Venns cols = rpy2.robjects.vectors.StrVector( ["lightblue", "purple", "green", "orange", "red"]) r = rpy2.robjects.r # Start the R thread base = importr("base") venn = importr("VennDiagram") grdevices = importr("grDevices") for r in reads: categories = list() categories.append("Reference") sets = list() sets.append(ref_bed) nums = dict() nums["area1"] = len(ref_bed) i = 2 for a in sorted(aligners): s = bed_data[a + "-" + r] sets.append(s) categories.append(a) nums["area{0}".format(i)] = len(s) i += 1 for num_combs in range(2, 6): for comb in itertools.combinations(range(1, 6), num_combs): index = "".join([str(x) for x in comb]) curr_sets = [sets[num - 1] for num in comb] nums["n{0}".format(index)] = len(set.intersection(*curr_sets)) grdevices.tiff(args.output + "-" + r + ".venn.tiff", width=960, height=960) venn.draw_quintuple_venn( height=5000, width=5000, # This will be in alphabetical order X( fill=cols, category=rpy2.robjects.vectors.StrVector(categories), margin=0.2, cat_dist=rpy2.robjects.vectors.FloatVector( [0.25, 0.3, 0.25, 0.25, 0.25]), cat_cex=3, cat_col=rpy2.robjects.vectors.StrVector( ["darkblue", "purple", "darkgreen", "darkorange", "darkred"]), cex=2, main="Comparison on junctions found by alignment tools", main_col="black", main_cex=8, sub="" + r + " dataset", sub_col="black", sub_cex=5, **nums) grdevices.dev_off()
def main(): parser = argparse.ArgumentParser( "Script to build a random forest decision tree") parser.add_argument("pos", help="The tab file produce by portcullis") parser.add_argument("neg", help="The tab file produce by portcullis") parser.add_argument("input", help="The tab file produce by portcullis") #parser.add_argument("input2", help="The tab file produce by portcullis") parser.add_argument("-t", "--threads", type=int, default="1", help="The number of threads to use") parser.add_argument("--test", help="Test the classifier against this file") parser.add_argument("-o", "--output", required=True, help="The output prefix") args = parser.parse_args() # X should contain a matrix of features derived from the portcullis tab file # y should contain the labels (0 not a valid junction, 1 a valid junction). Confirmed with the reference. pos = bed12.loadbed(args.pos, False, False) neg = bed12.loadbed(args.neg, False, False) data_X = [] train_Y = [] data = [] test_Y = [] #ref = open(args.input2) with open(args.input) as f: # Skip header f.readline() for line in f: parts = line.strip().split(sep="\t") if len(parts) > 1: test_Y.append(int(parts[8])) #test_Y.append(int(ref.readline())) raw = int(parts[14]) #rel = float(parts[9]) #rel2raw = float(parts[10]) maxmmes = float(parts[11]) #data.append(parts[9:-1]) data.append(maxmmes) b = bed12.BedEntry(False) b.chrom = parts[2] b.thick_start = int(parts[4]) b.thick_end = int(parts[5]) + 1 b.strand = parts[12] if b in pos: #data_X.append(parts[9:-1]) data_X.append(maxmmes) train_Y.append(1) if b in neg: #data_X.append(parts[9:-1]) data_X.append(maxmmes) train_Y.append(0) train_X = np.array(data_X, dtype='|S4').astype(np.float).reshape(-1, 1) test_X = np.array(data, dtype='|S4').astype(np.float).reshape(-1, 1) #train_X = np.array(data_X, dtype='|S4').astype(np.float) #test_X = np.array(data, dtype='|S4').astype(np.float) print("Logistic regression with L1 regularisation") #L1 regularized logistic regression with adjusted # weights (inversely proportional to class frequency) lr = LogisticRegression(C=1.0, penalty='l1', tol=1e-6) logReg = lr.fit(train_X, train_Y) lr_pred = lr.predict(test_X) print( classification_report(test_Y, lr_pred, target_names=["Invalid", "Valid"], digits=4)) print("Training Random Forest classifier") rf = RandomForestClassifier(n_estimators=100) scores = cross_val_score(rf, train_X, train_Y, n_jobs=args.threads, scoring="f1", cv=5) print("Random Forest F1 score: " + str(scores.mean()) + " (+/- " + str(scores.std() * 2) + ")") rf.fit(train_X, train_Y) clf1_y_pred = rf.predict(test_X) print( classification_report(test_Y, clf1_y_pred, target_names=["Invalid", "Valid"], digits=4))