# Shared imports for the example scripts below (a best-effort reconstruction
# of what these snippets need).  bed12 and tab are project-local portcullis
# helper modules; Performance (sketched after Example 1) and set2list
# (sketched after Example 5) are also assumed to be project-local helpers.
import argparse
import itertools
import os
import sys

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from matplotlib_venn import venn2
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import scale
import rpy2.robjects
from rpy2.robjects.packages import importr

import bed12
import tab


def main():
    parser = argparse.ArgumentParser(
        description="Script to create Venn plots from BED files")
    parser.add_argument(
        "input",
        nargs='+',
        help="The BED files produced by the pipeline")
    parser.add_argument("-r",
                        "--reference",
                        required=True,
                        help="The reference BED file to compare against")
    parser.add_argument("-o",
                        "--output",
                        required=True,
                        help="The output prefix")
    parser.add_argument(
        "-f",
        "--filter",
        help="Optional filter (accepted but unused in this example)")
    args = parser.parse_args()

    ref_bed = bed12.loadbed(args.reference, False, False)
    print("Loaded Reference BED file.  # junctions: " + str(len(ref_bed)))

    # Load all bed files
    bed_data = {}
    aligners = set()
    reads = set()
    junc_analysers = set()
    for bed_file in args.input:
        bed_base = os.path.splitext(os.path.basename(bed_file))[0]
        parts = bed_base.split('-')
        if parts[0] != "trinity":
            aligners.add(parts[0])
            reads.add(parts[1])
            junc_analysers.add(parts[2])
            bed_data[parts[2] + "-" + parts[0]] = bed12.loadbed(
                bed_file, False, False)
            print("Loaded: " + bed_file + "; # junctions: " +
                  str(len(bed_data[parts[2] + "-" + parts[0]])))

    print("Found these aligners: " + ', '.join(aligners))
    print("Found these reads: " + ', '.join(reads))
    print("Found these junction analysis tools: " + ', '.join(junc_analysers))

    # Build table
    rows = []  # not "tab": that would shadow the tab module imported above
    for a in aligners:
        for j in junc_analysers:
            p = Performance()
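            # Set algebra against the reference gives the confusion matrix:
            # shared junctions are true positives, junctions only in this
            # run are false positives, and missed reference junctions are
            # false negatives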
            p.tp = len(bed_data[j + "-" + a] & ref_bed)
            p.fp = len(bed_data[j + "-" + a] - ref_bed)
            p.fn = len(ref_bed - bed_data[j + "-" + a])
            rows.append(a + "\t" + j + "\t" + str(p))

    # Output table to disk
    with open(args.output + "-junc_analysis.tab", "w") as tab_out:
        print("Aligner\tFilter\t" + Performance.shortHeader(), file=tab_out)
        for p in rows:
            print(p, file=tab_out)
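

# The examples rely on a project-local Performance class that is not shown
# on this page.  A minimal sketch, assuming only what the calls above imply
# (tp/fp/fn counters, recall/precision/F1 and a tab-separated summary); the
# real portcullis class may differ.
class Performance:
    def __init__(self):
        self.tp = 0
        self.fp = 0
        self.fn = 0
        self.aligner = ""  # assumed plain attributes, set by some examples
        self.input = ""

    def recall(self):
        return self.tp / (self.tp + self.fn) if self.tp + self.fn else 0.0

    def precision(self):
        return self.tp / (self.tp + self.fp) if self.tp + self.fp else 0.0

    def F1(self):
        r, p = self.recall(), self.precision()
        return 2 * r * p / (r + p) if r + p else 0.0

    @staticmethod
    def shortHeader():
        return "TP\tFP\tFN\tRecall\tPrecision\tF1"

    def __str__(self):
        return "\t".join(str(x) for x in (self.tp, self.fp, self.fn,
                                          self.recall(), self.precision(),
                                          self.F1()))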


# Example 2

def main():
    parser = argparse.ArgumentParser(
        description="Script to compare BED files against a reference BED")
    parser.add_argument("input", nargs="+", help="The BED files to analyse")
    parser.add_argument("-r",
                        "--reference",
                        required=True,
                        help="The reference BED file to compare against")
    parser.add_argument("-o", "--output", help="The output venn plot")
    args = parser.parse_args()

    ref_bed = bed12.loadbed(args.reference, False, False)
    print("Loaded Reference BED file.  # junctions: ", len(ref_bed))

    # Load all bed files
    print("Results:")
    print("File\t#junc\t", Performance.shortHeader())

    recall = 0
    precision = 0
    f1 = 0

    for bf in args.input:
        bed_data = bed12.loadbed(bf, False, False)

        # Compute performance stats for this file
        p = Performance()
        p.tp = len(ref_bed & bed_data)
        p.fp = len(bed_data - ref_bed)
        p.fn = len(ref_bed - bed_data)

        print(bf, len(bed_data), p, sep="\t")

        recall += p.recall()
        precision += p.precision()
        f1 += p.F1()

    if len(args.input) > 1:
        print("Mean recall: ", recall / len(args.input))
        print("Mean precision: ", precision / len(args.input))
        print("Mean f1: ", f1 / len(args.input))

    if args.output is not None and len(args.input) == 1:
        # Create the Venn plot.  venn2 takes subset sizes in the order
        # (A-only, B-only, A-and-B), which here maps to (fn, fp, tp) with
        # A = reference and B = the single input file.
        fig = figure(1, figsize=(6, 6))
        venn2(subsets=(p.fn, p.fp, p.tp),
              set_labels=(args.reference, args.input[0]))
        fig.savefig(args.output)  # save before show so the file is not blank
        plt.show()
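
# Example 3
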
def main():
	parser = argparse.ArgumentParser(description="Script to produce a list labelling whether each entry in the portcullis tab input belongs to the BED reference or not")
	parser.add_argument("input", nargs="+", help="The tab files produced by portcullis")
	parser.add_argument("-r", "--reference", required=True, help="The reference BED file to compare against")
	parser.add_argument("-o", "--output", default="bedref_out.labels", help="Output prefix for output files")
	args = parser.parse_args()

	# For each entry in the input tab files, emit one label: 1 if the
	# junction is confirmed by the reference, 0 otherwise.

	# Load reference and add labels
	ref = bed12.loadbed(args.reference, False, False)
	print("# ref entries: " + str(len(ref)), file=sys.stderr)

	res = open(args.output, "w")

	nbentries = 0
	nb_pos = 0
	nb_neg = 0
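	# One label is printed per line, in input order, so the labels file can
	# be paired row-for-row with the concatenated tab entries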
	for tf in args.input:
		with open(tf) as f:

			# Skip header
			h = f.readline()

			for line in f:

				cleanline = line.strip()

				if cleanline:
					b = bed12.BedEntry.create_from_tabline(line, False, False)
					nbentries += 1
					if b in ref:
						print("1", file=res)
						nb_pos += 1

					else:
						print("0", file=res)
						nb_neg += 1


	res.close()

	print("Found", nbentries, "bed entries in", len(args.input), "input files")
	print("Detected", nb_pos, "positive and", nb_neg, "negative entries")


# Example 4

def main():
    parser = argparse.ArgumentParser(
        description="Script to explore portcullis junction metrics with PCA "
        "and logistic regression")
    parser.add_argument("input",
                        nargs="+",
                        help="The tab files produced by portcullis")
    parser.add_argument("-r",
                        "--reference",
                        required=True,
                        help="The reference BED file to compare against")
    args = parser.parse_args()

    # X should contain a matrix of features derived from the portcullis tab file
    # y should contain the labels (0 not a valid junction, 1 a valid junction).  Confirmed with the reference.

    # Load tab file and produce matrix
    tabdata = []
    bed = []
    for i in args.input:
        b, t = tab.loadtab(i)
        tabdata.extend(t)
        bed.extend(b)
        print("Loaded " + str(len(b)) + " entries from: " + i)
    print("# tab entries: " + str(len(tabdata)) + " from " +
          str(len(args.input)) + " input files")

    # Load reference and add labels
    ref = bed12.loadbed(args.reference, False, False)
    print("# ref entries: " + str(len(ref)))

    in_juncs = 0
    out_juncs = 0
    X = np.zeros((len(tabdata), tab.TabEntry.nbMetrics()))
    y = np.zeros((len(tabdata), 1))
    for i in range(0, len(tabdata)):
        b = bed[i]
        X[i] = tabdata[i].makeMatrixRow()
        if b in ref:
            in_juncs += 1
            y[i, 0] = 1
        else:
            out_juncs += 1
            y[i, 0] = 0

    y = y[:, 0]

    print("In:" + str(in_juncs))
    print("Out:" + str(out_juncs))

    print("Running PCA")

    # Standardise the features first so metrics on large scales do not
    # dominate the principal components
    pca = PCA(n_components=10)
    X_r = pca.fit_transform(scale(X))
    print(pca.explained_variance_ratio_)

    fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(27, 8))
    plt.tight_layout(pad=4.0, w_pad=4.0, h_pad=3.0)

    for c, i, target_name in zip("gr", [1, 0], ["genuine", "invalid"]):
        ax1.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name)
    ax1.legend()
    ax1.set_title("PCA")
    ax1.set_xlabel("PC1")
    ax1.set_ylabel("PC2")

    # Same PCA scatter as above, but clipped to the dense region near the origin
    for c, i, target_name in zip("gr", [1, 0], ["genuine", "invalid"]):
        ax2.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name)
    ax2.legend()
    ax2.set_title("Zoomed PCA")
    ax2.set_xlabel("PC1")
    ax2.set_ylabel("PC2")
    ax2.set_xlim(-5, 10)
    ax2.set_ylim(-2, 10)

    n = len(X_r)
    kf_5 = KFold(n_splits=5, shuffle=True, random_state=2)

    regr = LogisticRegression()
    acc = []

    # Baseline first: fitting on a constant feature gives an intercept-only
    # model, so the first point on the curve is the no-information F1 score
    score = cross_val_score(regr,
                            np.ones((n, 1)),
                            y.ravel(),
                            cv=kf_5,
                            scoring='f1').mean()
    acc.append(score)

    for i in np.arange(1, 11):
        score = cross_val_score(regr,
                                X_r[:, :i],
                                y.ravel(),
                                cv=kf_5,
                                scoring='f1').mean()
        acc.append(score)

    print(acc)

    ax3.plot([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], acc[0:11], '-v')
    ax3.set_title('Logistic regression using 5-fold CV')
    ax3.set_xlabel('Number of principal components in regression')
    ax3.set_ylabel('F1 score')

    ax4.plot([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], acc[1:11], '-v')
    ax4.set_title('Logistic regression excluding the null-model baseline')
    ax4.set_xlabel('Number of principal components in regression')
    ax4.set_ylabel('F1 score')
    ax4.set_xlim(0, 10)

    plt.show()


# Example 5

def main():
	parser = argparse.ArgumentParser(description="Script to build a filtered reference BED from the pipeline's BED files")
	parser.add_argument("input", help="The directory containing BED files from pipeline")
	parser.add_argument("-o", "--output", required=True, help="The output bed files prefix")
	parser.add_argument("-r", "--reference", required=True, help="The reference BED file to compare against")
	args = parser.parse_args()

	ref_bed = bed12.loadbed(args.reference, False, False)
	print("Loaded Reference BED file.  # junctions: " + str(len(ref_bed)))

	# Load all bed files
	bed_data = {}
	aligners = set()
	reads = set()
	junc_analysers = set()
	for bed_file in os.listdir(args.input):
		if bed_file.endswith("-real-all.bed"):
			bed_base = os.path.splitext(bed_file)[0]
			bed_data[bed_base] = bed12.loadbed(os.path.join(args.input, bed_file), False, False)
			parts = bed_base.split('-')
			aligners.add(parts[0])
			reads.add(parts[1])
			junc_analysers.add(parts[2])
			print("Loaded: " + bed_file + "; # junctions: " + str(len(bed_data[bed_base])))

	print("Found these aligners: " + ', '.join(aligners))
	print("Found these reads: " + ', '.join(reads))
	print("Found these junction analysis tools: " + ', '.join(junc_analysers))

	# Filtering reference
	new_ref = set()
	extra = set()
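	# Keep only reference junctions that at least one aligner actually
	# recovered; junctions found outside the reference are collected
	# separately for the rescue step below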
	for a in aligners:
		new_ref = new_ref.union(ref_bed.intersection(bed_data[a + "-real-all"]))
		extra = extra.union(bed_data[a + "-real-all"] - ref_bed)

	print("New reference contains " + str(len(new_ref)) + " junctions from original reference")

	sv_bed = set2list(new_ref, "simvar")

	# Output new bed file to disk
	with open(args.output + ".sim_var.bed", "w") as bed_sv_out:
		print("track name=\"junctions\"", file=bed_sv_out)
		for b in sv_bed:
			print(b, file=bed_sv_out)
	print("Saved: " + args.output + ".sim_var.bed")
	print()
	print("Found " + str(len(extra)) + " potential junctions outside reference")

	extra_special = set()
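	# Rescue a non-reference junction only if at least two aligners
	# independently reported it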
	for e in extra:
		found = 0
		for a in aligners:
			if e in bed_data[a + "-real-all"]:
				found += 1

		if found >= 2:
			extra_special.add(e)

	print("Found " + str(len(extra_special)) + " junctions outside reference but present in at least 2 alignments")

	new_ref = new_ref.union(extra_special)

	print("New reference contains " + str(len(new_ref)) + " junctions")

	new_bed = set2list(new_ref, "real")

	# Output new bed file to disk
	with open(args.output + ".real.bed", "w") as bed_out:
		print("track name=\"junctions\"", file=bed_out)
		for b in new_bed:
			print(b, file=bed_out)

	print("Saved: " + args.output + ".real.bed")
def main():
    parser = argparse.ArgumentParser(
        description="Script to produce a list labelling whether each entry in the portcullis tab input belongs to the BED reference or not"
    )
    parser.add_argument("input",
                        nargs="+",
                        help="The tab file produce by portcullis")
    parser.add_argument("-r",
                        "--reference",
                        required=True,
                        help="The reference BED file to compare against")
    parser.add_argument("-o",
                        "--output",
                        default="bedref_out.labels",
                        help="Output prefix for output files")
    parser.add_argument("-f",
                        "--filter",
                        action='store_true',
                        default=False,
                        help="Whether to filter tab file")
    args = parser.parse_args()

    # For each entry in the input tab files, emit one label: 1 if the
    # junction is confirmed by the reference, 0 otherwise.  Optionally the
    # tab lines themselves are split by that label.

    # Keep the header line from the first input file for the filtered outputs
    with open(args.input[0]) as f:
        header = f.readline()


    # Load reference and add labels
    ref = bed12.loadbed(args.reference, False, False)
    print("# ref entries: " + str(len(ref)), file=sys.stderr)

    res = open(args.output, "w")

    filtin = None
    filtout = None

    if args.filter:
        # Each filtered output reuses the original header so that it remains
        # a valid portcullis tab file
        filtin = open(args.output + ".in.tab", "w")
        filtout = open(args.output + ".out.tab", "w")
        filtin.write(header)
        filtout.write(header)

    nbentries = 0
    for tf in args.input:
        with open(tf) as f:

            # Skip header
            h = f.readline()

            for line in f:

                cleanline = line.strip()

                if cleanline:
                    b = bed12.BedEntry.create_from_tabline(line, False, False)
                    nbentries += 1
                    if b in ref:
                        print("1", file=res)
                        if args.filter:
                            print(line, file=filtin, end="")
                    else:
                        print("0", file=res)
                        if args.filter:
                            print(line, file=filtout, end="")

    res.close()

    if args.filter:
        filtin.close()
        filtout.close()

    print("Found ", nbentries, " tab entries in ", len(args.input),
          " input files")


# Example 7

def main():
    parser = argparse.ArgumentParser(
        description="Script to build a random forest decision tree")
    parser.add_argument("input",
                        nargs="+",
                        help="The tab files produced by portcullis")
    parser.add_argument("-r",
                        "--reference",
                        required=True,
                        help="The reference BED file to compare against")
    parser.add_argument("-t",
                        "--threads",
                        type=int,
                        default="1",
                        help="The number of threads to use")
    parser.add_argument("--test", help="Test the classifier against this file")
    parser.add_argument("-o",
                        "--output",
                        required=True,
                        help="The output prefix")
    args = parser.parse_args()

    # X should contain a matrix of features derived from the portcullis tab file
    # y should contain the labels (0 not a valid junction, 1 a valid junction).  Confirmed with the reference.

    # Load tab file and produce matrix
    bed = []
    tabs = []  # not "tab": that would shadow the tab module used below
    for i in args.input:
        b, t = tab.loadtab(i)
        bed.extend(b)
        tabs.extend(t)
        print("Loaded " + str(len(b)) + " entries from: " + i)
    print("# tab entries: " + str(len(tabs)) + " from " +
          str(len(args.input)) + " input files")

    # Load reference and add labels
    ref = bed12.loadbed(args.reference, False, False)
    print("# ref entries: " + str(len(ref)))

    in_juncs = 0
    out_juncs = 0
    X = np.zeros((len(tabs), tab.TabEntry.nbMetrics()))
    y = list()
    for i in range(0, len(bed)):
        b = bed[i]
        X[i] = tabs[i].makeMatrixRow()
        if b in ref:
            in_juncs += 1
            y.append(1)
        else:
            out_juncs += 1
            y.append(0)

    print("In:" + str(in_juncs))
    print("Out:" + str(out_juncs))

    # Load test data (assumes --test was supplied on the command line)
    test_b, test_t = tab.loadtab(args.test)
    test_X = np.zeros((len(test_t), tab.TabEntry.nbMetrics()))
    test_y = []
    for i in range(0, len(test_t)):
        b = test_b[i]
        test_X[i] = test_t[i].makeMatrixRow()
        if b in ref:
            test_y.append(1)
        else:
            test_y.append(0)

    print("Training Random Forest classifier")

    clf1 = RandomForestClassifier(n_estimators=40)
    scores = cross_val_score(clf1,
                             X,
                             y,
                             n_jobs=args.threads,
                             scoring="f1",
                             cv=10)
    print("Random Forest F1 score: " + str(scores.mean()) + " (+/- " +
          str(scores.std() * 2))
    clf1.fit(X, y)
    clf1_y_pred = clf1.predict(test_X)
    print(
        classification_report(test_y,
                              clf1_y_pred,
                              target_names=["Invalid", "Valid"]))

    # print("Training SVM (with RBF) classifier")
    # clf2 = SVC()
    # scores = cross_val_score(clf2, X, y, n_jobs=args.threads, scoring="f1")
    # print("SVM Mean score: " + str(scores.mean()))
    # clf2.fit(X, y)
    # clf2_y_pred = clf2.predict(test_X)
    # print(classification_report(test_y, clf2_y_pred, target_names=["0", "1"]))

    importances = clf1.feature_importances_
    std = np.std([tree.feature_importances_ for tree in clf1.estimators_],
                 axis=0)
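    # Rank features from most to least important (argsort is ascending,
    # so reverse it)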
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")

    for f in range(X.shape[1]):
        print("%d. feature %s (%f)" %
              (f + 1, tab.TabEntry.featureAt(indices[f]), importances[indices[f]]))

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(X.shape[1]),
            importances[indices],
            color="r",
            yerr=std[indices],
            align="center")
    plt.xticks(range(X.shape[1]), tab.TabEntry.sortedFeatures(indices))
    locs, labels = plt.xticks()
    plt.setp(labels, rotation=90)
    plt.xlim([-1, X.shape[1]])
    plt.tight_layout()

    plt.savefig(args.output + ".png")


# Example 8

def main():
    parser = argparse.ArgumentParser(
        description="Script to create Venn plots from BED files")
    parser.add_argument("input", nargs="+", help="The BED files to analyse")
    parser.add_argument("-r",
                        "--reference",
                        required=True,
                        help="The reference BED file to compare against")
    parser.add_argument("-o",
                        "--output",
                        required=True,
                        help="The output prefix")
    args = parser.parse_args()

    ref_bed = bed12.loadbed(args.reference, False, False)
    print("Loaded Reference BED file.  # junctions: " + str(len(ref_bed)))

    # Load all bed files
    bed_data = {}
    aligners = set()
    reads = set()
    # junc_analysers = set()
    for bed_path in args.input:
        bed_file = os.path.split(bed_path)[1]
        bed_base = os.path.splitext(bed_file)[0]
        bed_data[bed_base] = bed12.loadbed(bed_path, False, False)
        parts = bed_base.split('-')
        aligners.add(parts[0])
        reads.add(parts[1])
        # junc_analysers.add(parts[2])
        print("Loaded: " + bed_file + "; # junctions: " +
              str(len(bed_data[bed_base])))

    print("Found these aligners: " + ', '.join(aligners))
    print("Found these reads: " + ', '.join(reads))
    # print ("Found these junction analysis tools: " + ', '.join(junc_analysers))

    # Build table
    rows = list()  # not "tab", to avoid shadowing the tab module
    for a in aligners:
        for r in reads:
            p = Performance()
            p.aligner = a
            p.input = r
            p.tp = len(ref_bed & bed_data[a + "-" + r])
            p.fp = len(bed_data[a + "-" + r] - ref_bed)
            p.fn = len(ref_bed - bed_data[a + "-" + r])

            rows.append(r + "\t" + a + "\t" + str(p))

    # Output table to disk
    with open(args.output + "-align_reads.tab", "w") as tab_out:
        print("Dataset\tAligner\t" + Performance.shortHeader(), file=tab_out)
        for p in rows:
            print(p, file=tab_out)

    # Create Venns
    cols = rpy2.robjects.vectors.StrVector(
        ["lightblue", "purple", "green", "orange", "red"])

    r_session = rpy2.robjects.r  # start the embedded R session ("r" is reused as a loop variable below)
    base = importr("base")
    venn = importr("VennDiagram")
    grdevices = importr("grDevices")

    for r in reads:

        categories = list()
        categories.append("Reference")

        sets = list()
        sets.append(ref_bed)

        # draw.quintuple.venn needs the five set sizes as area1..area5 and
        # every partial intersection as n12, n13, ..., n12345; assemble those
        # keyword arguments here (this hardcodes four aligners plus the
        # reference)
        nums = dict()
        nums["area1"] = len(ref_bed)
        i = 2
        for a in sorted(aligners):
            s = bed_data[a + "-" + r]
            sets.append(s)
            categories.append(a)
            nums["area{0}".format(i)] = len(s)
            i += 1

        for num_combs in range(2, 6):
            for comb in itertools.combinations(range(1, 6), num_combs):
                index = "".join([str(x) for x in comb])
                curr_sets = [sets[num - 1] for num in comb]
                nums["n{0}".format(index)] = len(set.intersection(*curr_sets))

        grdevices.tiff(args.output + "-" + r + ".venn.tiff",
                       width=960,
                       height=960)
        venn.draw_quintuple_venn(
            height=5000,
            width=5000,
            # Categories were added in sorted order, so the fill colours
            # map to them alphabetically
            fill=cols,
            category=rpy2.robjects.vectors.StrVector(categories),
            margin=0.2,
            cat_dist=rpy2.robjects.vectors.FloatVector(
                [0.25, 0.3, 0.25, 0.25, 0.25]),
            cat_cex=3,
            cat_col=rpy2.robjects.vectors.StrVector(
                ["darkblue", "purple", "darkgreen", "darkorange", "darkred"]),
            cex=2,
            main="Comparison on junctions found by alignment tools",
            main_col="black",
            main_cex=8,
            sub="" + r + " dataset",
            sub_col="black",
            sub_cex=5,
            **nums)
        grdevices.dev_off()  # close this dataset's tiff device before the next


# Example 9

def main():
    parser = argparse.ArgumentParser(
        description="Script to train logistic regression and random forest "
        "classifiers on portcullis junctions")
    parser.add_argument("pos", help="BED file of positive (genuine) junctions")
    parser.add_argument("neg", help="BED file of negative (invalid) junctions")
    parser.add_argument("input", help="The tab file produced by portcullis")
    #parser.add_argument("input2", help="The tab file produce by portcullis")
    parser.add_argument("-t",
                        "--threads",
                        type=int,
                        default="1",
                        help="The number of threads to use")
    parser.add_argument("--test", help="Test the classifier against this file")
    parser.add_argument("-o",
                        "--output",
                        required=True,
                        help="The output prefix")
    args = parser.parse_args()

    # Build train/test feature vectors (here just the maxmmes metric) and
    # labels: 1 for a junction confirmed by the positive set, 0 for one in
    # the negative set

    pos = bed12.loadbed(args.pos, False, False)
    neg = bed12.loadbed(args.neg, False, False)

    data_X = []
    train_Y = []
    data = []
    test_Y = []
    #ref = open(args.input2)
    with open(args.input) as f:
        # Skip header
        f.readline()

        for line in f:
            parts = line.strip().split(sep="\t")
            if len(parts) > 1:
                test_Y.append(int(parts[8]))
                #test_Y.append(int(ref.readline()))
                raw = int(parts[14])
                #rel = float(parts[9])
                #rel2raw = float(parts[10])
                maxmmes = float(parts[11])

                #data.append(parts[9:-1])
                data.append(maxmmes)
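                # Build a minimal BedEntry keyed on coordinates and strand so
                # membership in the positive/negative junction sets can be
                # tested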
                b = bed12.BedEntry(False)
                b.chrom = parts[2]
                b.thick_start = int(parts[4])
                b.thick_end = int(parts[5]) + 1
                b.strand = parts[12]
                if b in pos:
                    #data_X.append(parts[9:-1])
                    data_X.append(maxmmes)
                    train_Y.append(1)
                if b in neg:
                    #data_X.append(parts[9:-1])
                    data_X.append(maxmmes)
                    train_Y.append(0)

    train_X = np.array(data_X, dtype=float).reshape(-1, 1)
    test_X = np.array(data, dtype=float).reshape(-1, 1)
    #train_X = np.array(data_X, dtype=float)
    #test_X = np.array(data, dtype=float)

    print("Logistic regression with L1 regularisation")
    #L1 regularized logistic regression with adjusted
    # weights (inversely proportional to class frequency)
    lr = LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
    logReg = lr.fit(train_X, train_Y)
    lr_pred = lr.predict(test_X)
    print(
        classification_report(test_Y,
                              lr_pred,
                              target_names=["Invalid", "Valid"],
                              digits=4))

    print("Training Random Forest classifier")

    rf = RandomForestClassifier(n_estimators=100)
    scores = cross_val_score(rf,
                             train_X,
                             train_Y,
                             n_jobs=args.threads,
                             scoring="f1",
                             cv=5)
    print("Random Forest F1 score: " + str(scores.mean()) + " (+/- " +
          str(scores.std() * 2) + ")")
    rf.fit(train_X, train_Y)
    clf1_y_pred = rf.predict(test_X)
    print(
        classification_report(test_Y,
                              clf1_y_pred,
                              target_names=["Invalid", "Valid"],
                              digits=4))