Exemple #1
0
    def create_file(self):
        # Expanding summits
        tfbs_summit_regions = GenomicRegionSet("TFBS Summit Regions")
        tfbs_summit_regions.read_bed(self.tfbs_summit_fname)

        for region in iter(tfbs_summit_regions):
            summit = int(region.data.split()[-1]) + region.initial
            region.initial = max(summit - (self.peak_ext / 2), 0)
            region.final = summit + (self.peak_ext / 2)

        # Calculating intersections
        mpbs_regions = GenomicRegionSet("MPBS Regions")
        mpbs_regions.read_bed(self.mpbs_fname)

        tfbs_summit_regions.sort()
        mpbs_regions.sort()

        with_overlap_regions = mpbs_regions.intersect(tfbs_summit_regions, mode=OverlapType.ORIGINAL)
        without_overlap_regions = mpbs_regions.subtract(tfbs_summit_regions, whole_region=True)
        tfbs_regions = GenomicRegionSet("TFBS Regions")

        for region in iter(with_overlap_regions):
            region.name = region.name.split(":")[0] + ":Y"
            tfbs_regions.add(region)

        for region in iter(without_overlap_regions):
            region.name = region.name.split(":")[0] + ":N"
            tfbs_regions.add(region)

        tfbs_regions.sort()

        tfbs_fname = os.path.join(self.output_location, "{}.bed".format(self.mpbs_name))
        tfbs_regions.write_bed(tfbs_fname)
Exemple #2
0
def fisher_table(motif_name, regions, mpbs, gene_set=False, mpbs_set=False):
    """
    TODO

    Keyword arguments:
    motif_name -- TODO
    regions -- TODO
    mpbs -- TODO
    gene_set -- TODO
    mpbs_set -- TODO

    Return:
    a -- TODO
    b -- TODO
    gene_set -- TODO
    mpbs_set -- TODO
    """

    # Fetching motif
    mpbs_motif = GenomicRegionSet(name="mpbs_motif")
    for region in mpbs.sequences:
        if motif_name in region.name:
            mpbs_motif.add(region)

    # Performing intersections
    if len(mpbs_motif) > 0:
        # regions which are overlapping with mpbs_motif
        intersect_original = regions.intersect(mpbs_motif, mode=OverlapType.ORIGINAL, rm_duplicates=True)
        # regions which are not overlapping with regions from mpbs_motif
        subtract_overlap = regions.subtract(mpbs_motif, whole_region=True)

        # Fetching genes
        if gene_set:
            gene_set_res = GeneSet(motif_name)
            for genomic_region in intersect_original.sequences:
                if genomic_region.name:
                    gene_list = [e if e[0] != "." else e[1:] for e in genomic_region.name.split(":")]
                    for g in gene_list:
                        gene_set_res.genes.append(g)
            gene_set_res.genes = list(set(gene_set_res.genes))  # Keep only unique genes
        else:
            gene_set_res = None

        # Fetching mpbs
        if mpbs_set:
            mpbs_set_res = mpbs_motif.intersect(regions, mode=OverlapType.ORIGINAL, rm_duplicates=True)
        else:
            mpbs_set_res = None

        return len(intersect_original), len(subtract_overlap), gene_set_res, mpbs_set_res

    else:
        gene_set_res = GeneSet(motif_name) if gene_set else None
        mpbs_set_res = GenomicRegionSet(mpbs_motif.name) if mpbs_set else None

        return 0, len(regions), gene_set_res, mpbs_set_res
Exemple #3
0
    def create_file(self):
        # Expanding summits
        tfbs_summit_regions = GenomicRegionSet("TFBS Summit Regions")
        tfbs_summit_regions.read_bed(self.tfbs_summit_fname)

        for region in iter(tfbs_summit_regions):
            summit = int(region.data.split()[-1]) + region.initial
            region.initial = max(summit - (self.peak_ext / 2), 0)
            region.final = summit + (self.peak_ext / 2)

        # Calculating intersections
        mpbs_regions = GenomicRegionSet("MPBS Regions")
        mpbs_regions.read_bed(self.mpbs_fname)

        tfbs_summit_regions.sort()
        mpbs_regions.sort()

        with_overlap_regions = mpbs_regions.intersect(
            tfbs_summit_regions, mode=OverlapType.ORIGINAL)
        without_overlap_regions = mpbs_regions.subtract(tfbs_summit_regions,
                                                        whole_region=True)
        tfbs_regions = GenomicRegionSet("TFBS Regions")

        for region in iter(with_overlap_regions):
            region.name = region.name.split(":")[0] + ":Y"
            tfbs_regions.add(region)

        for region in iter(without_overlap_regions):
            region.name = region.name.split(":")[0] + ":N"
            tfbs_regions.add(region)

        tfbs_regions.sort()

        tfbs_fname = os.path.join(self.output_location,
                                  "{}.bed".format(self.mpbs_name))
        tfbs_regions.write_bed(tfbs_fname)
Exemple #4
0
def chip_evaluate(args):
    # Evaluate Statistics
    fpr = dict()
    tpr = dict()
    roc_auc_1 = dict()
    roc_auc_10 = dict()
    roc_auc_50 = dict()
    roc_auc_100 = dict()
    recall = dict()
    precision = dict()
    prc_auc_1 = dict()
    prc_auc_10 = dict()
    prc_auc_50 = dict()
    prc_auc_100 = dict()

    footprint_file = args.footprint_file.split(",")
    footprint_name = args.footprint_name.split(",")
    footprint_type = args.footprint_type.split(",")

    max_score = 0
    if "SEG" in footprint_type:
        mpbs_regions = GenomicRegionSet("TFBS")
        mpbs_regions.read(args.tfbs_file)

        # Verifying the maximum score of the MPBS file
        for region in iter(mpbs_regions):
            score = int(region.data.split("\t")[0])
            if score > max_score:
                max_score = score
        max_score += 1

    for i in range(len(footprint_file)):
        footprints_regions = GenomicRegionSet("Footprints Prediction")
        footprints_regions.read(footprint_file[i])
        footprints_regions.sort()

        if footprint_type[i] == "SEG":
            # Increasing the score of MPBS entry once if any overlaps found in the predicted footprints.
            increased_score_mpbs_regions = GenomicRegionSet("Increased Regions")
            intersect_regions = mpbs_regions.intersect(footprints_regions, mode=OverlapType.ORIGINAL)
            for region in iter(intersect_regions):
                region.data = str(int(region.data.split("\t")[0]) + max_score)
                increased_score_mpbs_regions.add(region)

            # Keep the score of remained MPBS entry unchanged
            without_intersect_regions = mpbs_regions.subtract(footprints_regions, whole_region=True)
            for region in iter(without_intersect_regions):
                increased_score_mpbs_regions.add(region)

            increased_score_mpbs_regions.sort_score()

            fpr[i], tpr[i], roc_auc_1[i], roc_auc_10[i], roc_auc_50[i], roc_auc_100[i] = \
                roc_curve(increased_score_mpbs_regions)
            recall[i], precision[i], prc_auc_1[i], prc_auc_10[i], prc_auc_50[i], prc_auc_100[i] = \
                precision_recall_curve(increased_score_mpbs_regions)
        elif footprint_type[i] == "SC":
            footprints_regions.sort_score()
            fpr[i], tpr[i], roc_auc_1[i], roc_auc_10[i], roc_auc_50[i], roc_auc_100[i] = \
                roc_curve(footprints_regions)
            recall[i], precision[i], prc_auc_1[i], prc_auc_10[i], prc_auc_50[i], prc_auc_100[i] = \
                precision_recall_curve(footprints_regions)

    # Output the statistics results into text
    stats_fname = os.path.join(args.output_location, "{}_stats.txt".format(args.output_prefix))
    stats_header = ["METHOD", "AUC_100", "AUC_50", "AUC_10", "AUC_1", "AUPR_100", "AUPR_50", "AUPR_10", "AUPR_1"]
    with open(stats_fname, "w") as stats_file:
        stats_file.write("\t".join(stats_header) + "\n")
        for i in range(len(footprint_name)):
            stats_file.write(footprint_name[i] + "\t" +
                             str(roc_auc_100[i]) + "\t" + str(roc_auc_50[i]) + "\t" + str(roc_auc_10[i]) + "\t" +
                             str(roc_auc_1[i]) + "\t" + str(prc_auc_100[i]) + "\t" + str(prc_auc_50[i]) + "\t" +
                             str(prc_auc_10[i]) + "\t" + str(prc_auc_1[i]) + "\n")

    # Output the curves
    if args.print_roc_curve:
        label_x = "False Positive Rate"
        label_y = "True Positive Rate"
        curve_name = "ROC"
        plot_curve(footprint_name, args.output_location, fpr, tpr, roc_auc_100, label_x, label_y, args.output_prefix,
                   curve_name)
    if args.print_pr_curve:
        label_x = "Recall"
        label_y = "Precision"
        curve_name = "PRC"
        plot_curve(footprint_name, args.output_location, recall, precision, prc_auc_100, label_x, label_y,
                   args.output_prefix, curve_name)

    output_points(footprint_name, args.output_location, args.output_prefix, fpr, tpr, recall, precision)
Exemple #5
0
    def chip_evaluate(self):
        """
        This evaluation methodology uses motif-predicted binding sites (MPBSs) together with TF ChIP-seq data
        to evaluate the footprint predictions.

        return:
        """

        # Evaluate Statistics
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        roc_auc_1 = dict()
        roc_auc_2 = dict()
        recall = dict()
        precision = dict()
        prc_auc = dict()

        if "SEG" in self.footprint_type:
            mpbs_regions = GenomicRegionSet("TFBS")
            mpbs_regions.read_bed(self.tfbs_file)
            mpbs_regions.sort()

            # Verifying the maximum score of the MPBS file
            max_score = -99999999
            for region in iter(mpbs_regions):
                score = int(region.data)
                if score > max_score:
                    max_score = score
            max_score += 1

        for i in range(len(self.footprint_file)):
            footprints_regions = GenomicRegionSet("Footprints Prediction")
            footprints_regions.read_bed(self.footprint_file[i])

            # Sort footprint prediction bed files
            footprints_regions.sort()

            if self.footprint_type[i] == "SEG":
                # Increasing the score of MPBS entry once if any overlaps found in the predicted footprints.
                increased_score_mpbs_regions = GenomicRegionSet("Increased Regions")
                intersect_regions = mpbs_regions.intersect(footprints_regions, mode=OverlapType.ORIGINAL)
                for region in iter(intersect_regions):
                    region.data = str(int(region.data) + max_score)
                    increased_score_mpbs_regions.add(region)


                # Keep the score of remained MPBS entry unchanged
                without_intersect_regions = mpbs_regions.subtract(footprints_regions, whole_region=True)
                for region in iter(without_intersect_regions):
                    increased_score_mpbs_regions.add(region)

                increased_score_mpbs_regions.sort_score()

                fpr[i], tpr[i], roc_auc[i], roc_auc_1[i], roc_auc_2[i] = self.roc_curve(increased_score_mpbs_regions)
                recall[i], precision[i], prc_auc[i] = self.precision_recall_curve(increased_score_mpbs_regions)
            elif self.footprint_type[i] == "SC":
                footprints_regions.sort_score()
                fpr[i], tpr[i], roc_auc[i], roc_auc_1[i], roc_auc_2[i] = self.roc_curve(footprints_regions)
                recall[i], precision[i], prc_auc[i] = self.precision_recall_curve(footprints_regions)

        # Output the statistics results into text
        stats_fname = self.output_location + self.tf_name + "_stats.txt"
        stats_header = ["METHOD", "AUC_100", "AUC_10", "AUC_1", "AUPR"]
        with open(stats_fname, "w") as stats_file:
            stats_file.write("\t".join(stats_header) + "\n")
            for i in range(len(self.footprint_name)):
                stats_file.write(self.footprint_name[i] + "\t" + str(roc_auc[i]) + "\t" + str(roc_auc_1[i]) + "\t"
                                 + str(roc_auc_2[i]) + "\t" + str(prc_auc[i]) + "\n")

        # Output the curves
        if self.print_roc_curve:
            label_x = "False Positive Rate"
            label_y = "True Positive Rate"
            curve_name = "ROC"
            self.plot_curve(fpr, tpr, roc_auc, label_x, label_y, self.tf_name, curve_name)
        if self.print_pr_curve:
            label_x = "Recall"
            label_y = "Precision"
            curve_name = "PRC"
            self.plot_curve(recall, precision, prc_auc, label_x, label_y, self.tf_name, curve_name)

        self.output_points(self.tf_name, fpr, tpr, recall, precision)
Exemple #6
0
def chip_evaluate(args):
    # Evaluate Statistics
    fpr = dict()
    tpr = dict()
    roc_auc_1 = dict()
    roc_auc_10 = dict()
    roc_auc_50 = dict()
    roc_auc_100 = dict()
    recall = dict()
    precision = dict()
    prc_auc_1 = dict()
    prc_auc_10 = dict()
    prc_auc_50 = dict()
    prc_auc_100 = dict()

    footprint_file = args.footprint_file.split(",")
    footprint_name = args.footprint_name.split(",")
    footprint_type = args.footprint_type.split(",")

    max_score = 0
    if "SEG" in footprint_type:
        mpbs_regions = GenomicRegionSet("TFBS")
        mpbs_regions.read(args.tfbs_file)

        # Verifying the maximum score of the MPBS file
        for region in iter(mpbs_regions):
            score = int(region.data.split("\t")[0])
            if score > max_score:
                max_score = score
        max_score += 1

    max_points = []
    for i in range(len(footprint_file)):
        footprints_regions = GenomicRegionSet("Footprints Prediction")
        footprints_regions.read(footprint_file[i])
        footprints_regions.sort()

        if footprint_type[i] == "SEG":
            # Increasing the score of MPBS entry once if any overlaps found in the predicted footprints.
            increased_score_mpbs_regions = GenomicRegionSet("Increased Regions")
            intersect_regions = mpbs_regions.intersect(footprints_regions, mode=OverlapType.ORIGINAL)
            for region in iter(intersect_regions):
                region.data = str(int(region.data.split("\t")[0]) + max_score)
                increased_score_mpbs_regions.add(region)

            # Keep the score of remained MPBS entry unchanged
            without_intersect_regions = mpbs_regions.subtract(footprints_regions, whole_region=True)
            for region in iter(without_intersect_regions):
                increased_score_mpbs_regions.add(region)

            increased_score_mpbs_regions.sort_score()

            fpr[i], tpr[i], roc_auc_1[i], roc_auc_10[i], roc_auc_50[i], roc_auc_100[i] = \
                roc_curve(increased_score_mpbs_regions)
            recall[i], precision[i], prc_auc_1[i], prc_auc_10[i], prc_auc_50[i], prc_auc_100[i] = \
                precision_recall_curve(increased_score_mpbs_regions)

            max_points.append(len(intersect_regions))

        elif footprint_type[i] == "SC":
            footprints_regions.sort_score()
            fpr[i], tpr[i], roc_auc_1[i], roc_auc_10[i], roc_auc_50[i], roc_auc_100[i] = \
                roc_curve(footprints_regions)
            recall[i], precision[i], prc_auc_1[i], prc_auc_10[i], prc_auc_50[i], prc_auc_100[i] = \
                precision_recall_curve(footprints_regions)

            max_points.append(len(footprints_regions))

    # Output the statistics results into text
    stats_fname = os.path.join(args.output_location, "{}_stats.txt".format(args.output_prefix))
    stats_header = ["METHOD", "AUC_100", "AUC_50", "AUC_10", "AUC_1", "AUPR_100", "AUPR_50", "AUPR_10", "AUPR_1"]
    with open(stats_fname, "w") as stats_file:
        stats_file.write("\t".join(stats_header) + "\n")
        for i in range(len(footprint_name)):
            stats_file.write(footprint_name[i] + "\t" +
                             str(roc_auc_100[i]) + "\t" + str(roc_auc_50[i]) + "\t" + str(roc_auc_10[i]) + "\t" +
                             str(roc_auc_1[i]) + "\t" + str(prc_auc_100[i]) + "\t" + str(prc_auc_50[i]) + "\t" +
                             str(prc_auc_10[i]) + "\t" + str(prc_auc_1[i]) + "\n")

    # Output the curves
    if args.print_roc_curve:
        label_x = "False Positive Rate"
        label_y = "True Positive Rate"
        curve_name = "ROC"
        plot_curve(footprint_name, args.output_location, fpr, tpr, roc_auc_100, label_x, label_y, args.output_prefix,
                   curve_name, max_points=max_points)
    if args.print_pr_curve:
        label_x = "Recall"
        label_y = "Precision"
        curve_name = "PRC"
        plot_curve(footprint_name, args.output_location, recall, precision, prc_auc_100, label_x, label_y,
                   args.output_prefix, curve_name, max_points=max_points)

    output_points(footprint_name, args.output_location, args.output_prefix, fpr, tpr, recall, precision)
Exemple #7
0
    def chip_evaluate(self):
        """
        This evaluation methodology uses motif-predicted binding sites (MPBSs) together with TF ChIP-seq data
        to evaluate the footprint predictions.

        return:
        """

        # Evaluate Statistics
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        roc_auc_1 = dict()
        roc_auc_2 = dict()
        recall = dict()
        precision = dict()
        prc_auc = dict()

        if "SEG" in self.footprint_type:
            mpbs_regions = GenomicRegionSet("TFBS")
            mpbs_regions.read_bed(self.tfbs_file)
            mpbs_regions.sort()

            # Verifying the maximum score of the MPBS file
            max_score = -99999999
            for region in iter(mpbs_regions):
                score = int(region.data)
                if score > max_score:
                    max_score = score
            max_score += 1

        for i in range(len(self.footprint_file)):
            footprints_regions = GenomicRegionSet("Footprints Prediction")
            footprints_regions.read_bed(self.footprint_file[i])

            # Sort footprint prediction bed files
            footprints_regions.sort()

            if self.footprint_type[i] == "SEG":
                # Increasing the score of MPBS entry once if any overlaps found in the predicted footprints.
                increased_score_mpbs_regions = GenomicRegionSet(
                    "Increased Regions")
                intersect_regions = mpbs_regions.intersect(
                    footprints_regions, mode=OverlapType.ORIGINAL)
                for region in iter(intersect_regions):
                    region.data = str(int(region.data) + max_score)
                    increased_score_mpbs_regions.add(region)

                # Keep the score of remained MPBS entry unchanged
                without_intersect_regions = mpbs_regions.subtract(
                    footprints_regions, whole_region=True)
                for region in iter(without_intersect_regions):
                    increased_score_mpbs_regions.add(region)

                increased_score_mpbs_regions.sort_score()

                fpr[i], tpr[i], roc_auc[i], roc_auc_1[i], roc_auc_2[
                    i] = self.roc_curve(increased_score_mpbs_regions)
                recall[i], precision[i], prc_auc[
                    i] = self.precision_recall_curve(
                        increased_score_mpbs_regions)
            elif self.footprint_type[i] == "SC":
                footprints_regions.sort_score()
                fpr[i], tpr[i], roc_auc[i], roc_auc_1[i], roc_auc_2[
                    i] = self.roc_curve(footprints_regions)
                recall[i], precision[i], prc_auc[
                    i] = self.precision_recall_curve(footprints_regions)

        # Output the statistics results into text
        stats_fname = self.output_location + self.tf_name + "_stats.txt"
        stats_header = ["METHOD", "AUC_100", "AUC_10", "AUC_1", "AUPR"]
        with open(stats_fname, "w") as stats_file:
            stats_file.write("\t".join(stats_header) + "\n")
            for i in range(len(self.footprint_name)):
                stats_file.write(self.footprint_name[i] + "\t" +
                                 str(roc_auc[i]) + "\t" + str(roc_auc_1[i]) +
                                 "\t" + str(roc_auc_2[i]) + "\t" +
                                 str(prc_auc[i]) + "\n")

        # Output the curves
        if self.print_roc_curve:
            label_x = "False Positive Rate"
            label_y = "True Positive Rate"
            curve_name = "ROC"
            self.plot_curve(fpr, tpr, roc_auc, label_x, label_y, self.tf_name,
                            curve_name)
        if self.print_pr_curve:
            label_x = "Recall"
            label_y = "Precision"
            curve_name = "PRC"
            self.plot_curve(recall, precision, prc_auc, label_x, label_y,
                            self.tf_name, curve_name)

        self.output_points(self.tf_name, fpr, tpr, recall, precision)
Exemple #8
0
                                                  line[6], line[2], str(min(donor)), str(max(acceptor)), 
                                                  "255,0,0", "2", 
                                                  ",".join([ str(abs(donor[1] - donor[0])),
                                                             str(abs(acceptor[1] - acceptor[0])) ]),
                                                  "0,"+str(abs(min(donor)-min(acceptor))) ]), file=g)
                        else:
                            pass
                            #print(line)
                            #sys.exit()

        print("tcons:\t" + args.t)
        tcons = GenomicRegionSet("tcons")
        tcons.read_bed(args.t)
        circrna = GenomicRegionSet("circRNA")
        circrna.read_bed(args.o)
        circ_inTCON = circrna.intersect(y=tcons, mode = OverlapType.COMP_INCL)
        circ_TCONs = tcons.intersect(y=circ_inTCON, mode = OverlapType.ORIGINAL)
        #print(len(circ_TCONs))
        circ_TCONs.write_bed(args.c)


#  0        1       2     3         4       5   6   7   8
# chr1  39449029    +   chr1    39448068    +   0   0   0
#                     9                          10          11      12            13   
# 97ZZTR1:411:C4VC3ACXX:5:1102:16097:34171    39448994    35M15S  39448069    35S15M868p50M

    ############### FASTA slicing #######################################
    elif args.mode == "sliceFASTA":
        print(os.path.basename(args.i) + " -start "+str(args.p)+" -end "+str(args.p+args.l))
        from rgt.SequenceSet import SequenceSet
        seq = SequenceSet(name=args.i, seq_type="RNA")
Exemple #9
0
def fisher_table(motif_name, regions, mpbs, gene_set=False, mpbs_set=False):
    """
    TODO

    Keyword arguments:
    motif_name -- TODO
    regions -- TODO
    mpbs -- TODO
    gene_set -- TODO
    mpbs_set -- TODO

    Return:
    a -- TODO
    b -- TODO
    gene_set -- TODO
    mpbs_set -- TODO
    """

    # Fetching motif
    mpbs_motif = GenomicRegionSet(name="mpbs_motif")
    for region in mpbs.sequences:
        if motif_name in region.name:
            mpbs_motif.add(region)

    # Performing intersections
    if len(mpbs_motif) > 0:
        # regions which are overlapping with mpbs_motif
        intersect_original = regions.intersect(mpbs_motif,
                                               mode=OverlapType.ORIGINAL,
                                               rm_duplicates=True)
        # regions which are not overlapping with regions from mpbs_motif
        subtract_overlap = regions.subtract(mpbs_motif, whole_region=True)

        # Fetching genes
        if gene_set:
            gene_set_res = GeneSet(motif_name)
            for genomic_region in intersect_original.sequences:
                if genomic_region.name:
                    gene_list = [
                        e if e[0] != "." else e[1:]
                        for e in genomic_region.name.split(":")
                    ]
                    for g in gene_list:
                        gene_set_res.genes.append(g)
            gene_set_res.genes = list(set(
                gene_set_res.genes))  # Keep only unique genes
        else:
            gene_set_res = None

        # Fetching mpbs
        if mpbs_set:
            mpbs_set_res = mpbs_motif.intersect(regions,
                                                mode=OverlapType.ORIGINAL,
                                                rm_duplicates=True)
        else:
            mpbs_set_res = None

        return len(intersect_original), len(
            subtract_overlap), gene_set_res, mpbs_set_res

    else:
        gene_set_res = GeneSet(motif_name) if gene_set else None
        mpbs_set_res = GenomicRegionSet(mpbs_motif.name) if mpbs_set else None

        return 0, len(regions), gene_set_res, mpbs_set_res