Exemple #1
0
def tss_dist(inputfile=None, outputfile=None):
    """
    Write the pairwise distances between the TSSs of each gene's transcripts.

    A header line is written first. Then, for every gene (sorted by
    gene_id) and every unordered pair of its transcripts (sorted by
    transcript_id), one tab-separated line is written with: gene_id, the
    two transcript ids, the absolute distance between their TSS coordinates
    and their two TSS numbers. Within a line, the transcript with the
    strictly lowest TSS number comes first.

    :param inputfile: The GTF input, handed to GTF() with Ensembl format checking.
    :param outputfile: A writable file object receiving the table.
    """

    gtf = GTF(inputfile, check_ensembl_format=True)

    # gene_id -> transcript_id -> TSS coordinate.
    gn_tss_dist = defaultdict(dict)

    message("Getting TSSs.")
    tss = gtf.get_tss(name=["transcript_id", "gene_id"], as_dict=True)

    # The keys are split on "|" into (transcript_id, gene_id), following
    # the order of the 'name' argument above.
    for k in tss:
        tx_id, gn_id = k.split("|")
        gn_tss_dist[gn_id][tx_id] = int(tss[k])

    # gene_id -> transcript_id -> value reported in the tss_num_* columns.
    gn_to_tx_to_tss = gtf.get_gn_to_tx(as_dict_of_dict=True)

    message("Computing distances.")

    outputfile.write("\t".join([
        "gene_id", "transcript_id_1", "transcript_id_2", "dist", "tss_num_1",
        "tss_num_2"
    ]) + "\n")

    try:
        for gn_id in sorted(gn_tss_dist):

            tx_to_coord = gn_tss_dist[gn_id]
            tx_list = sorted(tx_to_coord)

            # Each unordered pair of transcripts is visited exactly once.
            for i, tx_a in enumerate(tx_list):

                for tx_b in tx_list[i + 1:]:
                    dist = str(abs(tx_to_coord[tx_a] - tx_to_coord[tx_b]))
                    num_a = gn_to_tx_to_tss[gn_id][tx_a]
                    num_b = gn_to_tx_to_tss[gn_id][tx_b]

                    # The transcript with the lowest TSS number is
                    # reported first (ties put the second one first).
                    if num_a < num_b:
                        fields = [gn_id, tx_a, tx_b, dist,
                                  str(num_a), str(num_b)]
                    else:
                        fields = [gn_id, tx_b, tx_a, dist,
                                  str(num_b), str(num_a)]

                    outputfile.write("\t".join(fields) + "\n")

    except (BrokenPipeError, IOError):

        # The consumer closed the pipe (e.g. "| head"). Silence stdout so
        # that nothing else attempts to write to it.
        def _void_f(*args, **kwargs):
            pass

        message("Received a broken pipe signal", type="WARNING")
        sys.stdout.write = _void_f
        sys.stdout.flush = _void_f

    close_properly(outputfile, inputfile)
def tss_numbering(inputfile=None,
                  outputfile=None,
                  compute_dist=False,
                  key_name='tss_number',
                  key_name_dist='dist_to_first_tss',
                  add_nb_tss_to_gene=False,
                  gene_key='nb_tss'):
    """
    Add the TSS number of each transcript (within its gene) to a GTF.

    Every transcript feature receives a new attribute holding its TSS
    number (1 for the most 5' TSS of the gene, then 2...; transcripts
    sharing the same TSS get the same number). Optionally, genes receive
    the highest TSS number found among their transcripts, and transcripts
    receive their distance to the first TSS of the gene. The resulting GTF
    is written to outputfile.

    :param inputfile: The GTF input, handed to GTF() with Ensembl format checking.
    :param outputfile: The file object the annotated GTF is written to.
    :param compute_dist: If True, also add the distance to the first TSS of the gene.
    :param key_name: Name of the transcript attribute holding the TSS number.
    :param key_name_dist: Name of the transcript attribute holding the distance (needs compute_dist).
    :param add_nb_tss_to_gene: If True, add the highest TSS number of each gene to gene features.
    :param gene_key: Name of the gene attribute used when add_nb_tss_to_gene is True.
    """

    gtf = GTF(inputfile, check_ensembl_format=True)

    message("Getting TSSs.")
    # transcript_id -> TSS coordinate (only needed when compute_dist is set).
    tss = gtf.get_tss(name=["transcript_id"], as_dict=True)

    # if_dict_of_dict is true, get_gn_to_tx() returns a dict of dict
    # that maps gene_id to transcript_id and transcript_id to TSS
    # numbering (1 for most 5', then 2...). For transcripts having
    # the same TSSs, the tss number will be the same.
    gn_to_tx_to_tss = gtf.get_gn_to_tx(as_dict_of_dict=True)

    message("Numbering TSSs.")

    tss_number_file = make_tmp_file(prefix='tx_to_tss_number', suffix='.txt')

    # gene_id -> highest TSS number of the gene (kept as a string, as
    # written to the temporary file).
    gn_how_many_tss = dict()

    for gn_id in gn_to_tx_to_tss:
        for tx_id in gn_to_tx_to_tss[gn_id]:
            tss_num = str(gn_to_tx_to_tss[gn_id][tx_id])
            tss_number_file.write(tx_id + "\t" + tss_num + "\n")
            highest = gn_how_many_tss.get(gn_id)
            if highest is None or int(tss_num) > int(highest):
                gn_how_many_tss[gn_id] = tss_num

    tss_number_file.close()

    # NOTE(review): the 'with' blocks below assume add_attr_from_file() has
    # finished reading its input when it returns -- confirm.
    with open(tss_number_file.name) as tss_number_handle:
        gtf = gtf.add_attr_from_file(feat='transcript',
                                     key='transcript_id',
                                     new_key=key_name,
                                     inputfile=tss_number_handle,
                                     has_header=False)

    if add_nb_tss_to_gene:

        gn_how_many_tss_file = make_tmp_file(prefix='gn_how_many_tss',
                                             suffix='.txt')

        for a_key, a_val in gn_how_many_tss.items():
            gn_how_many_tss_file.write(a_key + "\t" + a_val + "\n")

        gn_how_many_tss_file.close()

        with open(gn_how_many_tss_file.name) as gn_how_many_tss_handle:
            gtf = gtf.add_attr_from_file(feat='gene',
                                         key='gene_id',
                                         new_key=gene_key,
                                         inputfile=gn_how_many_tss_handle,
                                         has_header=False)

    if compute_dist:
        gn_to_tx_ordered_by_tss = gtf.get_gn_to_tx(ordered_5p=True)
        tss_dist_file = make_tmp_file(prefix='tx_tss_dist_to_first_tss',
                                      suffix='.txt')

        for gn_id in gn_to_tx_to_tss:
            tx_list = gn_to_tx_ordered_by_tss[gn_id]
            # Index rather than pop() so the list owned by the GTF API is
            # left untouched.
            tx_first = tx_list[0]
            # The first tss has distance 0 to the
            # first tss...
            tss_dist_file.write(tx_first + "\t0\n")
            for tx_id in tx_list[1:]:
                dist_to_first = abs(int(tss[tx_first]) - int(tss[tx_id]))
                tss_dist_file.write(tx_id + "\t" + str(dist_to_first) + "\n")

        tss_dist_file.close()

        with open(tss_dist_file.name) as tss_dist_handle:
            gtf = gtf.add_attr_from_file(feat='transcript',
                                         key='transcript_id',
                                         new_key=key_name_dist,
                                         inputfile=tss_dist_handle,
                                         has_header=False)

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemple #3
0
def coverage(
        inputfile=None,
        outputfile=None,
        bw_list=None,
        labels=None,
        pseudo_count=1,
        nb_window=1,
        ft_type="promoter",
        n_highest=None,
        downstream=1000,
        key_name="cov",
        zero_to_na=False,
        name_column=None,
        upstream=1000,
        chrom_info=None,
        nb_proc=1,
        matrix_out=False,
        stat='mean'):
    """
    Compute the coverage of genomic regions with one or several bigWig.

    The regions come either from a BED file (used as is) or from a GTF, in
    which case they are derived from ft_type. Regions are extended by
    upstream/downstream (strand-aware), sorted, and passed to bw_cov_mp()
    together with the bigWig files. The result is written either as
    returned by bw_cov_mp() or, with matrix_out, as a table with one
    coverage column per bigWig.

    :param inputfile: A file object pointing to a GTF or BED file ('<stdin>' is read as GTF).
    :param outputfile: A writable file object receiving the result.
    :param bw_list: A list of file objects; only their 'name' (path to a bigWig) is used.
    :param labels: Comma-separated labels, one per bigWig. Defaults to the bigWig base names without extension.
    :param pseudo_count: Forwarded to bw_cov_mp().
    :param nb_window: The number of bins per region.
    :param ft_type: The feature type to extract from a GTF ('promoter'/'tss', 'tts'/'terminator', 'intergenic', 'intron', 'intron_by_tx' or any value of the feature column).
    :param n_highest: The number of windows used for computing the score (defaults to nb_window, cannot exceed it).
    :param downstream: Extension of the regions in 3' (slop 'r' with s=True).
    :param key_name: Not used in this function.
    :param zero_to_na: Forwarded to bw_cov_mp().
    :param name_column: Comma-separated keys used to name the regions. Must be a string (it is split on ',').
    :param upstream: Extension of the regions in 5' (slop 'l' with s=True).
    :param chrom_info: A file object whose 'name' is given to bedtools as genome file.
    :param nb_proc: Forwarded to bw_cov_mp().
    :param matrix_out: If True, write a matrix with one column per bigWig.
    :param stat: Forwarded to bw_cov_mp().
    """

    # -------------------------------------------------------------------------
    # Create a list of labels.
    # Take user input in account
    # -------------------------------------------------------------------------

    bw_list = [x.name for x in bw_list]

    if len(bw_list) != len(set(bw_list)):
        message("Found the same bigwigs several times.",
                type="ERROR")

    message('Checking labels.')

    if labels is not None:
        labels = labels.split(",")
        # Ensure the number of labels is the same as the number of bw files.
        if len(labels) != len(bw_list):
            message("The number of labels should be the same as the number of"
                    " bigwig files.", type="ERROR")
        # Ensure labels are non-redundant
        if len(labels) > len(set(labels)):
            message("Labels must be unique.", type="ERROR")
    else:
        # Default labels: bigWig file names without directory nor extension.
        labels = [os.path.splitext(os.path.basename(bw))[0]
                  for bw in bw_list]

    # -------------------------------------------------------------------------
    # Check the number of windows
    #
    # -------------------------------------------------------------------------

    if n_highest is None:
        n_highest = nb_window

    message('Number of bins: %d' % nb_window)
    message('N highest values: %d' % n_highest)

    if n_highest > nb_window:
        message('The number of window used for computing the score'
                ' (-n) can not be greater than the number of'
                ' windows (-w)', type="ERROR")
        sys.exit()

    # -------------------------------------------------------------------------
    # Check input file is in bed or GTF format
    #
    # -------------------------------------------------------------------------

    message("Loading input file...")
    if inputfile.name == '<stdin>':
        gtf = GTF(inputfile.name)
        is_gtf = True
    else:
        region_bo = BedTool(inputfile.name)
        if len(region_bo) == 0:
            message("Unable to find requested regions",
                    type="ERROR")

        if region_bo.file_type == 'gff':
            gtf = GTF(inputfile.name)
            is_gtf = True
        else:
            is_gtf = False

    # -------------------------------------------------------------------------
    # Get regions of interest
    #
    # -------------------------------------------------------------------------

    name_column = name_column.split(",")

    if is_gtf:

        message("Getting regions of interest...")

        if ft_type.lower() == "intergenic":

            region_bo = gtf.get_intergenic(chrom_info, 0, 0).slop(s=True,
                                                                  l=upstream,
                                                                  r=downstream,
                                                                  g=chrom_info.name).sort()

        elif ft_type.lower() == "intron":

            region_bo = gtf.get_introns().slop(s=True,
                                               l=upstream,
                                               r=downstream,
                                               g=chrom_info.name).sort()

        elif ft_type == "intron_by_tx":

            region_bo = gtf.get_introns(by_transcript=True,
                                        name=name_column,
                                        ).slop(s=True,
                                               l=upstream,
                                               r=downstream,
                                               g=chrom_info.name).sort()

        elif ft_type.lower() in ["promoter", "tss"]:

            region_bo = gtf.get_tss(name=name_column, ).slop(s=True,
                                                             l=upstream,
                                                             r=downstream,
                                                             g=chrom_info.name).sort()

        elif ft_type.lower() in ["tts", "terminator"]:

            region_bo = gtf.get_tts(name=name_column).slop(s=True,
                                                           l=upstream,
                                                           r=downstream,
                                                           g=chrom_info.name).sort()

        else:

            # Any other value is searched in the 'feature' column of the GTF.
            region_bo = gtf.select_by_key(
                "feature",
                ft_type, 0
            ).to_bed(name=name_column).slop(s=True,
                                            l=upstream,
                                            r=downstream,
                                            g=chrom_info.name).sort()

        if len(region_bo) == 0:
            message("Unable to find requested regions",
                    type="ERROR")

    else:
        region_bo = region_bo.slop(s=True,
                                   l=upstream,
                                   r=downstream,
                                   g=chrom_info.name).sort()

    region_bed = make_tmp_file(prefix="region", suffix=".bed")

    region_bo.saveas(region_bed.name)

    # -------------------------------------------------------------------------
    # Compute coverage
    #
    # -------------------------------------------------------------------------

    result_bed = bw_cov_mp(bw_list=bw_list,
                           region_file=open(region_bed.name),
                           labels=labels,
                           bin_nb=nb_window,
                           pseudo_count=pseudo_count,
                           zero_to_na=zero_to_na,
                           nb_proc=nb_proc,
                           n_highest=n_highest,
                           stat=stat,
                           verbose=pygtftk.utils.VERBOSITY)

    if matrix_out:
        result_bed.close()

        df_first = pd.read_csv(result_bed.name, sep="\t", header=None)

        # Move the coverage value (column 4) after the strand (column 5).
        # With header=None the column labels are the integers 0..5, so
        # label-based .loc is the replacement for DataFrame.ix, which was
        # removed in pandas 1.0.
        df_first = df_first.loc[:, [0, 1, 2, 3, 5, 4]]

        df_list = []

        for label in labels:
            # create a sub data frame containing the coverage values of the
            # current bwig. The name column has the form "<label>|<name>".
            # A literal prefix test is used (not a regex) so that labels
            # containing regex metacharacters (e.g. '.') are handled.
            prefix = label + "|"
            tmp_df = df_first[df_first[3].str.startswith(prefix)].copy()
            tmp_df[3] = tmp_df[3].str[len(prefix):]

            df_list += [tmp_df]

        df_final = df_list.pop(0)

        for i in df_list:
            # Add columns to df final by joining on
            # chrom, start, end, transcript_id, strand
            df_final = df_final.merge(i.iloc[:,
                                      list(range(6))], on=[0, 1,
                                                           2, 3, 5])

        df_final.columns = ["chrom",
                            "start",
                            "end",
                            "name",
                            "strand"] + labels

        df_final.to_csv(outputfile, sep="\t", index=False)

    else:
        nb_line = 0
        for i in result_bed:
            outputfile.write(i)
            nb_line += 1

        if nb_line == 0:
            message("No line available in output...",
                    type="ERROR")

    # NOTE(review): the garbage collector is switched off before closing,
    # presumably to speed up the exit -- confirm.
    gc.disable()
    close_properly(inputfile, outputfile)
def great_reg_domains(inputfile=None,
                      outputfile=None,
                      go_id="GO:0003700",
                      species="hsapiens",
                      upstream=1000,
                      downstream=1000,
                      chrom_info=None,
                      distal=1000000,
                      mode='basal_plus_extension',
                      http_proxy=None,
                      https_proxy=None):
    """
    Given a GTF and a GO term, compute labeled regions using GREAT 'association rule'.

    A basal regulatory domain is defined around each TSS
    (upstream/downstream, strand-aware). Each basal domain is then extended
    in both directions up to the nearest non-overlapping basal domain of
    another gene, but by no more than 'distal' bases and without leaving
    the chromosome. The resulting regions are written to outputfile in BED6
    format (chrom, start, end, gene_id, 0, strand). If go_id is not None,
    only the genes returned by the Biomart query for that GO term are
    written.

    :param inputfile: The GTF input, handed to GTF().
    :param outputfile: A writable file object receiving the BED lines.
    :param go_id: The GO term used to select genes ("GO:" is prepended if missing). None to write all genes.
    :param species: The species; species + "_gene_ensembl" must be a known Biomart dataset.
    :param upstream: Extension of the TSS in 5' defining the basal domain.
    :param downstream: Extension of the TSS in 3' defining the basal domain.
    :param chrom_info: A file object with chromosome sizes (its 'name' is also given to bedtools as genome file).
    :param distal: Maximum extension of a basal domain in each direction.
    :param mode: The association rule. Only 'basal_plus_extension' is supported.
    :param http_proxy: Forwarded to Biomart().
    :param https_proxy: Forwarded to Biomart().
    """

    # -------------------------------------------------------------------------
    # chrom_len will store the chromosome sizes.
    # -------------------------------------------------------------------------

    chrom_len = chrom_info_as_dict(chrom_info)

    # -------------------------------------------------------------------------
    # Read the GTF
    # -------------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    # -------------------------------------------------------------------------
    # Get the TSSs -- Extend them by upstream/downstream
    # -------------------------------------------------------------------------

    message("Defining basal regulatory domains.", type="INFO")
    basal_reg_bed = gtf.get_tss(name=['gene_id', 'gene_name']).slop(
        s=True, l=upstream, r=downstream, g=chrom_info.name).sort()

    basal_reg_bed_file = make_tmp_file(prefix='basal_reg', suffix='.bed')
    basal_reg_bed.saveas(basal_reg_bed_file.name)

    if mode == 'basal_plus_extension':
        # -------------------------------------------------------------------------
        # Search for upstream limits of each basal_reg_bed
        # Here we ignore overlapping  basal_reg_bed as the way they
        # are processed is not documented in GREAT to our knowledge
        # -------------------------------------------------------------------------
        message("Defining regulatory domains upstream regions.", type="INFO")

        # All four dicts are keyed by "name|start|end|strand" of the basal
        # domain so that the upstream and downstream passes fill the same
        # entry.
        regulatory_region_start = dict()
        regulatory_region_end = dict()
        chroms = dict()
        strands = dict()

        basal_reg_bed_upstream = basal_reg_bed.closest(
            basal_reg_bed,
            # Ignore features in B that overlap A
            io=True,
            # In addition to the closest feature in B report distance
            # use negative distances to report upstream features.
            # Report distance with respect to A.
            # When A is on the - strand, "upstream" means B has a
            # higher(start, stop).
            D="a",
            # Ignore features in B that are downstream of features in A
            id=True,
            # How ties are handled. "first"  Report the first tie
            t="first",
            # Require that the query and the closest hit have different names/gene_ids.
            N=True)

        basal_reg_bed_upstream_file = make_tmp_file(
            prefix='basal_reg_bed_upstream', suffix='.bed')
        basal_reg_bed_upstream.saveas(basal_reg_bed_upstream_file.name)

        # In the 'closest' output, fields[6] is the chromosome of the hit
        # in B ('.' if there is none) and fields[12] the reported distance
        # (assumes A and B are both 6-column BED -- TODO confirm).
        for line in basal_reg_bed_upstream:

            gene_id = line.name
            strand = line.strand
            end = line.end
            start = line.start
            gene_id = "|".join([gene_id, str(start), str(end), strand])
            chroms[gene_id] = line.chrom
            strands[gene_id] = strand

            if strand == '+':

                # if the feature chromosome in B is
                # '.' we have reached the start of the chr
                if line.fields[6] == '.':
                    regulatory_region_start[gene_id] = max(
                        0, line.start - distal)
                else:
                    # Stop one base before the neighbouring basal domain.
                    padding = min(distal, abs(int(line.fields[12])) - 1)
                    regulatory_region_start[gene_id] = line.start - padding

            elif strand == '-':
                # if the feature chromosome in B is
                # '.' we have reached the end of the chr
                if line.fields[6] == '.':
                    regulatory_region_end[gene_id] = min(
                        int(chrom_len[line.chrom]), line.end + distal)
                else:
                    padding = min(distal, abs(int(line.fields[12])) - 1)
                    regulatory_region_end[gene_id] = line.end + padding
            else:
                message("Cannot process genes without strand", type="WARNING")
                message("Please check:" + gene_id, type="ERROR")

        # -------------------------------------------------------------------------
        # Search for downstream limits of each basal_reg_bed
        # Here we ignore overlapping  basal_reg_bed as the way they
        # are processed is not documented in GREAT to our knowledge
        # -------------------------------------------------------------------------
        message("Defining regulatory domains downstream regions.", type="INFO")

        basal_reg_bed_downstream = basal_reg_bed.closest(
            basal_reg_bed,
            # Ignore features in B that overlap A
            io=True,
            # In addition to the closest feature in B report distance
            # use negative distances to report upstream features.
            # Report distance with respect to A.
            # When A is on the - strand, "upstream" means B has a
            # higher(start, stop).
            D="a",
            # Ignore features in B that are upstream of features in A
            iu=True,
            # How ties are handled. "first"  Report the first tie
            t="first",
            # Require that the query and the closest hit have different names/gene_ids.
            N=True)

        basal_reg_bed_downstream_file = make_tmp_file(
            prefix='basal_reg_bed_downstream', suffix='.bed')
        basal_reg_bed_downstream.saveas(basal_reg_bed_downstream_file.name)

        for line in basal_reg_bed_downstream:

            gene_id = line.name
            strand = line.strand
            end = line.end
            start = line.start
            gene_id = "|".join([gene_id, str(start), str(end), strand])
            chroms[gene_id] = line.chrom
            strands[gene_id] = strand

            if strand == '+':
                # if the feature chromosome in B is
                # '.' we have reached the end of the chr
                if line.fields[6] == '.':
                    regulatory_region_end[gene_id] = min(
                        int(chrom_len[line.chrom]), line.end + distal)
                else:
                    padding = min(distal, abs(int(line.fields[12])) - 1)
                    regulatory_region_end[gene_id] = line.end + padding
            elif strand == '-':
                # if the feature chromosome in B is
                # '.' we have reached the start of the chr
                if line.fields[6] == '.':
                    regulatory_region_start[gene_id] = max(
                        0, line.start - distal)
                else:
                    padding = min(distal, abs(int(line.fields[12])) - 1)
                    regulatory_region_start[gene_id] = max(
                        0, line.start - padding)
            else:
                message("Cannot process genes without strand", type="WARNING")
                message("Please check:" + gene_id, type="ERROR")

    else:
        message(
            "Only 'basal_plus_extension' association rule is currently supported.",
            type='ERROR')

    # -------------------------------------------------------------------------
    # Select the genes to be printed
    # By default (go_id is None) print all genes
    # -------------------------------------------------------------------------

    if go_id is None:
        is_associated = None
    else:

        # -------------------------------------------------------------------------
        # Get the list of gene/transcript associated with a particular GO term
        # -------------------------------------------------------------------------

        message("Getting Gene Ontology annotations.")

        if not go_id.startswith("GO:"):
            go_id = "GO:" + go_id

        bm = Biomart(http_proxy=http_proxy, https_proxy=https_proxy)

        bm.get_datasets('ENSEMBL_MART_ENSEMBL')

        if species + "_gene_ensembl" not in bm.datasets:
            message("Unknow dataset/species.", type="ERROR")

        bm.query({'query': XML.format(species=species, go=go_id)})

        # One identifier per non-empty line of the response.
        is_associated = {
            i for i in bm.response.content.decode().split("\n") if i != ''
        }

    # -------------------------------------------------------------------------
    # Print the regulatory regions
    # -------------------------------------------------------------------------

    for gene_id in regulatory_region_start:
        # Drop the "|start|end|strand" suffix added to build unique keys.
        gene_id_short = gene_id.split("|")[0]

        if is_associated is not None and gene_id_short not in is_associated:
            continue

        outlist = [
            chroms[gene_id],
            str(regulatory_region_start[gene_id]),
            str(regulatory_region_end[gene_id]),
            gene_id_short, "0", strands[gene_id]
        ]

        outputfile.write("\t".join(outlist) + "\n")
def rm_dup_tss(inputfile=None, outputfile=None):
    """
    If several transcripts of a gene share the same tss, select only one.

    TSSs are sorted by name ("gene_id|transcript_id") so that, for a given
    gene and TSS coordinate, the transcript kept is always the first one in
    that order. The other transcripts sharing this TSS are removed from the
    GTF. Features of type 'gene' are also discarded from the output.

    :param inputfile: The GTF input, handed to GTF().
    :param outputfile: The file object the filtered GTF is written to.
    """

    # ----------------------------------------------------------------------
    # Get the TSS
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile)
    tss_bo = gtf.get_tss(["gene_id", "transcript_id"])

    # ----------------------------------------------------------------------
    # Sort the file by name (4th col) to ensure reproducibility between calls.
    # ----------------------------------------------------------------------

    with open(tss_bo.fn) as f:
        lines = [line.split('\t') for line in f]

    tmp_file = make_tmp_file(prefix="tss_sorted_by_tx_id", suffix=".bed")

    # The last field still carries the newline, so joining is enough.
    for line in sorted(lines, key=operator.itemgetter(3)):
        tmp_file.write('\t'.join(line))

    tmp_file.close()

    tss_bo = BedTool(tmp_file.name)

    # ----------------------------------------------------------------------
    # Get the list of non redundant TSSs
    # ----------------------------------------------------------------------

    # gene_id -> TSS coordinates already encountered for this gene.
    seen_tss = defaultdict(set)
    to_delete = []

    message("Looking for redundant TSS (gene-wise).")

    for line in tss_bo:

        tss = line.start
        gene_id, tx_id = line.name.split("|")

        if tss in seen_tss[gene_id]:
            # Another transcript of this gene already holds this TSS.
            to_delete += [tx_id]
        else:
            seen_tss[gene_id].add(tss)

    # Only the first ten deleted transcripts are displayed.
    message("Deleted transcripts: " +
            ",".join(to_delete[:10]) + "...",
            type="DEBUG")

    # ----------------------------------------------------------------------
    # Write
    # ----------------------------------------------------------------------

    gtf.select_by_key("feature", "gene",
                      invert_match=True).select_by_key(
                          "transcript_id",
                          ",".join(to_delete),
                          invert_match=True).write(outputfile, gc_off=True)