def __call__(self,
                 parser,
                 namespace,
                 values,
                 option_string=None):
        """Turn a chrom-info argument into an open, readable file object.

        ``values`` is interpreted in one of three ways:

        * a supported genome build name ("mm8", "mm9", "mm10", "hg19",
          "hg38", "rn3", "rn4"): chromosome sizes are fetched with
          ``pybedtools.helpers.chromsizes`` and random / unplaced / haplotype
          / alt / mitochondrial entries are filtered out;
        * the keyword ``'simple'``: a two-chromosome toy genome is used;
        * anything else: treated as a path, checked with
          ``check_file_or_dir_exists`` and opened for reading.

        In the first two cases the sizes are written to a temporary
        tab-separated file (chromosome, size) which is then reopened for
        reading. The resulting file object is stored on ``namespace`` under
        ``self.dest``. ``parser`` and ``option_string`` are not used.
        """
        # Filled only when the sizes have to be materialised in a temp file.
        chr_size = None

        if values in ["mm8", "mm9", "mm10", "hg19", "hg38", "rn3", "rn4"]:
            all_sizes = pybedtools.helpers.chromsizes(values)
            # Delete haplotype chromosomes, unplaced contigs and unlocalized
            # contigs. Raw string: '\d' is not a valid escape in a plain
            # string literal.
            regexp = re.compile(r'(_random)|(^chrUn)|(_hap\d+)|(_alt)|(^chrM$)')
            # Each entry is indexable; element 1 is used as the chromosome
            # size (presumably a (start, end) pair -- verify against
            # pybedtools).
            chr_size = {key: all_sizes[key][1]
                        for key in all_sizes
                        if not regexp.search(key)}
        elif values == 'simple':
            chr_size = {'chr1': 300, 'chr2': 600}
        else:
            check_file_or_dir_exists(values)
            values = open(values, "r")
            # Return value is discarded; presumably called to validate the
            # file format -- TODO confirm.
            chrom_info_as_dict(values)

        if chr_size is not None:
            tmp_file_chr = make_tmp_file(prefix='chromsize', suffix='.txt')
            for chrom, size in chr_size.items():
                tmp_file_chr.write(chrom + "\t" + str(size) + "\n")
            tmp_file_chr.close()
            values = open(tmp_file_chr.name, 'r')

        # Add the attribute
        setattr(namespace, self.dest, values)
Beispiel #2
0
def _great_domain_limits(closest_bed, start_strand, distal, chrom_len,
                         region_start, region_end, chroms, strands):
    """Record one boundary of every regulatory domain from a `closest` result.

    Each line of ``closest_bed`` is a basal region followed by its closest
    neighbour (or '.' placeholders when there is none) and the distance.
    For features on ``start_strand`` this pass sets the domain start (lower
    coordinate); for features on the opposite strand it sets the domain end.
    Results are stored in place in ``region_start`` / ``region_end``;
    ``chroms`` and ``strands`` receive the chromosome and strand of each key.

    Keys have the form "name|start|end|strand".
    """
    for line in closest_bed:
        strand = line.strand
        gene_key = "|".join(
            [line.name, str(line.start), str(line.end), strand])
        chroms[gene_key] = line.chrom
        strands[gene_key] = strand

        if strand not in ('+', '-'):
            message("Cannot process genes without strand", type="WARNING")
            message("Please check:" + gene_key, type="ERROR")
            continue

        # Field 6 is the chromosome of the closest feature: '.' means no
        # neighbour was found, i.e. we reached a chromosome extremity.
        no_neighbour = line.fields[6] == '.'
        if no_neighbour:
            reach = distal
        else:
            # Field 12 is the reported distance; stop one base before the
            # neighbouring basal region.
            reach = min(distal, abs(int(line.fields[12])) - 1)

        if strand == start_strand:
            # Never let a domain start before the first base.
            region_start[gene_key] = max(0, line.start - reach)
        elif no_neighbour:
            region_end[gene_key] = min(int(chrom_len[line.chrom]),
                                       line.end + reach)
        else:
            region_end[gene_key] = line.end + reach


def great_reg_domains(inputfile=None,
                      outputfile=None,
                      go_id="GO:0003700",
                      species="hsapiens",
                      upstream=1000,
                      downstream=1000,
                      chrom_info=None,
                      distal=1000000,
                      mode='basal_plus_extension',
                      http_proxy=None,
                      https_proxy=None):
    """Given a GTF and a GO term, attempt to compute labeled regions using GREAT 'association rule'.

    :param inputfile: GTF input, handed to ``GTF``.
    :param outputfile: Object with a ``write`` method receiving the regions.
    :param go_id: GO term used to select genes ("GO:" is prepended when \
    missing). When None, the regions of all genes are written.
    :param species: Prefix of the Biomart dataset name \
    (``<species>_gene_ensembl``).
    :param upstream: Extension of the TSS in the 5' direction (basal domain).
    :param downstream: Extension of the TSS in the 3' direction (basal domain).
    :param chrom_info: Chromosome-size file object; its ``name`` attribute \
    is passed to bedtools and the object itself to ``chrom_info_as_dict``.
    :param distal: Maximum extension of a basal domain in each direction.
    :param mode: Association rule. Only 'basal_plus_extension' is supported.
    :param http_proxy: Passed to ``Biomart``.
    :param https_proxy: Passed to ``Biomart``.

    One tab-separated line is written per selected gene: chromosome, domain
    start, domain end, the part of the feature name preceding the first '|',
    "0" and strand.
    """

    # -------------------------------------------------------------------------
    # chrom_len will store the chromosome sizes.
    # -------------------------------------------------------------------------

    chrom_len = chrom_info_as_dict(chrom_info)

    # -------------------------------------------------------------------------
    # Read the GTF
    # -------------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    # -------------------------------------------------------------------------
    # Get the TSSs -- Extend them by upstream/downstream
    # -------------------------------------------------------------------------

    message("Defining basal regulatory domains.", type="INFO")
    basal_reg_bed = gtf.get_tss(name=['gene_id', 'gene_name']).slop(
        s=True, l=upstream, r=downstream, g=chrom_info.name).sort()

    basal_reg_bed_file = make_tmp_file(prefix='basal_reg', suffix='.bed')
    basal_reg_bed.saveas(basal_reg_bed_file.name)

    # Defined before the mode check so that the output step below always has
    # something to iterate over.
    regulatory_region_start = dict()
    regulatory_region_end = dict()
    chroms = dict()
    strands = dict()

    if mode == 'basal_plus_extension':
        # ---------------------------------------------------------------------
        # Search for upstream limits of each basal_reg_bed
        # Here we ignore overlapping basal_reg_bed as the way they
        # are processed is not documented in GREAT to our knowledge
        # ---------------------------------------------------------------------
        message("Defining regulatory domains upstream regions.", type="INFO")

        basal_reg_bed_upstream = basal_reg_bed.closest(
            basal_reg_bed,
            # Ignore features in B that overlap A
            io=True,
            # In addition to the closest feature in B report distance
            # use negative distances to report upstream features.
            # Report distance with respect to A.
            # When A is on the - strand, "upstream" means B has a
            # higher(start, stop).
            D="a",
            # Ignore features in B that are downstream of features in A
            id=True,
            # How ties are handled. "first"  Report the first tie
            t="first",
            # Require that the query and the closest hit have different
            # names/gene_ids.
            N=True)

        basal_reg_bed_upstream_file = make_tmp_file(
            prefix='basal_reg_bed_upstream', suffix='.bed')
        basal_reg_bed_upstream.saveas(basal_reg_bed_upstream_file.name)

        # Upstream of a '+' feature lies towards lower coordinates, so this
        # pass fixes the domain start of '+' features and the end of '-' ones.
        _great_domain_limits(basal_reg_bed_upstream, '+', distal, chrom_len,
                             regulatory_region_start, regulatory_region_end,
                             chroms, strands)

        # ---------------------------------------------------------------------
        # Search for downstream limits of each basal_reg_bed
        # Here we ignore overlapping basal_reg_bed as the way they
        # are processed is not documented in GREAT to our knowledge
        # ---------------------------------------------------------------------
        message("Defining regulatory domains downstream regions.", type="INFO")

        basal_reg_bed_downstream = basal_reg_bed.closest(
            basal_reg_bed,
            # Ignore features in B that overlap A
            io=True,
            # Report distance with respect to A (see the upstream call).
            D="a",
            # Ignore features in B that are upstream of features in A
            iu=True,
            # How ties are handled. "first"  Report the first tie
            t="first",
            # Require that the query and the closest hit have different
            # names/gene_ids.
            N=True)

        basal_reg_bed_downstream_file = make_tmp_file(
            prefix='basal_reg_bed_downstream', suffix='.bed')
        basal_reg_bed_downstream.saveas(basal_reg_bed_downstream_file.name)

        # Mirror image of the upstream pass: domain end of '+' features,
        # domain start of '-' features.
        _great_domain_limits(basal_reg_bed_downstream, '-', distal, chrom_len,
                             regulatory_region_start, regulatory_region_end,
                             chroms, strands)

    else:
        message(
            "Only 'basal_plus_extension' association rule is currently supported.",
            type='ERROR')

    # -------------------------------------------------------------------------
    # Select the genes to report.
    # By default (go_id is None) all genes are printed.
    # -------------------------------------------------------------------------

    if go_id is None:
        is_associated = None
    else:
        # ---------------------------------------------------------------------
        # Get the list of gene/transcript associated with a particular GO term
        # ---------------------------------------------------------------------

        message("Getting Gene Ontology annotations.")

        if not go_id.startswith("GO:"):
            go_id = "GO:" + go_id

        bm = Biomart(http_proxy=http_proxy, https_proxy=https_proxy)

        bm.get_datasets('ENSEMBL_MART_ENSEMBL')

        if species + "_gene_ensembl" not in bm.datasets:
            message("Unknow dataset/species.", type="ERROR")

        bm.query({'query': XML.format(species=species, go=go_id)})

        # One identifier per non-empty response line.
        is_associated = {
            row for row in bm.response.content.decode().split("\n")
            if row != ''
        }

    # -------------------------------------------------------------------------
    # Print the regulatory regions
    # -------------------------------------------------------------------------

    for gene_id in regulatory_region_start:
        gene_id_short = gene_id.split("|")[0]

        if is_associated is not None and gene_id_short not in is_associated:
            continue

        outlist = [
            chroms[gene_id],
            str(regulatory_region_start[gene_id]),
            str(regulatory_region_end[gene_id]),
            gene_id_short, "0", strands[gene_id]
        ]

        outputfile.write("\t".join(outlist) + "\n")
from pygtftk.stats.intersect.read_bed import read_bed_as_list as read_bed
from pygtftk.utils import chrom_info_as_dict
from pygtftk import arg_formatter

np.random.seed(RANDOM_SEED)  # Random seed for reproducibility


# Prepare files: the query and every additional BED file are sorted and merged.
bedA = pybedtools.BedTool(QUERY_PATH).sort().merge()
bedsB = [pybedtools.BedTool(bedfilepath).sort().merge() for bedfilepath in MORE_PATHS]


# Do the exclusion manually: generate a fake BED covering the entire genome
# from the chromosome sizes, then subtract the regions to include from it.
bed_incl = pybedtools.BedTool(INCL_PATH)

# The handle is closed as soon as the sizes are loaded; only the returned
# mapping is used afterwards (assumes chrom_info_as_dict reads the file
# eagerly -- confirm against its implementation).
with open(GENOME_PATH, 'r') as genome_file:
    chrom_len = chrom_info_as_dict(genome_file)

# One "chrom<TAB>0<TAB>size" line per chromosome; the 'all_chrom' entry of
# the mapping is skipped.
full_genome_bed = [
    '\t'.join([str(chrom), '0', str(chrom_len[chrom])]) + '\n'
    for chrom in chrom_len
    if chrom != 'all_chrom'
]
full_genome_bed = pybedtools.BedTool(full_genome_bed)
bed_excl = full_genome_bed.subtract(bed_incl)

bedA = read_bed.exclude_concatenate(bedA, bed_excl)
bedsB = [read_bed.exclude_concatenate(bedB, bed_excl) for bedB in bedsB]
full_genome_bed_after_excl = read_bed.exclude_concatenate(full_genome_bed, bed_excl)


# Note that by definition, this intersection matrix only contains regions
# where at least two sets are open. For example {4} alone is not found.
# To fix it, use a fake full genome bed as query, so there is always one file
# open, then truncate it to get the flags_matrix.
#true_intersection = compute_true_intersection(bedA, bedsB)
Beispiel #4
0
def shift(inputfile=None,
          outputfile=None,
          shift_value=None,
          chrom_info=None,
          stranded=False,
          allow_outside=False):
    """Shift coordinates in 3' or 5' direction.

    :param inputfile: GTF input, handed to ``GTF``.
    :param outputfile: Destination passed to each feature's ``write`` method.
    :param shift_value: Offset added to start and end. In stranded mode it \
    is subtracted instead for features on the '-' strand.
    :param chrom_info: Chromosome-size file, handed to ``chrom_info_as_dict``.
    :param stranded: Take the strand into account when shifting.
    :param allow_outside: If False, a feature crossing a chromosome edge is \
    moved back inside with its size preserved. If True, it is truncated at \
    the edge, and dropped when it falls entirely outside.

    :raises GTFtkError: if a chromosome of the GTF is missing from the \
    chrom-info file.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    chrom_list_gtf = gtf.get_chroms(nr=True)
    chrom_info = chrom_info_as_dict(chrom_info)

    for chrom in chrom_list_gtf:
        if chrom not in chrom_info:
            raise GTFtkError("Chromosome " + chrom +
                             " was not found in chrom-info file.")

    for feature in gtf:
        size = feature.end - feature.start + 1
        chrom_size = int(chrom_info[feature.chrom])

        # Only in stranded mode do '-' features move the opposite way.
        if stranded and feature.strand == "-":
            offset = -shift_value
        else:
            offset = shift_value

        new_start = feature.start + offset
        new_end = feature.end + offset

        if not allow_outside:
            # Feature is going outside genome in left direction:
            # push it back against the first base, keeping its size.
            if new_start < 1:
                new_start = 1
                new_end = size

            # Feature is going outside genome in right direction:
            # push it back against the last base, keeping its size.
            if new_end > chrom_size:
                new_end = chrom_size
                new_start = new_end - size + 1
        else:
            # Feature is going outside genome in left direction
            if new_start < 1:
                if new_end < 1:
                    # Entirely outside: nothing left to write. Skipping here
                    # avoids comparing a None placeholder with an integer
                    # below.
                    continue
                new_start = 1

            # Feature is going outside genome in right direction
            if new_end > chrom_size:
                if new_start > chrom_size:
                    # Entirely outside on the right side.
                    continue
                new_end = chrom_size

        feature.start = new_start
        feature.end = new_end
        feature.write(outputfile)

    # NOTE(review): garbage collection is switched off right before closing
    # the files; presumably deliberate -- confirm before changing.
    gc.disable()
    close_properly(outputfile, inputfile)