Exemple #1
0
def select_by_max_exon_nb(inputfile=None, outputfile=None):
    """
    For each gene, keep the transcript with the highest number of exons.

    :param inputfile: The GTF source handed to the GTF constructor.
    :param outputfile: The destination handed to the write() method.
    """

    message("Selecting transcript with the highest number of exon for each gene.")

    # The GTF is loaded with the Ensembl format check switched off.
    loaded = GTF(inputfile, check_ensembl_format=False)
    selected = loaded.select_by_max_exon_nb()

    selected.write(outputfile, gc_off=True)
def select_most_5p_tx(inputfile=None, outputfile=None, keep_gene_lines=False):
    """
    Keep, for each gene, its most 5' transcript.

    :param inputfile: The GTF source handed to the GTF constructor.
    :param outputfile: The destination handed to the write() method.
    :param keep_gene_lines: When false, the result additionally goes through
        select_by_key("feature", "gene", 1).
    """

    message("Selecting the most 5' transcript of each gene.")

    selected = GTF(inputfile).select_5p_transcript()

    if not keep_gene_lines:
        # NOTE(review): the third positional argument is presumably
        # 'invert_match', i.e. this drops the 'gene' lines -- confirm.
        selected = selected.select_by_key("feature", "gene", 1)

    selected.write(outputfile, gc_off=True)
Exemple #3
0
def select_by_nb_exon(inputfile=None,
                      outputfile=None,
                      min_exon_number=None,
                      max_exon_number=None):
    """
    Keep the transcripts whose exon count lies in the requested range.

    :param inputfile: The GTF source handed to the GTF constructor.
    :param outputfile: The destination handed to the write() method.
    :param min_exon_number: Lower bound given to select_by_number_of_exons().
    :param max_exon_number: Upper bound given to select_by_number_of_exons().
    """

    template = "Selecting transcript by exon number (range: [{m},{M}])"
    message(template.format(m=str(min_exon_number), M=str(max_exon_number)))

    # The GTF is loaded with the Ensembl format check switched off.
    loaded = GTF(inputfile, check_ensembl_format=False)
    filtered = loaded.select_by_number_of_exons(min_exon_number,
                                                max_exon_number)

    filtered.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemple #4
0
def exon_sizes(inputfile=None, outputfile=None, key_name=None):
    """
    Add a new key to transcript features containing a comma-separated list
    of exon sizes.

    Each exon size is computed as ``end - start + 1``. Sizes are listed in
    the order in which extract_data() returns the exons of a transcript, and
    that order is reversed for transcripts located on the '-' strand.

    :param inputfile: The GTF source handed to the GTF constructor.
    :param outputfile: The destination handed to the write() method.
    :param key_name: The name of the attribute added to transcript features.
    """

    gtf = GTF(inputfile)

    all_tx_ids = gtf.get_tx_ids(nr=True)

    # Select the exon lines once and reuse that selection for both the start
    # and the end coordinates (the selection used to be computed twice).
    exons = gtf.select_by_key("feature", "exon")

    exons_starts = exons.extract_data("transcript_id,start",
                                      as_dict_of_merged_list=True,
                                      no_na=True,
                                      nr=False)

    if not len(exons_starts):
        message("No exon found.", type="ERROR")

    exons_ends = exons.extract_data("transcript_id,end",
                                    as_dict_of_merged_list=True,
                                    no_na=True,
                                    nr=False)

    strands = gtf.select_by_key("feature", "transcript").extract_data(
        "transcript_id,strand",
        as_dict_of_values=True,
        no_na=True,
        nr=True,
        hide_undef=True)

    tx_to_size_list = dict()

    for tx_id in all_tx_ids:
        size_list = [str(int(end) - int(start) + 1)
                     for start, end in zip(exons_starts[tx_id],
                                           exons_ends[tx_id])]

        # Exons of minus-strand transcripts are reported in reverse order.
        if strands[tx_id] == "-":
            size_list.reverse()

        tx_to_size_list[tx_id] = ",".join(size_list)

    if len(tx_to_size_list):
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=tx_to_size_list,
                                     new_key=key_name)
    gtf.write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
Exemple #5
0
def intron_sizes(
        inputfile=None,
        outputfile=None,
        key_name=None):
    """
    Add a new key to transcript features containing a comma-separated list
    of intron sizes.

    Each intron size is computed as ``end - start`` on the intervals returned
    by get_introns(). The list is reversed for transcripts on the '-' strand,
    and transcripts without any intron receive the value "0".

    :param inputfile: The GTF source handed to the GTF constructor.
    :param outputfile: The destination handed to the write() method.
    :param key_name: The name of the attribute added to transcript features.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    tx_ids = gtf.get_tx_ids(nr=True)

    # One interval per intron, named after the transcript it belongs to.
    introns = gtf.get_introns(by_transcript=True,
                              name=["transcript_id"],
                              intron_nb_in_name=False,
                              feat_name=False)

    tx_lines = gtf.select_by_key("feature", "transcript")
    strands = tx_lines.extract_data("transcript_id,strand",
                                    as_dict_of_values=True,
                                    no_na=True,
                                    nr=True,
                                    hide_undef=True)

    # Every known transcript starts with an empty list of sizes.
    sizes_by_tx = {}
    for tx_id in tx_ids:
        sizes_by_tx[tx_id] = []

    for interval in introns:
        sizes_by_tx[interval.name].append(str(interval.end - interval.start))

    # Turn each list into the attribute value to be written.
    size_attr = {}
    for tx_id, sizes in sizes_by_tx.items():
        if not sizes:
            size_attr[tx_id] = "0"
        elif strands[tx_id] == "-":
            size_attr[tx_id] = ",".join(reversed(sizes))
        else:
            size_attr[tx_id] = ",".join(sizes)

    if size_attr:
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=size_attr,
                                     new_key=key_name)

    gtf.write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
def short_long(inputfile=None,
               outputfile=None,
               longs=None,
               keep_gene_lines=False):
    """
    Select the shortest transcript of each gene, or the longest one when
    *longs* is set.

    :param inputfile: The GTF source handed to the GTF constructor.
    :param outputfile: The destination handed to the write() method.
    :param longs: If true, select_longuest_transcripts() is used instead of
        select_shortest_transcripts().
    :param keep_gene_lines: When false, the result additionally goes through
        select_by_key("feature", "gene", 1).
    """

    loaded = GTF(inputfile, check_ensembl_format=False)

    selected = (loaded.select_longuest_transcripts() if longs
                else loaded.select_shortest_transcripts())

    if keep_gene_lines:
        result = selected
    else:
        # NOTE(review): the third positional argument is presumably
        # 'invert_match', i.e. this drops the 'gene' lines -- confirm.
        result = selected.select_by_key("feature", "gene", 1)

    result.write(outputfile, gc_off=True)
def join_multi_file(inputfile=None,
                    outputfile=None,
                    target_feature=None,
                    key_to_join=None,
                    matrix_files=()):
    """
    Add to a GTF the attributes found in a set of tabulated (matrix) files.

    :param inputfile: The GTF source handed to the GTF constructor.
    :param outputfile: The destination handed to the write() method.
    :param target_feature: Comma-separated feature names to annotate ("*" is
        accepted). When None, every feature found in the GTF is targeted.
    :param key_to_join: The key given to add_attr_from_matrix_file().
    :param matrix_files: File-like objects; only their ``name`` is used.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    # Features actually present in the GTF.
    known_features = gtf.get_feature_list(nr=True)

    if target_feature is None:
        target_feature = ",".join(known_features)
    else:
        accepted = known_features + ["*"]
        for requested in target_feature.split(","):
            if requested not in accepted:
                message("Feature " + requested + " not found.", type="ERROR")

    # Files are joined one after the other, each on top of the previous one.
    for matrix in matrix_files:
        gtf = gtf.add_attr_from_matrix_file(feat=target_feature,
                                            key=key_to_join,
                                            inputfile=matrix.name)

    gtf.write(outputfile, gc_off=True)

    gc.disable()
    close_properly(outputfile, inputfile)
Exemple #8
0
def nb_exons(inputfile=None,
             outputfile=None,
             key_name=None,
             text_format=False):
    """
    Count the number of exons of each transcript of a GTF file.

    :param inputfile: The GTF source handed to the GTF constructor.
    :param outputfile: Receives either the tabulated counts (text_format) or
        the annotated GTF.
    :param key_name: The name of the attribute added to transcript features.
    :param text_format: If true, write "<transcript_id>\\t<count>\\ttranscript"
        lines instead of a GTF.
    """

    gtf = GTF(inputfile)

    message("Computing number of exons for each transcript in input GTF file.")

    # One record per exon line; its first field is the transcript id.
    exon_counts = defaultdict(int)
    exon_records = gtf.select_by_key("feature", "exon").extract_data(
        "transcript_id")

    for record in exon_records:
        exon_counts[record[0]] += 1

    if text_format:
        for tx_id, count in exon_counts.items():
            outputfile.write("\t".join((tx_id, str(count), "transcript")) +
                             "\n")
    else:
        if exon_counts:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=exon_counts,
                                         new_key=key_name)
        gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemple #9
0
def overlapping(
        inputfile=None,
        outputfile=None,
        key_name=None,
        upstream=1500,
        downstream=1500,
        chrom_info=None,
        feature_type='transcript',
        same_strandedness=False,
        diff_strandedness=False,
        annotate_gtf=False,
        bool=False,
        annotate_all=False,
        invert_match=False):
    """
    Find transcripts whose body/TSS/TTS do or do not overlap with any
    transcript from another gene.

    The requested regions (transcript body, promoter or TTS) are extended
    with slop() (``l=upstream``, ``r=downstream``, ``s=True``) and intersected
    with the transcript intervals. An overlap is recorded only when the two
    transcripts carry different gene ids.

    :param inputfile: The GTF source handed to the GTF constructor.
    :param outputfile: The destination handed to the write() method.
    :param key_name: Attribute name used with annotate_gtf. When None it is
        built from feature_type, upstream and downstream.
    :param upstream: Value passed as ``l`` to slop().
    :param downstream: Value passed as ``r`` to slop().
    :param chrom_info: Object whose ``name`` is passed as ``g`` to slop()
        (presumably an open chromosome-size file -- confirm). It is
        dereferenced unconditionally, so the default None cannot work.
    :param feature_type: One of 'transcript', 'promoter' or 'tts'.
    :param same_strandedness: Passed as ``s`` to intersect().
    :param diff_strandedness: Passed as ``S`` to intersect().
    :param annotate_gtf: If true, write the whole GTF with the overlap
        information added to transcript features rather than a selection.
    :param bool: If true, the recorded values are replaced with "1" (at least
        one overlap) or "0". The name shadows the builtin inside this function.
    :param annotate_all: If true, every transcript id extracted from the GTF
        gets an entry, initialised to an empty list.
    :param invert_match: Passed to select_by_key() in the final selection.
        Mutually exclusive with annotate_gtf.
    """

    # ----------------------------------------------------------------------
    # Prepare key names
    # ----------------------------------------------------------------------

    if annotate_gtf:
        if key_name is None:
            # NOTE(review): under Python 3 '/' is a true division, so the
            # default 1500 yields "u1.5k"/"d1.5k" -- confirm this is intended.
            key_info = ["overlap",
                        feature_type,
                        "u" + str(upstream / 1000) + "k",
                        "d" + str(downstream / 1000) + "k"
                        ]
            key_name = "_".join(key_info)

        if invert_match:
            message("--annotate-gtf and --invert-match are "
                    "mutually exclusive.",
                    type="ERROR")

    if same_strandedness and diff_strandedness:
        message("--same-strandedness and --diff-strandedness are "
                "mutually exclusive.",
                type="ERROR")

    message("Using -u " + str(upstream))
    message("Using -d " + str(downstream))

    # Maps a transcript id to the list of transcripts (from other genes)
    # it overlaps with.
    overlapping_tx = defaultdict(list)

    # Load the GTF so that it won't be lost
    # if GTF stream comes from stdin
    gtf = GTF(inputfile)

    message("Getting transcript in bed format")

    tx_feat = gtf.select_by_key("feature",
                                "transcript")

    if annotate_all:
        # Replace the defaultdict with whatever extract_data() returns, then
        # reset every entry to an empty list.
        # NOTE(review): if that object is a plain dict, the '+=' below relies
        # on every transcript id already being a key -- confirm.
        overlapping_tx = gtf.extract_data(keys=["transcript_id"], as_dict=True, default_val="0")
        for i in overlapping_tx:
            overlapping_tx[i] = []

    # ----------------------------------------------------------------------
    # Get transcript limits
    # ----------------------------------------------------------------------

    # The name column holds "<transcript_id>||<gene_id>".
    tx_bed = tx_feat.to_bed(name=["transcript_id", "gene_id"], sep="||")

    message("Getting " + feature_type + " and 'slopping'.")

    # Build the extended regions and keep only their first six columns.
    if feature_type == "transcript":

        bed_obj = tx_bed.slop(s=True,
                              l=upstream,
                              r=downstream,
                              g=chrom_info.name).cut([0, 1, 2, 3, 4, 5])

    elif feature_type == "promoter":

        bed_obj = tx_feat.get_tss(name=["transcript_id", "gene_id"],
                                  sep="||").slop(s=True,
                                                 l=upstream,
                                                 r=downstream,
                                                 g=chrom_info.name).cut([0, 1,
                                                                         2, 3,
                                                                         4, 5])

    elif feature_type == "tts":

        bed_obj = tx_feat.get_tts(name=["transcript_id", "gene_id"],
                                  sep="||").slop(s=True,
                                                 l=upstream,
                                                 r=downstream,
                                                 g=chrom_info.name).cut([0, 1,
                                                                         2, 3,
                                                                         4, 5])
    else:
        # NOTE(review): bed_obj stays unassigned on this path, so the code
        # below only works if message(type="ERROR") aborts -- confirm.
        message("Not implemented yet", type="ERROR")

    # The regions are saved to a temporary file; nothing in this function
    # reads that file back.
    tmp_file = make_tmp_file(feature_type + "_slopped_region", ".bed")
    bed_obj.saveas(tmp_file.name)

    overlap_regions = bed_obj.intersect(tx_bed,
                                        wb=True,
                                        s=same_strandedness,
                                        S=diff_strandedness)

    # Same here: the file is written but not read back in this function.
    tmp_file = make_tmp_file(feature_type + "_overlapping_regions", ".bed")
    overlap_regions.saveas(tmp_file.name)

    for i in overlap_regions:

        # Field 3 is the name of the extended region; field 9 is presumably
        # the name column of the overlapped transcript (six columns on the
        # region side, followed by the tx_bed record) -- confirm.
        tx_other, gn_other = i.fields[9].split("||")
        tx_id, gene_id = i.fields[3].split("||")
        # Overlaps between transcripts of the same gene are ignored.
        if gene_id != gn_other:
            overlapping_tx[tx_id] += [tx_other]

    if bool:
        # Collapse each list into a "0"/"1" flag.
        for k, _ in overlapping_tx.items():
            if not len(overlapping_tx[k]):
                overlapping_tx[k] = "0"
            else:
                overlapping_tx[k] = "1"

    if not invert_match:

        if not annotate_gtf:
            # Keep only the lines of the transcripts recorded above.
            value = ",".join(set(overlapping_tx.keys()))
            gtf.select_by_key("transcript_id",
                              value).write(outputfile,
                                           gc_off=True)
        else:

            if len(overlapping_tx):
                gtf = gtf.add_attr_from_dict(feat="transcript",
                                             key="transcript_id",
                                             a_dict=overlapping_tx,
                                             new_key=key_name)
            gtf.write(outputfile,
                      gc_off=True)

    else:
        # Same selection, with invert_match handed to select_by_key().
        values = ",".join(set(overlapping_tx.keys()))
        gtf.select_by_key("transcript_id",
                          values,
                          invert_match).write(outputfile, gc_off=True)

    gc.disable()
    close_properly(outputfile, inputfile)
Exemple #10
0
def feature_size(inputfile=None,
                 outputfile=None,
                 ft_type="transcript",
                 names="transcript_id",
                 key_name='feature_size',
                 separator="|",
                 bed=False):
    """
    Get the size of the features enclosed in the GTF.

    With *bed*, one BED line is written per selected feature and its score
    holds the size. Otherwise the GTF is written with the size stored under
    *key_name* (default 'feature_size').

    :param inputfile: The GTF source handed to the GTF constructor.
    :param outputfile: The destination of the BED lines or of the GTF.
    :param ft_type: A feature found in the GTF, "*" or 'mature_rna'. For
        'mature_rna' the sizes come from get_transcript_size().
    :param names: Comma-separated keys used to build the BED name column.
    :param key_name: The name of the attribute added in GTF mode.
    :param separator: The separator between the tokens of the BED name.
    :param bed: If true, write BED lines instead of a GTF.
    """

    message("Computing feature sizes.")

    gtf = GTF(inputfile)

    known_features = gtf.get_feature_list(nr=True) + ['mature_rna']

    if ft_type not in known_features + ["*"]:
        message("Unable to find requested feature.", type="ERROR")

    name_keys = names.split(",")

    if ft_type == 'mature_rna':

        tx_size = gtf.get_transcript_size()

        if bed:
            # The transcript id is put first in the name so that the size can
            # be looked up; it is then removed from the name again.
            tx_bed = gtf.select_by_key("feature", 'transcript').to_bed(
                ['transcript_id'] + name_keys,
                add_feature_type=False,
                sep=separator,
                more_name=['mature_rna'])

            for interval in tx_bed:
                tokens = interval.name.split(separator)
                tx_id = tokens.pop(0)
                interval.score = tx_size[tx_id]
                interval.name = separator.join(tokens)
                write_properly(chomp(str(interval)), outputfile)
        else:
            if len(tx_size):
                gtf = gtf.add_attr_from_dict(feat="transcript",
                                             key="transcript_id",
                                             a_dict=tx_size,
                                             new_key=key_name)

            gtf.write(outputfile, gc_off=True)

    elif bed:

        feat_bed = gtf.select_by_key("feature",
                                     ft_type).to_bed(name=name_keys,
                                                     sep=separator,
                                                     add_feature_type=True)

        for interval in feat_bed:
            interval.score = str(interval.end - interval.start)
            write_properly(chomp(str(interval)), outputfile)

    else:

        # One value per GTF line is written to a temporary file: the size
        # for the requested feature type, "?" for every other line.
        size_file = make_tmp_file(prefix="feature_size", suffix=".txt")

        rows = gtf.extract_data("feature,start,end",
                                as_list_of_list=True,
                                no_na=False,
                                hide_undef=False)

        for row in rows:
            if ft_type == "*" or row[0] == ft_type:
                size_file.write(str(int(row[2]) - int(row[1]) + 1) + "\n")
            else:
                size_file.write("?\n")

        size_file.close()

        gtf.add_attr_column(size_file, key_name).write(outputfile,
                                                       gc_off=True)

    close_properly(outputfile, inputfile)
Exemple #11
0
def _choose_gtf(gtf_list):
    """Return the preferred GTF file name from *gtf_list*, or None if none."""

    # Choice 1: regular chromosomes only.
    chr_only = [x for x in gtf_list if x.endswith("chr.gtf.gz")]
    # Choice 2: the default GTF (neither choice 1 nor 'abinitio').
    regular = [x for x in gtf_list
               if x.endswith(".gtf.gz") and "abinitio.gtf.gz" not in x]
    # Choice 3: the 'abinitio' GTF.
    abinitio = [x for x in gtf_list if x.endswith("abinitio.gtf.gz")]

    for candidates in (chr_only, regular, abinitio):
        if candidates:
            return candidates[0]

    return None


def retrieve(species_name='homo_sapiens',
             outputfile=None,
             release=None,
             to_stdout=False,
             list_only=False,
             delete=False,
             hide_species_name=None,
             ensembl_collection='vertebrate'):
    """Retrieve a GTF file from ensembl.

    :param species_name: The name of the species directory on the FTP site.
        A lower-cased version is tried when the name is not found.
    :param outputfile: Optional file object; the downloaded file is renamed
        to ``outputfile.name``. When None the file is left in the current
        working directory.
    :param release: The release number. When None, the highest
        ``release-<n>`` directory found on the server is used.
    :param to_stdout: If true, the downloaded GTF is also written to "-".
    :param list_only: If true, print the available GTF urls and exit.
    :param delete: If true, remove the downloaded file once processed.
    :param hide_species_name: With list_only, print only the urls.
    :param ensembl_collection: 'vertebrate', 'protists', 'fungi', 'plants'
        or 'metazoa'.

    :Example:
    >>> # retrieve("Xenopus_tropicalis")
    """

    if outputfile is None:
        outputdir = os.getcwd()
    else:
        outputdir = os.path.dirname(os.path.abspath(outputfile.name))

    if species_name is None and not list_only:
        message("Choose --species-name or --list-only.", type='ERROR')

    if outputfile is not None:
        if not os.path.exists(outputdir):
            message("Output directory does not exists. Exiting.", type='ERROR')
        elif os.path.isdir(outputfile.name):
            message("Output file is a directory !.", type='ERROR')

    # -------------------------------------------------------------------------
    # Check ensembl repository
    # -------------------------------------------------------------------------

    genomes_collections = ['protists', 'fungi', 'plants', 'metazoa']

    # NOTE(review): the user/password literals look like redacted
    # placeholders -- confirm the real FTP credentials.
    if ensembl_collection == 'vertebrate':
        host = "ftp.ensembl.org"
        user = "******"  # your login
        password = "******"
    elif ensembl_collection in genomes_collections:
        host = "ftp.ensemblgenomes.org"
        user = "******"
        password = "******"
    else:
        # Without this branch 'host' would be unbound further down.
        message("Unknown ensembl collection: '%s'." % ensembl_collection,
                type="ERROR")

    try:
        message("Trying to connect")
        ftp = ftputil.FTPHost(host, user, password)
        if pygtftk.utils.VERBOSITY:
            message("Connected to ensembl FTP website.")
    except FTPOSError as err:
        message(str(err))
        message("Unable to connect (FTPOSError).", type="ERROR")

    message("Connection successful.")

    try:
        ftp.chdir('/pub')
    except Exception:
        message("Unable to change directory to 'pub'.",
                type="ERROR")
    else:
        message("Successfully change directory to pub")

    if ensembl_collection in genomes_collections:
        try:
            ftp.chdir(ensembl_collection)
        except Exception:
            message("Unable to change directory to '%s'." % ensembl_collection,
                    type="ERROR")
        else:
            message("Successfully change directory to " + ensembl_collection)

    try:
        all_releases = ftp.listdir(ftp.curdir)
    except Exception as err:
        message(str(err))
        message("Unable to list directory.",
                type="ERROR")

    if release is not None:
        # str() so that an integer release number is accepted too.
        release_dir = "release-" + str(release)
        if release_dir not in all_releases:
            message("This release number could not be found. Aborting",
                    type="ERROR")
    else:
        release_pattern = re.compile(r"release-(\d+)")
        hits = (release_pattern.search(ver) for ver in all_releases)
        version_list = [int(hit.group(1)) for hit in hits if hit]

        if not version_list:
            message("No release directory could be found.", type="ERROR")

        release = max(version_list)
        release_dir = "release-" + str(release)
        message("Latest version is %d." % release)

    try:
        ftp.chdir(release_dir)
    except Exception:
        message("Unable to change directory to '%s'." % release_dir,
                type="ERROR")
    else:
        message("Changed release directory: %s" % release_dir,
                type="DEBUG")

    ftp.chdir('gtf')

    try:
        all_species = [x for x in ftp.listdir(ftp.curdir)
                       if ftp.path.isdir(x)]
    except Exception:
        message("Unable to list directory.",
                type="ERROR")

    if list_only:

        # The remote working directory does not change while listing.
        base_url = 'ftp://' + host + ftp.getcwd() + '/'
        listing = []

        for sp in all_species:
            for gtf_name in ftp.listdir(sp):
                if gtf_name.endswith('.gtf.gz'):
                    listing.append((sp, base_url + sp + "/" + gtf_name))

        ftp.close()

        for sp, url in listing:
            if hide_species_name:
                print(url)
            else:
                print(sp.ljust(50) + url)

        sys.exit()

    if species_name not in all_species:
        message("Species could not be found for release: %s" % str(release))
        message("Trying species name in lower case.")
        species_name = species_name.lower()
        if species_name not in all_species:
            message("Species could not be found for release: %s" % str(release),
                    type="ERROR")

    ftp.chdir(species_name)

    # Will contain the name of the requested gtf (None if there is none).
    target_gtf = _choose_gtf(ftp.listdir(ftp.curdir))

    # -------------------------------------------------------------------------
    # Download if requested
    # -------------------------------------------------------------------------

    if target_gtf is not None:
        local_path = os.path.join(outputdir, target_gtf)

        message("Downloading GTF file : " + target_gtf)

        # Download straight into the output directory: this avoids a rename
        # that fails when the two directories are on different filesystems.
        ftp.download(target_gtf, local_path)
        ftp.close()

        if to_stdout:
            gtf = GTF(local_path, check_ensembl_format=False)
            gtf.write("-", gc_off=True)

        if delete:
            os.remove(local_path)
        elif outputfile is not None:
            message("Renaming.")
            os.rename(local_path, outputfile.name)

    else:
        ftp.close()
        message("No GTF file could be found for species '%s' (release %s)."
                % (species_name, str(release)),
                type='ERROR')

    gc.disable()
def select_by_intron_size(inputfile=None,
                          outputfile=None,
                          intron_size=0,
                          merged=False,
                          invert_match=False,
                          delete_monoexonic=False,
                          add_intron_size=False):
    """
    Filter transcripts according to the size of their introns.

    Without *merged*, a transcript is deleted as soon as one of its introns
    is shorter than *intron_size*. With *merged*, the sum of its intron sizes
    is tested instead. With *invert_match* the test becomes "greater than or
    equal to". Transcripts without intron are kept unless *delete_monoexonic*
    is set.

    :param inputfile: The GTF source handed to the GTF constructor.
    :param outputfile: The destination handed to the write() method.
    :param intron_size: The size threshold.
    :param merged: If true, test the sum of the intron sizes.
    :param invert_match: If true, invert the size test.
    :param delete_monoexonic: If true, delete transcripts without intron.
    :param add_intron_size: If true, add the computed size(s) to transcript
        features ('intron_size_sum' when merged, 'intron_size' otherwise).
    """

    message("Searching for intronic regions.")

    gtf = GTF(inputfile, check_ensembl_format=False)

    # One interval per intron, named after its transcript.
    introns_bo = gtf.get_introns(by_transcript=True,
                                 name=["transcript_id"],
                                 intron_nb_in_name=False).sort()

    tx_ids = gtf.get_tx_ids(nr=True)

    # The transcripts to be deleted (used as an ordered set).
    to_delete = OrderedDict()

    if merged:
        # Sum of the intron sizes of each transcript.
        intron_total = OrderedDict.fromkeys(tx_ids, 0)

        for intron in introns_bo:
            intron_total[intron.name] += intron.end - intron.start

        for tx_id, total in list(intron_total.items()):
            if total == 0:
                discard = delete_monoexonic
            elif invert_match:
                discard = total >= intron_size
            else:
                discard = total < intron_size

            if discard:
                to_delete[tx_id] = 1

        if add_intron_size:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=intron_total,
                                         new_key="intron_size_sum")

    else:
        # List of intron sizes of each transcript.
        per_tx_sizes = defaultdict(list)

        for tx_id in tx_ids:
            per_tx_sizes[tx_id] = []

        for intron in introns_bo:
            per_tx_sizes[intron.name].append(intron.end - intron.start)

        for tx_id, sizes in list(per_tx_sizes.items()):
            if not sizes:
                # No intron: record a size of 0.
                per_tx_sizes[tx_id] = [0]
                if delete_monoexonic:
                    to_delete[tx_id] = 1
                continue

            if invert_match:
                rejected = any(size >= intron_size for size in sizes)
            else:
                rejected = any(size < intron_size for size in sizes)

            if rejected:
                to_delete[tx_id] = 1

        if add_intron_size:

            for tx_id, sizes in list(per_tx_sizes.items()):
                per_tx_sizes[tx_id] = ",".join(str(size) for size in sizes)

            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=per_tx_sizes,
                                         new_key="intron_size")

    kept = [tx for tx in gtf.get_tx_ids(nr=True) if tx not in to_delete]

    # Only the first 40 characters of the deleted ids are displayed.
    preview = ",".join(to_delete.keys())[:40]
    message("Deleting: " + preview + "...")

    gtf = gtf.select_by_key("transcript_id", ",".join(kept))

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def select_by_key(inputfile=None,
                  outputfile=None,
                  key=None,
                  value=None,
                  invert_match=False,
                  file_with_values=None,
                  col=0,
                  select_transcripts=False,
                  select_genes=False,
                  select_exons=False,
                  select_cds=False,
                  select_start_codon=False,
                  bed_format=False,
                  log=False,
                  separator="|",
                  names="transcript_id"):
    """Select lines from a GTF file based on attributes and
    associated values.

    :param inputfile: The GTF source handed to the GTF constructor.
    :param outputfile: The destination of the GTF or of the BED lines.
    :param key: The key to test.
    :param value: Comma-separated values to select. Mutually exclusive with
        file_with_values.
    :param invert_match: Passed to GTF.select_by_key().
    :param file_with_values: An open tabulated file holding the values.
    :param col: The column of file_with_values holding the values; the values
        reported in the log are read from ``col - 1``.
    :param select_transcripts: Shortcut for key="feature", value="transcript".
    :param select_genes: Shortcut for key="feature", value="gene".
    :param select_exons: Shortcut for key="feature", value="exon".
    :param select_cds: Shortcut for key="feature", value="CDS".
    :param select_start_codon: Shortcut for key="feature",
        value="start_codon".
    :param bed_format: If true, write BED lines instead of a GTF.
    :param log: If true, report the fraction of selected features and the
        requested values that were not found.
    :param separator: The separator between the tokens of the BED name.
    :param names: Comma-separated keys used to build the BED name column.
    """

    # ----------------------------------------------------------------------
    # Check mode
    # ----------------------------------------------------------------------

    # The first flag set wins (same priority as the historical elif chain).
    feature_modes = (
        (select_transcripts, "transcript"),
        (select_cds, "CDS"),
        (select_start_codon, "start_codon"),
        (select_genes, "gene"),
        (select_exons, "exon"),
    )
    preset = next((feat for flag, feat in feature_modes if flag), None)

    if preset is not None:
        key = "feature"
        value = preset

    elif file_with_values is None:
        if key is None or value is None:
            message(
                "Key and value are mandatory. Alternatively use -e/t/g/f or -f with -k.",
                type="ERROR")

    else:
        if key is None:
            message("Please set -k.", type="ERROR")
        if value is not None:
            message("The -f and -v arguments are mutually exclusive.",
                    type="ERROR")

    # ----------------------------------------------------------------------
    # Load file with value
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)
    all_values = gtf.extract_data(key, as_list=True, no_na=True, nr=True)

    if log:
        feat_before = len(gtf)

    if not file_with_values:
        value_list = value.split(",")
        gtf = gtf.select_by_key(key, value, invert_match)
    else:
        value_list = []

        for line in file_with_values:
            # Drop the end of line: otherwise a value read from the last
            # column never matches and is reported as not found.
            cols = line.rstrip("\r\n").split("\t")
            value_list.append(cols[col - 1])
        file_with_values.close()

        # The file was consumed above: reopen it for the selection and make
        # sure this second handle gets closed.
        with open(file_with_values.name) as values_handle:
            gtf = gtf.select_by_key(key=key,
                                    invert_match=invert_match,
                                    file_with_values=values_handle,
                                    col=col)

    if log:

        feat_after = len(gtf)
        # An empty input GTF must not raise a ZeroDivisionError.
        pct = feat_after / feat_before * 100 if feat_before else 0.0

        message("Number of features before selection: %d" % feat_before)
        message("Fraction of feature selected: %.2f%%" % pct)

        # Sorted so that the log is reproducible; truncated to 50 characters.
        nfj = ",".join(sorted(set(value_list) - set(all_values)))
        etc = "..." if len(nfj) > 50 else ""
        message("Values not found: [" + nfj[:50] + etc + "].")

    # ----------------------------------------------------------------------
    # Write GTF file
    # ----------------------------------------------------------------------

    if not bed_format:

        gtf.write(outputfile, gc_off=True)

    else:
        nb_tokens = len(names.split(","))
        keys = "seqid,start,end," + names + ",score,strand"
        nb_fields = len(keys.split(","))

        for i in gtf.extract_data_iter_list(keys, zero_based=True):
            # The name tokens sit between the coordinates and score/strand.
            outputfile.write("\t".join([
                i[0],
                i[1],
                i[2],
                separator.join(i[3:(3 + nb_tokens)]),
                i[nb_fields - 2],
                i[nb_fields - 1],
            ]) + "\n")

    close_properly(outputfile, inputfile)
def closest_genes(
        inputfile=None,
        outputfile=None,
        from_region_type=None,
        no_header=False,
        nb_neighbors=1,
        to_region_type=None,
        same_strandedness=False,
        diff_strandedness=False,
        text_format=False,
        identifier="gene_id",
        collapse=False):
    """
    Find the n closest genes for each gene.

    :param inputfile: The GTF input, handed to GTF().
    :param outputfile: Handle receiving either the annotated GTF or the text table.
    :param from_region_type: Region taken for the source genes ('tss', 'tts' or 'gene').
    :param no_header: In text mode, do not write the header line.
    :param nb_neighbors: Number of neighbors requested (passed as 'k' to closest()).
    :param to_region_type: Region taken for the candidate neighbors ('tss', 'tts' or 'gene').
    :param same_strandedness: Passed as 's' to closest(). Exclusive with diff_strandedness.
    :param diff_strandedness: Passed as 'S' to closest(). Exclusive with same_strandedness.
    :param text_format: Write a tab-separated table instead of a GTF.
    :param identifier: The attribute key used to name the regions and to join the results.
    :param collapse: In text mode, write one line per (gene, neighbor) pair rather than one line per gene with comma-separated lists.

    In GTF mode the keys 'closest_gn' and 'closest_dist' are added to gene features.
    """

    if same_strandedness and diff_strandedness:
        message("--same-strandedness and --diff-strandedness are "
                "mutually exclusive.",
                type="ERROR")

    # ----------------------------------------------------------------------
    # load GTF
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile)
    gn_gtf = gtf.select_by_key("feature", "gene")
    gn_ids = gn_gtf.get_gn_ids(nr=True)

    if len(gn_gtf) == 0:
        message("No gene feature found. Please use convert_ensembl.",
                type="ERROR")
    if nb_neighbors >= (len(gn_gtf) - 1):
        message("Too many neighbors.",
                type="ERROR")

    all_ids = gn_gtf.extract_data(identifier, as_list=True, no_na=False)

    if "." in all_ids:
        message("Some identifiers are undefined ('.').",
                type="ERROR")

    if len(all_ids) == 0:
        message("The identifier was not found.",
                type="ERROR")

    # ----------------------------------------------------------------------
    # Requested regions for the source ('from') and destination ('to') genes
    # ----------------------------------------------------------------------

    from_regions = _closest_genes_regions(gn_gtf, from_region_type, identifier)
    to_regions = _closest_genes_regions(gn_gtf, to_region_type, identifier)

    # ----------------------------------------------------------------------
    # Search closest genes
    # ----------------------------------------------------------------------

    gene_closest = defaultdict(list)
    gene_closest_dist = defaultdict(list)

    closest_bo = from_regions.closest(b=to_regions,
                                      k=nb_neighbors,
                                      N=True,
                                      s=same_strandedness,
                                      S=diff_strandedness,
                                      d=True)

    # Both inputs were cut to 6 columns and d=True was requested, so
    # field 3 is the source name, field 9 the neighbor name and
    # field 12 the reported distance.
    for i in closest_bo:
        gene_closest[i[3]].append(i[9])
        gene_closest_dist[i[3]].append(i[12])

    if not text_format:

        if len(gene_closest):
            gtf = gtf.add_attr_from_dict(feat="gene",
                                         key=identifier,
                                         a_dict=gene_closest,
                                         new_key="closest_gn")

            gtf = gtf.add_attr_from_dict(feat="gene",
                                         key=identifier,
                                         a_dict=gene_closest_dist,
                                         new_key="closest_dist")

        gtf.write(outputfile, gc_off=True)

    else:
        if not no_header:
            outputfile.write("genes\tclosest_genes\tdistances\n")

        # The result tables are keyed by the values of 'identifier'. Looking
        # them up with gene_id values only works when identifier is
        # 'gene_id'; for any other key, iterate over the (deduplicated,
        # order-preserving) identifier values instead.
        if identifier == "gene_id":
            row_ids = gn_ids
        else:
            row_ids = []
            seen = set()
            for an_id in all_ids:
                if an_id not in seen:
                    seen.add(an_id)
                    row_ids.append(an_id)

        for gene in row_ids:

            neighbors = gene_closest.get(gene, [])
            distances = gene_closest_dist.get(gene, [])

            if not collapse:

                outputfile.write("\t".join([gene,
                                            ",".join(neighbors),
                                            ",".join(distances)]) + "\n")

            else:

                for closest, dist in zip(neighbors, distances):
                    outputfile.write("\t".join([gene,
                                                closest,
                                                dist]) + "\n")

        gc.disable()

    close_properly(outputfile, inputfile)


def _closest_genes_regions(gn_gtf, region_type, identifier):
    """
    Return the sorted, 6-column regions of the requested type for gene features.

    :param gn_gtf: A GTF object restricted to gene features.
    :param region_type: One of 'tss' (5' end), 'tts' (3' end) or 'gene' (whole feature).
    :param identifier: The attribute key used as region name.
    """

    if region_type == 'tss':
        regions = gn_gtf.get_5p_end(feat_type="gene", name=[identifier])
    elif region_type == 'tts':
        regions = gn_gtf.get_3p_end(feat_type="gene", name=[identifier])
    elif region_type == 'gene':
        regions = gn_gtf.to_bed(name=[identifier])
    else:
        # NOTE(review): message(type="ERROR") is presumably fatal; the
        # explicit return only matters if it is not - confirm.
        message("Unknown type.", type="ERROR")
        return None

    return regions.cut([0, 1, 2, 3, 4, 5]).sort()
Exemple #15
0
def convergent(inputfile=None,
               outputfile=None,
               upstream=1500,
               downstream=1500,
               chrom_info=None):
    """
    Find transcript with convergent tts.

    The TTS of every transcript is extended by 'upstream' and 'downstream'
    (strand-aware slop) and intersected with the TTSs using S=True. For each
    transcript, the overlapping TTS from another gene with the smallest
    distance is stored under the keys 'convergent' and 'dist_to_convergent'.

    :param inputfile: The GTF input, handed to GTF().
    :param outputfile: Handle the resulting GTF is written to.
    :param upstream: Extension passed as 'l' to slop().
    :param downstream: Extension passed as 'r' to slop().
    :param chrom_info: Chromosome size file; its 'name' is passed as 'g' to slop().
    """

    for flag, size in (("-u", upstream), ("-d", downstream)):
        message("Using " + flag + " " + str(size) + ".")

    message("Loading GTF.")
    gtf = GTF(inputfile)

    message("Getting transcript coordinates.")
    transcripts = gtf.select_by_key("feature", "transcript")

    message("Getting tts coordinates.")
    tts_bo = transcripts.get_tts(name=["transcript_id", "gene_id"], sep="||")

    # Map each transcript to the start coordinate of its tts.
    tts_pos = dict()
    for feature in tts_bo:
        tx_id, _ = feature.name.split("||")
        tts_pos[tx_id] = int(feature.start)

    message("Getting tts coordinates.")
    slopped_bo = tts_bo.slop(s=True,
                             l=upstream,
                             r=downstream,
                             g=chrom_info.name)
    tts_region_bo = slopped_bo.cut([0, 1, 2, 3, 4, 5])

    message("Intersecting...")
    tts_intersect_bo = tts_region_bo.intersect(tts_bo,
                                               wb=True,
                                               s=False,
                                               S=True)

    # Keep a copy of the intermediate results in temporary files.
    region_dump = make_tmp_file("tts_slop", ".bed")
    tts_region_bo.saveas(region_dump.name)
    intersect_dump = make_tmp_file("tts_slop_intersection_with_tts_as_",
                                   ".bed")
    tts_intersect_bo.saveas(intersect_dump.name)

    tx_to_convergent_nm = dict()
    dist_to_convergent = dict()

    for hit in tts_intersect_bo:

        tx_id_main, gene_id_main = hit.fields[3].split("||")
        tx_id_ov, gn_id_ov = hit.fields[9].split("||")

        # Transcripts from the same gene are not considered.
        if gene_id_main == gn_id_ov:
            continue

        dist = abs(tts_pos[tx_id_main] - tts_pos[tx_id_ov])

        # Record the first candidate, then only strictly closer ones.
        is_new = tx_id_main not in tx_to_convergent_nm
        if is_new or dist < dist_to_convergent[tx_id_main]:
            dist_to_convergent[tx_id_main] = dist
            tx_to_convergent_nm[tx_id_main] = tx_id_ov

    if tx_to_convergent_nm:
        annotations = ((tx_to_convergent_nm, "convergent"),
                       (dist_to_convergent, "dist_to_convergent"))
        for a_dict, new_key in annotations:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=a_dict,
                                         new_key=new_key)

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def tss_numbering(inputfile=None,
                  outputfile=None,
                  compute_dist=False,
                  key_name='tss_number',
                  key_name_dist='dist_to_first_tss',
                  add_nb_tss_to_gene=False,
                  gene_key='nb_tss'):
    """
    Number the TSSs of each gene and, optionally, compute the distance
    between the TSS of each transcript and the first TSS of its gene.

    :param inputfile: The GTF input, handed to GTF().
    :param outputfile: Handle the resulting GTF is written to.
    :param compute_dist: If true, add 'key_name_dist' to transcript features.
    :param key_name: The transcript key receiving the TSS number.
    :param key_name_dist: The transcript key receiving the distance to the first TSS.
    :param add_nb_tss_to_gene: If true, add 'gene_key' to gene features.
    :param gene_key: The gene key receiving the highest TSS number of the gene.
    """

    gtf = GTF(inputfile, check_ensembl_format=True)

    message("Getting TSSs.")
    tss = gtf.get_tss(name=["transcript_id"], as_dict=True)

    # if as_dict_of_dict is true, get_gn_to_tx() returns a dict of dict
    # that maps gene_id to transcript_id and transcript_id to TSS
    # numbering (1 for most 5', then 2...). For transcripts having
    # the same TSSs, the tss number will be the same.
    gn_to_tx_to_tss = gtf.get_gn_to_tx(as_dict_of_dict=True)

    message("Numbering TSSs.")

    tss_number_file = make_tmp_file(prefix='tx_to_tss_number', suffix='.txt')

    # For each gene, the highest TSS number seen (kept as a string so it
    # can be written as is).
    gn_how_many_tss = dict()

    for gn_id in gn_to_tx_to_tss:
        for tx_id in gn_to_tx_to_tss[gn_id]:
            tss_num = str(gn_to_tx_to_tss[gn_id][tx_id])
            tss_number_file.write(tx_id + "\t" + tss_num + "\n")
            current_max = gn_how_many_tss.get(gn_id)
            if current_max is None or int(tss_num) > int(current_max):
                gn_how_many_tss[gn_id] = tss_num

    tss_number_file.close()

    # The temporary files are re-opened for reading inside a context
    # manager so that the handles are released once the join is done.
    with open(tss_number_file.name) as tss_number_handle:
        gtf = gtf.add_attr_from_file(feat='transcript',
                                     key='transcript_id',
                                     new_key=key_name,
                                     inputfile=tss_number_handle,
                                     has_header=False)

    if add_nb_tss_to_gene:

        gn_how_many_tss_file = make_tmp_file(prefix='gn_how_many_tss',
                                             suffix='.txt')

        for a_key, a_val in gn_how_many_tss.items():
            gn_how_many_tss_file.write(a_key + "\t" + a_val + "\n")

        gn_how_many_tss_file.close()

        with open(gn_how_many_tss_file.name) as gn_how_many_tss_handle:
            gtf = gtf.add_attr_from_file(feat='gene',
                                         key='gene_id',
                                         new_key=gene_key,
                                         inputfile=gn_how_many_tss_handle,
                                         has_header=False)

    if compute_dist:
        gn_to_tx_ordered_by_tss = gtf.get_gn_to_tx(ordered_5p=True)
        tss_dist_file = make_tmp_file(prefix='tx_tss_dist_to_first_tss',
                                      suffix='.txt')

        for gn_id in gn_to_tx_to_tss:
            tx_list = gn_to_tx_ordered_by_tss[gn_id]
            # Read the first transcript without altering the list
            # returned by get_gn_to_tx().
            tx_first = tx_list[0]
            # The first tss has distance 0 to the
            # first tss...
            tss_dist_file.write(tx_first + "\t0\n")
            for tx_id in tx_list[1:]:
                dist_to_first = abs(int(tss[tx_first]) - int(tss[tx_id]))
                tss_dist_file.write(tx_id + "\t" + str(dist_to_first) + "\n")

        tss_dist_file.close()

        with open(tss_dist_file.name) as tss_dist_handle:
            gtf = gtf.add_attr_from_file(feat='transcript',
                                         key='transcript_id',
                                         new_key=key_name_dist,
                                         inputfile=tss_dist_handle,
                                         has_header=False)

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemple #17
0
def divergent(
        inputfile=None,
        outputfile=None,
        key_name=None,
        upstream=1500,
        downstream=1500,
        chrom_info=None,
        no_strandness=False,
        no_annotation=False):
    """
    Find transcript with divergent promoters.

    Each TSS is extended by 'upstream' and 'downstream' (strand-aware slop)
    to form a promoter which is intersected with the TSSs. For each
    transcript, the overlapping TSS from another gene with the smallest
    distance is retained.

    :param inputfile: The GTF input, handed to GTF().
    :param outputfile: Handle the resulting GTF is written to.
    :param key_name: Key receiving the divergent transcript ('divergent' if None). The distance goes to 'dist_' + key_name ('dist_to_divergent' if None).
    :param upstream: Extension passed as 'l' to slop().
    :param downstream: Extension passed as 'r' to slop().
    :param chrom_info: Chromosome size file; its 'name' is passed as 'g' to slop().
    :param no_strandness: If true, the intersection is done with S=False instead of S=True.
    :param no_annotation: If true, do not add keys but write only the transcripts for which a divergent transcript was found.
    """

    for flag, size in (("-u", upstream), ("-d", downstream)):
        message("Using " + flag + " " + str(size) + ".")

    message("Loading GTF.")
    gtf = GTF(inputfile)

    message("Getting transcript coordinates.")
    transcripts = gtf.select_by_key("feature", "transcript")

    message("Getting tss coordinates.")
    tss_bo = transcripts.get_tss(name=["transcript_id", "gene_id"], sep="||")

    # Map each transcript to the start coordinate of its tss.
    tss_pos = dict()
    for feature in tss_bo:
        tx_id, _ = feature.name.split("||")
        tss_pos[tx_id] = int(feature.start)

    message("Getting promoter coordinates.")
    slopped_bo = tss_bo.slop(s=True,
                             l=upstream,
                             r=downstream,
                             g=chrom_info.name)
    promoter_bo = slopped_bo.cut([0, 1, 2, 3, 4, 5])

    message("Intersecting...")
    # S is switched off when strandness is to be ignored.
    opposite_strand_only = False if no_strandness else True
    prom_with_tss_bo = promoter_bo.intersect(tss_bo,
                                             wb=True,
                                             s=False,
                                             S=opposite_strand_only)

    # Keep a copy of the intermediate results in temporary files.
    promoter_dump = make_tmp_file("promoter_slop", ".bed")
    promoter_bo.saveas(promoter_dump.name)
    intersect_dump = make_tmp_file("promoter_intersection_with_tss_as_",
                                   ".bed")
    prom_with_tss_bo.saveas(intersect_dump.name)

    tx_with_divergent = dict()
    dist_to_divergent = dict()

    for hit in prom_with_tss_bo:

        tx_id_tss, gn_id_tss = hit.fields[9].split("||")
        tx_id_prom, gene_id_prom = hit.fields[3].split("||")

        # Transcripts from the same gene are not considered.
        if gene_id_prom == gn_id_tss:
            continue

        dist = abs(tss_pos[tx_id_prom] - tss_pos[tx_id_tss])

        # Record the first candidate, then only strictly closer ones.
        is_new = tx_id_prom not in tx_with_divergent
        if is_new or dist < dist_to_divergent[tx_id_prom]:
            dist_to_divergent[tx_id_prom] = dist
            tx_with_divergent[tx_id_prom] = tx_id_tss

    if no_annotation:
        selected_ids = ",".join(tx_with_divergent)
        selected = gtf.select_by_key("transcript_id", selected_ids)
        selected.write(outputfile, gc_off=True)
    else:
        if key_name is None:
            key_name, key_name_dist = "divergent", "dist_to_divergent"
        else:
            key_name_dist = "dist_" + key_name

        if tx_with_divergent:
            annotations = ((tx_with_divergent, key_name),
                           (dist_to_divergent, key_name_dist))
            for a_dict, new_key in annotations:
                gtf = gtf.add_attr_from_dict(feat="transcript",
                                             key="transcript_id",
                                             a_dict=a_dict,
                                             new_key=new_key)

        gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemple #18
0
def join_attr(inputfile=None,
              outputfile=None,
              join_file=None,
              has_header=False,
              new_key=None,
              target_feature=None,
              key_to_join=None,
              matrix=None):
    """
    Join attributes from a tabulated file.

    :param inputfile: The GTF input, handed to GTF().
    :param outputfile: Handle the resulting GTF is written to.
    :param join_file: The file to join; its 'name' is given to the GTF join methods.
    :param has_header: Passed to add_attr_from_file() (not used in matrix mode).
    :param new_key: The name of the added key. Required unless matrix is True, in which case it must be None.
    :param target_feature: Comma-separated features to update ('*' is accepted). Defaults to all the features found in the GTF.
    :param key_to_join: The key used to join the file and the GTF.
    :param matrix: If true, the file is joined with add_attr_from_matrix_file().
    """

    # -----------------------------------------------------------
    #  Check argument consistency
    # -----------------------------------------------------------

    if matrix is True:
        if new_key is not None:
            message("--new-key and --matrix are mutually exclusive.",
                    type="ERROR")
    elif new_key is None:
        message("--new-key is required when --matrix is False.",
                type="ERROR")

    # -----------------------------------------------------------
    #  load the GTF
    # -----------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    # -----------------------------------------------------------
    #  Check target feature
    # -----------------------------------------------------------

    feat_list = gtf.get_feature_list(nr=True)

    if target_feature is None:
        # No feature requested: apply to every feature of the GTF.
        target_feature = ",".join(feat_list)
    else:
        accepted = feat_list + ["*"]

        for requested in target_feature.split(","):
            if requested not in accepted:
                message("Feature " + requested + " not found.", type="ERROR")

    # -----------------------------------------------------------
    #  Do it
    # -----------------------------------------------------------

    if matrix:
        gtf = gtf.add_attr_from_matrix_file(feat=target_feature,
                                            key=key_to_join,
                                            inputfile=join_file.name)
    else:
        gtf = gtf.add_attr_from_file(feat=target_feature,
                                     key=key_to_join,
                                     new_key=new_key,
                                     inputfile=join_file.name,
                                     has_header=has_header)

    gtf.write(outputfile, gc_off=True)

    gc.disable()
    close_properly(outputfile, inputfile)