Exemple #1
0
def get_tx_seq(inputfile=None,
               outputfile=None,
               genome=None,
               with_introns=False,
               delete_version=False,
               del_chr=False,
               separator="",
               no_rev_comp=False,
               label="",
               sleuth_format=True,
               explicit=True,
               assembly="bla"):
    """
    Description: Get transcripts sequences in fasta format from a GTF file.

    :param inputfile: The GTF source, handed as-is to ``GTF()``.
    :param outputfile: A writable file object receiving the fasta records.
    :param genome: A sequence of opened fasta file objects. A single file is \
    used directly; several files are first concatenated into a temporary file.
    :param with_introns: Forwarded as ``intron`` to ``gtf.get_sequences()``.
    :param delete_version: Sleuth format only. Strip a trailing ``.<digits>`` \
    from transcript and gene ids written in the header.
    :param del_chr: Sleuth format only. Remove every 'chr' substring from the \
    chromosome name written in the header.
    :param separator: Non-sleuth format only. String used to join header fields.
    :param no_rev_comp: If True, ``rev_comp=False`` is passed to \
    ``gtf.get_sequences()``.
    :param label: Non-sleuth format only. Comma-separated keys whose values \
    are written in the header.
    :param sleuth_format: If True, write headers of the form \
    ``<tx_id> chromosome:<assembly>:<chrom>:<start>:<end>:1 gene:<gene_id> \
    gene_biotype:<...> transcript_biotype:<...>``.
    :param explicit: Non-sleuth format only. If True, each header field is \
    written as ``key=value``; otherwise only the values are written.
    :param assembly: Assembly name inserted in sleuth-style headers.
    """

    # -----------------------------------------------------------
    #  Check chromosomes in fasta file
    # -----------------------------------------------------------

    message("%d fasta files found." % len(genome))

    # NOTE(review): message(..., type="ERROR") presumably aborts the
    # program; nothing below guards against a gz genome otherwise.
    if any(x.name.endswith(".gz") for x in genome):
        message("Genome in gz format is not currently supported.",
                type="ERROR")

    if len(genome) == 1:
        genome = genome[0]
    else:
        # Concatenate all fasta files into a single temporary file so that
        # sequence retrieval can work from one path.
        message("Merging fasta files")
        tmp_genome = make_tmp_file(prefix="genome", suffix=".fa")
        with tmp_genome as tg:
            for curr_file in genome:
                message("Merging %s" % curr_file.name)
                with curr_file as cf:
                    shutil.copyfileobj(cf, tg, 1024 * 1024 * 100)

        genome = open(tmp_genome.name, "r")

    message("Checking fasta file chromosome list")

    # The chromosome name is the whole header line minus the leading '>'.
    # The file is closed on exit; only genome.name is used afterwards.
    with genome as genome_file:
        genome_chr_list = [line.rstrip("\n")[1:]
                           for line in genome_file
                           if line.startswith(">")]

    rev_comp = not no_rev_comp

    message("Chromosomes in fasta file: " + ",".join(genome_chr_list))

    # -----------------------------------------------------------
    #  Read gtf
    # -----------------------------------------------------------

    gtf = GTF(inputfile)
    nb_tx_before = gtf.extract_data("transcript_id",
                                    as_list=True,
                                    no_na=True,
                                    nr=True)

    # -----------------------------------------------------------
    #  Select genes falling in chrom defined in the fasta file
    # -----------------------------------------------------------

    message("Chromosomes in gtf file: " + ",".join(gtf.get_chroms(nr=True)))

    message("Selecting chromosome defined in the fasta file")

    gtf = gtf.select_by_key(key="seqid", value=",".join(genome_chr_list))

    message("Chromosomes in gtf file: " + ",".join(gtf.get_chroms(nr=True)))

    if len(gtf) == 0:
        message("No genes were found on chromosomes defined in fasta file.",
                type="ERROR")

    nb_tx_after = gtf.extract_data("transcript_id",
                                   as_list=True,
                                   no_na=True,
                                   nr=True)
    if len(nb_tx_after) != len(nb_tx_before):
        diff = list(set(nb_tx_before) - set(nb_tx_after))
        # Only the first 100 characters of the id list are reported.
        message("Some transcripts had"
                " no corresponding chromosome"
                " in the fasta file: " + ",".join(diff)[0:100] + "...")

    message("Using genome file: " + genome.name)
    message("Retrieving fasta sequences from " + genome.name)
    fasta_seq = gtf.get_sequences(genome=genome.name,
                                  intron=with_introns,
                                  rev_comp=rev_comp)

    tx_gtf = gtf.select_by_key("feature", "transcript")

    if sleuth_format:

        tx_biotype = tx_gtf.extract_data("transcript_id,transcript_biotype",
                                         as_dict_of_lists=True,
                                         hide_undef=False)
        gn_biotype = tx_gtf.extract_data("gene_id,gene_biotype",
                                         as_dict_of_lists=True,
                                         hide_undef=False)

        # Raw string: '\.' is not a valid escape in a plain string literal.
        version_suffix = re.compile(r'\.[0-9]+$')

        for i in fasta_seq:
            gene_id = i.gene_id
            transcript_id = i.transcript_id
            chrom = i.chrom

            # Biotypes are looked up with the original (versioned) ids.
            gn_bio = gn_biotype[i.gene_id][0]
            tx_bio = tx_biotype[i.transcript_id][0]

            if delete_version:
                transcript_id = version_suffix.sub('', transcript_id)
                gene_id = version_suffix.sub('', gene_id)
            if del_chr:
                chrom = chrom.replace('chr', '')

            location = ":".join([
                "chromosome", assembly, chrom,
                str(i.start),
                str(i.end), "1"
            ])
            header = " ".join([
                transcript_id, location, "gene:" + gene_id,
                "gene_biotype:" + gn_bio, "transcript_biotype:" + tx_bio
            ])

            outputfile.write(">" + header + "\n")
            outputfile.write(i.sequence + "\n")
    else:
        tx_info = tx_gtf.extract_data("transcript_id," + label,
                                      as_dict_of_lists=True,
                                      hide_undef=False)
        label_keys = label.split(",")

        for i in fasta_seq:
            if not explicit:
                header = separator.join(tx_info[i.transcript_id])
            else:
                # Pair each requested key with its value: key=value.
                header = separator.join(
                    str(key) + "=" + value
                    for key, value in zip(label_keys,
                                          tx_info[i.transcript_id]))
            outputfile.write(">" + header + "\n")
            outputfile.write(i.sequence + "\n")

    # NOTE(review): garbage collection is switched off right before closing;
    # presumably to speed up interpreter shutdown -- confirm.
    gc.disable()
    close_properly(outputfile, inputfile)
Exemple #2
0
def get_feat_seq(inputfile=None,
                 outputfile=None,
                 genome=None,
                 feature_type="exon",
                 separator="",
                 no_rev_comp=False,
                 label="",
                 rev_comp_to_header=False,
                 unique=False):
    """
    Description: Get feature sequences in fasta format from a GTF file.

    :param inputfile: The GTF source, handed as-is to ``GTF()``.
    :param outputfile: A writable file object receiving the fasta records.
    :param genome: An opened fasta file object (its ``name`` is used as the \
    path for sequence retrieval).
    :param feature_type: Value of the 'feature' column to select.
    :param separator: String used to join name fields in the header and to \
    append the rev_comp/no_rev_comp tag.
    :param no_rev_comp: If True, sequences are not retrieved in a \
    strand-aware manner (``s=False``).
    :param label: Comma-separated keys used to build the sequence name.
    :param rev_comp_to_header: If True, append 'rev_comp' or 'no_rev_comp' \
    to each header.
    :param unique: If True, a record whose header was already written is \
    skipped.
    """

    # -------------------------------------------------------------------------
    # Should sequences be reverse-complemented
    # -------------------------------------------------------------------------

    force_strandedness = not no_rev_comp

    # -------------------------------------------------------------------------
    # Check chrom to avoid segfault
    # https://github.com/dputhier/libgtftk/issues/27
    # -------------------------------------------------------------------------

    if genome.name.endswith(".gz"):
        message("Genome in gz format is not currently supported.",
                type="ERROR")

    message("Fasta files found: %s" % genome.name)

    message("Checking fasta file chromosome list")

    # The chromosome name is the whole header line minus the leading '>'.
    with genome as geno:
        genome_chr_set = {line.rstrip("\n")[1:]
                          for line in geno
                          if line.startswith(">")}

    gtf = GTF(inputfile, check_ensembl_format=False)

    gtf_chr_list = gtf.get_chroms(nr=True)

    # Check chrom to avoid segfault
    # https://github.com/dputhier/libgtftk/issues/27
    message("Comparing chromosomes from GTF and Fasta files.")
    gtf_chr_list_found = [x for x in gtf_chr_list if x in genome_chr_set]

    if len(gtf_chr_list_found) == 0:
        message("Chromosome from GTF were not found in fasta file",
                type="ERROR")

    if len(gtf_chr_list_found) != len(gtf_chr_list):
        not_found = [x for x in gtf_chr_list if x not in genome_chr_set]
        message("Some chromosomes were not found in the fasta file: %s" %
                ",".join(not_found),
                type="ERROR")

    # -------------------------------------------------------------------------
    # Retrieving fasta sequences
    #
    # -------------------------------------------------------------------------

    message("Retrieving fasta sequences.")

    def _features_as_bed():
        # Rebuilt for each attempt, as the original code did.
        return gtf.select_by_key("feature", feature_type).to_bed(
            name=label.split(","), sep=separator)

    try:
        # The nameOnly argument is not supported
        # through all Bedtools versions
        feat_seq = _features_as_bed().sequence(fi=genome.name,
                                               nameOnly=True,
                                               s=force_strandedness)
    except BEDToolsError:
        feat_seq = _features_as_bed().sequence(fi=genome.name,
                                               name=True,
                                               s=force_strandedness)

    id_printed = set()

    # False while the sequence line of a duplicated header must be skipped.
    to_print = True

    with open(feat_seq.seqfn) as seq_file:
        for line in seq_file:

            if line.startswith(">"):

                # Work on the header without its newline so that any suffix
                # stays on the header line instead of leaking into the
                # sequence line.
                line = line.rstrip("\n")

                # This (+/-) may be added by pybedtool
                # but can be accessed though --label
                line = re.sub(r"\(\+\)$", "", line)
                line = re.sub(r"\(\-\)$", "", line)

                if rev_comp_to_header:
                    if force_strandedness:
                        line = line + separator + "rev_comp"
                    else:
                        line = line + separator + "no_rev_comp"

                if unique and line in id_printed:
                    to_print = False

                if to_print:
                    outputfile.write(line + "\n")
                    id_printed.add(line)

            else:
                if not to_print:
                    # Drop the sequence of a duplicated header and re-arm
                    # printing for the next record.
                    to_print = True
                else:
                    outputfile.write(line)

    # NOTE(review): garbage collection is switched off right before closing;
    # presumably to speed up interpreter shutdown -- confirm.
    gc.disable()
    close_properly(outputfile, inputfile)
Exemple #3
0
def shift(inputfile=None,
          outputfile=None,
          shift_value=None,
          chrom_info=None,
          stranded=False,
          allow_outside=False):
    """Shift coordinates in 3' or 5' direction.

    :param inputfile: The GTF source, handed as-is to ``GTF()``.
    :param outputfile: Destination passed to each feature's ``write()``.
    :param shift_value: Number added to start and end (subtracted for \
    '-' strand features when `stranded` is True).
    :param chrom_info: Chromosome size source, converted with \
    ``chrom_info_as_dict()``.
    :param stranded: If True, the shift direction is reversed for features \
    on the '-' strand.
    :param allow_outside: If False, a feature crossing a chromosome boundary \
    is moved back inside with its size preserved. If True, it is truncated \
    at the boundary and dropped when it lies entirely outside.
    :raises GTFtkError: if a chromosome of the GTF is absent from chrom_info.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    chrom_list_gtf = gtf.get_chroms(nr=True)
    chrom_info = chrom_info_as_dict(chrom_info)

    for chrom in chrom_list_gtf:
        if chrom not in chrom_info:
            raise GTFtkError("Chromosome " + chrom +
                             " was not found in chrom-info file.")

    for i in gtf:
        size = i.end - i.start + 1
        chrom_size = int(chrom_info[i.chrom])

        if stranded and i.strand == "-":
            new_start = i.start - shift_value
            new_end = i.end - shift_value
        else:
            new_start = i.start + shift_value
            new_end = i.end + shift_value

        if allow_outside:
            # Feature lies entirely outside the chromosome: drop it.
            # (Previously the left-side case set new_end to None and then
            # compared it to an int, raising TypeError.)
            if new_end < 1 or new_start > chrom_size:
                continue

            # Feature partially outside: truncate at the boundaries.
            new_start = max(new_start, 1)
            new_end = min(new_end, chrom_size)
        else:
            # Feature is going outside genome in left direction
            if new_start < 1:
                new_start = 1
                new_end = size

            # Feature is going outside genome in right direction
            if new_end > chrom_size:
                new_end = chrom_size
                new_start = new_end - size + 1

        i.start = new_start
        i.end = new_end
        i.write(outputfile)

    # NOTE(review): garbage collection is switched off right before closing;
    # presumably to speed up interpreter shutdown -- confirm.
    gc.disable()
    close_properly(outputfile, inputfile)