def _read_fasta_chrom_names(fasta_handle):
    """Return the header text (minus the leading '>') of every fasta record.

    The handle is used as a context manager, so it is closed on return.
    """
    with fasta_handle as fasta:
        return [line.rstrip("\n")[1:]
                for line in fasta if line.startswith(">")]


def get_tx_seq(inputfile=None,
               outputfile=None,
               genome=None,
               with_introns=False,
               delete_version=False,
               del_chr=False,
               separator="",
               no_rev_comp=False,
               label="",
               sleuth_format=True,
               explicit=True,
               assembly="bla"):
    """
    Description: Get transcripts sequences in fasta format from a GTF file.

    :param inputfile: The GTF input, handed to GTF().
    :param outputfile: Writable file object receiving the fasta records.
    :param genome: A sequence of open fasta file objects. Several files are
        concatenated into a temporary fasta file before use.
    :param with_introns: Forwarded as `intron` to gtf.get_sequences().
    :param delete_version: Strip a trailing ".<digits>" from transcript and
        gene ids (sleuth header only).
    :param del_chr: Remove 'chr' from the chromosome name (sleuth header only).
    :param separator: String joining the header fields (non-sleuth header).
    :param no_rev_comp: If True, sequences are not reverse-complemented.
    :param label: Comma-separated keys written to the header (non-sleuth).
    :param sleuth_format: Select the sleuth-style header layout.
    :param explicit: Write header fields as "key=value" (non-sleuth header).
    :param assembly: Assembly name placed in the sleuth header.
    """

    # -----------------------------------------------------------
    #  Check chromosomes in fasta file
    # -----------------------------------------------------------

    message("%d fasta files found." % len(genome))

    if any(x.name.endswith(".gz") for x in genome):
        message("Genome in gz format is not currently supported.",
                type="ERROR")

    if len(genome) == 1:
        genome = genome[0]
    else:
        message("Merging fasta files")
        tmp_genome = make_tmp_file(prefix="genome", suffix=".fa")
        with tmp_genome as merged:
            for curr_file in genome:
                message("Merging %s" % curr_file.name)
                with curr_file as part:
                    shutil.copyfileobj(part, merged, 1024 * 1024 * 100)
        # Re-open the merged file for reading; only its name is used later.
        genome = open(tmp_genome.name, "r")

    message("Checking fasta file chromosome list")
    genome_chr_list = _read_fasta_chrom_names(genome)

    rev_comp = not no_rev_comp

    message("Chromosomes in fasta file: " + ",".join(genome_chr_list))

    # -----------------------------------------------------------
    #  Read gtf
    # -----------------------------------------------------------

    gtf = GTF(inputfile)
    nb_tx_before = gtf.extract_data("transcript_id",
                                    as_list=True,
                                    no_na=True,
                                    nr=True)

    # -----------------------------------------------------------
    #  Select genes falling in chrom defined in the fasta file
    # -----------------------------------------------------------

    message("Chromosomes in gtf file: " + ",".join(gtf.get_chroms(nr=True)))

    message("Selecting chromosome defined in the fasta file")

    gtf = gtf.select_by_key(key="seqid", value=",".join(genome_chr_list))

    message("Chromosomes in gtf file: " + ",".join(gtf.get_chroms(nr=True)))

    if len(gtf) == 0:
        message("No genes were found on chromosomes defined in fasta file.",
                type="ERROR")

    nb_tx_after = gtf.extract_data("transcript_id",
                                   as_list=True,
                                   no_na=True,
                                   nr=True)

    if len(nb_tx_after) != len(nb_tx_before):
        diff = list(set(nb_tx_before) - set(nb_tx_after))
        # Only the first 100 characters of the id list are reported.
        message("Some transcripts had"
                " no corresponding chromosome"
                " in the fasta file: " + ",".join(diff)[0:100] + "...")

    message("Using genome file: " + genome.name)
    message("Retrieving fasta sequences from " + genome.name)
    fasta_seq = gtf.get_sequences(genome=genome.name,
                                  intron=with_introns,
                                  rev_comp=rev_comp)

    tx_gtf = gtf.select_by_key("feature", "transcript")

    if sleuth_format:

        tx_biotype = tx_gtf.extract_data("transcript_id,transcript_biotype",
                                         as_dict_of_lists=True,
                                         hide_undef=False)
        gn_biotype = tx_gtf.extract_data("gene_id,gene_biotype",
                                         as_dict_of_lists=True,
                                         hide_undef=False)

        # Raw string: "\." in a plain literal is an invalid escape sequence.
        version_suffix = re.compile(r"\.[0-9]+$")

        for i in fasta_seq:
            gene_id = i.gene_id
            transcript_id = i.transcript_id
            chrom = i.chrom

            gn_bio = gn_biotype[i.gene_id][0]
            tx_bio = tx_biotype[i.transcript_id][0]

            if delete_version:
                transcript_id = version_suffix.sub("", transcript_id)
                gene_id = version_suffix.sub("", gene_id)
            if del_chr:
                chrom = chrom.replace('chr', '')

            header = " ".join([
                transcript_id,
                ":".join([
                    "chromosome", assembly, chrom,
                    str(i.start), str(i.end), "1"
                ]),
                "gene:" + gene_id,
                "gene_biotype:" + gn_bio,
                "transcript_biotype:" + tx_bio
            ])

            outputfile.write(">" + header + "\n")
            outputfile.write(i.sequence + "\n")
    else:
        tx_info = tx_gtf.extract_data("transcript_id," + label,
                                      as_dict_of_lists=True,
                                      hide_undef=False)
        for i in fasta_seq:
            if not explicit:
                header = separator.join(tx_info[i.transcript_id])
            else:
                # Pair every requested key with its value: "key=value".
                header = [
                    str(x[0]) + "=" + x[1]
                    for x in zip(label.split(","), tx_info[i.transcript_id])
                ]
                header = separator.join(header)
            outputfile.write(">" + header + "\n")
            outputfile.write(i.sequence + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
def get_feat_seq(inputfile=None,
                 outputfile=None,
                 genome=None,
                 feature_type="exon",
                 separator="",
                 no_rev_comp=False,
                 label="",
                 rev_comp_to_header=False,
                 unique=False):
    """
    Description: Get feature sequences in fasta format from a GTF file.

    :param inputfile: The GTF input, handed to GTF().
    :param outputfile: Writable file object receiving the fasta records.
    :param genome: An open fasta file object (its `name` is given to bedtools).
    :param feature_type: Value of the "feature" column to select.
    :param separator: String joining the header fields.
    :param no_rev_comp: If True, sequences are not reverse-complemented.
    :param label: Comma-separated keys used to build the record name.
    :param rev_comp_to_header: Append "rev_comp"/"no_rev_comp" to each header.
    :param unique: Write only the first record for any given header.
    """

    # -------------------------------------------------------------------------
    # Should sequences be reverse-complemented
    # -------------------------------------------------------------------------

    force_strandedness = not no_rev_comp

    # -------------------------------------------------------------------------
    # Check chrom to avoid segfault
    # https://github.com/dputhier/libgtftk/issues/27
    # -------------------------------------------------------------------------

    if genome.name.endswith(".gz"):
        message("Genome in gz format is not currently supported.",
                type="ERROR")

    message("Fasta files found: %s" % genome.name)
    message("Checking fasta file chromosome list")

    with genome as geno:
        genome_chr_list = [line.rstrip("\n")[1:]
                           for line in geno if line.startswith(">")]

    gtf = GTF(inputfile, check_ensembl_format=False)
    gtf_chr_list = gtf.get_chroms(nr=True)

    # Check chrom to avoid segfault
    # https://github.com/dputhier/libgtftk/issues/27
    message("Comparing chromosomes from GTF and Fasta files.")
    genome_chr_set = set(genome_chr_list)
    gtf_chr_list_found = [x for x in gtf_chr_list if x in genome_chr_set]

    if len(gtf_chr_list_found) == 0:
        message("Chromosome from GTF were not found in fasta file",
                type="ERROR")

    if len(gtf_chr_list_found) != len(gtf_chr_list):
        not_found = [x for x in gtf_chr_list if x not in genome_chr_set]
        message("Some chromosomes were not found in the fasta file: %s" %
                ",".join(not_found), type="ERROR")

    # -------------------------------------------------------------------------
    # Retrieving fasta sequences
    # -------------------------------------------------------------------------

    message("Retrieving fasta sequences.")

    try:
        # The nameOnly argument is not supported
        # through all Bedtools versions
        feat_seq = gtf.select_by_key("feature", feature_type).to_bed(
            name=label.split(","),
            sep=separator).sequence(fi=genome.name,
                                    nameOnly=True,
                                    s=force_strandedness)
    except BEDToolsError:
        feat_seq = gtf.select_by_key("feature", feature_type).to_bed(
            name=label.split(","),
            sep=separator).sequence(fi=genome.name,
                                    name=True,
                                    s=force_strandedness)

    if force_strandedness:
        strand_tag = "rev_comp"
    else:
        strand_tag = "no_rev_comp"

    id_printed = set()
    # False while the sequence line of a duplicated header must be skipped.
    to_print = True

    with open(feat_seq.seqfn) as fasta:
        for line in fasta:
            if line.startswith(">"):
                # Work on the bare header: appending to a line that still
                # carries its newline would push the suffix onto the next
                # line and glue it to the sequence.
                header = line.rstrip("\n")

                # This (+/-) may be added by pybedtool
                # but can be accessed though --label
                header = re.sub(r"\(\+\)$", "", header)
                header = re.sub(r"\(-\)$", "", header)

                if rev_comp_to_header:
                    header = header + separator + strand_tag

                header += "\n"

                if unique and header in id_printed:
                    to_print = False

                if to_print:
                    outputfile.write(header)
                    id_printed.add(header)
            else:
                if not to_print:
                    # Skip this sequence line, then resume normal printing.
                    to_print = True
                else:
                    outputfile.write(line)

    gc.disable()
    close_properly(outputfile, inputfile)
def shift(inputfile=None,
          outputfile=None,
          shift_value=None,
          chrom_info=None,
          stranded=False,
          allow_outside=False):
    """Shift coordinates in 3' or 5' direction.

    :param inputfile: The GTF input, handed to GTF().
    :param outputfile: Writable file object receiving the shifted features.
    :param shift_value: Integer added to the start and end of each feature.
    :param chrom_info: Chromosome sizes, converted by chrom_info_as_dict().
    :param stranded: If True, features on the "-" strand are shifted by
        -shift_value instead of +shift_value.
    :param allow_outside: If False, a feature crossing a chromosome boundary
        is moved back inside with its size preserved. If True, it is
        truncated at the boundary, and dropped when it lies entirely outside.
    :raises GTFtkError: If a chromosome of the GTF is absent from chrom_info.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    chrom_list_gtf = gtf.get_chroms(nr=True)
    chrom_info = chrom_info_as_dict(chrom_info)

    for chrom in chrom_list_gtf:
        if chrom not in chrom_info:
            raise GTFtkError("Chromosome " + chrom +
                             " was not found in chrom-info file.")

    for i in gtf:
        size = i.end - i.start + 1
        chrom_size = int(chrom_info[i.chrom])

        if stranded and i.strand == "-":
            offset = -shift_value
        else:
            offset = shift_value

        new_start = i.start + offset
        new_end = i.end + offset

        if not allow_outside:
            # Feature is going outside genome in left direction
            if new_start < 1:
                new_start = 1
                new_end = size

            # Feature is going outside genome in right direction
            if new_end > chrom_size:
                new_end = chrom_size
                new_start = new_end - size + 1
        else:
            # Feature is entirely outside the chromosome: drop it. Testing
            # this first avoids comparing a None placeholder with an int,
            # which raises TypeError on Python 3.
            if new_end < 1 or new_start > chrom_size:
                continue

            # Truncate whatever part sticks out on either side.
            if new_start < 1:
                new_start = 1
            if new_end > chrom_size:
                new_end = chrom_size

        i.start = new_start
        i.end = new_end
        i.write(outputfile)

    gc.disable()
    close_properly(outputfile, inputfile)