def count(inputfile=None,
          outputfile=None,
          header=None,
          additional_text=None):
    """Write the number of occurrences of each feature type of a GTF.

    One tab-separated line ``feature<TAB>count`` is written to *outputfile*
    per feature type, in order of first appearance in the GTF.

    :param inputfile: The GTF input, handed to ``GTF``.
    :param outputfile: A writable file object receiving the table.
    :param header: Optional comma-separated column names; when given they are
        written tab-separated as the first line.
    :param additional_text: Optional text appended as an extra column to
        every count line.
    """
    column_names = header.split(",") if header is not None else None

    gtf = GTF(inputfile, check_ensembl_format=False)

    # An OrderedDict keeps the features in the order they are first seen.
    feat_nb = OrderedDict()
    for row in gtf.extract_data("feature"):
        feature = row[0]
        feat_nb[feature] = feat_nb.get(feature, 0) + 1

    if column_names is not None:
        outputfile.write("\t".join(column_names) + "\n")

    for feature, occurrences in feat_nb.items():
        fields = [feature, str(occurrences)]
        if additional_text is not None:
            fields.append(additional_text)
        outputfile.write("\t".join(fields) + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
def convert(inputfile=None,
            outputfile=None,
            format="bed",
            names="gene_id,transcript_id",
            separator="|",
            more_names=''):
    """Convert a GTF to a BED flavour.

    :param inputfile: The GTF input, handed to ``GTF``.
    :param outputfile: A writable file object receiving the converted lines.
    :param format: ``"bed3"`` writes seqid/start/end only; ``"bed"`` and
        ``"bed6"`` delegate to ``GTF.write_bed``. Any other value writes
        nothing.
    :param names: Forwarded to ``write_bed`` as ``name``.
    :param separator: Forwarded to ``write_bed`` as ``sep``.
    :param more_names: Forwarded to ``write_bed`` as ``more_name``.
    """
    if format == "bed3":
        gtf = GTF(inputfile, check_ensembl_format=False)
        records = gtf.extract_data("seqid,start,end",
                                   as_list_of_list=True,
                                   hide_undef=False,
                                   no_na=False)
        for record in records:
            # Shift the start by one: BED starts are zero-based.
            record[1] = str(int(record[1]) - 1)
            outputfile.write("\t".join(record) + "\n")

    elif format in ("bed", "bed6"):
        gtf = GTF(inputfile, check_ensembl_format=False)
        gtf.write_bed(outputfile=outputfile,
                      name=names,
                      sep=separator,
                      more_name=more_names)

    gc.disable()
    close_properly(outputfile, inputfile)
def random_list(inputfile=None,
                outputfile=None,
                number=None,
                ft_type=None,
                seed_value=None):
    """Write the GTF records of a random subset of genes or transcripts.

    :param inputfile: The GTF input, handed to ``GTF``.
    :param outputfile: Destination passed to ``GTF.write``.
    :param number: How many identifiers to draw. When it exceeds the number
        of available identifiers a warning is emitted and all are used.
    :param ft_type: ``'gene'`` draws among gene ids, anything else among
        transcript ids. The selection key is ``ft_type + "_id"``.
    :param seed_value: Optional seed passed to ``random.seed`` (version 1)
        to make the draw reproducible.
    """
    message("loading the GTF.")
    gtf = GTF(inputfile)

    message("Getting ID list.")
    source_key = "gene_id" if ft_type == 'gene' else "transcript_id"
    candidates = gtf.extract_data(source_key,
                                  as_list=True,
                                  nr=True,
                                  hide_undef=True,
                                  no_na=True)

    available = len(candidates)
    if number > available:
        message("To much feature. Using : " + str(available),
                type="WARNING")
        number = available

    if seed_value is not None:
        random.seed(seed_value, version=1)

    picked = random.sample(candidates, number)

    message("Printing.")
    selection_key = ft_type + "_id"
    selected = gtf.select_by_key(selection_key, ",".join(picked))
    selected.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def random_tx(inputfile=None,
              outputfile=None,
              max_transcript=None,
              seed_value=None):
    """Keep at most *max_transcript* randomly chosen transcripts per gene.

    Records whose feature is ``gene`` are discarded first. For every gene,
    up to *max_transcript* transcript positions are drawn at random; the
    remaining transcripts of that gene are removed from the output.

    :param inputfile: The GTF input, handed to ``GTF``.
    :param outputfile: Destination passed to ``GTF.write``.
    :param max_transcript: Maximum number of transcripts kept per gene.
    :param seed_value: Optional seed passed to ``random.seed`` (version 1)
        to make the draw reproducible.
    """
    message("loading the GTF.")
    gtf = GTF(inputfile).select_by_key("feature", "gene", invert_match=True)

    message("Getting gene_id and transcript_id")
    gene2tx = gtf.extract_data("gene_id,transcript_id",
                               as_dict_of_merged_list=True,
                               no_na=True,
                               nr=True)

    message("Selecting random transcript")

    if seed_value is not None:
        random.seed(seed_value, version=1)

    tx_to_delete = []

    for gn_id in gene2tx:
        tx_list = gene2tx[gn_id]
        nb_tx = len(tx_list)
        max_cur = min(max_transcript, nb_tx)
        # The sampling call is kept as-is so a given seed yields the same
        # draw; only the container used for the membership test changes
        # (set instead of list) to avoid a quadratic scan on large genes.
        pos_to_keep = set(random.sample(list(range(nb_tx)), max_cur))
        tx_to_delete.extend(tx for pos, tx in enumerate(tx_list)
                            if pos not in pos_to_keep)

    message("Printing results")
    message("Selecting transcript.")

    gtf.select_by_key("transcript_id",
                      ",".join(tx_to_delete),
                      invert_match=True).write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def count_key_values(inputfile=None,
                     outputfile=None,
                     keys="gene_id,transcript_id",
                     uniq=True,
                     additional_text=None):
    """Write, for each requested key, how many values it takes in the GTF.

    Values equal to ``'.'`` or ``'?'`` are ignored. One tab-separated line
    ``key<TAB>count`` is written per key, in the order of the key list.

    :param inputfile: The GTF input, handed to ``GTF``.
    :param outputfile: A writable file object receiving the table.
    :param keys: Comma-separated keys, or ``"*"`` to use every attribute
        returned by ``GTF.get_attr_list``.
    :param uniq: When true, distinct values are counted; otherwise every
        occurrence is counted.
    :param additional_text: Optional text appended as an extra column to
        every line.
    """
    gtf = GTF(inputfile, check_ensembl_format=False)

    val_list = defaultdict(set) if uniq else defaultdict(list)

    if keys == "*":
        key_list = gtf.get_attr_list()
        keys = ",".join(key_list)
    else:
        key_list = keys.split(",")

    for row in gtf.extract_data(keys, as_list_of_list=True):
        for key, val in zip(key_list, row):
            if val in ('.', '?'):
                continue
            if uniq:
                val_list[key].add(val)
            else:
                val_list[key].append(val)

    for key in key_list:
        fields = [key, str(len(val_list[key]))]
        if additional_text is not None:
            fields.append(additional_text)
        outputfile.write("\t".join(fields) + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
def tabulate(inputfile=None,
             outputfile=None,
             key=None,
             no_unset=False,
             unique=False,
             no_basic=False,
             accept_undef=False,
             select_gene_ids=False,
             select_gene_names=False,
             select_transcript_ids=False,
             select_exon_ids=False,
             separator="\t",
             no_header=False):
    """Convert a GTF to tabulated format.

    :param inputfile: The GTF input, handed to ``GTF``.
    :param outputfile: Destination of the table.
    :param key: Keys to extract; ``"all"`` or ``"*"`` extracts every
        attribute returned by ``GTF.get_attr_list``.
    :param no_unset: Skip rows in which any field is ``"."``.
    :param unique: Write each distinct row only once.
    :param no_basic: With ``"all"``/``"*"``, call ``get_attr_list`` with
        ``add_basic=False``.
    :param accept_undef: When false (default), skip rows in which any field
        is ``"?"``.
    :param select_gene_ids: Shortcut for ``key="gene_id"``.
    :param select_gene_names: Shortcut for ``key="gene_name"``.
    :param select_transcript_ids: Shortcut for ``key="transcript_id"``.
    :param select_exon_ids: Shortcut for ``key="exon_id"``.
    :param separator: Field separator used for the header and the rows.
    :param no_header: Do not write the column-name line.
    """

    # ----------------------------------------------------------------------
    # Check mode
    # ----------------------------------------------------------------------

    if select_transcript_ids:
        key = "transcript_id"
    elif select_gene_ids:
        key = "gene_id"
    elif select_gene_names:
        # Previously set to "gene_id", which made this shortcut a duplicate
        # of select_gene_ids.
        key = "gene_name"
    elif select_exon_ids:
        key = "exon_id"

    # Field values that cause a whole row to be skipped.
    rejected = []
    if no_unset:
        rejected.append(".")
    if not accept_undef:
        rejected.append("?")

    # ----------------------------------------------------------------------
    # Read GTF and process
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    if key in ["all", "*"]:
        attr_list = gtf.get_attr_list(add_basic=not no_basic)
        tab = gtf.extract_data(attr_list)
    else:
        tab = gtf.extract_data(key)

    if not no_header:
        message("Writing header")
        write_properly(separator.join(tab.colnames), outputfile)

    message("Writing")

    try:
        printed = set()
        for row in tab:
            if rejected and any(x in rejected for x in row.fields):
                continue
            if unique:
                row_key = tuple(row)
                if row_key in printed:
                    continue
                printed.add(row_key)
            row.write(outputfile, separator)

    except (BrokenPipeError, IOError):
        # The reader went away (e.g. output piped into `head`): neutralise
        # stdout so later writes/flushes do not raise again.
        def _void_f(*args, **kwargs):
            pass

        message("Received a boken pipe signal", type="WARNING")
        sys.stdout.write = _void_f
        sys.stdout.flush = _void_f

    gc.disable()
    close_properly(outputfile, inputfile)
def discretize_key(inputfile=None,
                   outputfile=None,
                   src_key=None,
                   dest_key="disc_key",
                   nb_levels=2,
                   percentiles=False,
                   percentiles_of_uniq=False,
                   precision=2,
                   log=False,
                   labels=None):
    """
    Create a new key by discretizing a numeric key.

    The values of *src_key* are read from the GTF; those that can be
    converted to float are binned with ``pandas.cut`` and the resulting
    category is attached to the corresponding record under *dest_key*.
    Values that cannot be converted to float are skipped.

    :param inputfile: The GTF input, handed to ``GTF``.
    :param outputfile: Destination passed to ``GTF.write``.
    :param src_key: The key whose values are discretized.
    :param dest_key: The name of the key receiving the category.
    :param nb_levels: Number of bins. An error is reported when below 2.
    :param percentiles: Use percentile-based bin edges instead of passing
        *nb_levels* directly to ``pandas.cut``.
    :param percentiles_of_uniq: Compute the percentiles on distinct values.
    :param precision: Number of decimals used when rounding the bin edges
        that make up the generated labels (0 yields integers).
    :param log: Apply a log2 transformation (with a +1 pseudocount) to the
        values before binning.
    :param labels: Optional comma-separated labels, one per level, without
        duplicates. When omitted, labels are built from the bin edges.
    """

    # NOTE(review): the check accepts nb_levels == 2 although the message
    # says "greater than 2" — confirm the intended wording.
    if nb_levels < 2:
        message("--nb-levels has to be greater than 2.",
                type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Check labels and nb_levels
    #
    # -------------------------------------------------------------------------

    if labels is not None:
        labels = labels.split(",")
        if len(labels) != nb_levels:
            message(
                "The number of labels should be the same as the number of levels.",
                type="ERROR")
        if len(labels) != len(set(labels)):
            message("Redundant labels not allowed.", type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Load GTF. Retrieve values for src-key
    #
    # -------------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)
    src_values = gtf.extract_data(src_key, as_list=True)

    # Only '.' and '?' were returned: the key is treated as absent.
    if len([x for x in src_values if x not in ['.', '?']]) == 0:
        message('The key was not found in this GTF.',
                type="ERROR")

    min_val = None
    max_val = None

    # dest_values holds the numeric values, dest_pos their index in
    # src_values (the two lists stay aligned).
    dest_values = []
    dest_pos = []

    for p, v in enumerate(src_values):
        try:
            a = float(v)
            if min_val is not None:
                if a > max_val:
                    max_val = a
                if a < min_val:
                    min_val = a
            else:
                # First numeric value seen.
                min_val = a
                max_val = a

            dest_values += [a]
            dest_pos += [p]
        except ValueError:
            # Not a number: this position gets no discretized value.
            pass

    if min_val is None:
        message("Did not find numeric values in the source key.",
                type="ERROR")
    if min_val == max_val:
        message(
            "The minimum and maximum values found in the source key are the same.",
            type="ERROR")

    if log:
        if 0 in dest_values:
            message("Encountered zero values before log transformation.",
                    type="WARNING",
                    force=True)
            message("Adding a pseudocount (+1).",
                    type="WARNING",
                    force=True)

        # NOTE(review): the source this was reconstructed from had lost its
        # indentation; the statements below may originally have been nested
        # under the `if 0 in dest_values` test above — confirm.
        pseudo_count = 1
        dest_values = list(np.log2([x + pseudo_count for x in dest_values]))

        # update max/min values
        max_val = max(dest_values)
        min_val = min(dest_values)

    # Apply the same rule as pandas.cut when bins is an int.
    min_val = min_val - max_val / 1000

    # -------------------------------------------------------------------------
    #
    # Compute percentiles if required
    #
    # -------------------------------------------------------------------------

    if percentiles:
        # The lowered minimum is prepended to the values the percentiles
        # are computed on.
        if percentiles_of_uniq:
            dest_values_tmp = [min_val] + list(set(dest_values))
        else:
            dest_values_tmp = [min_val] + dest_values

        n = nb_levels

        # nb_levels + 1 bin edges: percentiles 0, 100/n, ..., then 100.
        q = [np.percentile(dest_values_tmp, 100 / n * i) for i in range(0, n)]
        q = q + [np.percentile(dest_values_tmp, 100)]

        if len(q) != len(set(q)):
            message("No ties are accepted in percentiles.",
                    type="WARNING",
                    force=True)
            message("Breaks: " + str(q), type="WARNING", force=True)
            message("Try -u. Exiting", type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Create a factor
    #
    # -------------------------------------------------------------------------

    # With retbins=True pandas.cut returns (binned values, bin edges): here
    # `breaks` receives the binned values and `cat_label` the bin edges.
    if percentiles:
        (breaks, cat_label) = pandas.cut(dest_values,
                                         bins=q,
                                         labels=labels,
                                         retbins=True)
    else:
        (breaks, cat_label) = pandas.cut(dest_values,
                                         bins=nb_levels,
                                         labels=labels,
                                         retbins=True)

    if labels is None:
        # The include_lowest argument of pandas is not working.
        # Using this workaround to avoid minimum value outside of data range.
        cat_label[0] = min(dest_values)
        cat_label = [round(x, precision) for x in cat_label]
        if precision == 0:
            cat_label = [int(x) for x in cat_label]
        # Pair consecutive edges and turn each "(low, high)" tuple string
        # into a "low_high]" label; the first label opens with "[".
        cat_label = [str(x) for x in list(zip(cat_label[:-1], cat_label[1:]))]
        cat_label[0] = cat_label[0].replace("(", "[")
        cat_label = [x.replace(")", "]") for x in cat_label]
        cat_label = [str(x).replace(", ", "_") for x in cat_label]

        # The string can be very problematic later...
        # NOTE(review): assigning to `.categories` may not be supported by
        # recent pandas versions — confirm against the pinned version.
        breaks.categories = cat_label

    message("Categories: " + str(list(breaks.categories)),
            type="INFO",
            force=True)

    # -------------------------------------------------------------------------
    #
    # Write to disk
    #
    # -------------------------------------------------------------------------

    # One "position<TAB>category" line per numeric value.
    tmp_file = make_tmp_file(prefix="discretized_keys", suffix=".txt")

    with tmp_file as tp_file:
        for p, v in zip(dest_pos, breaks):
            tp_file.write(str(p) + "\t" + str(v) + '\n')

    gtf.add_attr_to_pos(tmp_file, new_key=dest_key).write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def overlapping(
        inputfile=None,
        outputfile=None,
        key_name=None,
        upstream=1500,
        downstream=1500,
        chrom_info=None,
        feature_type='transcript',
        same_strandedness=False,
        diff_strandedness=False,
        annotate_gtf=False,
        bool=False,
        annotate_all=False,
        invert_match=False):
    """Find transcripts whose body/TSS/TTS overlap a transcript of another gene.

    The region of interest of each transcript (its body, its TSS or its TTS
    depending on *feature_type*) is extended by *upstream*/*downstream* and
    intersected with all transcripts. Overlaps with transcripts carrying a
    different gene_id are recorded.

    :param inputfile: The GTF input, handed to ``GTF``.
    :param outputfile: Destination passed to ``GTF.write``.
    :param key_name: Name of the key added with *annotate_gtf*; a name is
        derived from the feature type and the extensions when omitted.
    :param upstream: Extension passed to ``slop`` as ``l``.
    :param downstream: Extension passed to ``slop`` as ``r``.
    :param chrom_info: Object whose ``name`` is passed to ``slop`` as ``g``.
    :param feature_type: ``"transcript"``, ``"promoter"`` or ``"tts"``.
    :param same_strandedness: Passed to ``intersect`` as ``s``.
    :param diff_strandedness: Passed to ``intersect`` as ``S``.
    :param annotate_gtf: Write the whole GTF with the overlaps added as a
        new key instead of selecting transcripts.
    :param bool: Replace the list of overlapping transcripts by "1"/"0".
    :param annotate_all: Start from every transcript_id of the GTF, each
        with an empty overlap list.
    :param invert_match: Write the transcripts that were *not* recorded.
    """

    # ----------------------------------------------------------------------
    # Argument checks and key name
    # ----------------------------------------------------------------------

    if annotate_gtf:
        if key_name is None:
            key_name = "_".join(["overlap",
                                 feature_type,
                                 "u" + str(upstream / 1000) + "k",
                                 "d" + str(downstream / 1000) + "k"])
        if invert_match:
            message("--annotate-gtf and --invert-match are "
                    "mutually exclusive.",
                    type="ERROR")

    if same_strandedness and diff_strandedness:
        message("--same-strandedness and --diff-strandedness are "
                "mutually exclusive.",
                type="ERROR")

    message("Using -u " + str(upstream))
    message("Using -d " + str(downstream))

    overlapping_tx = defaultdict(list)

    # Load the GTF right away so that it is not lost when the stream
    # comes from stdin.
    gtf = GTF(inputfile)

    message("Getting transcript in bed format")

    tx_feat = gtf.select_by_key("feature", "transcript")

    if annotate_all:
        overlapping_tx = gtf.extract_data(keys=["transcript_id"],
                                          as_dict=True,
                                          default_val="0")
        for tx in overlapping_tx:
            overlapping_tx[tx] = []

    # ----------------------------------------------------------------------
    # Regions of interest
    # ----------------------------------------------------------------------

    tx_bed = tx_feat.to_bed(name=["transcript_id", "gene_id"], sep="||")

    message("Getting " + feature_type + " and 'slopping'.")

    def _slop_and_cut(regions):
        # Extend the regions strand-wise and keep the six BED columns.
        extended = regions.slop(s=True,
                                l=upstream,
                                r=downstream,
                                g=chrom_info.name)
        return extended.cut([0, 1, 2, 3, 4, 5])

    if feature_type == "transcript":
        bed_obj = _slop_and_cut(tx_bed)
    elif feature_type == "promoter":
        bed_obj = _slop_and_cut(tx_feat.get_tss(name=["transcript_id",
                                                      "gene_id"],
                                                sep="||"))
    elif feature_type == "tts":
        bed_obj = _slop_and_cut(tx_feat.get_tts(name=["transcript_id",
                                                      "gene_id"],
                                                sep="||"))
    else:
        message("Not implemented yet",
                type="ERROR")

    tmp_file = make_tmp_file(feature_type + "_slopped_region", ".bed")
    bed_obj.saveas(tmp_file.name)

    overlap_regions = bed_obj.intersect(tx_bed,
                                        wb=True,
                                        s=same_strandedness,
                                        S=diff_strandedness)

    tmp_file = make_tmp_file(feature_type + "_overlapping_regions", ".bed")
    overlap_regions.saveas(tmp_file.name)

    # ----------------------------------------------------------------------
    # Record overlaps between transcripts of different genes
    # ----------------------------------------------------------------------

    for hit in overlap_regions:
        tx_other, gn_other = hit.fields[9].split("||")
        tx_id, gene_id = hit.fields[3].split("||")
        if gene_id != gn_other:
            overlapping_tx[tx_id] += [tx_other]

    if bool:
        for tx, hits in overlapping_tx.items():
            overlapping_tx[tx] = "1" if len(hits) else "0"

    # ----------------------------------------------------------------------
    # Output
    # ----------------------------------------------------------------------

    if invert_match:
        values = ",".join(set(overlapping_tx.keys()))
        gtf.select_by_key("transcript_id",
                          values,
                          invert_match).write(outputfile, gc_off=True)
    elif annotate_gtf:
        if len(overlapping_tx):
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=overlapping_tx,
                                         new_key=key_name)
        gtf.write(outputfile, gc_off=True)
    else:
        value = ",".join(set(overlapping_tx.keys()))
        gtf.select_by_key("transcript_id",
                          value).write(outputfile, gc_off=True)

    gc.disable()
    close_properly(outputfile, inputfile)
def _fasta_sequence_names(fasta_handle):
    """Return the header text (without the leading '>') of each fasta record.

    The handle is used as a context manager, so it is closed on return.
    """
    with fasta_handle as fasta:
        return [line.rstrip("\n")[1:] for line in fasta
                if line.startswith(">")]


def get_tx_seq(inputfile=None,
               outputfile=None,
               genome=None,
               with_introns=False,
               delete_version=False,
               del_chr=False,
               separator="",
               no_rev_comp=False,
               label="",
               sleuth_format=True,
               explicit=True,
               assembly="bla"):
    """Write transcript sequences in fasta format from a GTF and a genome.

    :param inputfile: The GTF input, handed to ``GTF``.
    :param outputfile: A writable file object receiving the fasta records.
    :param genome: A list of opened fasta files. Several files are first
        merged into one temporary file. Names ending with ".gz" are
        reported as an error.
    :param with_introns: Passed to ``GTF.get_sequences`` as ``intron``.
    :param delete_version: Strip a trailing ``.<digits>`` from the
        transcript and gene ids in sleuth headers.
    :param del_chr: Remove ``chr`` from the chromosome name in sleuth
        headers.
    :param separator: Joins the header fields when *sleuth_format* is off.
    :param no_rev_comp: Disable reverse-complementation (``rev_comp`` passed
        to ``get_sequences`` is its negation).
    :param label: Comma-separated keys used to build the header when
        *sleuth_format* is off.
    :param sleuth_format: Build headers of the form ``<transcript_id>
        chromosome:<assembly>:<chrom>:<start>:<end>:1 gene:<gene_id>
        gene_biotype:<...> transcript_biotype:<...>``.
    :param explicit: When *sleuth_format* is off, write ``key=value`` pairs
        rather than bare values.
    :param assembly: Assembly name written in sleuth headers.
    """

    # -----------------------------------------------------------
    # Check chromosomes in fasta file
    # -----------------------------------------------------------

    message("%d fasta files found." % len(genome))

    if any(x.name.endswith(".gz") for x in genome):
        message("Genome in gz format is not currently supported.",
                type="ERROR")

    if len(genome) == 1:
        message("Checking fasta file chromosome list")
        genome = genome[0]
    else:
        message("Merging fasta files")
        tmp_genome = make_tmp_file(prefix="genome", suffix=".fa")
        with tmp_genome as tg:
            for curr_file in genome:
                message("Merging %s" % curr_file.name)
                with curr_file as cf:
                    shutil.copyfileobj(cf, tg, 1024 * 1024 * 100)

        message("Checking fasta file chromosome list")
        genome = open(tmp_genome.name, "r")

    # The handle is closed by the helper; only genome.name is used below.
    genome_chr_list = _fasta_sequence_names(genome)

    rev_comp = not no_rev_comp

    message("Chromosomes in fasta file: " + ",".join(genome_chr_list))

    # -----------------------------------------------------------
    # Read gtf
    # -----------------------------------------------------------

    gtf = GTF(inputfile)
    nb_tx_before = gtf.extract_data("transcript_id",
                                    as_list=True,
                                    no_na=True,
                                    nr=True)

    # -----------------------------------------------------------
    # Select genes falling in chrom defined in the fasta file
    # -----------------------------------------------------------

    message("Chromosomes in gtf file: " + ",".join(gtf.get_chroms(nr=True)))

    message("Selecting chromosome defined in the fasta file")

    gtf = gtf.select_by_key(key="seqid", value=",".join(genome_chr_list))

    message("Chromosomes in gtf file: " + ",".join(gtf.get_chroms(nr=True)))

    if len(gtf) == 0:
        message("No genes were found on chromosomes defined in fasta file.",
                type="ERROR")

    nb_tx_after = gtf.extract_data("transcript_id",
                                   as_list=True,
                                   no_na=True,
                                   nr=True)

    if len(nb_tx_after) != len(nb_tx_before):
        diff = list(set(nb_tx_before) - set(nb_tx_after))
        message("Some transcripts had"
                " no corresponding chromosome"
                " in the fasta file: " + ",".join(diff)[0:100] + "...")

    message("Using genome file: " + genome.name)
    message("Retrieving fasta sequences from " + genome.name)

    fasta_seq = gtf.get_sequences(genome=genome.name,
                                  intron=with_introns,
                                  rev_comp=rev_comp)

    tx_gtf = gtf.select_by_key("feature", "transcript")

    if sleuth_format:

        tx_biotype = tx_gtf.extract_data("transcript_id,transcript_biotype",
                                         as_dict_of_lists=True,
                                         hide_undef=False)
        gn_biotype = tx_gtf.extract_data("gene_id,gene_biotype",
                                         as_dict_of_lists=True,
                                         hide_undef=False)

        # Raw string: the previous non-raw '\.' is an invalid escape
        # sequence. Compiled once, outside the loop.
        version_suffix = re.compile(r'\.[0-9]+$')

        for i in fasta_seq:
            gene_id = i.gene_id
            transcript_id = i.transcript_id
            chrom = i.chrom

            gn_bio = gn_biotype[i.gene_id][0]
            tx_bio = tx_biotype[i.transcript_id][0]

            if delete_version:
                transcript_id = version_suffix.sub('', transcript_id)
                gene_id = version_suffix.sub('', gene_id)
            if del_chr:
                chrom = chrom.replace('chr', '')

            location = ":".join(["chromosome",
                                 assembly,
                                 chrom,
                                 str(i.start),
                                 str(i.end),
                                 "1"])

            header = " ".join([transcript_id,
                               location,
                               "gene:" + gene_id,
                               "gene_biotype:" + gn_bio,
                               "transcript_biotype:" + tx_bio])

            outputfile.write(">" + header + "\n")
            outputfile.write(i.sequence + "\n")
    else:
        tx_info = tx_gtf.extract_data("transcript_id," + label,
                                      as_dict_of_lists=True,
                                      hide_undef=False)
        label_names = label.split(",")

        for i in fasta_seq:
            values = tx_info[i.transcript_id]
            if not explicit:
                header = separator.join(values)
            else:
                header = separator.join(str(name) + "=" + val
                                        for name, val in zip(label_names,
                                                             values))
            outputfile.write(">" + header + "\n")
            outputfile.write(i.sequence + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
def select_by_key(inputfile=None,
                  outputfile=None,
                  key=None,
                  value=None,
                  invert_match=False,
                  file_with_values=None,
                  col=0,
                  select_transcripts=False,
                  select_genes=False,
                  select_exons=False,
                  select_cds=False,
                  select_start_codon=False,
                  bed_format=False,
                  log=False,
                  separator="|",
                  names="transcript_id"):
    """Select lines from a GTF file based on attributes and
    associated values.

    :param inputfile: The GTF input, handed to ``GTF``.
    :param outputfile: Destination of the selected records.
    :param key: The key to test.
    :param value: Comma-separated values to select.
    :param invert_match: Passed to ``GTF.select_by_key``.
    :param file_with_values: An opened tab-separated file providing the
        values, as an alternative to *value*.
    :param col: Column of *file_with_values* holding the values (index
        ``col - 1`` is read here for the log report).
    :param select_transcripts: Shortcut for feature == "transcript".
    :param select_genes: Shortcut for feature == "gene".
    :param select_exons: Shortcut for feature == "exon".
    :param select_cds: Shortcut for feature == "CDS".
    :param select_start_codon: Shortcut for feature == "start_codon".
    :param bed_format: Write BED lines instead of GTF.
    :param log: Report the fraction of selected features and the requested
        values that were not found.
    :param separator: Joins the name tokens in BED output.
    :param names: Comma-separated keys used as BED name.
    """

    # ----------------------------------------------------------------------
    # Check mode
    # ----------------------------------------------------------------------

    # Shortcuts, by decreasing priority.
    shortcuts = (
        (select_transcripts, "transcript"),
        (select_cds, "CDS"),
        (select_start_codon, "start_codon"),
        (select_genes, "gene"),
        (select_exons, "exon"),
    )

    for enabled, feature in shortcuts:
        if enabled:
            key = "feature"
            value = feature
            break
    else:
        if file_with_values is None:
            if key is None or value is None:
                message(
                    "Key and value are mandatory. Alternatively use -e/t/g/f or -f with -k.",
                    type="ERROR")
        else:
            if key is None:
                message("Please set -k.", type="ERROR")
            if value is not None:
                message("The -f and -v arguments are mutually exclusive.",
                        type="ERROR")

    # ----------------------------------------------------------------------
    # Load file with value
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)
    all_values = gtf.extract_data(key, as_list=True, no_na=True, nr=True)

    if log:
        feat_before = len(gtf)

    if not file_with_values:
        value_list = value.split(",")
        gtf = gtf.select_by_key(key, value, invert_match)
    else:
        value_list = []

        for line in file_with_values:
            # Drop the line terminator first; otherwise a value taken from
            # the last column carries a trailing "\n" and is always
            # reported as not found.
            cols = line.rstrip("\n").split("\t")
            value_list += [cols[col - 1]]

        # Reopen the file so the selection reads it from the start.
        file_with_values.close()
        file_with_values = open(file_with_values.name)

        gtf = gtf.select_by_key(key=key,
                                invert_match=invert_match,
                                file_with_values=file_with_values,
                                col=col)

    if log:
        not_found = list(set(value_list) - set(all_values))
        feat_after = len(gtf)
        # An empty input GTF would otherwise raise ZeroDivisionError.
        pct = feat_after / feat_before * 100 if feat_before else 0.0

        message("Number of features before selection: %d" % feat_before)
        message("Fraction of feature selected: %.2f%%" % pct)

        if len(not_found):
            # Show at most 50 characters of the missing values.
            nfj = ",".join(not_found)
            etc = "..." if len(nfj) > 50 else ""
            message("Values not found: [" + nfj[:50] + etc + "].")
        else:
            message("Values not found: [].")

    # ----------------------------------------------------------------------
    # Write GTF file
    # ----------------------------------------------------------------------

    if not bed_format:
        gtf.write(outputfile, gc_off=True)
    else:
        nb_tokens = len(names.split(","))
        keys = "seqid,start,end," + names + ",score,strand"
        nb_fields = len(keys.split(","))

        for i in gtf.extract_data_iter_list(keys, zero_based=True):
            outputfile.write("\t".join([
                i[0],
                i[1],
                i[2],
                # All name tokens are merged into the BED name column.
                separator.join(i[3:(3 + nb_tokens)]),
                i[nb_fields - 2],
                i[nb_fields - 1],
            ]) + "\n")

    close_properly(outputfile, inputfile)
def feature_size(inputfile=None,
                 outputfile=None,
                 ft_type="transcript",
                 names="transcript_id",
                 key_name='feature_size',
                 separator="|",
                 bed=False):
    """Compute the size of the features of a GTF.

    With *bed*, the selected features are written in BED format with their
    size as score. Otherwise the GTF is written with the size stored under
    *key_name*.

    :param inputfile: The GTF input, handed to ``GTF``.
    :param outputfile: Destination of the result.
    :param ft_type: A feature type present in the GTF, ``"*"`` for all
        features, or ``'mature_rna'`` to use ``GTF.get_transcript_size``.
    :param names: Comma-separated keys used to build the BED name.
    :param key_name: Name of the key receiving the size in GTF output.
    :param separator: Joins the name tokens in BED output.
    :param bed: Write BED instead of GTF.
    """
    message("Computing feature sizes.")

    gtf = GTF(inputfile)

    known_features = gtf.get_feature_list(nr=True) + ['mature_rna']

    if ft_type not in known_features + ["*"]:
        message("Unable to find requested feature.", type="ERROR")

    names = names.split(",")

    if ft_type == 'mature_rna':

        tx_size = gtf.get_transcript_size()

        if bed:
            transcripts = gtf.select_by_key("feature", 'transcript')
            bed_obj = transcripts.to_bed(['transcript_id'] + names,
                                         add_feature_type=False,
                                         sep=separator,
                                         more_name=['mature_rna'])

            for record in bed_obj:
                # The transcript_id was prepended to the name only to look
                # up the size; it is removed before writing.
                tokens = record.name.split(separator)
                tx_id = tokens.pop(0)
                record.score = tx_size[tx_id]
                record.name = separator.join(tokens)
                write_properly(chomp(str(record)), outputfile)
        else:
            if len(tx_size):
                gtf = gtf.add_attr_from_dict(feat="transcript",
                                             key="transcript_id",
                                             a_dict=tx_size,
                                             new_key=key_name)
            gtf.write(outputfile, gc_off=True)

    elif bed:
        selected = gtf.select_by_key("feature", ft_type)
        bed_obj = selected.to_bed(name=names,
                                  sep=separator,
                                  add_feature_type=True)

        for record in bed_obj:
            record.score = str(record.end - record.start)
            write_properly(chomp(str(record)), outputfile)

    else:
        # One line per GTF record: its size, or "?" when the record is not
        # of the requested type.
        tmp_file = make_tmp_file(prefix="feature_size", suffix=".txt")

        rows = gtf.extract_data("feature,start,end",
                                as_list_of_list=True,
                                no_na=False,
                                hide_undef=False)

        for row in rows:
            if ft_type == "*" or row[0] == ft_type:
                tmp_file.write(str(int(row[2]) - int(row[1]) + 1) + "\n")
            else:
                tmp_file.write("?\n")

        tmp_file.close()

        gtf.add_attr_column(tmp_file, key_name).write(outputfile,
                                                      gc_off=True)

    close_properly(outputfile, inputfile)