def intronic(inputfile=None,
             outputfile=None,
             names='transcript_id',
             separator="_",
             intron_nb_in_name=False,
             no_feature_name=False,
             by_transcript=False):
    """
    Extract intronic regions and write them to the output file.

    :param inputfile: the input GTF (may come from <stdin>).
    :param outputfile: the output stream.
    :param names: comma-separated attribute names used to name introns
                  (only used with by_transcript).
    :param separator: separator joining the name components.
    :param intron_nb_in_name: add the intron number to the name.
    :param no_feature_name: do not add the feature name.
    :param by_transcript: report introns per transcript rather than merged.
    """
    message("Searching for intronic regions.")

    # check_ensembl_format is disabled because the GTF may be streamed
    # from <stdin>.
    gtf = GTF(inputfile, check_ensembl_format=False)

    if by_transcript:
        intron_regions = gtf.get_introns(by_transcript=True,
                                         name=names.split(","),
                                         sep=separator,
                                         intron_nb_in_name=intron_nb_in_name,
                                         feat_name=not no_feature_name)
    else:
        intron_regions = gtf.get_introns()

    for region in intron_regions:
        write_properly(chomp(str(region)), outputfile)

    gc.disable()
    close_properly(outputfile, inputfile)
def bed_to_gtf(inputfile=None,
               outputfile=None,
               ft_type="transcript",
               source="Unknown"):
    """
    Convert a bed file to a gtf. This will make the poor bed feel as if it
    was a nice gtf (but with lots of empty fields...). May be helpful
    sometimes...

    :param inputfile: the input BED stream/file.
    :param outputfile: the output GTF stream.
    :param ft_type: the feature type to write in column 3
                    ('exon' and 'gene' get dedicated attribute sets).
    :param source: the value for the GTF 'source' column.
    """
    message("Converting the bed file into GTF file.")

    # BedTool needs a real file on disk, so buffer <stdin> into a tmp file.
    if inputfile.name == '<stdin>':
        tmp_file = make_tmp_file(prefix="input_bed", suffix=".bed")
        for i in inputfile:
            write_properly(chomp(str(i)), tmp_file)
        tmp_file.close()
        inputfile.close()
        bed_obj = BedTool(tmp_file.name)
    else:
        bed_obj = BedTool(inputfile.name)

    n = 1
    for i in bed_obj:
        # Fill in mandatory fields that BED allows to leave empty.
        if i.strand == "":
            i.strand = "."
        if i.name == "":
            # was str("feature_" + str(n)) — the outer str() was redundant
            i.name = "feature_" + str(n)
        if i.score == "":
            i.score = "0"

        # The attribute set depends on the requested feature type.
        if ft_type == "exon":
            key_value = ('gene_id "' + i.name + '"; '
                         'transcript_id "' + i.name + '"; '
                         'exon_id "' + i.name + '";')
        elif ft_type == "gene":
            key_value = 'gene_id "' + i.name + '";'
        else:
            key_value = ('gene_id "' + i.name + '"; '
                         'transcript_id "' + i.name + '";')

        if pygtftk.utils.ADD_CHR == 1:
            chrom_out = "chr" + i.chrom
        else:
            chrom_out = i.chrom

        # BED is zero-based half-open; GTF is one-based inclusive,
        # hence the start + 1.
        list_out = [chrom_out,
                    source,
                    ft_type,
                    str(i.start + 1),
                    str(i.end),
                    str(i.score),
                    i.strand,
                    ".",
                    key_value]

        write_properly("\t".join(list_out), outputfile)
        n += 1

    gc.disable()
    close_properly(outputfile)
def add_attr_and_write(self, key, val, outputfile):
    """
    Add a key/attribute record and write to a file.

    :Example:

    >>> from pygtftk.utils import get_example_feature
    >>> from pygtftk.utils import make_tmp_file
    >>> feat = get_example_feature()
    >>> tmp_file = make_tmp_file()
    >>> feat.add_attr_and_write("foo", "bar", tmp_file)
    >>> tmp_file.close()
    >>> from pygtftk.gtf_interface import GTF
    >>> gtf = GTF(tmp_file.name, check_ensembl_format=False)
    >>> assert gtf.extract_data("foo", as_list=True) == ['bar']
    """
    # Rebuild the attribute field from the existing key/value pairs,
    # then append the new one at the end.
    attr_tokens = [k + ' "' + str(v) + '";' for k, v in self.attr.items()]
    attr_tokens.append(key + ' "' + str(val) + '";')

    fields = [self.chrom,
              self.src,
              self.ft_type,
              str(self.start),
              str(self.end),
              str(self.score),
              self.strand,
              str(self.frame),
              ' '.join(attr_tokens)]

    write_properly('\t'.join(fields), outputfile)
def midpoints(inputfile=None,
              outputfile=None,
              ft_type="transcript",
              names="transcript_id",
              separator="|"):
    """
    Get the midpoint coordinates for the requested feature.

    Accepts either a GTF (midpoints computed per feature of type 'ft_type')
    or a BED/GFF file (midpoints computed per region). Output is written
    as BED lines to 'outputfile'.

    :param inputfile: input file object (GTF/GFF or BED; may be <stdin>,
                      which is assumed to be GTF).
    :param outputfile: output stream.
    :param names: comma-separated attribute names used to name each midpoint
                  (GTF input only).
    :param separator: separator joining the name components.
    """
    message("Loading input file...")
    # <stdin> cannot be sniffed with BedTool, so it is assumed to be GTF.
    if inputfile.name == '<stdin>':
        is_gtf = True
    else:
        region_bo = BedTool(inputfile.name)
        if len(region_bo) == 0:
            message("Unable to find requested regions", type="ERROR")

        if region_bo.file_type == 'gff':
            is_gtf = True
        else:
            is_gtf = False

    if is_gtf:
        # Delegate midpoint computation to the GTF interface.
        gtf = GTF(inputfile.name, check_ensembl_format=False)
        bed_obj = gtf.select_by_key("feature", ft_type).get_midpoints(
            name=names.split(","), sep=separator)
        for line in bed_obj:
            write_properly(chomp(str(line)), outputfile)
    else:
        # BED input: compute the midpoint in place on each interval.
        # NOTE: the statement order below matters — each assignment uses
        # the value computed by the previous one.
        for line in region_bo:
            diff = line.end - line.start
            if diff % 2 != 0:
                # Odd length: a single central base exists.
                # e.g 10-13 (zero based) -> 11-13 one based
                # midpoint is 12 (one-based) -> 11-12 (zero based)
                # e.g 949-1100 (zero based) -> 950-1100 one based
                # midpoint is 1025 (one-based) -> 1024-1025 (zero based)
                # floored division (python 2)...
                line.end = line.start + int(diff // 2) + 1
                line.start = line.end - 1
            else:
                # Even length: no single central base — keep both.
                # e.g 10-14 (zero based) -> 11-14 one based
                # midpoint is 12-13 (one-based) -> 11-13 (zero based)
                # e.g 9-5100 (zero based) -> 10-5100 one based
                # midpoint is 2555-2555 (one-based) -> 2554-2555 (zero based)
                # floored division (python 2)...
                # No real center. Take both
                line.start = line.start + int(diff // 2) - 1
                line.end = line.start + 2

            outputfile.write(str(line))

    gc.disable()
    close_properly(outputfile, inputfile)
def select_by_go(inputfile=None,
                 outputfile=None,
                 go_id=None,
                 https_proxy=None,
                 http_proxy=None,
                 list_datasets=None,
                 species=None,
                 invert_match=False):
    """
    Select lines from a GTF file based using a Gene Ontology ID
    (e.g GO:0050789).

    :param inputfile: the input GTF.
    :param outputfile: the output stream.
    :param go_id: the GO accession ('GO:XXXXXXX' or bare numeric part).
    :param https_proxy: https proxy passed to Biomart.
    :param http_proxy: http proxy passed to Biomart.
    :param list_datasets: if set, only print available datasets and exit.
    :param species: the Ensembl dataset/species prefix.
    :param invert_match: select genes NOT associated with the GO term.
    """
    # Accept both 'GO:0050789' and the bare accession '0050789'.
    if not go_id.startswith("GO:"):
        go_id = "GO:" + go_id

    is_associated = OrderedDict()

    bm = Biomart(http_proxy=http_proxy,
                 https_proxy=https_proxy)

    # Populates bm.datasets.
    bm.get_datasets('ENSEMBL_MART_ENSEMBL')

    if list_datasets:
        for i in sorted(bm.datasets):
            write_properly(i.replace("_gene_ensembl", ""), outputfile)
        sys.exit()
    else:
        if species + "_gene_ensembl" not in bm.datasets:
            # Fixed typo in the error message (was "Unknow").
            message("Unknown dataset/species.", type="ERROR")

        bm.query({'query': XML.format(species=species, go=go_id)})

        # One gene id per non-empty response line.
        for i in bm.response.content.decode().split("\n"):
            i = i.rstrip("\n")
            if i != '':
                is_associated[i] = 1

        gtf = GTF(inputfile)
        gtf_associated = gtf.select_by_key("gene_id",
                                           ",".join(list(is_associated.keys())),
                                           invert_match)
        gtf_associated.write(outputfile, gc_off=True)
def intergenic(inputfile=None,
               outputfile=None,
               chrom_info=None):
    """
    Extract intergenic regions and write them, numbered, to the output.

    :param inputfile: the input GTF.
    :param outputfile: the output stream.
    :param chrom_info: chromosome size information required to compute
                       regions up to the chromosome ends.
    """
    message("Searching for intergenic regions.")

    gtf = GTF(inputfile)

    # Each region is named sequentially: region_1, region_2, ...
    for region_nb, region in enumerate(gtf.get_intergenic(chrom_info),
                                       start=1):
        region.name = "region_" + str(region_nb)
        write_properly(chomp(str(region)), outputfile)

    gc.disable()
    close_properly(outputfile, inputfile)
def tabulate(inputfile=None,
             outputfile=None,
             key=None,
             no_unset=False,
             unique=False,
             no_basic=False,
             accept_undef=False,
             select_gene_ids=False,
             select_gene_names=False,
             select_transcript_ids=False,
             select_exon_ids=False,
             separator="\t",
             no_header=False):
    """Convert a GTF to tabulated format.

    :param inputfile: the input GTF.
    :param outputfile: the output stream.
    :param key: attribute(s) to extract ('all'/'*' means every attribute).
    :param no_unset: skip lines containing unset values ('.').
    :param unique: print each distinct line only once.
    :param no_basic: with 'all'/'*', do not include the basic GTF columns.
    :param accept_undef: keep lines containing undefined values ('?').
    :param select_gene_ids: shortcut for key='gene_id'.
    :param select_gene_names: shortcut (see NOTE below).
    :param select_transcript_ids: shortcut for key='transcript_id'.
    :param select_exon_ids: shortcut for key='exon_id'.
    :param separator: output column separator.
    :param no_header: do not print the header line.
    """
    # ----------------------------------------------------------------------
    # Check mode: the select_* shortcuts override 'key'.
    # ----------------------------------------------------------------------
    if select_transcript_ids:
        key = "transcript_id"
    elif select_gene_ids:
        key = "gene_id"
    elif select_gene_names:
        # NOTE(review): 'gene_id' here looks like it should be 'gene_name';
        # kept as-is to preserve historical behavior — confirm upstream.
        key = "gene_id"
    elif select_exon_ids:
        key = "exon_id"

    no_undef = not accept_undef

    # ----------------------------------------------------------------------
    # Read GTF and process
    # ----------------------------------------------------------------------
    gtf = GTF(inputfile, check_ensembl_format=False)

    if key in ["all", "*"]:
        attr_list = gtf.get_attr_list(add_basic=not no_basic)
        tab = gtf.extract_data(attr_list)
    else:
        tab = gtf.extract_data(key)

    if not no_header:
        message("Writing header")
        write_properly(separator.join(tab.colnames), outputfile)

    message("Writing")

    # Field values that force a line to be skipped:
    # '.' means unset, '?' means undefined.
    skip_values = set()
    if no_unset:
        skip_values.add(".")
    if no_undef:
        skip_values.add("?")

    printed = {}

    try:
        for i in tab:
            if skip_values and any(x in skip_values for x in i.fields):
                continue
            if unique:
                t = tuple(i)
                if t in printed:
                    continue
                printed[t] = 1
            i.write(outputfile, separator)
    except (BrokenPipeError, IOError):
        # The downstream consumer (e.g. 'head') closed the pipe; silence
        # any further writes instead of crashing.
        def _void_f(*args, **kwargs):
            pass

        message("Received a broken pipe signal", type="WARNING")
        sys.stdout.write = _void_f
        sys.stdout.flush = _void_f

    gc.disable()
    close_properly(outputfile, inputfile)
def feature_size(inputfile=None,
                 outputfile=None,
                 ft_type="transcript",
                 names="transcript_id",
                 key_name='feature_size',
                 separator="|",
                 bed=False):
    """
    Get the size and limits (start/end) of features enclosed in the GTF. If
    bed format is requested returns the limits zero-based half open and the
    size as a score. Otherwise output GTF file with 'feat_size' as a new key
    and size as value.

    :param inputfile: the input GTF.
    :param outputfile: the output stream.
    :param ft_type: feature type to measure ('mature_rna' sums exon sizes;
                    '*' means every feature).
    :param names: comma-separated attribute names used to name BED lines.
    :param key_name: attribute key receiving the size in GTF output.
    :param separator: separator joining the name components.
    :param bed: emit BED instead of GTF.
    """
    message("Computing feature sizes.")

    gtf = GTF(inputfile)

    feat_list = gtf.get_feature_list(nr=True) + ['mature_rna']

    if ft_type not in feat_list + ["*"]:
        message("Unable to find requested feature.", type="ERROR")

    names = names.split(",")

    if ft_type != 'mature_rna':
        if bed:
            bed_obj = gtf.select_by_key("feature",
                                        ft_type).to_bed(name=names,
                                                        sep=separator,
                                                        add_feature_type=True)
            for i in bed_obj:
                # BED is zero-based half-open: size = end - start.
                i.score = str(i.end - i.start)
                write_properly(chomp(str(i)), outputfile)
        else:
            tmp_file = make_tmp_file(prefix="feature_size", suffix=".txt")
            elmt = gtf.extract_data("feature,start,end",
                                    as_list_of_list=True,
                                    no_na=False,
                                    hide_undef=False)
            for i in elmt:
                if i[0] != ft_type and ft_type != "*":
                    # Not the requested feature type: undefined size.
                    tmp_file.write("?\n")
                else:
                    # GTF is one-based inclusive: size = end - start + 1.
                    tmp_file.write(str(int(i[2]) - int(i[1]) + 1) + "\n")
            tmp_file.close()
            gtf.add_attr_column(tmp_file, key_name).write(outputfile,
                                                          gc_off=True)
    else:
        tx_size = gtf.get_transcript_size()
        if bed:
            bed_obj = gtf.select_by_key("feature",
                                        'transcript').to_bed(
                ['transcript_id'] + names,
                add_feature_type=False,
                sep=separator,
                more_name=['mature_rna'])
            for i in bed_obj:
                # Do not reuse 'names' here: the previous version shadowed
                # the parameter with this per-line value.
                name_parts = i.name.split(separator)
                tx_id = name_parts.pop(0)
                i.score = tx_size[tx_id]
                i.name = separator.join(name_parts)
                write_properly(chomp(str(i)), outputfile)
        else:
            if len(tx_size):
                gtf = gtf.add_attr_from_dict(feat="transcript",
                                             key="transcript_id",
                                             a_dict=tx_size,
                                             new_key=key_name)
            gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def col_from_tab(inputfile=None,
                 outputfile=None,
                 columns=None,
                 invert_match=False,
                 no_header=False,
                 unique=False,
                 more_col=None,
                 output_separator="\t",
                 separator="\t"):
    """Select columns from a tabulated file based on their names.

    :param inputfile: the input tabulated file object.
    :param outputfile: the output stream.
    :param columns: comma-separated column names to select.
    :param invert_match: keep every column EXCEPT the requested ones.
    :param no_header: do not print the header line.
    :param unique: print each distinct output line only once.
    :param more_col: 'name:value' of an extra constant column to append.
    :param output_separator: output column separator.
    :param separator: input column separator.
    """
    line_set = dict()

    # str.split already returns [columns] when no comma is present,
    # so no prior re.search check is needed.
    columns = columns.split(",")

    if more_col:
        more_col_name, more_col_value = more_col.split(":")
    else:
        more_col_name = more_col_value = None

    pos_list = []

    for p, line in enumerate(inputfile):

        line = chomp(line).split(separator)

        if p == 0:
            # Resolve requested column names to indexes using the header.
            found = []
            for col in columns:
                if col in line:
                    found.append(line.index(col))
                else:
                    message("Column " + col + " not found", type="ERROR")

            if invert_match:
                # Keep every column whose index was NOT requested.
                pos_list = [k for k in range(len(line)) if k not in found]
            else:
                pos_list = found

            if not no_header:
                header_list = [line[k] for k in pos_list]
                if more_col:
                    header_list += [more_col_name]
                write_properly(output_separator.join(header_list), outputfile)
        else:
            out_list = [line[k] for k in pos_list]
            if more_col:
                out_list += [more_col_value]
            out = output_separator.join(out_list)
            if unique:
                if out not in line_set:
                    write_properly(out, outputfile)
                    line_set[out] = 1
            else:
                write_properly(out, outputfile)
def get_5p_3p_coords(inputfile=None,
                     outputfile=None,
                     ft_type="transcript",
                     names="transcript_id",
                     separator="|",
                     more_names='',
                     transpose=0,
                     invert=False,
                     explicit=False):
    """
    Get the 5p or 3p coordinate for each feature (e.g TSS or TTS for a
    transcript).

    :param inputfile: the input GTF.
    :param outputfile: the output stream (BED lines).
    :param ft_type: the feature type of interest.
    :param names: comma-separated attribute names used to name each line
                  ('*' means every transcript attribute).
    :param separator: separator joining the name components.
    :param more_names: comma-separated extra fixed name components.
    :param transpose: shift the coordinate by this many bases in the
                      direction of transcription.
    :param invert: get the 3p end instead of the 5p end.
    :param explicit: delegate to get_5p_end/get_3p_end 'explicit' mode.
    """
    # BUG FIX: the default '' previously went through ''.split(','),
    # yielding [''] (a spurious empty name field) because the None guard
    # never fired. An empty/None value now means "no extra names".
    if not more_names:
        more_names = []
    else:
        more_names = more_names.split(',')

    if not invert:
        message("Computing 5' coordinates of '" + ft_type + "'.")
    else:
        message("Computing 3' coordinates of '" + ft_type + "'.")

    gtf = GTF(inputfile, check_ensembl_format=False)

    if names != "*":
        nms = names.split(",")
    else:
        nms = gtf.select_by_key("feature",
                                "transcript").get_attr_list(add_basic=False)

    if not invert:
        bed_obj = gtf.get_5p_end(feat_type=ft_type,
                                 name=nms,
                                 sep=separator,
                                 more_name=more_names,
                                 explicit=explicit)
    else:
        bed_obj = gtf.get_3p_end(feat_type=ft_type,
                                 name=nms,
                                 sep=separator,
                                 more_name=more_names,
                                 explicit=explicit)

    if not len(bed_obj):
        message("Requested feature could not be found. "
                "Use convert_ensembl maybe.",
                type="ERROR")

    if transpose == 0:
        for i in bed_obj:
            write_properly(chomp(str(i)), outputfile)
    else:
        for i in bed_obj:
            if i.strand == "+":
                out_list = [i.chrom, str(i.start + transpose),
                            str(i.end + transpose), i.name,
                            i.score, i.strand]
            elif i.strand == "-":
                out_list = [i.chrom, str(i.start - transpose),
                            str(i.end - transpose), i.name,
                            i.score, i.strand]
            else:
                # Strandless feature: the transposition direction is
                # undefined, so write the untouched coordinates instead of
                # emitting a blank line (previous behavior).
                out_list = [i.chrom, str(i.start), str(i.end),
                            i.name, i.score, i.strand]
            outputfile.write("\t".join(out_list) + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)