def select_by_key(inputfile=None,
                  outputfile=None,
                  key=None,
                  value=None,
                  invert_match=False,
                  file_with_values=None,
                  col=0,
                  select_transcripts=False,
                  select_genes=False,
                  select_exons=False,
                  select_cds=False,
                  select_start_codon=False,
                  bed_format=False,
                  log=False,
                  separator="|",
                  names="transcript_id"):
    """Select lines from a GTF file based on attributes and associated values.

    :param inputfile: Input handed to the ``GTF`` constructor.
    :param outputfile: Destination; passed to ``gtf.write`` or written to \
    directly (``.write``) when ``bed_format`` is set.
    :param key: The key to select on (forced to "feature" by the \
    ``select_*`` shortcuts).
    :param value: Comma-separated value(s) to look for.
    :param invert_match: Forwarded to ``GTF.select_by_key``.
    :param file_with_values: An open, named, tab-separated text file \
    providing the values (alternative to ``value``).
    :param col: Column of ``file_with_values`` holding the values; used as \
    ``cols[col - 1]``, i.e. 1-based.
    :param select_transcripts: Shortcut for key="feature", value="transcript".
    :param select_genes: Shortcut for key="feature", value="gene".
    :param select_exons: Shortcut for key="feature", value="exon".
    :param select_cds: Shortcut for key="feature", value="CDS".
    :param select_start_codon: Shortcut for key="feature", \
    value="start_codon".
    :param bed_format: Write six tab-separated columns (seqid, start, end, \
    name, score, strand) instead of GTF.
    :param log: Report selection statistics through ``message``.
    :param separator: Joins the ``names`` values into the name column.
    :param names: Comma-separated keys used to build the name column.
    """

    # ----------------------------------------------------------------------
    # Check mode
    # ----------------------------------------------------------------------

    # The first enabled shortcut wins; the order reproduces the precedence
    # transcripts > CDS > start_codon > genes > exons.
    feature_shortcuts = (
        (select_transcripts, "transcript"),
        (select_cds, "CDS"),
        (select_start_codon, "start_codon"),
        (select_genes, "gene"),
        (select_exons, "exon"),
    )

    for enabled, feature in feature_shortcuts:
        if enabled:
            key = "feature"
            value = feature
            break
    else:
        # No shortcut requested: validate the explicit key/value arguments.
        if file_with_values is None:
            if key is None or value is None:
                message(
                    "Key and value are mandatory. Alternatively use -e/t/g/f or -f with -k.",
                    type="ERROR")
        else:
            if key is None:
                message("Please set -k.", type="ERROR")
            if value is not None:
                message("The -f and -v arguments are mutually exclusive.",
                        type="ERROR")

    # ----------------------------------------------------------------------
    # Load file with value
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    # Must be collected before the selection since 'gtf' is rebound below.
    all_values = gtf.extract_data(key, as_list=True, no_na=True, nr=True)

    if log:
        feat_before = len(gtf)

    if not file_with_values:
        value_list = value.split(",")
        gtf = gtf.select_by_key(key, value, invert_match)
    else:
        value_list = []

        for line in file_with_values:
            # Drop the line terminator: otherwise a value taken from the
            # last column carries a trailing newline and never matches the
            # values found in the GTF when computing the log report.
            cols = line.rstrip("\r\n").split("\t")
            value_list.append(cols[col - 1])

        # The file has been consumed above: re-open it so that the
        # selection starts reading from the beginning.
        file_with_values.close()
        file_with_values = open(file_with_values.name)

        gtf = gtf.select_by_key(key=key,
                                invert_match=invert_match,
                                file_with_values=file_with_values,
                                col=col)

    if log:
        # Sorted so that the report is reproducible from run to run.
        not_found = sorted(set(value_list) - set(all_values))
        feat_after = len(gtf)

        # An empty input would otherwise trigger a ZeroDivisionError.
        pct = feat_after / feat_before * 100 if feat_before else 0.0

        message("Number of features before selection: %d" % feat_before)
        message("Fraction of feature selected: %.2f%%" % pct)

        if not_found:
            nfj = ",".join(not_found)
            # Only the first 50 characters are displayed.
            etc = "..." if len(nfj) > 50 else ""
            message("Values not found: [" + nfj[:50] + etc + "].")
        else:
            message("Values not found: [].")

    # ----------------------------------------------------------------------
    # Write GTF file
    # ----------------------------------------------------------------------

    if not bed_format:
        gtf.write(outputfile, gc_off=True)
    else:
        nb_tokens = len(names.split(","))
        keys = "seqid,start,end," + names + ",score,strand"
        nb_fields = len(keys.split(","))

        for i in gtf.extract_data_iter_list(keys, zero_based=True):
            outputfile.write("\t".join([
                i[0],
                i[1],
                i[2],
                # All requested 'names' values are merged into one column.
                separator.join(i[3:(3 + nb_tokens)]),
                i[nb_fields - 2],
                i[nb_fields - 1],
            ]) + "\n")

    close_properly(outputfile, inputfile)
def splicing_site(inputfile=None,
                  outputfile=None,
                  exon_numbering_key=False,
                  names="exon_id,transcript_id,gene_id",
                  separator="\t"):
    """
    Compute the locations of splice donor and acceptor sites. You may extend
    them in 3' and 5' depending on your needs.

    One six-column, tab-separated line is written per site: seqid, start,
    end, name, exon number and strand. The name column is the site type
    ("donor" or "acceptor") followed by the values of ``names``, joined
    with ``separator``.

    :param inputfile: Input handed to the ``GTF`` constructor.
    :param outputfile: A writable file object receiving the sites.
    :param exon_numbering_key: Name of the key holding the exon number; \
    it must be set to a non-empty string.
    :param names: Comma-separated keys appended to the name column.
    :param separator: Joins the site type and the ``names`` values.
    """

    # The key is concatenated to the list of requested columns below: the
    # default (False) would otherwise raise an obscure TypeError.
    if not exon_numbering_key:
        message("Please set --exon-numbering-key to the proper key.",
                type="ERROR")

    gtf = GTF(inputfile)

    # Indexed below by transcript_id to get the exon count of a transcript.
    nb_exons = gtf.nb_exons()

    info = "feature,seqid,start,end,transcript_id," + exon_numbering_key
    info += ",strand," + names

    for rec in gtf.extract_data_iter_list(info):

        feature, seqid, start, end, tx_id, exon_nb, strand = rec[:7]

        if feature != "exon":
            continue

        if exon_nb == ".":
            message(
                "Some exon lines do not contain any numbering. "
                "Use add_exon_nb or set --exon-numbering-key to the proper key.",
                type="ERROR")

        # Site found after the exon end / before the exon start, depending
        # on the transcript orientation. Other strands produce no output.
        if strand == "+":
            end_label, start_label = "donor", "acceptor"
        elif strand == "-":
            end_label, start_label = "acceptor", "donor"
        else:
            continue

        rank = int(exon_nb)
        total = nb_exons[tx_id]

        if strand == "+":
            # No donor after the last exon, no acceptor before the first.
            has_end_site, has_start_site = rank < total, rank > 1
        else:
            # On the minus strand, exon 1 is the one with nothing after
            # its end coordinate.
            has_end_site, has_start_site = rank > 1, rank < total

        extra = rec[7:]

        if has_end_site:
            out = [seqid,
                   end,
                   str(int(end) + 1),
                   separator.join([end_label] + extra),
                   exon_nb,
                   strand]
            outputfile.write("\t".join(out) + "\n")

        if has_start_site:
            out = [seqid,
                   str(int(start) - 2),
                   str(int(start) - 1),
                   separator.join([start_label] + extra),
                   exon_nb,
                   strand]
            outputfile.write("\t".join(out) + "\n")

    # NOTE(review): the garbage collector is switched off right before
    # closing, presumably to speed up interpreter exit -- confirm.
    gc.disable()
    close_properly(outputfile, inputfile)