def main(): """Entry point.""" args = parse_args() source = open(args.bed_source, "r") if args.bed_source != "stdin" else sys.stdin two_bit_data = TwoBitFile(get_2bit_path(args.db)) # so let's read input for num, line in enumerate(source): bed_info = line[:-1].split("\t") # parse bed info chrom = bed_info[0] chrom_seq = two_bit_data[chrom] gene_seq = "" chromStart = int(bed_info[1]) # chromEnd = int(bed_info[2]) name = bed_info[3] # gene_name usually # bed_score = int(bed_info[4]) # never used # strand = bed_info[5] # otherwise: strand = True if bed_info[5] == '+' else False thickStart = int(bed_info[6]) thickEnd = int(bed_info[7]) # itemRgb = bed_info[8] # never used blockCount = int(bed_info[9]) blockSizes = [int(x) for x in bed_info[10].split(',') if x != ''] blockStarts = [int(x) for x in bed_info[11].split(',') if x != ''] # not-in-file info blockEnds = [blockStarts[i] + blockSizes[i] for i in range(blockCount)] blockAbsStarts = [blockStarts[i] + chromStart for i in range(blockCount)] blockAbsEnds = [blockEnds[i] + chromStart for i in range(blockCount)] # block-by-block for block_num in range(blockCount): if not args.utr: blockStart = blockAbsStarts[block_num] blockEnd = blockAbsEnds[block_num] # skip the block if it is entirely UTR if blockEnd <= thickStart: continue elif blockStart >= thickEnd: continue blockNewStart = blockStart if blockStart >= thickStart else thickStart blockNewEnd = blockEnd if blockEnd <= thickEnd else thickEnd exon_seq = chrom_seq[blockNewStart: blockNewEnd].upper() else: exon_seq = chrom_seq[blockAbsStarts[block_num]: blockAbsEnds[block_num]] gene_seq += exon_seq if len(gene_seq) == 0: continue gene_seq = gene_seq if strand else revert_compl(gene_seq) sys.stdout.write(">{}\n{}\n".format(name, gene_seq)) source.close() if args.bed_source != "stdin" else None sys.exit(0)
def main():
    (options, args) = _get_args()
    twoBitFile = TwoBitFile(options.twobit)
    pileup = Pileup(region=_parse_range(options.range))
    pileup.addTrack(ScaleTrack(name="Scale"))
    pileup.addTrack(LocationTrack(name="Location"))
    pileup.addTrack(ReferenceTrack(twoBitFile, name="Reference"))
    for filename in args:
        if filename.endswith('.vcf'):
            pileup.addTrack(DivTrack(divider='-', name="Div1"))
            vcf_reader = vcf.Reader(open(filename, 'r'))
            variants = list(vcf_reader)
            pileup.addTrack(VCFTrack(variants=variants, name="Variants"))
            pileup.addTrack(DivTrack(divider='~', name="Div2"))
    pileup.addTrack(DivTrack(divider='.', name="Div3"))
    pileup.render()
def writeGuideRow(db, guideSeq, otRows, ofh):
    " write a guide row, with all off-targets merged into a single field "
    otRows.sort(key=operator.itemgetter(2), reverse=True)

    filtOtRows = []
    mismCounts = [0] * 5
    for row in otRows:
        # format of row is: chrom;start;score;pam;diffString;annotation
        otSeq = row[4]
        #mismString, mismCount = showMism(guideSeq, row[4][:20])
        #row[4] = compressAln(mismString)
        mismCount = countMm(guideSeq, row[4][:20])
        if mismCount <= 4:
            # very very rare >4: only when there are Ns in the off-target seq
            mismCounts[mismCount] += 1
            #if mismCount >= 4:
                #continue  # just show count, don't store locations of off-targets with 4 mismatches
            #row[2] = "%0.2f" % row[2]
            chrom, start, score, pam, diffString, annot = row
            start = int(start) - 1  # aargh!! crispor is 1-based!
            row = [chrom, start, score, otSeq]
            #otStrings.append(";".join(row))
            filtOtRows.append(row)

    # need to determine strand. idiotic bug: old version of crispor didn't give me the strand.
    # get seqs, but in chrom order to get better speed
    otCoords = []
    for row in filtOtRows:
        chrom, start, score, otSeq = row
        otCoords.append((chrom, start, otSeq))
    otCoords.sort()

    # write to bed
    #tmpFh = tempfile.NamedTemporaryFile(dir="/dev/shm", prefix="max-crisprTrack")
    #for r in bedRows:
        #r = [str(x) for x in r]
        #tmpFh.write("%s\n" % ("\t".join(r)))
    #tmpFh.flush()

    twoBitFname = '/scratch/data/%s/%s.2bit' % (db, db)
    if not isfile(twoBitFname):
        # can happen these days, says Hiram
        twoBitFname = '/gbdb/%s/%s.2bit' % (db, db)
    if not isfile(twoBitFname):
        # can happen these days, says Hiram
        twoBitFname = '/cluster/data/%s/%s.2bit' % (db, db)
    genome = TwoBitFile(twoBitFname)

    # get sequences
    strands = {}
    for otRow in otCoords:
        chrom, start, otSeq = otRow
        twoBitChrom = genome[chrom]
        # two possible sequences, depending on strand
        forwSeq = twoBitChrom[start:start + 23].upper()
        revSeq = revComp(forwSeq).upper()
        #print guideSeq, otSeq, forwSeq, revSeq
        # for palindromes, we can't decide, default to +
        if otSeq == forwSeq:
            strand = "+"
        elif otSeq == revSeq:
            strand = "-"
        else:
            assert False
        strands[(chrom, start)] = strand

    # now add the strand to the features
    otStrings = []
    for row in filtOtRows:
        chrom, start, score, otSeq = row
        scoreStr = str(int(score * 100))
        #if scoreStr[:2]=="0.":
            #scoreStr = scoreStr[1:]
        row = (chrom, str(start) + strands[(chrom, start)], scoreStr)
        otStrings.append(";".join(row))

    mismCounts = [str(x) for x in mismCounts]
    otField = "|".join(otStrings)
    # mysql has trouble with very long blobs, and we can also save a lot of space by
    # ignoring too repetitive sequences
    if len(otField) > 5000:
        otField = ""
    row = (guideSeq, ",".join(mismCounts), otField)
    ofh.write("\t".join(row))
    ofh.write("\n")
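
# The strand test above compares the off-target sequence against the forward
# genomic slice and its reverse complement. A self-contained sketch of that
# test; revComp here is an assumption standing in for the script's helper:
#
#     _COMPL = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
#
#     def revComp(seq):
#         return ''.join(_COMPL[b] for b in reversed(seq))
#
#     def strand_of(otSeq, forwSeq):
#         # '+' or '-' depending on which genomic strand matches otSeq
#         if otSeq == forwSeq.upper():
#             return '+'
#         if otSeq == revComp(forwSeq.upper()):
#             return '-'
#         raise ValueError("off-target matches neither strand")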
transposaseMotifs["SB"] = "TA" transposaseMotifs["HelR"] = "AT" parser = argparse.ArgumentParser() parser.add_argument("-t", "--transposase", type=str, required=True, choices=["PB", "SB", "HelR"]) parser.add_argument("-f", "--filter", action="store_true") parser.add_argument("input", type=str) parser.add_argument("reference", type=str) parser.add_argument("output", type=str) args = parser.parse_args() ref = TwoBitFile(args.reference) # This is to cache the most recent 1024 genomic loci looked up. This saves # overhead by not having to needlessly query the reference @lru_cache(maxsize=1024) def fetchGenomicSequence(chromosome, start, end): return ref[chromosome][start:end].upper() # This function returns a function specifically tailored to the transposase # specified by the user. This practice is known as "currying" and prevents # having to re-initialize the insertSiteLength parameter during each iteration def makeInsertionSiteFunction(transposase): insertSiteLength = len(transposaseMotifs[transposase])
def get_spectra_from_maf(maf: pd.DataFrame,
                         hgfile: Union[str, None] = None,
                         cosmic: str = 'cosmic2',
                         real_snps: bool = False):
    """
    Attaches context categories to maf and gets counts of contexts for each sample
    ---------------------------
    Args:
        * maf: Pandas DataFrame of maf
        * hgfile: path to 2bit genome build file for computing reference context
        * cosmic: cosmic signatures to decompose to

    Returns:
        * Pandas DataFrame of maf with context category attached
        * Pandas DataFrame of counts with samples as columns and context as rows
    """
    maf = maf.copy()

    if 'Start_Position' in list(maf):
        maf = maf.rename(columns={'Start_Position': 'Start_position'})

    maf['sample'] = maf['Tumor_Sample_Barcode']

    if cosmic in ['cosmic2', 'cosmic3', 'cosmic3_exome']:
        # Subset to SNPs
        if 'Variant_Type' in maf.columns:
            maf = maf.loc[maf['Variant_Type'] == 'SNP']
        else:
            maf = maf.loc[maf['Reference_Allele'].apply(lambda k: len(k) == 1 and k != '-') &
                          maf['Tumor_Seq_Allele2'].apply(lambda k: len(k) == 1 and k != '-')]
        if not real_snps:
            maf = get_true_snps_from_maf(maf)

        ref = maf['Reference_Allele'].str.upper()
        alt = maf['Tumor_Seq_Allele2'].str.upper()

        if 'ref_context' in list(maf):
            context = maf['ref_context'].str.upper()
        else:
            assert hgfile is not None, 'Please provide genome build file.'
            try:
                hg = TwoBitFile(hgfile)
            except Exception:
                raise Exception("{} not a valid 2bit file.".format(hgfile))

            # Map contexts
            _contexts = list()
            maf_size = maf.shape[0]
            for idx, (pos, chromosome) in enumerate(
                    zip(maf["Start_position"].astype(int),
                        maf["Chromosome"].astype(str))):
                stdout.write("\r * Mapping contexts: {} / {}".format(idx, maf_size))

                # Double check version
                if chromosome == '23':
                    chromosome = 'X'
                elif chromosome == '24':
                    chromosome = 'Y'
                elif chromosome == 'MT':
                    chromosome = 'M'
                if not chromosome.startswith('chr'):
                    chromosome = 'chr' + chromosome

                _contexts.append(hg[chromosome][pos - 2:pos + 1].lower())

            maf['ref_context'] = _contexts
            stdout.write("\n")
            context = maf['ref_context'].str.upper()

        n_context = context.str.len()
        mid = n_context // 2

        contig = pd.Series([r + a + c[m - 1] + c[m + 1] if r in 'AC'
                            else compl(r + a + c[m + 1] + c[m - 1])
                            for r, a, c, m in zip(ref, alt, context, mid)],
                           index=maf.index)

        try:
            maf['context96.num'] = contig.apply(context96.__getitem__)
        except KeyError as e:
            raise KeyError('Unusual context: ' + str(e))

        maf['context96.word'] = contig
        spectra = maf.groupby(['context96.word', 'sample']).size().unstack().fillna(0).astype(int)
        for c in context96:
            if c not in spectra.index:
                spectra.loc[c] = 0
        spectra = spectra.loc[context96]

    elif cosmic == 'cosmic3_DBS':
        # Subset to DNPs
        if 'Variant_Type' not in maf.columns:
            ref_alt = maf['Reference_Allele'] + '>' + maf['Tumor_Seq_Allele2']

            def get_variant_type(ra):
                r, a = ra.split('>')
                if len(r) == 1 and r != '-' and len(a) == 1 and a != '-':
                    return 'SNP'
                if len(r) == 2 and len(a) == 2:
                    return 'DNP'

            maf['Variant_Type'] = ref_alt.apply(get_variant_type)

        # membership test must be on the values, not the Series index
        if 'DNP' in maf['Variant_Type'].values:
            maf = maf.loc[maf['Variant_Type'] == 'DNP']
        else:
            maf = get_dnps_from_maf(maf)

        ref = maf['Reference_Allele'].str.upper()
        alt = maf['Tumor_Seq_Allele2'].str.upper()

        contig = pd.Series([r + '>' + a if r + '>' + a in context78
                            else compl(r, reverse=True) + '>' + compl(a, reverse=True)
                            for r, a in zip(ref, alt)],
                           index=maf.index)

        try:
            maf['context78.num'] = contig.apply(context78.__getitem__)
        except KeyError as e:
            raise KeyError('Unusual context: ' + str(e))

        maf['context78.word'] = contig
        spectra = maf.groupby(['context78.word', 'sample']).size().unstack().fillna(0).astype(int)
        for c in context78:
            if c not in spectra.index:
                spectra.loc[c] = 0
        spectra = spectra.loc[context78]

    elif cosmic == 'cosmic3_ID':
        maf = maf.loc[(maf['Reference_Allele'] == '-') ^ (maf['Tumor_Seq_Allele2'] == '-')]
        ref = maf['Reference_Allele'].str.upper()
        alt = maf['Tumor_Seq_Allele2'].str.upper()

        assert hgfile is not None, 'Please provide genome build file.'
        try:
            hg = TwoBitFile(hgfile)
        except Exception:
            raise Exception("{} not a valid 2bit file.".format(hgfile))

        # Map contexts
        contig = list()
        maf_size = maf.shape[0]
        for idx, (pos, chromosome, r, a) in enumerate(
                zip(maf["Start_position"].astype(int),
                    maf["Chromosome"].astype(str), ref, alt)):
            stdout.write("\r * Mapping contexts: {} / {}".format(idx, maf_size))

            # Double check version
            if chromosome == '23':
                chromosome = 'X'
            elif chromosome == '24':
                chromosome = 'Y'
            elif chromosome == 'MT':
                chromosome = 'M'
            if not chromosome.startswith('chr'):
                chromosome = 'chr' + chromosome

            if a == '-':
                # deletion: count repeats of the deleted unit downstream,
                # then look for microhomology on both sides
                del_len = len(r)
                _context = hg[chromosome][pos - 1 + del_len:pos - 1 + del_len * 6].upper()
                _context_list = [_context[n:n + del_len]
                                 for n in range(0, 5 * del_len, del_len)]
                n_repeats = 1
                for c in _context_list:
                    if c == r:
                        n_repeats += 1
                    else:
                        break
                microhomology = 0
                if n_repeats == 1:
                    for b1, b2 in zip(r, _context_list[0]):
                        if b1 == b2:
                            microhomology += 1
                        else:
                            break
                    prev_context = hg[chromosome][pos - 1 - del_len:pos - 1].upper()
                    for b1, b2 in zip(reversed(r), reversed(prev_context)):
                        if b1 == b2:
                            microhomology += 1
                        else:
                            break
                if del_len == 1:
                    pre = 'C' if r in 'CG' else 'T'
                elif del_len >= 5:
                    pre = '5+'
                else:
                    pre = str(del_len)
                if microhomology >= 5:
                    post = 'm5+'
                elif microhomology:
                    post = 'm' + str(microhomology)
                elif n_repeats == 6:
                    post = '6+'
                else:
                    post = str(n_repeats)
                contig.append(pre + 'del' + post)

            elif r == '-':
                # insertion: count repeats of the inserted unit downstream
                ins_len = len(a)
                _context = hg[chromosome][pos:pos + ins_len * 5].upper()
                _context_list = [_context[n:n + ins_len]
                                 for n in range(0, 5 * ins_len, ins_len)]
                n_repeats = 0
                for c in _context_list:
                    if c == a:
                        n_repeats += 1
                    else:
                        break
                if ins_len == 1:
                    pre = 'C' if a in 'CG' else 'T'
                elif ins_len >= 5:
                    pre = '5+'
                else:
                    pre = str(ins_len)
                if n_repeats == 5:
                    post = '5+'
                else:
                    post = str(n_repeats)
                contig.append(pre + 'ins' + post)

        maf['context83.word'] = contig
        try:
            maf['context83.num'] = maf['context83.word'].apply(context83.__getitem__)
        except KeyError as e:
            raise KeyError('Unusual context: ' + str(e))

        spectra = maf.groupby(['context83.word', 'sample']).size().unstack().fillna(0).astype(int)
        for c in context83:
            if c not in spectra.index:
                spectra.loc[c] = 0
        spectra = spectra.loc[context83]
        stdout.write("\n")

    else:
        raise NotImplementedError()

    return maf, spectra
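
# The SNP branch above slices hg[chromosome][pos - 2:pos + 1] because MAF
# Start_position is 1-based while 2bit slices are 0-based and half-open.
# A minimal sketch of that lookup, assuming twobitreader semantics:
#
#     from twobitreader import TwoBitFile
#
#     def trinucleotide_context(hg, chromosome, pos):
#         # bases (pos-1, pos, pos+1) in 1-based coordinates around a SNP
#         if not chromosome.startswith('chr'):
#             chromosome = 'chr' + chromosome
#         # 1-based pos -> 0-based index pos - 1; one base either side
#         return hg[chromosome][pos - 2:pos + 1].upper()
#
#     # hg = TwoBitFile("hg19.2bit")  # hypothetical path
#     # print(trinucleotide_context(hg, "17", 7577120))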
                  dest='stats',
                  help="write orf stats to this csv file")
parser.add_option('-o', '--outfile',
                  dest='outfile',
                  help="write output to this file (default: stdout)")
parser.add_option('', '--minlength',
                  dest='minlength',
                  default=12,
                  help="""minimum ORF length (in nt, including stop) [12]""")

options, args = parser.parse_args()

print >> sys.stderr, 'reading genome from ' + options.genome
genome = TwoBitFile(options.genome)

print >> sys.stderr, 'reading bed file ' + options.bed

orfs = []
hash_values = []
orf_types = []
orf_length = []
utr5_length = []
utr3_length = []


def match_type(sig):
    if sig == 'annotated':
        return 'annotated'
    elif sig == 'utr5:cds:0':
        return "N-ext"
def build_index(args, unknown_args):
    from pyfaidx import Fasta
    from twobitreader import TwoBitFile
    import gffutils
    import gffutils.merge_criteria as mc
    import atexit
    import shutil
    from tqdm import tqdm
    from collections import defaultdict
    from pprint import pprint
    from tempfile import NamedTemporaryFile
    from urllib.parse import urlparse
    from urllib.request import urlopen
    from shutil import copyfileobj
    from subprocess import Popen, PIPE, call

    if args.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%m-%d %H:%M')

    config = args.conf
    logging.info("PISCES version %s", __version__)
    for species, datasets in list(config.items()):
        indices = datasets["index"]
        download_dir = datasets["downloads"]
        index_dir_base = datasets["index_dir"]
        if not os.path.exists(download_dir):
            os.makedirs(download_dir)
        for index_name, dataset in list(indices.items()):
            if args.indices and index_name not in args.indices:
                continue
            pprint(dataset, indent=4)
            options = defaultdict(lambda: True)
            options.update(dataset["options"])
            index_dir_path = os.path.join(index_dir_base, species, index_name)
            if os.path.exists(index_dir_path):
                if args.overwrite:
                    logging.warn(
                        "index directory %s already exists! overwriting",
                        index_dir_path)
                    shutil.rmtree(os.path.join(index_dir_path, "transcripts"))
                    shutil.rmtree(os.path.join(index_dir_path, "salmon"))
                else:
                    continue
            os.makedirs(os.path.join(index_dir_path, "transcripts"))
            os.makedirs(os.path.join(index_dir_path, "salmon"))
            transcripts_fasta_file = os.path.join(
                index_dir_path, "transcripts", "transcripts.fa")
            with open(transcripts_fasta_file, 'w') as transcripts_fasta:
                ## all of this URI handling should probably use an existing library like
                ## https://github.com/intake/filesystem_spec
                for fasta_loc in dataset["extra_fastas"]:
                    fasta = urlparse(fasta_loc)
                    if fasta.scheme == '':
                        reference = Fasta(fasta.path)
                    elif fasta.scheme.lower() in ('ftp', 'http', 'https'):
                        _fasta_local_path = os.path.join(
                            download_dir, os.path.basename(fasta.path))
                        logging.info("Downloading %s", fasta.geturl())
                        if not os.path.exists(_fasta_local_path):
                            with urlopen(fasta.geturl()) as _fasta:
                                with open(_fasta_local_path, 'wb') as _fasta_local:
                                    copyfileobj(_fasta, _fasta_local)
                                    _fasta_local.flush()
                        if fasta.path.endswith('gz'):
                            logging.info("Decompressing %s", fasta.geturl())
                            call(' '.join(['gzip -dc', _fasta_local_path, '>',
                                           _fasta_local_path.replace(".gz", "")]),
                                 shell=True)
                        if _fasta_local_path.endswith("2bit"):
                            logging.info("Converting %s to FASTA format",
                                         fasta.geturl())
                            twobit = TwoBitFile(_fasta_local_path)
                            if not os.path.exists(
                                    _fasta_local_path.replace("2bit", "fa")):
                                with open(_fasta_local_path.replace("2bit", "fa"),
                                          'w') as fasta:
                                    for chrom in twobit.keys():
                                        fasta.write(">%s\n" % chrom)
                                        fasta.write(str(twobit[chrom]) + '\n')
                            reference = Fasta(
                                _fasta_local_path.replace("2bit", "fa"))
                    with open(_fasta_local_path) as extra:
                        logging.info("Adding entries from %s", fasta)
                        for line in extra:
                            transcripts_fasta.write(line)
                for gtf_loc, fasta_loc in zip(dataset["gtfs"], dataset["fastas"]):
                    gtf = urlparse(gtf_loc)
                    fasta = urlparse(fasta_loc)
                    assembly = os.path.basename(fasta.path)
                    if fasta.scheme == '':
                        reference = Fasta(fasta.path)
                    elif fasta.scheme.lower() in ('ftp', 'http', 'https'):
                        _fasta_local_path = os.path.join(
                            download_dir, os.path.basename(fasta.path))
                        logging.info("Downloading %s", fasta.geturl())
                        if not os.path.exists(_fasta_local_path):
                            with urlopen(fasta.geturl()) as _fasta:
                                with open(_fasta_local_path, 'wb') as _fasta_local:
                                    copyfileobj(_fasta, _fasta_local)
                                    _fasta_local.flush()
                        if fasta.path.endswith('gz'):
                            logging.info("Decompressing %s", fasta.geturl())
                            call(' '.join(['gzip -dc', _fasta_local_path, '>',
                                           _fasta_local_path.replace(".gz", "")]),
                                 shell=True)
                        if _fasta_local_path.endswith("2bit"):
                            logging.info("Converting %s to FASTA format",
                                         fasta.geturl())
                            twobit = TwoBitFile(_fasta_local_path)
                            if not os.path.exists(
                                    _fasta_local_path.replace("2bit", "fa")):
                                with open(_fasta_local_path.replace("2bit", "fa"),
                                          'w') as fasta:
                                    for chrom in twobit.keys():
                                        fasta.write(">%s\n" % chrom)
                                        fasta.write(str(twobit[chrom]) + '\n')
                            reference = Fasta(
                                _fasta_local_path.replace("2bit", "fa"))
                        elif fasta.path.endswith('gz'):
                            reference = Fasta(_fasta_local_path.replace(".gz", ""))
                        else:
                            reference = Fasta(_fasta_local_path)
                    if gtf.scheme == '':
                        database_filename = gtf.path + '.db'
                        if os.path.exists(database_filename):
                            logging.info("Loading existing GTF database file.")
                            db = gffutils.FeatureDB(database_filename)
                        else:
                            logging.info(
                                "Creating GTF database file. This will take some time...")
                            try:
                                db = gffutils.create_db(
                                    gtf.path, database_filename,
                                    disable_infer_genes=not options["infer_features"],
                                    disable_infer_transcripts=not options["infer_features"])
                            except Exception:
                                tmp_db = os.path.join(
                                    download_dir, os.path.basename(gtf.path) + '.db')
                                logging.info("Unable to create %s, so using %s",
                                             database_filename, tmp_db)
                                if os.path.exists(tmp_db):
                                    logging.info("Loading existing GTF database file.")
                                    db = gffutils.FeatureDB(tmp_db)
                                else:
                                    db = gffutils.create_db(
                                        gtf.path, tmp_db,
                                        disable_infer_genes=not options["infer_features"],
                                        disable_infer_transcripts=not options["infer_features"])
                    elif gtf.scheme.lower() in ('ftp', 'http', 'https'):
                        _gtf_local_path = os.path.join(
                            download_dir, os.path.basename(gtf.path))
                        logging.info("Downloading %s", gtf.geturl())
                        if not os.path.exists(_gtf_local_path):
                            with urlopen(gtf.geturl()) as _gtf:
                                with open(_gtf_local_path, 'wb') as _gtf_local:
                                    copyfileobj(_gtf, _gtf_local)
                                    _gtf_local.flush()
                            if gtf.path.endswith('gz'):
                                logging.info("Decompressing %s", gtf.geturl())
                                call(' '.join(['gzip -dc', _gtf_local_path, '>',
                                               _gtf_local_path.replace(".gz", "")]),
                                     shell=True)
                                logging.info(
                                    "Creating GTF database file. This will take some time...")
                                db = gffutils.create_db(
                                    _gtf_local_path.replace(".gz", ""),
                                    _gtf_local_path.replace(".gz", "") + '.db',
                                    disable_infer_genes=not options["infer_features"],
                                    disable_infer_transcripts=not options["infer_features"])
                            else:
                                logging.info(
                                    "Creating GTF database file. This will take some time...")
                                db = gffutils.create_db(
                                    _gtf_local_path,
                                    _gtf_local_path + '.db',
                                    disable_infer_genes=not options["infer_features"],
                                    disable_infer_transcripts=not options["infer_features"])
                        elif gtf.path.endswith('gz'):
                            logging.info("Loading existing GTF database file.")
                            db = gffutils.FeatureDB(
                                _gtf_local_path.replace(".gz", "") + '.db')
                        else:
                            logging.info("Loading existing GTF database file.")
                            db = gffutils.FeatureDB(_gtf_local_path)
                    # https://github.com/daler/gffutils/issues/56
                    db.execute('ANALYZE features')
                    #if db.count_features_of_type('intron') == 0 and options["unprocessed_transcripts"]:
                        #logging.info("Inferring intronic sequences...")
                        #db.update(db.create_introns())
                    soft_chars = set(('a', 'c', 'g', 't'))
                    if not options["-k"]:
                        k = 31
                    else:
                        k = options["-k"]
                    gene_tx_file = os.path.join(
                        index_dir_path, assembly + "_transcripts_to_genes.txt")
                    gene_annotation = os.path.join(
                        index_dir_path, assembly + "_gene_annotation.txt")

                    def features_to_string(features, fasta_in, masked=True, strand=True):
                        """Concatenate exon sequences; return (seq, fraction soft-masked)."""
                        sequences = []
                        feature_strand = "."
                        for feature in features:
                            feature_strand = feature.strand
                            sequences.append(
                                feature.sequence(fasta_in, use_strand=strand))
                        # if the transcript is on the reverse strand, reverse order
                        # of exons before concatenating
                        if feature_strand == "-":
                            sequences = sequences[::-1]
                        seq = ''.join(sequences)
                        mask_count = sum(seq.count(a) for a in soft_chars)
                        if masked:
                            if mask_count > 0:
                                seq = seq.replace('a', 'N').replace('t', 'N') \
                                         .replace('c', 'N').replace('g', 'N')
                        try:
                            frac_masked = mask_count / len(seq)
                        except ZeroDivisionError:
                            frac_masked = 0
                        return (seq, frac_masked)

                    with open(gene_tx_file, 'w') as gene2tx, \
                            open(gene_annotation, 'w') as annotation:
                        logging.info(
                            "Making transcripts_to_genes, annotation and FASTA file for %s",
                            gtf.path)
                        with tqdm(total=db.count_features_of_type('gene'),
                                  unit='gene') as pbar:
                            for gene in db.features_of_type('gene'):
                                first_exon = next(db.children(
                                    gene, featuretype='exon', order_by='start'))
                                try:
                                    if options["gene_type"] == True:
                                        type_tag = "gene_type"
                                    else:
                                        type_tag = options["gene_type"]
                                    gene_type = first_exon[type_tag][0]
                                except KeyError:
                                    logging.info("No gene type tag found for %s",
                                                 gene['gene_id'][0])
                                    gene_type = 'NA'
                                try:
                                    if options["gene_name"] == True:
                                        name_tag = "gene_name"
                                    else:
                                        name_tag = options["gene_name"]
                                    gene_name = first_exon[name_tag][0]
                                except KeyError:
                                    logging.info("No gene name tag found for %s",
                                                 gene['gene_id'][0])
                                    gene_name = 'NA'
                                transcripts = db.children(
                                    gene, featuretype='transcript', order_by='start')
                                for transcript in transcripts:
                                    # Write entry in the transcripts to genes table
                                    gene2tx.write("{txp}\t{gene}\n".format(
                                        gene=gene['gene_id'][0],
                                        txp=transcript['transcript_id'][0]))
                                    # Construct the transcript sequences and write
                                    # them to the FASTA
                                    fa_seq, frac_masked = features_to_string(
                                        db.children(transcript, featuretype='exon',
                                                    order_by='start'),
                                        reference, masked=options["masked"])
                                    transcripts_fasta.write(
                                        '>' + transcript['transcript_id'][0] + '\n')
                                    transcripts_fasta.write(fa_seq + '\n')
                                exons = db.children(
                                    gene, featuretype='exon', order_by='start')
                                merged_exons = db.merge(
                                    exons,
                                    merge_criteria=(mc.seqid, mc.feature_type,
                                                    mc.overlap_any_inclusive))
                                if options["unprocessed_transcripts"]:
                                    introns = db.interfeatures(
                                        merged_exons, new_featuretype='intron')
                                    transcripts_fasta.write(
                                        '>' + "intronic_" + gene['gene_id'][0] + '\n')
                                    fa_seq, _ = features_to_string(
                                        introns, reference, masked=options["masked"])
                                    transcripts_fasta.write(fa_seq + '\n')
                                annotation.write(
                                    "{gene}\t{type}\t{name}\t{chrom}\t{start}\t{stop}\t{length}\t{frac_masked}\n".format(
                                        gene=gene['gene_id'][0],
                                        type=gene_type,
                                        name=gene_name,
                                        start=gene.start,
                                        stop=gene.stop,
                                        chrom=gene.chrom,
                                        length=sum(len(exon) for exon in merged_exons),
                                        frac_masked=str(frac_masked)))
                                transcripts = db.children(
                                    gene, featuretype='transcript', order_by='start')
                                pbar.update(1)
                    if options["intergenes"]:
                        for seqid in reference.keys():
                            logging.info("Merging overlapping genes on %s", seqid)
                            merged_genes = db.merge(
                                db.region(seqid=seqid),
                                merge_criteria=(mc.seqid, mc.feature_type,
                                                mc.overlap_any_inclusive))
                            with tqdm(unit='intergene features') as pbar:
                                for intergene in db.interfeatures(
                                        merged_genes, new_featuretype='intergene'):
                                    transcripts_fasta.write(
                                        '>' + 'intergene_' + seqid + "_" +
                                        str(intergene.start) + ':' +
                                        str(intergene.end) + '\n')
                                    fa_seq, _ = features_to_string(
                                        [intergene], reference,
                                        masked=options["masked"], strand=False)
                                    transcripts_fasta.write(fa_seq + '\n')
                                    pbar.update(1)
            # This needs to happen outside of context handler so FASTA file can
            # be closed properly
            logging.info("Making salmon index files for %s",
                         species + '/' + index_name)
            cmd = [
                os.path.join(find_data_directory(), 'redist', 'salmon',
                             'bin', 'salmon'),
                'index', '-p', str(args.threads), '-k', str(k),
                '-t', transcripts_fasta.name,
                '-i', os.path.join(index_dir_path, "salmon")
            ]
            logging.debug(' '.join(cmd))
            p = Popen(cmd, stderr=PIPE)
            for line in p.stderr:
                line = line.decode()
                if line.endswith('\n'):
                    logging.info(line.rstrip())
                else:
                    logging.info(line)
def __main__():
    parser = argparse.ArgumentParser(description='Translate from BED')
    parser.add_argument('input_bed', default=None,
                        help="BED to translate, '-' for stdin")
    pg_seq = parser.add_argument_group('Genomic sequence source')
    pg_seq.add_argument('-t', '--twobit', default=None,
                        help='Genome reference sequence in 2bit format')
    pg_seq.add_argument('-c', '--column', type=int, default=None,
                        help='Column offset containing genomic sequence '
                             'between start and stop (-1) for last column')
    pg_out = parser.add_argument_group('Output options')
    pg_out.add_argument('-f', '--fasta', default=None,
                        help='Path to output translations.fasta')
    pg_out.add_argument('-b', '--bed', default=None,
                        help='Path to output translations.bed')
    pg_bed = parser.add_argument_group('BED filter options')
    pg_bed.add_argument('-E', '--ensembl', action='store_true', default=False,
                        help='Input BED is in 20 column Ensembl format')
    pg_bed.add_argument('-R', '--regions', action='append', default=[],
                        help='Filter input by regions e.g.:'
                             ' X,2:20000-25000,3:100-500+')
    pg_bed.add_argument('-B', '--biotypes', action='append', default=[],
                        help='For Ensembl BED restrict translations to Ensembl biotypes')
    pg_trans = parser.add_argument_group('Translation filter options')
    pg_trans.add_argument('-m', '--min_length', type=int, default=10,
                          help='Minimum length of protein translation to report')
    pg_trans.add_argument('-e', '--enzyme', default=None,
                          help='Digest translation with enzyme')
    pg_trans.add_argument('-M', '--start_codon', action='store_true', default=False,
                          help='Trim translations to methionine start_codon')
    pg_trans.add_argument('-C', '--cds', action='store_true', default=False,
                          help='Only translate CDS')
    pg_trans.add_argument('-A', '--all', action='store_true',
                          help='Include CDS protein translations')
    pg_fmt = parser.add_argument_group('ID format options')
    pg_fmt.add_argument('-r', '--reference', default='',
                        help='Genome Reference Name')
    pg_fmt.add_argument('-D', '--fa_db', dest='fa_db', default=None,
                        help='Prefix DB identifier for fasta ID line, e.g. generic')
    pg_fmt.add_argument('-s', '--fa_sep', dest='fa_sep', default='|',
                        help='fasta ID separator defaults to pipe char, '
                             'e.g. generic|ProtID|description')
    pg_fmt.add_argument('-P', '--id_prefix', default='',
                        help='prefix for the sequence ID')
    parser.add_argument('-v', '--verbose', action='store_true', help='Verbose')
    parser.add_argument('-d', '--debug', action='store_true', help='Debug')
    args = parser.parse_args()

    input_rdr = open(args.input_bed, 'r') \
        if args.input_bed != '-' else sys.stdin
    fa_wtr = open(args.fasta, 'w') \
        if args.fasta is not None and args.fasta != '-' else sys.stdout
    bed_wtr = open(args.bed, 'w') if args.bed is not None else None

    enzyme = digest.expasy_rules.get(args.enzyme, None)

    biotypea = [bt.strip() for biotype in args.biotypes
                for bt in biotype.split(',')]

    twobit = TwoBitFile(args.twobit) if args.twobit else None

    selected_regions = dict()  # chrom:(start, end)
    region_pat = r'^(?:chr)?([^:]+)(?::(\d*)(?:-(\d+)([+-])?)?)?'
    if args.regions:
        for entry in args.regions:
            if not entry:
                continue
            regs = [x.strip() for x in entry.split(',') if x.strip()]
            for reg in regs:
                m = re.match(region_pat, reg)
                if m:
                    (chrom, start, end, strand) = m.groups()
                    if chrom:
                        if chrom not in selected_regions:
                            selected_regions[chrom] = []
                        selected_regions[chrom].append([start, end, strand])
    if args.debug:
        print("selected_regions: %s" % selected_regions, file=sys.stderr)

    def filter_by_regions(bed):
        if not selected_regions:
            return True
        ref = re.sub('(?i)^chr', '', bed.chrom)
        if ref not in selected_regions:
            return False
        for reg in selected_regions[ref]:
            (_start, _stop, _strand) = reg
            start = int(_start) if _start else 0
            stop = int(_stop) if _stop else sys.maxsize
            if _strand and bed.strand != _strand:
                continue
            if bed.chromEnd >= start and bed.chromStart <= stop:
                return True
        return False

    translations = dict()  # start : end : seq

    def unique_prot(tbed, seq):
        if tbed.chromStart not in translations:
            translations[tbed.chromStart] = dict()
            translations[tbed.chromStart][tbed.chromEnd] = []
            translations[tbed.chromStart][tbed.chromEnd].append(seq)
        elif tbed.chromEnd not in translations[tbed.chromStart]:
            translations[tbed.chromStart][tbed.chromEnd] = []
            translations[tbed.chromStart][tbed.chromEnd].append(seq)
        elif seq not in translations[tbed.chromStart][tbed.chromEnd]:
            translations[tbed.chromStart][tbed.chromEnd].append(seq)
        else:
            return False
        return True

    def get_sequence(chrom, start, end):
        if twobit:
            if chrom in twobit and 0 <= start < end < len(twobit[chrom]):
                return twobit[chrom][start:end]
            contig = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom
            if contig in twobit and 0 <= start < end < len(twobit[contig]):
                return twobit[contig][start:end]
        return None

    def write_translation(tbed, accession, peptide):
        if args.id_prefix:
            tbed.name = "%s%s" % (args.id_prefix, tbed.name)
        probed = "%s\t%s\t%s\t%s%s" % (accession, peptide, 'unique',
                                       args.reference, '\t.' * 9)
        if bed_wtr:
            bed_wtr.write("%s\t%s\n" % (str(tbed), probed))
            bed_wtr.flush()
        location = "chromosome:%s:%s:%s:%s:%s" \
            % (args.reference, tbed.chrom,
               tbed.thickStart, tbed.thickEnd, tbed.strand)
        fa_desc = '%s%s' % (args.fa_sep, location)
        fa_db = '%s%s' % (args.fa_db, args.fa_sep) if args.fa_db else ''
        fa_id = ">%s%s%s\n" % (fa_db, tbed.name, fa_desc)
        fa_wtr.write(fa_id)
        fa_wtr.write(peptide)
        fa_wtr.write("\n")
        fa_wtr.flush()

    def translate_bed(bed):
        translate_count = 0
        transcript_id = bed.name
        refprot = None
        if not bed.seq:
            if twobit:
                bed.seq = get_sequence(bed.chrom, bed.chromStart, bed.chromEnd)
            else:
                bed.cdna = get_cdna(transcript_id)
        cdna = bed.get_cdna()
        if cdna is not None:
            cdna_len = len(cdna)
            if args.cds or args.all:
                try:
                    cds = bed.get_cds()
                    if cds:
                        if args.debug:
                            print("cdna:%s" % str(cdna), file=sys.stderr)
                            print("cds: %s" % str(cds), file=sys.stderr)
                        if len(cds) % 3 != 0:
                            cds = cds[:-(len(cds) % 3)]
                        refprot = translate(cds) if cds else None
                except Exception:
                    refprot = None
            if args.cds:
                if refprot:
                    tbed = bed.get_cds_bed()
                    if args.start_codon:
                        m = refprot.find('M')
                        if m < 0:
                            return 0
                        elif m > 0:
                            bed.trim_cds(m * 3)
                            refprot = refprot[m:]
                    stop = refprot.find('*')
                    if stop >= 0:
                        bed.trim_cds((stop - len(refprot)) * 3)
                        refprot = refprot[:stop]
                    if len(refprot) >= args.min_length:
                        write_translation(tbed, bed.name, refprot)
                        return 1
                return 0
            if args.debug:
                print("%s\n" % (str(bed)), file=sys.stderr)
                print("CDS: %s %d %d" % (bed.strand,
                                         bed.cdna_offset_of_pos(bed.thickStart),
                                         bed.cdna_offset_of_pos(bed.thickEnd)),
                      file=sys.stderr)
                print("refprot: %s" % str(refprot), file=sys.stderr)
            for offset in range(3):
                seqend = cdna_len - (cdna_len - offset) % 3
                aaseq = translate(cdna[offset:seqend])
                aa_start = 0
                while aa_start < len(aaseq):
                    aa_end = aaseq.find('*', aa_start)
                    if aa_end < 0:
                        aa_end = len(aaseq)
                    prot = aaseq[aa_start:aa_end]
                    if args.start_codon:
                        m = prot.find('M')
                        aa_start += m if m >= 0 else aa_end
                        prot = aaseq[aa_start:aa_end]
                    if enzyme and refprot:
                        frags = digest._cleave(prot, enzyme)
                        for frag in reversed(frags):
                            if frag in refprot:
                                prot = prot[:prot.rfind(frag)]
                            else:
                                break
                    is_cds = refprot and prot in refprot
                    if args.debug:
                        print("is_cds: %s %s" % (str(is_cds), str(prot)),
                              file=sys.stderr)
                    if len(prot) < args.min_length:
                        pass
                    elif not args.all and is_cds:
                        pass
                    else:
                        tstart = aa_start * 3 + offset
                        tend = aa_end * 3 + offset
                        prot_acc = "%s_%d_%d" % (transcript_id, tstart, tend)
                        tbed = bed.trim(tstart, tend)
                        if args.all or unique_prot(tbed, prot):
                            translate_count += 1
                            tbed.name = prot_acc
                            write_translation(tbed, bed.name, prot)
                    aa_start = aa_end + 1
        return translate_count

    if input_rdr:
        translation_count = 0
        transcript_count = 0
        for i, bedline in enumerate(input_rdr):
            try:
                bed = bed_from_line(bedline, ensembl=args.ensembl,
                                    seq_column=args.column)
                if bed is None:
                    continue
                transcript_count += 1
                if bed.biotype and biotypea and bed.biotype not in biotypea:
                    continue
                if filter_by_regions(bed):
                    translation_count += translate_bed(bed)
            except Exception as e:
                print("BED format Error: line %d: %s\n%s" % (i, bedline, e),
                      file=sys.stderr)
                break
        if args.debug or args.verbose:
            print("transcripts: %d\ttranslations: %d"
                  % (transcript_count, translation_count), file=sys.stderr)
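
# translate_bed above scans all three forward frames and splits each frame's
# translation on stop codons. A compact, self-contained sketch of that frame
# loop; `translate` is a stand-in for the tool's own codon helper:
#
#     def three_frame_orfs(cdna, translate, min_length=10):
#         # yield (frame, aa_start, aa_end, peptide) for stop-delimited ORFs
#         for offset in range(3):
#             # trim so the translated slice is a whole number of codons
#             seqend = len(cdna) - (len(cdna) - offset) % 3
#             aaseq = translate(cdna[offset:seqend])
#             aa_start = 0
#             while aa_start < len(aaseq):
#                 aa_end = aaseq.find('*', aa_start)
#                 if aa_end < 0:
#                     aa_end = len(aaseq)
#                 peptide = aaseq[aa_start:aa_end]
#                 if len(peptide) >= min_length:
#                     yield offset, aa_start, aa_end, peptide
#                 aa_start = aa_end + 1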
def __main__():
    parser = argparse.ArgumentParser(
        description='Retrieve Ensembl cDNAs and three frame translate')
    parser.add_argument('-s', '--species', default='human',
                        help='Ensembl Species to retrieve')
    parser.add_argument('-R', '--regions', action='append', default=[],
                        help='Restrict Ensembl retrieval to regions e.g.:'
                             ' X,2:20000-25000,3:100-500+')
    parser.add_argument('-B', '--biotypes', action='append', default=[],
                        help='Restrict Ensembl biotypes to retrieve')
    parser.add_argument('-i', '--input', default=None,
                        help='Use BED instead of retrieving cDNA from ensembl (-) for stdin')
    parser.add_argument('-T', '--twobit', default=None,
                        help='Genome reference sequence in 2bit format')
    parser.add_argument('-t', '--transcripts', default=None,
                        help='Path to output cDNA transcripts.bed (-) for stdout')
    parser.add_argument('-r', '--raw', action='store_true',
                        help='Report transcript exactly as returned from Ensembl')
    parser.add_argument('-f', '--fasta', default=None,
                        help='Path to output translations.fasta')
    parser.add_argument('-b', '--bed', default=None,
                        help='Path to output translations.bed')
    parser.add_argument('-m', '--min_length', type=int, default=7,
                        help='Minimum length of protein translation to report')
    parser.add_argument('-e', '--enzyme', default=None,
                        help='Digest translation with enzyme')
    parser.add_argument('-a', '--all', action='store_true',
                        help='Include reference protein translations')
    parser.add_argument('-v', '--verbose', action='store_true', help='Verbose')
    parser.add_argument('-d', '--debug', action='store_true', help='Debug')
    args = parser.parse_args()
    # print >> sys.stderr, "args: %s" % args
    species = args.species

    input_rdr = None
    if args.input is not None:
        input_rdr = open(args.input, 'r') if args.input != '-' else sys.stdin
    tx_wtr = None
    if args.transcripts is not None:
        tx_wtr = open(args.transcripts, 'w') \
            if args.transcripts != '-' else sys.stdout
    fa_wtr = open(args.fasta, 'w') if args.fasta is not None else None
    bed_wtr = open(args.bed, 'w') if args.bed is not None else None

    enzyme = digest.expasy_rules.get(args.enzyme, args.enzyme)

    # print >> sys.stderr, "args biotypes: %s" % args.biotypes
    biotypea = ['biotype=%s' % bt.strip()
                for biotype in args.biotypes for bt in biotype.split(',')]
    # print >> sys.stderr, "args biotypes: %s" % biotypea
    biotypes = ';'.join(['biotype=%s' % bt.strip()
                         for biotype in args.biotypes
                         for bt in biotype.split(',') if bt.strip()])
    # print >> sys.stderr, "biotypes: %s" % biotypes

    twobit = TwoBitFile(args.twobit) if args.twobit else None

    selected_regions = dict()  # chrom:(start,end)
    region_pat = r'^([^:]+)(?::(\d*)(?:-(\d+)([+-])?)?)?'
    if args.regions:
        for entry in args.regions:
            if not entry:
                continue
            regs = [x.strip() for x in entry.split(',') if x.strip()]
            for reg in regs:
                m = re.match(region_pat, reg)
                if m:
                    (chrom, start, end, strand) = m.groups()
                    if chrom:
                        if chrom not in selected_regions:
                            selected_regions[chrom] = []
                        selected_regions[chrom].append([start, end, strand])
    if args.debug:
        print >> sys.stderr, "selected_regions: %s" % selected_regions

    translations = dict()  # start : end : seq

    def unique_prot(tbed, seq):
        if tbed.chromStart not in translations:
            translations[tbed.chromStart] = dict()
            translations[tbed.chromStart][tbed.chromEnd] = []
            translations[tbed.chromStart][tbed.chromEnd].append(seq)
        elif tbed.chromEnd not in translations[tbed.chromStart]:
            translations[tbed.chromStart][tbed.chromEnd] = []
            translations[tbed.chromStart][tbed.chromEnd].append(seq)
        elif seq not in translations[tbed.chromStart][tbed.chromEnd]:
            translations[tbed.chromStart][tbed.chromEnd].append(seq)
        else:
            return False
        return True

    def get_sequence(chrom, start, end):
        if twobit:
            if chrom in twobit:
                return twobit[chrom][start:end]
            contig = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom
            if contig in twobit:
                return twobit[contig][start:end]
        return None

    def translate_bed(bed):
        translate_count = 0
        if any([fa_wtr, bed_wtr]):
            transcript_id = bed.name
            refprot = None
            if twobit:
                bed.seq = get_sequence(bed.chrom, bed.chromStart, bed.chromEnd)
            else:
                bed.cdna = get_cdna(transcript_id)
            cdna = bed.get_cdna()
            cdna_len = len(cdna)
            if not args.all:
                try:
                    cds = bed.get_cds()
                    if cds is None:
                        cds = get_cds(transcript_id)
                    if len(cds) % 3 != 0:
                        cds = cds[:-(len(cds) % 3)]
                    refprot = translate(cds) if cds else None
                except Exception:
                    refprot = None
            for offset in range(3):
                seqend = cdna_len - (cdna_len - offset) % 3
                aaseq = translate(cdna[offset:seqend])
                aa_start = 0
                while aa_start < len(aaseq):
                    aa_end = aaseq.find('*', aa_start)
                    if aa_end < 0:
                        aa_end = len(aaseq)
                    prot = aaseq[aa_start:aa_end]
                    if enzyme and refprot:
                        frags = digest._cleave(prot, enzyme)
                        for frag in reversed(frags):
                            if frag in refprot:
                                prot = prot[:prot.rfind(frag)]
                            else:
                                break
                    if len(prot) < args.min_length:
                        pass
                    elif refprot and prot in refprot:
                        pass
                    else:
                        tstart = aa_start * 3 + offset
                        tend = aa_end * 3 + offset
                        prot_acc = "%s_%d_%d" % (transcript_id, tstart, tend)
                        tbed = bed.trim(tstart, tend)
                        if args.all or unique_prot(tbed, prot):
                            translate_count += 1
                            tbed.name = prot_acc
                            bed_wtr.write("%s\t%s\n" % (str(tbed), prot))
                            bed_wtr.flush()
                            fa_id = ">%s\n" % (prot_acc)
                            fa_wtr.write(fa_id)
                            fa_wtr.write(prot)
                            fa_wtr.write("\n")
                            fa_wtr.flush()
                    aa_start = aa_end + 1
        return translate_count

    def translate_region(species, ref, start, stop, strand):
        translation_count = 0
        regions = range(start, stop, max_region)
        if not regions or regions[-1] < stop:
            regions.append(stop)
        for end in regions[1:]:
            bedlines = get_transcripts_bed(species, ref, start, end,
                                           strand=strand, params=biotypes)
            if args.verbose or args.debug:
                print >> sys.stderr, \
                    "%s\t%s\tstart: %d\tend: %d\tcDNA transcripts:%d" \
                    % (species, ref, start, end, len(bedlines))
            # start, end, seq
            for i, bedline in enumerate(bedlines):
                try:
                    bed = bed_from_line(bedline) \
                        if any([not args.raw, fa_wtr, bed_wtr]) \
                        else None
                    if tx_wtr:
                        tx_wtr.write(bedline if args.raw else str(bed))
                        tx_wtr.write("\n")
                        tx_wtr.flush()
                    if bed:
                        translation_count += translate_bed(bed)
                except Exception as e:
                    print >> sys.stderr, \
                        "BED error (%s) : %s\n" % (e, bedline)
            start = end + 1
        return translation_count

    if input_rdr:
        translation_count = 0
        for i, bedline in enumerate(input_rdr):
            try:
                bed = bed_from_line(bedline)
                if bed is None:
                    continue
                if bed.biotype and biotypea and bed.biotype not in biotypea:
                    continue
                translation_count += translate_bed(bed)
            except Exception:
                print >> sys.stderr, "BED format error: %s\n" % bedline
        if args.debug or (args.verbose and any([fa_wtr, bed_wtr])):
            print >> sys.stderr, \
                "%s\tcDNA translations:%d" % (species, translation_count)
    else:
        coord_systems = get_toplevel(species)
        if 'chromosome' in coord_systems:
            ref_lengths = dict()
            for ref in sorted(coord_systems['chromosome'].keys()):
                length = coord_systems['chromosome'][ref]
                ref_lengths[ref] = length
                if not any([tx_wtr, fa_wtr, bed_wtr]):
                    print >> sys.stderr, \
                        "%s\t%s\tlength: %d" % (species, ref, length)
            if selected_regions:
                translation_count = 0
                for ref in sorted(selected_regions.keys()):
                    if ref in ref_lengths:
                        for reg in selected_regions[ref]:
                            (_start, _stop, _strand) = reg
                            start = int(_start) if _start else 0
                            stop = int(_stop) if _stop else ref_lengths[ref]
                            strand = '' if not _strand \
                                else ':1' if _strand == '+' else ':-1'
                            translation_count += translate_region(
                                species, ref, start, stop, strand)
            else:
                strand = ''
                start = 0
                for ref in sorted(ref_lengths.keys()):
                    length = ref_lengths[ref]
                    translation_count = 0
                    if args.debug:
                        print >> sys.stderr, \
                            "Retrieving transcripts: %s\t%s\tlength: %d" \
                            % (species, ref, length)
                    translation_count += translate_region(
                        species, ref, start, length, strand)
                    if args.debug or (args.verbose and any([fa_wtr, bed_wtr])):
                        print >> sys.stderr, \
                            "%s\t%s\tlength: %d\tcDNA translations:%d" \
                            % (species, ref, length, translation_count)
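
# Both this tool and the BED translator above accept region strings like
# "X,2:20000-25000,3:100-500+". A minimal sketch of that parsing, reusing the
# same pattern; absent parts come back as None:
#
#     import re
#
#     region_pat = r'^([^:]+)(?::(\d*)(?:-(\d+)([+-])?)?)?'
#
#     def parse_regions(spec):
#         regions = []
#         for reg in (x.strip() for x in spec.split(',') if x.strip()):
#             m = re.match(region_pat, reg)
#             if m:
#                 regions.append(m.groups())  # (chrom, start, end, strand)
#         return regions
#
#     print(parse_regions('X,2:20000-25000,3:100-500+'))
#     # [('X', None, None, None), ('2', '20000', '25000', None),
#     #  ('3', '100', '500', '+')]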
class EnsemblRef(object):

    def __init__(self, gtf_file, twobitfile, read_now=True):
        self.gtf_file = gtf_file
        self.twobitfile = twobitfile
        self.twobit = TwoBitFile(self.twobitfile)
        self.gene_dict = None
        self.transcript_idx = None
        self.name_idx = None
        if read_now:
            self.get_transcript_idx()

    def get_gene_dict(self):
        if self.gene_dict is None:
            gene_structures = gene.t_parse_gtf('test')
            self.gene_dict = gene_structures.get_genes(self.gtf_file, logger=logger)
        return self.gene_dict

    def get_transcript_idx(self):
        if self.transcript_idx is None:
            self.transcript_idx = gene_utilities.index_transcripts(
                self.get_gene_dict(), by_prot_id=False)
        return self.transcript_idx

    def get_name_idx(self):
        if self.name_idx is None:
            self.name_idx = dict()
            for i, t in self.get_transcript_idx().items():
                for name in t.gene.names:
                    self.name_idx[name] = t.gene
                for name in t.names:
                    self.name_idx[name] = t
                if t.prot_id:
                    self.name_idx[t.prot_id] = t
        return self.name_idx

    def get_gtf_transcript(self, name):
        idx = self.get_transcript_idx()
        if name in idx:
            return idx[name]
        else:
            nidx = self.get_name_idx()
            if name in nidx:
                return nidx[name]
        return None

    def transcript_is_coding(self, transcript_id):
        tx = self.get_transcript_idx()[transcript_id]
        return len(tx.start_codons) > 0

    def get_transcript_start_codon(self, transcript_id):
        tx = self.get_transcript_idx()[transcript_id]
        return tx.start_codons[0] if len(tx.start_codons) > 0 else None

    def get_bed_line(self, transcript_id, score=0, itemRgb='0,0,0', coding=False):
        tx = self.get_transcript_idx()[transcript_id]
        chrom = tx.gene.contig
        chromStart = tx.coding_beg if coding else tx.beg
        chromEnd = tx.coding_end if coding else tx.end
        name = transcript_id
        strand = '+' if tx.gene.strand else '-'
        thickStart = tx.coding_beg if tx.coding_beg else chromStart
        thickEnd = tx.coding_end if tx.coding_end else chromEnd
        exons = tx.get_coding_exons() if coding else tx.get_exons()
        blockCount = len(exons)
        if tx.gene.strand:
            strand = '+'
            blockSizes = [abs(e - s) for s, e in exons]
            blockStarts = [s - chromStart for s, e in exons]
        else:
            strand = '-'
            blockSizes = [abs(e - s) for s, e in reversed(exons)]
            blockStarts = [s - chromStart for s, e in reversed(exons)]
        blockSizes = ','.join([str(x) for x in blockSizes])
        blockStarts = ','.join([str(x) for x in blockStarts])
        return '%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d\t%s\t%d\t%s\t%s' % (
            chrom, chromStart, chromEnd, name, score, strand,
            thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts)

    def transcripts_in_range(self, chrom, startpos, endpos, strand=None):
        spos = min(startpos, endpos) if endpos else startpos
        epos = max(startpos, endpos) if endpos else startpos
        transcripts = []
        for i, t in self.get_transcript_idx().items():
            if t.gene.contig == chrom and t.beg <= epos and spos <= t.end:
                if strand and t.gene.strand != strand:
                    continue
                transcripts.append(t)
        return transcripts

    def genes_in_range(self, chrom, startpos, endpos, strand=None, gene_types=None):
        spos = min(startpos, endpos) if endpos else startpos
        epos = max(startpos, endpos) if endpos else startpos
        gene_dict = self.get_gene_dict()
        gtypes = set(gene_types) & set(gene_dict.keys()) \
            if gene_types else set(gene_dict.keys())
        genes = []
        for gt in gtypes:
            for gene in gene_dict[gt]:
                if gene.contig == chrom and gene.beg <= epos and spos <= gene.end:
                    if strand and gene.strand != strand:
                        continue
                    genes.append(gene)
        return genes

    def get_sequence(self, chrom, start, end):
        if self.twobit:
            if chrom in self.twobit:
                return self.twobit[chrom][start:end]
            contig = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom
            if contig in self.twobit:
                return self.twobit[contig][start:end]
        return None

    def sequence_sizes(self):
        return self.twobit.sequence_sizes()

    def get_transcript_seq(self, transcript_id, coding=False):
        tx = self.get_transcript_idx()[transcript_id]
        chrom = tx.gene.contig
        exonbnds = tx.get_coding_exons() if coding else tx.get_exons()
        if tx.gene.strand:
            seqs = [self.get_sequence(chrom, s, e) for s, e in exonbnds]
        else:
            seqs = [reverse_complement(self.get_sequence(chrom, s, e))
                    for s, e in exonbnds]
        return ''.join(seqs)

    def get_cdna(self, transcript_id):
        return self.get_transcript_seq(transcript_id, coding=False)

    def get_cds(self, transcript_id):
        return self.get_transcript_seq(transcript_id, coding=True)

    def genome_to_transcript_pos(self, transcript_id, genome_pos, coding=False):
        tx = self.get_transcript_idx()[transcript_id]
        if not tx.beg <= genome_pos < tx.end:
            return None
        exonbnds = tx.get_coding_exons() if coding else tx.get_exons()
        cdna_pos = 0
        if tx.gene.strand:
            for s, e in exonbnds:
                if s <= genome_pos < e:
                    cdna_pos += genome_pos - s
                    break
                else:
                    cdna_pos += e - s
        else:
            for s, e in exonbnds:
                if s <= genome_pos < e:
                    cdna_pos += e - genome_pos - 1
                    break
                else:
                    cdna_pos += e - s
        return cdna_pos

    def genome_to_cdna_pos(self, transcript_id, genome_pos):
        return self.genome_to_transcript_pos(transcript_id, genome_pos, coding=False)

    def genome_to_cds_pos(self, transcript_id, genome_pos):
        return self.genome_to_transcript_pos(transcript_id, genome_pos, coding=True)
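
# Hypothetical usage of EnsemblRef, assuming a matching GTF/2bit pair; the
# file names, transcript ID, and coordinate below are placeholders:
#
#     ens = EnsemblRef('Homo_sapiens.GRCh38.gtf', 'hg38.2bit')
#     tx_id = 'ENST00000288602'
#     if ens.transcript_is_coding(tx_id):
#         cds = ens.get_cds(tx_id)                        # spliced coding sequence
#         print(ens.get_bed_line(tx_id, coding=True))     # 12-column BED of the CDS
#         print(ens.genome_to_cds_pos(tx_id, 140753336))  # genomic -> CDS offset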
def __main__(): parser = argparse.ArgumentParser( description='Generate proBED and proBAM from mz.sqlite') parser.add_argument('mzsqlite', help="mz.sqlite converted from mzIdentML") parser.add_argument( 'genomic_mapping_sqlite', help="genomic_mapping.sqlite with feature_cds_map table") parser.add_argument('-R', '--genomeReference', default='Unknown', help='Genome reference sequence in 2bit format') parser.add_argument('-t', '--twobit', default=None, help='Genome reference sequence in 2bit format') parser.add_argument('-r', '--reads_bam', default=None, help='reads alignment bam path') parser.add_argument('-g', '--gffutils_sqlite', default=None, help='gffutils GTF sqlite DB') parser.add_argument('-B', '--probed', default=None, help='proBed path') parser.add_argument('-s', '--prosam', default=None, help='proSAM path') parser.add_argument('-b', '--probam', default=None, help='proBAM path') parser.add_argument('-l', '--limit', type=int, default=None, help='limit numbers of PSMs for testing') parser.add_argument('-v', '--verbose', action='store_true', help='Verbose') parser.add_argument('-d', '--debug', action='store_true', help='Debug') args = parser.parse_args() def get_sequence(chrom, start, end): if twobit: if chrom in twobit and 0 <= start < end < len(twobit[chrom]): return twobit[chrom][start:end] contig = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom if contig in twobit and 0 <= start < end < len(twobit[contig]): return twobit[contig][start:end] return '' return None twobit = TwoBitFile(args.twobit) if args.twobit else None samfile = pysam.AlignmentFile(args.reads_bam, "rb") if args.reads_bam else None seqlens = twobit.sequence_sizes() probed = open(args.probed, 'w') if args.probed else sys.stdout gff_cursor = get_connection( args.gffutils_sqlite).cursor() if args.gffutils_sqlite else None map_cursor = get_connection(args.genomic_mapping_sqlite).cursor() mz_cursor = get_connection(args.mzsqlite).cursor() unmapped_accs = set() timings = dict() def add_time(name, elapsed): if name in timings: timings[name] += elapsed else: timings[name] = elapsed XG_TYPES = [ 'N', 'V', 'W', 'J', 'A', 'M', 'C', 'E', 'B', 'O', 'T', 'R', 'I', 'G', 'D', 'U', 'X', '*' ] FT_TYPES = ['CDS', 'five_prime_utr', 'three_prime_utr', 'transcript'] def get_peptide_type(exons): ## XG classify peptide ## N Normal peptide. The peptide sequence is contained in the reference protein sequence. ## V Variant peptide. A single amino acid variation (SAV) is present as compared to the reference. ## W Indel peptide. An insertion or deletion is present as compared to the reference. ## J Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference. ## A Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference. ## M Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference. ## C Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic). ## E Extension peptide. A peptide that points to a non-canonical N-terminal protein extension. ## B 3' UTR peptide. A peptide that maps to the 3' UTR region from the reference. ## O Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference. ## T Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation. ## R Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference. ## I Intron peptide. 
A peptide that is located in an intronic region of the reference isoform. ## G Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion. ## D Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy. ## U Unmapped peptide. A peptide that could not be mapped to a reference sequence. ## X Unknown. peptide_type = '*' if gff_cursor: ts = time() etypes = ['*'] * len(exons) efeatures = [None] * len(exons) if args.debug: print('exons:%d\t%s' % (len(exons), etypes), file=sys.stderr) for i, exon in enumerate(exons): (acc, gc, gs, ge, st, cs, ce) = exon fr = cs % 3 if args.debug: print('exon:\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (acc, gc, gs, ge, st, cs, ce, fr), file=sys.stderr) ft_params = { "seqid": str(gc).replace('chr', ''), "start": gs, "end": ge, 'strand': st, 'frame': fr, 'ftype': 'CDS' } features = [ f for f in gff_cursor.execute(FEATURE_ANY_QUERY, ft_params) ] efeatures[i] = features for i, exon in enumerate(exons): (acc, gc, gs, ge, st, cs, ce) = exon for f in efeatures[i]: (id, seqid, start, end, featuretype, strand, frame, in_frame) = f if args.debug: print('feat:\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (id, seqid, start, end, featuretype, strand, frame, in_frame), file=sys.stderr) if strand == st: if start <= gs and ge <= end: if in_frame: etypes[i] = 'N' break elif XG_TYPES.index('O') < XG_TYPES.index( etypes[i]): etypes[i] = 'O' break else: if XG_TYPES.index('O') < XG_TYPES.index(etypes[i]): etypes[i] = 'O' peptide_type = etypes[i] te = time() add_time('pep_type', te - ts) return peptide_type def classify_exon(exon, exons, features): ## N Normal peptide. The peptide sequence is contained in the reference protein sequence. # 1 exon, contained, in_frame # 2+ exons, contained, in_frame, on_exon_boundary ## V Variant peptide. A single amino acid variation (SAV) is present as compared to the reference. # 1 exon, contained, in_frame, AA_mismatch # 2+ exons, contained, in_frame, on_exon_boundary, AA_mismatch ## W Indel peptide. An insertion or deletion is present as compared to the reference. # 1 exon, contained, in_frame, AA_mismatch # 2+ exons, contained, in_frame, on_exon_boundary or off by 3, AA_mismatch ## J Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference. # 2+ exons, contained, on_exon_boundary, same transcript, non adjacent exons ## A Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference. # 2+ exons, contained, on_exon_boundary, same transcript, non adjacent exons ## M Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference. ## C Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic). # 1 exon overlaps but not contained ## E Extension peptide. A peptide that points to a non-canonical N-terminal protein extension. ## B 3' UTR peptide. A peptide that maps to the 3' UTR region from the reference. # exon overlaps a three_prime_utr ## O Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference. # exon contained but not in_frame ## T Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation. ## R Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference. ## I Intron peptide. A peptide that is located in an intronic region of the reference isoform. 
# exon contained in transcript, not not overlapping any exon ## G Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion. # exonis from different seqs, strand, or transcripts ## D Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy. ## U Unmapped peptide. A peptide that could not be mapped to a reference sequence. ## X Unknown. return '*' def get_variant_cds(exons, ref_prot, peptide, pep_cds): if ref_prot != peptide and samfile: try: if args.debug: print('name: %s \nref: %s\npep: %s\n' % (scan_name, ref_prot, peptide), file=sys.stderr) ts = time() for exon in exons: (acc, chrom, start, end, strand, c_start, c_end) = exon a_start = c_start / 3 * 3 a_end = c_end / 3 * 3 if ref_prot[a_start:a_end] != peptide[a_start:a_end]: pileup = get_exon_pileup(chrom, start, end) for i, (bi, ai, ao) in enumerate([ (i, i / 3, i % 3) for i in range(c_start, c_end) ]): if ao == 0 or i == 0: if ref_prot[ai] != peptide[ai]: codon = get_pep_codon( pileup, bi - c_start, peptide[ai], ao) if args.debug: print('%d %d %d %s : %s %s %s' % (bi, ai, ao, peptide[ai], str(pep_cds[:bi]), str(codon), str(pep_cds[bi + 3:])), file=sys.stderr) if codon: pep_cds = pep_cds[: bi] + codon + pep_cds[ bi + 3:] te = time() add_time('var_cds', te - ts) except Exception as e: print('name: %s \nref: %s\npep: %s\n%s\n' % (scan_name, ref_prot, peptide, e), file=sys.stderr) return pep_cds def get_mapping(acc, pep_start, pep_end): ts = time() p_start = (pep_start - 1) * 3 p_end = pep_end * 3 map_params = {"acc": acc, "p_start": p_start, "p_end": p_end} if args.debug: print('%s' % map_params, file=sys.stderr) locs = [l for l in map_cursor.execute(MAP_QUERY, map_params)] exons = [] ## ========= pep ## --- continue ## --- trim ## --- copy ## --- trim ## --- break c_end = 0 for i, (acc, chrom, start, end, strand, cds_start, cds_end) in enumerate(locs): if args.debug: print('Prot: %s\t%s:%d-%d\t%s\t%d\t%d' % (acc, chrom, start, end, strand, cds_start, cds_end), file=sys.stderr) c_start = c_end if cds_end < p_start: continue if cds_start >= p_end: break if strand == '+': if cds_start < p_start: start += p_start - cds_start if cds_end > p_end: end -= cds_end - p_end else: if cds_start < p_start: end -= p_start - cds_start if cds_end > p_end: start += cds_end - p_end c_end = c_start + abs(end - start) if args.debug: print('Pep: %s\t%s:%d-%d\t%s\t%d\t%d' % (acc, chrom, start, end, strand, cds_start, cds_end), file=sys.stderr) exons.append([acc, chrom, start, end, strand, c_start, c_end]) te = time() add_time('get_mapping', te - ts) return exons def get_cds(exons): ts = time() seqs = [] for i, (acc, chrom, start, end, strand, cds_start, cds_end) in enumerate(exons): seq = get_sequence(chrom, min(start, end), max(start, end)) if strand == '-': seq = reverse_complement(seq) seqs.append(seq) te = time() add_time('get_cds', te - ts) if args.debug: print('CDS: %s' % str(seqs), file=sys.stderr) return ''.join(seqs) if seqs else '' def genomic_mapping_count(peptide): ts = time() params = {"sequence": peptide} acc_locs = [l for l in mz_cursor.execute(PEPTIDE_ACC_QUERY, params)] te = time() add_time('PEPTIDE_ACC_QUERY', te - ts) if acc_locs: if len(acc_locs) == 1: return 1 locations = set() for i, acc_loc in enumerate(acc_locs): (acc, pep_start, pep_end) = acc_loc if acc in unmapped_accs: continue try: add_time('GENOMIC_POS_QUERY_COUNT', 1) ts = time() p_start = pep_start * 3 p_end = pep_end * 3 params = {"acc": acc, "cds_offset": p_start} (start_chrom, start_pos) = 
map_cursor.execute(GENOMIC_POS_QUERY, params).fetchone() params = {"acc": acc, "cds_offset": p_end} (end_chrom, end_pos) = map_cursor.execute(GENOMIC_POS_QUERY, params).fetchone() locations.add('%s:%s-%s:%s' % (start_chrom, start_pos, end_chrom, end_pos)) te = time() add_time('GENOMIC_POS_QUERY', te - ts) except: unmapped_accs.add(acc) if args.debug: print('Unmapped: %s' % acc, file=sys.stderr) return len(locations) return -1 def spectrum_peptide_count(spectrum_id): ts = time() params = {"sr_id": spectrum_id} pep_count = mz_cursor.execute(SPECTRUM_PEPTIDES_QUERY, params).fetchone()[0] te = time() add_time('SPECTRUM_PEPTIDES_QUERY', te - ts) return pep_count def get_exon_pileup(chrom, chromStart, chromEnd): cols = [] for pileupcolumn in samfile.pileup(chrom, chromStart, chromEnd): if chromStart <= pileupcolumn.reference_pos <= chromEnd: bases = dict() col = { 'depth': 0, 'cov': pileupcolumn.nsegments, 'pos': pileupcolumn.reference_pos, 'bases': bases } for pileupread in pileupcolumn.pileups: if not pileupread.is_del and not pileupread.is_refskip: col['depth'] += 1 base = pileupread.alignment.query_sequence[ pileupread.query_position] if base not in bases: bases[base] = 1 else: bases[base] += 1 cols.append(col) return cols codon_map = { "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L", "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", "TAT": "Y", "TAC": "Y", "TAA": "*", "TAG": "*", "TGT": "C", "TGC": "C", "TGA": "*", "TGG": "W", "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q", "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M", "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K", "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R", "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E", "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", } aa_codon_map = dict() for c, a in codon_map.items(): aa_codon_map[a] = [ c ] if a not in aa_codon_map else aa_codon_map[a] + [c] aa_na_map = dict() # m[aa]{bo : {b1 : [b3] for c, a in codon_map.items(): if a not in aa_na_map: aa_na_map[a] = dict() d = aa_na_map[a] for i in range(3): b = c[i] if i < 2: if b not in d: d[b] = dict() if i < 1 else set() d = d[b] else: d.add(b) def get_pep_codon(pileup, idx, aa, ao): try: ts = time() bases = [] for i in range(3): if i < ao: bases.append(list(set([c[i] for c in aa_codon_map[aa]]))) else: bases.append([ b for b, cnt in reversed( sorted(pileup[idx + i]['bases'].iteritems(), key=lambda (k, v): (v, k))) ]) if args.debug: print('%s' % bases, file=sys.stderr) for b0 in bases[0]: if b0 not in aa_na_map[aa]: continue for b1 in bases[1]: if b1 not in aa_na_map[aa][b0]: continue for b2 in bases[2]: if b2 in aa_na_map[aa][b0][b1]: return '%s%s%s' % (b0, b1, b2) te = time() add_time('pep_codon', te - ts) except Exception as e: print("get_pep_codon: %s %s %s %s" % (aa, ao, idx, pileup), file=sys.stderr) raise e return None def write_probed(chrom, chromStart, chromEnd, strand, blockCount, blockSizes, blockStarts, spectrum, protacc, peptide, uniqueness, genomeReference, score=1000, psmScore='.', fdr='.', mods='.', charge='.', expMassToCharge='.', calcMassToCharge='.', psmRank='.', datasetID='.', uri='.'): probed.write('%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % \ 
def write_probed(chrom, chromStart, chromEnd, strand,
                 blockCount, blockSizes, blockStarts,
                 spectrum, protacc, peptide, uniqueness, genomeReference,
                 score=1000, psmScore='.', fdr='.', mods='.', charge='.',
                 expMassToCharge='.', calcMassToCharge='.',
                 psmRank='.', datasetID='.', uri='.'):
    probed.write('%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                 (chrom, chromStart, chromEnd, spectrum, score, strand,
                  chromStart, chromEnd, '0', blockCount,
                  ','.join([str(v) for v in blockSizes]),
                  ','.join([str(v) for v in blockStarts]),
                  protacc, peptide, uniqueness, genomeReference,
                  psmScore, fdr, mods, charge,
                  expMassToCharge, calcMassToCharge,
                  psmRank, datasetID, uri))


def get_genomic_location(exons):
    chrom = exons[0][1]
    strand = exons[0][4]
    pos = [exon[2] for exon in exons] + [exon[3] for exon in exons]
    chromStart = min(pos)
    chromEnd = max(pos)
    blockCount = len(exons)
    blockSizes = [abs(exon[3] - exon[2]) for exon in exons]
    blockStarts = [min(exon[2], exon[3]) - chromStart for exon in exons]
    return (chrom, chromStart, chromEnd, strand,
            blockCount, blockSizes, blockStarts)
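
# Worked example (hypothetical coordinates, unused by the tool): two
# plus-strand exons as produced by get_mapping, each
# [acc, chrom, start, end, strand, c_start, c_end]:
_example_exons = [
    ['ACC_X', 'chr1', 5000, 5030, '+', 0, 30],
    ['ACC_X', 'chr1', 5100, 5115, '+', 30, 45],
]
# BED-style blockStarts are offsets from chromStart, so the proBED writer
# can serialize blockSizes/blockStarts as '30,15' and '0,100'.
assert get_genomic_location(_example_exons) == \
    ('chr1', 5000, 5115, '+', 2, [30, 15], [0, 100])
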
def get_psm_modifications(peptide_ref):
    mods = []
    ts = time()
    params = {"peptide_ref": peptide_ref}
    pepmods = [m for m in mz_cursor.execute(PEP_MODS_QUERY, params)]
    if pepmods:
        for (location, residue, name, modType, unimod) in pepmods:
            mods.append('%s-%s' % (location,
                                   unimod if unimod
                                   else '%s%s' % (name, residue)))
    te = time()
    add_time('PEP_MODS_QUERY', te - ts)
    return ';'.join(mods)


"""
QNAME FLAG RNAME POS CIGAR SEQ
'NH' : 'i', # number of genomic locations to which the peptide sequence maps
'XO' : 'Z', # uniqueness of the peptide mapping
'XL' : 'i', # number of peptides to which the spectrum maps
'XP' : 'Z', # peptide sequence
'YP' : 'Z', # protein accession ID from the original search result
'XF' : 'Z', # reading frame of the peptide (0, 1, 2)
'XI' : 'f', # peptide intensity
'XB' : 'Z', # massdiff; experimental mass; calculated mass. massdiff can be
            # calculated by experimental mass - calculated mass. If any
            # number is unavailable, the value should be left blank
            # (such as 0.01;;).
'XR' : 'Z', # reference peptide sequence
'YB' : 'Z', # preceding amino acids (2 AA, B stands for before)
'YA' : 'Z', # following amino acids (2 AA, A stands for after)
'XS' : 'f', # PSM score
'XQ' : 'f', # PSM FDR (i.e. q-value or 1-PEP)
'XC' : 'i', # peptide charge
'XA' : 'i', # whether the peptide is annotated: 0 yes; 1 partially unknown; 2 totally unknown
'XM' : 'Z', # modifications
'XN' : 'i', # number of missed cleavages in the peptide (XP)
'XT' : 'i', # enzyme specificity
'XE' : 'i', # enzyme used in the experiment
'XG' : 'A', # peptide type
'XU' : 'Z', # URI
"""
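
# For orientation only (values hypothetical): rendered into a SAM record,
# the optional fields above take the usual TAG:TYPE:VALUE form, e.g.
#   NH:i:1  XO:Z:unique  XP:Z:SAMPLEK  XB:Z:0.010000;1234.567800;1234.557800
_example_optional = [('NH', 'i', 1), ('XO', 'Z', 'unique'), ('XP', 'Z', 'SAMPLEK')]
assert '\t'.join('%s:%s:%s' % f for f in _example_optional) == \
    'NH:i:1\tXO:Z:unique\tXP:Z:SAMPLEK'
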
psm_cursor = get_connection(args.mzsqlite).cursor()
ts = time()
psms = psm_cursor.execute(PSM_QUERY)
te = time()
add_time('PSM_QUERY', te - ts)
proBAM = ProBAM(species=None, assembly=args.genomeReference,
                seqlens=seqlens, comments=[])
proBED = ProBED(species=None, assembly=args.genomeReference, comments=[])
for i, psm in enumerate(psms):
    probam_dict = PROBAM_DEFAULTS.copy()
    (acc, pep_start, pep_end, aa_pre, aa_post, peptide,
     spectrum_id, spectrum_title, rank, charge,
     calcmass, exprmass, pepref) = psm
    scan_name = spectrum_title if spectrum_title else spectrum_id
    if args.debug:
        print('\nPSM: %d\t%s' % (i, '\t'.join(
            [str(v) for v in (acc, pep_start, pep_end, peptide, spectrum_id,
                              scan_name, rank, charge, calcmass, exprmass)])),
              file=sys.stderr)
    exons = get_mapping(acc, pep_start, pep_end)
    if args.debug:
        print('%s' % exons, file=sys.stderr)
    if not exons:
        continue
    mods = get_psm_modifications(pepref)
    (chrom, chromStart, chromEnd, strand,
     blockCount, blockSizes, blockStarts) = get_genomic_location(exons)
    ref_cds = get_cds(exons)
    if args.debug:
        print('%s' % ref_cds, file=sys.stderr)
    ref_prot = translate(ref_cds)
    if args.debug:
        print('%s' % ref_prot, file=sys.stderr)
        print('%s' % peptide, file=sys.stderr)
    spectrum_peptides = spectrum_peptide_count(spectrum_id)
    peptide_locations = genomic_mapping_count(peptide)
    if args.debug:
        print('spectrum_peptide_count: %d\tpeptide_location_count: %d'
              % (spectrum_peptides, peptide_locations), file=sys.stderr)
    uniqueness = 'unique' if peptide_locations == 1 else 'not-unique[unknown]'
    ts = time()
    proBEDEntry = ProBEDEntry(chrom, chromStart, chromEnd,
                              '%s_%s' % (acc, scan_name),
                              1000, strand,
                              blockCount, blockSizes, blockStarts,
                              acc, peptide, uniqueness, args.genomeReference,
                              charge=charge,
                              expMassToCharge=exprmass,
                              calcMassToCharge=calcmass,
                              mods=mods if mods else '.',
                              psmRank=rank)
    proBED.add_entry(proBEDEntry)
    te = time()
    add_time('add_probed', te - ts)
    if len(ref_prot) != len(peptide):
        continue
    ts = time()
    probam_dict['NH'] = peptide_locations
    probam_dict['XO'] = uniqueness
    probam_dict['XL'] = peptide_locations
    probam_dict['XP'] = peptide
    probam_dict['YP'] = acc
    probam_dict['XC'] = charge
    probam_dict['XB'] = '%f;%f;%f' % (exprmass - calcmass, exprmass, calcmass)
    probam_dict['XR'] = ref_prot  # ? dbSequence
    probam_dict['YA'] = aa_post
    probam_dict['YB'] = aa_pre
    probam_dict['XM'] = mods if mods else '*'
    flag = 16 if strand == '-' else 0
    # mark anything other than a rank-1 (or unranked) PSM as secondary
    if str(rank) != '1' and rank not in ('*', '', []):
        flag += 256
    probam_dict['XF'] = ','.join([str(e[2] % 3) for e in exons])
    ## check for variation from ref_cds
    pep_cds = get_variant_cds(exons, ref_prot, peptide, ref_cds)
    peptide_type = '*'
    ## XG classify peptide
    probam_dict['XG'] = get_peptide_type(exons)
    ## probam_dict['MD'] = peptide
    ## FIX: the SAM sequence is the forward strand
    seq = pep_cds if strand == '+' else reverse_complement(pep_cds)
    ## CIGAR is based on the plus strand
    cigar = ''
    if strand == '+':
        blkStarts = blockStarts
        blkSizes = blockSizes
    else:
        blkStarts = [x for x in reversed(blockStarts)]
        blkSizes = [x for x in reversed(blockSizes)]
    for j in range(blockCount):
        if j > 0:
            intron = blkStarts[j] - (blkStarts[j - 1] + blkSizes[j - 1])
            if intron > 0:
                cigar += '%dN' % intron
        cigar += '%dM' % blkSizes[j]
    ## Mods TODO
    proBAMEntry = ProBAMEntry(qname=scan_name, flag=flag, rname=chrom,
                              pos=chromStart + 1, cigar=cigar, seq=seq,
                              optional=probam_dict)
    proBAM.add_entry(proBAMEntry)
    te = time()
    add_time('add_probam', te - ts)
    if args.debug:
        print('%s' % probam_dict, file=sys.stderr)
    if args.limit and i >= args.limit:
        break

if args.probed:
    ts = time()
    with open(args.probed, 'w') as fh:
        proBED.write(fh)
    te = time()
    add_time('write_probed', te - ts)

if args.prosam or args.probam:
    samfile = args.prosam if args.prosam else 'temp.sam'
    ts = time()
    with open(samfile, 'w') as fh:
        proBAM.write(fh)
    te = time()
    add_time('write_prosam', te - ts)
    if args.probam:
        ts = time()
        # options go before the input file, as samtools view expects
        pysam.view('-b', '-o', args.probam, samfile, catch_stdout=False)
        te = time()
        add_time('write_probam', te - ts)
        pysam.index(args.probam)

print('\n%s\n' % str(timings), file=sys.stderr)
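
# Standalone sanity check of the CIGAR construction in the loop above
# (hypothetical block layout; mirrors the %dM/%dN arithmetic without touching
# real PSM data): adjacent blocks separated by a gap become match runs split
# by an N (skipped intron) operation.
def _example_cigar(blk_starts, blk_sizes):
    cigar = ''
    for j in range(len(blk_starts)):
        if j > 0:
            intron = blk_starts[j] - (blk_starts[j - 1] + blk_sizes[j - 1])
            if intron > 0:
                cigar += '%dN' % intron
        cigar += '%dM' % blk_sizes[j]
    return cigar

assert _example_cigar([0, 100], [30, 15]) == '30M70N15M'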