def __init__(s, path, software):
    new = not os.path.isfile(path)
    s.archive = archive.Archive(path)
    s.metadata = {
        "createdOn": time.time(),
        "createdBy": getpass.getuser(),
        "name": os.path.basename(os.path.splitext(path)[0]),
        "software": software
    }
    with timer.Timer("Loading"):
        with s.archive:
            s.metadata = Dict(
                dict(s.metadata,
                     **decode(s.archive.get("metadata.json", "{}"))))  # Metadata
            s.ref = Dict(
                reference.Reference(
                    decode(s.archive.get("reference.json", "{}"))))  # Reference file
            s.data = Dict(decode(s.archive.get("data.json", "{}")))  # Storage
            if new:
                s.metadata.diff = True
                s.data.diff = True
                s.ref.diff = True
            tree = dict((a, a.split("/")) for a in s.archive.keys())
            clipIDs = set(b[1] for a, b in tree.items() if b[0] == "clips")
            s.clips = Dict({})
            if clipIDs:
                for ID in clipIDs:
                    c = Clip(ID)
                    c.metadata = Dict(
                        dict(c.metadata,
                             **decode(s.archive.get(
                                 "clips/%s/metadata.json" % ID, "{}"))))
                    c.data = decode(
                        s.archive.get("clips/%s/data.json" % ID, "{}"))
                    thumbs = sorted([
                        a for a, b in tree.items()
                        if b[0] == "clips" and b[1] == ID and b[2] == "thumbs"
                    ])
                    if thumbs:
                        for th in thumbs:
                            c.thumbs.append(s.cache(th))
                    s.clips[ID] = c
            s.clips.diff
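A minimal usage sketch of the archive-backed loader above. The enclosing class name (`Project`), the file path, and the `software` string are placeholders, and attribute-style access on the returned `Dict` wrappers is assumed because the loader itself uses it (`s.metadata.diff = True`).

# Hypothetical usage; "Project" stands in for the real class name, which is not
# shown in the snippet above.
proj = Project("/tmp/session.zip", software="maya")

print(proj.metadata.name)        # derived from the file name on first creation
print(proj.metadata.createdBy)   # getpass.getuser() at creation time
for clip_id, clip in proj.clips.items():
    print(clip_id, len(clip.thumbs))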
def run(self):
    ref_path = self.reference.__str__()
    # Keeping the reference in its own class leaves room for future expansion.
    refasta = reference.Reference(ref_path).show_fasta()
    # args.infile_sam may arrive as a full path with multiple "/",
    # so only the basename is used for naming the output.
    myresult = myio.IMFilterOutput(self.infilename.name)
    opened_input = open(self.infilename.__str__(), 'r')
    self._lgr.info('referencing the genome fasta provided...')
    counter_all, counter_imp = 0, 0
    for line in opened_input:
        samline = myclasses.LineUp(line)
        if samline._identity == 'SAMheader':
            continue  # The header is excluded from the output SAM
        elif samline._identity == 'SAMread':
            counter_all += 1
            samread = samline.parse_line()
            start = samread.pat_locator()
            p = IMPriming(refasta, samread._strand, samread._chroms, start,
                          self.window_size, self.deny_number)
            if p._imp:
                counter_imp += 1
                continue
            else:
                newline = samread.build_line()
                myresult.add2content(newline)
        else:
            raise Exception('Unexpected line format in SAM input')
    self._lgr.info("creating filtered SAM file excluding headers ...")
    self._lgr.info("through {0} reads".format(counter_all))
    self._lgr.info(
        "\t\t{0} ({1}) were removed due to internal priming.".format(
            counter_imp, float(counter_imp) / float(counter_all)))
    myresult.open2write(myresult.content)
    self._lgr.info("Done!")
def filterSpectrumList(self, refFilePath, magmin=-1, magmax=50, zmin=-1,
                       zmax=50, sfrmin=-1, sfrmax=1e6, outputPath=""):
    ref = reference.Reference(refFilePath)
    idList = self.fvect
    indexes = ref.filterIdList(idList,
                               magmin=magmin, magmax=magmax,
                               zmin=zmin, zmax=zmax,
                               sfrmin=sfrmin, sfrmax=sfrmax)
    print("spllist filtering: indexes found n = {}".format(len(indexes)))

    if outputPath == "":
        dirPath = os.path.split(self.path)[0]
        nameNoExt = os.path.splitext(self.name)[0]
        outputFileFullPath = os.path.join(
            dirPath,
            "{}_z{}-{}_mag{}-{}_sfr{}-{}.spectrumlist".format(
                nameNoExt, zmin, zmax, magmin, magmax, sfrmin, sfrmax))
    else:
        # assumed fix: use the caller-supplied path, otherwise
        # outputFileFullPath would be undefined in this branch
        outputFileFullPath = outputPath
    f = open(outputFileFullPath, 'w')
    for k, idThis in enumerate(self.fvect):
        if k in indexes:
            if len(self.errfvect) == len(self.fvect) and len(
                    self.procidvect) == len(self.fvect):
                f.write("{}\t{}\t{}\n".format(self.fvect[k], self.errfvect[k],
                                              self.procidvect[k]))
            else:
                f.write("{}\n".format(self.fvect[k]))
    f.close()
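A hedged example of calling filterSpectrumList. The spectrum-list object (`spclist`), the reference catalog path, and the numeric cuts are placeholders chosen only to illustrate the keyword arguments defined in the signature above.

# Hypothetical call; spclist is an already-loaded spectrum list object
# exposing filterSpectrumList(), and "ref.txt" is a placeholder catalog.
spclist.filterSpectrumList("ref.txt",
                           magmin=18.0, magmax=24.0,   # magnitude cut
                           zmin=0.1, zmax=1.5,         # redshift cut
                           sfrmin=0.0, sfrmax=100.0)   # star-formation-rate cut
# With outputPath left empty, the filtered list is written next to self.path as
# <name>_z0.1-1.5_mag18.0-24.0_sfr0.0-100.0.spectrumlist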
else:
    dataY, N = readRefSeqData(options.dataY, (dataYtype == dataXtype))
    infoY = readRefSeqInfo(options.dataY)
    sys.stdout.write('\rDataset Y [' + options.dataY + ']: ' + infoY + ', ' +
                     str(N) + ' transcripts')
print '\n'

# Read configuration file
refpath37, refpath38 = readConfigFile(dir)

# Initialize GRCh37 reference genome
if refpath37 is not None:
    if not os.path.isfile(refpath37):
        print '\nError: GRCh37 reference genome file (' + refpath37 + ') cannot be found.\n'
        quit()
    ref_GRCh37 = reference.Reference(refpath37)
else:
    ref_GRCh37 = None

# Initialize GRCh38 reference genome
if refpath38 is not None:
    if not os.path.isfile(refpath38):
        print '\nError: GRCh38 reference genome file (' + refpath38 + ') cannot be found.\n'
        quit()
    ref_GRCh38 = reference.Reference(refpath38)
else:
    ref_GRCh38 = None

# Check if required reference genome files are specified
if ref_GRCh37 is None and 'GRCh37' in [dataX_build, dataY_build]:
    print '\nError: GRCh37 reference genome file needs to be specified in configuration file.\n'
def run(options):
    if not ((options.series.startswith('CART37')
             or options.series.startswith('CART38'))
            and len(options.series) == 7):
        print '\nSeries code incorrect!\n'
        quit()

    print '\n==== ENSTWriter {} '.format(__version__) + '=' * 78

    # Initialize reference sequence reader
    ref = reference.Reference(options.ref)

    # Initialize transcript database writer
    tdb_writer = helper.initialize_transcript_db_writer(options)

    # Read Ensembl database
    ensembl_db = transcripts.read_ensembl_db(options.ensembl)

    # Read previous CAVA db output and reference genome if required
    if options.prev_cava_db:
        prev_ref = reference.Reference(options.prev_ref)
        prev_cava_db = helper.read_prev_cava_db(options.prev_cava_db, prev_ref)
    else:
        prev_cava_db = None

    # Initialize output files
    out_genepred, out_fasta, out_genepred_annovar, out_fasta_annovar, gbk_dir = \
        helper.initialize_output_files(options)

    # Initialize progress info
    sys.stdout.write('\nProcessing {} CARTs read from {} ... '.format(
        helper.number_of_input_carts(options.input), options.input))
    sys.stdout.flush()

    # Initialize CART numbering
    cartidx = 10000 if options.prev_cava_db is None else helper.get_last_cartidx(
        options.prev_cava_db)

    # Iterate through input records
    missing_list = []
    gff2_lines = {}
    gff3_lines = {}
    for line in open(options.input):
        line = line.strip()
        if line == '' or line.startswith('#'):
            continue
        cols = line.split()
        hgnc_id = cols[0][5:]
        enst = cols[1]

        # Add ENST to missing list if not found in Ensembl database
        if enst not in ensembl_db:
            missing_list.append('{} (HGNC:{})'.format(enst, hgnc_id))
            continue

        # Retrieve data about ENST
        transcript = ensembl_db[enst]

        # Calculating CART ID
        if options.prev_cava_db is None:
            cartidx += 1
            cart_id = '{}{}'.format(options.series, cartidx)
        else:
            content = (transcript.strand, len(transcript.exons),
                       helper.read_mrna_sequence(transcript, ref))
            if hgnc_id in prev_cava_db and content == prev_cava_db[hgnc_id]['content']:
                cart_id = '{}{}'.format(options.series,
                                        prev_cava_db[hgnc_id]['cartidx'])
            else:
                cartidx += 1
                cart_id = '{}{}'.format(options.series, cartidx)

        # Add CART ID and HGNC ID to transcript
        transcript.id = cart_id
        transcript.hgnc_id = hgnc_id

        # Add transcript to database writer
        tdb_writer.add(transcript)

        # Create content of gff2 and gff3 files
        gff2_lines = helper.create_gff2_lines(transcript, gff2_lines)
        gff3_lines = helper.create_gff3_lines(transcript, gff3_lines)

        # Write to gp file
        helper.output_genepred(transcript, out_genepred)

        # Write to gbk output
        if options.gbk:
            helper.output_gbk(transcript, ref, gbk_dir)

        # Write to fasta file
        helper.output_fasta(transcript, out_fasta, ref)

        # Write annovar files
        if options.annovar:
            helper.output_genepred(transcript, out_genepred_annovar)
            helper.output_fasta_annovar(transcript, out_fasta_annovar, ref)

    # Create bgzipped, Tabix-indexed GFF2 and GFF3 output
    helper.output_gff2(gff2_lines, options.output + '.gff2')
    helper.output_gff3(gff3_lines, options.output + '.gff3')

    # Finalize outputs
    helper.finalize_outputs(options, tdb_writer, out_fasta, out_genepred,
                            out_genepred_annovar, out_fasta_annovar, gbk_dir)

    # Print out summary info
    helper.print_summary_info(options, missing_list)

    print '\n' + '=' * 100 + '\n'
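A sketch of the options namespace this run() expects. In the real tool these values come from the command-line parser; the attribute names are the ones read above, while all file paths and the series code shown are placeholders.

# Hypothetical options object (Python 2.7's argparse.Namespace); paths are placeholders.
from argparse import Namespace

options = Namespace(
    series='CART37A',          # 7 characters, starting with CART37 or CART38
    ref='GRCh37.fa',           # reference genome FASTA
    ensembl='ensembl_75.gz',   # Ensembl transcript database
    prev_cava_db=None,         # previous CAVA db output (optional)
    prev_ref=None,             # reference genome matching the previous db
    input='carts.txt',         # input records (HGNC ID and ENST columns)
    output='enstwriter_out',   # output prefix (.gff2/.gff3 are appended)
    gbk=False,                 # also write GenBank output
    annovar=False)             # also write ANNOVAR genePred/FASTA

run(options)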
def run(options):
    # Check that the required Ensembl TXT files exist
    if not os.path.isfile(options.dataX[:-3] + '.txt'):
        print 'Error: Dataset X txt file (' + options.dataX[:-3] + '.txt) cannot be found.\n'
        quit()
    if not os.path.isfile(options.dataY[:-3] + '.txt'):
        print 'Error: Dataset Y txt file (' + options.dataY[:-3] + '.txt) cannot be found.\n'
        quit()

    input_genes = helper.read_input_genes(options.input)

    # Read transcript database X
    sys.stdout.write('GRCh37 Ensembl db [' + options.dataX + ']: READING...')
    sys.stdout.flush()
    dataX, N = helper.readEnsemblData(options.dataX)
    sys.stdout.write('\rGRCh37 Ensembl db [' + options.dataX + ']: ' + str(N) + ' transcripts')
    print ''

    # Read transcript database Y
    sys.stdout.write('GRCh38 Ensembl db [' + options.dataY + ']: READING...')
    sys.stdout.flush()
    dataY, N = helper.readEnsemblData(options.dataY)
    sys.stdout.write('\rGRCh38 Ensembl db [' + options.dataY + ']: ' + str(N) + ' transcripts')
    print '\n'

    # Initialize GRCh37 reference genome
    if not os.path.isfile(options.ref37):
        print '\nError: GRCh37 reference genome file (' + options.ref37 + ') cannot be found.\n'
        quit()
    ref_GRCh37 = reference.Reference(options.ref37)

    # Initialize GRCh38 reference genome
    if not os.path.isfile(options.ref38):
        print '\nError: GRCh38 reference genome file (' + options.ref38 + ') cannot be found.\n'
        quit()
    ref_GRCh38 = reference.Reference(options.ref38)

    ensts_37 = helper.read_enst_file(options.enstsx)
    ensts_38 = helper.read_enst_file(options.enstsy)

    # Initialize output file
    out = open(options.output, 'w')
    out.write('\t'.join(['#GENE', 'ENST_37', 'ENST_38', 'DIFFERENCE']) + '\n')

    # Iterate through the input list of transcripts
    count_gene_in_both = 0
    n_identical = 0
    n_cds_identical = 0
    i = 0  # progress counter, incremented at the top of the loop
    for g in input_genes:
        i += 1
        if g not in ensts_37 or g not in ensts_38:
            continue
        count_gene_in_both += 1
        enst1 = ensts_37[g]
        enst2 = ensts_38[g]

        flags = []
        comparewith = []

        sys.stdout.write('\rAnalysing gene ' + str(i) + '/' + str(len(input_genes)))
        sys.stdout.flush()

        if enst1 not in dataX.keys():
            flags.append('NF37')
        else:
            transcript = dataX[enst1]

        if enst2 not in dataY.keys():
            flags.append('NF38')
        else:
            comparewith = [dataY[enst2]]

        if len(flags) > 0:
            out.write('\t'.join(['HGNC:' + g, enst1, enst2, ';'.join(flags)]) + '\n')
            continue

        identical, cds_identical = helper.compare(
            g, transcript, comparewith, ref_GRCh37, ref_GRCh38, options, out)
        if identical:
            n_identical += 1
        if cds_identical:
            n_cds_identical += 1

    print ' - Done.'

    # Close output file
    out.close()

    # Goodbye message
    print '\nSummary:'
    print '- {} of the {} genes are on both ENSTs lists'.format(count_gene_in_both, len(input_genes))
    print '- ' + str(n_identical) + ' genes have identical ENSTs'
    print '- ' + str(n_cds_identical) + ' genes have CDS-identical ENSTs'
    print '\nOutput written to file: ' + options.output
def __init__(self, confpath, binpath, rootoutputpath, dividecount,
             opt_bracketing, bracketing_templatesRootPath, refpath):
    self.logTagStr = "processHelper"
    self.ready = False

    self.configPath = confpath
    self.binPath = binpath
    self.baseoutputpath = rootoutputpath
    self.logsPath = os.path.join(self.baseoutputpath, "cluster_logs")
    if not os.path.exists(self.logsPath):
        os.mkdir(self.logsPath)

    self.refPath = refpath
    if self.refPath == "":
        self.enableProcessAtZ = 0
    else:
        self.enableProcessAtZ = 1
    if self.enableProcessAtZ and dividecount != 1:
        print("ERROR: incompatible options: enableProcessAtZ and dividecount != 1. Aborting.")
        return
    if self.enableProcessAtZ:
        self.refcatalog = reference.Reference(referencepath=self.refPath,
                                              rtype="simple")

    self.proc_date = time.strftime("%Y%m%d")
    self.opt_bracketing = opt_bracketing  # choices: "", "method"
    self.bracketing_templatesRootPath = bracketing_templatesRootPath

    ret = self.loadConfig()
    if not ret:
        print("ERROR: load config failed.")
        return

    # Prepare the working dir
    self.work_process_dir = os.path.abspath("process-work")
    if not os.path.exists(self.work_process_dir):
        os.mkdir(self.work_process_dir)

    if dividecount > 0:
        outputPath = os.path.join(self.work_process_dir,
                                  "spectrumlist_subs_{}".format(dividecount))
        if not os.path.exists(outputPath):
            os.mkdir(outputPath)
        print('INFO: splitting using full path: {}'.format(outputPath))
        spclist = spectrumlist.Spectrumlist(self.config_spclistPath)
        self.subspclists = spclist.splitIntoSubsets(int(dividecount), outputPath)
        self.subrecombine_info = {}

        self.subsetsRelPath = "output_subsets"
        self.baseoutputpath = os.path.join(self.baseoutputpath, self.subsetsRelPath)
        if not os.path.exists(self.baseoutputpath):
            os.mkdir(self.baseoutputpath)
    else:
        self.subspclists = []

    self.ready = True
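A hedged instantiation example for the process helper above. The class name `ProcessHelper` and every path are placeholders; the argument names and the dividecount constraint come from the constructor itself.

# Hypothetical instantiation; all paths are placeholders. With a non-empty
# refpath, processing at the reference redshift is enabled, which the
# constructor only allows together with dividecount == 1.
proc = ProcessHelper(confpath="process.conf",
                     binpath="./bin",
                     rootoutputpath="./output",
                     dividecount=1,
                     opt_bracketing="",
                     bracketing_templatesRootPath="",
                     refpath="ref_catalog.txt")
if proc.ready:
    print("process helper initialised")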
def run(options):
    if not ((options.series.startswith('CART37')
             or options.series.startswith('CART38'))
            and len(options.series) == 7):
        print '\nSeries code incorrect!\n'
        quit()

    # ...
    selected_ensts = helper.read_selected_ensts(options.selected_ensts)

    # ...
    canonical_ensts = helper.read_canonical_ensts(options.canonical)

    # Initialize reference sequence reader
    ref = reference.Reference(options.ref)

    # Initialize transcript database writer
    tdb_writer = helper.initialize_transcript_db_writer(options)

    # Read Ensembl database
    ensembl_db = transcripts.read_ensembl_db(options.ensembl)
    ensembl_by_symbol = transcripts.read_ensembl_db_by_symbol(options.ensembl)

    # Read previous CAVA db output and reference genome if required
    if options.prev_cava_db:
        prev_ref = reference.Reference(options.prev_ref)
        prev_cava_db = helper.read_prev_cava_db(options.prev_cava_db, prev_ref)
    else:
        prev_cava_db = None

    # Initialize output files
    out_genepred, out_fasta, out_genepred_annovar, out_fasta_annovar, gbk_dir, out_id, out_excl = \
        helper.initialize_output_files(options)

    # Initialize progress info
    sys.stdout.write('Processing {} genes ... '.format(
        helper.number_of_genes(options.selected_nms)))
    sys.stdout.flush()

    # Initialize CART numbering
    cartidx = 10000 if options.prev_cava_db is None else helper.get_last_cartidx(
        options.prev_cava_db)

    # Iterate through input records
    count_excluded = 0
    count_selected = 0
    count_canonical_or_longest = 0
    gff2_lines = {}
    gff3_lines = {}
    for line in open(options.selected_nms):
        line = line.strip()
        if line == '' or line.startswith('#'):
            continue
        cols = line.split()
        symbol = cols[0]
        hgnc_id = cols[1]
        assoc_nm = cols[-1]

        if hgnc_id in selected_ensts and selected_ensts[hgnc_id] != '.':
            enst = selected_ensts[hgnc_id]
            selected = True
        elif symbol in canonical_ensts and canonical_ensts[symbol] in ensembl_db:
            enst = canonical_ensts[symbol]
            selected = False
        elif symbol in ensembl_by_symbol:
            enst = transcripts.find_longest_transcript(ensembl_by_symbol[symbol]).id
            selected = False
        else:
            out_excl.write('{}\t{}\t{}\n'.format(
                hgnc_id, symbol, 'no_selection_or_canonical_or_longest'))
            count_excluded += 1
            continue

        # Add to the list of excluded genes if ENST not found in Ensembl database
        if enst not in ensembl_db:
            out_excl.write('{}\t{}\t{}\n'.format(hgnc_id, symbol, 'not_in_ensembl_db'))
            count_excluded += 1
            continue

        try:
            # Retrieve data about ENST
            transcript = ensembl_db[enst]

            if selected:
                # Calculating CART ID
                if options.prev_cava_db is None:
                    cartidx += 1
                    cart_id = '{}{}'.format(options.series, cartidx)
                else:
                    content = (transcript.strand, len(transcript.exons),
                               helper.read_mrna_sequence(transcript, ref))
                    if hgnc_id in prev_cava_db and content == prev_cava_db[hgnc_id]['content']:
                        cart_id = '{}{}'.format(options.series,
                                                prev_cava_db[hgnc_id]['cartidx'])
                    else:
                        cartidx += 1
                        cart_id = '{}{}'.format(options.series, cartidx)
                template_id = cart_id
            else:
                template_id = enst

            # Add template ID and HGNC ID to transcript
            transcript.id = template_id
            transcript.hgnc_id = hgnc_id
            transcript.assoc_nm = assoc_nm
            transcript.assoc_enst = enst

            # Write IDs to file
            helper.output_ids(out_id, hgnc_id, template_id)

            # Add transcript to database writer
            tdb_writer.add(transcript)

            # Create content of gff2 and gff3 files
            gff2_lines = helper.create_gff2_lines(transcript, gff2_lines)
            gff3_lines = helper.create_gff3_lines(transcript, gff3_lines)

            # Write to gp file
            helper.output_genepred(transcript, out_genepred)

            # Write to gbk output
            if options.gbk:
                helper.output_gbk(transcript, ref, gbk_dir)

            # Write to fasta file
            helper.output_fasta(transcript, out_fasta, ref)

            # Write annovar files
            if options.annovar:
                helper.output_genepred(transcript, out_genepred_annovar)
                helper.output_fasta_annovar(transcript, out_fasta_annovar, ref)

            if selected:
                count_selected += 1
            else:
                count_canonical_or_longest += 1
        except:
            out_excl.write('{}\t{}\t{}\n'.format(hgnc_id, symbol, 'output_error'))
            count_excluded += 1

    # Create bgzipped, Tabix-indexed GFF2 and GFF3 outputs
    helper.output_gff2(gff2_lines, options.output + '.gff2')
    helper.output_gff3(gff3_lines, options.output + '.gff3')

    # Finalize outputs
    helper.finalize_outputs(options, tdb_writer, out_fasta, out_genepred,
                            out_genepred_annovar, out_fasta_annovar, gbk_dir,
                            out_id, out_excl)

    # Print out summary info
    helper.print_summary_info(options, count_selected,
                              count_canonical_or_longest, count_excluded)
def _open_reference(self):
    ref = reference.Reference(self._refgenome.__str__())
    opened_ref = ref.show_fasta()
    self._lgr.info("reference genome opened successfully.")
    # return an object that you can call .fetch() on
    return opened_ref
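A sketch of how another method of the same class might use the handle returned above. It only relies on the comment that the object exposes .fetch(); the method name, contig name, and coordinates are placeholders, and a pysam-style fetch(contig, start, end) signature is assumed.

# Hypothetical caller inside the same class.
def _peek_reference(self):
    fasta = self._open_reference()
    window = fasta.fetch("chr1", 10000, 10050)   # 50 bp of reference sequence (placeholder region)
    self._lgr.info("fetched %d reference bases", len(window))
    return window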
def run(self):
    '''
    Whatever the header of the reference is, it is carried over as the first
    line of the output, so the header of the reference must be clean.
    '''
    ref = reference.Reference(self._reference)
    masterlist = ref.show_masterlist()
    header = masterlist.readline().rstrip() + '\t' + self._sample_name + '\n'
    self._lgr.info("header of the output: %s", header)
    myresult = myio.PARcounterOutput(self._infilename.name)
    # self._infilename points to a Path() object
    myresult.add2content(header)

    # 1. cache the info from input BED file
    self._cached_dict = self.cache_BED_input()
    self._lgr.info("input BED cached.")

    # 2. increment on the reference
    hits = 0
    self._lgr.info("start matching with the given reference list...")
    for line in masterlist:
        ls_line = line.rstrip().split('\t')
        coords = ls_line[const.COORDS_coli]
        incrementals, hitted, self._cached_dict = utils.increment_reads_at(
            coords, self._window_size, self._cached_dict)
        # Note: self._cached_dict is mutated within utils.increment_reads_at()
        # so that previously unidentified ApA coords can be written out later.
        if hitted:
            hits += 1
            if hits % const.FIVE_HUNDRED_HITS == 0:
                print "{0} hits".format(hits)
        new_line = ls_line[const.GENES_coli] + '\t'\
            + ls_line[const.IDS_coli] + '\t'\
            + ls_line[const.TRANSCRIPTS_coli] + '\t'\
            + ls_line[const.TYPES_coli] + '\t'\
            + ls_line[const.COORDS_coli] + '\t'\
            + str(incrementals) + '\n'
        myresult.add2content(new_line)
    self._lgr.info("%s hits on the reference list were found!", str(hits))

    # write output
    myresult.open2write(myresult.content)
    ref.close_masterlist()
    self._lgr.info("PARcounter table generated.")

    '''
    The following outputs unidentified ApA coords and their read counts.
    '''
    if len(self._cached_dict) > 0:
        sideresult = myio.UnIdentifiedAPAsOutput(self._infilename.name)
        # self._infilename points to a Path() object
        sideheader = const.UID_HEADER + self._sample_name + '\n'
        sideresult.add2content(sideheader)
        counter_uidapa = 0
        for coords in self._cached_dict:
            counter_uidapa += 1
            # Behind each "coords" key in self._cached_dict is a list of BedRead
            # objects, so len(ls_bedreads) is the number of hits at that coords.
            ls_bedreads = self._cached_dict[coords]
            uid_apa_line = coords + '\t' + str(len(ls_bedreads)) + '\n'
            sideresult.add2content(uid_apa_line)
        self._lgr.info(
            "start to output %s un-identified potential ApAs from %s ...",
            str(counter_uidapa), self._infilename.name)
        sideresult.open2write(sideresult.content)
        self._lgr.info("Done!")
    else:
        self._lgr.info("There are no un-identified potential ApAs left.")