def start(self, loglevel, configfile='ARC_config.txt'):
    try:
        logger.setup(loglevel=loglevel)

        logger.info("Reading config file...")
        config = Config(configfile)
        values = config.get()

        logger.info("Setting up working directories and building indexes...")
        self.setup(values)

        spawn = Spawn(values)

        logger.info("Running ARC.")
        spawn.submit()
        spawn.run()

        logger.info("Cleaning up.")
        self.clean()
        return 0
    except FatalError as e:
        logger.error("A fatal error was encountered. \n\t%s" % str(e))
        return 1
    except (KeyboardInterrupt, SystemExit):
        self.clean()
        logger.error("%s unexpectedly terminated" % (__name__))
        return 1
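# Hedged usage sketch: start() follows the Unix convention of returning 0 on
# success and 1 on failure, so a thin CLI wrapper can hand its result straight
# to sys.exit(). "App" is a hypothetical name for the class that owns start();
# the project's real entry point may differ.
#
#     import logging
#     import sys
#
#     if __name__ == '__main__':
#         sys.exit(App().start(loglevel=logging.INFO,
#                              configfile='ARC_config.txt'))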
def PSL_to_dict(self, filename):
    """Process a PSL file to the dict format."""
    try:
        inf = open(filename, 'r')
    except Exception as inst:
        if type(inst) == IOError:
            logger.error("Failed to open mapping dictionary %s." % filename)
        raise inst
    read_map = {}
    i = 0
    startT = time.time()
    psl_header = False
    for l in inf:
        i += 1
        # Check for a PSL header and skip the first 5 lines if it exists
        if i == 1 and l.split()[0] == 'psLayout':
            psl_header = True
        if psl_header and i <= 5:
            continue
        l2 = l.strip().split("\t")
        # Column 9 is qName and column 13 is tName (0-based PSL columns).
        # keyfunction strips the unique part of PE read ids (e.g. "/1", "/2").
        readid = keyfunction(self.params['sra'])(l2[9])
        target = l2[13]
        # Handle references built using assembled contigs:
        if len(target.split("_:_")) > 1:
            target = target.split("_:_")[1]
        if target not in read_map:
            read_map[target] = {}
        read_map[target][readid] = 1
    inf.close()
    logger.info("Sample: %s, Processed %s lines from PSL in %s seconds." % (
        self.params['sample'], i, time.time() - startT))
    return read_map
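# PSL_to_dict() and setup() both rely on keyfunction(), which is defined
# elsewhere in ARC. Below is a minimal sketch of what it plausibly returns,
# inferred from the ".split('/')[0]" hint above: a callable that normalizes a
# read id so both mates of a pair share one key. The SRA branch is purely an
# assumption for illustration, based on SRA-style ids such as "SRR000001.1.1".
def keyfunction(sra):
    if sra:
        # assumed: keep "SRR000001.1" and drop the trailing mate field
        return lambda x: '.'.join(x.split('.')[:2])
    # matches the lambda passed to index_db in an earlier version of setup()
    return lambda x: x.split('/')[0]

# e.g. keyfunction(False)('read1/1') == keyfunction(False)('read1/2') == 'read1'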
def run(self):
    logger.info("Starting...")
    logger.debug("Setting up workers.")
    for i in range(self.nprocs):
        worker = ProcessRunner(i, self.q, self.status, self.stats, self.pid)
        self.workers.append(worker)
        worker.daemon = False
        worker.start()

    while True:
        try:
            self.q.join()
            # This shouldn't be needed but we will check just in case
            if self.all_workers_waiting():
                logger.debug("Workers are all waiting and the queue is empty. Exiting")
                break
            else:
                logger.debug("Workers are not in a waiting state. Waiting for more.")
                time.sleep(5)
        except exceptions.FatalError:
            logger.error("A fatal error was encountered.")
            self.killall()
            raise
        except (KeyboardInterrupt, SystemExit):
            logger.error("Terminating processes")
            self.killall()
            raise
        except Exception:
            ex_type, ex, tb = sys.exc_info()
            logger.error("\n".join(traceback.format_exception(ex_type, ex, tb)))
            logger.error("An unhandled exception occurred")
            self.killall()
            raise
        finally:
            # Kill 'em all!
            self.killall()

    logger.info("-----")
    logger.info("%d processes returned ok." % (self.stats[0]))
    logger.info("%d processes had to be rerun." % (self.stats[1]))
    logger.info("-----")
    logger.info("%d Mapper jobs run." % (self.stats[2]))
    logger.info("%d Assembly jobs run." % (self.stats[3]))
    logger.info("%d Checker jobs run." % (self.stats[4]))
    logger.info("%d Finisher jobs run." % (self.stats[5]))
    logger.info("-----")
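# Spawn.run() hinges on multiprocessing.JoinableQueue semantics: q.join()
# blocks until task_done() has been called once per put(). A self-contained
# sketch of that contract, independent of ARC's ProcessRunner:
import multiprocessing

def _consume(q):
    while True:
        item = q.get()
        if item is None:        # sentinel: stop the worker
            q.task_done()
            break
        print("processed %s" % item)
        q.task_done()           # signal completion so join() can return

if __name__ == '__main__':
    q = multiprocessing.JoinableQueue()
    for job in range(4):
        q.put(job)
    q.put(None)
    w = multiprocessing.Process(target=_consume, args=(q,))
    w.daemon = False
    w.start()
    q.join()                    # returns only after every task_done()
    w.join()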
def run(self):
    while True:
        try:
            self.waiting()
            self.launch()
            self.update_runstats()
        except exceptions.RerunnableError as e:
            logger.warn("[%s] A job needs to be rerun: %s" % (self.name, e))
            self.update_runstats(1)
        except exceptions.FatalError as e:
            logger.error("[%s] A fatal error occurred: %s" % (self.name, e))
            os.kill(self.ppid, signal.SIGINT)
        except (KeyboardInterrupt, SystemExit):
            logger.debug("Process interrupted")
        except Exception:
            ex_type, ex, tb = sys.exc_info()
            logger.error("\n".join(traceback.format_exception(ex_type, ex, tb)))
            logger.error("An unhandled exception occurred")
            os.kill(self.ppid, signal.SIGINT)
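# ProcessRunner escalates fatal errors by signalling its parent with SIGINT
# rather than raising, since an exception raised in a child process dies with
# the child. A self-contained sketch of that pattern (POSIX-only):
import os
import signal
import time
import multiprocessing

def _child(ppid):
    time.sleep(0.5)
    os.kill(ppid, signal.SIGINT)   # surfaces as KeyboardInterrupt in the parent

if __name__ == '__main__':
    p = multiprocessing.Process(target=_child, args=(os.getpid(),))
    p.start()
    try:
        time.sleep(10)
    except KeyboardInterrupt:
        print("parent interrupted by child")
    p.join()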
def error(self, msg):
    if logger.level() == logging.DEBUG:
        name = self.name
    else:
        name = self.__class__.__name__
    logger.error("%-12s| %s" % (name, msg))
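# A quick illustration of the "%-12s| %s" format used above: the name is
# left-justified in a 12-character column so log lines align across components.
if __name__ == '__main__':
    print("%-12s| %s" % ("Mapper", "read index missing"))
    # -> Mapper      | read index missing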
def setup(self, config):
    """
    Set up a working folder for each sample. Also assign a "safe_target"
    name to each target so that folder creation works. This is a little bit
    tricky because if the user has targets with the _:_ separator in the
    name it messes up the splitter and SAM_to_dict. This code is therefore
    written with the assumption that the user has put the _:_ in the name
    purposely so that multiple entries in the reference fasta will be
    treated as a single target.
    """
    format = config['format']
    for sample in config['Samples']:
        s = config['Samples'][sample]
        working_dir = os.path.realpath(
            config['workingdirectory'] + '/working_' + sample)
        finished_dir = os.path.realpath('./finished_' + sample)
        config['Samples'][sample]['working_dir'] = working_dir
        config['Samples'][sample]['finished_dir'] = finished_dir
        if os.path.exists(working_dir):
            logger.info("WARNING working directory already exists for "
                        "sample %s, deleting old results if any." % (sample))
            os.system('rm -rf %s' % finished_dir)
            os.system('rm -rf %s/t__*' % working_dir)
            os.system('rm -rf %s/*.psl' % working_dir)
            os.system('rm %s/I*_contigs.fasta' % working_dir)
            if os.path.exists('%s/idx' % working_dir):
                os.system('rm -rf %s/idx' % working_dir)
            os.mkdir(finished_dir)
        else:
            os.mkdir(working_dir)
            os.mkdir(finished_dir)

        # Create stats file:
        statsf = open(os.path.join(finished_dir, "mapping_stats.tsv"), 'w')
        statsf.write('\t'.join(['Sample', 'Target', 'Iteration', 'Reads']) + '\n')
        statsf.close()

        # Create Target Summary Table:
        tstf = open(os.path.join(finished_dir, "target_summary_table.tsv"), 'w')
        tstf.write('\t'.join(['Sample', 'Target', 'RefLen', 'Status', 'Iteration',
                              'Reads', 'Contigs', 'ContigLength']) + '\n')
        tstf.close()

        # Create a stats file for cdna:
        if config['cdna']:
            countsf = open(os.path.join(finished_dir, "isogroup_read_counts.tsv"), 'a')
            countsf.write('\t'.join(['Sample', 'Target', 'isogroup', 'readcount']) + '\n')
            countsf.close()

        # Build a separate index for each read file in the input and put it
        # in working_dir. Consider parallelizing this?
        # Note: index_db is now called even when the .idx file already exists
        # (opening an existing index is cheap) so that p1/p2 are always bound
        # for the read-count check below; the original only indexed missing
        # files, which left p1 undefined when PE1.idx was already on disk.
        try:
            start = time.time()
            if 'PE1' in s:
                index_file = os.path.join(working_dir, "PE1.idx")
                if not os.path.exists(index_file):
                    print s['PE1']
                p1 = SeqIO.index_db(index_file, s['PE1'], format,
                                    key_function=keyfunction(config['sra']))
            if 'PE2' in s:
                index_file = os.path.join(working_dir, "PE2.idx")
                if not os.path.exists(index_file):
                    print s['PE2']
                p2 = SeqIO.index_db(index_file, s['PE2'], format,
                                    key_function=keyfunction(config['sra']))
                if len(p1) != len(p2):
                    logger.error("The number of reads in %s and %s do not match, "
                                 "check the config for errors" % (s['PE1'], s['PE2']))
            if 'SE' in s:
                index_file = os.path.join(working_dir, "SE.idx")
                if not os.path.exists(index_file):
                    print s['SE']
                SeqIO.index_db(index_file, s['SE'], format,
                               key_function=keyfunction(config['sra']))
        except (KeyboardInterrupt, SystemExit):
            print "Removing partial index: %s" % index_file
            os.unlink(index_file)
            raise
        logger.info("Sample: %s, indexed reads in %s seconds." % (
            sample, time.time() - start))

    # Read through the reference: set up safe names for the targets, build
    # the Target Summary Table (indexed by original target name, following
    # ARC conventions), mask sequences if requested, and write them to a new
    # I000_contigs.fasta per sample. safe_targets is a two-way lookup,
    # holding both the safe target ID and the original target name.
    summary_stats = {}
    safe_targets = {}
    new_refsf = {}
    for sample in config['Samples']:
        s = config['Samples'][sample]
        new_refsf[sample] = open(
            os.path.join(s['working_dir'], 'I000_contigs.fasta'), 'w')

    i = 0
    for t in SeqIO.parse(config['reference'], "fasta"):
        if len(t.name.split("_:_")) == 1:
            target = t.name
        else:
            target = t.name.split("_:_")[1]
        safe_targets[target] = "t__%06d" % i
        safe_targets["t__%06d" % i] = target
        i += 1
        if target not in summary_stats:
            summary_stats[target] = {'targetLength': len(t)}
        else:
            summary_stats[target]['targetLength'] = (
                summary_stats[target]['targetLength'] + len(t))

        # Write contigs:
        if config['maskrepeats']:
            t.seq = Seq(str(mask_seq(str(t.seq), config['mapper'])))
        # Bowtie2 crashes if a contig is all 'n', so only write it out if it isn't
        if len(t) != t.seq.count('n'):
            for outf in new_refsf.values():
                SeqIO.write(t, outf, "fasta")
        else:
            writeTargetStats(finished_dir=s['finished_dir'],
                             sample=sample,
                             target=target,
                             targetLength=summary_stats[target]['targetLength'],
                             status='MaskedOut',
                             iteration=0,
                             readcount=0,
                             num_contigs=0,
                             contig_length=0)
            del summary_stats[target]

    for outf in new_refsf.values():
        outf.close()

    config['safe_targets'] = safe_targets
    config['summary_stats'] = summary_stats
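# safe_targets is deliberately a two-way dict: the same mapping answers both
# "what folder name does this target use?" and "which target does this folder
# belong to?". A self-contained sketch of the convention built in setup():
if __name__ == '__main__':
    safe_targets = {}
    for i, target in enumerate(['gene_A', 'gene_B']):
        safe_targets[target] = "t__%06d" % i
        safe_targets["t__%06d" % i] = target
    print(safe_targets['gene_A'])     # -> t__000000
    print(safe_targets['t__000001'])  # -> gene_B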