Example 1
File: app.py Project: samhunter/ARC
    def start(self, loglevel, configfile='ARC_config.txt'):
        try:
            logger.setup(loglevel=loglevel)

            logger.info("Reading config file...")
            config = Config(configfile)
            values = config.get()

            logger.info(
                "Setting up working directories and building indexes...")
            self.setup(values)

            spawn = Spawn(values)

            logger.info("Running ARC.")
            spawn.submit()
            spawn.run()

            logger.info("Cleaning up.")
            self.clean()

            return 0
        except FatalError as e:
            logger.error("A fatal error was encountered. \n\t%s" % str(e))
            return 1
        except (KeyboardInterrupt, SystemExit):
            self.clean()
            logger.error("%s unexpectedly terminated" % (__name__))
            return 1
Example 2
    def start(self, loglevel, configfile='ARC_config.txt'):
        try:
            logger.setup(loglevel=loglevel)

            logger.info("Reading config file...")
            config = Config(configfile)
            values = config.get()

            logger.info(
                "Setting up working directories and building indexes...")
            self.setup(values)

            spawn = Spawn(values)

            logger.info("Running ARC.")
            spawn.submit()
            spawn.run()

            logger.info("Cleaning up.")
            self.clean()

            return 0
        except FatalError as e:
            logger.error("A fatal error was encountered. \n\t%s" % str(e))
            return 1
        except (KeyboardInterrupt, SystemExit):
            self.clean()
            logger.error("%s unexpectedly terminated" % (__name__))
            return 1
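The start() method above returns a shell-style exit code (0 on success, 1 on failure), so it can be handed straight to sys.exit from a command-line entry point. A minimal sketch of such a wrapper, assuming a hypothetical App class exposing the start() method shown above (the argument names are illustrative, not ARC's actual CLI):

    import argparse
    import sys

    def main():
        # Collect a log level and config path, then delegate to start().
        parser = argparse.ArgumentParser(description="Run an ARC-style pipeline")
        parser.add_argument("--loglevel", default="INFO")
        parser.add_argument("--config", default="ARC_config.txt")
        args = parser.parse_args()

        app = App()  # hypothetical stand-in for the class defining start() above
        sys.exit(app.start(args.loglevel, configfile=args.config))

    if __name__ == "__main__":
        main()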
Example 3
File: mapper.py Project: kdm9/ARC
    def PSL_to_dict(self, filename):
        """Process a PSL file to the dict format """
        try:
            inf = open(filename, 'r')
        except Exception as inst:
            if type(inst) == IOError:
                logger.error("Failed to open mapping dictionary %s." %
                             filename)
            raise inst
        read_map = {}
        i = 0
        startT = time.time()

        psl_header = False

        for l in inf:
            i += 1
            # Check for PSL header and skip 5 lines if it exists
            if i == 1 and l.split()[0] == 'psLayout':
                psl_header = True
            if psl_header and i <= 5:
                continue
            l2 = l.strip().split("\t")
            readid = keyfunction(self.params['sra'])(
                l2[9])  # .split("/")[0]  # remove unique part of PE reads
            target = l2[13]
            # handle references built using assembled contigs:
            if len(target.split("_:_")) > 1:
                target = target.split("_:_")[1]
            if target not in read_map:
                read_map[target] = {}
            read_map[target][readid] = 1
        logger.info("Sample: %s, Processed %s lines from PSL in %s seconds." %
                    (self.params['sample'], i, time.time() - startT))
        return read_map
Example 4
File: mapper.py Project: ibest/ARC
    def PSL_to_dict(self, filename):
        """Process a PSL file to the dict format """
        try:
            inf = open(filename, 'r')
        except Exception as inst:
            if type(inst) == IOError:
                logger.error("Failed to open mapping dictionary %s." % filename)
            raise inst
        read_map = {}
        i = 0
        startT = time.time()

        psl_header = False

        for l in inf:
            i += 1
            # Check for PSL header and skip 5 lines if it exists
            if i == 1 and l.split()[0] == 'psLayout':
                psl_header = True
            if psl_header and i <= 5:
                continue
            l2 = l.strip().split("\t")
            readid = keyfunction(self.params['sra'])(l2[9])  # .split("/")[0]  # remove unique part of PE reads
            target = l2[13]
            # handle references built using assembled contigs:
            if len(target.split("_:_")) > 1:
                target = target.split("_:_")[1]
            if target not in read_map:
                read_map[target] = {}
            read_map[target][readid] = 1
        logger.info("Sample: %s, Processed %s lines from PSL in %s seconds." % (self.params['sample'], i, time.time() - startT))
        return read_map
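keyfunction is not shown in these snippets, but the inline comment ("remove unique part of PE reads") and the lambda used later in Example 9 (key_function=lambda x: x.split("/")[0]) suggest it collapses the /1 and /2 mate suffixes into a single shared read ID when SRA-style naming is not in use. A rough sketch under that assumption, plus the shape of the dict PSL_to_dict returns:

    def keyfunction(sra):
        # Illustrative guess at the helper used above, not ARC's actual code:
        # SRA-style read IDs are assumed to already be shared between mates,
        # otherwise strip the trailing /1 or /2 so both mates use the same key.
        if sra:
            return lambda readid: readid
        return lambda readid: readid.split("/")[0]

    # PSL_to_dict returns a nested dict keyed by target, then by read ID:
    #     {"contig_A": {"read_001": 1, "read_002": 1}, "contig_B": {...}}
    # so membership tests like "readid in read_map[target]" are O(1).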
Example 5
    def run(self):
        logger.info("Starting...")
        logger.debug("Setting up workers.")

        for i in range(self.nprocs):
            worker = ProcessRunner(i, self.q, self.status, self.stats,
                                   self.pid)
            self.workers.append(worker)
            worker.daemon = False
            worker.start()

        while True:
            try:
                self.q.join()

                # This shouldn't be needed but we will check just in case
                if self.all_workers_waiting():
                    logger.debug(
                        "Workers are all waiting and the queue is empty.  Exiting"
                    )
                    break
                else:
                    logger.debug(
                        "Workers are not in a waiting state.  Waiting for more."
                    )
                    time.sleep(5)

            except exceptions.FatalError:
                logger.error("A fatal error was encountered.")
                self.killall()
                raise
            except (KeyboardInterrupt, SystemExit):
                logger.error("Terminating processes")
                self.killall()
                raise
            except Exception as e:
                ex_type, ex, tb = sys.exc_info()
                logger.error("\n".join(
                    traceback.format_exception(ex_type, ex, tb)))
                logger.error("An unhandled exception occurred")
                self.killall()
                raise
            finally:
                # Kill 'em all!
                self.killall()

        logger.info("-----")
        logger.info("%d processes returned ok." % (self.stats[0]))
        logger.info("%d processes had to be rerun." % (self.stats[1]))
        logger.info("-----")
        logger.info("%d Mapper jobs run." % (self.stats[2]))
        logger.info("%d Assembly jobs run." % (self.stats[3]))
        logger.info("%d Checker jobs run." % (self.stats[4]))
        logger.info("%d Finisher jobs run." % (self.stats[5]))
        logger.info("-----")
Example 6
File: spawn.py Project: ibest/ARC
    def run(self):
        logger.info("Starting...")
        logger.debug("Setting up workers.")

        for i in range(self.nprocs):
            worker = ProcessRunner(
                i,
                self.q,
                self.status,
                self.stats,
                self.pid)
            self.workers.append(worker)
            worker.daemon = False
            worker.start()

        while True:
            try:
                self.q.join()

                # This shouldn't be needed but we will check just in case
                if self.all_workers_waiting():
                    logger.debug("Workers are all waiting and the queue is empty.  Exiting")
                    break
                else:
                    logger.debug("Workers are not in a waiting state.  Waiting for more.")
                    time.sleep(5)

            except exceptions.FatalError:
                logger.error("A fatal error was encountered.")
                self.killall()
                raise
            except (KeyboardInterrupt, SystemExit):
                logger.error("Terminating processes")
                self.killall()
                raise
            except Exception as e:
                ex_type, ex, tb = sys.exc_info()
                logger.error("\n".join(traceback.format_exception(ex_type, ex, tb)))
                logger.error("An unhandled exception occurred")
                self.killall()
                raise
            finally:
                # Kill 'em all!
                self.killall()

        logger.info("-----")
        logger.info("%d processes returned ok." % (self.stats[0]))
        logger.info("%d processes had to be rerun." % (self.stats[1]))
        logger.info("-----")
        logger.info("%d Mapper jobs run." % (self.stats[2]))
        logger.info("%d Assembly jobs run." % (self.stats[3]))
        logger.info("%d Checker jobs run." % (self.stats[4]))
        logger.info("%d Finisher jobs run." % (self.stats[5]))
        logger.info("-----")
Example 7
 def run(self):
     while True:
         try:
             self.waiting()
             self.launch()
             self.update_runstats()
         except exceptions.RerunnableError as e:
             logger.warn("[%s] A job needs to be rerun: %s" % (self.name, e))
             self.update_runstats(1)
         except exceptions.FatalError as e:
             logger.error("[%s] A fatal error occurred: %s" % (self.name, e))
             os.kill(self.ppid, signal.SIGINT)
         except (KeyboardInterrupt, SystemExit):
             logger.debug("Process interrupted")
         except Exception as e:
             ex_type, ex, tb = sys.exc_info()
             logger.error("\n".join(traceback.format_exception(ex_type, ex, tb)))
             logger.error("An unhandled exception occurred")
             os.kill(self.ppid, signal.SIGINT)
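This worker loop never raises out of run(): recoverable failures (RerunnableError) are counted and the loop continues, while fatal or unhandled errors are escalated by sending SIGINT to the parent process, which shows up as KeyboardInterrupt inside Spawn.run (Examples 5 and 6) and triggers killall(). A stripped-down sketch of the same escalation pattern, with illustrative names:

    import os
    import signal

    def process_jobs(queue, ppid):
        # Illustrative worker loop, not ARC's ProcessRunner: requeue recoverable
        # jobs, hand fatal failures to the supervising process via SIGINT.
        while True:
            job = queue.get()
            try:
                job()                         # stand-in for waiting()/launch()
            except RuntimeError:              # stand-in for a rerunnable error
                queue.put(job)                # requeue and try again later
            except Exception:
                os.kill(ppid, signal.SIGINT)  # escalate to the parent process
            finally:
                queue.task_done()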
Example 8
 def error(self, msg):
     if logger.level() == logging.DEBUG:
         name = self.name
     else:
         name = self.__class__.__name__
     logger.error("%-12s| %s" % (name, msg))
Example 9
File: app.py Project: samhunter/ARC
    def setup(self, config):
        """
            Set up working folder for each sample. Also assign a "safe_target"
            name to each target so that folder creation works. This is a little
            bit tricky because if the user has targets with the _:_ separator
            in the name it messes up the splitter and SAM_to_dict. This code is
            therefore written with the assumption that the user has put the _:_
            in the name purposely so that multiple entries in the reference
            fasta will be treated as a single target.
        """
        format = config['format']
        for sample in config['Samples']:
            s = config['Samples'][sample]
            working_dir = os.path.realpath(config['workingdirectory'] + '/working_' + sample)
            #working_dir = os.path.realpath('./working_' + sample)
            finished_dir = os.path.realpath('./finished_' + sample)
            config['Samples'][sample]['working_dir'] = working_dir
            config['Samples'][sample]['finished_dir'] = finished_dir
            if os.path.exists(working_dir):
                logger.info(
                    "WARNING working directory already exists for "
                    "sample %s, deleting old results if any." % (sample))
                os.system('rm -rf %s' % finished_dir)
                os.system('rm -rf %s/t__*' % working_dir)
                os.system('rm -rf %s/*.psl' % working_dir)
                os.system('rm %s/I*_contigs.fasta' % working_dir)
                if os.path.exists('%s/idx' % working_dir):
                    os.system('rm -rf %s/idx' % working_dir)
                os.mkdir(finished_dir)
            else:
                os.mkdir(working_dir)
                os.mkdir(finished_dir)

            # Create stats file:
            statsf = open(os.path.join(finished_dir, "mapping_stats.tsv"), 'w')
            statsf.write('\t'.join(
                ['Sample', 'Target', 'Iteration', 'Reads']) + '\n')
            statsf.close()

            # Create Target Summary Table
            tstf = open(os.path.join(finished_dir, "target_summary_table.tsv"), 'w')
            tstf.write('\t'.join(
                ['Sample', 'Target', 'RefLen', 'Status', 'Iteration', 'Reads', 'Contigs', 'ContigLength']) + '\n')
            tstf.close()

            #Create a stats file for cdna
            if config['cdna']:
                countsf = open(os.path.join(finished_dir, "isogroup_read_counts.tsv"), 'a')
                countsf.write('\t'.join(['Sample', 'Target', 'isogroup', 'readcount']) + '\n')
                countsf.close()

            # Build a separate index for each read file in the input, put them
            # in working_dir
            #Consider parallelizing this?
            start = time.time()
            if 'PE1' in s:
                if not os.path.exists(os.path.join(working_dir, "PE1.idx")):
                    print s['PE1']
                    p1 = SeqIO.index_db(
                        os.path.join(working_dir, "PE1.idx"),
                        s['PE1'],
                        format,
                        key_function=lambda x: x.split("/")[0])
            if 'PE2' in s:
                if not os.path.exists(os.path.join(working_dir, "PE2.idx")):
                    print s['PE2']
                    p2 = SeqIO.index_db(
                        os.path.join(working_dir, "PE2.idx"),
                        s['PE2'],
                        format,
                        key_function=lambda x: x.split("/")[0])
                    if len(p1) != len(p2):
                        logger.error("The number of reads in %s and %s do not match, "
                                     "check the config for errors" % (s['PE1'], s['PE2']))
            if 'SE' in s:
                if not os.path.exists(os.path.join(working_dir, "SE.idx")):
                    print s['SE']
                    SeqIO.index_db(
                        os.path.join(working_dir, "SE.idx"),
                        s['SE'],
                        format,
                        key_function=lambda x: x.split("/")[0])

            logger.info(
                "Sample: %s, indexed reads in %s seconds." % (
                    sample, time.time() - start))

            #Read through the references, mask them if necessary

            #mapper_params['reference'] = os.path.join(self.params['working_dir'], 'I%03d' % self.params['iteration'] + '_contigs.fasta')

        # Read through the reference, set up a set of safe names for the targets.
        # Also create the Target Summary Table which is indexed by original target name (following ARC conventions)
        # Also mask sequences and write them to a new set of output files
        #safe_targets is a two-way lookup, meaning it has both the safe target ID and the contig ID.
        summary_stats = {}
        safe_targets = {}
        new_refsf = {}
        for sample in config['Samples']:
            s = config['Samples'][sample]
            new_refsf[sample] = open(os.path.join(s['working_dir'], 'I000_contigs.fasta'), 'w')

        i = 0
        for t in SeqIO.parse(config['reference'], "fasta"):
            if len(t.name.split("_:_")) == 1:
                target = t.name
            else:
                target = t.name.split("_:_")[1]

            safe_targets[target] = "t__%06d" % i
            safe_targets["t__%06d" % i] = target
            i += 1
            if target not in summary_stats:
                summary_stats[target] = {'targetLength': len(t)}
            else:
                summary_stats[target]['targetLength'] = (summary_stats[target]['targetLength'] + len(t))

            #Write contigs:
            if config['maskrepeats']:
                #t.seq = Seq(str(mask_seq(t.seq.tostring(), config['mapper'])))
                t.seq = Seq(str(mask_seq(str(t.seq), config['mapper'])))
            #Bowtie2 crashes if a contig is all 'n' so only write it out if it isn't
            if len(t) != t.seq.count('n'):
                for outf in new_refsf.values():
                    SeqIO.write(t, outf, "fasta")
            else:
                writeTargetStats(finished_dir=s['finished_dir'],
                                 sample=sample,
                                 target=target,
                                 targetLength=summary_stats[target]['targetLength'],
                                 status='MaskedOut',
                                 iteration=0,
                                 readcount=0,
                                 num_contigs=0, contig_length=0)
                del summary_stats[target]

        config['safe_targets'] = safe_targets
        config['summary_stats'] = summary_stats
Example 10
    def setup(self, config):
        """
            Set up working folder for each sample. Also assign a "safe_target"
            name to each target so that folder creation works. This is a little
            bit tricky because if the user has targets with the _:_ separator
            in the name it messes up the splitter and SAM_to_dict. This code is
            therefore written with the assumption that the user has put the _:_
            in the name purposely so that multiple entries in the reference
            fasta will be treated as a single target.
        """
        format = config['format']
        for sample in config['Samples']:
            s = config['Samples'][sample]
            working_dir = os.path.realpath(config['workingdirectory'] +
                                           '/working_' + sample)
            #working_dir = os.path.realpath('./working_' + sample)
            finished_dir = os.path.realpath('./finished_' + sample)
            config['Samples'][sample]['working_dir'] = working_dir
            config['Samples'][sample]['finished_dir'] = finished_dir
            if os.path.exists(working_dir):
                logger.info("WARNING working directory already exists for "
                            "sample %s, deleting old results if any." %
                            (sample))
                os.system('rm -rf %s' % finished_dir)
                os.system('rm -rf %s/t__*' % working_dir)
                os.system('rm -rf %s/*.psl' % working_dir)
                os.system('rm %s/I*_contigs.fasta' % working_dir)
                if os.path.exists('%s/idx' % working_dir):
                    os.system('rm -rf %s/idx' % working_dir)
                os.mkdir(finished_dir)
            else:
                os.mkdir(working_dir)
                os.mkdir(finished_dir)

            # Create stats file:
            statsf = open(os.path.join(finished_dir, "mapping_stats.tsv"), 'w')
            statsf.write(
                '\t'.join(['Sample', 'Target', 'Iteration', 'Reads']) + '\n')
            statsf.close()

            # Create Target Summary Table
            tstf = open(os.path.join(finished_dir, "target_summary_table.tsv"),
                        'w')
            tstf.write('\t'.join([
                'Sample', 'Target', 'RefLen', 'Status', 'Iteration', 'Reads',
                'Contigs', 'ContigLength'
            ]) + '\n')
            tstf.close()

            # Create a stats file for cdna
            if config['cdna']:
                countsf = open(
                    os.path.join(finished_dir, "isogroup_read_counts.tsv"),
                    'a')
                countsf.write(
                    '\t'.join(['Sample', 'Target', 'isogroup', 'readcount']) +
                    '\n')
                countsf.close()

            # Build a separate index for each read file in the input, put them
            # in working_dir
            # Consider parallelizing this?
            try:
                start = time.time()
                if 'PE1' in s:
                    if not os.path.exists(os.path.join(working_dir,
                                                       "PE1.idx")):
                        print s['PE1']
                        index_file = os.path.join(working_dir, "PE1.idx")
                        p1 = SeqIO.index_db(index_file,
                                            s['PE1'],
                                            format,
                                            key_function=keyfunction(
                                                config['sra']))
                if 'PE2' in s:
                    if not os.path.exists(os.path.join(working_dir,
                                                       "PE2.idx")):
                        print s['PE2']
                        index_file = os.path.join(working_dir, "PE2.idx")
                        p2 = SeqIO.index_db(index_file,
                                            s['PE2'],
                                            format,
                                            key_function=keyfunction(
                                                config['sra']))
                        if len(p1) != len(p2):
                            logger.error(
                                "The number of reads in %s and %s do not match, "
                                "check the config for errors" %
                                (s['PE1'], s['PE2']))
                if 'SE' in s:
                    if not os.path.exists(os.path.join(working_dir, "SE.idx")):
                        print s['SE']
                        index_file = os.path.join(working_dir, "SE.idx")
                        SeqIO.index_db(index_file,
                                       s['SE'],
                                       format,
                                       key_function=keyfunction(config['sra']))
            except (KeyboardInterrupt, SystemExit):
                print "Removing partial index: %s" % index_file
                os.unlink(index_file)
                raise
            logger.info("Sample: %s, indexed reads in %s seconds." %
                        (sample, time.time() - start))

            # Read through the references, mask them if necessary

            # mapper_params['reference'] = os.path.join(self.params['working_dir'], 'I%03d' % self.params['iteration'] + '_contigs.fasta')

        # Read through the reference, set up a set of safe names for the targets.
        # Also create the Target Summary Table which is indexed by original target name (following ARC conventions)
        # Also mask sequences and write them to a new set of output files
        # safe_targets is a two-way lookup, meaning it has both the safe target ID and the contig ID.
        summary_stats = {}
        safe_targets = {}
        new_refsf = {}
        for sample in config['Samples']:
            s = config['Samples'][sample]
            new_refsf[sample] = open(
                os.path.join(s['working_dir'], 'I000_contigs.fasta'), 'w')

        i = 0
        for t in SeqIO.parse(config['reference'], "fasta"):
            if len(t.name.split("_:_")) == 1:
                target = t.name
            else:
                target = t.name.split("_:_")[1]

            safe_targets[target] = "t__%06d" % i
            safe_targets["t__%06d" % i] = target
            i += 1
            if target not in summary_stats:
                summary_stats[target] = {'targetLength': len(t)}
            else:
                summary_stats[target]['targetLength'] = (
                    summary_stats[target]['targetLength'] + len(t))

            # Write contigs:
            if config['maskrepeats']:
                t.seq = Seq(str(mask_seq(str(t.seq), config['mapper'])))
            # Bowtie2 crashes if a contig is all 'n' so only write it out if it isn't
            if len(t) != t.seq.count('n'):
                for outf in new_refsf.values():
                    SeqIO.write(t, outf, "fasta")
            else:
                writeTargetStats(
                    finished_dir=s['finished_dir'],
                    sample=sample,
                    target=target,
                    targetLength=summary_stats[target]['targetLength'],
                    status='MaskedOut',
                    iteration=0,
                    readcount=0,
                    num_contigs=0,
                    contig_length=0)
                del summary_stats[target]

        config['safe_targets'] = safe_targets
        config['summary_stats'] = summary_stats
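As the comment above notes, safe_targets is a two-way lookup: each original target name maps to a filesystem-safe ID of the form t__NNNNNN, and that ID maps back to the original name. A small illustration with assumed values:

    safe_targets = {"gene_ABC": "t__000000", "t__000000": "gene_ABC"}

    safe_name = safe_targets["gene_ABC"]  # "t__000000", safe for folder/contig names
    original = safe_targets[safe_name]    # "gene_ABC", restored for reporting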