Esempio n. 1
0
    def start(self, fastq_file1, fastq_file2, fastq_fileU, output_prefix, rdpPath, gene='16srrna', train=None, batchsize=10000, minQ=None, minL=0, procs=1, test=False, verbose=True, debug=False):
        """
        Start classifying double barcoded Illumina sequencing run

        Reads are pulled in batches of ``batchsize`` from the single-end input
        and/or the paired-end input, written to a per-batch FASTA output, and
        each batch is submitted to a worker pool running ``rdpCall``. Once all
        batches are finished the per-batch ``.fixrank`` files are concatenated
        into ``output_prefix + ".fixrank"`` and removed.

        Parameters:
            fastq_file1, fastq_file2: paired-end inputs; used only when both are not None.
            fastq_fileU: single-end input, or None.
            output_prefix: prefix of the per-batch temporary files and of the final output.
            rdpPath: forwarded unchanged to ``rdpCall``.
            gene: must be one of the supported gene names unless ``train`` is given.
            train: forwarded unchanged to ``rdpCall``; when not None, ``gene`` is not validated.
            batchsize: number of reads requested per batch.
            minQ, minL: forwarded to ``read.trimRead``.
            procs: number of worker processes in the pool.
            test: when true, only the first batch of each input is processed.
            verbose: enables progress messages; also stored on ``self.verbose``.
            debug: when true, the traceback of a fatal error is written to stderr.

        Returns:
            0 on success, 1 on any error or interruption (errors are reported
            on stderr, never raised to the caller).
        """
        results = {}
        pool = None

        self.verbose = verbose
        try:
            valid_genes = ('16srrna', 'fungallsu', 'fungalits_warcup', 'fungalits_unite')
            if train is None and gene not in valid_genes:
                sys.stderr.write("ERROR:[classify] parameter -g (--gene) must be one of 16srrna or fungallsu or fungalits_warcup or fungalits_unite \n")
                raise ValueError("unsupported gene: %r" % (gene,))
            # establish and open the Illumina run
            if fastq_file1 is not None and fastq_file2 is not None:
                self.runPairs = TwoReadIlluminaRun(fastq_file1, fastq_file2)
                self.runPairs.open()
            else:
                self.runPairs = None
            if fastq_fileU is not None:
                self.runSingle = OneReadIlluminaRun(fastq_fileU)
                self.runSingle.open()
            else:
                self.runSingle = None
            if self.runPairs is None and self.runSingle is None:
                sys.stderr.write("ERROR:[classify] input reads not specified, or incorrect pairs\n")
                raise ValueError("no usable input reads were specified")

            start_time = time.time()
            batch = 0
            pool = Pool(procs, maxtasksperchild=1)

            # Single-end reads are processed first, then paired-end reads. Both
            # share the running `batch` counter, which keeps the per-batch file
            # names distinct across the two inputs.
            sources = (
                (self.runSingle, lambda read: read.getFasta()),
                (self.runPairs, lambda read: read.getJoinedFasta()),
            )
            for run, to_fasta in sources:
                if run is None:
                    continue
                while True:
                    # get next batch of reads
                    reads = run.next(batchsize)
                    batch = batch + len(reads)
                    if len(reads) == 0:
                        break
                    run_out = IlluminaFastaOutput(output_prefix + "." + str(batch))
                    # process individual reads
                    for read in reads:
                        # NOTE(review): with the default minQ=None this condition is
                        # always true, so trimRead runs for every read - confirm intended.
                        if minQ != 0 or minL != 0:
                            read.trimRead(minQ, minL)
                            if read.goodRead == True:  # noqa: E712 - exact comparison kept on purpose
                                run_out.addRead(to_fasta(read))
                        else:
                            run_out.addRead(to_fasta(read))
                    # Write out reads
                    if run_out.count() > batchsize:
                        sys.stderr.write("WARNING:[classify] output count exceeds batch count\n")
                    run_out.writeReads()
                    rdp_out = output_prefix + "." + str(batch) + ".fixrank"
                    results[rdp_out] = pool.apply_async(rdpCall, (run_out.output_prefix, rdp_out, gene, train, rdpPath, self.verbose))
                    if test:
                        break

            # Poll once per second until check_status returns 0 for the submitted jobs.
            while check_status(results) != 0:
                time.sleep(1)
            # No more work will be submitted: let the workers exit and reap them.
            pool.close()
            pool.join()
            if self.verbose:
                sys.stderr.write("Combining temporary files\n")

            with open(output_prefix + ".fixrank", "wb") as outfile:
                for part in results:
                    with open(part, "rb") as infile:
                        outfile.write(infile.read())
                    os.remove(part)

            if self.verbose:
                sys.stdout.write("%s reads processed in %s minutes\n" % (batch, round((time.time() - start_time) / 60, 2)))
            self.clean(results)
            return 0
        except (KeyboardInterrupt, SystemExit):
            self.clean(results)
            sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
            return 1
        except Exception:
            self.clean(results)
            sys.stderr.write("A fatal error was encountered.\n")
            if debug:
                sys.stderr.write(traceback.format_exc())
            return 1
        finally:
            # Never leave worker processes behind, whichever path was taken.
            # terminate() after a successful close()/join() is harmless.
            if pool is not None:
                pool.terminate()
Esempio n. 2
0
    def start(self,
              fastq_file1,
              fastq_file2,
              fastq_fileU,
              output_prefix,
              rdpPath,
              gene='16srrna',
              train=None,
              batchsize=10000,
              minQ=None,
              minL=0,
              procs=1,
              test=False,
              verbose=True,
              debug=False):
        """
        Start classifying double barcoded Illumina sequencing run

        Batches of ``batchsize`` reads are taken from the single-end input
        and/or the paired-end input, written to a per-batch FASTA output and
        handed to a worker pool that runs ``rdpCall``. After every batch has
        finished, the per-batch ``.fixrank`` files are concatenated into
        ``output_prefix + ".fixrank"`` and deleted.

        Parameters:
            fastq_file1, fastq_file2: paired-end inputs; used only when both
                are not None.
            fastq_fileU: single-end input, or None.
            output_prefix: prefix of the per-batch temporary files and of the
                final output file.
            rdpPath: forwarded unchanged to ``rdpCall``.
            gene: must be one of the supported gene names unless ``train`` is
                given.
            train: forwarded unchanged to ``rdpCall``; when not None, ``gene``
                is not validated.
            batchsize: number of reads requested per batch.
            minQ, minL: forwarded to ``read.trimRead``.
            procs: number of worker processes in the pool.
            test: when true, only the first batch of each input is processed.
            verbose: enables progress messages; also stored on
                ``self.verbose``.
            debug: when true, the traceback of a fatal error is written to
                stderr.

        Returns:
            0 on success, 1 on any error or interruption (errors are reported
            on stderr, never raised to the caller).
        """
        results = {}
        pool = None

        self.verbose = verbose
        try:
            valid_genes = ('16srrna', 'fungallsu', 'fungalits_warcup',
                           'fungalits_unite')
            if train is None and gene not in valid_genes:
                sys.stderr.write(
                    "ERROR:[classify] parameter -g (--gene) must be one of 16srrna or fungallsu or fungalits_warcup or fungalits_unite \n"
                )
                raise ValueError("unsupported gene: %r" % (gene, ))
            # establish and open the Illumina run
            if fastq_file1 is not None and fastq_file2 is not None:
                self.runPairs = TwoReadIlluminaRun(fastq_file1, fastq_file2)
                self.runPairs.open()
            else:
                self.runPairs = None
            if fastq_fileU is not None:
                self.runSingle = OneReadIlluminaRun(fastq_fileU)
                self.runSingle.open()
            else:
                self.runSingle = None
            if self.runPairs is None and self.runSingle is None:
                sys.stderr.write(
                    "ERROR:[classify] input reads not specified, or incorrect pairs\n"
                )
                raise ValueError("no usable input reads were specified")

            start_time = time.time()
            batch = 0
            pool = Pool(procs, maxtasksperchild=1)

            # Single-end input is handled first, paired-end input second. The
            # `batch` counter keeps running across both, so the per-batch
            # file names never repeat.
            sources = (
                (self.runSingle, lambda read: read.getFasta()),
                (self.runPairs, lambda read: read.getJoinedFasta()),
            )
            for run, to_fasta in sources:
                if run is None:
                    continue
                while True:
                    # get next batch of reads
                    reads = run.next(batchsize)
                    batch = batch + len(reads)
                    if len(reads) == 0:
                        break
                    run_out = IlluminaFastaOutput(output_prefix + "." +
                                                  str(batch))
                    # process individual reads
                    for read in reads:
                        # NOTE(review): with the default minQ=None this test
                        # is always true, so every read is trimmed - confirm
                        # that this is intended.
                        if minQ != 0 or minL != 0:
                            read.trimRead(minQ, minL)
                            # exact comparison kept on purpose
                            if read.goodRead == True:  # noqa: E712
                                run_out.addRead(to_fasta(read))
                        else:
                            run_out.addRead(to_fasta(read))
                    # Write out reads
                    if run_out.count() > batchsize:
                        sys.stderr.write(
                            "WARNING:[classify] output count exceeds batch count\n"
                        )
                    run_out.writeReads()
                    rdp_out = output_prefix + "." + str(batch) + ".fixrank"
                    results[rdp_out] = pool.apply_async(
                        rdpCall, (run_out.output_prefix, rdp_out, gene, train,
                                  rdpPath, self.verbose))
                    if test:
                        break

            # Poll once per second until check_status returns 0 for the
            # submitted jobs.
            while check_status(results) != 0:
                time.sleep(1)
            # Nothing else will be submitted: let the workers exit, then reap
            # them so no child processes are left behind.
            pool.close()
            pool.join()
            if self.verbose:
                sys.stderr.write("Combining temporary files\n")

            with open(output_prefix + ".fixrank", "wb") as outfile:
                for part in results:
                    with open(part, "rb") as infile:
                        outfile.write(infile.read())
                    os.remove(part)

            if self.verbose:
                sys.stdout.write(
                    "%s reads processed in %s minutes\n" %
                    (batch, round((time.time() - start_time) / 60, 2)))
            self.clean(results)
            return 0
        except (KeyboardInterrupt, SystemExit):
            self.clean(results)
            sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
            return 1
        except Exception:
            self.clean(results)
            sys.stderr.write("A fatal error was encountered.\n")
            if debug:
                sys.stderr.write(traceback.format_exc())
            return 1
        finally:
            # Always stop the workers, whichever path was taken; calling
            # terminate() after a successful close()/join() is harmless.
            if pool is not None:
                pool.terminate()
Esempio n. 3
0
 def start(self,
           fastq_file1,
           fastq_file2,
           fastq_fileU,
           output_prefix,
           batchsize=100000,
           uncompressed=False,
           verbose=True,
           debug=False):
     """
     Split double barcoded Illumina sequencing run from two to four reads by sample identifier

     Reads are pulled in batches of ``batchsize`` from either the paired-end
     input or the single-end input (never both) and appended to one output
     object per distinct ``read.sample``; the outputs are written after
     every batch.

     Parameters:
         fastq_file1, fastq_file2: paired-end inputs; used only when both
             are not None.
         fastq_fileU: single-end input, or None. Supplying it together with
             a complete pair is rejected.
         output_prefix: joined with each sample name (``os.path.join``) to
             form the path handed to the per-sample output object.
         batchsize: number of reads requested per batch.
         uncompressed: forwarded unchanged to the per-sample output objects.
         verbose: enables progress messages; also stored on ``self.verbose``.
         debug: when true, the traceback of a fatal error is written to
             stderr.

     Returns:
         0 on success, 1 on invalid arguments, error or interruption
         (errors are reported on stderr, never raised to the caller).
     """
     self.verbose = verbose
     if fastq_fileU is not None and (fastq_file1 is not None
                                     and fastq_file2 is not None):
         sys.stderr.write(
             "ERROR:[SplitBySample] cannot have both paired and single reads\n"
         )
         return 1
     try:
         if fastq_file1 is not None and fastq_file2 is not None:
             self.runPairs = TwoReadIlluminaRun(fastq_file1, fastq_file2)
             self.runPairs.open()
         else:
             self.runPairs = None
         if fastq_fileU is not None:
             self.runSingle = OneReadIlluminaRun(fastq_fileU)
             self.runSingle.open()
         else:
             self.runSingle = None
         if self.runPairs is None and self.runSingle is None:
             sys.stderr.write(
                 "ERROR:[SplitBySample] input reads not specified, or incorrect pairs\n"
             )
             raise ValueError("no usable input reads were specified")
         self.run_out = {}
         # The two checks above guarantee that exactly one run is open here,
         # so a single loop serves both the paired and the single-end case.
         paired = self.runPairs is not None
         run = self.runPairs if paired else self.runSingle
         while True:
             if self.verbose:
                 sys.stderr.write("Processing sequence files.\n")
             # get next batch of reads
             reads = run.next(batchsize)
             if len(reads) == 0:
                 break
             # process individual reads, creating the output for a sample
             # the first time that sample is seen
             for read in reads:
                 sample = read.sample
                 if sample not in self.run_out:
                     out_path = os.path.join(output_prefix, sample)
                     if paired:
                         self.run_out[sample] = IlluminaTwoReadOutput(
                             out_path, uncompressed)
                     else:
                         self.run_out[sample] = IlluminaOneReadOutput(
                             out_path, uncompressed)
                 self.run_out[sample].addRead(read.getFastqSRA())
             # Write out reads for every sample seen so far
             for sample_out in self.run_out.values():
                 sample_out.writeReads()
         if self.verbose:
             sys.stderr.write("\nSplit out %s total samples in %s.\n" %
                              (len(self.run_out), output_prefix))
         return 0
     except (KeyboardInterrupt, SystemExit):
         self.clean()
         sys.stderr.write("%s unexpectedly terminated.\n" % (__name__))
         return 1
     except Exception:
         self.clean()
         sys.stderr.write("A fatal error was encountered.\n")
         if debug:
             sys.stderr.write(traceback.format_exc())
         return 1