def start(self,
          fastq_file1,
          fastq_file2,
          barcode1,
          barcode2,
          output_prefix,
          batchsize=100000,
          uncompressed=False,
          verbose=True,
          debug=False):
    """
    Convert a double barcoded two-read Illumina run into a four-read set.

    Reads are pulled from the paired FASTQ inputs `batchsize` at a time.
    Each read is expanded with `getFourReads`, using `barcode1` and
    `barcode2` as the two barcode lengths, and every batch is flushed to the
    four-read output created from `output_prefix` and `uncompressed`.

    When `verbose` is true a progress line is written to stderr after each
    batch and a summary line to stdout at the end.  `self.clean()` is called
    on every exit path.

    Returns 0 on success, 1 when interrupted or when any error occurred.
    The traceback of an error is only written when `debug` is true.
    """
    self.verbose = verbose
    try:
        # Output writer is created first, then the input run is opened.
        self.run_out = IlluminaFourReadOutput(output_prefix, uncompressed)
        self.run = TwoReadIlluminaRun(fastq_file1, fastq_file2)
        self.run.open()
        started = time.time()

        while True:
            batch = self.run.next(batchsize)
            if len(batch) == 0:
                # an empty batch marks the end of the input
                break
            for pair in batch:
                four_reads = pair.getFourReads(bc1_length=barcode1,
                                               bc2_length=barcode2)
                self.run_out.addRead(four_reads)
            self.run_out.writeReads()
            if not self.verbose:
                continue
            total = self.run.count()
            rate = round(self.run.count() / (time.time() - started), 0)
            sys.stderr.write(
                "processed %s total reads, %s Reads/second\n" % (total, rate))

        if self.verbose:
            total = self.run.count()
            minutes = round((time.time() - started) / (60), 2)
            sys.stdout.write(
                "%s reads processed in %s minutes\n" % (total, minutes))
        self.clean()
        return 0
    except (KeyboardInterrupt, SystemExit):
        self.clean()
        sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
        return 1
    except:
        self.clean()
        sys.stderr.write("A fatal error was encountered.\n")
        if debug:
            sys.stderr.write(traceback.format_exc())
        return 1
Example #2
0
 def start(self, fastq_file1, fastq_file2, output_prefix, samplesFile, batchsize=10000, uncompressed=False, output_unidentified=False, verbose=True, debug=False):
     """
     Split a double barcoded Illumina sequencing run by project.

     A sample table is loaded from `samplesFile` and one two-read output is
     created per project under `output_prefix` (plus an
     'UnidentifiedProject' output when `output_unidentified` is true).
     Reads are pulled from the paired FASTQ inputs `batchsize` at a time and
     assigned with `assignRead`.  Reads flagged `goodRead` go to their
     project's output; the rest are counted as unidentified and are only
     written when `output_unidentified` is set.

     With `verbose`, progress goes to stderr and a summary to stdout.  The
     per-project totals are always written to stdout.  `self.clean()` is
     called on every exit path.

     Returns 0 on success, 1 when interrupted or when any error occurred.
     The traceback of an error is only written when `debug` is true.
     """
     self.verbose = verbose

     def share_of_total(part):
         # An empty run has nothing to divide by.  Report 0% rather than
         # letting ZeroDivisionError turn an empty input into a
         # "fatal error".
         total = self.run.count()
         if not total:
             return 0.0
         return (float(part) / float(total)) * 100

     try:
         # read in the sample table
         sTable = sampleTable(samplesFile)
         if self.verbose:
             sys.stdout.write("sample table length: %s, and %s projects.\n" % (sTable.getSampleNumber(), len(sTable.getProjectList())))
         # setup output files, one per project
         identified_count = 0
         unidentified_count = 0
         self.run_out = {}
         for project in sTable.getProjectList():
             self.run_out[project] = IlluminaTwoReadOutput(os.path.join(output_prefix, project), uncompressed)
         if output_unidentified:
             self.run_out["Unidentified"] = IlluminaTwoReadOutput(os.path.join(output_prefix, 'UnidentifiedProject'), uncompressed)
         # establish and open the Illumina run
         self.run = TwoReadIlluminaRun(fastq_file1, fastq_file2)
         self.run.open()
         lasttime = time.time()
         while True:
             # get next batch of reads; an empty batch ends the run
             reads = self.run.next(batchsize)
             if len(reads) == 0:
                 break
             for read in reads:
                 read.assignRead(sTable)
                 if read.goodRead is True:
                     self.run_out[read.project].addRead(read.getFastq())
                     identified_count += 1
                 else:
                     unidentified_count += 1
                     if output_unidentified:
                         self.run_out["Unidentified"].addRead(read.getFastq())
             # flush every project's buffered reads
             for out in self.run_out.values():
                 out.writeReads()
             if self.verbose:
                 # NOTE(review): the trailing percentage is the *identified*
                 # share although it follows the unidentified count in the
                 # message; kept as in the original output.
                 sys.stderr.write("processed %s total reads, %s Reads/second, %s identified reads, %s unidentified reads (%s%%)\n" % (self.run.count(), round(self.run.count() / (time.time() - lasttime), 0), identified_count, unidentified_count, round(share_of_total(identified_count))))
         if self.verbose:
             sys.stdout.write("%s reads processed in %s minutes, %s (%s%%) identified\n\n" % (self.run.count(), round((time.time() - lasttime) / (60), 2), identified_count, round(share_of_total(identified_count), 1)))
         for key, out in self.run_out.items():
             sys.stdout.write("%s (%s%%)\treads found for project\t%s\n" % (out.count(), round(share_of_total(out.count()), 1), key))
         self.clean()
         return 0
     except (KeyboardInterrupt, SystemExit):
         self.clean()
         sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
         return 1
     except Exception:
         self.clean()
         if debug:
             sys.stderr.write("".join(traceback.format_exception(*sys.exc_info())))
         else:
             sys.stderr.write("A fatal error was encountered. trying turning on debug\n")
         return 1
    def start(self, fastq_file1, fastq_file2, barcode1, barcode2,
              barcodesFile, max_diff, flip, output_prefix,
              batchsize=100000, uncompressed=False, verbose=True,
              debug=False):
        """
        Convert a two-read run with inline barcodes into a four-read set.

        A barcode table is loaded from `barcodesFile`.  Reads are pulled
        from the paired FASTQ inputs `batchsize` at a time and each one is
        passed to `getFourReadsInline` together with the table, the barcode
        lengths `barcode1`/`barcode2`, `max_diff` and `flip`.  An empty
        result counts as a failed read and is dropped.  Otherwise the first
        four items are written out and a truthy fifth item counts the read
        as flipped.

        With `verbose`, progress goes to stderr and a summary to stdout.
        `self.clean()` is called on every exit path.

        Returns 0 on success, 1 when interrupted or when an error occurred.
        The traceback of an error is only written when `debug` is true.
        """
        self.verbose = verbose
        try:
            # barcode lookup table
            bcTable = barcodeTable(barcodesFile, i1_rc=False)
            if self.verbose:
                sys.stdout.write("barcode table length: %s\n" % bcTable.getLength())

            # output writer first, then the input run
            self.run_out = IlluminaFourReadOutput(output_prefix, uncompressed)
            self.run = TwoReadIlluminaRun(fastq_file1, fastq_file2)
            self.run.open()

            n_failed = 0
            n_flipped = 0
            started = time.time()
            while True:
                batch = self.run.next(batchsize)
                if len(batch) == 0:
                    # an empty batch marks the end of the input
                    break
                for pair in batch:
                    parts = pair.getFourReadsInline(bcTable,
                                                    bc1_length=barcode1,
                                                    bc2_length=barcode2,
                                                    max_diff=max_diff,
                                                    flip=flip)
                    if len(parts) == 0:
                        # failed read, nothing to write
                        n_failed += 1
                        continue
                    if parts[4]:
                        n_flipped += 1
                    self.run_out.addRead(parts[0:4])
                self.run_out.writeReads()
                if not self.verbose:
                    continue
                total = self.run.count()
                rate = round(self.run.count() / (time.time() - started), 0)
                sys.stderr.write("processed %s total reads, %s failed reads, %s flipped reads, %s Reads/second\n" % (total, n_failed, n_flipped, rate))

            if self.verbose:
                total = self.run.count()
                minutes = round((time.time() - started) / (60), 2)
                sys.stdout.write("%s reads processed, %s failed reads, %s flipped reads in %s minutes\n" % (total, n_failed, n_flipped, minutes))
            self.clean()
            return 0
        except (KeyboardInterrupt, SystemExit):
            self.clean()
            sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
            return 1
        except Exception:
            self.clean()
            sys.stderr.write("A fatal error was encountered.\n")
            if debug:
                sys.stderr.write(traceback.format_exc())
            return 1
class convertApp:
    """
    Convert two read Illumina files (barcodes processed) back to a four read set to be processed with dbcAmplicons
    """

    def __init__(self):
        self.verbose = False
        # Set by start().  None until then, so clean() can tell which
        # resources were actually created.
        self.run = None
        self.run_out = None

    def start(self, fastq_file1, fastq_file2, barcode1, barcode2, output_prefix, batchsize=100000, uncompressed=False, verbose=True, debug=False):
        """
        Start conversion of double barcoded Illumina sequencing run from two to four reads

        Reads are pulled from the paired FASTQ inputs `batchsize` at a time,
        expanded with `getFourReads` (using `barcode1`/`barcode2` as the
        barcode lengths) and written to the four-read output created from
        `output_prefix` and `uncompressed`.  With `verbose`, progress goes to
        stderr and a summary to stdout.  `clean()` runs on every exit path.

        Returns 0 on success, 1 when interrupted or when an error occurred.
        The traceback of an error is only written when `debug` is true.
        """
        self.verbose = verbose
        try:
            # setup output files
            self.run_out = IlluminaFourReadOutput(output_prefix, uncompressed)

            # establish and open the Illumina run
            self.run = TwoReadIlluminaRun(fastq_file1, fastq_file2)
            self.run.open()
            lasttime = time.time()
            while True:
                # get next batch of reads; an empty batch ends the run
                reads = self.run.next(batchsize)
                if len(reads) == 0:
                    break
                # process individual reads
                for read in reads:
                    self.run_out.addRead(read.getFourReads(bc1_length=barcode1, bc2_length=barcode2))
                # Write out reads
                self.run_out.writeReads()
                if self.verbose:
                    sys.stderr.write("processed %s total reads, %s Reads/second\n" % (self.run.count(), round(self.run.count() / (time.time() - lasttime), 0)))
            if self.verbose:
                sys.stdout.write("%s reads processed in %s minutes\n" % (self.run.count(), round((time.time() - lasttime) / (60), 2)))
            self.clean()
            return 0
        except (KeyboardInterrupt, SystemExit):
            self.clean()
            sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
            return 1
        except Exception:
            self.clean()
            sys.stderr.write("A fatal error was encountered.\n")
            if debug:
                sys.stderr.write("".join(traceback.format_exception(*sys.exc_info())))
            return 1

    def clean(self):
        """
        Close the input run and the output writer, best effort.

        Each resource is closed independently.  Previously a missing
        `self.run`, or an error while closing it, skipped the close of
        `self.run_out` and left the output unclosed.
        """
        if self.verbose:
            sys.stderr.write("Cleaning up.\n")
        for attr in ("run", "run_out"):
            resource = getattr(self, attr, None)
            if resource is None:
                continue
            try:
                resource.close()
            except Exception:
                # Best effort: a failing close must not prevent the other
                # resource from being released.
                pass
Example #5
0
    def start(self,
              fastq_file1,
              fastq_file2,
              fastq_fileU,
              output_prefix,
              rdpPath,
              gene='16srrna',
              train=None,
              batchsize=10000,
              minQ=None,
              minL=0,
              procs=1,
              test=False,
              verbose=True,
              debug=False):
        """
        Start classifying double barcoded Illumina sequencing run

        Paired reads (`fastq_file1` + `fastq_file2`) and/or single reads
        (`fastq_fileU`) are read `batchsize` at a time.  Each batch is
        optionally trimmed with `trimRead(minQ, minL)` and written to a
        temporary FASTA file named after the running read total.  An
        `rdpCall` job for that file is queued on a pool of `procs` worker
        processes.  Once `check_status` reports 0, the per-batch `.fixrank`
        files are concatenated into `output_prefix + ".fixrank"` and
        removed.

        `gene` must be one of the four built-in names unless `train` is
        given.  `test` stops each input after its first batch.
        `self.clean(results)` is called on every exit path and the worker
        pool is always shut down.

        Returns 0 on success, 1 when interrupted or when an error occurred.
        The traceback of an error is only written when `debug` is true.
        """
        results = {}
        # Created inside the try block; tracked here so the `finally`
        # clause can shut it down on every path.
        pool = None
        self.verbose = verbose
        # Loop invariant: trimming is skipped only when both limits are 0.
        trimming = minQ != 0 or minL != 0

        def classify_batches(run, to_fasta, processed):
            """Queue one RDP job per batch of `run`; return the updated read total."""
            while True:
                reads = run.next(batchsize)
                processed = processed + len(reads)
                if len(reads) == 0:
                    break
                # The running read total gives every batch a unique prefix.
                run_out = IlluminaFastaOutput(output_prefix + "." +
                                              str(processed))
                for read in reads:
                    if trimming:
                        read.trimRead(minQ, minL)
                        if read.goodRead == True:
                            run_out.addRead(to_fasta(read))
                    else:
                        run_out.addRead(to_fasta(read))
                if run_out.count() > batchsize:
                    sys.stderr.write(
                        "WARNING:[classify] output count exceeds batch count\n"
                    )
                run_out.writeReads()
                rdp_out = output_prefix + "." + str(processed) + ".fixrank"
                results[rdp_out] = pool.apply_async(
                    rdpCall, (run_out.output_prefix, rdp_out, gene, train,
                              rdpPath, self.verbose))
                if test:
                    break
            return processed

        try:
            known_genes = ('16srrna', 'fungallsu', 'fungalits_warcup',
                           'fungalits_unite')
            if train is None and gene not in known_genes:
                sys.stderr.write(
                    "ERROR:[classify] parameter -g (--gene) must be one of 16srrna or fungallsu or fungalits_warcup or fungalits_unite \n"
                )
                raise ValueError("unsupported gene: %s" % gene)
            # establish and open the Illumina run(s)
            if fastq_file1 is not None and fastq_file2 is not None:
                self.runPairs = TwoReadIlluminaRun(fastq_file1, fastq_file2)
                self.runPairs.open()
            else:
                self.runPairs = None
            if fastq_fileU is not None:
                self.runSingle = OneReadIlluminaRun(fastq_fileU)
                self.runSingle.open()
            else:
                self.runSingle = None
            if self.runPairs is None and self.runSingle is None:
                sys.stderr.write(
                    "ERROR:[classify] input reads not specified, or incorrect pairs\n"
                )
                raise ValueError("no input reads")

            lasttime = time.time()
            batch = 0
            pool = Pool(procs, maxtasksperchild=1)

            # single reads first, then pairs (joined into one sequence)
            if self.runSingle is not None:
                batch = classify_batches(self.runSingle,
                                         lambda r: r.getFasta(), batch)
            if self.runPairs is not None:
                batch = classify_batches(self.runPairs,
                                         lambda r: r.getJoinedFasta(), batch)

            # poll once a second until no job is outstanding
            while True:
                time.sleep(1)
                if check_status(results) == 0:
                    break
            # every job has finished: let the workers exit cleanly
            pool.close()
            pool.join()
            if self.verbose:
                sys.stderr.write("Combining temporary files\n")

            with open(output_prefix + ".fixrank", "wb") as outfile:
                for f in results.keys():
                    with open(f, "rb") as infile:
                        outfile.write(infile.read())
                    os.remove(f)

            if self.verbose:
                sys.stdout.write(
                    "%s reads processed in %s minutes\n" %
                    (batch, round((time.time() - lasttime) / (60), 2)))
            self.clean(results)
            return 0
        except (KeyboardInterrupt, SystemExit):
            self.clean(results)
            sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
            return 1
        except Exception:
            self.clean(results)
            sys.stderr.write("A fatal error was encountered.\n")
            if debug:
                sys.stderr.write("".join(
                    traceback.format_exception(*sys.exc_info())))
            return 1
        finally:
            # Never leave worker processes behind, whatever the outcome.
            if pool is not None:
                pool.terminate()
Example #6
0
    def start(self, fastq_file1, fastq_file2, fastq_fileU, output_prefix, rdpPath, gene='16srrna', train=None, batchsize=10000, minQ=None, minL=0, procs=1, test=False, verbose=True, debug=False):
        """
        Start classifying double barcoded Illumina sequencing run

        Single reads (`fastq_fileU`) and/or paired reads (`fastq_file1` +
        `fastq_file2`) are processed `batchsize` at a time.  Each batch is
        optionally trimmed with `trimRead(minQ, minL)`, written to a
        temporary FASTA file named after the running read total, and
        submitted as an `rdpCall` job to a pool of `procs` worker processes.
        After `check_status` reports 0, the per-batch `.fixrank` files are
        merged into `output_prefix + ".fixrank"` and deleted.

        `gene` must be one of the four built-in names unless `train` is
        given.  `test` limits each input to its first batch.
        `self.clean(results)` runs on every exit path and the worker pool is
        always shut down.

        Returns 0 on success, 1 when interrupted or when an error occurred.
        The traceback of an error is only written when `debug` is true.
        """
        results = {}
        pool = None  # created below; shut down in `finally` on every path
        self.verbose = verbose
        # Trimming is skipped only when both limits are exactly 0.
        do_trim = minQ != 0 or minL != 0

        def submit_batches(run, as_fasta, total):
            """Write each batch of `run` as FASTA, queue its RDP job, return the new read total."""
            while True:
                reads = run.next(batchsize)
                total = total + len(reads)
                if len(reads) == 0:
                    return total
                # the running total makes the temporary prefix unique per batch
                run_out = IlluminaFastaOutput(output_prefix + "." + str(total))
                for read in reads:
                    if do_trim:
                        read.trimRead(minQ, minL)
                        if read.goodRead == True:
                            run_out.addRead(as_fasta(read))
                    else:
                        run_out.addRead(as_fasta(read))
                if run_out.count() > batchsize:
                    sys.stderr.write("WARNING:[classify] output count exceeds batch count\n")
                run_out.writeReads()
                rdp_out = output_prefix + "." + str(total) + ".fixrank"
                results[rdp_out] = pool.apply_async(rdpCall, (run_out.output_prefix, rdp_out, gene, train, rdpPath, self.verbose))
                if test:
                    return total

        try:
            if train is None and gene not in ('16srrna', 'fungallsu', 'fungalits_warcup', 'fungalits_unite'):
                sys.stderr.write("ERROR:[classify] parameter -g (--gene) must be one of 16srrna or fungallsu or fungalits_warcup or fungalits_unite \n")
                raise ValueError("unsupported gene: %s" % gene)
            # establish and open the Illumina run(s)
            if fastq_file1 is not None and fastq_file2 is not None:
                self.runPairs = TwoReadIlluminaRun(fastq_file1, fastq_file2)
                self.runPairs.open()
            else:
                self.runPairs = None
            if fastq_fileU is not None:
                self.runSingle = OneReadIlluminaRun(fastq_fileU)
                self.runSingle.open()
            else:
                self.runSingle = None
            if self.runPairs is None and self.runSingle is None:
                sys.stderr.write("ERROR:[classify] input reads not specified, or incorrect pairs\n")
                raise ValueError("no input reads")

            lasttime = time.time()
            batch = 0
            pool = Pool(procs, maxtasksperchild=1)

            # For OneReadIllumina:
            if self.runSingle is not None:
                batch = submit_batches(self.runSingle, lambda r: r.getFasta(), batch)
            # For TwoReadIllumina:
            if self.runPairs is not None:
                batch = submit_batches(self.runPairs, lambda r: r.getJoinedFasta(), batch)

            # wait, polling once a second, until no job is outstanding
            allfinished = False
            while not allfinished:
                time.sleep(1)
                allfinished = check_status(results) == 0
            # all jobs are done, so the workers can exit cleanly
            pool.close()
            pool.join()
            if self.verbose:
                sys.stderr.write("Combining temporary files\n")

            with open(output_prefix + ".fixrank", "wb") as outfile:
                for f in results.keys():
                    with open(f, "rb") as infile:
                        outfile.write(infile.read())
                    os.remove(f)

            if self.verbose:
                sys.stdout.write("%s reads processed in %s minutes\n" % (batch, round((time.time() - lasttime) / (60), 2)))
            self.clean(results)
            return 0
        except (KeyboardInterrupt, SystemExit):
            self.clean(results)
            sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
            return 1
        except Exception:
            self.clean(results)
            sys.stderr.write("A fatal error was encountered.\n")
            if debug:
                sys.stderr.write("".join(traceback.format_exception(*sys.exc_info())))
            return 1
        finally:
            # Worker processes must not outlive this call.
            if pool is not None:
                pool.terminate()
Example #7
0
    def preprocPair_with_inlineBC(self,
                                  fastq_file1,
                                  fastq_file2,
                                  barcode1,
                                  barcode2,
                                  bcFile,
                                  max_diff,
                                  flip_float,
                                  output_prefix,
                                  batchsize=100000,
                                  uncompressed=False,
                                  verbose=True,
                                  debug=False):
        """
        Start conversion of double barcoded Illumina sequencing run from two to four reads

        A barcode table is loaded from `bcFile`.  Reads are pulled from the
        paired FASTQ inputs `batchsize` at a time and passed to
        `getFourReadsInline` with the barcode lengths `barcode1`/`barcode2`,
        `max_diff` and `flip_float`.  An empty result counts as a failed
        read and is dropped.  Otherwise the first four items are written out
        and a truthy fifth item counts the read as flipped.

        `self.clean()` is called on every exit path, including the
        successful one, before the result is returned.

        Returns the list of the four output file names
        (`output_prefix` + '_R1.fastq.gz' ... '_R4.fastq.gz') on success,
        or 1 when interrupted or when an error occurred.  The traceback of
        an error is only written when `debug` is true.
        """
        print('---')
        print('Running preprocPair_with_inlineBC')
        print('')
        self.verbose = verbose
        try:
            # read in barcode sequences
            bcTable = barcodeTable(bcFile, i1_rc=False)
            if self.verbose:
                sys.stdout.write("barcode table length: %s\n" %
                                 bcTable.getLength())

            # setup output files
            self.run_out = IlluminaFourReadOutput(output_prefix, uncompressed)

            # establish and open the Illumina run
            self.run = TwoReadIlluminaRun(fastq_file1, fastq_file2)
            self.run.open()
            failed_reads = 0
            flipped_reads = 0
            lasttime = time.time()
            while True:
                # get next batch of reads; an empty batch ends the run
                reads = self.run.next(batchsize)
                if len(reads) == 0:
                    break
                # process individual reads
                for read in reads:
                    tmp = read.getFourReadsInline(bcTable,
                                                  bc1_length=barcode1,
                                                  bc2_length=barcode2,
                                                  max_diff=max_diff,
                                                  flip=flip_float)
                    if len(tmp) == 0:  # failed read
                        failed_reads += 1
                        continue
                    if tmp[4]:
                        flipped_reads += 1
                    self.run_out.addRead(tmp[0:4])
                # Write out reads
                self.run_out.writeReads()
                if self.verbose:
                    sys.stderr.write(
                        "processed %s total reads, %s failed reads, %s flipped reads, %s Reads/second\n"
                        % (self.run.count(), failed_reads, flipped_reads,
                           round(self.run.count() /
                                 (time.time() - lasttime), 0)))
            if self.verbose:
                sys.stdout.write(
                    "%s reads processed, %s failed reads, %s flipped reads in %s minutes\n"
                    % (self.run.count(), failed_reads, flipped_reads,
                       round((time.time() - lasttime) / (60), 2)))
            # Release the run and the outputs BEFORE returning.  The clean-up
            # call used to sit after the return statement and never ran.
            self.clean()
            # NOTE(review): the names always end in '.fastq.gz', even when
            # `uncompressed` is true - confirm against the output writer.
            return [
                output_prefix + suffix
                for suffix in ('_R1.fastq.gz', '_R2.fastq.gz',
                               '_R3.fastq.gz', '_R4.fastq.gz')
            ]
        except (KeyboardInterrupt, SystemExit):
            self.clean()
            sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
            return 1
        except Exception:
            self.clean()
            sys.stderr.write("A fatal error was encountered.\n")
            if debug:
                sys.stderr.write("".join(
                    traceback.format_exception(*sys.exc_info())))
            return 1
Example #8
0
class convertApp:
    """
    Convert two read Illumina files (barcodes processed) back to a four read set to be processed with dbcAmplicons
    """
    def __init__(self):
        self.verbose = False
        # Set by start().  None until then, so clean() can tell which
        # resources were actually created.
        self.run = None
        self.run_out = None

    def start(self,
              fastq_file1,
              fastq_file2,
              barcode1,
              barcode2,
              barcodesFile,
              max_diff,
              flip,
              output_prefix,
              batchsize=100000,
              uncompressed=False,
              verbose=True,
              debug=False):
        """
        Start conversion of double barcoded Illumina sequencing run from two to four reads

        A barcode table is loaded from `barcodesFile`.  Reads are pulled
        from the paired FASTQ inputs `batchsize` at a time and passed to
        `getFourReadsInline` with the barcode lengths `barcode1`/`barcode2`,
        `max_diff` and `flip`.  An empty result counts as a failed read and
        is dropped.  Otherwise the first four items are written out and a
        truthy fifth item counts the read as flipped.

        With `verbose`, progress goes to stderr and a summary to stdout.
        `clean()` runs on every exit path.

        Returns 0 on success, 1 when interrupted or when an error occurred.
        The traceback of an error is only written when `debug` is true.
        """
        self.verbose = verbose
        try:
            # read in barcode sequences
            bcTable = barcodeTable(barcodesFile)
            if self.verbose:
                sys.stdout.write("barcode table length: %s\n" %
                                 bcTable.getLength())

            # setup output files
            self.run_out = IlluminaFourReadOutput(output_prefix, uncompressed)

            # establish and open the Illumina run
            self.run = TwoReadIlluminaRun(fastq_file1, fastq_file2)
            self.run.open()
            failed_reads = 0
            flipped_reads = 0
            lasttime = time.time()
            while True:
                # get next batch of reads; an empty batch ends the run
                reads = self.run.next(batchsize)
                if len(reads) == 0:
                    break
                # process individual reads
                for read in reads:
                    tmp = read.getFourReadsInline(bcTable,
                                                  bc1_length=barcode1,
                                                  bc2_length=barcode2,
                                                  max_diff=max_diff,
                                                  flip=flip)
                    if len(tmp) == 0:  # failed read
                        failed_reads += 1
                        continue
                    if tmp[4]:
                        flipped_reads += 1
                    self.run_out.addRead(tmp[0:4])
                # Write out reads
                self.run_out.writeReads()
                if self.verbose:
                    sys.stderr.write(
                        "processed %s total reads, %s failed reads, %s flipped reads, %s Reads/second\n"
                        % (self.run.count(), failed_reads, flipped_reads,
                           round(self.run.count() /
                                 (time.time() - lasttime), 0)))
            if self.verbose:
                sys.stdout.write(
                    "%s reads processed, %s failed reads, %s flipped reads in %s minutes\n"
                    % (self.run.count(), failed_reads, flipped_reads,
                       round((time.time() - lasttime) / (60), 2)))
            self.clean()
            return 0
        except (KeyboardInterrupt, SystemExit):
            self.clean()
            sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
            return 1
        except Exception:
            self.clean()
            sys.stderr.write("A fatal error was encountered.\n")
            if debug:
                sys.stderr.write("".join(
                    traceback.format_exception(*sys.exc_info())))
            return 1

    def clean(self):
        """
        Close the input run and the output writer, best effort.

        Each resource is closed independently.  Previously a missing
        `self.run`, or an error while closing it, skipped the close of
        `self.run_out` and left the output unclosed.
        """
        if self.verbose:
            sys.stderr.write("Cleaning up.\n")
        for attr in ("run", "run_out"):
            resource = getattr(self, attr, None)
            if resource is None:
                continue
            try:
                resource.close()
            except Exception:
                # Best effort: a failing close must not prevent the other
                # resource from being released.
                pass
Example #9
0
 def start(self,
           fastq_file1,
           fastq_file2,
           fastq_fileU,
           output_prefix,
           batchsize=100000,
           uncompressed=False,
           verbose=True,
           debug=False):
     """
     Split a sequencing run into one output per sample identifier.

     Exactly one kind of input is processed: paired reads (`fastq_file1`
     + `fastq_file2`) or single reads (`fastq_fileU`).  Supplying both is
     rejected with return code 1 before anything is opened.  Reads are
     pulled `batchsize` at a time.  The first read seen for a sample
     creates that sample's output under `output_prefix`, and every read is
     added to its sample's output via `getFastqSRA`.

     Returns 0 on success, 1 when interrupted or when an error occurred.
     The traceback of an error is only written when `debug` is true.

     NOTE(review): `self.clean()` is only invoked from the error handlers,
     not on the successful return - confirm that is intended.
     """
     self.verbose = verbose
     paired_given = fastq_file1 is not None and fastq_file2 is not None
     if fastq_fileU is not None and paired_given:
         sys.stderr.write(
             "ERROR:[SplitBySample] cannot have both paired and single reads\n"
         )
         return 1

     def split(run, make_output):
         # Drain `run` batch by batch, routing each read to its sample's
         # output and creating that output on first sight of the sample.
         while True:
             if self.verbose:
                 sys.stderr.write("Processing sequence files.\n")
             batch = run.next(batchsize)
             if len(batch) == 0:
                 break
             for read in batch:
                 sample = read.sample
                 if sample not in self.run_out:
                     self.run_out[sample] = make_output(
                         os.path.join(output_prefix, sample))
                 self.run_out[sample].addRead(read.getFastqSRA())
             # flush every sample's buffered reads
             for writer in self.run_out.values():
                 writer.writeReads()
         if self.verbose:
             sys.stderr.write("\nSplit out %s total samples in %s.\n" %
                              (len(self.run_out), output_prefix))

     try:
         if paired_given:
             self.runPairs = TwoReadIlluminaRun(fastq_file1, fastq_file2)
             self.runPairs.open()
         else:
             self.runPairs = None
         if fastq_fileU is None:
             self.runSingle = None
         else:
             self.runSingle = OneReadIlluminaRun(fastq_fileU)
             self.runSingle.open()
         if self.runPairs is None and self.runSingle is None:
             sys.stderr.write(
                 "ERROR:[SplitBySample] input reads not specified, or incorrect pairs\n"
             )
             raise Exception
         self.run_out = {}
         if self.runPairs is not None:
             split(self.runPairs,
                   lambda path: IlluminaTwoReadOutput(path, uncompressed))
             return 0
         if self.runSingle is not None:
             split(self.runSingle,
                   lambda path: IlluminaOneReadOutput(path, uncompressed))
             return 0
     except (KeyboardInterrupt, SystemExit):
         self.clean()
         sys.stderr.write("%s unexpectedly terminated.\n" % (__name__))
         return 1
     except:
         self.clean()
         sys.stderr.write("A fatal error was encountered.\n")
         if debug:
             sys.stderr.write(traceback.format_exc())
         return 1
Example #10
0
 def start(self,
           fastq_file1,
           fastq_file2,
           output_prefix,
           samplesFile,
           batchsize=10000,
           uncompressed=False,
           output_unidentified=False,
           verbose=True,
           debug=False):
     """
     Split a double barcoded Illumina sequencing run by project.

     Reads are pulled from the paired fastq files in batches, each read is
     assigned against the sample table, and every identified read is added
     to the output of its project. Outputs are flushed once per batch.

     Parameters:
         fastq_file1, fastq_file2: passed to TwoReadIlluminaRun to open
             the run.
         output_prefix: path joined with each project name to form that
             project's output location.
         samplesFile: passed to sampleTable to build the sample table.
         batchsize: number of reads requested from the run per batch.
         uncompressed: forwarded unchanged to every IlluminaTwoReadOutput.
         output_unidentified: when true, reads that are not flagged as
             good are written to an 'UnidentifiedProject' output instead
             of only being counted.
         verbose: when true, progress is written to stderr and the run
             totals to stdout.
         debug: when true, a fatal error writes its traceback to stderr.

     Returns:
         0 on success; 1 when interrupted or when any other error is
         raised (the error itself is never propagated to the caller).
     """
     self.verbose = verbose

     def percent(part, whole, ndigits=None):
         # Share of `part` in `whole` as a rounded percentage. A run with
         # no reads has whole == 0; report 0 instead of letting a
         # ZeroDivisionError turn an empty input into a "fatal error".
         share = (float(part) / float(whole)) * 100 if whole else 0.0
         if ndigits is None:
             return round(share)
         return round(share, ndigits)

     try:
         # read in the sample table used below to assign reads
         sTable = sampleTable(samplesFile)
         if self.verbose:
             sys.stdout.write(
                 "sample table length: %s, and %s projects.\n" %
                 (sTable.getSampleNumber(), len(sTable.getProjectList())))
         # setup output files, one per project in the sample table
         identified_count = 0
         unidentified_count = 0
         self.run_out = {}
         for project in sTable.getProjectList():
             self.run_out[project] = IlluminaTwoReadOutput(
                 os.path.join(output_prefix, project), uncompressed)
         if output_unidentified:
             self.run_out["Unidentified"] = IlluminaTwoReadOutput(
                 os.path.join(output_prefix, 'UnidentifiedProject'),
                 uncompressed)
         # establish and open the Illumina run
         self.run = TwoReadIlluminaRun(fastq_file1, fastq_file2)
         self.run.open()
         lasttime = time.time()
         while 1:
             # get next batch of reads; an empty batch ends the run
             reads = self.run.next(batchsize)
             if len(reads) == 0:
                 break
             # process individual reads
             for read in reads:
                 read.assignRead(sTable)  # barcode
                 if read.goodRead is True:
                     self.run_out[read.project].addRead(read.getFastq())
                     identified_count += 1
                 else:
                     unidentified_count += 1
                     if output_unidentified:
                         self.run_out["Unidentified"].addRead(
                             read.getFastq())
             # Write out reads
             for key in self.run_out:
                 self.run_out[key].writeReads()
             if self.verbose:
                 total = self.run.count()
                 elapsed = time.time() - lasttime
                 # a coarse clock can report a zero interval; avoid
                 # dividing by it
                 rate = round(total / elapsed, 0) if elapsed > 0 else 0.0
                 sys.stderr.write(
                     "processed %s total reads, %s Reads/second, %s identified reads, %s unidentified reads (%s%%)\n"
                     % (total, rate, identified_count, unidentified_count,
                        percent(identified_count, total)))
         total = self.run.count()
         if self.verbose:
             sys.stdout.write(
                 "%s reads processed in %s minutes, %s (%s%%) identified\n\n"
                 % (total, round((time.time() - lasttime) / (60), 2),
                    identified_count, percent(identified_count, total, 1)))
         # per-project totals are written regardless of the verbose flag
         for key in self.run_out:
             project_reads = self.run_out[key].count()
             sys.stdout.write(
                 "%s (%s%%)\treads found for project\t%s\n" %
                 (project_reads, percent(project_reads, total, 1), key))
         self.clean()
         return 0
     except (KeyboardInterrupt, SystemExit):
         self.clean()
         sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
         return 1
     except:
         # Deliberate catch-all: every remaining failure is converted
         # into return code 1 so the method never raises to its caller.
         self.clean()
         if debug:
             sys.stderr.write("".join(
                 traceback.format_exception(*sys.exc_info())))
         else:
             sys.stderr.write(
                 "A fatal error was encountered. trying turning on debug\n")
         return 1