Example 1
def make_stats(data, raws):
    """ reads in pickled stats, collates, and writes to file """
    ## stats for each rawdata file
    perfile = {}
    for rawtuple in raws:
        handle = os.path.splitext(os.path.basename(rawtuple[0]))[0]
        perfile[handle] = {}
        perfile[handle]["ftotal"] = 0
        perfile[handle]["fcutfound"] = 0
        perfile[handle]["fmatched"] = 0

    ## stats for each sample
    fdbars = {}
    fsamplehits = Counter()
    fbarhits = Counter()
    fmisses = Counter()

    ## get stats from each file pickle
    pickles = glob.glob(os.path.join(data.dirs.fastqs, "*.pickle"))
    for picfile in pickles:
        with open(picfile, "rb") as pickin:
            filestats, samplestats = pickle.load(pickin)

        #counts = [total, cutfound, matched]
        handle, total, cutfound, matched = filestats
        samplehits, barhits, misses, dbars = samplestats

        ## update file stats
        perfile[handle]["ftotal"] += total
        perfile[handle]["fcutfound"] += cutfound
        perfile[handle]["fmatched"] += matched    

        ## update sample stats
        fsamplehits.update(samplehits)
        fbarhits.update(barhits)        
        fmisses.update(misses)
        fdbars.update(dbars)


    data.statsfiles.s1 = os.path.join(data.dirs.fastqs, 
                                      's1_demultiplex_stats.txt')
    outfile = open(data.statsfiles.s1, 'w')

    ## how many from each rawfile
    outfile.write('{:<35}  {:>13}{:>13}{:>13}\n'.\
                  format("raw_file", "total_reads", 
                         "cut_found", "bar_matched"))
    ## sort rawfile names
    rawfilenames = sorted(perfile)
    for rawstat in rawfilenames:
        dat = [perfile[rawstat][i] for i in ["ftotal", "fcutfound", "fmatched"]]
        outfile.write('{:<35}  {:>13}{:>13}{:>13}\n'.\
            format(*[rawstat]+[str(i) for i in dat]))
        if "pair" in data.paramsdict["datatype"]:
            rawstat2 = rawstat.replace("_R1_", "_R2_")
            outfile.write('{:<35}  {:>13}{:>13}{:>13}\n'.\
                format(*[rawstat2]+[str(i) for i in dat]))

    ## spacer, how many records for each sample
    outfile.write('\n{:<35}  {:>13}\n'.\
                  format("sample_name", "total_R1_reads"))

    ## names alphabetical
    names_sorted = sorted(data.barcodes)
    for name in names_sorted:
        outfile.write("{:<35}  {:>13}\n".format(name, fsamplehits[name]))

    ## spacer, which barcodes were found
    outfile.write('\n{:<35}  {:>13}{:>13}{:>13}\n'.\
                  format("sample_name", "true_bar", "obs_bar", "N_records"))

    ## write sample results
    for name in names_sorted:
        ## write perfect hit
        hit = data.barcodes[name]
        outfile.write('{:<35}  {:>13}{:>13}{:>13}\n'.\
            format(name, hit, hit, fsamplehits[name]))

        ## write off-n hits
        ## sort list of off-n hits
        if name in fdbars:
            offkeys = list(fdbars.get(name))
            offkeys.sort(key=fbarhits.get)
            for offhit in offkeys[::-1]:
                ## exclude perfect hit
                if offhit not in data.barcodes.values():
                    outfile.write('{:<35}  {:>13}{:>13}{:>13}\n'.\
                        format(name, hit, offhit, fbarhits[offhit]))

    ## write misses
    misskeys = list(fmisses.keys())
    misskeys.sort(key=fmisses.get)
    for key in misskeys[::-1]:
        outfile.write('{:<35}  {:>13}{:>13}{:>13}\n'.\
            format("no_match", "_", key, fmisses[key]))

    outfile.close()

    ## Link Sample with this data file to the Assembly object
    for name in data.barcodes:
        sample = Sample()
        sample.name = name
        sample.barcode = data.barcodes[name]
        if "pair" in data.paramsdict["datatype"]:
            sample.files.fastqs = [(os.path.join(data.dirs.fastqs,
                                                  name+"_R1_.fastq.gz"),
                                     os.path.join(data.dirs.fastqs,
                                                  name+"_R2_.fastq.gz"))]
        else:
            sample.files.fastqs = [(os.path.join(data.dirs.fastqs,
                                                  name+"_R1_.fastq.gz"),)]
        sample.stats["reads_raw"] = fsamplehits[name]
        if sample.stats["reads_raw"]:
            sample.stats.state = 1
            data.samples[sample.name] = sample
        else:
            print("Excluded sample: no data found for", name)
Example 2
    def store_stats(self):
        "Write stats and stores to Assembly object."

        # out file
        self.data.stats_files.s1 = os.path.join(self.data.dirs.fastqs,
                                                's1_demultiplex_stats.txt')
        outfile = open(self.data.stats_files.s1, 'w')

        # write the header for file stats ------------------------------------
        outfile.write("{:<35}  {:>13}{:>13}{:>13}\n".format(
            "raw_file", "total_reads", "cut_found", "bar_matched"))

        # write the file stats
        r1names = sorted(self.stats.perfile)
        for fname in r1names:
            dat = self.stats.perfile[fname]
            outfile.write("{:<35}  {:>13}{:>13}{:>13}\n".format(
                fname, dat[0], dat[1], dat[2]))
            # repeat for pairfile
            if 'pair' in self.data.params.datatype:
                fname = fname.replace("_R1_", "_R2_")
                outfile.write("{:<35}  {:>13}{:>13}{:>13}\n".format(
                    fname, dat[0], dat[1], dat[2]))

        # spacer, how many records for each sample --------------------------
        outfile.write("\n{:<35}  {:>13}\n".format("sample_name",
                                                  "total_reads"))

        # names alphabetical. Write to file. Will save again below to Samples.
        snames = set()
        for sname in self.data.barcodes:
            if "-technical-replicate-" in sname:
                sname = sname.rsplit("-technical-replicate", 1)[0]
            snames.add(sname)

        for sname in sorted(list(snames)):
            outfile.write("{:<35}  {:>13}\n".format(
                sname, self.stats.fsamplehits[sname]))

        # spacer, which barcodes were found ---------------------------------
        outfile.write('\n{:<35}  {:>13} {:>13} {:>13}\n'.format(
            "sample_name", "true_bar", "obs_bar", "N_records"))

        # write sample results
        for sname in sorted(self.data.barcodes):
            if "-technical-replicate-" in sname:
                fname = sname.rsplit("-technical-replicate", 1)[0]
            else:
                fname = sname

            # write perfect hit
            hit = self.data.barcodes[sname]
            offhitstring = ""

            # write off-n hits
            # sort list of off-n hits
            if fname in self.stats.fdbars:
                offkeys = list(self.stats.fdbars.get(fname))
                for offhit in offkeys[::-1]:
                    # exclude perfect hit
                    if offhit not in self.data.barcodes.values():
                        offhitstring += (
                            "{:<35}  {:>13} {:>13} {:>13}\n".format(
                                sname, hit, offhit,
                                int(self.stats.fbarhits[offhit] / 2)))
                        #sumoffhits += fbarhits[offhit]

                # write string to file
                outfile.write("{:<35}  {:>13} {:>13} {:>13}\n".format(
                    sname, hit, hit, int(self.stats.fbarhits[hit] / 2)))
                outfile.write(offhitstring)

        # write misses
        misskeys = list(self.stats.fmisses.keys())
        misskeys.sort(key=self.stats.fmisses.get)
        for key in misskeys[::-1]:
            outfile.write('{:<35}  {:>13} {:>13} {:>13}\n'.format(
                "no_match", "_", key, self.stats.fmisses[key]))
        outfile.close()

        # Link Sample with this data file to the Assembly object
        for sname in snames:

            # make the sample
            sample = Sample(sname)

            # allow multiple barcodes if its a replicate.
            barcodes = []
            for n in range(500):
                fname = "{}-technical-replicate-{}".format(sname, n)
                fbar = self.data.barcodes.get(fname)
                if fbar:
                    barcodes.append(fbar)
            if barcodes:
                sample.barcode = barcodes
            else:
                sample.barcode = self.data.barcodes[sname]

            # file names
            if 'pair' in self.data.params.datatype:
                sample.files.fastqs = [(
                    os.path.join(self.data.dirs.fastqs,
                                 sname + "_R1_.fastq.gz"),
                    os.path.join(self.data.dirs.fastqs,
                                 sname + "_R2_.fastq.gz"),
                )]
            else:
                sample.files.fastqs = [
                    (os.path.join(
                        self.data.dirs.fastqs,
                        sname + "_R1_.fastq.gz",
                    ), ""),
                ]

            # fill in the summary stats
            sample.stats["reads_raw"] = int(self.stats.fsamplehits[sname])
            # fill in the full df stats value
            sample.stats_dfs.s1["reads_raw"] = int(
                self.stats.fsamplehits[sname])

            # Only link Sample if it has data
            if sample.stats["reads_raw"]:
                sample.stats.state = 1
                self.data.samples[sample.name] = sample
            else:
                print("Excluded sample: no data found for", sname)

        # initiate s1 key for data object
        self.data.stats_dfs.s1 = self.data._build_stat("s1")

        # cleanup
        shutil.rmtree(self.tmpdir)
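
The replicate handling above collapses names like "A-technical-replicate-1" back to the base sample name "A" and then gathers every replicate barcode for that sample. A standalone sketch of the same logic with a hypothetical barcodes mapping (the range(500) scan mirrors the loop in store_stats):

# hypothetical barcodes mapping, including technical replicates
barcodes = {
    "A-technical-replicate-1": "CATCAT",
    "A-technical-replicate-2": "TTGGAA",
    "B": "AATTCC",
}

# collapse replicate suffixes to base sample names
snames = set()
for sname in barcodes:
    if "-technical-replicate-" in sname:
        sname = sname.rsplit("-technical-replicate", 1)[0]
    snames.add(sname)

# collect all barcodes belonging to each base name
sample_barcodes = {}
for sname in sorted(snames):
    reps = [barcodes.get("{}-technical-replicate-{}".format(sname, n))
            for n in range(500)]
    reps = [b for b in reps if b]
    sample_barcodes[sname] = reps if reps else barcodes[sname]

print(sample_barcodes)  # {'A': ['CATCAT', 'TTGGAA'], 'B': 'AATTCC'}
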
Example 3
    def remote_run_linker(self):
        "read in fastq files and count nreads for stats and chunking in s2."

        # local counters
        createdinc = 0

        # iterate over input files
        for ftup in self.ftuples:

            # remove file extension from name
            sname = get_name_from_file(ftup[0], None, None)

            # Create new Sample Class objects with names from files
            if sname not in self.data.samples:
                newsamp = Sample(sname)
                newsamp.stats.state = 1
                newsamp.barcode = None
                newsamp.files.fastqs = [ftup]
                self.data.samples[sname] = newsamp
                createdinc += 1

        # send jobs to engines for counting with cat/zcat | wc
        rasyncs = {}
        if createdinc:
            for sample in self.data.samples.values():

                # get zip var
                gzipped = bool(sample.files.fastqs[0][0].endswith(".gz"))

                # submit job to count lines and store async
                rasyncs[sample.name] = self.lbview.apply(
                    zbufcountlines, *(sample.files.fastqs[0][0], gzipped))

        # wait for link jobs to finish if parallel
        start = time.time()
        printstr = ("loading reads       ", "s1")
        while 1:
            fin = [i.ready() for i in rasyncs.values()]
            self.data._progressbar(len(fin), sum(fin), start, printstr)
            time.sleep(0.1)
            if len(fin) == sum(fin):
                self.data._print("")
                break

        # collect link job results
        for sname in rasyncs:
            res = rasyncs[sname].get() / 4
            self.data.samples[sname].stats.reads_raw = res
            self.data.samples[sname].stats_dfs.s1["reads_raw"] = res
            self.data.samples[sname].state = 1

        # print if data were linked
        if createdinc:
            # double for paired data
            if 'pair' in self.data.params.datatype:
                createdinc = createdinc * 2
            if self.data._cli:
                self.data._print("{} fastq files loaded to {} Samples.".format(
                    createdinc,
                    len(self.data.samples),
                ))

        # save step-1 stats. We don't want to write this to the fastq dir, b/c
        # it is not necessarily inside our project dir. Instead, we'll write
        # this file into our project dir in the case of linked_fastqs.
        self.data.stats_dfs.s1 = self.data._build_stat("s1")
        self.data.stats_files.s1 = os.path.join(
            self.data.params.project_dir,
            self.data.name + '_s1_demultiplex_stats.txt')
        with open(self.data.stats_files.s1, 'w') as outfile:
            (self.data.stats_dfs.s1.fillna(value=0).astype(
                int).to_string(outfile))
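
zbufcountlines is referenced but not shown in this example. A minimal stand-in with the same contract (count raw lines in a possibly gzipped fastq, so that lines / 4 gives the read count used above); the name and the plain line-by-line iteration are assumptions, not the project's implementation:

import gzip

def count_fastq_lines(path, gzipped):
    """Count lines in a (possibly gzipped) fastq; divide by 4 to get reads."""
    opener = gzip.open if gzipped else open
    nlines = 0
    with opener(path, "rb") as infile:
        for _ in infile:
            nlines += 1
    return nlines

# hypothetical usage mirroring the .get() / 4 step above
# reads_raw = count_fastq_lines("sampleA_R1_.fastq.gz", gzipped=True) / 4
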
Example 4
def make_stats(data, perfile, fsamplehits, fbarhits, fmisses, fdbars):
    """
    Write stats and store them to the Assembly object.
    """

    ## out file
    outhandle = os.path.join(data.dirs.fastqs, 's1_demultiplex_stats.txt')
    outfile = open(outhandle, 'w')

    ## write the header for file stats
    outfile.write('{:<35}  {:>13}{:>13}{:>13}\n'.\
                  format("raw_file", "total_reads", "cut_found", "bar_matched"))

    ## write the file stats
    r1names = sorted(perfile)
    for fname in r1names:
        dat = perfile[fname]
        #dat = [perfile[fname][i] for i in ["ftotal", "fcutfound", "fmatched"]]
        outfile.write('{:<35}  {:>13}{:>13}{:>13}\n'.\
            format(fname, dat[0], dat[1], dat[2]))
        ## repeat for pairfile
        if 'pair' in data.paramsdict["datatype"]:
            fname = fname.replace("_R1_", "_R2_")
            outfile.write('{:<35}  {:>13}{:>13}{:>13}\n'.\
                format(fname, dat[0], dat[1], dat[2]))

    ## spacer, how many records for each sample
    outfile.write('\n{:<35}  {:>13}\n'.format("sample_name", "total_reads"))

    ## names alphabetical. Write to file. Will save again below to Samples.
    names_sorted = sorted(data.barcodes)
    for name in names_sorted:
        outfile.write("{:<35}  {:>13}\n".format(name, fsamplehits[name]))

    ## spacer, which barcodes were found
    outfile.write('\n{:<35}  {:>13} {:>13} {:>13}\n'.\
                  format("sample_name", "true_bar", "obs_bar", "N_records"))

    ## write sample results
    for name in names_sorted:
        ## write perfect hit
        hit = data.barcodes[name]
        offhitstring = ""
        sumoffhits = 0
        ## write off-n hits
        ## sort list of off-n hits
        if name in fdbars:
            offkeys = list(fdbars.get(name))
            offkeys.sort(key=fbarhits.get)
            for offhit in offkeys[::-1]:
                ## exclude perfect hit
                if offhit not in data.barcodes.values():
                    offhitstring += '{:<35}  {:>13} {:>13} {:>13}\n'.\
                        format(name, hit, offhit, fbarhits[offhit])
                    sumoffhits += fbarhits[offhit]
        ## write string to file
        outfile.write('{:<35}  {:>13} {:>13} {:>13}\n'.\
            format(name, hit, hit, fsamplehits[name]-sumoffhits))
        outfile.write(offhitstring)

    ## write misses
    misskeys = list(fmisses.keys())
    misskeys.sort(key=fmisses.get)
    for key in misskeys[::-1]:
        outfile.write('{:<35}  {:>13}{:>13}{:>13}\n'.\
            format("no_match", "_", key, fmisses[key]))
    outfile.close()

    ## Link Sample with this data file to the Assembly object
    for name in data.barcodes:
        sample = Sample()
        sample.name = name
        sample.barcode = data.barcodes[name]
        if 'pair' in data.paramsdict["datatype"]:
            sample.files.fastqs = [(os.path.join(data.dirs.fastqs,
                                                  name+"_R1_.fastq.gz"),
                                     os.path.join(data.dirs.fastqs,
                                                  name+"_R2_.fastq.gz"))]
        else:
            sample.files.fastqs = [(os.path.join(data.dirs.fastqs,
                                                  name+"_R1_.fastq.gz"), "")]
        ## fill in the summary stats
        sample.stats["reads_raw"] = int(fsamplehits[name])
        ## fill in the full df stats value
        sample.stats_dfs.s1["reads_raw"] = int(fsamplehits[name])

        ## Only link Sample if it has data
        if sample.stats["reads_raw"]:
            sample.stats.state = 1
            data.samples[sample.name] = sample
        else:
            print("Excluded sample: no data found for", name)

    ## initiate s1 key for data object
    data.stats_dfs.s1 = data._build_stat("s1")
    data.stats_files.s1 = outhandle
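
In this variant the perfect-barcode row is reported as fsamplehits[name] minus the summed off-by-n hits, so the N_records column for a sample adds back up to its total. A toy illustration with made-up counts:

from collections import Counter

# made-up counts for a single sample
true_bar = "CATCAT"
fsamplehits = Counter({"sampleA": 520})   # all reads assigned to sampleA
fbarhits = Counter({"CATCAT": 500, "CATCAA": 15, "CATCTT": 5})
fdbars = {"sampleA": {"CATCAT", "CATCAA", "CATCTT"}}

# off-by-n hits exclude the true barcode itself
offhits = [b for b in fdbars["sampleA"] if b != true_bar]
sumoffhits = sum(fbarhits[b] for b in offhits)     # 20
perfect = fsamplehits["sampleA"] - sumoffhits      # 500
assert perfect + sumoffhits == fsamplehits["sampleA"]
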
Example 5
def make_stats(data, perfile, fsamplehits, fbarhits, fmisses, fdbars):
    """
    Write stats and store them to the Assembly object.
    """

    ## out file
    outhandle = os.path.join(data.dirs.fastqs, 's1_demultiplex_stats.txt')
    outfile = open(outhandle, 'w')

    ## how many from each rawfile
    outfile.write('{:<35}  {:>13}{:>13}{:>13}\n'.\
                  format("raw_file", "total_reads", "cut_found", "bar_matched"))

    ## sort rawfile names
    rawfilenames = sorted(perfile)
    for rawstat in rawfilenames:
        dat = [
            perfile[rawstat][i] for i in ["ftotal", "fcutfound", "fmatched"]
        ]
        outfile.write('{:<35}  {:>13}{:>13}{:>13}\n'.\
            format(*[rawstat]+[str(i) for i in dat]))
        if "pair" in data.paramsdict["datatype"]:
            rawstat2 = rawstat.replace("_R1_", "_R2_")
            outfile.write('{:<35}  {:>13}{:>13}{:>13}\n'.\
                format(*[rawstat2]+[str(i) for i in dat]))

    ## spacer, how many records for each sample
    outfile.write('\n{:<35}  {:>13}\n'.\
                  format("sample_name", "total_reads"))

    ## names alphabetical. Write to file. Will save again below to Samples.
    names_sorted = sorted(data.barcodes)
    for name in names_sorted:
        outfile.write("{:<35}  {:>13}\n".format(name, fsamplehits[name]))

    ## spacer, which barcodes were found
    outfile.write('\n{:<35}  {:>13}{:>13}{:>13}\n'.\
                  format("sample_name", "true_bar", "obs_bar", "N_records"))

    ## write sample results
    for name in names_sorted:
        ## write perfect hit
        hit = data.barcodes[name]
        outfile.write('{:<35}  {:>13}{:>13}{:>13}\n'.\
            format(name, hit, hit, fsamplehits[name]))

        ## write off-n hits
        ## sort list of off-n hits
        if name in fdbars:
            offkeys = list(fdbars.get(name))
            offkeys.sort(key=fbarhits.get)
            for offhit in offkeys[::-1]:
                ## exclude perfect hit
                if offhit not in data.barcodes.values():
                    outfile.write('{:<35}  {:>13}{:>13}{:>13}\n'.\
                        format(name, hit, offhit, fbarhits[offhit]))

    ## write misses
    misskeys = list(fmisses.keys())
    misskeys.sort(key=fmisses.get)
    for key in misskeys[::-1]:
        outfile.write('{:<35}  {:>13}{:>13}{:>13}\n'.\
            format("no_match", "_", key, fmisses[key]))
    outfile.close()

    ## Link Sample with this data file to the Assembly object
    for name in data.barcodes:
        sample = Sample()
        sample.name = name
        sample.barcode = data.barcodes[name]
        if "pair" in data.paramsdict["datatype"]:
            sample.files.fastqs = [(os.path.join(data.dirs.fastqs,
                                                 name + "_R1_.fastq.gz"),
                                    os.path.join(data.dirs.fastqs,
                                                 name + "_R2_.fastq.gz"))]
        else:
            sample.files.fastqs = [(os.path.join(data.dirs.fastqs,
                                                 name + "_R1_.fastq.gz"), "")]
        ## fill in the summary stats
        sample.stats["reads_raw"] = int(fsamplehits[name])
        ## fill in the full df stats value
        sample.stats_dfs.s1["reads_raw"] = int(fsamplehits[name])

        ## Only link Sample if it has data
        if sample.stats["reads_raw"]:
            sample.stats.state = 1
            data.samples[sample.name] = sample
        else:
            print("Excluded sample: no data found for", name)

    ## initiate s1 key for data object
    data.stats_dfs.s1 = data._build_stat("s1")
    data.stats_files.s1 = outhandle
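
All of these variants rely on the same fixed-width format spec: '{:<35}' left-justifies the name in a 35-character column and '{:>13}' right-justifies each count in 13. A quick demonstration of the resulting alignment with placeholder values:

# placeholder values; only the format spec is taken from the code above
header = '{:<35}  {:>13}{:>13}{:>13}\n'.format(
    "raw_file", "total_reads", "cut_found", "bar_matched")
row = '{:<35}  {:>13}{:>13}{:>13}\n'.format(
    "lane1_R1_", 1000, 950, 900)
print(header + row, end="")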