Beispiel #1
0
    def store_stats(self):
        """Write the step-1 demultiplex stats file and link Samples.

        Writes ``s1_demultiplex_stats.txt`` into ``self.data.dirs.fastqs``
        with four sections: counts per raw file, reads per sample, observed
        barcodes per sample, and unmatched barcodes. Then creates a Sample
        for every sample name (technical replicates are collapsed to their
        base name), stores those with a non-zero read count in
        ``self.data.samples``, builds the s1 stats table, and finally
        removes ``self.tmpdir``.
        """
        # out file
        self.data.stats_files.s1 = os.path.join(
            self.data.dirs.fastqs, 's1_demultiplex_stats.txt')

        # row templates: the per-file table packs its numeric columns with
        # no gap; the barcode tables separate them with a single space.
        filerow = "{:<35}  {:>13}{:>13}{:>13}\n"
        barrow = "{:<35}  {:>13} {:>13} {:>13}\n"
        samplerow = "{:<35}  {:>13}\n"

        # the context manager closes the handle even if a lookup or format
        # call raises part-way through writing.
        with open(self.data.stats_files.s1, 'w') as outfile:

            # write the header for file stats --------------------------------
            outfile.write(filerow.format(
                "raw_file", "total_reads", "cut_found", "bar_matched"))

            # write the file stats
            for fname in sorted(self.stats.perfile):
                dat = self.stats.perfile[fname]
                outfile.write(filerow.format(fname, dat[0], dat[1], dat[2]))
                # repeat the same counts under the R2 file name
                if 'pair' in self.data.params.datatype:
                    outfile.write(filerow.format(
                        fname.replace("_R1_", "_R2_"),
                        dat[0], dat[1], dat[2]))

            # spacer, how many records for each sample -----------------------
            outfile.write("\n" + samplerow.format(
                "sample_name", "total_reads"))

            # collapse technical replicates onto their base sample name.
            # This set is reused below when creating Samples.
            snames = set()
            for sname in self.data.barcodes:
                if "-technical-replicate-" in sname:
                    sname = sname.rsplit("-technical-replicate", 1)[0]
                snames.add(sname)

            # names alphabetical
            for sname in sorted(snames):
                outfile.write(samplerow.format(
                    sname, self.stats.fsamplehits[sname]))

            # spacer, which barcodes were found ------------------------------
            outfile.write("\n" + barrow.format(
                "sample_name", "true_bar", "obs_bar", "N_records"))

            # write sample results
            for sname in sorted(self.data.barcodes):
                if "-technical-replicate-" in sname:
                    fname = sname.rsplit("-technical-replicate", 1)[0]
                else:
                    fname = sname

                # the true barcode of this (possibly replicate) entry
                hit = self.data.barcodes[sname]

                # rows are only written for samples with recorded barcodes
                if fname not in self.stats.fdbars:
                    continue

                # collect off-n hits, walking the recorded barcodes in
                # reverse and skipping any that is a true barcode
                offrows = []
                for offhit in list(self.stats.fdbars.get(fname))[::-1]:
                    if offhit not in self.data.barcodes.values():
                        offrows.append(barrow.format(
                            sname, hit, offhit,
                            int(self.stats.fbarhits[offhit] / 2)))

                # perfect hit first, then the off-n hits
                outfile.write(barrow.format(
                    sname, hit, hit, int(self.stats.fbarhits[hit] / 2)))
                outfile.write("".join(offrows))

            # write misses, most frequent first
            for key in reversed(
                    sorted(self.stats.fmisses, key=self.stats.fmisses.get)):
                outfile.write(barrow.format(
                    "no_match", "_", key, self.stats.fmisses[key]))

        # Link Sample with this data file to the Assembly object
        for sname in snames:

            # make the sample
            sample = Sample(sname)

            # allow multiple barcodes if its a replicate: probe the
            # replicate-suffixed keys 0..499 and keep every one that exists.
            barcodes = []
            for n in range(500):
                fbar = self.data.barcodes.get(
                    "{}-technical-replicate-{}".format(sname, n))
                if fbar:
                    barcodes.append(fbar)
            if barcodes:
                sample.barcode = barcodes
            else:
                sample.barcode = self.data.barcodes[sname]

            # file names; single-end data gets an empty R2 slot
            r1path = os.path.join(
                self.data.dirs.fastqs, sname + "_R1_.fastq.gz")
            if 'pair' in self.data.params.datatype:
                r2path = os.path.join(
                    self.data.dirs.fastqs, sname + "_R2_.fastq.gz")
                sample.files.fastqs = [(r1path, r2path)]
            else:
                sample.files.fastqs = [(r1path, "")]

            # fill in the summary stats and the full df stats value
            nreads = int(self.stats.fsamplehits[sname])
            sample.stats["reads_raw"] = nreads
            sample.stats_dfs.s1["reads_raw"] = nreads

            # Only link Sample if it has data
            if sample.stats["reads_raw"]:
                sample.stats.state = 1
                self.data.samples[sample.name] = sample
            else:
                print("Excluded sample: no data found for", sname)

        # initiate s1 key for data object
        self.data.stats_dfs.s1 = self.data._build_stat("s1")

        # cleanup
        shutil.rmtree(self.tmpdir)
Beispiel #2
0
def make_stats(data, raws):
    """Collate pickled demultiplexing stats, write them out, link Samples.

    Every ``*.pickle`` file in ``data.dirs.fastqs`` is loaded and its
    counts are summed per raw file and per sample. The totals are written
    to ``s1_demultiplex_stats.txt`` in the same directory (the path is
    stored on ``data.statsfiles.s1``). A Sample is then created for each
    name in ``data.barcodes`` and stored in ``data.samples`` when it has a
    non-zero read count.

    Parameters
    ----------
    data : Assembly-like object
        Must provide ``dirs.fastqs``, ``statsfiles``, ``paramsdict``,
        ``barcodes`` and ``samples``.
    raws : iterable of tuples
        Raw file tuples; only the first element of each is used, and its
        basename without the final extension becomes the per-file key.
    """
    ## stats for each rawdata file
    perfile = {}
    for rawtuple in raws:
        handle = os.path.splitext(os.path.basename(rawtuple[0]))[0]
        perfile[handle] = {"ftotal": 0, "fcutfound": 0, "fmatched": 0}

    ## stats for each sample
    fdbars = {}
    fsamplehits = Counter()
    fbarhits = Counter()
    fmisses = Counter()

    ## get stats from each file pickle
    for picfile in glob.glob(os.path.join(data.dirs.fastqs, "*.pickle")):
        ## NOTE(review): pickle.load can execute arbitrary code; this is
        ## only safe while the fastqs dir holds pickles we wrote ourselves.
        with open(picfile, "rb") as pickin:
            filestats, samplestats = pickle.load(pickin)

        ## counts = [total, cutfound, matched]
        handle, total, cutfound, matched = filestats
        samplehits, barhits, misses, dbars = samplestats

        ## update file stats
        counts = perfile[handle]
        counts["ftotal"] += total
        counts["fcutfound"] += cutfound
        counts["fmatched"] += matched

        ## update sample stats
        fsamplehits.update(samplehits)
        fbarhits.update(barhits)
        fmisses.update(misses)
        fdbars.update(dbars)

    data.statsfiles.s1 = os.path.join(data.dirs.fastqs,
                                      's1_demultiplex_stats.txt')
    row = '{:<35}  {:>13}{:>13}{:>13}\n'
    names_sorted = sorted(data.barcodes)

    ## the context manager closes the handle even if a write fails
    with open(data.statsfiles.s1, 'w') as outfile:

        ## how many from each rawfile
        outfile.write(row.format(
            "raw_file", "total_reads", "cut_found", "bar_matched"))

        ## sorted rawfile names
        for rawstat in sorted(perfile):
            dat = [
                str(perfile[rawstat][i])
                for i in ["ftotal", "fcutfound", "fmatched"]
            ]
            outfile.write(row.format(rawstat, *dat))
            ## the R2 file gets the same counts as its R1 mate
            if "pair" in data.paramsdict["datatype"]:
                outfile.write(row.format(
                    rawstat.replace("_R1_", "_R2_"), *dat))

        ## spacer, how many records for each sample
        outfile.write('\n{:<35}  {:>13}\n'.format(
            "sample_name", "total_R1_reads"))

        ## names alphabetical
        for name in names_sorted:
            outfile.write("{:<35}  {:>13}\n".format(name, fsamplehits[name]))

        ## spacer, which barcodes were found
        outfile.write('\n' + row.format(
            "sample_name", "true_bar", "obs_bar", "N_records"))

        ## write sample results
        for name in names_sorted:
            ## write perfect hit
            hit = data.barcodes[name]
            outfile.write(row.format(name, hit, hit, fsamplehits[name]))

            ## write off-n hits, most frequent first
            if name in fdbars:
                offkeys = sorted(fdbars.get(name), key=fbarhits.get)
                for offhit in reversed(offkeys):
                    ## exclude perfect hit
                    if offhit not in data.barcodes.values():
                        outfile.write(row.format(
                            name, hit, offhit, fbarhits[offhit]))

        ## write misses, most frequent first
        for key in reversed(sorted(fmisses, key=fmisses.get)):
            outfile.write(row.format("no_match", "_", key, fmisses[key]))

    ## Link Sample with this data file to the Assembly object
    for name in data.barcodes:
        sample = Sample()
        sample.name = name
        sample.barcode = data.barcodes[name]
        r1path = os.path.join(data.dirs.fastqs, name + "_R1_.fastq.gz")
        if "pair" in data.paramsdict["datatype"]:
            r2path = os.path.join(data.dirs.fastqs, name + "_R2_.fastq.gz")
            sample.files.fastqs = [(r1path, r2path)]
        else:
            sample.files.fastqs = [(r1path,)]
        sample.stats["reads_raw"] = fsamplehits[name]
        ## only keep Samples that actually received reads
        if sample.stats["reads_raw"]:
            sample.stats.state = 1
            data.samples[sample.name] = sample
        else:
            print("Excluded sample: no data found for", name)
Beispiel #3
0
    def remote_run_linker(self):
        """Link fastq files to Samples and count their reads for step 1.

        A Sample is created for every file tuple in ``self.ftuples`` whose
        name is not yet in ``self.data.samples``. If any Sample was created,
        a line-counting job is submitted through ``self.lbview`` for the
        first file of every Sample; the line count divided by four is stored
        as ``reads_raw``. The s1 stats table is then written to
        ``<project_dir>/<name>_s1_demultiplex_stats.txt``.
        """
        # local counters
        createdinc = 0

        # iterate over input files
        for ftup in self.ftuples:

            # remove file extension from name
            sname = get_name_from_file(ftup[0], None, None)

            # Create new Sample Class objects with names from files
            if sname not in self.data.samples:
                newsamp = Sample(sname)
                newsamp.stats.state = 1
                newsamp.barcode = None
                newsamp.files.fastqs = [ftup]
                self.data.samples[sname] = newsamp
                createdinc += 1

        # send jobs to engines for counting with cat/zcat | wc
        rasyncs = {}
        if createdinc:
            for sample in self.data.samples.values():

                # only the first R1 file of each Sample is counted
                r1file = sample.files.fastqs[0][0]
                gzipped = bool(r1file.endswith(".gz"))

                # submit job to count lines and store async
                rasyncs[sample.name] = self.lbview.apply(
                    zbufcountlines, r1file, gzipped)

        # wait for link jobs to finish if parallel
        start = time.time()
        printstr = ("loading reads       ", "s1")
        while True:
            fin = [i.ready() for i in rasyncs.values()]
            self.data._progressbar(len(fin), sum(fin), start, printstr)
            time.sleep(0.1)
            if len(fin) == sum(fin):
                self.data._print("")
                break

        # collect link job results; a fastq record spans four lines
        for sname in rasyncs:
            res = rasyncs[sname].get() / 4
            self.data.samples[sname].stats.reads_raw = res
            self.data.samples[sname].stats_dfs.s1["reads_raw"] = res
            self.data.samples[sname].state = 1

        # print if data were linked
        if createdinc:
            # double for paired data
            if 'pair' in self.data.params.datatype:
                createdinc = createdinc * 2
            if self.data._cli:
                self.data._print("{} fastq files loaded to {} Samples.".format(
                    createdinc,
                    len(self.data.samples),
                ))

        # save step-1 stats. We don't want to write this to the fastq dir, b/c
        # it is not necessarily inside our project dir. Instead, we'll write
        # this file into our project dir in the case of linked_fastqs.
        self.data.stats_dfs.s1 = self.data._build_stat("s1")
        self.data.stats_files.s1 = os.path.join(
            self.data.params.project_dir,
            self.data.name + '_s1_demultiplex_stats.txt')
        with open(self.data.stats_files.s1, 'w') as outfile:
            # builtin int replaces the np.int alias, which NumPy 1.24
            # removed; the resulting dtype is the same.
            (self.data.stats_dfs.s1.fillna(value=0).astype(
                int).to_string(outfile))
Beispiel #4
0
def make_stats(data, raws):
    """Read pickled demultiplexing stats, collate them, and write to file.

    Sums the counts found in every ``*.pickle`` file under
    ``data.dirs.fastqs``, writes the report ``s1_demultiplex_stats.txt``
    into that directory (path saved to ``data.statsfiles.s1``), and then
    creates one Sample per entry of ``data.barcodes``. Samples without any
    reads are reported and not added to ``data.samples``.

    Parameters
    ----------
    data : Assembly-like object
        Source of ``dirs.fastqs``, ``paramsdict``, ``barcodes``; receives
        ``statsfiles.s1`` and new entries in ``samples``.
    raws : iterable of tuples
        Raw data file tuples. The basename of element 0, minus its last
        extension, is the key used for the per-file counts.
    """
    stat_keys = ["ftotal", "fcutfound", "fmatched"]

    ## stats for each rawdata file
    perfile = {}
    for rawtuple in raws:
        handle = os.path.splitext(os.path.basename(rawtuple[0]))[0]
        perfile[handle] = {key: 0 for key in stat_keys}

    ## stats for each sample
    fdbars = {}
    fsamplehits = Counter()
    fbarhits = Counter()
    fmisses = Counter()

    ## get stats from each file pickle
    pickles = glob.glob(os.path.join(data.dirs.fastqs, "*.pickle"))
    for picfile in pickles:
        ## NOTE(review): unpickling runs arbitrary code from the file, so
        ## these pickles must come from a trusted writer.
        with open(picfile, "rb") as pickin:
            filestats, samplestats = pickle.load(pickin)

        ## filestats = [handle, total, cutfound, matched]
        handle, total, cutfound, matched = filestats
        samplehits, barhits, misses, dbars = samplestats

        ## update file stats
        perfile[handle]["ftotal"] += total
        perfile[handle]["fcutfound"] += cutfound
        perfile[handle]["fmatched"] += matched

        ## update sample stats
        fsamplehits.update(samplehits)
        fbarhits.update(barhits)
        fmisses.update(misses)
        fdbars.update(dbars)

    data.statsfiles.s1 = os.path.join(data.dirs.fastqs,
                                      's1_demultiplex_stats.txt')
    fourcol = '{:<35}  {:>13}{:>13}{:>13}\n'
    twocol = '{:<35}  {:>13}\n'

    ## names alphabetical
    names_sorted = sorted(data.barcodes)

    ## write inside a context manager so the handle is always closed
    with open(data.statsfiles.s1, 'w') as outfile:

        ## how many from each rawfile
        outfile.write(fourcol.format(
            "raw_file", "total_reads", "cut_found", "bar_matched"))
        for rawstat in sorted(perfile):
            dat = [str(perfile[rawstat][key]) for key in stat_keys]
            outfile.write(fourcol.format(rawstat, *dat))
            ## paired data: repeat the counts under the R2 file name
            if "pair" in data.paramsdict["datatype"]:
                rawstat2 = rawstat.replace("_R1_", "_R2_")
                outfile.write(fourcol.format(rawstat2, *dat))

        ## spacer, how many records for each sample
        outfile.write('\n' + twocol.format("sample_name", "total_R1_reads"))
        for name in names_sorted:
            outfile.write(twocol.format(name, fsamplehits[name]))

        ## spacer, which barcodes were found
        outfile.write('\n' + fourcol.format(
            "sample_name", "true_bar", "obs_bar", "N_records"))

        ## write sample results
        for name in names_sorted:
            ## write perfect hit
            hit = data.barcodes[name]
            outfile.write(fourcol.format(name, hit, hit, fsamplehits[name]))

            ## write off-n hits in descending order of hit count
            if name in fdbars:
                offkeys = list(fdbars.get(name))
                offkeys.sort(key=fbarhits.get)
                for offhit in reversed(offkeys):
                    ## exclude perfect hit
                    if offhit not in data.barcodes.values():
                        outfile.write(fourcol.format(
                            name, hit, offhit, fbarhits[offhit]))

        ## write misses in descending order of count
        misskeys = sorted(fmisses, key=fmisses.get)
        for key in reversed(misskeys):
            outfile.write(fourcol.format("no_match", "_", key, fmisses[key]))

    ## Link Sample with this data file to the Assembly object
    for name in data.barcodes:
        sample = Sample()
        sample.name = name
        sample.barcode = data.barcodes[name]
        if "pair" in data.paramsdict["datatype"]:
            sample.files.fastqs = [(
                os.path.join(data.dirs.fastqs, name + "_R1_.fastq.gz"),
                os.path.join(data.dirs.fastqs, name + "_R2_.fastq.gz"),
            )]
        else:
            sample.files.fastqs = [(
                os.path.join(data.dirs.fastqs, name + "_R1_.fastq.gz"),
            )]
        sample.stats["reads_raw"] = fsamplehits[name]
        ## a Sample with zero reads is reported and left out
        if sample.stats["reads_raw"]:
            sample.stats.state = 1
            data.samples[sample.name] = sample
        else:
            print("Excluded sample: no data found for", name)
Beispiel #5
0
    def link_fastqs(self, path=None, merged=False, force=False, append=False):
        """ Create Sample objects for samples in sorted_fastq_path.

        Note
        ----
        link_fastqs() is called automatically during step2() if no Samples
        are yet present in the Assembly object (data were not demultiplexed
        in step1().) It looks for demultiplexed data files located in the
        [sorted_fastq_path].


        Parameters
        ----------
        path : str
            Path to the fastq files to be linked to Sample objects. The default
            location is to select all files in the 'sorted_fastq_path'.
            Alternatively a different path can be entered here. If the path
            is a directory, every file inside it is considered.

        merged : bool
            Set to True if files represent first and second reads that were
            merged using some external software such as `PEAR` or `VSEARCH`.

        force : bool
            Replace Samples that already exist with newly created ones.
            Cannot be combined with append.

        append : bool
            The default action is to overwrite fastq files linked to Samples if
            they already have linked files. Use append=True to instead append
            additional fastq files to a Sample (file names should be formatted
            the same as usual, e.g., [name]_R1_[optional].fastq.gz).

        Returns
        -------
        None
            Prints the number of new Sample objects created and the number of
            fastq files linked to Sample objects in the Assembly object.

        Raises
        ------
        Exception
            If force and append are both set, if Samples already exist and
            neither is set, or if the datatype is paired and no file name
            contains '_R1_'.
        """
        ## cannot both force and append at once
        if force and append:
            raise Exception("Cannot use force and append at the same time.")

        if self.samples and not (force or append):
            raise Exception("Files already linked to `{}`. ".format(self.name)\
                +"Use force=True to replace all files, or append=True to "
                +"add additional files to existing Samples.")

        ## get path to data files
        if not path:
            path = self.paramsdict["sorted_fastq_path"]

        ## a directory means "all files inside it". Joining (rather than
        ## appending "*") also works when the path has no trailing separator.
        if os.path.isdir(path):
            path = os.path.join(path, "*")

        ## grab fastqs/fq/gzip/all, sorted alphabetically
        fastqs = sorted(
            i for i in glob.glob(path)
            if i.endswith((".gz", ".fastq", ".fq")))

        ## link pairs into tuples
        if 'pair' in self.paramsdict["datatype"]:
            ## R2 names are derived from the R1 names.
            ## NOTE(review): the derived R2 files are not checked to exist.
            r1_files = [i for i in fastqs if "_R1_" in i]
            r2_files = [i.replace("_R1_", "_R2_") for i in r1_files]

            if not r1_files:
                raise Exception("File name format error: paired file names " \
                +"must be identical except for _R1_ and _R2_ in their names.")
            fastqs = list(zip(r1_files, r2_files))

        ## data are not paired, create one-element tuples
        else:
            if any("_R2_" in i for i in fastqs):
                print("Given the presence of '_R2_' in file names, this "\
              +"is a warning that if your data are paired-end you should set "\
              +"the Assembly object datatype to a paired type (e.g., "\
              +"pairddrad or pairgbs) prior to running link_fastqs().")
            fastqs = [(i, ) for i in fastqs]

        ## counters for the printed output
        created = 0
        linked = 0
        appended = 0
        for fastqtuple in fastqs:
            assert isinstance(fastqtuple, tuple), "fastqs not a tuple."
            ## local counters
            createdinc = 0
            linkedinc = 0
            appendinc = 0
            ## remove file extension from name
            sname = _name_from_file(fastqtuple[0])

            ## a brand-new name, or an existing one being forcibly replaced,
            ## gets a fresh Sample holding only this file tuple
            if sname not in self.samples or (force and not append):
                self.samples[sname] = Sample(sname)
                self.samples[sname].stats.state = 1
                self.samples[sname].barcode = None
                self.samples[sname].files.fastqs.append(fastqtuple)
                createdinc += 1
                linkedinc += 1
            elif append:
                if fastqtuple not in self.samples[sname].files.fastqs:
                    self.samples[sname].files.fastqs.append(fastqtuple)
                    appendinc += 1
                else:
                    print("The files {} are already in Sample {}, "\
                          .format(fastqtuple, sname) \
                          +"cannot append duplicate files to a Sample.\n")
            else:
                ## reached when two input files map to the same Sample name
                print("The files {} are already in Sample.".format(sname) \
                + " Use append=True to append additional files to a Sample"\
                + " or force=True to replace all existing Samples.")

            ## record whether data were merged.
            if merged:
                self.samples[sname].merged = 1

            ## warn when merged is not set but the file name says 'forward'
            if (not merged) and ('forward' in fastqtuple[0]):
                print(\
                "If R1 and R2 data are merged (e.g., with PEAR) " \
              + "use link_fastqs(merged=True) to indicate this. You " \
              + "may need force=True to overwrite existing files.\n")

            ## if fastqs already demultiplexed, try to link stats
            if any([linkedinc, createdinc, appendinc]):
                nreads = 0
                ## recount across all files of the Sample; compression is
                ## detected per file since appended files may differ
                for alltuples in self.samples[sname].files.fastqs:
                    nreads += bufcount(
                        alltuples[0], alltuples[0].endswith(".gz"))
                ## a fastq record spans four lines
                self.samples[sname].stats.reads_raw = nreads / 4
                created += createdinc
                linked += linkedinc
                appended += appendinc

        ## print if data were linked
        print("{} new Samples created in `{}`.".format(created, self.name))
        if linked:
            print("{} fastq files linked to {} new Samples.".\
                  format(linked, len(self.samples)))
        if appended:
            print("{} fastq files appended to {} existing Samples.".\
                  format(appended, len(self.samples)))
Beispiel #6
0
def make_stats(data, perfile, fsamplehits, fbarhits, fmisses, fdbars):
    """
    Write stats and stores to Assembly object.

    Writes ``s1_demultiplex_stats.txt`` into ``data.dirs.fastqs`` with the
    counts per raw file, reads per sample, observed barcodes per sample and
    unmatched barcodes. Then creates one Sample per entry of
    ``data.barcodes``, keeps those with a non-zero read count in
    ``data.samples``, and sets ``data.stats_dfs.s1`` and
    ``data.stats_files.s1``.

    Parameters
    ----------
    data : Assembly-like object
    perfile : dict
        Maps raw file name to a sequence indexed as
        (total, cutfound, matched).
    fsamplehits : mapping of sample name to read count
    fbarhits : mapping of observed barcode to count
    fmisses : mapping of unmatched barcode to count
    fdbars : mapping of sample name to its observed barcodes
    """

    ## out file
    outhandle = os.path.join(data.dirs.fastqs, 's1_demultiplex_stats.txt')

    ## row templates: file and miss rows have no gap between the numeric
    ## columns, the barcode rows separate them with one space.
    tightrow = '{:<35}  {:>13}{:>13}{:>13}\n'
    spacedrow = '{:<35}  {:>13} {:>13} {:>13}\n'

    ## names alphabetical. Written to file here; Samples are made below.
    names_sorted = sorted(data.barcodes)

    ## the context manager closes the handle even if a write fails
    with open(outhandle, 'w') as outfile:

        ## write the header for file stats
        outfile.write(tightrow.format(
            "raw_file", "total_reads", "cut_found", "bar_matched"))

        ## write the file stats
        for fname in sorted(perfile):
            dat = perfile[fname]
            outfile.write(tightrow.format(fname, dat[0], dat[1], dat[2]))
            ## repeat for pairfile
            if 'pair' in data.paramsdict["datatype"]:
                outfile.write(tightrow.format(
                    fname.replace("_R1_", "_R2_"), dat[0], dat[1], dat[2]))

        ## spacer, how many records for each sample
        outfile.write('\n{:<35}  {:>13}\n'.format(
            "sample_name", "total_reads"))
        for name in names_sorted:
            outfile.write("{:<35}  {:>13}\n".format(name, fsamplehits[name]))

        ## spacer, which barcodes were found
        outfile.write('\n' + spacedrow.format(
            "sample_name", "true_bar", "obs_bar", "N_records"))

        ## write sample results
        for name in names_sorted:
            hit = data.barcodes[name]
            offrows = []
            sumoffhits = 0
            ## collect off-n hits, most frequent first
            if name in fdbars:
                offkeys = sorted(fdbars.get(name), key=fbarhits.get)
                for offhit in reversed(offkeys):
                    ## exclude perfect hit
                    if offhit not in data.barcodes.values():
                        offrows.append(spacedrow.format(
                            name, hit, offhit, fbarhits[offhit]))
                        sumoffhits += fbarhits[offhit]
            ## the perfect-hit count is the sample total minus off-n hits
            outfile.write(spacedrow.format(
                name, hit, hit, fsamplehits[name] - sumoffhits))
            outfile.write("".join(offrows))

        ## write misses, most frequent first
        for key in reversed(sorted(fmisses, key=fmisses.get)):
            outfile.write(tightrow.format(
                "no_match", "_", key, fmisses[key]))

    ## Link Sample with this data file to the Assembly object
    for name in data.barcodes:
        sample = Sample()
        sample.name = name
        sample.barcode = data.barcodes[name]
        r1path = os.path.join(data.dirs.fastqs, name + "_R1_.fastq.gz")
        if 'pair' in data.paramsdict["datatype"]:
            r2path = os.path.join(data.dirs.fastqs, name + "_R2_.fastq.gz")
            sample.files.fastqs = [(r1path, r2path)]
        else:
            ## single-end data gets an empty R2 slot
            sample.files.fastqs = [(r1path, "")]
        ## fill in the summary stats
        sample.stats["reads_raw"] = int(fsamplehits[name])
        ## fill in the full df stats value
        sample.stats_dfs.s1["reads_raw"] = int(fsamplehits[name])

        ## Only link Sample if it has data
        if sample.stats["reads_raw"]:
            sample.stats.state = 1
            data.samples[sample.name] = sample
        else:
            print("Excluded sample: no data found for", name)

    ## initiate s1 key for data object
    data.stats_dfs.s1 = data._build_stat("s1")
    data.stats_files.s1 = outhandle
Beispiel #7
0
def make_stats(data, perfile, fsamplehits, fbarhits, fmisses, fdbars):
    """
    Write stats and stores to Assembly object.

    Writes ``s1_demultiplex_stats.txt`` into ``data.dirs.fastqs`` with the
    counts per raw file, reads per sample, observed barcodes per sample and
    unmatched barcodes. Then creates one Sample per entry of
    ``data.barcodes``, keeps those with a non-zero read count in
    ``data.samples``, and sets ``data.stats_dfs.s1`` and
    ``data.stats_files.s1``.

    Parameters
    ----------
    data : Assembly-like object
    perfile : dict
        Maps raw file name to a dict with the keys "ftotal", "fcutfound"
        and "fmatched".
    fsamplehits : mapping of sample name to read count
    fbarhits : mapping of observed barcode to count
    fmisses : mapping of unmatched barcode to count
    fdbars : mapping of sample name to its observed barcodes
    """

    ## out file
    outhandle = os.path.join(data.dirs.fastqs, 's1_demultiplex_stats.txt')
    row = '{:<35}  {:>13}{:>13}{:>13}\n'

    ## names alphabetical. Written to file here; Samples are made below.
    names_sorted = sorted(data.barcodes)

    ## the context manager closes the handle even if a write fails
    with open(outhandle, 'w') as outfile:

        ## how many from each rawfile
        outfile.write(row.format(
            "raw_file", "total_reads", "cut_found", "bar_matched"))

        ## sorted rawfile names
        for rawstat in sorted(perfile):
            dat = [
                str(perfile[rawstat][i])
                for i in ["ftotal", "fcutfound", "fmatched"]
            ]
            outfile.write(row.format(rawstat, *dat))
            ## the R2 file is listed with the same counts as its R1 mate
            if "pair" in data.paramsdict["datatype"]:
                outfile.write(row.format(
                    rawstat.replace("_R1_", "_R2_"), *dat))

        ## spacer, how many records for each sample
        outfile.write('\n{:<35}  {:>13}\n'.format(
            "sample_name", "total_reads"))
        for name in names_sorted:
            outfile.write("{:<35}  {:>13}\n".format(name, fsamplehits[name]))

        ## spacer, which barcodes were found
        outfile.write('\n' + row.format(
            "sample_name", "true_bar", "obs_bar", "N_records"))

        ## write sample results
        for name in names_sorted:
            ## write perfect hit
            hit = data.barcodes[name]
            outfile.write(row.format(name, hit, hit, fsamplehits[name]))

            ## write off-n hits, most frequent first
            if name in fdbars:
                offkeys = sorted(fdbars.get(name), key=fbarhits.get)
                for offhit in reversed(offkeys):
                    ## exclude perfect hit
                    if offhit not in data.barcodes.values():
                        outfile.write(row.format(
                            name, hit, offhit, fbarhits[offhit]))

        ## write misses, most frequent first
        for key in reversed(sorted(fmisses, key=fmisses.get)):
            outfile.write(row.format("no_match", "_", key, fmisses[key]))

    ## Link Sample with this data file to the Assembly object
    for name in data.barcodes:
        sample = Sample()
        sample.name = name
        sample.barcode = data.barcodes[name]
        r1path = os.path.join(data.dirs.fastqs, name + "_R1_.fastq.gz")
        if "pair" in data.paramsdict["datatype"]:
            r2path = os.path.join(data.dirs.fastqs, name + "_R2_.fastq.gz")
            sample.files.fastqs = [(r1path, r2path)]
        else:
            ## single-end data gets an empty R2 slot
            sample.files.fastqs = [(r1path, "")]
        ## fill in the summary stats
        sample.stats["reads_raw"] = int(fsamplehits[name])
        ## fill in the full df stats value
        sample.stats_dfs.s1["reads_raw"] = int(fsamplehits[name])

        ## Only link Sample if it has data
        if sample.stats["reads_raw"]:
            sample.stats.state = 1
            data.samples[sample.name] = sample
        else:
            print("Excluded sample: no data found for", name)

    ## initiate s1 key for data object
    data.stats_dfs.s1 = data.build_stat("s1")
    data.stats_files.s1 = outhandle