def make_stats(data, raws):
    """
    Read in pickled stats, collate them, and write the step-1 stats file.

    Every "*.pickle" file found in data.dirs.fastqs is loaded and its
    per-file and per-sample counts are summed. The totals are written to
    's1_demultiplex_stats.txt' in the same directory, and a Sample is
    linked to data.samples for every barcode name with at least one read.

    Parameters
    ----------
    data : object exposing dirs.fastqs, statsfiles, paramsdict["datatype"],
        barcodes (mapping of sample name -> barcode) and samples.
    raws : iterable of tuples whose first element is a raw file path; the
        basename without its extension is used as the per-file stats key.

    Raises
    ------
    KeyError
        If a pickle reports a file handle that was not derived from raws.
    """
    ## stats for each rawdata file
    perfile = {}
    for rawtuple in raws:
        handle = os.path.splitext(os.path.basename(rawtuple[0]))[0]
        perfile[handle] = {"ftotal": 0, "fcutfound": 0, "fmatched": 0}

    ## stats for each sample
    fdbars = {}
    fsamplehits = Counter()
    fbarhits = Counter()
    fmisses = Counter()

    ## get stats from each file pickle
    ## NOTE(review): pickle.load can execute arbitrary code; this is only
    ## safe while the fastqs directory holds pickles written by this program.
    pickles = glob.glob(os.path.join(data.dirs.fastqs, "*.pickle"))
    for picfile in pickles:
        with open(picfile, "rb") as pickin:
            filestats, samplestats = pickle.load(pickin)

        ## filestats = (handle, total, cutfound, matched)
        handle, total, cutfound, matched = filestats
        samplehits, barhits, misses, dbars = samplestats

        ## update file stats
        perfile[handle]["ftotal"] += total
        perfile[handle]["fcutfound"] += cutfound
        perfile[handle]["fmatched"] += matched

        ## update sample stats
        fsamplehits.update(samplehits)
        fbarhits.update(barhits)
        fmisses.update(misses)
        fdbars.update(dbars)

    data.statsfiles.s1 = os.path.join(data.dirs.fastqs,
                                      's1_demultiplex_stats.txt')

    ## loop invariants, computed once
    is_paired = "pair" in data.paramsdict["datatype"]
    names_sorted = sorted(data.barcodes)
    truebars = set(data.barcodes.values())
    row = '{:<35} {:>13}{:>13}{:>13}\n'

    ## the context manager closes the file even if a write fails
    with open(data.statsfiles.s1, 'w') as outfile:
        ## how many from each rawfile
        outfile.write(row.format("raw_file", "total_reads",
                                 "cut_found", "bar_matched"))

        ## sort rawfile names
        for rawstat in sorted(perfile):
            dat = [str(perfile[rawstat][i])
                   for i in ["ftotal", "fcutfound", "fmatched"]]
            outfile.write(row.format(rawstat, *dat))
            ## the R2 file of a pair is reported with the same counts
            if is_paired:
                rawstat2 = rawstat.replace("_R1_", "_R2_")
                outfile.write(row.format(rawstat2, *dat))

        ## spacer, how many records for each sample
        outfile.write('\n{:<35} {:>13}\n'.format("sample_name",
                                                 "total_R1_reads"))

        ## names alphabetical
        for name in names_sorted:
            outfile.write("{:<35} {:>13}\n".format(name, fsamplehits[name]))

        ## spacer, which barcodes were found
        outfile.write('\n' + row.format("sample_name", "true_bar",
                                        "obs_bar", "N_records"))

        ## write sample results
        for name in names_sorted:
            ## write perfect hit
            hit = data.barcodes[name]
            outfile.write(row.format(name, hit, hit, fsamplehits[name]))

            ## write off-n hits, most frequent first
            ## (reversed ascending sort keeps the original tie order)
            if name in fdbars:
                offkeys = sorted(fdbars[name], key=fbarhits.get)
                for offhit in reversed(offkeys):
                    ## exclude perfect hit
                    if offhit not in truebars:
                        outfile.write(row.format(name, hit, offhit,
                                                 fbarhits[offhit]))

        ## write misses, most frequent first
        for key in reversed(sorted(fmisses, key=fmisses.get)):
            outfile.write(row.format("no_match", "_", key, fmisses[key]))

    ## Link Sample with this data file to the Assembly object
    for name in data.barcodes:
        sample = Sample()
        sample.name = name
        sample.barcode = data.barcodes[name]
        if is_paired:
            sample.files.fastqs = [
                (os.path.join(data.dirs.fastqs, name+"_R1_.fastq.gz"),
                 os.path.join(data.dirs.fastqs, name+"_R2_.fastq.gz"))]
        else:
            sample.files.fastqs = [
                (os.path.join(data.dirs.fastqs, name+"_R1_.fastq.gz"),)]
        sample.stats["reads_raw"] = fsamplehits[name]

        ## only link Samples that received reads
        if sample.stats["reads_raw"]:
            sample.stats.state = 1
            data.samples[sample.name] = sample
        else:
            print("Excluded sample: no data found for", name)
def make_stats(data, perfile, fsamplehits, fbarhits, fmisses, fdbars):
    """
    Write stats and stores to Assembly object.

    Writes 's1_demultiplex_stats.txt' into data.dirs.fastqs, links a Sample
    to data.samples for every barcode name with at least one read, and
    records the stats file path on data.stats_files.s1.

    Parameters
    ----------
    data : object exposing dirs.fastqs, paramsdict["datatype"], barcodes
        (mapping of sample name -> barcode), samples, stats_dfs,
        stats_files and _build_stat().
    perfile : mapping of raw file name -> sequence indexed 0..2, written
        under the total_reads, cut_found and bar_matched columns.
    fsamplehits : mapping of sample name -> read count; indexed for every
        barcode name (presumably a Counter, so missing names give 0).
    fbarhits : mapping of observed barcode -> count.
    fmisses : mapping of unmatched barcode -> count.
    fdbars : mapping of sample name -> iterable of observed barcodes.
    """
    ## out file
    outhandle = os.path.join(data.dirs.fastqs, 's1_demultiplex_stats.txt')

    ## loop invariants, computed once
    is_paired = 'pair' in data.paramsdict["datatype"]
    names_sorted = sorted(data.barcodes)
    truebars = set(data.barcodes.values())
    filerow = '{:<35} {:>13}{:>13}{:>13}\n'
    barrow = '{:<35} {:>13} {:>13} {:>13}\n'

    ## the context manager closes the file even if a write fails
    with open(outhandle, 'w') as outfile:
        ## write the header for file stats
        outfile.write(filerow.format("raw_file", "total_reads",
                                     "cut_found", "bar_matched"))

        ## write the file stats
        for fname in sorted(perfile):
            dat = perfile[fname]
            outfile.write(filerow.format(fname, dat[0], dat[1], dat[2]))
            ## repeat for pairfile
            if is_paired:
                fname2 = fname.replace("_R1_", "_R2_")
                outfile.write(filerow.format(fname2, dat[0], dat[1], dat[2]))

        ## spacer, how many records for each sample
        outfile.write('\n{:<35} {:>13}\n'.format("sample_name",
                                                 "total_reads"))

        ## names alphabetical. Write to file. Will save again below to Samples.
        for name in names_sorted:
            outfile.write("{:<35} {:>13}\n".format(name, fsamplehits[name]))

        ## spacer, which barcodes were found
        outfile.write('\n' + barrow.format("sample_name", "true_bar",
                                           "obs_bar", "N_records"))

        ## write sample results
        for name in names_sorted:
            hit = data.barcodes[name]
            offlines = []
            sumoffhits = 0

            ## collect off-n hits, most frequent first
            ## (reversed ascending sort keeps the original tie order)
            if name in fdbars:
                offkeys = sorted(fdbars[name], key=fbarhits.get)
                for offhit in reversed(offkeys):
                    ## exclude perfect hit
                    if offhit not in truebars:
                        offlines.append(barrow.format(name, hit, offhit,
                                                      fbarhits[offhit]))
                        sumoffhits += fbarhits[offhit]

            ## perfect hits are the sample total minus its off-n hits
            outfile.write(barrow.format(name, hit, hit,
                                        fsamplehits[name]-sumoffhits))
            outfile.writelines(offlines)

        ## write misses, most frequent first
        ## NOTE(review): miss rows use the unspaced column layout, unlike the
        ## barcode rows above; kept as-is, confirm whether this is intended.
        for key in reversed(sorted(fmisses, key=fmisses.get)):
            outfile.write(filerow.format("no_match", "_", key, fmisses[key]))

    ## Link Sample with this data file to the Assembly object
    for name in data.barcodes:
        sample = Sample()
        sample.name = name
        sample.barcode = data.barcodes[name]
        if is_paired:
            sample.files.fastqs = [
                (os.path.join(data.dirs.fastqs, name+"_R1_.fastq.gz"),
                 os.path.join(data.dirs.fastqs, name+"_R2_.fastq.gz"))]
        else:
            sample.files.fastqs = [
                (os.path.join(data.dirs.fastqs, name+"_R1_.fastq.gz"), "")]

        ## fill in the summary stats
        sample.stats["reads_raw"] = int(fsamplehits[name])
        ## fill in the full df stats value
        sample.stats_dfs.s1["reads_raw"] = int(fsamplehits[name])

        ## Only link Sample if it has data
        if sample.stats["reads_raw"]:
            sample.stats.state = 1
            data.samples[sample.name] = sample
        else:
            print("Excluded sample: no data found for", name)

    ## initiate s1 key for data object
    data.stats_dfs.s1 = data._build_stat("s1")
    data.stats_files.s1 = outhandle
def make_stats(data, raws):
    """
    Read in pickled stats, collate them, and write the step-1 stats file.

    Every "*.pickle" file found in data.dirs.fastqs is loaded and its
    per-file and per-sample counts are summed. The totals are written to
    's1_demultiplex_stats.txt' in the same directory, and a Sample is
    linked to data.samples for every barcode name with at least one read.

    Parameters
    ----------
    data : object exposing dirs.fastqs, statsfiles, paramsdict["datatype"],
        barcodes (mapping of sample name -> barcode) and samples.
    raws : iterable of tuples whose first element is a raw file path; the
        basename without its extension is used as the per-file stats key.

    Raises
    ------
    KeyError
        If a pickle reports a file handle that was not derived from raws.
    """
    ## stats for each rawdata file
    perfile = {}
    for rawtuple in raws:
        handle = os.path.splitext(os.path.basename(rawtuple[0]))[0]
        perfile[handle] = {"ftotal": 0, "fcutfound": 0, "fmatched": 0}

    ## stats for each sample
    fdbars = {}
    fsamplehits = Counter()
    fbarhits = Counter()
    fmisses = Counter()

    ## get stats from each file pickle
    ## NOTE(review): pickle.load can execute arbitrary code; this is only
    ## safe while the fastqs directory holds pickles written by this program.
    pickles = glob.glob(os.path.join(data.dirs.fastqs, "*.pickle"))
    for picfile in pickles:
        with open(picfile, "rb") as pickin:
            filestats, samplestats = pickle.load(pickin)

        ## filestats = (handle, total, cutfound, matched)
        handle, total, cutfound, matched = filestats
        samplehits, barhits, misses, dbars = samplestats

        ## update file stats
        filecounts = perfile[handle]
        filecounts["ftotal"] += total
        filecounts["fcutfound"] += cutfound
        filecounts["fmatched"] += matched

        ## update sample stats
        fsamplehits.update(samplehits)
        fbarhits.update(barhits)
        fmisses.update(misses)
        fdbars.update(dbars)

    data.statsfiles.s1 = os.path.join(data.dirs.fastqs,
                                      's1_demultiplex_stats.txt')

    ## loop invariants, computed once
    is_paired = "pair" in data.paramsdict["datatype"]
    names_sorted = sorted(data.barcodes)
    truebars = set(data.barcodes.values())
    row = '{:<35} {:>13}{:>13}{:>13}\n'

    ## the context manager closes the file even if a write fails
    with open(data.statsfiles.s1, 'w') as outfile:
        ## how many from each rawfile
        outfile.write(row.format("raw_file", "total_reads",
                                 "cut_found", "bar_matched"))

        ## sort rawfile names
        for rawstat in sorted(perfile):
            dat = [str(perfile[rawstat][i])
                   for i in ["ftotal", "fcutfound", "fmatched"]]
            outfile.write(row.format(rawstat, *dat))
            ## the R2 file of a pair is reported with the same counts
            if is_paired:
                rawstat2 = rawstat.replace("_R1_", "_R2_")
                outfile.write(row.format(rawstat2, *dat))

        ## spacer, how many records for each sample
        outfile.write('\n{:<35} {:>13}\n'.format("sample_name",
                                                 "total_R1_reads"))

        ## names alphabetical
        for name in names_sorted:
            outfile.write("{:<35} {:>13}\n".format(name, fsamplehits[name]))

        ## spacer, which barcodes were found
        outfile.write('\n' + row.format("sample_name", "true_bar",
                                        "obs_bar", "N_records"))

        ## write sample results
        for name in names_sorted:
            ## write perfect hit
            hit = data.barcodes[name]
            outfile.write(row.format(name, hit, hit, fsamplehits[name]))

            ## write off-n hits, most frequent first
            ## (reversed ascending sort keeps the original tie order)
            if name in fdbars:
                offkeys = sorted(fdbars[name], key=fbarhits.get)
                for offhit in reversed(offkeys):
                    ## exclude perfect hit
                    if offhit not in truebars:
                        outfile.write(row.format(name, hit, offhit,
                                                 fbarhits[offhit]))

        ## write misses, most frequent first
        for key in reversed(sorted(fmisses, key=fmisses.get)):
            outfile.write(row.format("no_match", "_", key, fmisses[key]))

    ## Link Sample with this data file to the Assembly object
    for name in data.barcodes:
        sample = Sample()
        sample.name = name
        sample.barcode = data.barcodes[name]
        if is_paired:
            sample.files.fastqs = [
                (os.path.join(data.dirs.fastqs, name + "_R1_.fastq.gz"),
                 os.path.join(data.dirs.fastqs, name + "_R2_.fastq.gz"))]
        else:
            sample.files.fastqs = [
                (os.path.join(data.dirs.fastqs, name + "_R1_.fastq.gz"), )]
        sample.stats["reads_raw"] = fsamplehits[name]

        ## only link Samples that received reads
        if sample.stats["reads_raw"]:
            sample.stats.state = 1
            data.samples[sample.name] = sample
        else:
            print("Excluded sample: no data found for", name)
def make_stats(data, perfile, fsamplehits, fbarhits, fmisses, fdbars):
    """
    Write stats and stores to Assembly object.

    Writes 's1_demultiplex_stats.txt' into data.dirs.fastqs, links a Sample
    to data.samples for every barcode name with at least one read, and
    records the stats file path on data.stats_files.s1.

    Parameters
    ----------
    data : object exposing dirs.fastqs, paramsdict["datatype"], barcodes
        (mapping of sample name -> barcode), samples, stats_dfs,
        stats_files and build_stat().
    perfile : mapping of raw file name -> mapping with the keys "ftotal",
        "fcutfound" and "fmatched".
    fsamplehits : mapping of sample name -> read count; indexed for every
        barcode name (presumably a Counter, so missing names give 0).
    fbarhits : mapping of observed barcode -> count.
    fmisses : mapping of unmatched barcode -> count.
    fdbars : mapping of sample name -> iterable of observed barcodes.
    """
    ## out file
    outhandle = os.path.join(data.dirs.fastqs, 's1_demultiplex_stats.txt')

    ## loop invariants, computed once
    is_paired = "pair" in data.paramsdict["datatype"]
    names_sorted = sorted(data.barcodes)
    truebars = set(data.barcodes.values())
    row = '{:<35} {:>13}{:>13}{:>13}\n'

    ## the context manager closes the file even if a write fails
    with open(outhandle, 'w') as outfile:
        ## how many from each rawfile
        outfile.write(row.format("raw_file", "total_reads",
                                 "cut_found", "bar_matched"))

        ## sort rawfile names
        for rawstat in sorted(perfile):
            dat = [str(perfile[rawstat][i])
                   for i in ["ftotal", "fcutfound", "fmatched"]]
            outfile.write(row.format(rawstat, *dat))
            ## the R2 file of a pair is reported with the same counts
            if is_paired:
                rawstat2 = rawstat.replace("_R1_", "_R2_")
                outfile.write(row.format(rawstat2, *dat))

        ## spacer, how many records for each sample
        outfile.write('\n{:<35} {:>13}\n'.format("sample_name",
                                                 "total_reads"))

        ## names alphabetical. Write to file. Will save again below to Samples.
        for name in names_sorted:
            outfile.write("{:<35} {:>13}\n".format(name, fsamplehits[name]))

        ## spacer, which barcodes were found
        outfile.write('\n' + row.format("sample_name", "true_bar",
                                        "obs_bar", "N_records"))

        ## write sample results
        for name in names_sorted:
            ## write perfect hit
            hit = data.barcodes[name]
            outfile.write(row.format(name, hit, hit, fsamplehits[name]))

            ## write off-n hits, most frequent first
            ## (reversed ascending sort keeps the original tie order)
            if name in fdbars:
                offkeys = sorted(fdbars[name], key=fbarhits.get)
                for offhit in reversed(offkeys):
                    ## exclude perfect hit
                    if offhit not in truebars:
                        outfile.write(row.format(name, hit, offhit,
                                                 fbarhits[offhit]))

        ## write misses, most frequent first
        for key in reversed(sorted(fmisses, key=fmisses.get)):
            outfile.write(row.format("no_match", "_", key, fmisses[key]))

    ## Link Sample with this data file to the Assembly object
    for name in data.barcodes:
        sample = Sample()
        sample.name = name
        sample.barcode = data.barcodes[name]
        if is_paired:
            sample.files.fastqs = [
                (os.path.join(data.dirs.fastqs, name + "_R1_.fastq.gz"),
                 os.path.join(data.dirs.fastqs, name + "_R2_.fastq.gz"))]
        else:
            sample.files.fastqs = [
                (os.path.join(data.dirs.fastqs, name + "_R1_.fastq.gz"), "")]

        ## fill in the summary stats
        sample.stats["reads_raw"] = int(fsamplehits[name])
        ## fill in the full df stats value
        sample.stats_dfs.s1["reads_raw"] = int(fsamplehits[name])

        ## Only link Sample if it has data
        if sample.stats["reads_raw"]:
            sample.stats.state = 1
            data.samples[sample.name] = sample
        else:
            print("Excluded sample: no data found for", name)

    ## initiate s1 key for data object
    data.stats_dfs.s1 = data.build_stat("s1")
    data.stats_files.s1 = outhandle