def demultiplex_fastq(outdir, samplesheet, fastq1, fastq2=None):
    """Demultiplex a bcl-converted illumina fastq file.

    Assumes the index sequence is present in the read header, a la CASAVA 1.8+.

    For every Lane/Index combination listed in the samplesheet, one output
    file per read is created in `outdir` under a temporary "tmp_" name. Each
    record of the input file(s) is written to the output matching the lane
    and index parsed from its header; records that match no samplesheet entry
    are dropped. Finally, outputs that received no records are deleted and
    the remaining ones are renamed without the "tmp_" prefix.

    Args:
        outdir: directory in which the output files are created.
        samplesheet: samplesheet path, handed to HiSeqRun.parse_samplesheet.
            The 'Lane', 'Index' and 'SampleID' fields of each entry are used.
        fastq1: first-read input, handed to FastQParser.
        fastq2: optional second-read input, handed to FastQParser.

    Returns:
        A dict keyed by lane, holding dicts keyed by index, holding the list
        of output file names (read 1 first, then read 2 if `fastq2` was
        given). Indexes for which no records were written are not included.
    """
    outfiles = {}
    counts = {}
    sdata = HiSeqRun.parse_samplesheet(samplesheet)
    reads = [1]
    if fastq2 is not None:
        reads.append(2)

    # For each Lane-Index combination, create a file and open a filehandle
    for sd in sdata:
        lane = sd['Lane']
        index = sd['Index']
        if lane not in outfiles:
            outfiles[lane] = {}
            counts[lane] = {}
        outfiles[lane][index] = []
        counts[lane][index] = 0
        for read in reads:
            fname = "tmp_{}_{}_L00{}_R{}_001.fastq.gz".format(sd['SampleID'], index, lane, read)
            outfiles[lane][index].append(FastQWriter(os.path.join(outdir, fname)))

    # Parse the input file(s) and write the records to the appropriate output files
    fhs = [FastQParser(fastq1)]
    if fastq2 is not None:
        fhs.append(FastQParser(fastq2))
    for r, fh in enumerate(fhs):
        for record in fh:
            header = parse_header(record[0])
            # The lane is compared as a string against the samplesheet value
            lane = str(header['lane'])
            index = header['index']
            if lane in outfiles and index in outfiles[lane]:
                outfiles[lane][index][r].write(record)
                counts[lane][index] += 1

    # Close filehandles and replace the handles with the file names
    for lane in outfiles:
        # Iterate over a snapshot of the indexes since empty ones are deleted below
        for index in list(outfiles[lane]):
            # Close every handle for this index (both reads) before touching the files.
            # NOTE(review): fh.name() is assumed to return the path given to FastQWriter
            fnames = []
            for fh in outfiles[lane][index]:
                fh.close()
                fnames.append(fh.name())

            # If no sequences were written, remove all temporary files for
            # this index and drop the entry from the results
            if counts[lane][index] == 0:
                for fname in fnames:
                    os.unlink(fname)
                del outfiles[lane][index]
                continue

            # Rename the temporary files to persistent names. Only the prefix
            # of the file name is stripped, so a "tmp_" occurring elsewhere in
            # the path (e.g. in outdir) is left untouched.
            renamed = []
            for fname in fnames:
                dirname, basename = os.path.split(fname)
                if basename.startswith("tmp_"):
                    basename = basename[len("tmp_"):]
                nname = os.path.join(dirname, basename)
                os.rename(fname, nname)
                renamed.append(nname)
            outfiles[lane][index] = renamed

    return outfiles
def test_parse_samplesheet(self):
    """Write and parse a csv-file.

    Checks that parsing a non-existing samplesheet raises IOError, that a
    samplesheet written with HiSeqRun.write_samplesheet contains the expected
    header and rows, that HiSeqRun.parse_samplesheet returns the same data
    again, and that filtering on a lane returns only the entries of that lane.
    """
    # Assert non-existing file raises exception
    with self.assertRaises(IOError):
        HiSeqRun.parse_samplesheet(os.path.join(self.rootdir, 'non-existing-samplesheet'))

    # Write a csv file with some bogus values
    sdata = td.generate_samplesheet_data()
    samplesheet = os.path.join(self.rootdir, 'SampleSheet.csv')
    HiSeqRun.write_samplesheet(sdata, samplesheet)

    # Assert that the written data corresponds to the generated data.
    # The builtin next() is used instead of the Python 2-only fh.next() method.
    with open(samplesheet) as fh:
        # Assert that header is correct
        self.assertListEqual(HiSeqRun._samplesheet_header(),
                             next(fh).strip().split(","),
                             "Written header does not match expected header")
        for entry in sdata:
            # Assert that all rows have the correct values in the correct columns
            self.assertListEqual([str(e) for e in entry],
                                 next(fh).strip().split(","),
                                 "Written data row does not match entry in generated samplesheet")
        # Assert that all rows from samplesheet has been consumed
        with self.assertRaises(StopIteration):
            next(fh)

    # Assert that the parsed data matches the generated data
    header = HiSeqRun._samplesheet_header()
    data = HiSeqRun.parse_samplesheet(samplesheet)
    self.assertEqual(len(sdata), len(data),
                     "Number of parsed entries does not match number of generated entries")
    # Lengths were asserted equal above, so pairing the rows covers all of them
    for entry, d in zip(sdata, data):
        self.assertListEqual([str(e) for e in entry],
                             [d[col] for col in header],
                             "Parsed data row does not match entry in generated samplesheet")

    # Assert that filtering on lane returns expected output
    lanes = list(set(d["Lane"] for d in data))
    obs_lane_data = HiSeqRun.parse_samplesheet(samplesheet, lane=lanes[-1])
    exp_lane_data = [d for d in data if str(d["Lane"]) == str(lanes[-1])]

    # Rows cannot be ordered directly on Python 3, so sort both lists on
    # their column values to compare them independently of row order
    def row_key(row):
        return [row[col] for col in header]

    self.assertListEqual(sorted(obs_lane_data, key=row_key),
                         sorted(exp_lane_data, key=row_key),
                         "Parsed data row does not match entry in generated samplesheet")
def demultiplex_fastq(outdir, samplesheet, fastq1, fastq2=None):
    """Demultiplex a bcl-converted illumina fastq file.

    Assumes the index sequence is present in the read header, a la CASAVA 1.8+.

    For every Lane/Index combination listed in the samplesheet, one output
    file per read is created in `outdir` under a temporary "tmp_" name. Each
    record of the input file(s) is written to the output matching the lane
    and index parsed from its header; records that match no samplesheet entry
    are dropped. Finally, outputs that received no records are deleted and
    the remaining ones are renamed without the "tmp_" prefix.

    Args:
        outdir: directory in which the output files are created.
        samplesheet: samplesheet path, handed to HiSeqRun.parse_samplesheet.
            The 'Lane', 'Index' and 'SampleID' fields of each entry are used.
        fastq1: first-read input, handed to FastQParser.
        fastq2: optional second-read input, handed to FastQParser.

    Returns:
        A dict keyed by lane, holding dicts keyed by index, holding the list
        of output file names (read 1 first, then read 2 if `fastq2` was
        given). Indexes for which no records were written are not included.
    """
    outfiles = {}
    counts = {}
    sdata = HiSeqRun.parse_samplesheet(samplesheet)
    reads = [1]
    if fastq2 is not None:
        reads.append(2)

    # For each Lane-Index combination, create a file and open a filehandle
    for sd in sdata:
        lane = sd['Lane']
        index = sd['Index']
        if lane not in outfiles:
            outfiles[lane] = {}
            counts[lane] = {}
        outfiles[lane][index] = []
        counts[lane][index] = 0
        for read in reads:
            fname = "tmp_{}_{}_L00{}_R{}_001.fastq.gz".format(
                sd['SampleID'], index, lane, read)
            outfiles[lane][index].append(
                FastQWriter(os.path.join(outdir, fname)))

    # Parse the input file(s) and write the records to the appropriate output files
    fhs = [FastQParser(fastq1)]
    if fastq2 is not None:
        fhs.append(FastQParser(fastq2))
    for r, fh in enumerate(fhs):
        for record in fh:
            header = parse_header(record[0])
            # The lane is compared as a string against the samplesheet value
            lane = str(header['lane'])
            index = header['index']
            if lane in outfiles and index in outfiles[lane]:
                outfiles[lane][index][r].write(record)
                counts[lane][index] += 1

    # Close filehandles and replace the handles with the file names
    for lane in outfiles:
        # Iterate over a snapshot of the indexes since empty ones are deleted below
        for index in list(outfiles[lane]):
            # Close every handle for this index (both reads) before touching the files.
            # NOTE(review): fh.name() is assumed to return the path given to FastQWriter
            fnames = []
            for fh in outfiles[lane][index]:
                fh.close()
                fnames.append(fh.name())

            # If no sequences were written, remove all temporary files for
            # this index and drop the entry from the results
            if counts[lane][index] == 0:
                for fname in fnames:
                    os.unlink(fname)
                del outfiles[lane][index]
                continue

            # Rename the temporary files to persistent names. Only the prefix
            # of the file name is stripped, so a "tmp_" occurring elsewhere in
            # the path (e.g. in outdir) is left untouched.
            renamed = []
            for fname in fnames:
                dirname, basename = os.path.split(fname)
                if basename.startswith("tmp_"):
                    basename = basename[len("tmp_"):]
                nname = os.path.join(dirname, basename)
                os.rename(fname, nname)
                renamed.append(nname)
            outfiles[lane][index] = renamed

    return outfiles
def get_expected(csv_file, lane):
    """Return the barcodes expected in a lane according to a csv samplesheet.

    The samplesheet is parsed with HiSeqRun.parse_samplesheet, filtered on
    `lane`, and the "Index" value of each resulting entry is collected in
    the order the entries are returned.
    """
    barcodes = []
    for entry in HiSeqRun.parse_samplesheet(csv_file, lane=lane):
        barcodes.append(entry["Index"])
    return barcodes
def get_expected(csv_file, lane):
    """Return the barcodes expected in a lane according to a csv samplesheet.

    The samplesheet is parsed with HiSeqRun.parse_samplesheet, filtered on
    `lane`, and the "Index" value of each resulting entry is collected in
    the order the entries are returned.
    """
    barcodes = []
    for entry in HiSeqRun.parse_samplesheet(csv_file, lane=lane):
        barcodes.append(entry["Index"])
    return barcodes