Example #1
0
def demultiplex_fastq(outdir, samplesheet, fastq1, fastq2=None):
    """Demultiplex a bcl-converted illumina fastq file. Assumes it has the index sequence
    in the header a la CASAVA 1.8+
    """
    outfiles = {}
    counts = {}
    sdata = HiSeqRun.parse_samplesheet(samplesheet)
    reads = [1]
    if fastq2 is not None:
        reads.append(2)
        
    # For each Lane-Index combination, create a file and open a filehandle
    for sd in sdata:
        lane = sd['Lane']
        index = sd['Index']
        if lane not in outfiles:
            outfiles[lane] = {}
            counts[lane] = {}
        outfiles[lane][index] = []
        counts[lane][index] = 0
        for read in reads:
            fname = "tmp_{}_{}_L00{}_R{}_001.fastq.gz".format(sd['SampleID'],
                                                              index,
                                                              lane,
                                                              read)
            outfiles[lane][index].append(FastQWriter(os.path.join(outdir,fname)))
    
    # Parse the input file(s) and write the records to the appropriate output files
    fhs = [FastQParser(fastq1)]
    if fastq2 is not None:
        fhs.append(FastQParser(fastq2))
        
    for r, fh in enumerate(fhs):
        for record in fh:
            header = parse_header(record[0])
            lane = str(header['lane'])
            index = header['index']
            if lane in outfiles and index in outfiles[lane]:
                outfiles[lane][index][r].write(record)
                counts[lane][index] += 1
    
    # Close filehandles and replace the handles with the file names
    for lane in outfiles.keys():
        for index in outfiles[lane].keys():
            # Close all filehandles for this lane-index combination
            for fh in outfiles[lane][index]:
                fh.close()
            # If no sequences were written, remove the temporary files and the entry from the results
            if counts[lane][index] == 0:
                for fh in outfiles[lane][index]:
                    os.unlink(fh.name())
                del outfiles[lane][index]
                continue
            # Rename the temporary files to persistent names
            for r, fh in enumerate(outfiles[lane][index]):
                fname = fh.name()
                nname = fname.replace("tmp_","")
                os.rename(fname,nname)
                outfiles[lane][index][r] = nname
    
    return outfiles
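
The routing above depends on parse_header pulling the lane and index out of a CASAVA 1.8+ style read header. As a point of reference, here is a minimal sketch of that parsing, assuming the standard CASAVA 1.8+ header layout; the helper name, the example header and the returned keys are illustrative assumptions, not the actual parse_header implementation.

def _parse_casava_header_sketch(header_line):
    """Illustrative only: extract lane and index from a CASAVA 1.8+ header.

    A CASAVA 1.8+ header looks like
    @EAS139:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG
    where the fourth colon-separated field before the space is the lane and
    the last field after the space is the index sequence.
    """
    coords, description = header_line.lstrip("@").split(" ", 1)
    return {'lane': coords.split(":")[3],
            'index': description.split(":")[3]}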
Example #2
0
def demultiplex_fastq(outdir, samplesheet, fastq1, fastq2=None):
    """Demultiplex a bcl-converted illumina fastq file. Assumes it has the index sequence
    in the header a la CASAVA 1.8+
    """
    outfiles = {}
    counts = {}
    sdata = HiSeqRun.parse_samplesheet(samplesheet)
    reads = [1]
    if fastq2 is not None:
        reads.append(2)

    # For each Lane-Index combination, create a file and open a filehandle
    for sd in sdata:
        lane = sd['Lane']
        index = sd['Index']
        if lane not in outfiles:
            outfiles[lane] = {}
            counts[lane] = {}
        outfiles[lane][index] = []
        counts[lane][index] = 0
        for read in reads:
            fname = "tmp_{}_{}_L00{}_R{}_001.fastq.gz".format(
                sd['SampleID'], index, lane, read)
            outfiles[lane][index].append(
                FastQWriter(os.path.join(outdir, fname)))

    # Parse the input file(s) and write the records to the appropriate output files
    fhs = [FastQParser(fastq1)]
    if fastq2 is not None:
        fhs.append(FastQParser(fastq2))

    for r, fh in enumerate(fhs):
        for record in fh:
            header = parse_header(record[0])
            lane = str(header['lane'])
            index = header['index']
            if lane in outfiles and index in outfiles[lane]:
                outfiles[lane][index][r].write(record)
                counts[lane][index] += 1

    # Close filehandles and replace the handles with the file names
    for lane in outfiles.keys():
        for index in outfiles[lane].keys():
            # Close all filehandles for this lane-index combination
            for fh in outfiles[lane][index]:
                fh.close()
            # If no sequences were written, remove the temporary files and the entry from the results
            if counts[lane][index] == 0:
                for fh in outfiles[lane][index]:
                    os.unlink(fh.name())
                del outfiles[lane][index]
                continue
            # Rename the temporary files to persistent names
            for r, fh in enumerate(outfiles[lane][index]):
                fname = fh.name()
                nname = fname.replace("tmp_", "")
                os.rename(fname, nname)
                outfiles[lane][index][r] = nname

    return outfiles
Example #3
0
 def test_get_project_names(self):
     """Get the projects from a samplesheet
     """
     # Assert that an empty file returns an empty list
     fh, ssheet = tempfile.mkstemp(dir=self.rootdir, suffix=".csv")
     os.close(fh)
     self.assertListEqual([],HiSeqRun.get_project_names(ssheet),
                          "The list of projects for an empty file is not empty")
     
     # Generate artificial samplesheet data
     data = td.generate_samplesheet_data()
     projects = {}
     for d in data:
         projects[d[-1]] = 1
     
     # Write the data to a samplesheet
     td._write_samplesheet(data,ssheet)
      
     # Assert that the list of projects returned matches the list we generated
     self.assertListEqual(sorted(projects.keys()),sorted(HiSeqRun.get_project_names(ssheet)),
                          "The list of projects does not match the original list")
Example #4
0
 def test_get_project_sample_ids(self):
     """Test that getting the project samples from a samplesheet behaves as expected
     """
     
     # Generate artificial samplesheet data
     data = td.generate_samplesheet_data()
     fh, ssheet = tempfile.mkstemp(dir=self.rootdir, suffix=".csv")
     os.close(fh)
     td._write_samplesheet(data,ssheet)
      
     # Assert that getting samples for a non-existing project returns an empty list
     self.assertListEqual([],HiSeqRun.get_project_sample_ids(ssheet,td.generate_project()),
                          "Getting samples for a non-existing project returned unexpected output")
     
     # Iterate over the projects and assert that the returned samples are correct
     samples = {}
     for row in data:
         if row[9] not in samples:
             samples[row[9]] = []
         samples[row[9]].append(row[2])
     
     for proj, sample in samples.items():
         self.assertListEqual(sorted(sample),sorted(HiSeqRun.get_project_sample_ids(ssheet,proj)),
                              "The returned list of samples did not match the original")
Example #5
0
 def test_parse_samplesheet(self):
     """Write and parse a csv-file
     """
     
     # Assert non-existing file raises exception
     with self.assertRaises(IOError):
         HiSeqRun.parse_samplesheet(os.path.join(self.rootdir,'non-existing-samplesheet'))
         
     # Write a csv file with some bogus values
     sdata = td.generate_samplesheet_data()
     samplesheet = os.path.join(self.rootdir,'SampleSheet.csv')
     HiSeqRun.write_samplesheet(sdata,samplesheet)
     
     # Assert that the written data corresponds to the generated data
     with open(samplesheet) as fh:
         # Assert that header is correct
         self.assertListEqual(HiSeqRun._samplesheet_header(),
                              fh.next().strip().split(","),
                              "Written header does not match expected header")
         for entry in sdata:
             # Assert that all rows have the correct values in the correct columns
             self.assertListEqual([str(e) for e in entry],
                                  fh.next().strip().split(","),
                                  "Written data row does not match entry in generated samplesheet")
         
         # Assert that all rows from the samplesheet have been consumed
         with self.assertRaises(StopIteration):
             fh.next()
     
     # Assert that the parsed data matches the generated data
     data = HiSeqRun.parse_samplesheet(samplesheet)
     self.assertEqual(len(sdata),
                      len(data),
                      "Number of parsed entries does not match number of generated entries")
     for d in data:
         self.assertListEqual([str(e) for e in sdata.pop(0)],
                              [d[col] for col in HiSeqRun._samplesheet_header()],
                              "Parsed data row does not match entry in generated samplesheet")
         
     # Assert that filtering on lane returns expected output
     lanes = list(set([d["Lane"] for d in data]))
     obs_lane_data = HiSeqRun.parse_samplesheet(samplesheet,lane=lanes[-1])
     exp_lane_data = [d for d in data if str(d["Lane"]) == str(lanes[-1])]
     self.assertListEqual(sorted(obs_lane_data),
                          sorted(exp_lane_data),
                          "Parsed data row does not match entry in generated samplesheet")
Example #6
0
def get_expected(csv_file, lane):
    """Extract the expected barcodes in a lane from a supplied csv samplesheet
    """
    rows = HiSeqRun.parse_samplesheet(csv_file, lane=lane)
    return [r["Index"] for r in rows]
Example #7
0
def status_query(archive_dir, analysis_dir, flowcell, project, brief):
    """Get a status report of the progress of flowcells based on a snapshot of the file system
    """
    
    last_step = 14
    status = []
    # Process each flowcell in the archive directory
    for fcdir in IlluminaRun.get_flowcell(archive_dir,flowcell):
        fc_status = {}
        fc_status['flowcell'] = os.path.basename(fcdir)
        
        # Locate the samplesheet
        samplesheet = IlluminaRun.get_samplesheet(fcdir)
        if samplesheet is None:
            print("{}***ERROR***: Could not locate samplesheet in flowcell directory. Skipping..")
            continue
        fc_status['samplesheet'] = samplesheet

        # Get a list of the projects in the samplesheet
        projects = HiSeqRun.get_project_names(samplesheet)
        if len(projects) == 0:
            print("\t***WARNING***: No projects matched your filter [{}] for flowcell. Skipping..".format(project))
            continue
        
        fc_status['projects'] = []
        
        # Iterate over the projects in the flowcell
        for proj in projects:
            proj = proj.replace("__",".")
            proj_status = {}
            proj_status['project'] = proj
            
            pdir = bcbio.get_project_analysis_dir(analysis_dir, proj)
            if not pdir:
                continue
            
            proj_status['project_dir'] = pdir
            proj_status['samples'] = []
            proj_status['no_finished_samples'] = 0
            samples = HiSeqRun.get_project_sample_ids(samplesheet, proj)
            for smpl in samples:
                smpl = smpl.replace("__",".")
                sample_status = {}
                proj_status['samples'].append(sample_status)
                sample_status['sample_id'] = smpl
                sdir = bcbio.get_sample_analysis_dir(pdir, smpl)
                if not sdir:
                    continue
                sample_status['sample_dir'] = sdir
                
                # Match the flowcell we're processing to the sample flowcell directories
                sample_fc = [d for d in IlluminaRun.get_flowcell(sdir) if d.split("_")[-1] == fcdir.split("_")[-1]]
                if len(sample_fc) == 0:
                    continue
                sample_fc = sample_fc[0]
                sample_status['sample_fc_dir'] = sample_fc
                
                fastq_screen = bcbio.get_fastq_screen_folder(sample_fc)
                if fastq_screen:
                    sample_status['fastq_screen'] = [fastq_screen,bcbio.fastq_screen_finished(fastq_screen)]
                
                now = datetime.datetime.now()
                pipeline_start_indicator = bcbio.get_pipeline_indicator(sample_fc,[1])
                if len(pipeline_start_indicator) == 0:
                    continue
                pipeline_start_indicator = pipeline_start_indicator[0]
                
                most_recent, _ = bcbio.get_most_recent_indicator([pipeline_start_indicator])
                sample_status['pipeline_started'] = [pipeline_start_indicator,most_recent]
                
                most_recent, ifile = bcbio.get_most_recent_indicator(bcbio.get_pipeline_indicator(sample_fc))
                sample_status['pipeline_progress'] = [ifile,most_recent]
                
                sample_log = bcbio.get_sample_pipeline_log(sample_fc,smpl)
                if not sample_log:
                    continue
                st = os.stat(sample_log)
                sample_status['pipeline_log'] = [sample_log,datetime.datetime.fromtimestamp(st.st_mtime)]
                
                jobids = slurm.get_slurm_jobid(smpl)
                sample_status['slurm_job'] = []
                for jobid in jobids:
                    sample_status['slurm_job'].append([jobid,slurm.get_slurm_jobstatus(jobid)])
                
                most_recent, ifile = bcbio.get_most_recent_indicator(bcbio.get_pipeline_indicator(sample_fc,[last_step]))
                if ifile is not None and sample_status.get('fastq_screen',[None,False])[1]:
                    sample_status['finished'] = True
                    proj_status['no_finished_samples'] += 1
                
            
            if proj_status['no_finished_samples'] == len(samples):
                proj_status['finished'] = True
                
            fc_status['projects'].append(proj_status)
            
        status.append(fc_status) 
    print_status(status,brief)
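
For orientation, the nested structure handed to print_status can be reconstructed from the keys assigned above. The sketch below shows that assumed shape with placeholder values; the field names come from the code, everything else is illustrative.

status_sketch = [
    {'flowcell': '<flowcell dir name>',
     'samplesheet': '<path to SampleSheet.csv>',
     'projects': [
         {'project': '<project name>',
          'project_dir': '<project analysis dir>',
          'no_finished_samples': 1,
          'finished': True,  # present only when every sample in the project finished
          'samples': [
              {'sample_id': '<sample id>',
               'sample_dir': '<sample analysis dir>',
               'sample_fc_dir': '<sample flowcell dir>',
               'fastq_screen': ['<fastq_screen dir>', True],
               'pipeline_started': ['<indicator file>', '<timestamp>'],
               'pipeline_progress': ['<indicator file>', '<timestamp>'],
               'pipeline_log': ['<log file>', '<mtime>'],
               'slurm_job': [['<slurm job id>', '<job status>']],
               'finished': True}]}]}]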
Example #8
0
def get_expected(csv_file, lane):
    """Extract the expected barcodes in a lane from a supplied csv samplesheet
    """
    rows = HiSeqRun.parse_samplesheet(csv_file,lane=lane)
    return [r["Index"] for r in rows]
Example #9
0
 def setUp(self): 
     self.rootdir = tempfile.mkdtemp(prefix="test_illumina_hiseq_")
     self.hiseq = HiSeqRun(self.rootdir)