Example #1
0
 def parse_directory(self):
     """Describe a CASAVA 1.8+ generated directory structure as a dictionary.

     Not implemented yet: the first statement raises unconditionally, so
     none of the code below it is ever executed.

     :raises NotImplementedError: always
     """
     raise NotImplementedError("This method is not yet implemented")

     # NOTE(review): unreachable draft, kept for reference. ``fc_dir``,
     # ``fc_name`` and ``fc_date`` used in the final return are not assigned
     # anywhere in this method -- presumably they are meant to come from the
     # Flowcell object; confirm before removing the raise above.
     flowcell = Flowcell()
     flowcell.filename = self.samplesheet_file
     flowcell.fc_id()

     unaligned_dir = self.get_sequence_dir()
     basecall_stats_dir = self.get_basecall_stats()

     project_glob = os.path.join(unaligned_dir, "Project_*")
     # Glob pattern minus the trailing "*" is the prefix stripped from each
     # matched path to obtain the bare project name.
     project_prefix = project_glob[:-1]

     projects = []
     for project_dir in glob.glob(project_glob):
         sample_glob = os.path.join(project_dir, "Sample_*")
         sample_prefix = sample_glob[:-1]

         samples = []
         for sample_dir in glob.glob(sample_glob):
             fastq_paths = glob.glob(os.path.join(sample_dir, "*.fastq.gz"))
             sheets = glob.glob(os.path.join(sample_dir, "*.csv"))
             # Exactly one samplesheet is expected in each sample directory.
             assert len(sheets) == 1, "ERROR: Could not unambiguously locate samplesheet in %s" % sample_dir
             samples.append({
                 'sample_dir': os.path.relpath(sample_dir, project_dir),
                 'sample_name': sample_dir.replace(sample_prefix, ''),
                 'files': [os.path.basename(path) for path in fastq_paths],
                 'samplesheet': os.path.basename(sheets[0]),
             })

         projects.append({
             'project_dir': os.path.relpath(project_dir, unaligned_dir),
             'project_name': project_dir.replace(project_prefix, ''),
             'samples': samples,
         })

     return {
         'fc_dir': fc_dir,
         'fc_name': fc_name,
         'fc_date': fc_date,
         'data_dir': os.path.relpath(unaligned_dir, fc_dir),
         'basecall_stats_dir': basecall_stats_dir,
         'projects': projects,
     }
Example #2
0
    def parse_directory(self):
        """Describe a CASAVA 1.8+ generated directory structure as a dictionary.

        Not implemented yet: the first statement raises unconditionally, so
        none of the code below it is ever executed.

        :raises NotImplementedError: always
        """
        raise NotImplementedError("This method is not yet implemented")

        # NOTE(review): unreachable draft, kept for reference. ``fc_dir``,
        # ``fc_name`` and ``fc_date`` used in the final return are not
        # assigned anywhere in this method -- presumably they are meant to
        # come from the Flowcell object; confirm before removing the raise.
        flowcell = Flowcell()
        flowcell.filename = self.samplesheet_file
        flowcell.fc_id()

        unaligned_dir = self.get_sequence_dir()
        basecall_stats_dir = self.get_basecall_stats()

        project_glob = os.path.join(unaligned_dir, "Project_*")
        # Glob pattern minus the trailing "*" is the prefix stripped from
        # each matched path to obtain the bare project name.
        project_prefix = project_glob[:-1]

        projects = []
        for project_dir in glob.glob(project_glob):
            sample_glob = os.path.join(project_dir, "Sample_*")
            sample_prefix = sample_glob[:-1]

            samples = []
            for sample_dir in glob.glob(sample_glob):
                fastq_paths = glob.glob(os.path.join(sample_dir, "*.fastq.gz"))
                sheets = glob.glob(os.path.join(sample_dir, "*.csv"))
                # Exactly one samplesheet is expected per sample directory.
                assert len(sheets) == 1, "ERROR: Could not unambiguously locate samplesheet in %s" % sample_dir
                samples.append({
                    'sample_dir': os.path.relpath(sample_dir, project_dir),
                    'sample_name': sample_dir.replace(sample_prefix, ''),
                    'files': [os.path.basename(path) for path in fastq_paths],
                    'samplesheet': os.path.basename(sheets[0]),
                })

            projects.append({
                'project_dir': os.path.relpath(project_dir, unaligned_dir),
                'project_name': project_dir.replace(project_prefix, ''),
                'samples': samples,
            })

        return {
            'fc_dir': fc_dir,
            'fc_name': fc_name,
            'fc_date': fc_date,
            'data_dir': os.path.relpath(unaligned_dir, fc_dir),
            'basecall_stats_dir': basecall_stats_dir,
            'projects': projects,
        }
Example #3
0
 def _from_pre_casava_structure(self):
     """Build the project subset of a flowcell from a pre-casava layout.

     Needs the ``project`` and ``flowcell`` arguments (checked through
     ``_check_pargs``). Run information is loaded from the flowcell
     directory below the configured "archive" and "production" roots.

     :returns: result of ``subset("sample_prj", <project>)`` after
         ``collect_files`` has been called on it with the production
         flowcell directory; None when the argument check fails or when
         the loaded flowcell is falsy
     """
     if not self._check_pargs(["project", "flowcell"]):
         return None

     flowcell_id = self.pargs.flowcell
     run_info = Flowcell()
     # Candidate locations, archive first, then production.
     run_info.load([
         os.path.join(self.config.get(section, "root"), flowcell_id)
         for section in ("archive", "production")
     ])
     production_dir = os.path.join(self.config.get("production", "root"), flowcell_id)

     if not run_info:
         self.log.warn("No run information available for {}".format(flowcell_id))
         return None

     project_fc = run_info.subset("sample_prj", self.pargs.project)
     project_fc.collect_files(production_dir)
     return project_fc
Example #4
0
 def _from_pre_casava_structure(self):
     """Build the project subset of a flowcell from a pre-casava layout.

     Needs the ``project`` and ``flowcell`` arguments (checked through
     ``_check_pargs``). Run information is loaded from the flowcell
     directory below the configured "archive" and "production" roots.

     :returns: result of ``subset("sample_prj", <project>)`` after
         ``collect_files`` has been called on it with the production
         flowcell directory; None when the argument check fails or when
         the loaded flowcell is falsy
     """
     if not self._check_pargs(["project", "flowcell"]):
         return None

     flowcell_id = self.pargs.flowcell
     run_info = Flowcell()
     # Candidate locations, archive first, then production.
     run_info.load([
         os.path.join(self.config.get(section, "root"), flowcell_id)
         for section in ("archive", "production")
     ])
     production_dir = os.path.join(self.config.get("production", "root"), flowcell_id)

     if not run_info:
         self.log.warn("No run information available for {}".format(flowcell_id))
         return None

     project_fc = run_info.subset("sample_prj", self.pargs.project)
     project_fc.collect_files(production_dir)
     return project_fc
Example #5
0
 def _from_casava_structure(self):
     """Get information from casava structure.

     Walks ``<root_path>/<path_id>`` (both taken from ``self._meta``) for
     files whose name ends in ``-bcbb-config.yaml``. A Flowcell is built
     from each hit, ``subset("sample_prj", <project>)`` is taken from it,
     and ``collect_files`` is called on that subset with the directory
     containing the config file.

     :returns: list with one flowcell subset per config file found, or
         None when the ``project`` argument check fails
     """
     if not self._check_pargs(["project"]):
         return
     suffix = "-bcbb-config.yaml"

     def bcbb_yaml_filter(f):
         # Literal suffix test. The previous regex left the "." unescaped,
         # so it also accepted names such as "x-bcbb-config_yaml".
         return f.endswith(suffix)

     search_root = os.path.join(self._meta.root_path, self._meta.path_id)
     fc_list = []
     for config_file in filtered_walk(search_root, bcbb_yaml_filter):
         fc = Flowcell(config_file)
         fc_new = fc.subset("sample_prj", self.pargs.project)
         fc_new.collect_files(os.path.dirname(config_file))
         fc_list.append(fc_new)
     return fc_list
Example #6
0
 def _from_casava_structure(self):
     """Get information from casava structure.

     Walks ``<root_path>/<path_id>`` (both taken from ``self._meta``) for
     files whose name ends in ``-bcbb-config.yaml``. A Flowcell is built
     from each hit, ``subset("sample_prj", <project>)`` is taken from it,
     and ``collect_files`` is called on that subset with the directory
     containing the config file.

     :returns: list with one flowcell subset per config file found, or
         None when the ``project`` argument check fails
     """
     if not self._check_pargs(["project"]):
         return
     suffix = "-bcbb-config.yaml"

     def bcbb_yaml_filter(f):
         # Literal suffix test. The previous regex left the "." unescaped,
         # so it also accepted names such as "x-bcbb-config_yaml".
         return f.endswith(suffix)

     search_root = os.path.join(self._meta.root_path, self._meta.path_id)
     fc_list = []
     for config_file in filtered_walk(search_root, bcbb_yaml_filter):
         fc = Flowcell(config_file)
         fc_new = fc.subset("sample_prj", self.pargs.project)
         fc_new.collect_files(os.path.dirname(config_file))
         fc_list.append(fc_new)
     return fc_list
Example #7
0
def samplesheet_csv_to_yaml(fn):
    """Convert SampleSheet.csv to bcbb-config.yaml file.

    For every sample, files in the directory of *fn* matching the casava
    pattern ``<name>_<sequence>*fastq*`` are stored (sorted) under the
    "files" entry, and a running 1-based "barcode_id" is assigned. The
    flowcell name is the second "-"-separated field of the first sample's
    "flowcell_id" entry; the flowcell date is set to today's date. The
    YAML is written next to *fn* as ``<name>-bcbb-config.yaml``, where
    ``<name>`` is the "name" entry of the last sample iterated.

    :param fn: input file
    :raises ValueError: if the sample sheet yields no samples
    """
    fc = Flowcell(infile=fn)
    indir = os.path.dirname(fn)
    # Both stay None only when the loop body never runs (no samples).
    name = None
    flowcell_id = None
    for bc_id, s in enumerate(fc.samples, 1):
        sequence = fc.get_entry(s, "sequence")
        name = fc.get_entry(s, "name")
        # Currently only look for casava-based pattern
        pat = os.path.join(indir, "{}_{}*fastq*".format(name, sequence))
        seqfiles = sorted(glob.glob(pat))
        if seqfiles:
            fc.set_entry(s, "files", seqfiles)
        fc.set_entry(s, "barcode_id", bc_id)
        if bc_id == 1:
            # The flowcell name is derived from the first sample only.
            flowcell_id = fc.get_entry(s, "flowcell_id").split("-")[1]
    if flowcell_id is None:
        # Previously this surfaced as an UnboundLocalError further down.
        raise ValueError("No samples found in {}".format(fn))
    fc.fc_date = datetime.datetime.now().strftime("%y%m%d")
    fc.fc_name = flowcell_id
    outfile = os.path.join(indir, "{}-bcbb-config.yaml".format(name))
    with open(outfile, "w") as fh:
        fh.write(fc.as_yaml())
Example #8
0
def samplesheet_csv_to_yaml(fn):
    """Convert SampleSheet.csv to bcbb-config.yaml file.

    For every sample, files in the directory of *fn* matching the casava
    pattern ``<name>_<sequence>*fastq*`` are stored (sorted) under the
    "files" entry, and a running 1-based "barcode_id" is assigned. The
    flowcell name is the second "-"-separated field of the first sample's
    "flowcell_id" entry; the flowcell date is set to today's date. The
    YAML is written next to *fn* as ``<name>-bcbb-config.yaml``, where
    ``<name>`` is the "name" entry of the last sample iterated.

    :param fn: input file
    :raises ValueError: if the sample sheet yields no samples
    """
    fc = Flowcell(infile=fn)
    indir = os.path.dirname(fn)
    # Both stay None only when the loop body never runs (no samples).
    name = None
    flowcell_id = None
    for bc_id, s in enumerate(fc.samples, 1):
        sequence = fc.get_entry(s, "sequence")
        name = fc.get_entry(s, "name")
        # Currently only look for casava-based pattern
        pat = os.path.join(indir, "{}_{}*fastq*".format(name, sequence))
        seqfiles = sorted(glob.glob(pat))
        if seqfiles:
            fc.set_entry(s, "files", seqfiles)
        fc.set_entry(s, "barcode_id", bc_id)
        if bc_id == 1:
            # The flowcell name is derived from the first sample only.
            flowcell_id = fc.get_entry(s, "flowcell_id").split("-")[1]
    if flowcell_id is None:
        # Previously this surfaced as an UnboundLocalError further down.
        raise ValueError("No samples found in {}".format(fn))
    fc.fc_date = datetime.datetime.now().strftime("%y%m%d")
    fc.fc_name = flowcell_id
    outfile = os.path.join(indir, "{}-bcbb-config.yaml".format(name))
    with open(outfile, "w") as fh:
        fh.write(fc.as_yaml())