def __init__(self,name,parent_sample=None):
    """Set up a SolidLibrary from its name.

    Arguments:
      name: name of the library (e.g. AS_07); stored as str(name)
      parent_sample: (optional) parent SolidSample object

    """
    # Normalise the name to a string once and reuse it below
    library_name = str(name)
    self.name = library_name
    # Pieces of information derived from the name
    self.initials = bcf_utils.extract_initials(library_name)
    self.prefix = bcf_utils.extract_prefix(library_name)
    self.index_as_string = bcf_utils.extract_index_as_string(library_name)
    self.index = bcf_utils.extract_index(library_name)
    # Barcoding flag starts out unset
    self.is_barcoded = False
    # Canonical data files: nothing associated yet
    self.csfasta = None
    self.qual = None
    self.csfasta_f5 = None
    self.qual_f5 = None
    # Collection of references to all primary data
    self.primary_data = []
    # Link back to the parent sample (may be None)
    self.parent_sample = parent_sample
def get_casava_sample_sheet(samplesheet=None,fp=None,FCID_default='FC1'):
    """Load data into a 'standard' CASAVA sample sheet CSV file

    Reads the data from an Illumina platform sample sheet CSV file and
    populates and returns a CasavaSampleSheet object which can be
    used to generate a SampleSheet suitable for bcl-to-fastq
    conversion.

    The source sample sheet may be in the format output by the
    Experimental Manager software (needed when running BaseSpace) or
    may already be in "standard" format for bcl-to-fastq conversion.

    For Experimental Manager format, the sample sheet consists of
    sections delimited by headers of the form "[Header]", "[Reads]"
    etc. The information about the sample names and barcodes are in
    the "[Data]" section, which is essentially a list of CSV format
    lines with the following fields:

    MiSEQ:

    Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,
    Sample_Project,Description

    HiSEQ:

    Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,
    index,Sample_Project,Description

    (Note that for dual-indexed runs the fields are e.g.:

    Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,
    I5_Index_ID,index2,Sample_Project,Description

    i.e. there are an additional pair of fields describing the second
    index)

    The conversion maps a subset of these onto fields in the Casava
    format:

    Sample_ID -> SampleID
    index -> Index
    Sample_Project -> SampleProject
    Description -> Description

    If no lane information is present in the original file then this
    is set to 1. The FCID is set to the value of FCID_default.

    For dual-indexed samples, the Index field is generated by putting
    together the index and index2 fields, separated by a hyphen.

    If Sample_Project is empty then the project name is derived from
    the sample ID using bcf_utils.extract_initials.

    All other fields are left empty.

    Arguments:
      samplesheet: name of the sample sheet file; only used when 'fp'
        is not supplied
      fp: (optional) file-like object already opened for reading; if
        supplied then it is read instead of 'samplesheet' and is not
        closed by this function
      FCID_default: (optional) flow cell ID assigned to each line
        converted from an Experimental Manager format sample sheet

    Returns:
      A populated CasavaSampleSheet object.

    Raises:
      Exception: if the format of the sample sheet is not recognised,
        or if an Experimental Manager format sheet has no "[Data]"
        section.

    """
    # Open the file for reading (if necessary)
    if fp is not None:
        # Use file object already provided
        sample_sheet_fp = fp
    else:
        # Open file ('rU' requests universal newline handling)
        sample_sheet_fp = open(samplesheet,'rU')
    try:
        # Read the first line of the sample sheet file to see if we
        # can identify the format
        line = sample_sheet_fp.readline()
        if line.startswith('[Header]'):
            # "Experimental Manager"-style format with [...] delimited
            # sections
            experiment_manager_format = True
            # Skip through until we reach a [Data] section
            while not line.startswith('[Data]'):
                line = sample_sheet_fp.readline()
                if line == '':
                    # readline() returns an empty string at end of
                    # file: stop here rather than looping forever
                    raise Exception("SampleSheet has no [Data] section")
            # Feed the rest of the file to a TabFile
            data = TabFile.TabFile(fp=sample_sheet_fp,delimiter=',',
                                   first_line_is_header=True)
        elif line.count(',') > 0:
            # Looks like a comma-delimited header
            experiment_manager_format = False
            # Feed the rest of the file to a TabFile
            data = TabFile.TabFile(fp=sample_sheet_fp,delimiter=',',
                                   column_names=line.split(','))
        else:
            # Don't know what to do with this
            raise Exception("SampleSheet format not recognised")
    finally:
        # Close file, if we opened it (also on the error paths above)
        if fp is None:
            sample_sheet_fp.close()
    # Clean up data: remove double quotes from fields
    columns = data.header()
    for line in data:
        for col in columns:
            line[col] = str(line[col]).strip('"')
    # Build the CASAVA sample sheet from what we've got
    sample_sheet = CasavaSampleSheet()
    if experiment_manager_format:
        # Build new sample sheet with standard format
        for line in data:
            sample_sheet_line = sample_sheet.append()
            # Set the lane
            try:
                lane = line['Lane']
            except KeyError:
                # No lane column (e.g. MiSEQ)
                lane = 1
            # Set the index tag (if any)
            try:
                index_tag = "%s-%s" % (line['index'].strip(),
                                       line['index2'].strip())
            except KeyError:
                # Assume not dual-indexed (no index2)
                try:
                    index_tag = line['index'].strip()
                except KeyError:
                    # No index
                    index_tag = ''
            sample_sheet_line['FCID'] = FCID_default
            sample_sheet_line['Lane'] = lane
            sample_sheet_line['Index'] = index_tag
            sample_sheet_line['SampleID'] = line['Sample_ID']
            sample_sheet_line['Description'] = line['Description']
            # Deal with project name
            if line['Sample_Project'] == '':
                # No project name - try to use initials from sample name
                sample_sheet_line['SampleProject'] = \
                    bcf_utils.extract_initials(line['Sample_ID'])
            else:
                sample_sheet_line['SampleProject'] = line['Sample_Project']
    else:
        # Assume standard format, convert directly to CasavaSampleSheet
        for line in data:
            # Skip comment lines and blank lines
            if str(line[0]).startswith('#') or str(line).strip() == '':
                continue
            sample_sheet.append(tabdata=str(line))
    # Finished
    return sample_sheet
def convert_miseq_samplesheet_to_casava(samplesheet=None,fp=None):
    """Convert a Miseq sample sheet file to CASAVA format

    Reads the data in a Miseq-format sample sheet file and returns a
    CasavaSampleSheet object with the equivalent data.

    The MiSeq sample sheet consists of various sections delimited by
    headers of the form "[Header]", "[Reads]" etc. The information
    about the sample names and barcodes are in the "[Data]" section,
    which is essentially a list of CSV format lines with the following
    fields:

    Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,
    Sample_Project,Description

    The conversion maps a subset of these onto fields in the Casava
    format:

    Sample_ID -> SampleID
    index -> Index
    Sample_Project -> SampleProject
    Description -> Description

    If the "[Data]" section has an 'index2' column then the Index
    field is built as "<index>-<index2>".

    If Sample_Project is empty then the project name is derived from
    the sample ID using bcf_utils.extract_initials.

    Lane is always set to 1 and the FCID is set to an arbitrary
    value. All other fields are left empty.

    Arguments:
      samplesheet: name of the Miseq sample sheet file; only used
        when 'fp' is not supplied
      fp: (optional) file-like object already opened for reading; if
        supplied then it is read instead of 'samplesheet' and is not
        closed by this function

    Returns:
      A populated CasavaSampleSheet object.

    Raises:
      Exception: if the sample sheet has no "[Data]" section.

    """
    # Read MiSEQ data into a TabFile
    if fp is not None:
        # Use file object already provided
        miseq_fp = fp
    else:
        # Open file ('rU' requests universal newline handling)
        miseq_fp = open(samplesheet,'rU')
    try:
        # Skip through the header until we get to the [Data] section
        for line in miseq_fp:
            if line.startswith('[Data]'):
                # Feed the rest of the file to a TabFile
                miseq_sample_sheet = TabFile.TabFile(
                    fp=miseq_fp,delimiter=',',first_line_is_header=True)
                break
        else:
            # Reached end of file without seeing [Data]: fail with a
            # clear message instead of an unbound variable error below
            raise Exception("No [Data] section found in MiSeq sample sheet")
    finally:
        # Close file, if we opened it (also on the error paths above)
        if fp is None:
            miseq_fp.close()
    # Check for a second index column (dual-indexed data)
    paired_end = 'index2' in miseq_sample_sheet.header()
    # Create an empty CASAVA-style sample sheet
    casava_sample_sheet = CasavaSampleSheet()
    # Reformat each line of the Miseq samplesheet into CASAVA format
    for line in miseq_sample_sheet:
        casava_line = casava_sample_sheet.append()
        casava_line['FCID'] = '660DMAAXX'
        casava_line['Lane'] = 1
        casava_line['SampleID'] = line['Sample_ID']
        casava_line['Description'] = line['Description']
        # Deal with index sequences
        if not paired_end:
            casava_line['Index'] = line['index']
        else:
            casava_line['Index'] = "%s-%s" % (line['index'],line['index2'])
        # Deal with project name: the decision is based on the project
        # in the source line, not on the newly created output line
        if line['Sample_Project'] == '':
            # No project name - try to use initials from sample name
            casava_line['SampleProject'] = \
                bcf_utils.extract_initials(casava_line['SampleID'])
        else:
            casava_line['SampleProject'] = line['Sample_Project']
    return casava_sample_sheet