예제 #1
0
파일: SolidData.py 프로젝트: gwmei/genomics
    def __init__(self,name,parent_sample=None):
        """Set up a SolidLibrary with the given name.

        Arguments:
          name: library name, e.g. AS_07 (stored as a string)
          parent_sample: (optional) the SolidSample object that this
            library belongs to
        """
        library_name = str(name)
        self.name = library_name
        # Components derived from the library name via bcf_utils
        self.initials = bcf_utils.extract_initials(library_name)
        self.prefix = bcf_utils.extract_prefix(library_name)
        self.index_as_string = bcf_utils.extract_index_as_string(library_name)
        self.index = bcf_utils.extract_index(library_name)
        # Sample that owns this library (None if not supplied)
        self.parent_sample = parent_sample
        # A new library starts out flagged as not barcoded
        self.is_barcoded = False
        # No canonical csfasta/qual files (or their F5 counterparts)
        # have been associated yet
        self.csfasta = self.qual = None
        self.csfasta_f5 = self.qual_f5 = None
        # Holds references to all the primary data
        self.primary_data = []
예제 #2
0
def get_casava_sample_sheet(samplesheet=None,fp=None,FCID_default='FC1'):
    """Load data into a 'standard' CASAVA sample sheet CSV file

    Reads the data from an Illumina platform sample sheet CSV file and
    populates and returns a CasavaSampleSheet object which can be
    used to generate a SampleSheet suitable for bcl-to-fastq
    conversion.

    The source sample sheet may be in the format output by the
    Experimental Manager software (needed when running BaseSpace) or
    may already be in "standard" format for bcl-to-fastq format.

    For Experimental Manager format, the sample sheet consists of
    sections delimited by headers of the form "[Header]", "[Reads]" etc.
    The information about the sample names and barcodes are in the
    "[Data]" section, which is essentially a list of CSV format lines
    with the following fields:

    MiSEQ:

    Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,
    Sample_Project,Description

    HiSEQ:

    Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,
    index,Sample_Project,Description

    (Note that for dual-indexed runs the fields are e.g.:

    Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,
    I5_Index_ID,index2,Sample_Project,Description

    i.e. there are an additional pair of fields describing the second
    index)

    The conversion maps a subset of these onto fields in the Casava
    format:

    Sample_ID -> SampleID
    index -> Index
    Sample_Project -> SampleProject
    Description -> Description

    If no lane information is present in the original file then this
    is set to 1. The FCID is set to the value of FCID_default.

    For dual-indexed samples, the Index field is generated by putting
    together the index and index2 fields (joined with a hyphen).

    If Sample_Project is empty then SampleProject is set from the
    initials extracted from the Sample_ID.

    All other fields are left empty.

    Arguments:
      samplesheet: name of the source sample sheet file (only used if
        'fp' is not supplied)
      fp: (optional) file-like object opened for reading, used as the
        source of the sample sheet data instead of opening
        'samplesheet'; it is not closed by this function
      FCID_default: name to use for flow cell ID if not present in
        the source file (optional)

    Returns:
      A populated CasavaSampleSheet object.

    Raises:
      Exception: if the format of the sample sheet isn't recognised,
        or if it starts with "[Header]" but has no "[Data]" section.

    """
    # Open the file for reading (if necessary)
    if fp is not None:
        # Use file object already provided
        sample_sheet_fp = fp
    else:
        # Open file ('U' requests universal newline handling)
        sample_sheet_fp = open(samplesheet,'rU')
    try:
        # Read the first line of the sample sheet file to see if we
        # can identify the format
        line = sample_sheet_fp.readline()
        if line.startswith('[Header]'):
            # "Experimental Manager"-style format with [...] delimited
            # sections
            experiment_manager_format = True
            # Skip through until we reach a [Data] section
            while not line.startswith('[Data]'):
                line = sample_sheet_fp.readline()
                if not line:
                    # readline() returns an empty string at end of file;
                    # without this check the loop would never terminate
                    raise Exception("SampleSheet has no [Data] section")
            # Feed the rest of the file to a TabFile
            data = TabFile.TabFile(fp=sample_sheet_fp,delimiter=',',
                                   first_line_is_header=True)
        elif line.count(',') > 0:
            # Looks like a comma-delimited header
            experiment_manager_format = False
            # Feed the rest of the file to a TabFile
            data = TabFile.TabFile(fp=sample_sheet_fp,delimiter=',',
                                   column_names=line.split(','))
        else:
            # Don't know what to do with this
            raise Exception("SampleSheet format not recognised")
    finally:
        # Close file if we opened it, even when an error occurred
        if fp is None:
            sample_sheet_fp.close()
    # Clean up data: remove double quotes from fields
    for line in data:
        for col in data.header():
            line[col] = str(line[col]).strip('"')
    # Try to make sense of what we've got
    if experiment_manager_format:
        # Build new sample sheet with standard format
        sample_sheet = CasavaSampleSheet()
        for line in data:
            sample_sheet_line = sample_sheet.append()
            # Set the lane
            try:
                lane = line['Lane']
            except KeyError:
                # No lane column (e.g. MiSEQ)
                lane = 1
            # Set the index tag (if any)
            try:
                index_tag = "%s-%s" % (line['index'].strip(),
                                       line['index2'].strip())
            except KeyError:
                # Assume not dual-indexed (no index2)
                try:
                    index_tag = line['index'].strip()
                except KeyError:
                    # No index
                    index_tag = ''
            sample_sheet_line['FCID'] = FCID_default
            sample_sheet_line['Lane'] = lane
            sample_sheet_line['Index'] = index_tag
            sample_sheet_line['SampleID'] = line['Sample_ID']
            sample_sheet_line['Description'] = line['Description']
            # Deal with project name
            if line['Sample_Project'] == '':
                # No project name - try to use initials from sample name
                sample_sheet_line['SampleProject'] = \
                   bcf_utils.extract_initials(line['Sample_ID'])
            else:
                sample_sheet_line['SampleProject'] = line['Sample_Project']
    else:
        # Assume standard format, convert directly to CasavaSampleSheet
        sample_sheet = CasavaSampleSheet()
        for line in data:
            # Skip comment lines and blank lines
            if str(line[0]).startswith('#') or str(line).strip() == '':
                continue
            sample_sheet.append(tabdata=str(line))
    # Finished
    return sample_sheet
예제 #3
0
def convert_miseq_samplesheet_to_casava(samplesheet=None,fp=None):
    """Convert a Miseq sample sheet file to CASAVA format

    Reads the data in a Miseq-format sample sheet file and returns a
    CasavaSampleSheet object with the equivalent data.

    The MiSeq sample sheet consists of various sections delimited by
    headers of the form "[Header]", "[Reads]" etc. The information
    about the sample names and barcodes are in the "[Data]" section,
    which is essentially a list of CSV format lines with the following
    fields:

    Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,
    Sample_Project,Description

    The conversion maps a subset of these onto fields in the Casava
    format:

    Sample_ID -> SampleID
    index -> Index
    Sample_Project -> SampleProject
    Description -> Description

    If the "[Data]" section has an 'index2' column then the Index
    field is set to the index and index2 values joined by a hyphen.

    If Sample_Project is empty then SampleProject is set from the
    initials extracted from the sample ID.

    Lane is always set to 1 and the FCID is set to an arbitrary value.
    All other fields are left empty.

    Arguments:
      samplesheet: name of the Miseq sample sheet file (only used if
        'fp' is not supplied)
      fp: (optional) file-like object opened for reading, used as the
        source of the sample sheet data instead of opening
        'samplesheet'; it is not closed by this function

    Returns:
      A populated CasavaSampleSheet object.

    Raises:
      Exception: if the sample sheet has no "[Data]" section.
    """
    # Read MiSEQ data into a TabFile
    if fp is not None:
        # Use file object already provided
        miseq_fp = fp
    else:
        # Open file ('U' requests universal newline handling)
        miseq_fp = open(samplesheet,'rU')
    miseq_sample_sheet = None
    try:
        # Skip through the header until we get to the [Data] section
        for line in miseq_fp:
            if line.startswith('[Data]'):
                # Feed the rest of the file to a TabFile
                miseq_sample_sheet = TabFile.TabFile(
                    fp=miseq_fp,delimiter=',',first_line_is_header=True)
                break
    finally:
        # Close file if we opened it, even when an error occurred
        if fp is None:
            miseq_fp.close()
    if miseq_sample_sheet is None:
        # Fail with a clear message rather than an unbound variable
        # error further down
        raise Exception("No [Data] section found in MiSeq sample sheet")
    # Presence of a second index column indicates dual-indexed data
    dual_indexed = 'index2' in miseq_sample_sheet.header()
    # Create an empty CASAVA-style sample sheet
    casava_sample_sheet = CasavaSampleSheet()
    # Reformat each line of the Miseq samplesheet into CASAVA format
    for line in miseq_sample_sheet:
        casava_line = casava_sample_sheet.append()
        casava_line['FCID'] = '660DMAAXX'
        casava_line['Lane'] = 1
        casava_line['SampleID'] = line['Sample_ID']
        casava_line['Description'] = line['Description']
        # Deal with index sequences
        if dual_indexed:
            casava_line['Index'] = "%s-%s" % (line['index'],line['index2'])
        else:
            casava_line['Index'] = line['index']
        # Deal with project name: test the value from the source line
        # (not the newly appended output line, which hasn't had its
        # project set yet)
        if line['Sample_Project'] == '':
            # No project name - try to use initials from sample name
            casava_line['SampleProject'] = \
                bcf_utils.extract_initials(casava_line['SampleID'])
        else:
            casava_line['SampleProject'] = line['Sample_Project']
    return casava_sample_sheet