コード例 #1
0
def verify_sample_sheet_file_names_in_file_list(sample_sheet_file,
                                                run_data_directory_file_list):
    """
    Check that every file referenced by the sample sheet appears in the
    given directory file list.

    :param sample_sheet_file: path to the sample sheet to parse
    :param run_data_directory_file_list: file names found in the run directory
    :raises exceptions.SampleSheetError: when the forward (or, for paired-end
        samples, reverse) file name is absent from the file list
    :return: None
    """
    for sample in _parse_samples(sample_sheet_file):
        uploadable = sample.get_uploadable_dict()
        is_paired = len(uploadable['File_Reverse']) > 0

        # Collect the file names this sample claims to have, forward first
        expected_files = [uploadable['File_Forward']]
        if is_paired:
            expected_files.append(uploadable['File_Reverse'])

        for file_name in expected_files:
            if file_name not in run_data_directory_file_list:
                raise exceptions.SampleSheetError((
                    "Your sample sheet is malformed. {} Does not match any file list in sample sheet file"
                    "".format(file_name)), sample_sheet_file)
コード例 #2
0
def parse_metadata(sample_sheet_file):

    """
    Parse all lines under [Header], [Reads] and [BCLConvert_Settings] in .csv file
    Lines under [Reads] named Read1Cycles/Read2Cycles are collected under key
    "readLengths"; Index1Cycles/Index2Cycles are collected under "indexCycles"

    arguments:
            sample_sheet_file -- path to UploadList.csv

    raises SampleSheetError when no sections are found, when the [Reads]
        section is invalid, or when no index cycle entries are present

    returns a dictionary containing the parsed key:pair values from .csv file
    """

    metadata_dict = {"readLengths": [], "indexCycles": []}

    csv_reader = common.get_csv_reader(sample_sheet_file)

    section = None

    for line in csv_reader:
        # Track which section of the sheet we are currently inside
        if "[Header]" in line or "[BCLConvert_Settings]" in line:
            section = "header"
            continue
        elif "[Reads]" in line:
            section = "reads"
            continue
        elif "[BCLConvert_Data]" in line:
            # sample rows are parsed elsewhere; metadata stops here
            break
        elif line and line[0].startswith("["):
            section = "unknown"
            continue

        # skip blank / padding rows
        if not line or not line[0]:
            continue

        if not section:
            logging.debug("Sample sheet is missing important sections: no sections were found")
            raise exceptions.SampleSheetError("Sample sheet is missing important sections: no sections were found.",
                                              sample_sheet_file)
        elif section == "reads":
            if line[0] == "Read1Cycles" or line[0] == "Read2Cycles":
                metadata_dict["readLengths"].append(line[1])
            elif line[0] == "Index1Cycles" or line[0] == "Index2Cycles":
                metadata_dict["indexCycles"].append(line[1])

    # layout is derived from how many Read*Cycles entries were present
    if len(metadata_dict["readLengths"]) == 2:
        metadata_dict["layoutType"] = "PAIRED_END"
    elif len(metadata_dict["readLengths"]) == 1:
        metadata_dict["layoutType"] = "SINGLE_END"
    else:
        logging.debug("The sample sheet has invalid [Reads] sections: [Reads] section should have 2 or 4 entries.")
        raise exceptions.SampleSheetError("The sample sheet has invalid [Reads] sections: "
                                          "[Reads] section should have 2 or 4 entries.",
                                          sample_sheet_file)

    # BUG FIX: max() on an empty list raises an opaque ValueError; raise a
    # proper SampleSheetError when no Index1Cycles/Index2Cycles were found.
    if not metadata_dict["indexCycles"]:
        logging.debug("The sample sheet has invalid [Reads] sections: no index cycle entries found.")
        raise exceptions.SampleSheetError("The sample sheet has invalid [Reads] sections: "
                                          "no index cycle entries found.",
                                          sample_sheet_file)

    # currently sends just the larger readLengths
    metadata_dict["readLengths"] = max(metadata_dict["readLengths"])
    metadata_dict["indexCycles"] = max(metadata_dict["indexCycles"])

    return metadata_dict
コード例 #3
0
def build_sample_list_from_sample_sheet_with_abs_path(sample_sheet_file):
    """
    Build Sample objects from the sheet, resolving each sequence file name
    to an absolute path and attaching a SequenceFile to every sample.

    :param sample_sheet_file: path to the sample sheet
    :raises exceptions.SampleSheetError: when a relative file name on the
        sheet is not present in the sheet's directory
    :return: list of Sample objects
    """
    sample_list = _parse_samples(sample_sheet_file)
    # Relative file names on the sheet are resolved against the sheet's
    # own directory.
    data_dir = path.dirname(sample_sheet_file)
    dir_file_list = common.get_file_list(data_dir)

    def resolve(file_name):
        # Return file_name as an absolute path, raising if it cannot be found
        if path.isabs(file_name):
            return file_name
        if file_name in dir_file_list:
            return path.join(path.abspath(data_dir), file_name)
        raise exceptions.SampleSheetError((
            "Your sample sheet is malformed. {} Does not match any file in the directory {}"
            "".format(file_name, data_dir)), sample_sheet_file)

    for sample in sample_list:
        uploadable = sample.get_uploadable_dict()

        # forward read is always present; reverse only for paired-end runs
        uploadable['File_Forward'] = resolve(uploadable['File_Forward'])
        file_list = [uploadable['File_Forward']]
        if len(uploadable['File_Reverse']) > 0:
            uploadable['File_Reverse'] = resolve(uploadable['File_Reverse'])
            file_list.append(uploadable['File_Reverse'])

        # Create sequence file object and attach to sample
        sample.sequence_file = deepcopy(model.SequenceFile(file_list=file_list))

    return sample_list
コード例 #4
0
def get_csv_reader(sample_sheet_file):
    """
    tries to create a csv.reader object which will be used to
        parse through the lines in SampleSheet.csv
    raises an error if:
            sample_sheet_file is not an existing regular file

    arguments:
            sample_sheet_file -- path to the SampleSheet.csv file

    returns a csv.reader object
    """

    if not os.path.isfile(sample_sheet_file):
        raise exceptions.SampleSheetError(
            "Sample sheet cannot be parsed as a CSV file because it's not a regular file.",
            sample_sheet_file)

    # BUG FIX: the original opened the file without ever closing it, leaking
    # the handle. Read all lines up front inside a `with` block instead; the
    # csv.reader then iterates the in-memory list, not the file object.
    with open(sample_sheet_file, "r") as csv_file:
        # strip any trailing newline characters from the end of each line,
        # including Windows newline characters (\r\n)
        csv_lines = [line.rstrip('\n').rstrip('\r') for line in csv_file]

    return reader(csv_lines)
コード例 #5
0
def _parse_samples(sample_sheet_file):

    """
    Parse the rows below "[BCLConvert_Data]" in the sample sheet into
    Sample objects.

    Header names listed in sample_key_translation_dict are renamed for
    uploading to the REST API; every other header keeps the name it has
    in the .csv file.

    arguments:
            sample_sheet_file -- path to UploadList.csv

    returns a list of Sample objects built from the parsed key:pair values
    """

    logging.info("Reading data from sample sheet {}".format(sample_sheet_file))

    csv_reader = common.get_csv_reader(sample_sheet_file)

    sample_key_translation_dict = {
        'Sample_ID': 'sampleName',
        'Sample_Project': 'sample_project'
    }

    # Expose the translation table as a function attribute, as the original
    # implementation did (kept for any external consumers).
    _parse_samples.sample_key_translation_dict = sample_key_translation_dict

    # OrderedDict keeps the columns in the same order they appear on the sheet.
    sample_dict = OrderedDict()

    # Advance the reader to the row right after [BCLConvert_Data]: that row
    # holds the column headers, which become the (translated) dict keys.
    header_is_next = False
    for line in csv_reader:
        if header_is_next:
            for column in line:
                sample_dict[sample_key_translation_dict.get(column, column)] = ""
            break
        if "[BCLConvert_Data]" in line:
            header_is_next = True

    sample_list = []
    # The reader is now positioned at the first data row.
    for sample_number, line in enumerate(csv_reader):

        expected_columns = len(sample_dict.keys())
        if expected_columns != len(line):
            # A trailing comma may be trimmed (e.g. when a user edits the
            # sheet in the MiSeq software), dropping the final value --
            # assumed to be the Description. Pad a single missing trailing
            # column with an empty string; anything else is malformed.
            if expected_columns - len(line) == 1:
                line.append("")
            else:
                raise exceptions.SampleSheetError(
                    ("Your sample sheet is malformed. Expected to find {} "
                     "columns in the [Data] section, but only found {} columns "
                     "for line {}.".format(expected_columns, len(line), line)),
                    sample_sheet_file
                )

        for index, key in enumerate(sample_dict.keys()):
            sample_dict[key] = line[index].strip()  # assumes values are never empty

        row = deepcopy(sample_dict)
        name = row.pop('sampleName')

        sample_list.append(model.Sample(
            sample_name=name,
            description="",
            sample_number=sample_number + 1,
            samp_dict=row))

    return sample_list
コード例 #6
0
def parse_metadata(sample_sheet_file):
    """
    Parse every line under [Header], [Reads] and [Settings] in the .csv file.

    Values under [Reads] are accumulated in a list under key "readLengths";
    all other key names are translated according to the
        metadata_key_translation_dict

    arguments:
            sample_sheet_file -- path to SampleSheet.csv

    returns a dictionary containing the parsed key:pair values from .csv file
    """

    metadata_key_translation_dict = {
        'Assay': 'assay',
        'Description': 'description',
        'Application': 'application',
        'Investigator Name': 'investigatorName',
        'Adapter': 'adapter',
        'AdapterRead2': 'adapterread2',
        'Workflow': 'workflow',
        'ReverseComplement': 'reversecomplement',
        'IEMFileVersion': 'iemfileversion',
        'Date': 'date',
        'Experiment Name': 'experimentName',
        'Chemistry': 'chemistry',
        'Project Name': 'projectName'
    }

    metadata_dict = {"readLengths": []}
    section = None

    for line in common.get_csv_reader(sample_sheet_file):
        # Section markers switch parsing mode; [Data] ends the metadata.
        if "[Header]" in line or "[Settings]" in line:
            section = "header"
            continue
        if "[Reads]" in line:
            section = "reads"
            continue
        if "[Data]" in line:
            break
        if line and line[0].startswith("["):
            section = "unknown"
            continue

        # skip blank / padding rows
        if not line or not line[0]:
            continue

        if not section:
            logging.debug(
                "Sample sheet is missing important sections: no sections were found"
            )
            raise exceptions.SampleSheetError(
                "Sample sheet is missing important sections: no sections were found.",
                sample_sheet_file)

        if section == "header":
            raw_key = line[0]
            if raw_key in metadata_key_translation_dict:
                metadata_dict[metadata_key_translation_dict[raw_key]] = line[1]
            else:
                logging.debug("Unexpected key in header: [{}]".format(raw_key))
        elif section == "reads":
            metadata_dict["readLengths"].append(line[0])

    read_lengths = metadata_dict["readLengths"]
    if not read_lengths:
        # this is an exceptional case, you can't have no read lengths!
        logging.debug(
            "The sample sheet is missing important sections: no [Reads] section found."
        )
        raise exceptions.SampleSheetError(
            "The sample sheet is missing important sections: no [Reads] section found.",
            sample_sheet_file)

    metadata_dict["layoutType"] = "PAIRED_END" if len(read_lengths) == 2 else "SINGLE_END"
    # currently sends just the larger readLengths
    metadata_dict["readLengths"] = max(read_lengths)

    return metadata_dict
コード例 #7
0
def parse_sample_list(sample_sheet_file, run_data_directory_file_list):
    """
    Creates a list of all sample data in the sample_sheet_file
    Verifies data is valid for uploading

    :param sample_sheet_file:
    :param run_data_directory_file_list: list of all files
    :raises exceptions.SampleSheetError: when a file on the sheet is missing
        from the directory, or when paired and single end samples are mixed
    :return: list of Sample objects
    """
    sample_list = _parse_samples(sample_sheet_file)

    data_dir = path.dirname(sample_sheet_file)

    # Full-path versions of the directory listing, so sheets that use
    # absolute paths instead of bare file names can also be matched.
    data_dir_file_list_full_path = [
        path.join(path.abspath(data_dir), file_name)
        for file_name in run_data_directory_file_list
    ]

    has_paired_end_read = False
    has_single_end_read = False

    logging.info(
        "Verifying data parsed from sample sheet {}".format(sample_sheet_file))

    for sample in sample_list:

        sample_dict = sample.get_uploadable_dict()

        paired_end_read = len(sample_dict['File_Reverse']) > 0
        # keep track if we have both paired and single end reads
        if paired_end_read:
            has_paired_end_read = True
        else:
            has_single_end_read = True

        file_keys = ['File_Forward', 'File_Reverse'] if paired_end_read \
            else ['File_Forward']

        # create file list of full paths
        file_list = []
        for key in file_keys:
            # the sheet entry must match either a bare file name or a full path
            if (sample_dict[key] not in run_data_directory_file_list
                    and sample_dict[key] not in data_dir_file_list_full_path):
                raise exceptions.SampleSheetError((
                    "Your sample sheet is malformed. {} Does not match any file in the directory {}"
                    "".format(sample_dict[key], data_dir)), sample_sheet_file)

            # expand a bare file name to a full path
            if sample_dict[key] not in data_dir_file_list_full_path:
                sample_dict[key] = path.join(data_dir, sample_dict[key])

            # BUG FIX: the file must always be part of the sequence file;
            # the original only appended when the entry was NOT already a
            # full path, silently dropping pre-resolved files.
            file_list.append(sample_dict[key])

        # Create sequence file object and attach to sample
        sq = model.SequenceFile(file_list=file_list)
        sample.sequence_file = deepcopy(sq)

    # Verify we don't have both single end and paired end reads
    if has_single_end_read and has_paired_end_read:
        raise exceptions.SampleSheetError(
            ("Your sample sheet is malformed. "
             "SampleSheet cannot have both paired end and single end runs. "
             "Make sure all samples are either paired or single."),
            sample_sheet_file)

    return sample_list
コード例 #8
0
def _parse_samples(sample_sheet_file):
    """
    Parse all the lines under "[Data]" in .csv file

    arguments:
            sample_sheet_file -- path to SampleSheet.csv

    raises SampleSheetError when a row has the wrong number of columns or
        a required value (anything but File_Reverse) is empty

    returns	a list containing Sample objects that have been created by a
        dictionary from the parsed out key:pair values from .csv file
    """

    logging.info("Reading data from sample sheet {}".format(sample_sheet_file))

    csv_reader = common.get_csv_reader(sample_sheet_file)
    # start with an ordered dictionary so that keys are ordered in the same
    # way that they are inserted.
    sample_dict = OrderedDict()
    sample_list = []

    # the only [Data] headers this parser cares about
    sample_key_list = [
        'Sample_Name', 'Project_ID', 'File_Forward', 'File_Reverse'
    ]

    # initialize dictionary keys from first line (data headers/attributes)
    set_attributes = False
    for line in csv_reader:

        if set_attributes:
            for item in line:
                if item in sample_key_list:
                    sample_dict[item] = ""
            break

        if "[Data]" in line:
            set_attributes = True

    # fill in values for keys. line is currently below the [Data] headers
    for sample_number, line in enumerate(csv_reader):
        # if the line is empty (like a blank line at the end of the file) continue
        if not line:
            continue

        if len(sample_dict.keys()) != len(line):
            # If there is one more Data header than data values, add an
            # empty string to the end of the data values (File_Reverse will
            # be an empty string). This handles the case where the trailing
            # comma is trimmed when doing a single end run.
            if len(sample_dict.keys()) - len(line) == 1:
                line.append("")
            else:
                raise exceptions.SampleSheetError((
                    "Your sample sheet is malformed. Expected to find {} "
                    "columns in the [Data] section, but only found {} columns "
                    "for line {}.".format(len(sample_dict.keys()), len(line),
                                          line)), sample_sheet_file)

        for index, key in enumerate(sample_dict.keys()):
            value = line[index].strip()

            # Keys other than 'File_Reverse' cannot be empty.
            # BUG FIX: was `len(value) is 0` -- identity comparison against
            # an int literal is implementation-dependent and raises a
            # SyntaxWarning on modern Pythons; use a truthiness test.
            if not value and key != 'File_Reverse':
                raise exceptions.SampleSheetError((
                    "Your sample sheet is malformed. {} in the [Data] section cannot be empty."
                    "".format(key)), sample_sheet_file)

            sample_dict[key] = value

        # (removed a dead re-declaration of sample_key_list that was never
        # read inside this loop)

        new_sample_dict = deepcopy(sample_dict)
        new_sample_name = new_sample_dict['Sample_Name']
        new_sample_project = new_sample_dict['Project_ID']
        new_sample_dict['sample_project'] = new_sample_project
        del new_sample_dict['Sample_Name']
        del new_sample_dict['Project_ID']

        sample = model.Sample(sample_name=new_sample_name,
                              description="",
                              sample_number=sample_number + 1,
                              samp_dict=new_sample_dict)

        sample_list.append(sample)

    return sample_list
コード例 #9
0
ファイル: parser.py プロジェクト: phac-nml/irida-uploader
    def get_sequencing_run(self,
                           sample_sheet,
                           run_data_directory_file_list=None):
        """
        Does local validation on the integrity of the run directory / sample sheet

        Throws a ValidationError with a validation result attached if it cannot make a sequencing run

        :param sample_sheet: path to the sample sheet file to parse
        :param run_data_directory_file_list: Optional: List of files in the data directory to verify against the
        SampleList.csv file. This is used when deploying the parsers on a cloud environment.
        :raises exceptions.ValidationError: on any validation or parsing failure;
            the triggering errors are attached to the ValidationError's result
        :return: SequencingRun
        """

        # Try to get the sample sheet, validate that the sample sheet is valid
        validation_result = validation.validate_sample_sheet(sample_sheet)
        if not validation_result.is_valid():
            logging.error("Errors occurred while getting sample sheet")
            raise exceptions.ValidationError(
                "Errors occurred while getting sample sheet",
                validation_result)

        # When running with a premade file list, verify files on sample_sheet are in file list
        try:
            if run_data_directory_file_list is not None:
                sample_parser.verify_sample_sheet_file_names_in_file_list(
                    sample_sheet, run_data_directory_file_list)
        except (exceptions.SequenceFileError,
                exceptions.SampleSheetError) as error:
            # expected parsing failures: attach to the result and re-raise
            # wrapped, so the caller sees the full validation context
            validation_result.add_error(error)
            logging.error(
                "Errors occurred while building sequence run from sample sheet"
            )
            raise exceptions.ValidationError(
                "Errors occurred while building sequence run from sample sheet",
                validation_result)
        except Exception as error:
            # anything else is an unexpected system error, but is still
            # surfaced as a ValidationError so callers have one failure path
            validation_result.add_error(error)
            logging.error("System error while building sequencing run")
            raise exceptions.ValidationError(
                "System error while building sequencing run",
                validation_result)

        # Build a list of sample objects from sample sheet
        try:
            # with a premade file list the files were already verified above,
            # so the no-verify builder is used; otherwise resolve to abs paths
            if run_data_directory_file_list is not None:
                sample_list = sample_parser.build_sample_list_from_sample_sheet_no_verify(
                    sample_sheet)
            else:
                sample_list = sample_parser.build_sample_list_from_sample_sheet_with_abs_path(
                    sample_sheet)
        except (exceptions.DirectoryError,
                exceptions.SampleSheetError) as error:
            validation_result.add_error(error)
            logging.error("Errors occurred while parsing files")
            raise exceptions.ValidationError(
                "Errors occurred while parsing files", validation_result)
        except Exception as error:
            validation_result.add_error(error)
            logging.error("System error while parsing files")
            raise exceptions.ValidationError(
                "System error while parsing files", validation_result)

        # verify samples in sample_list are all of one type, either single or paired end
        if not sample_parser.only_single_or_paired_in_sample_list(sample_list):
            e = exceptions.SampleSheetError((
                "Your sample sheet is malformed. "
                "SampleSheet cannot have both paired end and single end runs. "
                "Make sure all samples are either paired or single."),
                                            sample_sheet)
            validation_result.add_error(e)
            logging.error(
                "Error occurred while building file list: Sample sheet has both paired and single end reads"
            )
            raise exceptions.ValidationError(
                "Errors occurred while building file list.", validation_result)

        # Try to build sequencing run from sample sheet & meta data, raise validation error if errors occur
        # NOTE(review): parse_metadata is called with sample_list here, while
        # other parse_metadata implementations take a sample sheet path --
        # presumably this parser's sample_parser.parse_metadata derives the
        # metadata from the samples themselves; confirm against that module.
        try:
            run_metadata = sample_parser.parse_metadata(sample_list)
            sequencing_run = common.build_sequencing_run_from_samples(
                sample_list, run_metadata, self.get_parser_type_name())
        except exceptions.SequenceFileError as error:
            validation_result.add_error(error)
            logging.error(
                "Errors occurred while building sequence run from sample sheet"
            )
            raise exceptions.ValidationError(
                "Errors occurred while building sequence run from sample sheet",
                validation_result)

        return sequencing_run
コード例 #10
0
ファイル: validation.py プロジェクト: pvanheus/irida-uploader
def validate_sample_sheet(sample_sheet_file):
    """
    Checks if the given sample_sheet_file can be parsed

    Requires a [Data] section (needed for creating Sample objects) whose
    header row contains the Sample_Name, Project_ID, File_Forward and
    File_Reverse table headers.
    (BUG FIX: the previous docstring described a different parser's
    requirements -- [Header]/Workflow and Sample_ID/Sample_Project headers --
    which this function does not check.)

    arguments:
            sample_sheet_file -- path to SampleSheet.csv

    returns ValidationResult object - stores list of error objects
    """

    csv_reader = common.get_csv_reader(sample_sheet_file)

    v_res = model.ValidationResult()

    data_sect_found = False
    check_data_headers = False

    # status of required data headers
    found_data_headers = {
        "Sample_Name": False,
        "Project_ID": False,
        "File_Forward": False,
        "File_Reverse": False
    }

    for line in csv_reader:

        if "[Data]" in line:
            data_sect_found = True
            check_data_headers = True  # next line contains data headers

        elif check_data_headers:
            for data_header in found_data_headers:
                if data_header in line:
                    found_data_headers[data_header] = True
            check_data_headers = False

    if not data_sect_found:
        v_res.add_error(
            exceptions.SampleSheetError(
                "[Data] section not found in SampleSheet",
                sample_sheet_file))

    if not all(found_data_headers.values()):
        # list the headers that were never seen, in declaration order
        missing_str = ", ".join(
            header for header, found in found_data_headers.items()
            if not found)
        v_res.add_error(
            exceptions.SampleSheetError(
                "Missing required data header(s): " + missing_str,
                sample_sheet_file))

    return v_res