def _make_seq_run():
    """
    Build a SequencingRun fixture that points at the bundled fake NGS data

    Three samples are created, each holding one forward/reverse pair of
    fastq.gz files from fake_ngs_data/Data/Intensities/BaseCalls. All three
    samples use the same name, description and sample number. They are
    grouped into a single project, which is wrapped in a paired-end
    "miseq" sequencing run.

    :return: SequencingRun object
    """
    base_calls_dir = path.join(path_to_module, "fake_ngs_data", "Data",
                               "Intensities", "BaseCalls")

    # (forward, reverse) file names, one pair per sample
    fastq_pairs = (
        ("01-1111_S1_L001_R1_001.fastq.gz", "01-1111_S1_L001_R2_001.fastq.gz"),
        ("02-2222_S1_L001_R1_001.fastq.gz", "02-2222_S1_L001_R2_001.fastq.gz"),
        ("03-3333_S1_L001_R1_001.fastq.gz", "03-3333_S1_L001_R2_001.fastq.gz"),
    )

    # Build every SequenceFile first, then the samples that own them
    sequence_files = [
        model.SequenceFile(
            [path.join(base_calls_dir, file_name) for file_name in pair])
        for pair in fastq_pairs
    ]

    samples = []
    for sequence_file in sequence_files:
        sample = model.Sample("test_sample", "description", 1)
        sample.sequence_file = sequence_file
        samples.append(sample)

    project = model.Project("test_project", samples, "description")
    return model.SequencingRun({"layoutType": "PAIRED_END"}, [project],
                               "miseq")
def build_sample_list_from_sample_sheet_no_verify(sample_sheet_file):
    """
    Parse the sample sheet into Sample objects and attach a SequenceFile to each one

    The file names are taken from the sample sheet as-is; nothing here checks
    that the files exist. This is used when a pre-generated file list is used
    (e.g. cloud deployment).

    :param sample_sheet_file: sample sheet to parse
    :return: list of Sample objects, each with `sequence_file` set
    """
    samples = _parse_samples(sample_sheet_file)

    for current_sample in samples:
        uploadable = current_sample.get_uploadable_dict()
        forward_file = uploadable['File_Forward']
        reverse_file = uploadable['File_Reverse']

        # A non-empty reverse entry marks the sample as paired end
        if len(reverse_file) > 0:
            files = [forward_file, reverse_file]
        else:
            files = [forward_file]

        # Attach a copy of the freshly built sequence file object
        current_sample.sequence_file = deepcopy(
            model.SequenceFile(file_list=files))

    return samples
def parse_sample_list(sample_sheet_file, run_data_directory, run_data_directory_file_list):
    """
    Creates a list of Sample Objects, each with its matching fastq files attached

    For every sample parsed from the sample sheet, the file names in
    run_data_directory_file_list are searched with the Illumina naming pattern
    (sample name + sample number). If nothing matches, the search is repeated
    with a looser pattern that accepts any sample number.

    :param sample_sheet_file: Sample Sheet file
    :param run_data_directory: Data directory including run directory (e.g. my_run/Data/Intensities/BaseCalls)
    :param run_data_directory_file_list: The list of all files in the data directory
    :return: list of Sample objects
    :raises SequenceFileError: when no file matches a sample, or when the matched
        file list is rejected by _validate_pf_list
    """
    sample_list = _parse_samples(sample_sheet_file)

    for sample in sample_list:
        properties_dict = _parse_out_sequence_file(sample)
        # this is the Illumina-defined pattern for naming fastq files, from:
        # http://blog.basespace.illumina.com/2014/08/18/fastq-upload-in-now-available-in-basespace/
        # NOTE(review): the pattern is not anchored at the start and is used with
        # regex.search, so a sample name that is a suffix of another sample's name
        # can also match that other sample's files - confirm this is acceptable.
        file_pattern = "{sample_name}_S{sample_number}_L\\d{{3}}_R(\\d+)_\\S+\\.fastq.*$".format(
            sample_name=re.escape(sample.sample_name), sample_number=sample.sample_number)
        logging.info("Looking for files with pattern {}".format(file_pattern))
        regex = re.compile(file_pattern)
        pf_list = list(filter(regex.search, run_data_directory_file_list))
        if not pf_list:
            # OK. So we didn't find any files using the **correct** file name
            # definition according to Illumina. Let's try again with our deprecated
            # behaviour, where we didn't actually care about the sample number:
            file_pattern = "{sample_name}_S\\d+_L\\d{{3}}_R(\\d+)_\\S+\\.fastq.*$".format(
                sample_name=re.escape(sample.sample_name))
            logging.info("Looking for files with pattern {}".format(file_pattern))

            regex = re.compile(file_pattern)
            pf_list = list(filter(regex.search, run_data_directory_file_list))

            if not pf_list:
                # we **still** didn't find anything. It's pretty likely, then that
                # there aren't any fastq files in the directory that match what
                # the sample sheet says...
                raise exceptions.SequenceFileError(
                    ("The uploader was unable to find any files with a file name that ends with "
                     ".fastq.gz for the sample in your sample sheet with name {} in the directory {}. "
                     "This usually happens when the Illumina MiSeq Reporter tool "
                     "does not generate any FastQ data.").format(
                        sample.sample_name, run_data_directory))

        # List of files may be invalid if directory searching in has been modified by user
        if not _validate_pf_list(pf_list):
            raise exceptions.SequenceFileError(
                ("The following file list {} found in the directory {} is invalid. "
                 "Please verify the folder containing the sequence files matches the SampleSheet file").format(
                    pf_list, run_data_directory))

        # Add the dir to each file to create the full path
        full_path_list = [path.join(run_data_directory, file_name) for file_name in pf_list]

        sq = model.SequenceFile(file_list=full_path_list, properties_dict=properties_dict)
        sample.sequence_file = deepcopy(sq)

    return sample_list
# Example #4
# 0
def _parse_sample_list(sample_sheet_file):
    """
    Creates a list of all samples in the sample_sheet_file, with accompanying data/metadata

    The fastq files for each sample are looked up in
    <sample sheet dir>/Data/Intensities/BaseCalls/<sample_project>, and the
    matching files are attached to the sample as a SequenceFile.

    :param sample_sheet_file: path to the sample sheet; the data directories
        are resolved relative to its parent directory
    :return: list of samples
    :raises SequenceFileError: when a sample's project directory is missing,
        when no file matches a sample, or when the matched file list is
        rejected by _validate_pf_list
    """
    sample_list = _parse_samples(sample_sheet_file)
    sample_sheet_dir = path.dirname(sample_sheet_file)
    base_data_dir = path.join(sample_sheet_dir, "Data", "Intensities",
                              "BaseCalls")
    # Get the list of project directories that contain sample files
    project_dir_list = next(walk(base_data_dir))[1]

    # File listing per project directory, so each directory is only read
    # from the os once even when many samples share a project
    project_file_lists = {}

    for sample in sample_list:

        project_directory = sample.get('sample_project')
        if project_directory not in project_dir_list:
            # The project number in the sample sheet does not match a project folder in the run directory
            raise exceptions.SequenceFileError(
                "The uploader was unable to find the directory '{}' in '{}'. "
                "Please verify your SampleSheet Sample_Project matches the directory structure"
                .format(project_directory, base_data_dir))
        project_data_dir = path.join(base_data_dir, project_directory)

        if project_directory not in project_file_lists:
            project_file_lists[project_directory] = next(
                walk(project_data_dir))[2]
        data_dir_file_list = project_file_lists[project_directory]

        properties_dict = _parse_out_sequence_file(sample)
        file_pattern = "{sample_name}_S(\\S+)_R(\\d+)_(\\S*)\\.fastq.*$".format(
            sample_name=re.escape(sample.sample_name))
        logging.info("Looking for files with pattern {}".format(file_pattern))
        regex = re.compile(file_pattern)
        pf_list = list(filter(regex.search, data_dir_file_list))

        if not pf_list:
            # we didn't find anything
            raise exceptions.SequenceFileError((
                "The uploader was unable to find any files with a file name that ends with "
                ".fastq.gz for the sample in your sample sheet with name {} in the directory {}. "
            ).format(sample.sample_name, project_data_dir))

        # List of files may be invalid if directory searching in has been modified by user
        if not _validate_pf_list(pf_list):
            raise exceptions.SequenceFileError((
                "The following file list {} found in the directory {} is invalid. "
                "Please verify the folder containing the sequence files matches the SampleSheet file"
            ).format(pf_list, project_data_dir))

        # Add the dir to each file to create the full path
        full_path_list = [
            path.join(project_data_dir, file_name) for file_name in pf_list
        ]

        sq = model.SequenceFile(file_list=full_path_list,
                                properties_dict=properties_dict)
        sample.sequence_file = deepcopy(sq)

    return sample_list
def build_sample_list_from_sample_sheet_with_abs_path(sample_sheet_file):
    """
    Parse the sample sheet into Sample objects whose SequenceFile holds absolute file paths

    A file entry that is already an absolute path is used unchanged. Any other
    entry must name a file found in the sample sheet's directory, and is
    rewritten (in the sample's uploadable dict as well) to an absolute path
    inside that directory.

    :param sample_sheet_file: sample sheet to parse
    :return: list of Sample objects, each with `sequence_file` set
    :raises SampleSheetError: when a non-absolute file entry does not match any
        file in the sample sheet's directory
    """
    sample_list = _parse_samples(sample_sheet_file)
    # Directory used to resolve sample sheet entries that are not absolute paths
    data_dir = path.dirname(sample_sheet_file)
    files_next_to_sheet = common.get_file_list(data_dir)

    def _absolute_entry(entries, key):
        # Return the absolute path for entries[key], rewriting the entry when needed
        if path.isabs(entries[key]):
            return entries[key]
        if entries[key] not in files_next_to_sheet:
            raise exceptions.SampleSheetError((
                "Your sample sheet is malformed. {} Does not match any file in the directory {}"
                "".format(entries[key], data_dir)),
                                              sample_sheet_file)
        entries[key] = path.join(path.abspath(data_dir), entries[key])
        return entries[key]

    for sample in sample_list:
        sample_dict = sample.get_uploadable_dict()

        # Forward file is always present; reverse only for paired end reads
        wanted_keys = ['File_Forward']
        if len(sample_dict['File_Reverse']) > 0:
            wanted_keys.append('File_Reverse')

        file_list = [_absolute_entry(sample_dict, key) for key in wanted_keys]

        # Attach a copy of the freshly built sequence file object
        sample.sequence_file = deepcopy(
            model.SequenceFile(file_list=file_list))

    return sample_list
    def test_send_and_get_sequence_files(self):
        """
        Upload a project, a sample and a pair of sequence files, then check
        that the file names reported back match the ones that were sent
        :return:
        """
        api = self.test_api

        # upload a project and keep the identifier it was given
        project = model.Project(name="test_project_2",
                                description="test_project_description")
        project_response = api.send_project(project)
        project_identifier = project_response['resource']['identifier']

        # upload a sample into that project
        sample_name = "test_sample"
        api.send_sample(model.Sample(sample_name, "test_sample_desc"),
                        project_identifier)

        # upload a forward/reverse pair of sequence files under a new run
        expected_file_names = ('file_1.fastq.gz', 'file_2.fastq.gz')
        sequence_file = model.SequenceFile([
            path.join(path_to_module, "fake_dir_data", "file_1.fastq.gz"),
            path.join(path_to_module, "fake_dir_data", "file_2.fastq.gz"),
        ])
        upload_id = api.create_seq_run({'layoutType': 'PAIRED_END'}, 'miseq')
        api.send_sequence_files(sequence_file, sample_name,
                                project_identifier, upload_id)

        # verify sequence files match what we sent to IRIDA
        returned_sequence_files = api.get_sequence_files(
            project_identifier, sample_name)

        for position, expected_name in enumerate(expected_file_names):
            self.assertEqual(returned_sequence_files[position]['fileName'],
                             expected_name)
def _resolve_sample_sheet_file(file_name, data_dir, known_names,
                               known_full_paths, sample_sheet_file):
    """
    Return the path to attach to a sample for one sample sheet file entry

    :param file_name: file entry exactly as written in the sample sheet
    :param data_dir: directory containing the sample sheet
    :param known_names: file names found in the data directory
    :param known_full_paths: the same files, joined onto the absolute data directory
    :param sample_sheet_file: sample sheet path, passed along with the error
    :return: file_name unchanged if it is already one of the known full paths,
        otherwise data_dir joined with file_name
    :raises SampleSheetError: when file_name matches neither collection
    """
    if file_name in known_full_paths:
        # Already a full path to a file in the directory, use it as is
        return file_name
    if file_name in known_names:
        return path.join(data_dir, file_name)
    raise exceptions.SampleSheetError((
        "Your sample sheet is malformed. {} Does not match any file in the directory {}"
        "".format(file_name, data_dir)),
                                      sample_sheet_file)


def parse_sample_list(sample_sheet_file, run_data_directory_file_list):
    """
    Creates a list of all sample data in the sample_sheet_file
    Verifies data is valid for uploading

    Every file named in the sample sheet must appear in
    run_data_directory_file_list, either as a bare file name or as that name
    joined onto the absolute path of the sample sheet's directory. Each sample
    gets a SequenceFile holding its forward file and, for paired end reads,
    its reverse file. Files given as full paths are attached as well; they
    were previously validated and then left out of the file list.

    :param sample_sheet_file:
    :param run_data_directory_file_list: list of all files
    :return: list of Sample objects
    :raises SampleSheetError: when a listed file is not found in the directory,
        or when the sheet mixes paired end and single end samples
    """
    sample_list = _parse_samples(sample_sheet_file)

    data_dir = path.dirname(sample_sheet_file)
    abs_data_dir = path.abspath(data_dir)

    # Sets give constant time membership tests inside the sample loop
    known_names = set(run_data_directory_file_list)
    known_full_paths = {
        path.join(abs_data_dir, file_name) for file_name in known_names
    }

    has_paired_end_read = False
    has_single_end_read = False

    logging.info(
        "Verifying data parsed from sample sheet {}".format(sample_sheet_file))

    for sample in sample_list:

        sample_dict = sample.get_uploadable_dict()

        paired_end_read = len(sample_dict['File_Reverse']) > 0
        # keep track if we have both paired and single end reads
        if paired_end_read:
            has_paired_end_read = True
        else:
            has_single_end_read = True

        # Check both file names before touching the sample, forward first
        forward_path = _resolve_sample_sheet_file(
            sample_dict['File_Forward'], data_dir, known_names,
            known_full_paths, sample_sheet_file)
        reverse_path = None
        if paired_end_read:
            reverse_path = _resolve_sample_sheet_file(
                sample_dict['File_Reverse'], data_dir, known_names,
                known_full_paths, sample_sheet_file)

        # create file list of full paths
        sample_dict['File_Forward'] = forward_path
        file_list = [forward_path]
        if paired_end_read:
            sample_dict['File_Reverse'] = reverse_path
            file_list.append(reverse_path)

        # Create sequence file object and attach to sample
        sq = model.SequenceFile(file_list=file_list)
        sample.sequence_file = deepcopy(sq)

    # Verify we don't have both single end and paired end reads
    if has_single_end_read and has_paired_end_read:
        raise exceptions.SampleSheetError(
            ("Your sample sheet is malformed. "
             "SampleSheet cannot have both paired end and single end runs. "
             "Make sure all samples are either paired or single."),
            sample_sheet_file)

    return sample_list