Example #1
    def setUp(self):
        fd, self.seqs_fp = mkstemp(suffix='_seqs.fastq')
        close(fd)
        fd, self.barcodes_fp = mkstemp(suffix='_barcodes.fastq')
        close(fd)
        self.filetype = 2
        self.filepaths = [(self.seqs_fp, 1), (self.barcodes_fp, 2)]
        _, self.db_test_raw_dir = get_mountpoint('raw_data')[0]

        with open(self.seqs_fp, "w") as f:
            f.write("\n")
        with open(self.barcodes_fp, "w") as f:
            f.write("\n")
        self._clean_up_files = []

        # Create some new PrepTemplates
        metadata_dict = {
            'SKB8.640193': {'center_name': 'ANL',
                            'primer': 'GTGCCAGCMGCCGCGGTAA',
                            'barcode': 'GTCCGCAAGTTA',
                            'run_prefix': "s_G1_L001_sequences",
                            'platform': 'ILLUMINA',
                            'library_construction_protocol': 'AAAA',
                            'experiment_design_description': 'BBBB'}}
        metadata = pd.DataFrame.from_dict(metadata_dict, orient='index')
        self.pt1 = PrepTemplate.create(metadata, Study(1), "16S")
        self.pt2 = PrepTemplate.create(metadata, Study(1), "18S")
        self.prep_templates = [self.pt1, self.pt2]
Example #2
    def test_get_sample_names_by_run_prefix(self):
        metadata_dict = {
            'SKB8.640193': {'run_prefix': "s1", 'primer': 'A',
                            'barcode': 'A', 'center_name': 'ANL',
                            'platform': 'ILLUMINA',
                            'instrument_model': 'Illumina MiSeq',
                            'library_construction_protocol': 'A',
                            'experiment_design_description': 'A'},
            'SKD8.640184': {'run_prefix': "s2", 'primer': 'A',
                            'barcode': 'A', 'center_name': 'ANL',
                            'platform': 'ILLUMINA',
                            'instrument_model': 'Illumina MiSeq',
                            'library_construction_protocol': 'A',
                            'experiment_design_description': 'A'},
            'SKB7.640196': {'run_prefix': "s3", 'primer': 'A',
                            'barcode': 'A', 'center_name': 'ANL',
                            'platform': 'ILLUMINA',
                            'instrument_model': 'Illumina MiSeq',
                            'library_construction_protocol': 'A',
                            'experiment_design_description': 'A'}}
        md_template = pd.DataFrame.from_dict(metadata_dict, orient='index')
        prep_template = PrepTemplate.create(md_template, Study(1), '16S')
        for _, fp in prep_template.get_filepaths():
            self.files_to_remove.append(fp)

        obs = _get_sample_names_by_run_prefix(prep_template)

        exp = {'s3': '1.SKB7.640196', 's2': '1.SKD8.640184',
               's1': '1.SKB8.640193'}
        self.assertEqual(obs, exp)

        # This should raise an error: two samples share the same run_prefix
        metadata_dict = {
            'SKB8.640193': {'run_prefix': "s1", 'primer': 'A',
                            'barcode': 'A', 'center_name': 'ANL',
                            'platform': 'ILLUMINA',
                            'instrument_model': 'Illumina MiSeq',
                            'library_construction_protocol': 'A',
                            'experiment_design_description': 'A'},
            'SKD8.640184': {'run_prefix': "s1", 'primer': 'A',
                            'barcode': 'A', 'center_name': 'ANL',
                            'platform': 'ILLUMINA',
                            'instrument_model': 'Illumina MiSeq',
                            'library_construction_protocol': 'A',
                            'experiment_design_description': 'A'},
            'SKB7.640196': {'run_prefix': "s3", 'primer': 'A',
                            'barcode': 'A', 'center_name': 'ANL',
                            'platform': 'ILLUMINA',
                            'instrument_model': 'Illumina MiSeq',
                            'library_construction_protocol': 'A',
                            'experiment_design_description': 'A'}}
        md_template = pd.DataFrame.from_dict(metadata_dict, orient='index')
        prep_template = PrepTemplate.create(md_template, Study(1), '16S')
        for _, fp in prep_template.get_filepaths():
            self.files_to_remove.append(fp)

        with self.assertRaises(ValueError):
            _get_sample_names_by_run_prefix(prep_template)
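The test above pins down the behaviour expected of _get_sample_names_by_run_prefix: each run prefix maps to exactly one (study-prefixed) sample id, and a ValueError is raised when two samples share a prefix. Below is a minimal sketch of that contract written against a plain dict; the function name, the argument shape and the error message are illustrative assumptions rather than the actual qiita_ware implementation.

    # Illustrative only: map each run_prefix to its single sample id and
    # reject duplicated prefixes, mirroring the behaviour tested above.
    def _get_sample_names_by_run_prefix_sketch(samples_by_id):
        """samples_by_id: dict of {sample_id: {'run_prefix': str, ...}}"""
        result = {}
        for sample_id, metadata in samples_by_id.items():
            prefix = metadata['run_prefix']
            if prefix in result:
                raise ValueError(
                    "Multiple samples share the run prefix '%s'" % prefix)
            result[prefix] = sample_id
        return result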
Example #3
    def remove_add_prep_template(self, fp_rpt, raw_data_id, study,
                                 data_type_id, investigation_type, callback):
        """add prep templates
        """
        PrepTemplate.create(load_template_to_dataframe(fp_rpt),
                            RawData(raw_data_id), study, int(data_type_id),
                            investigation_type=investigation_type)
        remove(fp_rpt)

        callback()
Example #4
    def remove_add_prep_template(self, fp_rpt, study, data_type_id,
                                 investigation_type):
        """add prep templates"""
        pt_id = PrepTemplate.create(
            load_template_to_dataframe(fp_rpt), study, _to_int(data_type_id),
            investigation_type=investigation_type).id
        remove(fp_rpt)
        return pt_id
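This variant (like Example #9 further down) relies on a _to_int helper that is not shown on this page; presumably it just coerces the submitted data_type_id to an integer. Below is a minimal, hypothetical stand-in under that assumption; the real handler helper may report bad input differently (for instance as an HTTP error).

    # Hypothetical stand-in for the _to_int helper used above: coerce a form
    # value to int and surface a clear error on bad input.
    def _to_int(value):
        try:
            return int(value)
        except (TypeError, ValueError):
            raise ValueError("%r is not a valid integer" % (value,))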
Example #5
    def test_create(self):
        """Creates a new PrepTemplate"""
        pt = PrepTemplate.create(self.metadata, self.new_raw_data)
        # The returned object has the correct id
        self.assertEqual(pt.id, 3)

        # The relevant rows have been added to common_prep_info.
        obs = self.conn_handler.execute_fetchall(
            "SELECT * FROM qiita.common_prep_info WHERE raw_data_id=3")
        # raw_data_id, sample_id, center_name, center_project_name,
        # emp_status_id, data_type_id
        exp = [[3, 'SKB8.640193', 'ANL', 'Test Project', 1, 2],
               [3, 'SKD8.640184', 'ANL', 'Test Project', 1, 2],
               [3, 'SKB7.640196', 'ANL', 'Test Project', 1, 2]]
        self.assertEqual(sorted(obs), sorted(exp))

        # The relevant rows have been added to the raw_data_prep_columns
        obs = self.conn_handler.execute_fetchall(
            "SELECT * FROM qiita.raw_data_prep_columns WHERE raw_data_id=3")
        # raw_data_id, column_name, column_type
        exp = [[3, 'str_column', 'varchar'],
               [3, 'ebi_submission_accession', 'varchar']]
        self.assertEqual(obs, exp)

        # The new table exists
        self.assertTrue(exists_table("prep_3", self.conn_handler))

        # The new table hosts the correct values
        obs = self.conn_handler.execute_fetchall("SELECT * FROM qiita.prep_3")
        # sample_id, str_column, ebi_submission_accession
        exp = [['SKB8.640193', "Value for sample 1", None],
               ['SKD8.640184', "Value for sample 2", None],
               ['SKB7.640196', "Value for sample 3", None]]
        self.assertEqual(sorted(obs), sorted(exp))
Example #6
    def test_get_preprocess_fastq_cmd_per_sample_FASTQ_failure(self):
        metadata_dict = {
            'SKB8.640193': {'run_prefix': "sample1_failure", 'primer': 'A',
                            'barcode': 'A', 'center_name': 'ANL',
                            'platform': 'ILLUMINA',
                            'library_construction_protocol': 'A',
                            'experiment_design_description': 'A'}}
        md_template = pd.DataFrame.from_dict(metadata_dict, orient='index')
        prep_template = PrepTemplate.create(md_template, Study(1), '16S')

        # This setup should make the command generation below fail
        fp1 = self.path_builder('sample1_failure.fastq')
        with open(fp1, 'w') as f:
            f.write('\n')
        self.files_to_remove.append(fp1)
        fp2 = self.path_builder('sample1_failure.barcodes.fastq.gz')
        with open(fp2, 'w') as f:
            f.write('\n')
        self.files_to_remove.append(fp2)
        forward_filepath_id = convert_to_id('raw_forward_seqs',
                                            'filepath_type')
        barcode_filepath_id = convert_to_id('raw_barcodes', 'filepath_type')

        fps = [(fp1, forward_filepath_id), (fp2, barcode_filepath_id)]

        filetype_id = get_filetypes()['per_sample_FASTQ']
        raw_data = RawData.create(filetype_id, [prep_template], fps)
        params = [p for p in list(PreprocessedIlluminaParams.iter())
                  if p.name == 'per sample FASTQ defaults'][0]

        with self.assertRaises(ValueError):
            _get_preprocess_fastq_cmd(raw_data, prep_template, params)
Example #7
    def test_create(self):
        """Creates a new PrepTemplate"""
        pt = PrepTemplate.create(self.metadata, self.new_raw_data)
        # The returned object has the correct id
        self.assertEqual(pt.id, 3)

        # The relevant rows have been added to common_prep_info.
        obs = self.conn_handler.execute_fetchall(
            "SELECT * FROM qiita.common_prep_info WHERE raw_data_id=3")
        # raw_data_id, sample_id, center_name, center_project_name,
        # ebi_submission_accession, ebi_study_accession, emp_status_id,
        # data_type_id
        exp = [[3, 'SKB8.640193', 'ANL', 'Test Project', None, None, 1, 2],
               [3, 'SKD8.640184', 'ANL', 'Test Project', None, None, 1, 2],
               [3, 'SKB7.640196', 'ANL', 'Test Project', None, None, 1, 2]]
        self.assertEqual(sorted(obs), sorted(exp))

        # The relevant rows have been added to the raw_data_prep_columns
        obs = self.conn_handler.execute_fetchall(
            "SELECT * FROM qiita.raw_data_prep_columns WHERE raw_data_id=3")
        # raw_data_id, column_name, column_type
        exp = [[3, "str_column", "varchar"]]
        self.assertEqual(obs, exp)

        # The new table exists
        self.assertTrue(exists_table("prep_3", self.conn_handler))

        # The new table hosts the correct values
        obs = self.conn_handler.execute_fetchall(
            "SELECT * FROM qiita.prep_3")
        # sample_id, str_column
        exp = [['SKB8.640193', "Value for sample 1"],
               ['SKD8.640184', "Value for sample 2"],
               ['SKB7.640196', "Value for sample 3"]]
        self.assertEqual(sorted(obs), sorted(exp))
Example #8
    def test_get_qiime_minimal_mapping_multiple(self):
        # We need to create a prep template in which we have different run
        # prefix values, so we can test this case
        metadata_dict = {
            'SKB8.640193': {'center_name': 'ANL',
                            'center_project_name': 'Test Project',
                            'ebi_submission_accession': None,
                            'EMP_status': 'EMP',
                            'str_column': 'Value for sample 1',
                            'primer': 'GTGCCAGCMGCCGCGGTAA',
                            'barcode': 'GTCCGCAAGTTA',
                            'run_prefix': "s_G1_L001_sequences",
                            'platform': 'ILLUMINA',
                            'library_construction_protocol': 'AAA',
                            'experiment_design_description': 'BBB'},
            'SKD8.640184': {'center_name': 'ANL',
                            'center_project_name': 'Test Project',
                            'ebi_submission_accession': None,
                            'EMP_status': 'EMP',
                            'str_column': 'Value for sample 2',
                            'primer': 'GTGCCAGCMGCCGCGGTAA',
                            'barcode': 'CGTAGAGCTCTC',
                            'run_prefix': "s_G1_L001_sequences",
                            'platform': 'ILLUMINA',
                            'library_construction_protocol': 'AAA',
                            'experiment_design_description': 'BBB'},
            'SKB7.640196': {'center_name': 'ANL',
                            'center_project_name': 'Test Project',
                            'ebi_submission_accession': None,
                            'EMP_status': 'EMP',
                            'str_column': 'Value for sample 3',
                            'primer': 'GTGCCAGCMGCCGCGGTAA',
                            'barcode': 'CCTCTGAGAGCT',
                            'run_prefix': "s_G1_L002_sequences",
                            'platform': 'ILLUMINA',
                            'library_construction_protocol': 'AAA',
                            'experiment_design_description': 'BBB'}
            }
        md_template = pd.DataFrame.from_dict(metadata_dict, orient='index')
        prep_template = PrepTemplate.create(md_template, Study(1), '16S')
        for _, fp in prep_template.get_filepaths():
            self.files_to_remove.append(fp)

        out_dir = mkdtemp()

        obs_fps = sorted(_get_qiime_minimal_mapping(prep_template, out_dir))
        exp_fps = sorted([join(out_dir, 's_G1_L001_sequences_MMF.txt'),
                          join(out_dir, 's_G1_L002_sequences_MMF.txt')])

        # Check that the returned list is as expected
        self.assertEqual(obs_fps, exp_fps)
        # Check that the files exist
        for fp in exp_fps:
            self.assertTrue(exists(fp))
        # Check the contents of the files
        for fp, contents in zip(exp_fps, [EXP_PREP_1, EXP_PREP_2]):
            with open(fp, "U") as f:
                self.assertEqual(f.read(), contents)
Example #9
    def remove_add_prep_template(self, fp_rpt, raw_data_id, study,
                                 data_type_id, investigation_type):
        """add prep templates"""
        pt_id = PrepTemplate.create(load_template_to_dataframe(fp_rpt),
                                    RawData(raw_data_id), study,
                                    _to_int(data_type_id),
                                    investigation_type=investigation_type).id
        remove(fp_rpt)
        return pt_id
Example #10
    def test_to_file(self):
        """to file writes a tab delimited file with all the metadata"""
        fd, fp = mkstemp()
        close(fd)
        pt = PrepTemplate.create(self.metadata, self.new_raw_data)
        pt.to_file(fp)
        self._clean_up_files.append(fp)
        with open(fp, 'U') as f:
            obs = f.read()
        self.assertEqual(obs, EXP_PREP_TEMPLATE)
Example #11
    def test_get_preprocess_fastq_cmd_per_sample_FASTQ(self):
        metadata_dict = {
            'SKB8.640193': {'run_prefix': "sample1", 'primer': 'A',
                            'barcode': 'A', 'center_name': 'ANL',
                            'platform': 'ILLUMINA',
                            'instrument_model': 'Illumina MiSeq',
                            'library_construction_protocol': 'A',
                            'experiment_design_description': 'A'},
            'SKD8.640184': {'run_prefix': "sample2", 'primer': 'A',
                            'barcode': 'A', 'center_name': 'ANL',
                            'platform': 'ILLUMINA',
                            'instrument_model': 'Illumina MiSeq',
                            'library_construction_protocol': 'A',
                            'experiment_design_description': 'A'}}
        md_template = pd.DataFrame.from_dict(metadata_dict, orient='index')
        prep_template = PrepTemplate.create(md_template, Study(1), '16S')

        fp1 = self.path_builder('sample1.fastq')
        with open(fp1, 'w') as f:
            f.write('\n')
        self.files_to_remove.append(fp1)
        fp2 = self.path_builder('sample2.fastq.gz')
        with open(fp2, 'w') as f:
            f.write('\n')
        self.files_to_remove.append(fp2)
        filepath_id = convert_to_id('raw_forward_seqs', 'filepath_type')

        fps = [(fp1, filepath_id), (fp2, filepath_id)]

        filetype_id = get_filetypes()['per_sample_FASTQ']
        raw_data = RawData.create(filetype_id, [prep_template], fps)
        params = [p for p in list(PreprocessedIlluminaParams.iter())
                  if p.name == 'per sample FASTQ defaults'][0]

        obs_cmd, obs_output_dir = _get_preprocess_fastq_cmd(raw_data,
                                                            prep_template,
                                                            params)

        raw_fps = ','.join([fp for _, fp, _ in
                            sorted(raw_data.get_filepaths())])
        exp_cmd = (
            "split_libraries_fastq.py --store_demultiplexed_fastq -i "
            "{} --sample_ids 1.SKB8.640193,1.SKD8.640184 -o {} --barcode_type "
            "not-barcoded --max_bad_run_length 3 --max_barcode_errors 1.5 "
            "--min_per_read_length_fraction 0.75 --phred_quality_threshold 3 "
            "--sequence_max_n 0").format(raw_fps, obs_output_dir)
        self.assertEqual(obs_cmd, exp_cmd)
Example #12
    def test_load_data_from_cmd(self):
        filepaths = [self.forward_fp, self.reverse_fp, self.barcodes_fp]
        filepath_types = ['raw_forward_seqs', 'raw_reverse_seqs',
                          'raw_barcodes']

        filetype = 'FASTQ'
        metadata_dict = {
            'SKB8.640193': {'center_name': 'ANL',
                            'primer': 'GTGCCAGCMGCCGCGGTAA',
                            'barcode': 'GTCCGCAAGTTA',
                            'run_prefix': "s_G1_L001_sequences",
                            'platform': 'ILLUMINA',
                            'instrument_model': 'Illumina MiSeq',
                            'library_construction_protocol': 'AAAA',
                            'experiment_design_description': 'BBBB'}}
        metadata = pd.DataFrame.from_dict(metadata_dict, orient='index')
        pt1 = PrepTemplate.create(metadata, Study(1), "16S")
        prep_templates = [pt1.id]

        initial_raw_count = get_count('qiita.raw_data')
        initial_fp_count = get_count('qiita.filepath')
        initial_raw_fp_count = get_count('qiita.raw_filepath')

        new = load_raw_data_cmd(filepaths, filepath_types, filetype,
                                prep_templates)
        raw_data_id = new.id
        self.files_to_remove.append(
            join(self.db_test_raw_dir,
                 '%d_%s' % (raw_data_id, basename(self.forward_fp))))
        self.files_to_remove.append(
            join(self.db_test_raw_dir,
                 '%d_%s' % (raw_data_id, basename(self.reverse_fp))))
        self.files_to_remove.append(
            join(self.db_test_raw_dir,
                 '%d_%s' % (raw_data_id, basename(self.barcodes_fp))))

        self.assertTrue(check_count('qiita.raw_data', initial_raw_count + 1))
        self.assertTrue(check_count('qiita.filepath',
                                    initial_fp_count + 3))
        self.assertTrue(check_count('qiita.raw_filepath',
                                    initial_raw_fp_count + 3))

        # Ensure that the ValueError is raised when a filepath_type is not
        # provided for each and every filepath
        with self.assertRaises(ValueError):
            load_raw_data_cmd(filepaths, filepath_types[:-1], filetype,
                              prep_templates)
Example #13
    def test_move_filepaths_to_upload_folder(self):
        # setting up test, done here as this is the only test that uses these
        # files
        fd, seqs_fp = mkstemp(suffix="_seqs.fastq")
        close(fd)
        st = Study(1)
        metadata_dict = {
            "SKB8.640193": {
                "center_name": "ANL",
                "primer": "GTGCCAGCMGCCGCGGTAA",
                "barcode": "GTCCGCAAGTTA",
                "run_prefix": "s_G1_L001_sequences",
                "platform": "ILLUMINA",
                "library_construction_protocol": "AAAA",
                "experiment_design_description": "BBBB",
            }
        }
        metadata = pd.DataFrame.from_dict(metadata_dict, orient="index")
        pt = PrepTemplate.create(metadata, Study(1), "16S")

        rd = RawData.create(2, [pt], [(seqs_fp, 1)])
        filepaths = rd.get_filepaths()
        # deleting reference so we can directly call
        # move_filepaths_to_upload_folder
        for fid, _, _ in filepaths:
            self.conn_handler.execute("DELETE FROM qiita.raw_filepath WHERE filepath_id=%s", (fid,))

        # moving filepaths
        move_filepaths_to_upload_folder(st.id, filepaths)

        # check that they do not exist in the old path but do in the new one
        path_for_removal = join(get_mountpoint("uploads")[0][1], str(st.id))
        for _, fp, _ in filepaths:
            self.assertFalse(exists(fp))
            new_fp = join(path_for_removal, basename(fp).split("_", 1)[1])
            self.assertTrue(exists(new_fp))

            self.files_to_remove.append(new_fp)
Example #14
    def generate_new_study_with_preprocessed_data(self):
        """Creates a new study up to the processed data for testing"""
        # ignoring warnings generated when adding templates
        simplefilter("ignore")
        info = {
            "timeseries_type_id": 1,
            "metadata_complete": True,
            "mixs_compliant": True,
            "number_samples_collected": 3,
            "number_samples_promised": 3,
            "study_alias": "Test EBI",
            "study_description": "Study for testing EBI",
            "study_abstract": "Study for testing EBI",
            "emp_person_id": StudyPerson(2),
            "principal_investigator_id": StudyPerson(3),
            "lab_person_id": StudyPerson(1)
        }
        study = Study.create(User('*****@*****.**'), "Test EBI study", [1], info)
        metadata_dict = {
            'Sample1': {'collection_timestamp': datetime(2015, 6, 1, 7, 0, 0),
                        'physical_specimen_location': 'location1',
                        'taxon_id': 9606,
                        'scientific_name': 'homo sapiens',
                        'Description': 'Test Sample 1'},
            'Sample2': {'collection_timestamp': datetime(2015, 6, 2, 7, 0, 0),
                        'physical_specimen_location': 'location1',
                        'taxon_id': 9606,
                        'scientific_name': 'homo sapiens',
                        'Description': 'Test Sample 2'},
            'Sample3': {'collection_timestamp': datetime(2015, 6, 3, 7, 0, 0),
                        'physical_specimen_location': 'location1',
                        'taxon_id': 9606,
                        'scientific_name': 'homo sapiens',
                        'Description': 'Test Sample 3'}
        }
        metadata = pd.DataFrame.from_dict(metadata_dict, orient='index')
        SampleTemplate.create(metadata, study)
        metadata_dict = {
            'Sample1': {'primer': 'GTGCCAGCMGCCGCGGTAA',
                        'barcode': 'CGTAGAGCTCTC',
                        'center_name': 'KnightLab',
                        'platform': 'ILLUMINA',
                        'instrument_model': 'Illumina MiSeq',
                        'library_construction_protocol': 'Protocol ABC',
                        'experiment_design_description': "Random value 1"},
            'Sample2': {'primer': 'GTGCCAGCMGCCGCGGTAA',
                        'barcode': 'CGTAGAGCTCTA',
                        'center_name': 'KnightLab',
                        'platform': 'ILLUMINA',
                        'instrument_model': 'Illumina MiSeq',
                        'library_construction_protocol': 'Protocol ABC',
                        'experiment_design_description': "Random value 2"},
            'Sample3': {'primer': 'GTGCCAGCMGCCGCGGTAA',
                        'barcode': 'CGTAGAGCTCTT',
                        'center_name': 'KnightLab',
                        'platform': 'ILLUMINA',
                        'instrument_model': 'Illumina MiSeq',
                        'library_construction_protocol': 'Protocol ABC',
                        'experiment_design_description': "Random value 3"},
        }
        metadata = pd.DataFrame.from_dict(metadata_dict, orient='index')
        pt = PrepTemplate.create(metadata, study, "16S", 'Metagenomics')
        fna_fp = join(self.temp_dir, 'seqs.fna')
        demux_fp = join(self.temp_dir, 'demux.seqs')
        with open(fna_fp, 'w') as f:
            f.write(FASTA_EXAMPLE_2.format(study.id))
        with File(demux_fp, 'w') as f:
            to_hdf5(fna_fp, f)

        ppd = PreprocessedData.create(
            study, "preprocessed_sequence_illumina_params", 1,
            [(demux_fp, 6)], pt)

        return ppd
Example #15
def create_templates_from_qiime_mapping_file(fp, study, data_type):
    """Creates a sample template and a prep template from qiime mapping file

    Parameters
    ----------
    fp : str or file-like object
        Path to the QIIME mapping file
    study : Study
        The study to which the sample template belongs
    data_type : str or int
        The data_type of the prep_template

    Returns
    -------
    (SampleTemplate, PrepTemplate)
        The templates created from the QIIME mapping file
    """
    qiime_map = load_template_to_dataframe(fp, index='#SampleID')

    # There are a few columns in the QIIME mapping file that are special and
    # we know how to deal with them
    rename_cols = {
        'BarcodeSequence': 'barcode',
        'LinkerPrimerSequence': 'primer',
        'Description': 'description',
    }

    if 'ReverseLinkerPrimer' in qiime_map:
        rename_cols['ReverseLinkerPrimer'] = 'reverselinkerprimer'

    missing = set(rename_cols).difference(qiime_map.columns)
    if missing:
        raise QiitaWareError(
            "Error generating the templates from the QIIME mapping file. "
            "Missing QIIME mapping file columns: %s" % ', '.join(missing))

    qiime_map.rename(columns=rename_cols, inplace=True)

    # Fix the casing in the columns that we control
    qiime_map.columns = [c.lower() if c.lower() in CONTROLLED_COLS else c
                         for c in qiime_map.columns]

    # Figure out which columns belong to the prep template
    def _col_iterator(restriction_set):
        for restriction in viewvalues(restriction_set):
            for cols in viewkeys(restriction.columns):
                yield cols

    pt_cols = set(col for col in _col_iterator(PREP_TEMPLATE_COLUMNS))

    data_type_str = (convert_from_id(data_type, "data_type")
                     if isinstance(data_type, (int, long)) else data_type)

    if data_type_str in TARGET_GENE_DATA_TYPES:
        pt_cols.update(
            col for col in _col_iterator(PREP_TEMPLATE_COLUMNS_TARGET_GENE))
        pt_cols.add('reverselinkerprimer')

    qiime_cols = set(qiime_map.columns)
    pt_cols = qiime_cols.intersection(pt_cols)
    st_cols = qiime_cols.difference(pt_cols)

    st_md = qiime_map.ix[:, st_cols]
    pt_md = qiime_map.ix[:, pt_cols]

    return (SampleTemplate.create(st_md, study),
            PrepTemplate.create(pt_md, study, data_type))
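A short usage sketch for the function above, assuming study 1 from the test database and a QIIME mapping file on disk; the file name and the '16S' data type are illustrative.

    # Illustrative call: both templates are created in one step from the
    # mapping file and attached to the given study. Assumes the qiita
    # imports used elsewhere on this page (e.g. Study) are in scope and
    # that 'qiime_mapping_file.txt' exists.
    sample_template, prep_template = create_templates_from_qiime_mapping_file(
        'qiime_mapping_file.txt', Study(1), '16S')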
Example #16
    def test_get_qiime_minimal_mapping_multiple(self):
        # We need to create a prep template in which we have different run
        # prefix values, so we can test this case
        metadata_dict = {
            'SKB8.640193': {
                'center_name': 'ANL',
                'center_project_name': 'Test Project',
                'ebi_submission_accession': None,
                'EMP_status': 'EMP',
                'str_column': 'Value for sample 1',
                'linkerprimersequence': 'GTGCCAGCMGCCGCGGTAA',
                'barcodesequence': 'GTCCGCAAGTTA',
                'run_prefix': "s_G1_L001_sequences",
                'platform': 'ILLUMINA',
                'library_construction_protocol': 'AAA',
                'experiment_design_description': 'BBB'
            },
            'SKD8.640184': {
                'center_name': 'ANL',
                'center_project_name': 'Test Project',
                'ebi_submission_accession': None,
                'EMP_status': 'EMP',
                'str_column': 'Value for sample 2',
                'linkerprimersequence': 'GTGCCAGCMGCCGCGGTAA',
                'barcodesequence': 'CGTAGAGCTCTC',
                'run_prefix': "s_G1_L001_sequences",
                'platform': 'ILLUMINA',
                'library_construction_protocol': 'AAA',
                'experiment_design_description': 'BBB'
            },
            'SKB7.640196': {
                'center_name': 'ANL',
                'center_project_name': 'Test Project',
                'ebi_submission_accession': None,
                'EMP_status': 'EMP',
                'str_column': 'Value for sample 3',
                'linkerprimersequence': 'GTGCCAGCMGCCGCGGTAA',
                'barcodesequence': 'CCTCTGAGAGCT',
                'run_prefix': "s_G1_L002_sequences",
                'platform': 'ILLUMINA',
                'library_construction_protocol': 'AAA',
                'experiment_design_description': 'BBB'
            }
        }
        md_template = pd.DataFrame.from_dict(metadata_dict, orient='index')
        prep_template = PrepTemplate.create(md_template, RawData(2), Study(1),
                                            '16S')

        out_dir = mkdtemp()

        obs_fps = sorted(_get_qiime_minimal_mapping(prep_template, out_dir))
        exp_fps = sorted([
            join(out_dir, 's_G1_L001_sequences_MMF.txt'),
            join(out_dir, 's_G1_L002_sequences_MMF.txt')
        ])

        # Check that the returned list is as expected
        self.assertEqual(obs_fps, exp_fps)
        # Check that the files exist
        for fp in exp_fps:
            self.assertTrue(exists(fp))
        # Check the contents of the files
        for fp, contents in zip(exp_fps, [EXP_PREP_1, EXP_PREP_2]):
            with open(fp, "U") as f:
                self.assertEqual(f.read(), contents)
Example #17
    def setUp(self):
        self.db_dir = get_db_files_base_dir()

        # Create an SFF dataset: add a prep template and a RawData
        study = Study(1)
        md_dict = {
            'SKB8.640193': {'center_name': 'ANL',
                            'primer': 'GTGCCAGCMGCCGCGGTAA',
                            'barcode': 'GTCCGCAAGTTA',
                            'run_prefix': "preprocess_test",
                            'platform': 'ILLUMINA',
                            'library_construction_protocol': 'AAAA',
                            'experiment_design_description': 'BBBB'},
            'SKD8.640184': {'center_name': 'ANL',
                            'primer': 'GTGCCAGCMGCCGCGGTAA',
                            'barcode': 'CGTAGAGCTCTC',
                            'run_prefix': "preprocess_test",
                            'platform': 'ILLUMINA',
                            'library_construction_protocol': 'AAAA',
                            'experiment_design_description': 'BBBB'},
            'SKB7.640196': {'center_name': 'ANL',
                            'primer': 'GTGCCAGCMGCCGCGGTAA',
                            'barcode': 'CCTCTGAGAGCT',
                            'run_prefix': "preprocess_test",
                            'platform': 'ILLUMINA',
                            'library_construction_protocol': 'AAAA',
                            'experiment_design_description': 'BBBB'}
        }
        md = pd.DataFrame.from_dict(md_dict, orient='index')
        self.sff_prep_template = PrepTemplate.create(md, study, "16S")

        tmp_dir = mkdtemp()
        self.path_builder = partial(join, tmp_dir)
        fp1 = self.path_builder('preprocess_test1.sff')
        with open(fp1, 'w') as f:
            f.write('\n')
        fp2 = self.path_builder('preprocess_test2.sff')
        with open(fp2, 'w') as f:
            f.write('\n')
        self.raw_sff_id = convert_to_id('raw_sff', 'filepath_type')
        fps = [(fp1, self.raw_sff_id), (fp2, self.raw_sff_id)]

        # Magic number 1 is the filetype id
        self.raw_data = RawData.create(1, [self.sff_prep_template], fps)

        md = pd.DataFrame.from_dict(md_dict, orient='index')
        self.sff_prep_template_gz = PrepTemplate.create(md, study, "16S")
        fp1_gz = self.path_builder('preprocess_test1.sff.gz')
        with gzip.open(fp1_gz, 'w') as f:
            f.write('\n')
        fps = [(fp1_gz, self.raw_sff_id)]
        self.raw_data_gz = RawData.create(1, [self.sff_prep_template_gz], fps)

        # Create an SFF dataset with multiple run prefixes:
        # add a prep template and a RawData
        md_dict['SKD8.640184']['run_prefix'] = "new"
        md_rp = pd.DataFrame.from_dict(md_dict, orient='index')
        self.sff_prep_template_rp = PrepTemplate.create(md_rp, study, "16S")

        rp_fp1 = self.path_builder('preprocess_test1.sff')
        with open(rp_fp1, 'w') as f:
            f.write('\n')
        rp_fp2 = self.path_builder('preprocess_test2.sff')
        with open(rp_fp2, 'w') as f:
            f.write('\n')
        fps = [(rp_fp1, self.raw_sff_id), (rp_fp2, self.raw_sff_id)]

        # Magic number 1 is the filetype id
        self.raw_data_rp = RawData.create(1, [self.sff_prep_template_rp], fps)

        # Make sure that we clean up all created files
        self.files_to_remove = [fp1, fp2, rp_fp1, rp_fp2]
        self.dirs_to_remove = [tmp_dir]

        for pt in [self.sff_prep_template, self.sff_prep_template_rp]:
            for _, fp in pt.get_filepaths():
                self.files_to_remove.append(fp)
Example #18
    def test_create_duplicate(self):
        """Create raises an error when creating a duplicated PrepTemplate"""
        with self.assertRaises(QiitaDBDuplicateError):
            PrepTemplate.create(self.metadata, self.test_raw_data)
Example #19
    def test_create_duplicate_header(self):
        """Create raises an error when duplicate headers are present"""
        self.metadata['STR_COLUMN'] = pd.Series(['', '', ''],
                                                index=self.metadata.index)
        with self.assertRaises(QiitaDBDuplicateHeaderError):
            PrepTemplate.create(self.metadata, self.new_raw_data)