def generate_demux_file(sl_out):
    """Creates the HDF5 demultiplexed file

    Parameters
    ----------
    sl_out : str
        Path to the output directory of split libraries

    Returns
    -------
    str
        The path of the demux file

    Raises
    ------
    ValueError
        If the split libraries output does not contain the demultiplexed
        fastq file
    """
    # `join` already returns a str; the previous explicit str() wrapper
    # was redundant
    fastq_fp = join(sl_out, 'seqs.fastq')
    if not exists(fastq_fp):
        raise ValueError("The split libraries output directory does not "
                         "contain the demultiplexed fastq file.")
    demux_fp = join(sl_out, 'seqs.demux')
    with File(demux_fp, "w") as f:
        to_hdf5(fastq_fp, f)

    return demux_fp
def write_demux_files(self, prep_template, generate_hdf5=True):
    """Writes a demux test file to avoid duplication of code"""
    fna_fp = join(self.temp_dir, 'seqs.fna')
    demux_fp = join(self.temp_dir, 'demux.seqs')
    if generate_hdf5:
        # Build a real HDF5 demux file from the FASTA example
        with open(fna_fp, 'w') as fasta:
            fasta.write(FASTA_EXAMPLE)
        with File(demux_fp, "w") as demux:
            to_hdf5(fna_fp, demux)
    else:
        # An empty placeholder file is enough for this case
        open(demux_fp, 'w').close()

    # Shared keyword arguments for both creation paths
    submit_flags = dict(can_be_submitted_to_ebi=True,
                        can_be_submitted_to_vamps=True)
    if prep_template.artifact is None:
        # First artifact for this prep template
        ppd = Artifact.create([(demux_fp, 6)], "Demultiplexed",
                              prep_template=prep_template, **submit_flags)
    else:
        # Derive the new artifact from the existing one
        params = Parameters.from_default_params(
            DefaultParameters(1), {'input_data': prep_template.artifact.id})
        ppd = Artifact.create([(demux_fp, 6)], "Demultiplexed",
                              parents=[prep_template.artifact],
                              processing_parameters=params, **submit_flags)
    return ppd
def generate_demux_file(sl_out, **kwargs):
    """Creates the HDF5 demultiplexed file

    Parameters
    ----------
    sl_out : str
        Path to the output directory of split libraries
    kwargs : dict
        Ignored; accepted only so the function can be executed via moi.

    Returns
    -------
    str
        The path of the generated demux file

    Raises
    ------
    ValueError
        If the split libraries output does not contain the demultiplexed
        fastq file
    """
    # Deferred imports keep the function self-contained when dispatched
    # through moi
    from os.path import join, exists
    from h5py import File
    from qiita_ware.demux import to_hdf5

    fastq_fp = join(sl_out, 'seqs.fastq')
    if not exists(fastq_fp):
        raise ValueError("The split libraries output directory does not "
                         "contain the demultiplexed fastq file")

    demux_fp = join(sl_out, 'seqs.demux')
    with File(demux_fp, "w") as demux:
        to_hdf5(fastq_fp, demux)
    return demux_fp
def write_demux_files(self, prep_template, generate_hdf5=True):
    """Writes a demux test file to avoid duplication of code"""
    fna_fp = join(self.temp_dir, 'seqs.fna')
    demux_fp = join(self.temp_dir, 'demux.seqs')
    if generate_hdf5:
        # Write the FASTA example, then convert it into an HDF5 demux file
        with open(fna_fp, 'w') as fasta_file:
            fasta_file.write(FASTA_EXAMPLE)
        with File(demux_fp, "w") as demux_file:
            to_hdf5(fna_fp, demux_file)
    else:
        # Only the file's existence matters here, so create it empty
        open(demux_fp, 'w').close()

    if prep_template.artifact is None:
        # No parent artifact: attach directly to the prep template
        ppd = Artifact.create([(demux_fp, 6)], "Demultiplexed",
                              prep_template=prep_template,
                              can_be_submitted_to_ebi=True,
                              can_be_submitted_to_vamps=True)
    else:
        # Parent artifact exists: record the processing parameters used
        params = Parameters.from_default_params(
            DefaultParameters(1), {'input_data': prep_template.artifact.id})
        ppd = Artifact.create([(demux_fp, 6)], "Demultiplexed",
                              parents=[prep_template.artifact],
                              processing_parameters=params,
                              can_be_submitted_to_ebi=True,
                              can_be_submitted_to_vamps=True)
    return ppd
def generate_demux_file(sl_out):
    """Creates the HDF5 demultiplexed file

    Parameters
    ----------
    sl_out : str
        Path to the output directory of split libraries

    Returns
    -------
    str
        The path of the demux file

    Raises
    ------
    ValueError
        If the split libraries output does not contain the demultiplexed
        fastq file
    """
    # `join` already returns a str; the previous explicit str() wrapper
    # was redundant
    fastq_fp = join(sl_out, 'seqs.fastq')
    if not exists(fastq_fp):
        raise ValueError("The split libraries output directory does not "
                         "contain the demultiplexed fastq file.")
    demux_fp = join(sl_out, 'seqs.demux')
    with File(demux_fp, "w") as f:
        to_hdf5(fastq_fp, f)

    return demux_fp
def generate_demux_file(sl_out, **kwargs):
    """Creates the HDF5 demultiplexed file

    Parameters
    ----------
    sl_out : str
        Path to the output directory of split libraries
    kwargs : dict
        Ignored; accepted only so the function can be executed via moi.

    Returns
    -------
    str
        The path of the generated demux file

    Raises
    ------
    ValueError
        If the split libraries output does not contain the demultiplexed
        fastq file
    """
    # Imported lazily so the function works when invoked through moi
    from os.path import join, exists
    from h5py import File
    from qiita_ware.demux import to_hdf5

    fastq_fp = join(sl_out, 'seqs.fastq')
    if not exists(fastq_fp):
        raise ValueError("The split libraries output directory does not "
                         "contain the demultiplexed fastq file")

    demux_fp = join(sl_out, 'seqs.demux')
    with File(demux_fp, "w") as demux:
        to_hdf5(fastq_fp, demux)
    return demux_fp
def test_to_hdf5(self):
    # delete=False + explicit close: to_hdf5 re-opens the file by path,
    # and cleanup is handled later via self.to_remove
    with tempfile.NamedTemporaryFile('r+', suffix='.fna',
                                     delete=False) as tmp:
        tmp.write(seqdata)
        tmp.flush()
        tmp.close()
        to_hdf5(tmp.name, self.hdf5_file)
        self.to_remove.append(tmp.name)

    # Every (dataset path, expected values) pair the conversion must produce
    expected = [
        ('a/sequence', ["x", "xy", "xyz"]),
        ('a/qual', [[0, 0, 0], [0, 0, 0], [0, 0, 0]]),
        ('a/barcode/original', ["abc", "aby", "abz"]),
        ('a/barcode/corrected', ["abc", "ybc", "zbc"]),
        ('a/barcode/error', [0, 2, 3]),
        ('b/sequence', ["xyz", "abcd"]),
        ('b/qual', [[0, 0, 0, 0], [0, 0, 0, 0]]),
        ('b/barcode/original', ["abx", "abw"]),
        ('b/barcode/corrected', ["xbc", "wbc"]),
        ('b/barcode/error', [1, 4]),
    ]
    for path, values in expected:
        npt.assert_equal(self.hdf5_file[path][:], np.array(values))
def test_to_hdf5(self):
    # delete=False so the file survives the `with` block; it is removed
    # later through self.to_remove
    with tempfile.NamedTemporaryFile('r+', suffix='.fna',
                                     delete=False) as tmp:
        tmp.write(seqdata)
    self.to_remove.append(tmp.name)

    to_hdf5(tmp.name, self.hdf5_file)

    # Table of (dataset path, expected values) the conversion must produce
    expected = [
        ('a/sequence', ["x", "xy", "xyz"]),
        ('a/qual', [[0, 0, 0], [0, 0, 0], [0, 0, 0]]),
        ('a/barcode/original', ["abc", "aby", "abz"]),
        ('a/barcode/corrected', ["abc", "ybc", "zbc"]),
        ('a/barcode/error', [0, 2, 3]),
        ('b/sequence', ["xyz", "abcd"]),
        ('b/qual', [[0, 0, 0, 0], [0, 0, 0, 0]]),
        ('b/barcode/original', ["abx", "abw"]),
        ('b/barcode/corrected', ["xbc", "wbc"]),
        ('b/barcode/error', [1, 4]),
    ]
    for path, values in expected:
        npt.assert_equal(self.hdf5_file[path][:], np.array(values))
def test_get(self):
    # Locate the demux filepath attached to artifact 2
    demux_matches = [fp for _, fp, fp_type in Artifact(2).filepaths
                     if fp_type == 'preprocessed_demux']
    demux_fp = demux_matches[0]
    fd, fna_fp = mkstemp(suffix='_seqs.fna')
    close(fd)
    self._clean_up_files.extend([fna_fp, demux_fp])

    # Overwrite the demux file with a tiny single-read dataset
    with open(fna_fp, 'w') as fasta:
        fasta.write('>a_1 X orig_bc=X new_bc=X bc_diffs=0\nCCC')
    with File(demux_fp, "w") as demux:
        to_hdf5(fna_fp, demux)

    BaseHandler.get_current_user = Mock(return_value=User("*****@*****.**"))
    response = self.get("/ebi_submission/2")
    self.assertEqual(response.code, 200)
def test_to_ascii(self):
    # delete=False so to_hdf5 can re-open the file by path; removed later
    with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                     delete=False) as tmp:
        tmp.write(fqdata)
    self.to_remove.append(tmp.name)

    to_hdf5(tmp.name, self.hdf5_file)

    # Expected FASTQ records, one bytes literal per read
    exp = [
        b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\nABC\n",
        b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDFG\n",
        b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDEF\n",
    ]
    obs = list(to_ascii(self.hdf5_file, samples=['a', 'b']))
    self.assertEqual(obs, exp)
def test_to_ascii(self):
    # delete=False + explicit close: to_hdf5 re-opens the file by path,
    # and cleanup happens later via self.to_remove
    with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                     delete=False) as tmp:
        tmp.write(fqdata)
        tmp.flush()
        tmp.close()
        to_hdf5(tmp.name, self.hdf5_file)
        self.to_remove.append(tmp.name)

    # Expected FASTQ records, one bytes literal per read
    exp = [
        b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\nABC\n",
        b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDFG\n",
        b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDEF\n",
    ]
    obs = list(to_ascii(self.hdf5_file, samples=['a', 'b']))
    self.assertEqual(obs, exp)
def _generate_files(self):
    """Create a fastq file, its demux counterpart and an output dir.

    Returns
    -------
    tuple of (str, str, str)
        The demux filepath, the fastq filepath and the output directory.
        All three are registered in self._clean_up_files for removal.
    """
    fd, fastq_fp = mkstemp(suffix=".fastq")
    close(fd)
    with open(fastq_fp, 'w') as f:
        f.write(FASTQ_SEQS)

    demux_fp = "%s.demux" % fastq_fp
    # Open explicitly in write mode: modern h5py defaults to read-only,
    # which fails because the demux file does not exist yet (the sibling
    # helpers in this file already pass "w")
    with File(demux_fp, "w") as f:
        to_hdf5(fastq_fp, f)

    out_dir = mkdtemp()

    self._clean_up_files.extend([fastq_fp, demux_fp, out_dir])

    return demux_fp, fastq_fp, out_dir
def _generate_files(self):
    """Create a fastq file, its demux counterpart and an output dir.

    Returns
    -------
    tuple of (str, str, str)
        The demux filepath, the fastq filepath and the output directory.
        All three are registered in self._clean_up_files for removal.
    """
    fd, fastq_fp = mkstemp(suffix=".fastq")
    close(fd)
    with open(fastq_fp, 'w') as f:
        f.write(FASTQ_SEQS)

    demux_fp = "%s.demux" % fastq_fp
    # Open explicitly in write mode: modern h5py defaults to read-only,
    # which fails because the demux file does not exist yet (the sibling
    # helpers in this file already pass "w")
    with File(demux_fp, "w") as f:
        to_hdf5(fastq_fp, f)

    out_dir = mkdtemp()

    self._clean_up_files.extend([fastq_fp, demux_fp, out_dir])

    return demux_fp, fastq_fp, out_dir
def test_get(self):
    # Find the 'preprocessed_demux' filepath of artifact 2
    candidates = [fp for _, fp, fp_type in Artifact(2).filepaths
                  if fp_type == 'preprocessed_demux']
    demux_fp = candidates[0]

    fd, fna_fp = mkstemp(suffix='_seqs.fna')
    close(fd)
    self._clean_up_files.extend([fna_fp, demux_fp])

    # Rebuild the demux file from a minimal one-read FASTA
    with open(fna_fp, 'w') as fasta_file:
        fasta_file.write('>a_1 X orig_bc=X new_bc=X bc_diffs=0\nCCC')
    with File(demux_fp, "w") as demux_file:
        to_hdf5(fna_fp, demux_file)

    BaseHandler.get_current_user = Mock(return_value=User("*****@*****.**"))
    response = self.get("/ebi_submission/2")
    self.assertEqual(response.code, 200)
def test_to_ascii_fasta(self):
    # delete=False so to_hdf5 can re-open the file by path; removed later
    with tempfile.NamedTemporaryFile('r+', suffix='.fna',
                                     delete=False) as tmp:
        tmp.write(seqdata)
    self.to_remove.append(tmp.name)

    to_hdf5(tmp.name, self.hdf5_file)

    # Expected FASTA records, one bytes literal per read
    exp = [
        b">a_0 orig_bc=abc new_bc=abc bc_diffs=0\nx\n",
        b">a_1 orig_bc=aby new_bc=ybc bc_diffs=2\nxy\n",
        b">a_2 orig_bc=abz new_bc=zbc bc_diffs=3\nxyz\n",
        b">b_0 orig_bc=abx new_bc=xbc bc_diffs=1\nxyz\n",
        b">b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nabcd\n",
    ]
    obs = list(to_ascii(self.hdf5_file, samples=['a', 'b']))
    self.assertEqual(obs, exp)
def test_to_per_sample_ascii(self):
    """Check to_per_sample_ascii yields per-sample FASTQ bytes records."""
    with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                     delete=False) as f:
        f.write(fqdata)
    self.to_remove.append(f.name)
    to_hdf5(f.name, self.hdf5_file)

    # Every continuation literal must carry the b prefix too: implicitly
    # concatenating a bytes literal with a str literal is a SyntaxError
    # on Python 3
    exp = [('a', [b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\n"
                  b"ABC\n"]),
           ('b', [b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                  b"DFG\n",
                  b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                  b"DEF\n"])]
    obs = [(s[0], list(s[1])) for s in to_per_sample_ascii(self.hdf5_file)]
    self.assertEqual(obs, exp)
def write_demux_files(self, prep_template, generate_hdf5=True):
    """Writes a demux test file to avoid duplication of code"""
    fna_fp = join(self.temp_dir, 'seqs.fna')
    demux_fp = join(self.temp_dir, 'demux.seqs')
    if generate_hdf5:
        # Write the FASTA example, then convert it into an HDF5 demux file
        with open(fna_fp, 'w') as fasta:
            fasta.write(FASTA_EXAMPLE)
        with File(demux_fp, "w") as demux:
            to_hdf5(fna_fp, demux)
    else:
        # Only the file's existence matters here, so create it empty
        open(demux_fp, 'w').close()

    return PreprocessedData.create(Study(1),
                                   "preprocessed_sequence_illumina_params",
                                   1, [(demux_fp, 6)], prep_template)
def test_to_per_sample_ascii(self):
    """Check to_per_sample_ascii yields per-sample FASTQ bytes records."""
    with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                     delete=False) as f:
        f.write(fqdata)
    self.to_remove.append(f.name)
    to_hdf5(f.name, self.hdf5_file)

    # Every continuation literal must carry the b prefix too: implicitly
    # concatenating a bytes literal with a str literal is a SyntaxError
    # on Python 3
    exp = [('a', [b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\n"
                  b"ABC\n"]),
           ('b', [b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                  b"DFG\n",
                  b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                  b"DEF\n"])]
    obs = [(s[0], list(s[1])) for s in to_per_sample_ascii(self.hdf5_file)]
    self.assertEqual(obs, exp)
def test_to_ascii_fasta(self):
    # delete=False so the file survives the `with` block; it is removed
    # later through self.to_remove
    with tempfile.NamedTemporaryFile('r+', suffix='.fna',
                                     delete=False) as tmp:
        tmp.write(seqdata)
    self.to_remove.append(tmp.name)

    to_hdf5(tmp.name, self.hdf5_file)

    # One expected FASTA record per read, as bytes
    exp = [
        b">a_0 orig_bc=abc new_bc=abc bc_diffs=0\nx\n",
        b">a_1 orig_bc=aby new_bc=ybc bc_diffs=2\nxy\n",
        b">a_2 orig_bc=abz new_bc=zbc bc_diffs=3\nxyz\n",
        b">b_0 orig_bc=abx new_bc=xbc bc_diffs=1\nxyz\n",
        b">b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nabcd\n",
    ]
    obs = list(to_ascii(self.hdf5_file, samples=['a', 'b']))
    self.assertEqual(obs, exp)
def test_fetch_qual_length_bug(self):
    # fetch was not trimming qual to the length of the sequence resulting
    # in qual scores for positions beyond the length of the sequence.
    with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                     delete=False) as f:
        f.write(fqdata_variable_length)
    self.to_remove.append(f.name)
    to_hdf5(f.name, self.hdf5_file)

    # All expected literals are bytes: implicitly concatenating a bytes
    # literal with a str literal is a SyntaxError on Python 3
    exp = [('a', [b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\n"
                  b"ABC\n"]),
           ('b', [b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                  b"DFG\n",
                  b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n+\n"
                  b"DEF#G\n"])]
    obs = [(s[0], list(s[1])) for s in to_per_sample_ascii(self.hdf5_file)]
    self.assertEqual(obs, exp)
def test_fetch_qual_length_bug(self):
    # fetch was not trimming qual to the length of the sequence resulting
    # in qual scores for positions beyond the length of the sequence.
    with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                     delete=False) as f:
        f.write(fqdata_variable_length)
    self.to_remove.append(f.name)
    to_hdf5(f.name, self.hdf5_file)

    # All expected literals are bytes: implicitly concatenating a bytes
    # literal with a str literal is a SyntaxError on Python 3
    exp = [('a', [b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\n"
                  b"ABC\n"]),
           ('b', [b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                  b"DFG\n",
                  b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n+\n"
                  b"DEF#G\n"])]
    obs = [(s[0], list(s[1])) for s in to_per_sample_ascii(self.hdf5_file)]
    self.assertEqual(obs, exp)
def test_to_ascii(self):
    """Check to_ascii emits FASTQ bytes records with NUL-padded quals."""
    with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                     delete=False) as f:
        f.write(fqdata)
        f.flush()
        f.close()
        to_hdf5(f.name, self.hdf5_file)
        self.to_remove.append(f.name)

    # Qual scores come back NUL-padded to 8 bytes per position. Every
    # literal must be a bytes literal: implicitly concatenating b"..."
    # with "..." is a SyntaxError on Python 3
    exp = [b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\n"
           b"A\x00\x00\x00\x00\x00\x00\x00"
           b"B\x00\x00\x00\x00\x00\x00\x00"
           b"C\x00\x00\x00\x00\x00\x00\x00\n",
           b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
           b"D\x00\x00\x00\x00\x00\x00\x00"
           b"F\x00\x00\x00\x00\x00\x00\x00"
           b"G\x00\x00\x00\x00\x00\x00\x00\n",
           b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
           b"D\x00\x00\x00\x00\x00\x00\x00"
           b"E\x00\x00\x00\x00\x00\x00\x00"
           b"F\x00\x00\x00\x00\x00\x00\x00\n"]
    obs = list(to_ascii(self.hdf5_file, samples=['a', 'b']))
    self.assertEqual(obs, exp)
def generate_new_study_with_preprocessed_data(self):
    """Creates a new study up to the processed data for testing"""
    # ignoring warnings generated when adding templates
    simplefilter("ignore")
    # Minimal study metadata required by Study.create
    info = {
        "timeseries_type_id": 1,
        "metadata_complete": True,
        "mixs_compliant": True,
        "number_samples_collected": 3,
        "number_samples_promised": 3,
        "study_alias": "Test EBI",
        "study_description": "Study for testing EBI",
        "study_abstract": "Study for testing EBI",
        "emp_person_id": StudyPerson(2),
        "principal_investigator_id": StudyPerson(3),
        "lab_person_id": StudyPerson(1)
    }
    study = Study.create(User('*****@*****.**'), "Test EBI study", [1], info)
    # Sample metadata: three samples sharing location/taxonomy, distinct
    # collection timestamps and descriptions
    metadata_dict = {
        'Sample1': {'collection_timestamp': datetime(2015, 6, 1, 7, 0, 0),
                    'physical_specimen_location': 'location1',
                    'taxon_id': 9606,
                    'scientific_name': 'h**o sapiens',
                    'Description': 'Test Sample 1'},
        'Sample2': {'collection_timestamp': datetime(2015, 6, 2, 7, 0, 0),
                    'physical_specimen_location': 'location1',
                    'taxon_id': 9606,
                    'scientific_name': 'h**o sapiens',
                    'Description': 'Test Sample 2'},
        'Sample3': {'collection_timestamp': datetime(2015, 6, 3, 7, 0, 0),
                    'physical_specimen_location': 'location1',
                    'taxon_id': 9606,
                    'scientific_name': 'h**o sapiens',
                    'Description': 'Test Sample 3'}
    }
    # dtype=str so all template values are stored as strings
    metadata = pd.DataFrame.from_dict(metadata_dict, orient='index',
                                      dtype=str)
    SampleTemplate.create(metadata, study)
    # Prep metadata: same primer/platform, unique barcode per sample
    metadata_dict = {
        'Sample1': {'primer': 'GTGCCAGCMGCCGCGGTAA',
                    'barcode': 'CGTAGAGCTCTC',
                    'center_name': 'KnightLab',
                    'platform': 'ILLUMINA',
                    'instrument_model': 'Illumina MiSeq',
                    'library_construction_protocol': 'Protocol ABC',
                    'experiment_design_description': "Random value 1"},
        'Sample2': {'primer': 'GTGCCAGCMGCCGCGGTAA',
                    'barcode': 'CGTAGAGCTCTA',
                    'center_name': 'KnightLab',
                    'platform': 'ILLUMINA',
                    'instrument_model': 'Illumina MiSeq',
                    'library_construction_protocol': 'Protocol ABC',
                    'experiment_design_description': "Random value 2"},
        'Sample3': {'primer': 'GTGCCAGCMGCCGCGGTAA',
                    'barcode': 'CGTAGAGCTCTT',
                    'center_name': 'KnightLab',
                    'platform': 'ILLUMINA',
                    'instrument_model': 'Illumina MiSeq',
                    'library_construction_protocol': 'Protocol ABC',
                    'experiment_design_description': "Random value 3"},
    }
    metadata = pd.DataFrame.from_dict(metadata_dict, orient='index',
                                      dtype=str)
    pt = PrepTemplate.create(metadata, study, "16S", 'Metagenomics')
    # Write the FASTA example for this study and convert it into an HDF5
    # demux file, which backs the Demultiplexed artifact
    fna_fp = join(self.temp_dir, 'seqs.fna')
    demux_fp = join(self.temp_dir, 'demux.seqs')
    with open(fna_fp, 'w') as f:
        f.write(FASTA_EXAMPLE_2.format(study.id))
    with File(demux_fp, 'w') as f:
        to_hdf5(fna_fp, f)
    ppd = Artifact.create(
        [(demux_fp, 6)], "Demultiplexed", prep_template=pt)
    return ppd
def _validate_demultiplexed(qclient, job_id, prep_info, files, out_dir):
    """Validate and fix a new 'Demultiplexed' artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    files : dict of {str: list of str}
        The files to add to the new artifact, keyed by filepath type
    out_dir : str
        The output directory

    Returns
    -------
    dict
        The results of the job
    """
    qclient.update_job_step(job_id, "Step 2: Validating 'Demultiplexed' files")
    # Reject any filepath type we don't know how to handle
    supported_fp_types = {'preprocessed_fasta', 'preprocessed_fastq',
                          'preprocessed_demux', 'log'}
    unsupported_fp_types = set(files) - supported_fp_types
    if unsupported_fp_types:
        return format_payload(
            success=False,
            error_msg="Filepath type(s) %s not supported by artifact type "
                      "Demultiplexed. Supported filepath types: %s"
                      % (', '.join(unsupported_fp_types),
                         ', '.join(sorted(supported_fp_types))))
    # At most one file of each type can be provided
    offending = set(fp_t for fp_t, fps in files.items() if len(fps) > 1)
    if offending:
        errors = ["%s (%d): %s" % (fp_t, len(files[fp_t]),
                                   ', '.join(files[fp_t]))
                  for fp_t in sorted(offending)]
        return format_payload(
            success=False,
            error_msg="Only one filepath of each file type is supported, "
                      "offending types:\n%s" % "; ".join(errors))
    # Check which files we have available:
    fasta = (files['preprocessed_fasta'][0]
             if 'preprocessed_fasta' in files else None)
    fastq = (files['preprocessed_fastq'][0]
             if 'preprocessed_fastq' in files else None)
    demux = (files['preprocessed_demux'][0]
             if 'preprocessed_demux' in files else None)
    log = (files['log'][0] if 'log' in files else None)
    # Preference order: demux > fastq > fasta; whichever source we have,
    # we end up validating a demux file
    if demux:
        # If demux is available, use that one to perform the validation and
        # generate the fasta and fastq from it
        payload = _validate_demux_file(qclient, job_id, prep_info, out_dir,
                                       demux, log_fp=log)
    elif fastq:
        # Generate the demux file from the fastq
        demux = join(out_dir, "%s.demux" % splitext(basename(fastq))[0])
        with File(demux, "w") as f:
            to_hdf5(fastq, f)
        # Validate the demux, providing the original fastq
        payload = _validate_demux_file(qclient, job_id, prep_info, out_dir,
                                       demux, fastq_fp=fastq, log_fp=log)
    elif fasta:
        # Generate the demux file from the fasta
        demux = join(out_dir, "%s.demux" % splitext(basename(fasta))[0])
        with File(demux, "w") as f:
            to_hdf5(fasta, f)
        # Validate the demux, providing the original fasta
        payload = _validate_demux_file(qclient, job_id, prep_info, out_dir,
                                       demux, fasta_fp=fasta, log_fp=log)
    else:
        # No usable source file at all
        payload = format_payload(
            success=False,
            error_msg="Either a 'preprocessed_demux', 'preprocessed_fastq' or "
                      "'preprocessed_fasta' file should be provided.")
    return payload
def generate_new_study_with_preprocessed_data(self):
    """Creates a new study up to the processed data for testing"""
    # ignoring warnings generated when adding templates
    simplefilter("ignore")
    # Minimal study metadata required by Study.create
    info = {
        "timeseries_type_id": 1,
        "metadata_complete": True,
        "mixs_compliant": True,
        "number_samples_collected": 3,
        "number_samples_promised": 3,
        "study_alias": "Test EBI",
        "study_description": "Study for testing EBI",
        "study_abstract": "Study for testing EBI",
        "emp_person_id": StudyPerson(2),
        "principal_investigator_id": StudyPerson(3),
        "lab_person_id": StudyPerson(1)
    }
    study = Study.create(User('*****@*****.**'), "Test EBI study", [1], info)
    # Sample metadata: three samples sharing location/taxonomy, distinct
    # collection timestamps and descriptions
    metadata_dict = {
        'Sample1': {
            'collection_timestamp': datetime(2015, 6, 1, 7, 0, 0),
            'physical_specimen_location': 'location1',
            'taxon_id': 9606,
            'scientific_name': 'h**o sapiens',
            'Description': 'Test Sample 1'
        },
        'Sample2': {
            'collection_timestamp': datetime(2015, 6, 2, 7, 0, 0),
            'physical_specimen_location': 'location1',
            'taxon_id': 9606,
            'scientific_name': 'h**o sapiens',
            'Description': 'Test Sample 2'
        },
        'Sample3': {
            'collection_timestamp': datetime(2015, 6, 3, 7, 0, 0),
            'physical_specimen_location': 'location1',
            'taxon_id': 9606,
            'scientific_name': 'h**o sapiens',
            'Description': 'Test Sample 3'
        }
    }
    # NOTE(review): unlike the sibling variant, no dtype=str here, so
    # numeric/datetime values keep their native dtypes — confirm the
    # templates accept that
    metadata = pd.DataFrame.from_dict(metadata_dict, orient='index')
    SampleTemplate.create(metadata, study)
    # Prep metadata: same primer/platform, unique barcode per sample
    metadata_dict = {
        'Sample1': {
            'primer': 'GTGCCAGCMGCCGCGGTAA',
            'barcode': 'CGTAGAGCTCTC',
            'center_name': 'KnightLab',
            'platform': 'ILLUMINA',
            'instrument_model': 'Illumina MiSeq',
            'library_construction_protocol': 'Protocol ABC',
            'experiment_design_description': "Random value 1"
        },
        'Sample2': {
            'primer': 'GTGCCAGCMGCCGCGGTAA',
            'barcode': 'CGTAGAGCTCTA',
            'center_name': 'KnightLab',
            'platform': 'ILLUMINA',
            'instrument_model': 'Illumina MiSeq',
            'library_construction_protocol': 'Protocol ABC',
            'experiment_design_description': "Random value 2"
        },
        'Sample3': {
            'primer': 'GTGCCAGCMGCCGCGGTAA',
            'barcode': 'CGTAGAGCTCTT',
            'center_name': 'KnightLab',
            'platform': 'ILLUMINA',
            'instrument_model': 'Illumina MiSeq',
            'library_construction_protocol': 'Protocol ABC',
            'experiment_design_description': "Random value 3"
        },
    }
    metadata = pd.DataFrame.from_dict(metadata_dict, orient='index')
    pt = PrepTemplate.create(metadata, study, "16S", 'Metagenomics')
    # Write the FASTA example for this study and convert it into an HDF5
    # demux file, which backs the Demultiplexed artifact
    fna_fp = join(self.temp_dir, 'seqs.fna')
    demux_fp = join(self.temp_dir, 'demux.seqs')
    with open(fna_fp, 'w') as f:
        f.write(FASTA_EXAMPLE_2.format(study.id))
    with File(demux_fp, 'w') as f:
        to_hdf5(fna_fp, f)
    ppd = Artifact.create([(demux_fp, 6)], "Demultiplexed",
                          prep_template=pt,
                          can_be_submitted_to_ebi=True,
                          can_be_submitted_to_vamps=True)
    return ppd
def _validate_demultiplexed(qclient, job_id, prep_info, files, out_dir):
    """Validate and fix a new 'Demultiplexed' artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    files : dict of {str: list of str}
        The files to add to the new artifact, keyed by filepath type
    out_dir : str
        The output directory

    Returns
    -------
    dict
        The results of the job
    """
    qclient.update_job_step(job_id, "Step 2: Validating 'Demultiplexed' files")
    # Reject any filepath type we don't know how to handle
    supported_fp_types = {'preprocessed_fasta', 'preprocessed_fastq',
                          'preprocessed_demux', 'log'}
    unsupported_fp_types = set(files) - supported_fp_types
    if unsupported_fp_types:
        error_msg = ("Filepath type(s) %s not supported by artifact type "
                     "Demultiplexed. Supported filepath types: %s"
                     % (', '.join(unsupported_fp_types),
                        ', '.join(sorted(supported_fp_types))))
        # (success, artifact_info, error_msg) tuple on failure
        return False, None, error_msg
    # At most one file of each type can be provided
    offending = set(fp_t for fp_t, fps in files.items() if len(fps) > 1)
    if offending:
        errors = ["%s (%d): %s" % (fp_t, len(files[fp_t]),
                                   ', '.join(files[fp_t]))
                  for fp_t in sorted(offending)]
        error_msg = ("Only one filepath of each file type is supported, "
                     "offending types:\n%s" % "; ".join(errors))
        return False, None, error_msg
    # Check which files we have available:
    fasta = (files['preprocessed_fasta'][0]
             if 'preprocessed_fasta' in files else None)
    fastq = (files['preprocessed_fastq'][0]
             if 'preprocessed_fastq' in files else None)
    demux = (files['preprocessed_demux'][0]
             if 'preprocessed_demux' in files else None)
    log = (files['log'][0] if 'log' in files else None)
    # Preference order: demux > fastq > fasta; whichever source we have,
    # we end up validating a demux file
    if demux:
        # If demux is available, use that one to perform the validation and
        # generate the fasta and fastq from it
        success, a_info, error_msg = _validate_demux_file(
            qclient, job_id, prep_info, out_dir, demux, log_fp=log)
    elif fastq:
        # Generate the demux file from the fastq
        demux = join(out_dir, "%s.demux" % splitext(basename(fastq))[0])
        with File(demux, "w") as f:
            to_hdf5(fastq, f)
        # Validate the demux, providing the original fastq
        success, a_info, error_msg = _validate_demux_file(
            qclient, job_id, prep_info, out_dir, demux, fastq_fp=fastq,
            log_fp=log)
    elif fasta:
        # Generate the demux file from the fasta
        demux = join(out_dir, "%s.demux" % splitext(basename(fasta))[0])
        with File(demux, "w") as f:
            to_hdf5(fasta, f)
        # Validate the demux, providing the original fasta
        success, a_info, error_msg = _validate_demux_file(
            qclient, job_id, prep_info, out_dir, demux, fasta_fp=fasta,
            log_fp=log)
    else:
        # No usable source file at all
        error_msg = ("Either a 'preprocessed_demux', 'preprocessed_fastq' or "
                     "'preprocessed_fasta' file should be provided.")
        return False, None, error_msg
    return success, a_info, error_msg