def generate_demux_file(sl_out):
    """Creates the HDF5 demultiplexed file

    Parameters
    ----------
    sl_out : str
        Path to the output directory of split libraries

    Returns
    -------
    str
        The path of the demux file

    Raises
    ------
    ValueError
        If the split libraries output does not contain the demultiplexed
        fastq file, or if that file is empty
    """
    # join already returns a str; the previous str() wrap was redundant
    fastq_fp = join(sl_out, 'seqs.fastq')
    if not exists(fastq_fp):
        raise ValueError("The split libraries output directory does not "
                         "contain the demultiplexed fastq file.")
    elif stat(fastq_fp).st_size == 0:
        # The file exists but holds no sequences — demuxing produced nothing
        raise ValueError("No sequences were demuxed. Check your parameters.")

    demux_fp = join(sl_out, 'seqs.demux')
    with File(demux_fp, "w") as f:
        to_hdf5(fastq_fp, f)
    return demux_fp
def test_to_hdf5(self):
    """to_hdf5 partitions the input into per-sample HDF5 datasets."""
    with tempfile.NamedTemporaryFile('r+', suffix='.fna',
                                     delete=False) as fh:
        fh.write(seqdata)
    self.to_remove.append(fh.name)

    to_hdf5(fh.name, self.hdf5_file)

    # Each dataset path is compared against its expected contents
    expected = [
        ('a/sequence', np.array([b"x", b"xy", b"xyz"])),
        ('a/qual', np.array([[0, 0, 0], [0, 0, 0], [0, 0, 0]])),
        ('a/barcode/original', np.array([b"abc", b"aby", b"abz"])),
        ('a/barcode/corrected', np.array([b"abc", b"ybc", b"zbc"])),
        ('a/barcode/error', np.array([0, 2, 3])),
        ('b/sequence', np.array([b"xyz", b"abcd"])),
        ('b/qual', np.array([[0, 0, 0, 0], [0, 0, 0, 0]])),
        ('b/barcode/original', np.array([b"abx", b"abw"])),
        ('b/barcode/corrected', np.array([b"xbc", b"wbc"])),
        ('b/barcode/error', np.array([1, 4])),
    ]
    for path, exp in expected:
        npt.assert_equal(self.hdf5_file[path][:], exp)
def write_demux_files(self, prep_template, generate_hdf5=True):
    """Writes a demux test file to avoid duplication of code"""
    fna_fp = join(self.temp_dir, 'seqs.fna')
    demux_fp = join(self.temp_dir, 'demux.seqs')

    if not generate_hdf5:
        # An empty placeholder file is enough when HDF5 content is not needed
        with open(demux_fp, 'w') as fh:
            fh.write('')
    else:
        with open(fna_fp, 'w') as fh:
            fh.write(FASTA_EXAMPLE)
        with File(demux_fp, "w") as fh:
            to_hdf5(fna_fp, fh)

    filepaths = [(demux_fp, 6)]
    if prep_template.artifact is not None:
        # Attach the new artifact as a child of the prep template's artifact
        params = Parameters.from_default_params(
            DefaultParameters(1), {'input_data': prep_template.artifact.id})
        return Artifact.create(filepaths, "Demultiplexed",
                               parents=[prep_template.artifact],
                               processing_parameters=params)
    return Artifact.create(filepaths, "Demultiplexed",
                           prep_template=prep_template)
def test_to_per_sample_files(self):
    # Build a demux HDF5 file from a variable-length fastq, then check that
    # to_per_sample_files writes one file per sample in the requested format.
    with tempfile.NamedTemporaryFile('r+', suffix='.fq', delete=False) as f:
        f.write(fqdata_variable_length)
    self.to_remove.append(f.name)
    # Create an empty temp file solely to reserve a .demux path on disk
    with tempfile.NamedTemporaryFile('r+', suffix='.demux',
                                     delete=False) as demux_f:
        pass
    self.to_remove.append(demux_f.name)
    with h5py.File(demux_f.name, 'w') as demux:
        to_hdf5(f.name, demux)
    tmp_dir = tempfile.mkdtemp()
    self.to_remove.append(tmp_dir)
    path_builder = partial(os.path.join, tmp_dir)

    # Test to fastq
    to_per_sample_files(demux_f.name, out_dir=tmp_dir, n_jobs=1,
                        out_format='fastq')
    sample_a_path = path_builder("a.fastq")
    sample_b_path = path_builder("b.fastq")
    self.assertTrue(os.path.exists(sample_a_path))
    self.assertTrue(os.path.exists(sample_b_path))
    with open(sample_a_path, 'rb') as af:
        obs = af.read()
    self.assertEqual(
        obs, b'@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\nABC\n')
    with open(sample_b_path, 'rb') as bf:
        obs = bf.read()
    self.assertEqual(
        obs, b'@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDFG\n'
        b'@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n+\nDEF#G\n')

    # Test to fasta and parallel
    to_per_sample_files(demux_f.name, out_dir=tmp_dir, n_jobs=2,
                        out_format='fasta')
    sample_a_path = path_builder("a.fna")
    sample_b_path = path_builder("b.fna")
    self.assertTrue(os.path.exists(sample_a_path))
    self.assertTrue(os.path.exists(sample_b_path))
    with open(sample_a_path, 'rb') as af:
        obs = af.read()
    self.assertEqual(
        obs, b'>a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n')
    with open(sample_b_path, 'rb') as bf:
        obs = bf.read()
    self.assertEqual(
        obs, b'>b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n'
        b'>b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n')
def test_to_ascii_file(self):
    """to_ascii_file writes fastq/fasta output, optionally per-sample."""
    with tempfile.NamedTemporaryFile('r+', suffix='.fq', delete=False) as f:
        f.write(fqdata_variable_length)
    self.to_remove.append(f.name)
    # Create an empty temp file solely to reserve a .demux path on disk
    with tempfile.NamedTemporaryFile('r+', suffix='.demux',
                                     delete=False) as demux_f:
        pass
    self.to_remove.append(demux_f.name)
    # Open with 'w' (not 'r+'): the freshly created temp file is zero bytes
    # and h5py cannot open an empty file as an existing HDF5 file. This also
    # matches the sibling test test_to_per_sample_files.
    with h5py.File(demux_f.name, 'w') as demux:
        to_hdf5(f.name, demux)

    # Default output format: fastq
    with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                     delete=False) as obs_fq:
        pass
    self.to_remove.append(obs_fq.name)
    to_ascii_file(demux_f.name, obs_fq.name)
    with open(obs_fq.name, 'rb') as obs_f:
        obs = obs_f.read()
    exp = (b'@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\nABC\n'
           b'@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDFG\n'
           b'@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n+\nDEF#G\n')
    self.assertEqual(obs, exp)

    # Explicit fasta output
    with tempfile.NamedTemporaryFile('r+', suffix='.fa',
                                     delete=False) as obs_fa:
        pass
    self.to_remove.append(obs_fa.name)
    to_ascii_file(demux_f.name, obs_fa.name, out_format='fasta')
    with open(obs_fa.name, 'rb') as obs_f:
        obs = obs_f.read()
    exp = (b'>a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n'
           b'>b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n'
           b'>b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n')
    self.assertEqual(obs, exp)

    # Restricting the output to a subset of samples
    with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                     delete=False) as obs_fq:
        pass
    self.to_remove.append(obs_fq.name)
    to_ascii_file(demux_f.name, obs_fq.name, samples=['b'])
    with open(obs_fq.name, 'rb') as obs_f:
        obs = obs_f.read()
    exp = (b'@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDFG\n'
           b'@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n+\nDEF#G\n')
    self.assertEqual(obs, exp)
def test_get(self):
    """GET /ebi_submission/<id> returns 200 for a demux artifact."""
    # Locate the preprocessed demux filepath of artifact 2
    demux_fp = next(fp for _, fp, fp_type in Artifact(2).filepaths
                    if fp_type == 'preprocessed_demux')
    fd, fna_fp = mkstemp(suffix='_seqs.fna')
    close(fd)
    self._clean_up_files.extend([fna_fp, demux_fp])

    # Rebuild the demux file from a minimal single-sequence fasta
    with open(fna_fp, 'w') as fh:
        fh.write('>a_1 X orig_bc=X new_bc=X bc_diffs=0\nCCC')
    with File(demux_fp, "w") as fh:
        to_hdf5(fna_fp, fh)

    BaseHandler.get_current_user = Mock(return_value=User("*****@*****.**"))
    response = self.get("/ebi_submission/2")
    self.assertEqual(response.code, 200)
def test_to_ascii(self):
    """to_ascii yields one fastq record per demuxed sequence."""
    with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                     delete=False) as fh:
        fh.write(fqdata)
    self.to_remove.append(fh.name)
    to_hdf5(fh.name, self.hdf5_file)

    expected = [b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\nABC\n",
                b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDFG\n",
                b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDEF\n"]
    observed = list(to_ascii(self.hdf5_file, samples=[b'a', b'b']))
    self.assertEqual(observed, expected)
def _generate_files(self, sample_names):
    """Creates the fastq test file and its demux counterpart

    Parameters
    ----------
    sample_names : dict of {str: str}
        The values used to format FASTQ_SEQS

    Returns
    -------
    tuple of (str, str, str)
        The demux filepath, the fastq filepath and a fresh output directory
    """
    fd, fastq_fp = mkstemp(suffix=".fastq")
    close(fd)
    with open(fastq_fp, 'w') as f:
        f.write(FASTQ_SEQS.format(**sample_names))
    demux_fp = "%s.demux" % fastq_fp
    # Open in write mode: the demux file does not exist yet, and h5py's
    # default (read) mode cannot create it. The sibling implementation of
    # this helper already passes 'w'.
    with File(demux_fp, 'w') as f:
        to_hdf5(fastq_fp, f)
    out_dir = mkdtemp()
    self._clean_up_files.extend([fastq_fp, demux_fp, out_dir])
    return demux_fp, fastq_fp, out_dir
def _generate_files(self, sample_names):
    """Writes the fastq test data and builds the matching demux file.

    Returns the demux filepath, the fastq filepath and a fresh output
    directory; all three are registered for clean-up.
    """
    fd, fastq_fp = mkstemp(suffix=".fastq")
    close(fd)
    with open(fastq_fp, 'w') as fh:
        fh.write(FASTQ_SEQS.format(**sample_names))

    demux_fp = "%s.demux" % fastq_fp
    with File(demux_fp, 'w') as demux:
        to_hdf5(fastq_fp, demux)

    out_dir = mkdtemp()
    for path in (fastq_fp, demux_fp, out_dir):
        self._clean_up_files.append(path)
    return demux_fp, fastq_fp, out_dir
def test_to_ascii_fasta(self):
    """to_ascii emits fasta records when the demux was built from fasta."""
    with tempfile.NamedTemporaryFile('r+', suffix='.fna',
                                     delete=False) as fh:
        fh.write(seqdata)
    self.to_remove.append(fh.name)
    to_hdf5(fh.name, self.hdf5_file)

    expected = [b">a_0 orig_bc=abc new_bc=abc bc_diffs=0\nx\n",
                b">a_1 orig_bc=aby new_bc=ybc bc_diffs=2\nxy\n",
                b">a_2 orig_bc=abz new_bc=zbc bc_diffs=3\nxyz\n",
                b">b_0 orig_bc=abx new_bc=xbc bc_diffs=1\nxyz\n",
                b">b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nabcd\n"]
    observed = list(to_ascii(self.hdf5_file, samples=[b'a', b'b']))
    self.assertEqual(observed, expected)
def test_to_per_sample_ascii(self):
    """to_per_sample_ascii groups the fastq records by sample."""
    with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                     delete=False) as fh:
        fh.write(fqdata)
    self.to_remove.append(fh.name)
    to_hdf5(fh.name, self.hdf5_file)

    expected = [(b'a', [(b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\n"
                         b"ABC\n")]),
                (b'b', [(b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                         b"DFG\n"),
                        (b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                         b"DEF\n")])]
    # Materialize each sample's record generator before comparing
    observed = [(sample, list(records))
                for sample, records in to_per_sample_ascii(self.hdf5_file)]
    self.assertEqual(observed, expected)
def test_submit_to_EBI(self):
    # setting up test: build a demux artifact attached to prep template 1
    fna_fp = join(self.temp_dir, 'seqs.fna')
    demux_fp = join(self.temp_dir, 'demux.seqs')
    with open(fna_fp, 'w') as f:
        f.write(FASTA_EXAMPLE)
    with File(demux_fp, "w") as f:
        to_hdf5(fna_fp, f)
    pt = PrepTemplate(1)
    params = Parameters.from_default_params(DefaultParameters(1),
                                            {'input_data': pt.artifact.id})
    artifact = Artifact.create([(demux_fp, 6)], "Demultiplexed",
                               parents=[pt.artifact],
                               processing_parameters=params)

    # submit job
    job = self._create_job('submit_to_EBI', {
        'artifact': artifact.id,
        'submission_type': 'VALIDATE'
    })
    job._set_status('in_construction')
    job.submit()

    # wait for the job to fail, and check that the status is submitting
    # NOTE(review): busy-wait poll; only checks the 'submitting' status on
    # the first iteration
    checked_submitting = True
    while job.status != 'error':
        if checked_submitting:
            self.assertEqual('submitting',
                             artifact.study.ebi_submission_status)
            checked_submitting = False
    # once it fails wait for a few to check status again
    sleep(5)
    exp = 'Some artifact submissions failed: %d' % artifact.id
    obs = artifact.study.ebi_submission_status
    self.assertEqual(obs, exp)
    # make sure that the error is correct, we have 2 options: with Aspera
    # credentials the submission reaches the sample-level error, without
    # them the ascp transfer itself fails
    if environ.get('ASPERA_SCP_PASS', '') != '':
        self.assertIn('1.SKM2.640199', job.log.msg)
    else:
        self.assertIn('ASCP Error:', job.log.msg)
    # wait for everything to finish to avoid DB deadlocks
    sleep(5)
def test_fetch_qual_length_bug(self):
    # fetch was not trimming qual to the length of the sequence resulting
    # in qual scores for positions beyond the length of the sequence.
    with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                     delete=False) as fh:
        fh.write(fqdata_variable_length)
    self.to_remove.append(fh.name)
    to_hdf5(fh.name, self.hdf5_file)

    expected = [(b'a', [(b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\n"
                         b"ABC\n")]),
                (b'b', [(b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                         b"DFG\n"),
                        (b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n+\n"
                         b"DEF#G\n")])]
    observed = [(sample, list(records))
                for sample, records in to_per_sample_ascii(self.hdf5_file)]
    self.assertEqual(observed, expected)
def test_submit_to_EBI(self):
    # setting up test: build a demux artifact attached to prep template 1
    fna_fp = join(self.temp_dir, 'seqs.fna')
    demux_fp = join(self.temp_dir, 'demux.seqs')
    with open(fna_fp, 'w') as f:
        f.write(FASTA_EXAMPLE)
    with File(demux_fp, "w") as f:
        to_hdf5(fna_fp, f)
    pt = PrepTemplate(1)
    params = Parameters.from_default_params(
        DefaultParameters(1), {'input_data': pt.artifact.id})
    artifact = Artifact.create(
        [(demux_fp, 6)], "Demultiplexed", parents=[pt.artifact],
        processing_parameters=params)

    # submit job
    job = self._create_job('submit_to_EBI', {
        'artifact': artifact.id,
        'submission_type': 'VALIDATE'})
    job._set_status('in_construction')
    job.submit()

    # wait for the job to fail, and check that the status is submitting
    # NOTE(review): busy-wait poll; only checks the 'submitting' status on
    # the first iteration
    checked_submitting = True
    while job.status != 'error':
        if checked_submitting:
            self.assertEqual('submitting',
                             artifact.study.ebi_submission_status)
            checked_submitting = False
    # once it fails wait for a few to check status again
    sleep(5)
    exp = 'Some artifact submissions failed: %d' % artifact.id
    obs = artifact.study.ebi_submission_status
    self.assertEqual(obs, exp)
    # make sure that the error is correct, we have 2 options: with Aspera
    # credentials the submission reaches the sample-level error, without
    # them the ascp transfer itself fails
    if environ.get('ASPERA_SCP_PASS', '') != '':
        self.assertIn('1.SKM2.640199', job.log.msg)
    else:
        self.assertIn('ASCP Error:', job.log.msg)
    # wait for everything to finish to avoid DB deadlocks
    sleep(5)
def write_demux_files(self, prep_template, generate_hdf5=True):
    """Writes a demux test file to avoid duplication of code"""
    fna_fp = join(self.temp_dir, "seqs.fna")
    demux_fp = join(self.temp_dir, "demux.seqs")

    if generate_hdf5:
        # Real HDF5 content built from the example fasta
        with open(fna_fp, "w") as fh:
            fh.write(FASTA_EXAMPLE)
        with File(demux_fp, "w") as fh:
            to_hdf5(fna_fp, fh)
    else:
        # A zero-length placeholder is sufficient
        with open(demux_fp, "w") as fh:
            fh.write("")

    if prep_template.artifact is None:
        return Artifact.create([(demux_fp, 6)], "Demultiplexed",
                               prep_template=prep_template)

    # Attach the new artifact as a child of the prep template's artifact
    params = Parameters.from_default_params(
        DefaultParameters(1), {"input_data": prep_template.artifact.id})
    return Artifact.create([(demux_fp, 6)], "Demultiplexed",
                           parents=[prep_template.artifact],
                           processing_parameters=params)
def _validate_demultiplexed(qclient, job_id, prep_info, files, out_dir):
    """Validate and fix a new 'Demultiplexed' artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    files : dict of {str: list of str}
        The files to add to the new artifact, keyed by filepath type
    out_dir : str
        The output directory

    Returns
    -------
    tuple of (bool, object, str)
        Success flag, the artifact information returned by
        _validate_demux_file (None on validation failure), and an error
        message
    """
    qclient.update_job_step(job_id, "Step 2: Validating 'Demultiplexed' files")

    supported_fp_types = {
        'preprocessed_fasta', 'preprocessed_fastq', 'preprocessed_demux',
        'log'
    }
    unsupported_fp_types = set(files) - supported_fp_types
    if unsupported_fp_types:
        error_msg = ("Filepath type(s) %s not supported by artifact type "
                     "Demultiplexed. Supported filepath types: %s"
                     % (', '.join(unsupported_fp_types), ', '.join(
                         sorted(supported_fp_types))))
        return False, None, error_msg

    # At most one file of each type can be provided
    offending = set(fp_t for fp_t, fps in files.items() if len(fps) > 1)
    if offending:
        errors = [
            "%s (%d): %s" % (fp_t, len(files[fp_t]), ', '.join(files[fp_t]))
            for fp_t in sorted(offending)
        ]
        error_msg = ("Only one filepath of each file type is supported, "
                     "offending types:\n%s" % "; ".join(errors))
        return False, None, error_msg

    # Check which files we have available:
    fasta = (files['preprocessed_fasta'][0]
             if 'preprocessed_fasta' in files else None)
    fastq = (files['preprocessed_fastq'][0]
             if 'preprocessed_fastq' in files else None)
    demux = (files['preprocessed_demux'][0]
             if 'preprocessed_demux' in files else None)
    log = (files['log'][0] if 'log' in files else None)
    if demux:
        # If demux is available, use that one to perform the validation and
        # generate the fasta and fastq from it
        success, a_info, error_msg = _validate_demux_file(qclient, job_id,
                                                          prep_info, out_dir,
                                                          demux, log_fp=log)
    elif fastq:
        # Generate the demux file from the fastq
        demux = join(out_dir, "%s.demux" % splitext(basename(fastq))[0])
        with File(demux, 'w') as f:
            # to_hdf5 expects a list
            to_hdf5([fastq], f)
        # Validate the demux, providing the original fastq
        success, a_info, error_msg = _validate_demux_file(qclient, job_id,
                                                          prep_info, out_dir,
                                                          demux,
                                                          fastq_fp=fastq,
                                                          log_fp=log)
    elif fasta:
        # Generate the demux file from the fasta
        demux = join(out_dir, "%s.demux" % splitext(basename(fasta))[0])
        with File(demux, 'w') as f:
            # to_hdf5 expects a list
            to_hdf5([fasta], f)
        # Validate the demux, providing the original fasta
        success, a_info, error_msg = _validate_demux_file(qclient, job_id,
                                                          prep_info, out_dir,
                                                          demux,
                                                          fasta_fp=fasta,
                                                          log_fp=log)
    else:
        error_msg = ("Either a 'preprocessed_demux', 'preprocessed_fastq' or "
                     "'preprocessed_fasta' file should be provided.")
        return False, None, error_msg

    return success, a_info, error_msg
def generate_new_study_with_preprocessed_data(self):
    """Creates a new study up to the processed data for testing

    Returns
    -------
    The 'Demultiplexed' Artifact created for the new study
    """
    # Minimal study info required by Study.create
    info = {
        "timeseries_type_id": 1,
        "metadata_complete": True,
        "mixs_compliant": True,
        "number_samples_collected": 3,
        "number_samples_promised": 3,
        "study_alias": "Test EBI",
        "study_description": "Study for testing EBI",
        "study_abstract": "Study for testing EBI",
        "emp_person_id": StudyPerson(2),
        "principal_investigator_id": StudyPerson(3),
        "lab_person_id": StudyPerson(1)
    }
    study = Study.create(User('*****@*****.**'), "Test EBI study", info)
    # Sample template: three samples sharing location/taxon
    metadata_dict = {
        'Sample1': {
            'collection_timestamp': datetime(2015, 6, 1, 7, 0, 0),
            'physical_specimen_location': 'location1',
            'taxon_id': 9606,
            'scientific_name': 'h**o sapiens',
            'Description': 'Test Sample 1'
        },
        'Sample2': {
            'collection_timestamp': datetime(2015, 6, 2, 7, 0, 0),
            'physical_specimen_location': 'location1',
            'taxon_id': 9606,
            'scientific_name': 'h**o sapiens',
            'Description': 'Test Sample 2'
        },
        'Sample3': {
            'collection_timestamp': datetime(2015, 6, 3, 7, 0, 0),
            'physical_specimen_location': 'location1',
            'taxon_id': 9606,
            'scientific_name': 'h**o sapiens',
            'Description': 'Test Sample 3'
        }
    }
    metadata = pd.DataFrame.from_dict(metadata_dict, orient='index',
                                      dtype=str)
    SampleTemplate.create(metadata, study)
    # Prep template: same primer, distinct barcodes per sample
    metadata_dict = {
        'Sample1': {
            'primer': 'GTGCCAGCMGCCGCGGTAA',
            'barcode': 'CGTAGAGCTCTC',
            'center_name': 'KnightLab',
            'platform': 'ILLUMINA',
            'instrument_model': 'Illumina MiSeq',
            'library_construction_protocol': 'Protocol ABC',
            'experiment_design_description': "Random value 1"
        },
        'Sample2': {
            'primer': 'GTGCCAGCMGCCGCGGTAA',
            'barcode': 'CGTAGAGCTCTA',
            'center_name': 'KnightLab',
            'platform': 'ILLUMINA',
            'instrument_model': 'Illumina MiSeq',
            'library_construction_protocol': 'Protocol ABC',
            'experiment_design_description': "Random value 2"
        },
        'Sample3': {
            'primer': 'GTGCCAGCMGCCGCGGTAA',
            'barcode': 'CGTAGAGCTCTT',
            'center_name': 'KnightLab',
            'platform': 'ILLUMINA',
            'instrument_model': 'Illumina MiSeq',
            'library_construction_protocol': 'Protocol ABC',
            'experiment_design_description': "Random value 3"
        },
    }
    metadata = pd.DataFrame.from_dict(metadata_dict, orient='index',
                                      dtype=str)
    pt = PrepTemplate.create(metadata, study, "16S", 'Metagenomics')
    # Build the demux file from the example fasta and attach it as a
    # 'Demultiplexed' artifact of the new prep template
    fna_fp = join(self.temp_dir, 'seqs.fna')
    demux_fp = join(self.temp_dir, 'demux.seqs')
    with open(fna_fp, 'w') as f:
        f.write(FASTA_EXAMPLE_2.format(study.id))
    with File(demux_fp, 'w') as f:
        to_hdf5(fna_fp, f)
    ppd = Artifact.create([(demux_fp, 6)], "Demultiplexed", prep_template=pt)
    return ppd
def generate_new_study_with_preprocessed_data(self):
    """Creates a new study up to the processed data for testing

    Returns
    -------
    The 'Demultiplexed' Artifact created for the new study
    """
    # Minimal study info required by Study.create
    info = {
        "timeseries_type_id": 1,
        "metadata_complete": True,
        "mixs_compliant": True,
        "number_samples_collected": 3,
        "number_samples_promised": 3,
        "study_alias": "Test EBI",
        "study_description": "Study for testing EBI",
        "study_abstract": "Study for testing EBI",
        "emp_person_id": StudyPerson(2),
        "principal_investigator_id": StudyPerson(3),
        "lab_person_id": StudyPerson(1)
    }
    study = Study.create(User('*****@*****.**'), "Test EBI study", info)
    # Sample template: three samples sharing location/taxon
    metadata_dict = {
        'Sample1': {'collection_timestamp': datetime(2015, 6, 1, 7, 0, 0),
                    'physical_specimen_location': 'location1',
                    'taxon_id': 9606,
                    'scientific_name': 'h**o sapiens',
                    'Description': 'Test Sample 1'},
        'Sample2': {'collection_timestamp': datetime(2015, 6, 2, 7, 0, 0),
                    'physical_specimen_location': 'location1',
                    'taxon_id': 9606,
                    'scientific_name': 'h**o sapiens',
                    'Description': 'Test Sample 2'},
        'Sample3': {'collection_timestamp': datetime(2015, 6, 3, 7, 0, 0),
                    'physical_specimen_location': 'location1',
                    'taxon_id': 9606,
                    'scientific_name': 'h**o sapiens',
                    'Description': 'Test Sample 3'}
    }
    metadata = pd.DataFrame.from_dict(metadata_dict, orient='index',
                                      dtype=str)
    SampleTemplate.create(metadata, study)
    # Prep template: same primer, distinct barcodes per sample
    metadata_dict = {
        'Sample1': {'primer': 'GTGCCAGCMGCCGCGGTAA',
                    'barcode': 'CGTAGAGCTCTC',
                    'center_name': 'KnightLab',
                    'platform': 'ILLUMINA',
                    'instrument_model': 'Illumina MiSeq',
                    'library_construction_protocol': 'Protocol ABC',
                    'experiment_design_description': "Random value 1"},
        'Sample2': {'primer': 'GTGCCAGCMGCCGCGGTAA',
                    'barcode': 'CGTAGAGCTCTA',
                    'center_name': 'KnightLab',
                    'platform': 'ILLUMINA',
                    'instrument_model': 'Illumina MiSeq',
                    'library_construction_protocol': 'Protocol ABC',
                    'experiment_design_description': "Random value 2"},
        'Sample3': {'primer': 'GTGCCAGCMGCCGCGGTAA',
                    'barcode': 'CGTAGAGCTCTT',
                    'center_name': 'KnightLab',
                    'platform': 'ILLUMINA',
                    'instrument_model': 'Illumina MiSeq',
                    'library_construction_protocol': 'Protocol ABC',
                    'experiment_design_description': "Random value 3"},
    }
    metadata = pd.DataFrame.from_dict(metadata_dict, orient='index',
                                      dtype=str)
    pt = PrepTemplate.create(metadata, study, "16S", 'Metagenomics')
    # Build the demux file from the example fasta and attach it as a
    # 'Demultiplexed' artifact of the new prep template
    fna_fp = join(self.temp_dir, 'seqs.fna')
    demux_fp = join(self.temp_dir, 'demux.seqs')
    with open(fna_fp, 'w') as f:
        f.write(FASTA_EXAMPLE_2.format(study.id))
    with File(demux_fp, 'w') as f:
        to_hdf5(fna_fp, f)
    ppd = Artifact.create(
        [(demux_fp, 6)], "Demultiplexed", prep_template=pt)
    return ppd
def _validate_demultiplexed(qclient, job_id, prep_info, files, out_dir):
    """Validate and fix a new 'Demultiplexed' artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    files : dict of {str: list of str}
        The files to add to the new artifact, keyed by filepath type
    out_dir : str
        The output directory

    Returns
    -------
    tuple of (bool, object, str)
        Success flag, the artifact information returned by
        _validate_demux_file (None on validation failure), and an error
        message
    """
    qclient.update_job_step(job_id, "Step 2: Validating 'Demultiplexed' files")
    supported_fp_types = {'preprocessed_fasta', 'preprocessed_fastq',
                          'preprocessed_demux', 'log'}
    unsupported_fp_types = set(files) - supported_fp_types
    if unsupported_fp_types:
        error_msg = ("Filepath type(s) %s not supported by artifact type "
                     "Demultiplexed. Supported filepath types: %s"
                     % (', '.join(unsupported_fp_types),
                        ', '.join(sorted(supported_fp_types))))
        return False, None, error_msg

    # At most one file of each type can be provided
    offending = set(fp_t for fp_t, fps in files.items() if len(fps) > 1)
    if offending:
        errors = ["%s (%d): %s" % (fp_t, len(files[fp_t]),
                                   ', '.join(files[fp_t]))
                  for fp_t in sorted(offending)]
        error_msg = ("Only one filepath of each file type is supported, "
                     "offending types:\n%s" % "; ".join(errors))
        return False, None, error_msg

    # Check which files we have available:
    fasta = (files['preprocessed_fasta'][0]
             if 'preprocessed_fasta' in files else None)
    fastq = (files['preprocessed_fastq'][0]
             if 'preprocessed_fastq' in files else None)
    demux = (files['preprocessed_demux'][0]
             if 'preprocessed_demux' in files else None)
    log = (files['log'][0] if 'log' in files else None)
    if demux:
        # If demux is available, use that one to perform the validation and
        # generate the fasta and fastq from it
        success, a_info, error_msg = _validate_demux_file(
            qclient, job_id, prep_info, out_dir, demux, log_fp=log)
    elif fastq:
        # Generate the demux file from the fastq
        demux = join(out_dir, "%s.demux" % splitext(basename(fastq))[0])
        with open_file(demux, "w") as f:
            to_hdf5(fastq, f)
        # Validate the demux, providing the original fastq
        success, a_info, error_msg = _validate_demux_file(
            qclient, job_id, prep_info, out_dir, demux, fastq_fp=fastq,
            log_fp=log)
    elif fasta:
        # Generate the demux file from the fasta
        demux = join(out_dir, "%s.demux" % splitext(basename(fasta))[0])
        with open_file(demux, "w") as f:
            to_hdf5(fasta, f)
        # Validate the demux, providing the original fasta
        success, a_info, error_msg = _validate_demux_file(
            qclient, job_id, prep_info, out_dir, demux, fasta_fp=fasta,
            log_fp=log)
    else:
        error_msg = ("Either a 'preprocessed_demux', 'preprocessed_fastq' or "
                     "'preprocessed_fasta' file should be provided.")
        return False, None, error_msg

    return success, a_info, error_msg
def generate_new_study_with_preprocessed_data(self):
    """Creates a new study up to the processed data for testing

    Returns
    -------
    The 'Demultiplexed' Artifact created for the new study
    """
    # Minimal study info required by Study.create
    info = {
        "timeseries_type_id": 1,
        "metadata_complete": True,
        "mixs_compliant": True,
        "number_samples_collected": 3,
        "number_samples_promised": 3,
        "study_alias": "Test EBI",
        "study_description": "Study for testing EBI",
        "study_abstract": "Study for testing EBI",
        "emp_person_id": StudyPerson(2),
        "principal_investigator_id": StudyPerson(3),
        "lab_person_id": StudyPerson(1),
    }
    # NOTE(review): the extra positional [1] matches a Study.create
    # signature taking an additional list argument — presumably efo ids;
    # confirm against the Study API version in use
    study = Study.create(User("*****@*****.**"), "Test EBI study", [1], info)
    # Sample template: three samples sharing location/taxon
    metadata_dict = {
        "Sample1": {
            "collection_timestamp": datetime(2015, 6, 1, 7, 0, 0),
            "physical_specimen_location": "location1",
            "taxon_id": 9606,
            "scientific_name": "h**o sapiens",
            "Description": "Test Sample 1",
        },
        "Sample2": {
            "collection_timestamp": datetime(2015, 6, 2, 7, 0, 0),
            "physical_specimen_location": "location1",
            "taxon_id": 9606,
            "scientific_name": "h**o sapiens",
            "Description": "Test Sample 2",
        },
        "Sample3": {
            "collection_timestamp": datetime(2015, 6, 3, 7, 0, 0),
            "physical_specimen_location": "location1",
            "taxon_id": 9606,
            "scientific_name": "h**o sapiens",
            "Description": "Test Sample 3",
        },
    }
    metadata = pd.DataFrame.from_dict(metadata_dict, orient="index",
                                      dtype=str)
    SampleTemplate.create(metadata, study)
    # Prep template: same primer, distinct barcodes per sample
    metadata_dict = {
        "Sample1": {
            "primer": "GTGCCAGCMGCCGCGGTAA",
            "barcode": "CGTAGAGCTCTC",
            "center_name": "KnightLab",
            "platform": "ILLUMINA",
            "instrument_model": "Illumina MiSeq",
            "library_construction_protocol": "Protocol ABC",
            "experiment_design_description": "Random value 1",
        },
        "Sample2": {
            "primer": "GTGCCAGCMGCCGCGGTAA",
            "barcode": "CGTAGAGCTCTA",
            "center_name": "KnightLab",
            "platform": "ILLUMINA",
            "instrument_model": "Illumina MiSeq",
            "library_construction_protocol": "Protocol ABC",
            "experiment_design_description": "Random value 2",
        },
        "Sample3": {
            "primer": "GTGCCAGCMGCCGCGGTAA",
            "barcode": "CGTAGAGCTCTT",
            "center_name": "KnightLab",
            "platform": "ILLUMINA",
            "instrument_model": "Illumina MiSeq",
            "library_construction_protocol": "Protocol ABC",
            "experiment_design_description": "Random value 3",
        },
    }
    metadata = pd.DataFrame.from_dict(metadata_dict, orient="index",
                                      dtype=str)
    pt = PrepTemplate.create(metadata, study, "16S", "Metagenomics")
    # Build the demux file from the example fasta and attach it as a
    # 'Demultiplexed' artifact of the new prep template
    fna_fp = join(self.temp_dir, "seqs.fna")
    demux_fp = join(self.temp_dir, "demux.seqs")
    with open(fna_fp, "w") as f:
        f.write(FASTA_EXAMPLE_2.format(study.id))
    with File(demux_fp, "w") as f:
        to_hdf5(fna_fp, f)
    ppd = Artifact.create([(demux_fp, 6)], "Demultiplexed", prep_template=pt)
    return ppd