def test_add_raw_data(self):
    """Linking raw data to a study inserts rows into study_raw_data."""
    self._make_sandbox()
    study = Study.create(
        User('*****@*****.**'),
        'NOT Identification of the Microbiomes for Cannabis Soils',
        [1], self.info)
    study.add_raw_data([RawData(1), RawData(2)])
    rows = self.conn_handler.execute_fetchall(
        "SELECT * FROM qiita.study_raw_data WHERE study_id=%s",
        (study.id, ))
    self.assertEqual(rows, [[study.id, 1], [study.id, 2]])
def test_remove_filepath_errors(self):
    """remove_filepath rejects unknown, foreign and shared filepaths."""
    fp = join(self.db_test_raw_dir, '1_s_G1_L001_sequences.fastq.gz')
    with self.assertRaises(QiitaDBError):
        RawData(1).remove_filepath(fp)

    # filepath doesn't belong to that raw data
    with self.assertRaises(ValueError):
        RawData(2).remove_filepath(fp)

    # the raw data has been linked to more than 1 study so it can't be
    # unlinked
    Study(2).add_raw_data([RawData(2)])
    with self.assertRaises(QiitaDBError):
        RawData(2).remove_filepath(fp)
def test_add_raw_data_private(self):
    """Raw data cannot be attached once the study is private."""
    study = Study.create(
        User('*****@*****.**'),
        'NOT Identification of the Microbiomes for Cannabis Soils',
        [1], self.info)
    study.status = 'private'
    with self.assertRaises(QiitaDBStatusError):
        study.add_raw_data([RawData(2)])
def test_get_preprocess_fastq_cmd(self):
    """_get_preprocess_fastq_cmd builds the split_libraries_fastq call."""
    raw_data = RawData(1)
    params = PreprocessedIlluminaParams(1)
    prep_template = PrepTemplate(1)
    obs_cmd, obs_output_dir = _get_preprocess_fastq_cmd(
        raw_data, prep_template, params)

    get_raw_path = partial(join, self.db_dir, 'raw_data')
    seqs_fp = get_raw_path('1_s_G1_L001_sequences.fastq.gz')
    bc_fp = get_raw_path('1_s_G1_L001_sequences_barcodes.fastq.gz')

    exp_cmd_1 = ("split_libraries_fastq.py --store_demultiplexed_fastq -i "
                 "{} -b {} "
                 "-m ".format(seqs_fp, bc_fp))
    exp_cmd_2 = ("-o {0} --barcode_type golay_12 --max_bad_run_length 3 "
                 "--max_barcode_errors 1.5 "
                 "--min_per_read_length_fraction 0.75 "
                 "--phred_quality_threshold 3 "
                 "--sequence_max_n 0".format(obs_output_dir))

    # We are splitting the command into two parts because there is no way
    # that we can know the filepath of the mapping file. We thus split the
    # command on the mapping file path and we check that the two parts
    # of the command are correct
    obs_cmd_1 = obs_cmd[:len(exp_cmd_1)]
    obs_cmd_2 = obs_cmd[len(exp_cmd_1):].split(" ", 1)[1]

    self.assertEqual(obs_cmd_1, exp_cmd_1)
    self.assertEqual(obs_cmd_2, exp_cmd_2)
def add_files_to_raw_data(raw_data_id, filepaths):
    """Add files to raw data

    Needs to be dispatchable because it moves large files
    """
    RawData(raw_data_id).add_filepaths(filepaths)
def test_delete(self):
    """RawData.delete validates ids and links before deleting."""
    # the raw data doesn't exist
    with self.assertRaises(QiitaDBUnknownIDError):
        RawData.delete(1000, 1)

    # the raw data and the study id are not linked or
    # the study doesn't exist
    with self.assertRaises(QiitaDBError):
        RawData.delete(1, 1000)

    # the raw data has prep templates
    with self.assertRaises(QiitaDBError):
        RawData.delete(1, 1)

    # the raw data has linked files
    with self.assertRaises(QiitaDBError):
        RawData.delete(3, 1)

    # the raw data is linked to a study that has not prep templates
    Study(2).add_raw_data([RawData(1)])
    RawData.delete(1, 2)

    # delete raw data
    self.assertTrue(RawData.exists(2))
    RawData.delete(2, 1)
    self.assertFalse(RawData.exists(2))
def unlink_all_files(raw_data_id):
    """Removes all files from raw data

    Needs to be dispatchable because it does I/O and a lot of DB calls
    """
    RawData(raw_data_id).clear_filepaths()
def test_link_filepaths_status_setter(self):
    """_set_link_filepaths_status round-trips through every state."""
    rd = RawData(1)
    self.assertEqual(rd.link_filepaths_status, 'idle')
    # Same sequence of transitions as before, driven by a loop
    for state in ('linking', 'unlinking', 'failed: error'):
        rd._set_link_filepaths_status(state)
        self.assertEqual(rd.link_filepaths_status, state)
def remove_add_prep_template(self, fp_rpt, raw_data_id, study, data_type_id,
                             investigation_type):
    """add prep templates"""
    template = load_template_to_dataframe(fp_rpt)
    prep = PrepTemplate.create(template, RawData(raw_data_id), study,
                               _to_int(data_type_id),
                               investigation_type=investigation_type)
    pt_id = prep.id
    remove(fp_rpt)
    return pt_id
def test_get_preprocess_fasta_cmd_sff(self):
    """_get_preprocess_fasta_cmd chains process_sff, split_libraries and
    convert_fastaqual_fastq for SFF raw data."""
    raw_data = RawData(3)
    params = Preprocessed454Params(1)
    prep_template = PrepTemplate(1)
    obs_cmd, obs_output_dir = _get_preprocess_fasta_cmd(
        raw_data, prep_template, params)

    get_raw_path = partial(join, self.db_dir, 'raw_data')
    seqs_fp = [get_raw_path('preprocess_test1.sff'),
               get_raw_path('preprocess_test2.sff')]

    # One process_sff.py invocation per input sff file
    exp_cmd_1 = ' '.join(["process_sff.py",
                          "-i %s" % seqs_fp[0],
                          "-o %s" % obs_output_dir])
    exp_cmd_2 = ' '.join(["process_sff.py",
                          "-i %s" % seqs_fp[1],
                          "-o %s" % obs_output_dir])

    fasta_files = ','.join([join(obs_output_dir, "preprocess_test1.fna"),
                            join(obs_output_dir, "preprocess_test2.fna")])
    qual_files = ','.join([join(obs_output_dir, "preprocess_test1.qual"),
                           join(obs_output_dir, "preprocess_test2.qual")])
    exp_cmd_3a = ' '.join(["split_libraries.py",
                           "-f %s" % fasta_files])
    exp_cmd_3b = ' '.join(["-q %s" % qual_files,
                           "-d",
                           "-o %s" % obs_output_dir,
                           params.to_str()])
    exp_cmd_4 = ' '.join(["convert_fastaqual_fastq.py",
                          "-f %s/seqs.fna" % obs_output_dir,
                          "-q %s/seqs_filtered.qual" % obs_output_dir,
                          "-o %s" % obs_output_dir,
                          "-F"])

    obs_cmds = obs_cmd.split('; ')

    # We are splitting the command into two parts because there is no way
    # that we can know the filepath of the mapping file. We thus split the
    # command on the mapping file path and we check that the two parts
    # of the command are correct
    obs_cmd_3a, obs_cmd_3b_temp = obs_cmds[2].split(' -m ', 1)
    obs_cmd_3b = obs_cmd_3b_temp.split(' ', 1)[1]

    self.assertEqual(obs_cmds[0], exp_cmd_1)
    self.assertEqual(obs_cmds[1], exp_cmd_2)
    self.assertEqual(obs_cmd_3a, exp_cmd_3a)
    self.assertEqual(obs_cmd_3b, exp_cmd_3b)
    self.assertEqual(obs_cmds[3], exp_cmd_4)
def test_get_filepaths(self):
    """Correctly returns the filepaths to the raw files"""
    observed = RawData(1).get_filepaths()
    expected = [
        (join(self.db_test_raw_dir, '1_s_G1_L001_sequences.fastq.gz'),
         "raw_sequences"),
        (join(self.db_test_raw_dir,
              '1_s_G1_L001_sequences_barcodes.fastq.gz'),
         "raw_barcodes")]
    self.assertEqual(observed, expected)
def setUp(self):
    """Builds the prep-template metadata and a fresh RawData fixture."""
    # Three-sample metadata frame used to create prep templates
    metadata_dict = {
        'SKB8.640193': {'center_name': 'ANL',
                        'center_project_name': 'Test Project',
                        'ebi_submission_accession': None,
                        'EMP_status_id': 1,
                        'data_type_id': 2,
                        'str_column': 'Value for sample 1'},
        'SKD8.640184': {'center_name': 'ANL',
                        'center_project_name': 'Test Project',
                        'ebi_submission_accession': None,
                        'EMP_status_id': 1,
                        'data_type_id': 2,
                        'str_column': 'Value for sample 2'},
        'SKB7.640196': {'center_name': 'ANL',
                        'center_project_name': 'Test Project',
                        'ebi_submission_accession': None,
                        'EMP_status_id': 1,
                        'data_type_id': 2,
                        'str_column': 'Value for sample 3'}
        }
    self.metadata = pd.DataFrame.from_dict(metadata_dict, orient='index')
    self.test_raw_data = RawData(1)

    # Temporary seqs/barcodes files; only the paths are needed, so the
    # descriptors returned by mkstemp are closed immediately
    fd, seqs_fp = mkstemp(suffix='_seqs.fastq')
    close(fd)
    fd, barcodes_fp = mkstemp(suffix='_barcodes.fastq')
    close(fd)
    # (path, filepath_type_id) pairs; presumably 1 = raw_sequences and
    # 2 = raw_barcodes -- TODO confirm against the filepath_type table
    filepaths = [(seqs_fp, 1), (barcodes_fp, 2)]
    with open(seqs_fp, "w") as f:
        f.write("\n")
    with open(barcodes_fp, "w") as f:
        f.write("\n")
    self.new_raw_data = RawData.create(2, filepaths, [Study(1)])
    db_test_raw_dir = join(get_db_files_base_dir(), 'raw_data')
    db_seqs_fp = join(db_test_raw_dir, "3_%s" % basename(seqs_fp))
    db_barcodes_fp = join(db_test_raw_dir, "3_%s" % basename(barcodes_fp))
    # Copies moved into the DB dir; removed during tearDown
    self._clean_up_files = [db_seqs_fp, db_barcodes_fp]

    self.tester = PrepTemplate(1)
    self.exp_sample_ids = {
        'SKB1.640202', 'SKB2.640194', 'SKB3.640195', 'SKB4.640189',
        'SKB5.640181', 'SKB6.640176', 'SKB7.640196', 'SKB8.640193',
        'SKB9.640200', 'SKD1.640179', 'SKD2.640178', 'SKD3.640198',
        'SKD4.640185', 'SKD5.640186', 'SKD6.640190', 'SKD7.640191',
        'SKD8.640184', 'SKD9.640182', 'SKM1.640183', 'SKM2.640199',
        'SKM3.640197', 'SKM4.640180', 'SKM5.640177', 'SKM6.640187',
        'SKM7.640188', 'SKM8.640201', 'SKM9.640192'}
def get_raw_data_from_other_studies(user, study):
    """Retrieves a tuple of raw_data_id and the last study title for that
    raw_data
    """
    result = {}
    for study_id in user.user_studies:
        if study_id == study.id:
            # skip the study we are currently looking at
            continue
        for raw_data_id in Study(study_id).raw_data():
            last_study_id = RawData(raw_data_id).studies[-1]
            result[int(raw_data_id)] = Study(last_study_id).title
    return result
def create_raw_data(self, study, user, callback):
    """Adds a (new) raw data to the study

    Parameters
    ----------
    study : Study
        The current study object
    user : User
        The current user object
    callback : function
        The callback function to call with the results once the processing
        is done
    """
    msg = "Raw data successfully added"
    msg_level = "success"
    # BUG FIX: default the raw data id so every error path below can
    # safely reach the callback. Previously the "both arguments supplied"
    # branch and the RawData.create failure branch fell through to the
    # callback with rd_id unassigned, raising NameError.
    rd_id = None

    # Get the arguments needed to create a raw data object
    filetype = self.get_argument('filetype', None)
    previous_raw_data = self.get_argument('previous_raw_data', None)

    if filetype and previous_raw_data:
        # The user selected a filetype and an existing raw data
        msg = ("You can not specify both a new raw data and a previously "
               "used one")
        msg_level = "danger"
    elif filetype:
        # We are creating a new raw data object
        try:
            rd_id = RawData.create(filetype, [study]).id
        except (TypeError, QiitaDBColumnError, QiitaDBExecutionError,
                QiitaDBDuplicateError, IOError, ValueError, KeyError,
                CParserError) as e:
            msg = html_error_message % (
                "creating a new raw data object for study:",
                str(study.id), str(e))
            msg_level = "danger"
    elif previous_raw_data:
        # Link previously created raw data objects to this study
        previous_raw_data = previous_raw_data.split(',')
        raw_data = [RawData(rd) for rd in previous_raw_data]
        study.add_raw_data(raw_data)
        rd_id = raw_data[0].id
    else:
        # The user did not provide a filetype neither an existing raw data
        # If using the interface, we should never reach this if, but
        # better be safe than sorry
        msg = ("You should choose a filetype for a new raw data or "
               "choose a raw data previously used")
        msg_level = "danger"

    callback((msg, msg_level, 'raw_data_tab', rd_id, None))
def render(self, prep, study_id, is_editable, ena_terms,
           study_status, user_defined_terms):
    """Renders the prep template panel of the study description page."""
    # Check if the request came from a local source
    is_local_request = self._is_local()

    prep_id = prep.id
    data_type = prep.data_type()
    raw_data = RawData(prep.raw_data)
    filepaths = prep.get_filepaths()
    investigation_type = prep.investigation_type
    preprocessed_data = prep.preprocessed_data
    preprocessing_status = prep.preprocessing_status

    # Pick the parameter iterator matching the raw data filetype
    if raw_data.filetype in ('SFF', 'FASTA'):
        param_iter = Preprocessed454Params.iter()
    elif raw_data.filetype == 'FASTQ':
        param_iter = PreprocessedIlluminaParams.iter()
    else:
        raise ValueError("Don't know what to do but this exception will "
                         "never actually get shown anywhere because why "
                         "would you want to see tracebacks?")

    # Build (id, name, html description) tuples for the preprocessing
    # options dropdown
    preprocess_options = []
    for param in param_iter:
        text = ("<b>%s:</b> %s" % (k, v)
                for k, v in viewitems(param.values))
        preprocess_options.append((param.id,
                                   param.name,
                                   '<br>'.join(text)))

    # Unfortunately, both the prep template and the qiime mapping files
    # have the sample type. The way to differentiate them is if we have
    # the substring 'qiime' in the basename
    _fp_type = (lambda fp: "Qiime mapping"
                if 'qiime' in basename(fp) else "Prep template")
    filepaths = [(id_, fp, _fp_type(fp)) for id_, fp in filepaths]

    return self.render_string(
        "study_description_templates/prep_template_panel.html",
        prep_id=prep_id, data_type=data_type, filepaths=filepaths,
        investigation_type=investigation_type,
        preprocessed_data=preprocessed_data,
        preprocessing_status=preprocessing_status, study_id=study_id,
        is_local_request=is_local_request, is_editable=is_editable,
        ena_terms=ena_terms, study_status=study_status,
        user_defined_terms=user_defined_terms,
        preprocess_options=preprocess_options)
def remove_add_study_template(self, raw_data, study_id, fp_rsp):
    """Replace prep templates, raw data, and sample template with a new one
    """
    # NOTE(review): ``raw_data`` is invoked -- the caller apparently
    # passes a callable (e.g. a bound Study.raw_data) that yields raw data
    # ids; confirm this is intentional rather than a plain list.
    for rd in raw_data():
        rd = RawData(rd)
        # Drop every prep template attached to this raw data
        for pt in rd.prep_templates:
            if PrepTemplate.exists(pt):
                PrepTemplate.delete(pt)
    # Replace the sample template with the one loaded from fp_rsp
    if SampleTemplate.exists(study_id):
        SampleTemplate.delete(study_id)
    SampleTemplate.create(load_template_to_dataframe(fp_rsp),
                          Study(study_id))
    remove(fp_rsp)
def setUp(self):
    """Creates the filesystem and DB fixtures used by these tests."""
    self.raw_data = RawData(1)
    self.study = Study(1)
    self.params_table = "preprocessed_sequence_illumina_params"
    self.params_id = 1
    # Temporary sequence/quality files; mkstemp returns an open fd that is
    # closed immediately since only the path is needed
    fd, self.fna_fp = mkstemp(suffix='_seqs.fna')
    close(fd)
    fd, self.qual_fp = mkstemp(suffix='_seqs.qual')
    close(fd)
    # (path, filepath_type_id) pairs; presumably 4 = fasta and 5 = qual --
    # TODO confirm against the filepath_type table
    self.filepaths = [(self.fna_fp, 4), (self.qual_fp, 5)]
    self.db_test_ppd_dir = join(get_db_files_base_dir(),
                                'preprocessed_data')
    self.ebi_submission_accession = "EBI123456-A"
    self.ebi_study_accession = "EBI123456-B"

    with open(self.fna_fp, "w") as f:
        f.write("\n")
    with open(self.qual_fp, "w") as f:
        f.write("\n")

    self._clean_up_files = []
def post(self):
    """Submits a preprocessing job and renders the wait page."""
    study_id = int(self.get_argument('study_id'))
    prep_template_id = int(self.get_argument('prep_template_id'))
    raw_data = RawData(PrepTemplate(prep_template_id).raw_data)
    param_id = int(self.get_argument('preprocessing_parameters_id'))

    # Get the preprocessing parameters
    if raw_data.filetype == 'FASTQ':
        param_constructor = PreprocessedIlluminaParams
    elif raw_data.filetype in ('FASTA', 'SFF'):
        param_constructor = Preprocessed454Params
    else:
        raise ValueError('Unknown filetype')

    job_id = submit(self.current_user.id, preprocessor, study_id,
                    prep_template_id, param_id, param_constructor)

    self.render('compute_wait.html',
                job_id=job_id, title='Preprocessing',
                completion_redirect='/study/description/%d?top_tab='
                                    'raw_data_tab&sub_tab=%s&prep_tab=%s'
                                    % (study_id, raw_data.id,
                                       prep_template_id))
def get_raw_data(rdis):
    """Get all raw data objects from a list of raw_data_ids"""
    return list(map(RawData, rdis))
def test_studies(self):
    """Correctly returns the study ids"""
    self.assertEqual(RawData(1).studies, [1])
def test_get_qiime_minimal_mapping_multiple(self):
    """One minimal mapping file is written per distinct run_prefix."""
    # We need to create a prep template in which we have different run
    # prefix values, so we can test this case
    metadata_dict = {
        'SKB8.640193': {'center_name': 'ANL',
                        'center_project_name': 'Test Project',
                        'ebi_submission_accession': None,
                        'EMP_status': 'EMP',
                        'str_column': 'Value for sample 1',
                        'linkerprimersequence': 'GTGCCAGCMGCCGCGGTAA',
                        'barcodesequence': 'GTCCGCAAGTTA',
                        'run_prefix': "s_G1_L001_sequences",
                        'platform': 'ILLUMINA',
                        'library_construction_protocol': 'AAA',
                        'experiment_design_description': 'BBB'},
        'SKD8.640184': {'center_name': 'ANL',
                        'center_project_name': 'Test Project',
                        'ebi_submission_accession': None,
                        'EMP_status': 'EMP',
                        'str_column': 'Value for sample 2',
                        'linkerprimersequence': 'GTGCCAGCMGCCGCGGTAA',
                        'barcodesequence': 'CGTAGAGCTCTC',
                        'run_prefix': "s_G1_L001_sequences",
                        'platform': 'ILLUMINA',
                        'library_construction_protocol': 'AAA',
                        'experiment_design_description': 'BBB'},
        'SKB7.640196': {'center_name': 'ANL',
                        'center_project_name': 'Test Project',
                        'ebi_submission_accession': None,
                        'EMP_status': 'EMP',
                        'str_column': 'Value for sample 3',
                        'linkerprimersequence': 'GTGCCAGCMGCCGCGGTAA',
                        'barcodesequence': 'CCTCTGAGAGCT',
                        'run_prefix': "s_G1_L002_sequences",
                        'platform': 'ILLUMINA',
                        'library_construction_protocol': 'AAA',
                        'experiment_design_description': 'BBB'}
        }
    md_template = pd.DataFrame.from_dict(metadata_dict, orient='index')
    prep_template = PrepTemplate.create(md_template, RawData(2), Study(1),
                                        '16S')

    out_dir = mkdtemp()

    obs_fps = sorted(_get_qiime_minimal_mapping(prep_template, out_dir))
    # Two run prefixes above -> two minimal mapping files expected
    exp_fps = sorted([join(out_dir, 's_G1_L001_sequences_MMF.txt'),
                      join(out_dir, 's_G1_L002_sequences_MMF.txt')])

    # Check that the returned list is as expected
    self.assertEqual(obs_fps, exp_fps)
    # Check that the file exists
    for fp in exp_fps:
        self.assertTrue(exists(fp))
    # Check the contents of the file
    # NOTE(review): mode "U" (universal newlines) is deprecated and was
    # removed in Python 3.11 -- fine on the Python 2 stack this targets
    for fp, contents in zip(exp_fps, [EXP_PREP_1, EXP_PREP_2]):
        with open(fp, "U") as f:
            self.assertEqual(f.read(), contents)
def get_raw_data(self, rdis, callback):
    """Get all raw data objects from a list of raw_data_ids"""
    raw_data_objects = [RawData(raw_data_id) for raw_data_id in rdis]
    callback(raw_data_objects)
def test_data_type(self):
    """Correctly returns the data_type of raw_data"""
    self.assertEqual(RawData(1).data_type(), "18S")
def test_not_equal(self):
    """Objects wrapping different DB rows compare unequal"""
    other = RawData(2)
    self.assertNotEqual(self.tester, other)
def test_equal(self):
    """Objects wrapping the same DB row compare equal"""
    other = RawData(1)
    self.assertEqual(self.tester, other)
def test_init_error_inexistent(self):
    """Instantiating with an id that does not exist raises an error"""
    with self.assertRaises(QiitaDBUnknownIDError):
        RawData(10)
def setUp(self):
    # An actual subclass is needed in order to test the equality functions
    self.tester = RawData(1)
def test_data_type_id(self):
    """Correctly returns the data_type id of raw_data"""
    self.assertEqual(RawData(1).data_type(ret_id=True), 2)
def test_clear_filepaths_error(self):
    """clear_filepaths raises when the raw data cannot be unlinked"""
    with self.assertRaises(QiitaDBError):
        RawData(1).clear_filepaths()
def _construct_job_graph(self, study, prep_template, params):
    """Constructs the workflow graph to preprocess a study

    The steps performed to preprocess a study are:
    1) Execute split libraries
    2) Add the new preprocessed data to the DB

    Parameters
    ----------
    study : Study
        The study to preprocess
    prep_template : PrepTemplate
        The prep template to use for the preprocessing
    params : BaseParameters
        The parameters to use for preprocessing
    """
    self.prep_template = prep_template
    self._logger = stderr
    raw_data = RawData(prep_template.raw_data)

    # Change the prep_template preprocessing_status to 'preprocessing'
    self.prep_template.preprocessing_status = 'preprocessing'

    # STEP 1: Preprocess the study
    preprocess_node = "PREPROCESS"

    # Check the raw data filetype to know which command generator we
    # should use
    filetype = raw_data.filetype
    if filetype == "FASTQ":
        cmd_generator = _get_preprocess_fastq_cmd
        insert_preprocessed_data = _insert_preprocessed_data
    elif filetype in ('FASTA', 'SFF'):
        cmd_generator = _get_preprocess_fasta_cmd
        insert_preprocessed_data = _insert_preprocessed_data
    else:
        raise NotImplementedError(
            "Raw data %s cannot be preprocessed, filetype %s not supported"
            % (raw_data.id, filetype))

    # Generate the command
    cmd, output_dir = cmd_generator(raw_data, self.prep_template, params)
    self._job_graph.add_node(preprocess_node, func=system_call,
                             args=(cmd, ),
                             job_name="Construct preprocess command",
                             requires_deps=False)

    # This step is currently only for data types in which we need to store,
    # demultiplexed sequences. Since it is the only supported data type at
    # this point, it is ok to leave it here. However, as new data types
    # become available, we will need to think a better way of doing this.
    demux_node = "GEN_DEMUX_FILE"
    self._job_graph.add_node(demux_node,
                             func=generate_demux_file,
                             args=(output_dir, ),
                             job_name="Generated demux file",
                             requires_deps=False)
    self._job_graph.add_edge(preprocess_node, demux_node)

    # STEP 2: Add preprocessed data to DB
    insert_preprocessed_node = "INSERT_PREPROCESSED"
    self._job_graph.add_node(insert_preprocessed_node,
                             func=insert_preprocessed_data,
                             args=(study, params, self.prep_template,
                                   output_dir),
                             job_name="Store preprocessed data",
                             requires_deps=False)
    self._job_graph.add_edge(demux_node, insert_preprocessed_node)

    # Remove the intermediate output directory once the workflow finishes
    self._dirpaths_to_remove.append(output_dir)