def delete_prep_template(self, study, user, callback):
    """Delete the selected prep template

    Parameters
    ----------
    study : Study
        The current study object
    user : User
        The current user object
    callback : function
        The callback function to call with the results once the
        processing is done
    """
    prep_template_id = int(self.get_argument('prep_template_id'))
    prep_id = PrepTemplate(prep_template_id).raw_data

    try:
        PrepTemplate.delete(prep_template_id)
        msg = ("Prep template %d has been deleted" % prep_template_id)
        msg_level = "success"
        prep_id = None
    except Exception as e:
        msg = ("Couldn't remove prep template: %s" % str(e))
        msg_level = "danger"

    callback((msg, msg_level, 'raw_data_tab', prep_id, None))
def setUp(self):
    fd, self.seqs_fp = mkstemp(suffix='_seqs.fastq')
    close(fd)
    fd, self.barcodes_fp = mkstemp(suffix='_barcodes.fastq')
    close(fd)
    self.filetype = 2
    self.filepaths = [(self.seqs_fp, 1), (self.barcodes_fp, 2)]
    _, self.db_test_raw_dir = get_mountpoint('raw_data')[0]

    with open(self.seqs_fp, "w") as f:
        f.write("\n")
    with open(self.barcodes_fp, "w") as f:
        f.write("\n")
    self._clean_up_files = []

    # Create some new PrepTemplates
    metadata_dict = {
        'SKB8.640193': {'center_name': 'ANL',
                        'primer': 'GTGCCAGCMGCCGCGGTAA',
                        'barcode': 'GTCCGCAAGTTA',
                        'run_prefix': "s_G1_L001_sequences",
                        'platform': 'ILLUMINA',
                        'library_construction_protocol': 'AAAA',
                        'experiment_design_description': 'BBBB'}}
    metadata = pd.DataFrame.from_dict(metadata_dict, orient='index')
    self.pt1 = PrepTemplate.create(metadata, Study(1), "16S")
    self.pt2 = PrepTemplate.create(metadata, Study(1), "18S")
    self.prep_templates = [self.pt1, self.pt2]
def delete_prep_template(self, study, user, callback):
    """Delete the selected prep template

    Parameters
    ----------
    study : Study
        The current study object
    user : User
        The current user object
    callback : function
        The callback function to call with the results once the
        processing is done
    """
    prep_template_id = int(self.get_argument('prep_template_id'))
    prep_id = PrepTemplate(prep_template_id).raw_data

    try:
        PrepTemplate.delete(prep_template_id)
        msg = ("Prep template %d has been deleted" % prep_template_id)
        msg_level = "success"
        prep_id = None
    except Exception as e:
        msg = ("Couldn't remove prep template: %s" % str(e))
        msg_level = "danger"

    callback((msg, msg_level, 'prep_template_tab', prep_id, None))
def test_get_qiime_minimal_mapping_single_reverse_primer(self):
    conn_handler = SQLConnectionHandler()
    sql = """INSERT INTO qiita.prep_columns
                (prep_template_id, column_name, column_type)
             VALUES (1, 'reverselinkerprimer', 'varchar');
             ALTER TABLE qiita.prep_1 ADD COLUMN reverselinkerprimer varchar;
             DELETE FROM qiita.prep_columns
             WHERE prep_template_id = 1 AND column_name = 'run_prefix';
             ALTER TABLE qiita.prep_1 DROP COLUMN run_prefix;
             UPDATE qiita.prep_1 SET reverselinkerprimer = %s"""
    conn_handler.execute(sql, ('GTGCCAGCM',))
    prep_template = PrepTemplate(1)
    prep_template.generate_files()
    out_dir = mkdtemp()

    obs_fps = _get_qiime_minimal_mapping(prep_template, out_dir)
    exp_fps = [join(out_dir, 'prep_1_MMF.txt')]

    # Check that the returned list is as expected
    self.assertEqual(obs_fps, exp_fps)
    # Check that the file exists
    self.assertTrue(exists(exp_fps[0]))
    # Check the contents of the file
    with open(exp_fps[0], "U") as f:
        self.assertEqual(f.read(), EXP_PREP_RLP)
def test_dataframe_from_template(self):
    template = PrepTemplate(1)
    obs = template.to_dataframe()

    # 27 samples
    self.assertEqual(len(obs), 27)
    # assertTrue silently treats a second positional argument as a failure
    # message, so these membership checks are written as equality assertions
    self.assertEqual(set(obs.index), {
        u'SKB1.640202', u'SKB2.640194', u'SKB3.640195', u'SKB4.640189',
        u'SKB5.640181', u'SKB6.640176', u'SKB7.640196', u'SKB8.640193',
        u'SKB9.640200', u'SKD1.640179', u'SKD2.640178', u'SKD3.640198',
        u'SKD4.640185', u'SKD5.640186', u'SKD6.640190', u'SKD7.640191',
        u'SKD8.640184', u'SKD9.640182', u'SKM1.640183', u'SKM2.640199',
        u'SKM3.640197', u'SKM4.640180', u'SKM5.640177', u'SKM6.640187',
        u'SKM7.640188', u'SKM8.640201', u'SKM9.640192'})
    self.assertEqual(set(obs.columns), {
        u'tot_org_carb', u'common_name', u'has_extracted_data',
        u'required_sample_info_status', u'water_content_soil',
        u'env_feature', u'assigned_from_geo', u'altitude', u'env_biome',
        u'texture', u'has_physical_specimen', u'description_duplicate',
        u'physical_location', u'latitude', u'ph', u'host_taxid',
        u'elevation', u'description', u'collection_timestamp',
        u'taxon_id', u'samp_salinity', u'host_subject_id', u'sample_type',
        u'season_environment', u'temp', u'country', u'longitude',
        u'tot_nitro', u'depth', u'anonymized_name', u'target_subfragment',
        u'sample_center', u'samp_size', u'run_date', u'experiment_center',
        u'pcr_primers', u'center_name', u'barcodesequence', u'run_center',
        u'run_prefix', u'library_construction_protocol', u'emp_status',
        u'linkerprimersequence', u'experiment_design_description',
        u'target_gene', u'center_project_name', u'illumina_technology',
        u'sequencing_meth', u'platform', u'experiment_title',
        u'study_center'})
def add_raw_data(self, study, user, callback):
    """Adds an existing raw data to the study

    Parameters
    ----------
    study : Study
        The current study object
    user : User
        The current user object
    callback : function
        The callback function to call with the results once the
        processing is done
    """
    msg = "Raw data successfully added"
    msg_level = "success"

    # Get the arguments to add the raw data
    pt_id = self.get_argument('prep_template_id')
    raw_data_id = self.get_argument('raw_data_id')

    prep_template = PrepTemplate(pt_id)
    raw_data = RawData(raw_data_id)

    try:
        prep_template.raw_data = raw_data
    except QiitaDBError as e:
        msg = html_error_message % ("adding the raw data",
                                    str(raw_data_id), str(e))
        msg = convert_text_html(msg)
        # flag the message as an error so it is styled accordingly
        msg_level = "danger"

    callback((msg, msg_level, 'prep_template_tab', pt_id, None))
def update_prep_template(self, study, user, callback):
    """Update a prep template from the POST method

    Parameters
    ----------
    study : Study
        The current study object
    user : User
        The current user object
    callback : function
        The callback function to call with the results once the
        processing is done

    Raises
    ------
    HTTPError
        If the prep template file does not exist
    """
    # If we are on this function, the arguments "prep_template_id" and
    # "update_prep_template_file" must be defined. If not, let tornado
    # raise its error
    pt_id = int(self.get_argument('prep_template_id'))
    prep_template = self.get_argument('update_prep_template_file')

    # Define here the message and message level in case of success
    msg = "The prep template '%s' has been updated" % prep_template
    msg_level = "success"

    # Get the uploads folder
    _, base_fp = get_mountpoint("uploads")[0]
    # Get the path of the prep template in the uploads folder
    fp = join(base_fp, str(study.id), prep_template)

    if not exists(fp):
        # The file does not exist, fail nicely
        # Using 400 because we want the user to get the error in the GUI
        raise HTTPError(400, "This file doesn't exist: %s" % fp)

    try:
        with warnings.catch_warnings(record=True) as warns:
            pt = PrepTemplate(pt_id)
            pt.update(load_template_to_dataframe(fp))
            remove(fp)

            # join all the warning messages into one. Note that this info
            # will be ignored if an exception is raised
            if warns:
                msg = '; '.join([str(w.message) for w in warns])
                msg_level = 'warning'
    except (TypeError, QiitaDBColumnError, QiitaDBExecutionError,
            QiitaDBDuplicateError, IOError, ValueError, KeyError,
            CParserError, QiitaDBDuplicateHeaderError, QiitaDBError) as e:
        # Some error occurred while processing the sample template
        # Show the error to the user so they can fix the template
        msg = html_error_message % ('updating the prep template:',
                                    basename(fp), str(e))
        msg = convert_text_html(msg)
        msg_level = "danger"

    callback((msg, msg_level, 'prep_template_tab', pt_id, None))
def submit_VAMPS(preprocessed_data_id):
    """Submit preprocessed data to VAMPS

    Parameters
    ----------
    preprocessed_data_id : int
        The preprocessed data id
    """
    preprocessed_data = PreprocessedData(preprocessed_data_id)
    study = Study(preprocessed_data.study)
    sample_template = SampleTemplate(study.sample_template)
    prep_template = PrepTemplate(preprocessed_data.prep_template)

    status = preprocessed_data.submitted_to_vamps_status()
    if status in ('submitting', 'success'):
        raise ValueError("Cannot resubmit! Current status is: %s" % status)

    preprocessed_data.update_vamps_status('submitting')

    # Generating a tgz
    targz_folder = mkdtemp(prefix=qiita_config.working_dir)
    targz_fp = join(targz_folder, '%d_%d_%d.tgz' % (study.id,
                                                    prep_template.id,
                                                    preprocessed_data.id))
    targz = taropen(targz_fp, mode='w:gz')

    # adding sample/prep
    samp_fp = join(targz_folder, 'sample_metadata.txt')
    sample_template.to_file(samp_fp)
    targz.add(samp_fp, arcname='sample_metadata.txt')
    prep_fp = join(targz_folder, 'prep_metadata.txt')
    prep_template.to_file(prep_fp)
    targz.add(prep_fp, arcname='prep_metadata.txt')

    # adding preprocessed data
    for _, fp, fp_type in preprocessed_data.get_filepaths():
        if fp_type == 'preprocessed_fasta':
            targz.add(fp, arcname='preprocessed_fasta.fna')

    targz.close()

    # submitting
    cmd = ("curl -F user=%s -F pass='%s' -F uploadFile=@%s -F "
           "press=UploadFile %s" % (qiita_config.vamps_user,
                                    qiita_config.vamps_pass,
                                    targz_fp,
                                    qiita_config.vamps_url))
    obs, _, _ = system_call(cmd)

    exp = ("<html>\n<head>\n<title>Process Uploaded File</title>\n</head>\n"
           "<body>\n</body>\n</html>")

    if obs != exp:
        preprocessed_data.update_vamps_status('failure')
        return False
    else:
        preprocessed_data.update_vamps_status('success')
        return True
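# A hedged usage sketch for submit_VAMPS above; the preprocessed data id is
# hypothetical. The function signals failure through its boolean return
# value (after marking the data 'failure'), not through an exception, so
# callers should branch on the result:
#
#     if not submit_VAMPS(3):
#         print("VAMPS rejected preprocessed data 3; status set to "
#               "'failure' so it can be retried later")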
def test_get_sample_names_by_run_prefix(self):
    metadata_dict = {
        'SKB8.640193': {'run_prefix': "s1", 'primer': 'A',
                        'barcode': 'A', 'center_name': 'ANL',
                        'platform': 'ILLUMINA',
                        'instrument_model': 'Illumina MiSeq',
                        'library_construction_protocol': 'A',
                        'experiment_design_description': 'A'},
        'SKD8.640184': {'run_prefix': "s2", 'primer': 'A',
                        'barcode': 'A', 'center_name': 'ANL',
                        'platform': 'ILLUMINA',
                        'instrument_model': 'Illumina MiSeq',
                        'library_construction_protocol': 'A',
                        'experiment_design_description': 'A'},
        'SKB7.640196': {'run_prefix': "s3", 'primer': 'A',
                        'barcode': 'A', 'center_name': 'ANL',
                        'platform': 'ILLUMINA',
                        'instrument_model': 'Illumina MiSeq',
                        'library_construction_protocol': 'A',
                        'experiment_design_description': 'A'}}
    md_template = pd.DataFrame.from_dict(metadata_dict, orient='index')
    prep_template = PrepTemplate.create(md_template, Study(1), '16S')
    for _, fp in prep_template.get_filepaths():
        self.files_to_remove.append(fp)

    obs = _get_sample_names_by_run_prefix(prep_template)
    exp = {'s3': '1.SKB7.640196', 's2': '1.SKD8.640184',
           's1': '1.SKB8.640193'}
    self.assertEqual(obs, exp)

    # This should raise an error
    metadata_dict = {
        'SKB8.640193': {'run_prefix': "s1", 'primer': 'A',
                        'barcode': 'A', 'center_name': 'ANL',
                        'platform': 'ILLUMINA',
                        'instrument_model': 'Illumina MiSeq',
                        'library_construction_protocol': 'A',
                        'experiment_design_description': 'A'},
        'SKD8.640184': {'run_prefix': "s1", 'primer': 'A',
                        'barcode': 'A', 'center_name': 'ANL',
                        'platform': 'ILLUMINA',
                        'instrument_model': 'Illumina MiSeq',
                        'library_construction_protocol': 'A',
                        'experiment_design_description': 'A'},
        'SKB7.640196': {'run_prefix': "s3", 'primer': 'A',
                        'barcode': 'A', 'center_name': 'ANL',
                        'platform': 'ILLUMINA',
                        'instrument_model': 'Illumina MiSeq',
                        'library_construction_protocol': 'A',
                        'experiment_design_description': 'A'}}
    md_template = pd.DataFrame.from_dict(metadata_dict, orient='index')
    prep_template = PrepTemplate.create(md_template, Study(1), '16S')
    for _, fp in prep_template.get_filepaths():
        self.files_to_remove.append(fp)

    with self.assertRaises(ValueError):
        _get_sample_names_by_run_prefix(prep_template)
def get_prep_templates(self, raw_data, callback):
    """Get all prep templates for a list of raw data objects"""
    d = {}
    for rd in raw_data:
        # We need this so PrepTemplate(p) doesn't fail if that prep template
        # doesn't exist but the raw data still has the row: #554
        prep_templates = sorted(rd.prep_templates)
        d[rd.id] = [PrepTemplate(p) for p in prep_templates
                    if PrepTemplate.exists(p)]
    callback(d)
def remove_add_prep_template(self, fp_rpt, raw_data_id, study,
                             data_type_id, investigation_type, callback):
    """add prep templates"""
    PrepTemplate.create(load_template_to_dataframe(fp_rpt),
                        RawData(raw_data_id), study, int(data_type_id),
                        investigation_type=investigation_type)
    remove(fp_rpt)
    callback()
def setUp(self):
    metadata_dict = {
        'SKB8.640193': {'center_name': 'ANL',
                        'center_project_name': 'Test Project',
                        'ebi_submission_accession': None,
                        'EMP_status_id': 1,
                        'data_type_id': 2,
                        'str_column': 'Value for sample 1'},
        'SKD8.640184': {'center_name': 'ANL',
                        'center_project_name': 'Test Project',
                        'ebi_submission_accession': None,
                        'EMP_status_id': 1,
                        'data_type_id': 2,
                        'str_column': 'Value for sample 2'},
        'SKB7.640196': {'center_name': 'ANL',
                        'center_project_name': 'Test Project',
                        'ebi_submission_accession': None,
                        'EMP_status_id': 1,
                        'data_type_id': 2,
                        'str_column': 'Value for sample 3'}
    }
    self.metadata = pd.DataFrame.from_dict(metadata_dict, orient='index')

    self.test_raw_data = RawData(1)

    fd, seqs_fp = mkstemp(suffix='_seqs.fastq')
    close(fd)
    fd, barcodes_fp = mkstemp(suffix='_barcodes.fastq')
    close(fd)
    filepaths = [(seqs_fp, 1), (barcodes_fp, 2)]
    with open(seqs_fp, "w") as f:
        f.write("\n")
    with open(barcodes_fp, "w") as f:
        f.write("\n")
    self.new_raw_data = RawData.create(2, filepaths, [Study(1)])
    db_test_raw_dir = join(get_db_files_base_dir(), 'raw_data')
    db_seqs_fp = join(db_test_raw_dir, "3_%s" % basename(seqs_fp))
    db_barcodes_fp = join(db_test_raw_dir, "3_%s" % basename(barcodes_fp))
    self._clean_up_files = [db_seqs_fp, db_barcodes_fp]

    self.tester = PrepTemplate(1)
    self.exp_sample_ids = {
        'SKB1.640202', 'SKB2.640194', 'SKB3.640195', 'SKB4.640189',
        'SKB5.640181', 'SKB6.640176', 'SKB7.640196', 'SKB8.640193',
        'SKB9.640200', 'SKD1.640179', 'SKD2.640178', 'SKD3.640198',
        'SKD4.640185', 'SKD5.640186', 'SKD6.640190', 'SKD7.640191',
        'SKD8.640184', 'SKD9.640182', 'SKM1.640183', 'SKM2.640199',
        'SKM3.640197', 'SKM4.640180', 'SKM5.640177', 'SKM6.640187',
        'SKM7.640188', 'SKM8.640201', 'SKM9.640192'}
def remove_add_study_template(self, raw_data, study_id, fp_rsp):
    """Replace prep templates, raw data, and sample template with a new one
    """
    for rd in raw_data():
        rd = RawData(rd)
        for pt in rd.prep_templates:
            if PrepTemplate.exists(pt):
                PrepTemplate.delete(pt)

    if SampleTemplate.exists(study_id):
        SampleTemplate.delete(study_id)
    SampleTemplate.create(load_template_to_dataframe(fp_rsp),
                          Study(study_id))
    remove(fp_rsp)
def update_investigation_type(self, study, user, callback):
    """Updates the investigation type of a prep template

    Parameters
    ----------
    study : Study
        The current study object
    user : User
        The current user object
    callback : function
        The callback function to call with the results once the
        processing is done
    """
    msg = "investigation type successfully updated"
    msg_level = "success"

    ppd_id = int(self.get_argument('ppd_id'))

    prep_id = self.get_argument('prep_id')
    edit_investigation_type = self.get_argument('edit-investigation-type',
                                                None)
    edit_user_defined_investigation_type = self.get_argument(
        'edit-user-defined-investigation-type', None)
    edit_new_investigation_type = self.get_argument(
        'edit-new-investigation-type', None)

    pt = PrepTemplate(prep_id)

    investigation_type = self._process_investigation_type(
        edit_investigation_type, edit_user_defined_investigation_type,
        edit_new_investigation_type)

    try:
        pt.investigation_type = investigation_type
    except QiitaDBColumnError as e:
        msg = html_error_message % (", invalid investigation type: ",
                                    investigation_type, str(e))
        msg = convert_text_html(msg)
        msg_level = "danger"

    if ppd_id == 0:
        top_tab = "prep_template_tab"
        sub_tab = prep_id
        prep_tab = None
    else:
        top_tab = "preprocessed_data_tab"
        sub_tab = ppd_id
        prep_tab = None

    callback((msg, msg_level, top_tab, sub_tab, prep_tab))
def update_investigation_type(self, study, user, callback):
    """Updates the investigation type of a prep template

    Parameters
    ----------
    study : Study
        The current study object
    user : User
        The current user object
    callback : function
        The callback function to call with the results once the
        processing is done
    """
    msg = "investigation type successfully updated"
    msg_level = "success"

    ppd_id = int(self.get_argument('ppd_id'))

    prep_id = self.get_argument('prep_id')
    edit_investigation_type = self.get_argument('edit-investigation-type',
                                                None)
    edit_user_defined_investigation_type = self.get_argument(
        'edit-user-defined-investigation-type', None)
    edit_new_investigation_type = self.get_argument(
        'edit-new-investigation-type', None)

    pt = PrepTemplate(prep_id)
    rd_id = pt.raw_data

    investigation_type = self._process_investigation_type(
        edit_investigation_type, edit_user_defined_investigation_type,
        edit_new_investigation_type)

    try:
        pt.investigation_type = investigation_type
    except QiitaDBColumnError as e:
        msg = html_error_message % (", invalid investigation type: ",
                                    investigation_type, str(e))
        msg_level = "danger"

    if ppd_id == 0:
        top_tab = "raw_data_tab"
        sub_tab = rd_id
        prep_tab = prep_id
    else:
        top_tab = "preprocessed_data_tab"
        sub_tab = ppd_id
        prep_tab = None

    callback((msg, msg_level, top_tab, sub_tab, prep_tab))
def preprocessor(study_id, prep_template_id, param_id, param_constructor):
    """Dispatch for preprocessor work"""
    study = Study(study_id)
    prep_template = PrepTemplate(prep_template_id)
    params = param_constructor(param_id)

    sp = StudyPreprocessor()
    try:
        preprocess_out = sp(study, prep_template, params)
    except Exception as e:
        # format_exception_only expects the exception type and value
        error_msg = ''.join(format_exception_only(type(e), e))
        prep_template.preprocessing_status = "failed: %s" % error_msg
        preprocess_out = None

    return preprocess_out
def remove_add_prep_template(self, fp_rpt, study, data_type_id,
                             investigation_type):
    """add prep templates"""
    pt_id = PrepTemplate.create(load_template_to_dataframe(fp_rpt),
                                study, _to_int(data_type_id),
                                investigation_type=investigation_type).id
    remove(fp_rpt)
    return pt_id
def test_create(self):
    """Creates a new PrepTemplate"""
    pt = PrepTemplate.create(self.metadata, self.new_raw_data)
    # The returned object has the correct id
    self.assertEqual(pt.id, 3)

    # The relevant rows to common_prep_info have been added.
    obs = self.conn_handler.execute_fetchall(
        "SELECT * FROM qiita.common_prep_info WHERE raw_data_id=3")
    # raw_data_id, sample_id, center_name, center_project_name,
    # ebi_submission_accession, ebi_study_accession, emp_status_id,
    # data_type_id
    exp = [[3, 'SKB8.640193', 'ANL', 'Test Project', None, None, 1, 2],
           [3, 'SKD8.640184', 'ANL', 'Test Project', None, None, 1, 2],
           [3, 'SKB7.640196', 'ANL', 'Test Project', None, None, 1, 2]]
    self.assertEqual(sorted(obs), sorted(exp))

    # The relevant rows have been added to the raw_data_prep_columns
    obs = self.conn_handler.execute_fetchall(
        "SELECT * FROM qiita.raw_data_prep_columns WHERE raw_data_id=3")
    # raw_data_id, column_name, column_type
    exp = [[3, "str_column", "varchar"]]
    self.assertEqual(obs, exp)

    # The new table exists
    self.assertTrue(exists_table("prep_3", self.conn_handler))

    # The new table hosts the correct values
    obs = self.conn_handler.execute_fetchall(
        "SELECT * FROM qiita.prep_3")
    # sample_id, str_column
    exp = [['SKB8.640193', "Value for sample 1"],
           ['SKD8.640184', "Value for sample 2"],
           ['SKB7.640196', "Value for sample 3"]]
    self.assertEqual(sorted(obs), sorted(exp))
def test_insert_preprocessed_data(self):
    study = Study(1)
    params = PreprocessedIlluminaParams(1)
    prep_template = PrepTemplate(1)
    prep_out_dir = mkdtemp()
    self.dirs_to_remove.append(prep_out_dir)
    path_builder = partial(join, prep_out_dir)
    db_path_builder = partial(join, join(self.db_dir, "preprocessed_data"))

    file_suffixes = ['seqs.fna', 'seqs.fastq', 'seqs.demux',
                     'split_library_log.txt']
    db_files = []
    for f_suff in file_suffixes:
        fp = path_builder(f_suff)
        with open(fp, 'w') as f:
            f.write("\n")
        self.files_to_remove.append(fp)
        db_files.append(db_path_builder("3_%s" % f_suff))
    self.files_to_remove.extend(db_files)

    _insert_preprocessed_data(study, params, prep_template, prep_out_dir)

    # Check that the files have been copied
    for fp in db_files:
        self.assertTrue(exists(fp))

    # Check that a new preprocessed data has been created
    self.assertTrue(self.conn_handler.execute_fetchone(
        "SELECT EXISTS(SELECT * FROM qiita.preprocessed_data WHERE "
        "preprocessed_data_id=%s)", (3,))[0])
def test_get_preprocess_fastq_cmd(self):
    raw_data = RawData(1)
    params = PreprocessedIlluminaParams(1)
    prep_template = PrepTemplate(1)
    obs_cmd, obs_output_dir = _get_preprocess_fastq_cmd(
        raw_data, prep_template, params)

    get_raw_path = partial(join, self.db_dir, 'raw_data')
    seqs_fp = get_raw_path('1_s_G1_L001_sequences.fastq.gz')
    bc_fp = get_raw_path('1_s_G1_L001_sequences_barcodes.fastq.gz')

    exp_cmd_1 = ("split_libraries_fastq.py --store_demultiplexed_fastq -i "
                 "{} -b {} "
                 "-m ".format(seqs_fp, bc_fp))
    exp_cmd_2 = ("-o {0} --barcode_type golay_12 --max_bad_run_length 3 "
                 "--max_barcode_errors 1.5 "
                 "--min_per_read_length_fraction 0.75 "
                 "--phred_quality_threshold 3 "
                 "--sequence_max_n 0".format(obs_output_dir))

    # We are splitting the command into two parts because there is no way
    # that we can know the filepath of the mapping file. We thus split the
    # command on the mapping file path and check that the two parts of the
    # command are correct
    obs_cmd_1 = obs_cmd[:len(exp_cmd_1)]
    obs_cmd_2 = obs_cmd[len(exp_cmd_1):].split(" ", 1)[1]

    self.assertEqual(obs_cmd_1, exp_cmd_1)
    self.assertEqual(obs_cmd_2, exp_cmd_2)
def test_create(self):
    """Creates a new PrepTemplate"""
    pt = PrepTemplate.create(self.metadata, self.new_raw_data)
    # The returned object has the correct id
    self.assertEqual(pt.id, 3)

    # The relevant rows to common_prep_info have been added.
    obs = self.conn_handler.execute_fetchall(
        "SELECT * FROM qiita.common_prep_info WHERE raw_data_id=3")
    # raw_data_id, sample_id, center_name, center_project_name,
    # emp_status_id, data_type_id
    exp = [[3, 'SKB8.640193', 'ANL', 'Test Project', 1, 2],
           [3, 'SKD8.640184', 'ANL', 'Test Project', 1, 2],
           [3, 'SKB7.640196', 'ANL', 'Test Project', 1, 2]]
    self.assertEqual(sorted(obs), sorted(exp))

    # The relevant rows have been added to the raw_data_prep_columns
    obs = self.conn_handler.execute_fetchall(
        "SELECT * FROM qiita.raw_data_prep_columns WHERE raw_data_id=3")
    # raw_data_id, column_name, column_type
    exp = [[3, 'str_column', 'varchar'],
           [3, 'ebi_submission_accession', 'varchar']]
    self.assertEqual(obs, exp)

    # The new table exists
    self.assertTrue(exists_table("prep_3", self.conn_handler))

    # The new table hosts the correct values
    obs = self.conn_handler.execute_fetchall("SELECT * FROM qiita.prep_3")
    # sample_id, str_column
    exp = [['SKB8.640193', "Value for sample 1", None],
           ['SKD8.640184', "Value for sample 2", None],
           ['SKB7.640196', "Value for sample 3", None]]
    self.assertEqual(sorted(obs), sorted(exp))
def test_get_preprocess_fastq_cmd_per_sample_FASTQ_failure(self):
    metadata_dict = {
        'SKB8.640193': {'run_prefix': "sample1_failure", 'primer': 'A',
                        'barcode': 'A', 'center_name': 'ANL',
                        'platform': 'ILLUMINA',
                        'library_construction_protocol': 'A',
                        'experiment_design_description': 'A'}}
    md_template = pd.DataFrame.from_dict(metadata_dict, orient='index')
    prep_template = PrepTemplate.create(md_template, Study(1), '16S')

    # This part should fail
    fp1 = self.path_builder('sample1_failure.fastq')
    with open(fp1, 'w') as f:
        f.write('\n')
    self.files_to_remove.append(fp1)
    fp2 = self.path_builder('sample1_failure.barcodes.fastq.gz')
    with open(fp2, 'w') as f:
        f.write('\n')
    self.files_to_remove.append(fp2)
    forward_filepath_id = convert_to_id('raw_forward_seqs',
                                        'filepath_type')
    barcode_filepath_id = convert_to_id('raw_barcodes', 'filepath_type')
    fps = [(fp1, forward_filepath_id), (fp2, barcode_filepath_id)]

    filetype_id = get_filetypes()['per_sample_FASTQ']
    raw_data = RawData.create(filetype_id, [prep_template], fps)
    params = [p for p in list(PreprocessedIlluminaParams.iter())
              if p.name == 'per sample FASTQ defaults'][0]

    with self.assertRaises(ValueError):
        _get_preprocess_fastq_cmd(raw_data, prep_template, params)
def test_get_qiime_minimal_mapping_multiple(self):
    # We need to create a prep template in which we have different run
    # prefix values, so we can test this case
    metadata_dict = {
        'SKB8.640193': {'center_name': 'ANL',
                        'center_project_name': 'Test Project',
                        'ebi_submission_accession': None,
                        'EMP_status': 'EMP',
                        'str_column': 'Value for sample 1',
                        'primer': 'GTGCCAGCMGCCGCGGTAA',
                        'barcode': 'GTCCGCAAGTTA',
                        'run_prefix': "s_G1_L001_sequences",
                        'platform': 'ILLUMINA',
                        'library_construction_protocol': 'AAA',
                        'experiment_design_description': 'BBB'},
        'SKD8.640184': {'center_name': 'ANL',
                        'center_project_name': 'Test Project',
                        'ebi_submission_accession': None,
                        'EMP_status': 'EMP',
                        'str_column': 'Value for sample 2',
                        'primer': 'GTGCCAGCMGCCGCGGTAA',
                        'barcode': 'CGTAGAGCTCTC',
                        'run_prefix': "s_G1_L001_sequences",
                        'platform': 'ILLUMINA',
                        'library_construction_protocol': 'AAA',
                        'experiment_design_description': 'BBB'},
        'SKB7.640196': {'center_name': 'ANL',
                        'center_project_name': 'Test Project',
                        'ebi_submission_accession': None,
                        'EMP_status': 'EMP',
                        'str_column': 'Value for sample 3',
                        'primer': 'GTGCCAGCMGCCGCGGTAA',
                        'barcode': 'CCTCTGAGAGCT',
                        'run_prefix': "s_G1_L002_sequences",
                        'platform': 'ILLUMINA',
                        'library_construction_protocol': 'AAA',
                        'experiment_design_description': 'BBB'}
    }
    md_template = pd.DataFrame.from_dict(metadata_dict, orient='index')
    prep_template = PrepTemplate.create(md_template, Study(1), '16S')
    for _, fp in prep_template.get_filepaths():
        self.files_to_remove.append(fp)
    out_dir = mkdtemp()

    obs_fps = sorted(_get_qiime_minimal_mapping(prep_template, out_dir))
    exp_fps = sorted([join(out_dir, 's_G1_L001_sequences_MMF.txt'),
                      join(out_dir, 's_G1_L002_sequences_MMF.txt')])

    # Check that the returned list is as expected
    self.assertEqual(obs_fps, exp_fps)
    # Check that the file exists
    for fp in exp_fps:
        self.assertTrue(exists(fp))
    # Check the contents of the file
    for fp, contents in zip(exp_fps, [EXP_PREP_1, EXP_PREP_2]):
        with open(fp, "U") as f:
            self.assertEqual(f.read(), contents)
def test_get_preprocess_fasta_cmd_sff(self):
    raw_data = RawData(3)
    params = Preprocessed454Params(1)
    prep_template = PrepTemplate(1)
    obs_cmd, obs_output_dir = _get_preprocess_fasta_cmd(
        raw_data, prep_template, params)

    get_raw_path = partial(join, self.db_dir, 'raw_data')
    seqs_fp = [get_raw_path('preprocess_test1.sff'),
               get_raw_path('preprocess_test2.sff')]

    exp_cmd_1 = ' '.join(["process_sff.py",
                          "-i %s" % seqs_fp[0],
                          "-o %s" % obs_output_dir])
    exp_cmd_2 = ' '.join(["process_sff.py",
                          "-i %s" % seqs_fp[1],
                          "-o %s" % obs_output_dir])

    fasta_files = ','.join([join(obs_output_dir, "preprocess_test1.fna"),
                            join(obs_output_dir, "preprocess_test2.fna")])
    qual_files = ','.join([join(obs_output_dir, "preprocess_test1.qual"),
                           join(obs_output_dir, "preprocess_test2.qual")])
    exp_cmd_3a = ' '.join(["split_libraries.py",
                           "-f %s" % fasta_files])
    exp_cmd_3b = ' '.join(["-q %s" % qual_files,
                           "-d",
                           "-o %s" % obs_output_dir,
                           params.to_str()])
    exp_cmd_4 = ' '.join(["convert_fastaqual_fastq.py",
                          "-f %s/seqs.fna" % obs_output_dir,
                          "-q %s/seqs_filtered.qual" % obs_output_dir,
                          "-o %s" % obs_output_dir,
                          "-F"])

    obs_cmds = obs_cmd.split('; ')

    # We are splitting the third command into two parts because there is no
    # way that we can know the filepath of the mapping file. We thus split
    # that command on the mapping file path and check that the two parts of
    # the command are correct
    obs_cmd_3a, obs_cmd_3b_temp = obs_cmds[2].split(' -m ', 1)
    obs_cmd_3b = obs_cmd_3b_temp.split(' ', 1)[1]

    self.assertEqual(obs_cmds[0], exp_cmd_1)
    self.assertEqual(obs_cmds[1], exp_cmd_2)
    self.assertEqual(obs_cmd_3a, exp_cmd_3a)
    self.assertEqual(obs_cmd_3b, exp_cmd_3b)
    self.assertEqual(obs_cmds[3], exp_cmd_4)
def remove_add_prep_template(self, fp_rpt, raw_data_id, study,
                             data_type_id, investigation_type):
    """add prep templates"""
    pt_id = PrepTemplate.create(load_template_to_dataframe(fp_rpt),
                                RawData(raw_data_id), study,
                                _to_int(data_type_id),
                                investigation_type=investigation_type).id
    remove(fp_rpt)
    return pt_id
def test_to_file(self):
    """to file writes a tab delimited file with all the metadata"""
    fd, fp = mkstemp()
    close(fd)
    pt = PrepTemplate.create(self.metadata, self.new_raw_data)
    pt.to_file(fp)
    self._clean_up_files.append(fp)
    with open(fp, 'U') as f:
        obs = f.read()
    self.assertEqual(obs, EXP_PREP_TEMPLATE)
def remove_add_study_template(self, raw_data, study_id, fp_rsp, data_type,
                              is_mapping_file):
    """Replace prep templates, raw data, and sample template with a new one
    """
    if is_mapping_file and data_type == "":
        raise ValueError("Please, choose a data type if uploading a QIIME "
                         "mapping file")

    for rd in raw_data():
        rd = RawData(rd)
        for pt in rd.prep_templates:
            if PrepTemplate.exists(pt):
                PrepTemplate.delete(pt)

    if SampleTemplate.exists(study_id):
        SampleTemplate.delete(study_id)

    if is_mapping_file:
        create_templates_from_qiime_mapping_file(fp_rsp, Study(study_id),
                                                 int(data_type))
    else:
        SampleTemplate.create(load_template_to_dataframe(fp_rsp),
                              Study(study_id))
    remove(fp_rsp)
def test_get_qiime_minimal_mapping_single_no_run_prefix(self):
    conn_handler = SQLConnectionHandler()
    sql = """DELETE FROM qiita.prep_columns
             WHERE prep_template_id = 1 AND column_name = 'run_prefix';
             ALTER TABLE qiita.prep_1 DROP COLUMN run_prefix"""
    conn_handler.execute(sql)
    prep_template = PrepTemplate(1)
    prep_template.generate_files()
    out_dir = mkdtemp()

    obs_fps = _get_qiime_minimal_mapping(prep_template, out_dir)
    exp_fps = [join(out_dir, 'prep_1_MMF.txt')]

    # Check that the returned list is as expected
    self.assertEqual(obs_fps, exp_fps)
    # Check that the file exists
    self.assertTrue(exists(exp_fps[0]))
    # Check the contents of the file
    with open(exp_fps[0], "U") as f:
        self.assertEqual(f.read(), EXP_PREP)
def _template_generator(study, full_access):
    """Generates tuples of prep template information

    Parameters
    ----------
    study : Study
        The study to get all the prep templates
    full_access : boolean
        A boolean that indicates if the user has full access to the study

    Returns
    -------
    Generator of tuples of (int, str, PrepTemplate, (str, str, str))
        Each tuple contains the prep template id, the prep template
        data_type, the PrepTemplate object and a tuple with 3 strings for
        the style of the prep template status icons
    """
    for pt_id in sorted(study.prep_templates()):
        pt = PrepTemplate(pt_id)
        if full_access or pt.status == 'public':
            yield (pt.id, pt.data_type(), pt, STATUS_STYLER[pt.status])
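# A minimal consumption sketch for _template_generator above, assuming a
# Study instance and a full_access flag computed elsewhere; the helper name
# is hypothetical and shown only to illustrate unpacking the yielded tuples.
def _visible_prep_template_ids(study, full_access):
    # Keep only the ids the current user is allowed to see, in sorted order
    return [pt_id for pt_id, _, _, _ in
            _template_generator(study, full_access)]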
def setUp(self):
    self.prep_template = PrepTemplate(1)
    self.sample_id = 'SKB8.640193'
    self.tester = PrepSample(self.sample_id, self.prep_template)
    self.exp_categories = {'center_name', 'center_project_name',
                           'emp_status_id', 'data_type_id',
                           'barcodesequence',
                           'library_construction_protocol',
                           'linkerprimersequence', 'target_subfragment',
                           'target_gene', 'run_center', 'run_prefix',
                           'run_date', 'experiment_center',
                           'experiment_design_description',
                           'experiment_title', 'platform', 'samp_size',
                           'sequencing_meth', 'illumina_technology',
                           'sample_center', 'pcr_primers', 'study_center'}
def test_get_qiime_minimal_mapping_single(self):
    prep_template = PrepTemplate(1)
    out_dir = mkdtemp()

    obs_fps = _get_qiime_minimal_mapping(prep_template, out_dir)
    exp_fps = [join(out_dir, 's_G1_L001_sequences_MMF.txt')]

    # Check that the returned list is as expected
    self.assertEqual(obs_fps, exp_fps)
    # Check that the file exists
    self.assertTrue(exists(exp_fps[0]))
    # Check the contents of the file
    with open(exp_fps[0], "U") as f:
        self.assertEqual(f.read(), EXP_PREP)
def render(self, study_id, preprocessed_data):
    user = User(self.current_user)
    ppd_id = preprocessed_data.id
    ebi_status = preprocessed_data.submitted_to_insdc_status()
    ebi_study_accession = preprocessed_data.ebi_study_accession
    ebi_submission_accession = preprocessed_data.ebi_submission_accession
    vamps_status = preprocessed_data.submitted_to_vamps_status()
    filepaths = preprocessed_data.get_filepaths()
    is_local_request = self._is_local()
    show_ebi_btn = user.level == "admin"

    # Get all the ENA terms for the investigation type
    ontology = Ontology(convert_to_id('ENA', 'ontology'))
    # make "Other" show at the bottom of the drop down menu
    ena_terms = []
    for v in sorted(ontology.terms):
        if v != 'Other':
            ena_terms.append('<option value="%s">%s</option>' % (v, v))
    ena_terms.append('<option value="Other">Other</option>')

    # New Type is for users to add a new user-defined investigation type
    user_defined_terms = ontology.user_defined_terms + ['New Type']

    if PrepTemplate.exists(preprocessed_data.prep_template):
        prep_template_id = preprocessed_data.prep_template
        prep_template = PrepTemplate(prep_template_id)
        raw_data_id = prep_template.raw_data
        inv_type = prep_template.investigation_type or "None Selected"
    else:
        prep_template_id = None
        raw_data_id = None
        inv_type = "None Selected"

    return self.render_string(
        "study_description_templates/preprocessed_data_info_tab.html",
        ppd_id=ppd_id,
        show_ebi_btn=show_ebi_btn,
        ebi_status=ebi_status,
        ebi_study_accession=ebi_study_accession,
        ebi_submission_accession=ebi_submission_accession,
        filepaths=filepaths,
        is_local_request=is_local_request,
        prep_template_id=prep_template_id,
        raw_data_id=raw_data_id,
        inv_type=inv_type,
        ena_terms=ena_terms,
        vamps_status=vamps_status,
        user_defined_terms=user_defined_terms)
def test_get_preprocess_fastq_cmd_per_sample_FASTQ(self):
    metadata_dict = {
        'SKB8.640193': {'run_prefix': "sample1", 'primer': 'A',
                        'barcode': 'A', 'center_name': 'ANL',
                        'platform': 'ILLUMINA',
                        'instrument_model': 'Illumina MiSeq',
                        'library_construction_protocol': 'A',
                        'experiment_design_description': 'A'},
        'SKD8.640184': {'run_prefix': "sample2", 'primer': 'A',
                        'barcode': 'A', 'center_name': 'ANL',
                        'platform': 'ILLUMINA',
                        'instrument_model': 'Illumina MiSeq',
                        'library_construction_protocol': 'A',
                        'experiment_design_description': 'A'}}
    md_template = pd.DataFrame.from_dict(metadata_dict, orient='index')
    prep_template = PrepTemplate.create(md_template, Study(1), '16S')

    fp1 = self.path_builder('sample1.fastq')
    with open(fp1, 'w') as f:
        f.write('\n')
    self.files_to_remove.append(fp1)
    fp2 = self.path_builder('sample2.fastq.gz')
    with open(fp2, 'w') as f:
        f.write('\n')
    self.files_to_remove.append(fp2)
    filepath_id = convert_to_id('raw_forward_seqs', 'filepath_type')
    fps = [(fp1, filepath_id), (fp2, filepath_id)]

    filetype_id = get_filetypes()['per_sample_FASTQ']
    raw_data = RawData.create(filetype_id, [prep_template], fps)
    params = [p for p in list(PreprocessedIlluminaParams.iter())
              if p.name == 'per sample FASTQ defaults'][0]

    obs_cmd, obs_output_dir = _get_preprocess_fastq_cmd(raw_data,
                                                        prep_template,
                                                        params)

    raw_fps = ','.join([fp for _, fp, _ in
                        sorted(raw_data.get_filepaths())])
    exp_cmd = (
        "split_libraries_fastq.py --store_demultiplexed_fastq -i "
        "{} --sample_ids 1.SKB8.640193,1.SKD8.640184 -o {} --barcode_type "
        "not-barcoded --max_bad_run_length 3 --max_barcode_errors 1.5 "
        "--min_per_read_length_fraction 0.75 --phred_quality_threshold 3 "
        "--sequence_max_n 0").format(raw_fps, obs_output_dir)
    self.assertEqual(obs_cmd, exp_cmd)
def test_load_data_from_cmd(self):
    filepaths = [self.forward_fp, self.reverse_fp, self.barcodes_fp]
    filepath_types = ['raw_forward_seqs', 'raw_reverse_seqs',
                      'raw_barcodes']
    filetype = 'FASTQ'

    metadata_dict = {
        'SKB8.640193': {'center_name': 'ANL',
                        'primer': 'GTGCCAGCMGCCGCGGTAA',
                        'barcode': 'GTCCGCAAGTTA',
                        'run_prefix': "s_G1_L001_sequences",
                        'platform': 'ILLUMINA',
                        'instrument_model': 'Illumina MiSeq',
                        'library_construction_protocol': 'AAAA',
                        'experiment_design_description': 'BBBB'}}
    metadata = pd.DataFrame.from_dict(metadata_dict, orient='index')
    pt1 = PrepTemplate.create(metadata, Study(1), "16S")
    prep_templates = [pt1.id]

    initial_raw_count = get_count('qiita.raw_data')
    initial_fp_count = get_count('qiita.filepath')
    initial_raw_fp_count = get_count('qiita.raw_filepath')

    new = load_raw_data_cmd(filepaths, filepath_types, filetype,
                            prep_templates)
    raw_data_id = new.id
    self.files_to_remove.append(
        join(self.db_test_raw_dir,
             '%d_%s' % (raw_data_id, basename(self.forward_fp))))
    self.files_to_remove.append(
        join(self.db_test_raw_dir,
             '%d_%s' % (raw_data_id, basename(self.reverse_fp))))
    self.files_to_remove.append(
        join(self.db_test_raw_dir,
             '%d_%s' % (raw_data_id, basename(self.barcodes_fp))))

    self.assertTrue(check_count('qiita.raw_data', initial_raw_count + 1))
    self.assertTrue(check_count('qiita.filepath', initial_fp_count + 3))
    self.assertTrue(check_count('qiita.raw_filepath',
                                initial_raw_fp_count + 3))

    # Ensure that the ValueError is raised when a filepath_type is not
    # provided for each and every filepath
    with self.assertRaises(ValueError):
        load_raw_data_cmd(filepaths, filepath_types[:-1], filetype,
                          prep_templates)
def test_metadata_map_from_sample_and_prep_templates(self):
    obs = metadata_map_from_sample_and_prep_templates(
        SampleTemplate(1), PrepTemplate(1))

    # We don't test the specific values as this would blow up the size
    # of this file as the amount of lines would go to ~1000

    # 27 samples
    self.assertEqual(len(obs), 27)

    self.assertTrue(all(obs.index == pd.Index([
        u'SKB1.640202', u'SKB2.640194', u'SKB3.640195', u'SKB4.640189',
        u'SKB5.640181', u'SKB6.640176', u'SKB7.640196', u'SKB8.640193',
        u'SKB9.640200', u'SKD1.640179', u'SKD2.640178', u'SKD3.640198',
        u'SKD4.640185', u'SKD5.640186', u'SKD6.640190', u'SKD7.640191',
        u'SKD8.640184', u'SKD9.640182', u'SKM1.640183', u'SKM2.640199',
        u'SKM3.640197', u'SKM4.640180', u'SKM5.640177', u'SKM6.640187',
        u'SKM7.640188', u'SKM8.640201', u'SKM9.640192'],
        dtype='object')))

    self.assertTrue(all(obs.columns == pd.Index([
        u'tot_org_carb', u'common_name', u'has_extracted_data',
        u'water_content_soil', u'env_feature', u'assigned_from_geo',
        u'altitude', u'env_biome', u'texture', u'has_physical_specimen',
        u'description_duplicate', u'physical_location', u'latitude',
        u'ph', u'host_taxid', u'elevation', u'description',
        u'collection_timestamp', u'taxon_id', u'samp_salinity',
        u'host_subject_id', u'sample_type', u'season_environment',
        u'required_sample_info_status_id', u'temp', u'country',
        u'longitude', u'tot_nitro', u'depth', u'anonymized_name',
        u'experiment_center', u'center_name', u'run_center',
        u'run_prefix', u'data_type_id', u'target_gene',
        u'sequencing_meth', u'run_date', u'pcr_primers',
        u'linkerprimersequence', u'platform',
        u'library_construction_protocol',
        u'experiment_design_description', u'study_center',
        u'center_project_name', u'sample_center', u'samp_size',
        u'illumina_technology', u'experiment_title', u'emp_status_id',
        u'target_subfragment', u'barcodesequence'], dtype='object')))
def display_template(self, preprocessed_data_id, msg, msg_level):
    """Simple function to avoid duplication of code"""
    preprocessed_data_id = int(preprocessed_data_id)
    try:
        preprocessed_data = PreprocessedData(preprocessed_data_id)
    except QiitaDBUnknownIDError:
        raise HTTPError(404, "PreprocessedData %d does not exist!" %
                             preprocessed_data_id)
    else:
        user = self.current_user
        if user.level != 'admin':
            raise HTTPError(403, "No permissions of admin, "
                                 "get/EBISubmitHandler: %s!" % user.id)

    prep_template = PrepTemplate(preprocessed_data.prep_template)
    sample_template = SampleTemplate(preprocessed_data.study)
    study = Study(preprocessed_data.study)

    stats = [('Number of samples', len(prep_template)),
             ('Number of metadata headers',
              len(sample_template.metadata_headers()))]

    demux = [path for _, path, ftype in preprocessed_data.get_filepaths()
             if ftype == 'preprocessed_demux']
    demux_length = len(demux)

    if not demux_length:
        msg = ("Study does not appear to have demultiplexed "
               "sequences associated")
        msg_level = 'danger'
    elif demux_length > 1:
        msg = ("Study appears to have multiple demultiplexed files!")
        msg_level = 'danger'
    elif demux_length == 1:
        demux_file = demux[0]
        demux_file_stats = demux_stats(demux_file)
        stats.append(('Number of sequences', demux_file_stats.n))
        msg_level = 'success'

    self.render('ebi_submission.html',
                study_title=study.title, stats=stats, message=msg,
                study_id=study.id, level=msg_level,
                preprocessed_data_id=preprocessed_data_id,
                investigation_type=prep_template.investigation_type)
def setUp(self):
    self.prep_template = PrepTemplate(1)
    self.study = Study(1)
    self.params_table = "preprocessed_sequence_illumina_params"
    self.params_id = 1
    fd, self.fna_fp = mkstemp(suffix='_seqs.fna')
    close(fd)
    fd, self.qual_fp = mkstemp(suffix='_seqs.qual')
    close(fd)
    self.filepaths = [(self.fna_fp, 4), (self.qual_fp, 5)]
    _, self.db_test_ppd_dir = get_mountpoint('preprocessed_data')[0]
    self.ebi_submission_accession = "EBI123456-A"
    self.ebi_study_accession = "EBI123456-B"

    with open(self.fna_fp, "w") as f:
        f.write("\n")
    with open(self.qual_fp, "w") as f:
        f.write("\n")
    self._clean_up_files = []
def test_dataframe_from_template(self):
    template = PrepTemplate(1)
    obs = dataframe_from_template(template)

    # 27 samples
    self.assertEqual(len(obs), 27)
    # assertTrue silently treats a second positional argument as a failure
    # message, so these membership checks are written as equality assertions
    self.assertEqual(set(obs.index), {
        u'SKB1.640202', u'SKB2.640194', u'SKB3.640195', u'SKB4.640189',
        u'SKB5.640181', u'SKB6.640176', u'SKB7.640196', u'SKB8.640193',
        u'SKB9.640200', u'SKD1.640179', u'SKD2.640178', u'SKD3.640198',
        u'SKD4.640185', u'SKD5.640186', u'SKD6.640190', u'SKD7.640191',
        u'SKD8.640184', u'SKD9.640182', u'SKM1.640183', u'SKM2.640199',
        u'SKM3.640197', u'SKM4.640180', u'SKM5.640177', u'SKM6.640187',
        u'SKM7.640188', u'SKM8.640201', u'SKM9.640192'})
    self.assertEqual(set(obs.columns), {
        u'tot_org_carb', u'common_name', u'has_extracted_data',
        u'required_sample_info_status', u'water_content_soil',
        u'env_feature', u'assigned_from_geo', u'altitude', u'env_biome',
        u'texture', u'has_physical_specimen', u'description_duplicate',
        u'physical_location', u'latitude', u'ph', u'host_taxid',
        u'elevation', u'description', u'collection_timestamp',
        u'taxon_id', u'samp_salinity', u'host_subject_id', u'sample_type',
        u'season_environment', u'temp', u'country', u'longitude',
        u'tot_nitro', u'depth', u'anonymized_name', u'target_subfragment',
        u'sample_center', u'samp_size', u'run_date', u'experiment_center',
        u'pcr_primers', u'center_name', u'barcodesequence', u'run_center',
        u'run_prefix', u'library_construction_protocol', u'emp_status',
        u'linkerprimersequence', u'experiment_design_description',
        u'target_gene', u'center_project_name', u'illumina_technology',
        u'sequencing_meth', u'platform', u'experiment_title',
        u'study_center'})
def test_move_filepaths_to_upload_folder(self):
    # setting up test, done here as this is the only test that uses these
    # files
    fd, seqs_fp = mkstemp(suffix="_seqs.fastq")
    close(fd)
    st = Study(1)
    metadata_dict = {
        "SKB8.640193": {"center_name": "ANL",
                        "primer": "GTGCCAGCMGCCGCGGTAA",
                        "barcode": "GTCCGCAAGTTA",
                        "run_prefix": "s_G1_L001_sequences",
                        "platform": "ILLUMINA",
                        "library_construction_protocol": "AAAA",
                        "experiment_design_description": "BBBB"}}
    metadata = pd.DataFrame.from_dict(metadata_dict, orient="index")
    pt = PrepTemplate.create(metadata, Study(1), "16S")
    rd = RawData.create(2, [pt], [(seqs_fp, 1)])
    filepaths = rd.get_filepaths()

    # deleting reference so we can directly call
    # move_filepaths_to_upload_folder
    for fid, _, _ in filepaths:
        self.conn_handler.execute(
            "DELETE FROM qiita.raw_filepath WHERE filepath_id=%s", (fid,))

    # moving filepaths
    move_filepaths_to_upload_folder(st.id, filepaths)

    # check that they do not exist in the old path but do in the new one
    path_for_removal = join(get_mountpoint("uploads")[0][1], str(st.id))
    for _, fp, _ in filepaths:
        self.assertFalse(exists(fp))
        new_fp = join(path_for_removal, basename(fp).split("_", 1)[1])
        self.assertTrue(exists(new_fp))
        self.files_to_remove.append(new_fp)
def post(self):
    study_id = int(self.get_argument('study_id'))
    prep_template_id = int(self.get_argument('prep_template_id'))
    raw_data = RawData(PrepTemplate(prep_template_id).raw_data)
    param_id = int(self.get_argument('preprocessing_parameters_id'))

    # Get the preprocessing parameters
    if raw_data.filetype == 'FASTQ':
        param_constructor = PreprocessedIlluminaParams
    elif raw_data.filetype in ('FASTA', 'SFF'):
        param_constructor = Preprocessed454Params
    else:
        raise ValueError('Unknown filetype')

    job_id = submit(self.current_user.id, preprocessor, study_id,
                    prep_template_id, param_id, param_constructor)

    self.render('compute_wait.html',
                job_id=job_id, title='Preprocessing',
                completion_redirect='/study/description/%d?top_tab='
                                    'raw_data_tab&sub_tab=%s&prep_tab=%s'
                                    % (study_id, raw_data.id,
                                       prep_template_id))
def test_template_to_dict(self):
    template = PrepTemplate(1)
    obs = template_to_dict(template)

    # We don't test the specific values as this would blow up the size
    # of this file as the amount of lines would go to ~1000

    # twenty seven samples
    self.assertEqual(len(obs.keys()), 27)

    # the mapping file has 24 columns
    for key, value in obs.items():
        # check there are exactly these column names in the dictionary
        self.assertItemsEqual(value.keys(), [
            'experiment_center', 'center_name', 'run_center',
            'run_prefix', 'data_type_id', 'target_gene',
            'sequencing_meth', 'run_date', 'pcr_primers',
            'ebi_submission_accession', 'linkerprimersequence',
            'platform', 'library_construction_protocol',
            'experiment_design_description', 'study_center',
            'center_project_name', 'sample_center', 'samp_size',
            'illumina_technology', 'experiment_title', 'emp_status_id',
            'target_subfragment', 'barcodesequence',
            'ebi_study_accession'])
def render(self, study_id, preprocessed_data):
    user = self.current_user
    ppd_id = preprocessed_data.id
    ebi_status = preprocessed_data.submitted_to_insdc_status()
    ebi_study_accession = preprocessed_data.ebi_study_accession
    ebi_submission_accession = preprocessed_data.ebi_submission_accession
    vamps_status = preprocessed_data.submitted_to_vamps_status()
    filepaths = preprocessed_data.get_filepaths()
    is_local_request = self._is_local()
    show_ebi_btn = user.level == "admin"
    processing_status = preprocessed_data.processing_status
    processed_data = preprocessed_data.processed_data

    # Get all the ENA terms for the investigation type
    ontology = Ontology(convert_to_id('ENA', 'ontology'))
    # make "Other" show at the bottom of the drop down menu
    ena_terms = []
    for v in sorted(ontology.terms):
        if v != 'Other':
            ena_terms.append('<option value="%s">%s</option>' % (v, v))
    ena_terms.append('<option value="Other">Other</option>')

    # New Type is for users to add a new user-defined investigation type
    user_defined_terms = ontology.user_defined_terms + ['New Type']

    if PrepTemplate.exists(preprocessed_data.prep_template):
        prep_template_id = preprocessed_data.prep_template
        prep_template = PrepTemplate(prep_template_id)
        raw_data_id = prep_template.raw_data
        inv_type = prep_template.investigation_type or "None Selected"
    else:
        prep_template_id = None
        raw_data_id = None
        inv_type = "None Selected"

    process_params = {param.id: (generate_param_str(param), param.name)
                      for param in ProcessedSortmernaParams.iter()}
    # We just need to provide an ID for the default parameters,
    # so we can initialize the interface
    default_params = 1

    return self.render_string(
        "study_description_templates/preprocessed_data_info_tab.html",
        ppd_id=ppd_id,
        show_ebi_btn=show_ebi_btn,
        ebi_status=ebi_status,
        ebi_study_accession=ebi_study_accession,
        ebi_submission_accession=ebi_submission_accession,
        filepaths=filepaths,
        is_local_request=is_local_request,
        prep_template_id=prep_template_id,
        raw_data_id=raw_data_id,
        inv_type=inv_type,
        ena_terms=ena_terms,
        vamps_status=vamps_status,
        user_defined_terms=user_defined_terms,
        process_params=process_params,
        default_params=default_params,
        study_id=preprocessed_data.study,
        processing_status=processing_status,
        processed_data=processed_data)
from os.path import join
from time import strftime

from qiita_db.util import get_mountpoint
from qiita_db.sql_connection import SQLConnectionHandler
from qiita_db.metadata_template import SampleTemplate, PrepTemplate

conn_handler = SQLConnectionHandler()

_id, fp_base = get_mountpoint('templates')[0]

for study_id in conn_handler.execute_fetchall(
        "SELECT study_id FROM qiita.study"):
    study_id = study_id[0]
    if SampleTemplate.exists(study_id):
        st = SampleTemplate(study_id)
        fp = join(fp_base, '%d_%s.txt' % (study_id,
                                          strftime("%Y%m%d-%H%M%S")))
        st.to_file(fp)
        st.add_filepath(fp)

for prep_template_id in conn_handler.execute_fetchall(
        "SELECT prep_template_id FROM qiita.prep_template"):
    prep_template_id = prep_template_id[0]
    pt = PrepTemplate(prep_template_id)
    study_id = pt.study_id
    fp = join(fp_base, '%d_prep_%d_%s.txt' % (pt.study_id,
                                              prep_template_id,
                                              strftime("%Y%m%d-%H%M%S")))
    pt.to_file(fp)
    pt.add_filepath(fp)
# 23 Nov, 2014
# This patch creates all the qiime mapping files for the existing
# prep templates

from qiita_db.util import get_mountpoint
from qiita_db.sql_connection import SQLConnectionHandler
from qiita_db.metadata_template import PrepTemplate

conn_handler = SQLConnectionHandler()

_id, fp_base = get_mountpoint('templates')[0]

for prep_template_id in conn_handler.execute_fetchall(
        "SELECT prep_template_id FROM qiita.prep_template"):
    prep_template_id = prep_template_id[0]
    pt = PrepTemplate(prep_template_id)
    study_id = pt.study_id

    for _, fpt in pt.get_filepaths():
        pt.create_qiime_mapping_file(fpt)
def test_create_duplicate(self):
    """Create raises an error when creating a duplicated PrepTemplate"""
    with self.assertRaises(QiitaDBDuplicateError):
        PrepTemplate.create(self.metadata, self.test_raw_data)
def test_create_duplicate_header(self):
    """Create raises an error when duplicate headers are present"""
    self.metadata['STR_COLUMN'] = pd.Series(['', '', ''],
                                            index=self.metadata.index)
    with self.assertRaises(QiitaDBDuplicateHeaderError):
        PrepTemplate.create(self.metadata, self.new_raw_data)
def test_exists_true(self):
    """Exists returns true when the PrepTemplate already exists"""
    self.assertTrue(PrepTemplate.exists(self.test_raw_data))
def test_get_qiime_minimal_mapping_multiple(self):
    # We need to create a prep template in which we have different run
    # prefix values, so we can test this case
    metadata_dict = {
        'SKB8.640193': {'center_name': 'ANL',
                        'center_project_name': 'Test Project',
                        'ebi_submission_accession': None,
                        'EMP_status': 'EMP',
                        'str_column': 'Value for sample 1',
                        'linkerprimersequence': 'GTGCCAGCMGCCGCGGTAA',
                        'barcodesequence': 'GTCCGCAAGTTA',
                        'run_prefix': "s_G1_L001_sequences",
                        'platform': 'ILLUMINA',
                        'library_construction_protocol': 'AAA',
                        'experiment_design_description': 'BBB'},
        'SKD8.640184': {'center_name': 'ANL',
                        'center_project_name': 'Test Project',
                        'ebi_submission_accession': None,
                        'EMP_status': 'EMP',
                        'str_column': 'Value for sample 2',
                        'linkerprimersequence': 'GTGCCAGCMGCCGCGGTAA',
                        'barcodesequence': 'CGTAGAGCTCTC',
                        'run_prefix': "s_G1_L001_sequences",
                        'platform': 'ILLUMINA',
                        'library_construction_protocol': 'AAA',
                        'experiment_design_description': 'BBB'},
        'SKB7.640196': {'center_name': 'ANL',
                        'center_project_name': 'Test Project',
                        'ebi_submission_accession': None,
                        'EMP_status': 'EMP',
                        'str_column': 'Value for sample 3',
                        'linkerprimersequence': 'GTGCCAGCMGCCGCGGTAA',
                        'barcodesequence': 'CCTCTGAGAGCT',
                        'run_prefix': "s_G1_L002_sequences",
                        'platform': 'ILLUMINA',
                        'library_construction_protocol': 'AAA',
                        'experiment_design_description': 'BBB'}
    }
    md_template = pd.DataFrame.from_dict(metadata_dict, orient='index')
    prep_template = PrepTemplate.create(md_template, RawData(2), Study(1),
                                        '16S')
    out_dir = mkdtemp()

    obs_fps = sorted(_get_qiime_minimal_mapping(prep_template, out_dir))
    exp_fps = sorted([join(out_dir, 's_G1_L001_sequences_MMF.txt'),
                      join(out_dir, 's_G1_L002_sequences_MMF.txt')])

    # Check that the returned list is as expected
    self.assertEqual(obs_fps, exp_fps)
    # Check that the file exists
    for fp in exp_fps:
        self.assertTrue(exists(fp))
    # Check the contents of the file
    for fp, contents in zip(exp_fps, [EXP_PREP_1, EXP_PREP_2]):
        with open(fp, "U") as f:
            self.assertEqual(f.read(), contents)
def test_exists_false(self):
    """Exists returns false when the PrepTemplate does not exists"""
    self.assertFalse(PrepTemplate.exists(self.new_raw_data))
def test_metadata_stats_from_sample_and_prep_templates(self):
    obs = metadata_stats_from_sample_and_prep_templates(
        SampleTemplate(1), PrepTemplate(1))

    for k in obs:
        self.assertEqual(obs[k], SUMMARY_STATS[k])
def submit_EBI(preprocessed_data_id, action, send, fastq_dir_fp=None):
    """Submit a preprocessed data to EBI

    Parameters
    ----------
    preprocessed_data_id : int
        The preprocessed data id
    action : %s
        The action to perform with this data
    send : bool
        True to actually send the files
    fastq_dir_fp : str, optional
        The fastq filepath
    """
    preprocessed_data = PreprocessedData(preprocessed_data_id)
    preprocessed_data_id_str = str(preprocessed_data_id)
    study = Study(preprocessed_data.study)
    sample_template = SampleTemplate(study.sample_template)
    prep_template = PrepTemplate(preprocessed_data.prep_template)

    investigation_type = None
    new_investigation_type = None

    status = preprocessed_data.submitted_to_insdc_status()
    if status in ('submitting', 'success'):
        raise ValueError("Cannot resubmit! Current status is: %s" % status)

    if send:
        # If we intend actually to send the files, then change the status in
        # the database
        preprocessed_data.update_insdc_status('submitting')

    # we need to figure out whether the investigation type is a known one
    # or if we have to submit a "new_investigation_type" to EBI
    current_type = prep_template.investigation_type
    ena_ontology = Ontology(convert_to_id('ENA', 'ontology'))
    if current_type in ena_ontology.terms:
        investigation_type = current_type
    elif current_type in ena_ontology.user_defined_terms:
        investigation_type = 'Other'
        new_investigation_type = current_type
    else:
        # This should never happen
        raise ValueError("Unrecognized investigation type: '%s'. This term "
                         "is neither one of the official terms nor one of "
                         "the user-defined terms in the ENA ontology"
                         % current_type)

    if fastq_dir_fp is not None:
        # If the user specifies a FASTQ directory, use it

        # Set demux_samples to None so that MetadataTemplate.to_file will
        # put all samples in the template files
        demux_samples = None
    else:
        # If the user does not specify a FASTQ directory, create one and
        # re-serialize the per-sample FASTQs from the demux file
        fastq_dir_fp = mkdtemp(prefix=qiita_config.working_dir)
        demux = [path for _, path, ftype in
                 preprocessed_data.get_filepaths()
                 if ftype == 'preprocessed_demux'][0]

        # Keep track of which files were actually in the demux file so that
        # we can write those rows to the prep and samples templates
        demux_samples = set()

        with open_file(demux) as demux_fh:
            for samp, iterator in to_per_sample_ascii(demux_fh,
                                                      list(sample_template)):
                demux_samples.add(samp)
                sample_fp = join(fastq_dir_fp, "%s.fastq.gz" % samp)
                with gzopen(sample_fp, 'w') as fh:
                    for record in iterator:
                        fh.write(record)

    output_dir = fastq_dir_fp + '_submission'

    samp_fp = join(fastq_dir_fp, 'sample_metadata.txt')
    prep_fp = join(fastq_dir_fp, 'prep_metadata.txt')

    sample_template.to_file(samp_fp, demux_samples)
    prep_template.to_file(prep_fp, demux_samples)

    # Get specific output directory and set filepaths
    get_output_fp = partial(join, output_dir)
    study_fp = get_output_fp('study.xml')
    sample_fp = get_output_fp('sample.xml')
    experiment_fp = get_output_fp('experiment.xml')
    run_fp = get_output_fp('run.xml')
    submission_fp = get_output_fp('submission.xml')

    if not isdir(output_dir):
        makedirs(output_dir)
    else:
        raise IOError('The output folder already exists: %s' % output_dir)

    with open(samp_fp, 'U') as st, open(prep_fp, 'U') as pt:
        submission = EBISubmission.from_templates_and_per_sample_fastqs(
            preprocessed_data_id_str, study.title,
            study.info['study_abstract'], investigation_type, st, pt,
            fastq_dir_fp, new_investigation_type=new_investigation_type,
            pmids=study.pmids)

    submission.write_all_xml_files(study_fp, sample_fp, experiment_fp,
                                   run_fp, submission_fp, action)

    if send:
        submission.send_sequences()
        study_accession, submission_accession = submission.send_xml()

        if study_accession is None or submission_accession is None:
            preprocessed_data.update_insdc_status('failed')
            raise ComputeError("EBI Submission failed!")
        else:
            preprocessed_data.update_insdc_status('success',
                                                  study_accession,
                                                  submission_accession)
    else:
        study_accession, submission_accession = None, None

    return study_accession, submission_accession
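# A hedged usage sketch for submit_EBI above: with send=False the function
# still demultiplexes (if needed) and writes all the XML files, but returns
# (None, None) instead of contacting EBI, so it doubles as a dry run. The
# id and action values below are illustrative only.
#
#     study_acc, submission_acc = submit_EBI(3, 'VALIDATE', send=False)
#     assert study_acc is None and submission_acc is None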