def test_create_templates_from_qiime_mapping_file_reverse_linker(self):
    with TRN:
        TRN.add("SELECT last_value FROM "
                "qiita.prep_template_prep_template_id_seq")
        curr_id = TRN.execute_fetchflatten()[0]
    obs_st, obs_pt = create_templates_from_qiime_mapping_file(
        StringIO(QIIME_MAP_WITH_REVERSE_LINKER_PRIMER),
        self.new_study, "16S")

    # Be green: clean the environment
    for template in [obs_st, obs_pt]:
        for _, fp in template.get_filepaths():
            self._clean_up_files.append(fp)

    self.assertEqual(obs_st.id, self.new_study.id)
    self.assertEqual(obs_pt.id, curr_id + 1)

    # Check that each template has the correct columns
    exp = {"physical_specimen_location", "physical_specimen_remaining",
           "dna_extracted", "sample_type", "host_subject_id", "latitude",
           "longitude", "taxon_id", "scientific_name",
           "collection_timestamp", "description"}
    self.assertEqual(set(obs_st.categories()), exp)

    exp = {"barcode", "primer", "center_name", "run_prefix", "platform",
           "library_construction_protocol", "instrument_model",
           "experiment_design_description", "reverselinkerprimer"}
    self.assertEqual(set(obs_pt.categories()), exp)
def status(self):
    """The status of the prep template

    Returns
    -------
    str
        The status of the prep template

    Notes
    -----
    The status of a prep template is inferred by the status of the
    processed data generated from this prep template. If no processed
    data has been generated with this prep template, then the status
    is 'sandbox'.
    """
    with TRN:
        sql = """SELECT processed_data_status
                 FROM qiita.processed_data_status pds
                    JOIN qiita.processed_data pd
                        USING (processed_data_status_id)
                    JOIN qiita.preprocessed_processed_data ppd_pd
                        USING (processed_data_id)
                    JOIN qiita.prep_template_preprocessed_data pt_ppd
                        USING (preprocessed_data_id)
                 WHERE pt_ppd.prep_template_id=%s"""
        TRN.add(sql, [self._id])
        return infer_status(TRN.execute_fetchindex())
def test_context_manager_multiple_2(self):
    self.assertEqual(TRN._contexts_entered, 0)

    def tester():
        self.assertEqual(TRN._contexts_entered, 1)
        with TRN:
            self.assertEqual(TRN._contexts_entered, 2)
            sql = """SELECT EXISTS(
                        SELECT * FROM qiita.test_table WHERE int_column=%s)"""
            TRN.add(sql, [2])
            self.assertTrue(TRN.execute_fetchlast())
        self.assertEqual(TRN._contexts_entered, 1)

    with TRN:
        self.assertEqual(TRN._contexts_entered, 1)
        sql = """INSERT INTO qiita.test_table (str_column, int_column)
                 VALUES (%s, %s) RETURNING str_column, int_column"""
        args = [["insert1", 1], ["insert2", 2], ["insert3", 3]]
        TRN.add(sql, args, many=True)

        tester()
        self.assertEqual(TRN._contexts_entered, 1)
        self._assert_sql_equal([])

    self.assertEqual(TRN._contexts_entered, 0)
    self._assert_sql_equal([("insert1", True, 1), ("insert2", True, 2),
                            ("insert3", True, 3)])
    self.assertEqual(TRN._connection.get_transaction_status(),
                     TRANSACTION_STATUS_IDLE)
def _generate_study_list_for_api(visibility, only_biom=True):
    """Get general study information

    Parameters
    ----------
    visibility : string
        The visibility to get studies
    only_biom : bool, optional
        If True, limit the results to BIOM artifacts

    Returns
    -------
    dict of {int: list of int}
        Mapping of study id to its list of artifact ids
    """
    artifact_type = ''
    if only_biom:
        artifact_type = "AND artifact_type = 'BIOM'"

    sql = f"""
        SELECT study_id, array_agg(DISTINCT artifact_id)
        FROM qiita.study
        INNER JOIN qiita.study_artifact USING (study_id)
        INNER JOIN qiita.artifact USING (artifact_id)
        INNER JOIN qiita.artifact_type USING (artifact_type_id)
        INNER JOIN qiita.visibility USING (visibility_id)
        WHERE visibility = %s {artifact_type}
        GROUP BY study_id
        """
    with TRN:
        TRN.add(sql, [visibility])
        return dict(TRN.execute_fetchindex())
def _check_id(self, id_):
    r"""Checks that the MetadataTemplate id_ exists on the database"""
    with TRN:
        sql = "SELECT EXISTS(SELECT * FROM qiita.{0} WHERE {1}=%s)".format(
            self._table, self._id_column)
        TRN.add(sql, [id_])
        return TRN.execute_fetchlast()
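# Illustration only (not from the source): with a hypothetical subclass where
# _table is 'study_sample' and _id_column is 'study_id', the format() call in
# _check_id above produces the following parameterized query; only the id
# value itself is passed to psycopg2 as a bound parameter.
_table, _id_column = 'study_sample', 'study_id'  # hypothetical values
sql = "SELECT EXISTS(SELECT * FROM qiita.{0} WHERE {1}=%s)".format(
    _table, _id_column)
assert sql == ("SELECT EXISTS(SELECT * FROM qiita.study_sample "
               "WHERE study_id=%s)")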
def exists(cls, sample_id, md_template):
    r"""Checks whether the sample already exists in the metadata template

    Parameters
    ----------
    sample_id : str
        The sample id
    md_template : MetadataTemplate
        The metadata template to which the sample belongs

    Returns
    -------
    bool
        True if already exists. False otherwise.
    """
    with TRN:
        cls._check_subclass()
        sql = """SELECT EXISTS(
                    SELECT * FROM qiita.{0}
                    WHERE sample_id=%s AND {1}=%s
                 )""".format(cls._table, cls._id_column)
        TRN.add(sql, [sample_id, md_template.id])
        return TRN.execute_fetchlast()
def update(self, md_template):
    r"""Update values in the template

    Parameters
    ----------
    md_template : DataFrame
        The metadata template file contents indexed by sample ids

    Raises
    ------
    QiitaDBError
        If md_template and db do not have the same sample ids
        If md_template and db do not have the same column headers
        If self.can_be_updated is not True
    """
    with TRN:
        # Clean and validate the metadata template given
        new_map = self._clean_validate_template(md_template, self.study_id,
                                                self.columns_restrictions)
        # Retrieving current metadata
        sql = "SELECT * FROM qiita.{0}".format(self._table_name(self.id))
        TRN.add(sql)
        current_map = self._transform_to_dict(TRN.execute_fetchindex())
        current_map = pd.DataFrame.from_dict(current_map, orient="index")

        # simple validations of sample ids and column names
        samples_diff = set(new_map.index).difference(current_map.index)
        if samples_diff:
            raise QiitaDBError(
                "The new template differs from what is stored in the "
                "database by these sample names: %s"
                % ", ".join(samples_diff))

        columns_diff = set(new_map.columns).difference(current_map.columns)
        if columns_diff:
            raise QiitaDBError(
                "The new template differs from what is stored in the "
                "database by these column names: %s"
                % ", ".join(columns_diff))

        # here we are comparing two dataframes following:
        # http://stackoverflow.com/a/17095620/4228285
        current_map.sort(axis=0, inplace=True)
        current_map.sort(axis=1, inplace=True)
        new_map.sort(axis=0, inplace=True)
        new_map.sort(axis=1, inplace=True)
        map_diff = (current_map != new_map).stack()
        map_diff = map_diff[map_diff]
        map_diff.index.names = ["id", "column"]
        changed_cols = map_diff.index.get_level_values("column").unique()

        if not self.can_be_updated(columns=set(changed_cols)):
            raise QiitaDBError(
                "The new template is modifying fields that cannot be "
                "modified. Try removing the target gene fields or "
                "deleting the processed data. You are trying to modify: %s"
                % ", ".join(changed_cols))

        for col in changed_cols:
            self.update_category(col, new_map[col].to_dict())

        self.generate_files()
def preprocessed_data(self):
    with TRN:
        sql = """SELECT preprocessed_data_id
                 FROM qiita.prep_template_preprocessed_data
                 WHERE prep_template_id=%s"""
        TRN.add(sql, [self.id])
        return TRN.execute_fetchflatten()
def __getitem__(self, key):
    r"""Returns the value of the metadata category `key`

    Parameters
    ----------
    key : str
        The metadata category

    Returns
    -------
    obj
        The value of the metadata category `key`

    Raises
    ------
    KeyError
        If the metadata category `key` does not exist

    See Also
    --------
    get
    """
    with TRN:
        key = key.lower()
        if key not in self._get_categories():
            # The key is not available for the sample, so raise a KeyError
            raise KeyError(
                "Metadata category %s does not exist for sample %s"
                " in template %d" % (key, self._id, self._md_template.id))

        sql = """SELECT {0} FROM qiita.{1}
                 WHERE sample_id=%s""".format(key, self._dynamic_table)
        TRN.add(sql, [self._id])
        return TRN.execute_fetchlast()
def setitem(self, column, value):
    """Sets `value` as value for the given `column`

    Parameters
    ----------
    column : str
        The column to update
    value : str
        The value to set. This is expected to be a str on the assumption
        that psycopg2 will cast as necessary when updating.

    Raises
    ------
    QiitaDBColumnError
        If the column does not exist in the table
    """
    with TRN:
        # Check if the column exists in the table
        if column not in self._get_categories():
            raise QiitaDBColumnError(
                "Column %s does not exist in %s"
                % (column, self._dynamic_table))

        sql = """UPDATE qiita.{0}
                 SET {1}=%s
                 WHERE sample_id=%s""".format(self._dynamic_table, column)
        TRN.add(sql, [value, self._id])
def _common_creation_steps(cls, md_template, obj_id):
    r"""Executes the common creation steps

    Parameters
    ----------
    md_template : DataFrame
        The metadata template file contents indexed by sample ids
    obj_id : int
        The id of the object being created
    """
    with TRN:
        cls._check_subclass()

        # Get some useful information from the metadata template
        sample_ids = md_template.index.tolist()
        headers = sorted(md_template.keys().tolist())

        # Insert values on template_sample table
        values = [[obj_id, s_id] for s_id in sample_ids]
        sql = """INSERT INTO qiita.{0} ({1}, sample_id)
                 VALUES (%s, %s)""".format(cls._table, cls._id_column)
        TRN.add(sql, values, many=True)

        # Insert rows on *_columns table
        datatypes = get_datatypes(md_template.ix[:, headers])
        # psycopg2 requires a list of tuples, in which each tuple is a set
        # of values to use in the string formatting of the query. We have
        # all the values in different lists (but in the same order) so use
        # zip to create the list of tuples that psycopg2 requires.
        values = [[obj_id, h, d] for h, d in zip(headers, datatypes)]
        sql = """INSERT INTO qiita.{0} ({1}, column_name, column_type)
                 VALUES (%s, %s, %s)""".format(cls._column_table,
                                               cls._id_column)
        TRN.add(sql, values, many=True)

        # Create table with custom columns
        table_name = cls._table_name(obj_id)
        column_datatype = ["%s %s" % (col, dtype)
                           for col, dtype in zip(headers, datatypes)]
        sql = """CREATE TABLE qiita.{0} (
                    sample_id varchar NOT NULL, {1},
                    CONSTRAINT fk_{0} FOREIGN KEY (sample_id)
                        REFERENCES qiita.study_sample (sample_id)
                        ON UPDATE CASCADE
                 )""".format(table_name, ', '.join(column_datatype))
        TRN.add(sql)

        # Insert values on custom table
        values = as_python_types(md_template, headers)
        values.insert(0, sample_ids)
        values = [list(v) for v in zip(*values)]
        sql = """INSERT INTO qiita.{0} (sample_id, {1})
                 VALUES (%s, {2})""".format(
            table_name, ", ".join(headers),
            ', '.join(["%s"] * len(headers)))
        TRN.add(sql, values, many=True)

        # Execute all the steps
        TRN.execute()
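# A minimal sketch (assumptions: the qiita.test_table used by the tests in
# this collection exists and TRN is importable) of the many=True pattern used
# throughout _common_creation_steps: every element of the args list is one
# parameter set for the same SQL statement, so a single add() queues one
# parameterized insert per row.
from qiita_db.sql_connection import TRN


def bulk_insert_example(rows):
    """rows: list of [str_column, int_column] pairs."""
    with TRN:
        sql = """INSERT INTO qiita.test_table (str_column, int_column)
                 VALUES (%s, %s)"""
        TRN.add(sql, rows, many=True)  # queues len(rows) inserts
        TRN.execute()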
def update_category(self, category, samples_and_values):
    """Update an existing column

    Parameters
    ----------
    category : str
        The category to update
    samples_and_values : dict
        A mapping of {sample_id: value}

    Raises
    ------
    QiitaDBUnknownIDError
        If a sample_id is included in values that is not in the template
    QiitaDBColumnError
        If the column does not exist in the table. This is implicit, and
        can be thrown by the contained Samples.
    ValueError
        If one of the new values cannot be inserted in the DB due to
        different types
    """
    with TRN:
        if not set(self.keys()).issuperset(samples_and_values):
            missing = set(samples_and_values) - set(self.keys())
            table_name = self._table_name(self._id)
            raise QiitaDBUnknownIDError(missing, table_name)

        for k, v in viewitems(samples_and_values):
            sample = self[k]
            sample.setitem(category, v)

        try:
            TRN.execute()
        except ValueError as e:
            # catching error so we can check if the error is due to
            # different column type or something else
            value_types = set(type_lookup(type(value))
                              for value in viewvalues(samples_and_values))

            sql = """SELECT udt_name
                     FROM information_schema.columns
                     WHERE column_name = %s
                        AND table_schema = 'qiita'
                        AND (table_name = %s OR table_name = %s)"""
            TRN.add(sql, [category, self._table,
                          self._table_name(self._id)])
            column_type = TRN.execute_fetchlast()

            if any([column_type != vt for vt in value_types]):
                value_str = ", ".join(
                    [str(value)
                     for value in viewvalues(samples_and_values)])
                value_types_str = ", ".join(value_types)

                raise ValueError(
                    'The new values being added to column: "%s" are "%s" '
                    '(types: "%s"). However, this column in the DB is of '
                    'type "%s". Please change the values in your updated '
                    'template or reprocess your template.'
                    % (category, value_str, value_types_str, column_type))

            raise e
def test_post_commit_funcs_error(self):
    def func():
        raise ValueError()

    with self.assertRaises(RuntimeError):
        with TRN:
            TRN.add("SELECT 42")
            TRN.add_post_commit_func(func)
def _set_allocation(memory):
    with TRN:
        sql = """UPDATE qiita.processing_job_resource_allocation
                 SET allocation = '{0}'
                 WHERE name = 'build_analysis_files'""".format(
            '-q qiita -l mem=%s' % memory)
        TRN.add(sql)
        TRN.execute()
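# Usage sketch (hypothetical value): the allocation string is rendered before
# it is interpolated into the SQL, so the call below stores the literal
# scheduler request '-q qiita -l mem=16gb' for the 'build_analysis_files'
# resource allocation row.
_set_allocation('16gb')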
def test_add_many(self):
    with TRN:
        self.assertEqual(TRN._queries, [])

        sql = "INSERT INTO qiita.test_table (int_column) VALUES (%s)"
        args = [[1], [2], [3]]
        TRN.add(sql, args, many=True)

        exp = [(sql, [1]), (sql, [2]), (sql, [3])]
        self.assertEqual(TRN._queries, exp)
def test_execute_return(self):
    with TRN:
        sql = """INSERT INTO qiita.test_table (str_column, int_column)
                 VALUES (%s, %s) RETURNING str_column, int_column"""
        TRN.add(sql, ["test_insert", 2])
        sql = """UPDATE qiita.test_table SET bool_column = %s
                 WHERE str_column = %s RETURNING int_column"""
        TRN.add(sql, [False, "test_insert"])
        obs = TRN.execute()
        self.assertEqual(obs, [[["test_insert", 2]], [[2]]])
def test_context_manager_execute(self):
    with TRN:
        sql = """INSERT INTO qiita.test_table (str_column, int_column)
                 VALUES (%s, %s) RETURNING str_column, int_column"""
        args = [["insert1", 1], ["insert2", 2], ["insert3", 3]]
        TRN.add(sql, args, many=True)
        self._assert_sql_equal([])

    self._assert_sql_equal([("insert1", True, 1), ("insert2", True, 2),
                            ("insert3", True, 3)])
    self.assertEqual(TRN._connection.get_transaction_status(),
                     TRANSACTION_STATUS_IDLE)
def raw_data(self):
    with TRN:
        sql = """SELECT raw_data_id
                 FROM qiita.prep_template
                 WHERE prep_template_id=%s"""
        TRN.add(sql, [self.id])
        result = TRN.execute_fetchindex()
        if result:
            # If there is any result, it will be in the first row
            # and in the first element of that row, thus [0][0]
            return result[0][0]
        return None
def test_execute_fetchlast(self):
    with TRN:
        sql = """INSERT INTO qiita.test_table (str_column, int_column)
                 VALUES (%s, %s) RETURNING str_column, int_column"""
        args = [["insert1", 1], ["insert2", 2], ["insert3", 3]]
        TRN.add(sql, args, many=True)

        sql = """SELECT EXISTS(
                    SELECT * FROM qiita.test_table WHERE int_column=%s)"""
        TRN.add(sql, [2])
        self.assertTrue(TRN.execute_fetchlast())
def _get_sample_ids(self):
    r"""Returns all the available samples for the metadata template

    Returns
    -------
    set of str
        The set of all available sample ids
    """
    with TRN:
        sql = "SELECT sample_id FROM qiita.{0} WHERE {1}=%s".format(
            self._table, self._id_column)
        TRN.add(sql, [self._id])
        return set(TRN.execute_fetchflatten())
def study_id(self):
    """Gets the study id with which this prep template is associated

    Returns
    -------
    int
        The ID of the study with which this prep template is associated
    """
    with TRN:
        sql = """SELECT study_id
                 FROM qiita.study_prep_template
                 WHERE prep_template_id=%s"""
        TRN.add(sql, [self.id])
        return TRN.execute_fetchlast()
def preprocessing_status(self):
    r"""Tells if the data has been preprocessed or not

    Returns
    -------
    str
        One of {'not_preprocessed', 'preprocessing', 'success', 'failed'}
    """
    with TRN:
        sql = """SELECT preprocessing_status
                 FROM qiita.prep_template
                 WHERE {0}=%s""".format(self._id_column)
        TRN.add(sql, [self.id])
        return TRN.execute_fetchlast()
def metadata_headers():
    """Returns metadata headers available

    Returns
    -------
    list
        Alphabetical list of all metadata headers available
    """
    with TRN:
        sql = """SELECT DISTINCT column_name
                 FROM qiita.study_sample_columns
                 ORDER BY column_name"""
        TRN.add(sql)
        return TRN.execute_fetchflatten()
def test_execute(self):
    with TRN:
        sql = """INSERT INTO qiita.test_table (str_column, int_column)
                 VALUES (%s, %s)"""
        TRN.add(sql, ["test_insert", 2])
        sql = """UPDATE qiita.test_table
                 SET int_column = %s, bool_column = %s
                 WHERE str_column = %s"""
        TRN.add(sql, [20, False, "test_insert"])
        obs = TRN.execute()
        self.assertEqual(obs, [None, None])
        self._assert_sql_equal([])

    self._assert_sql_equal([("test_insert", False, 20)])
def test_context_manager_rollback(self):
    try:
        with TRN:
            sql = """INSERT INTO qiita.test_table (str_column, int_column)
                     VALUES (%s, %s) RETURNING str_column, int_column"""
            args = [["insert1", 1], ["insert2", 2], ["insert3", 3]]
            TRN.add(sql, args, many=True)
            TRN.execute()

            raise ValueError("Force exiting the context manager")
    except ValueError:
        pass

    self._assert_sql_equal([])
    self.assertEqual(TRN._connection.get_transaction_status(),
                     TRANSACTION_STATUS_IDLE)
def test_post_commit_funcs(self):
    fd, fp = mkstemp()
    close(fd)
    self._files_to_remove.append(fp)

    def func(fp):
        with open(fp, "w") as f:
            f.write("\n")

    with TRN:
        TRN.add("SELECT 42")
        TRN.add_post_commit_func(func, fp)

    self.assertTrue(exists(fp))
def test_execute_many(self):
    with TRN:
        sql = """INSERT INTO qiita.test_table (str_column, int_column)
                 VALUES (%s, %s)"""
        args = [["insert1", 1], ["insert2", 2], ["insert3", 3]]
        TRN.add(sql, args, many=True)
        sql = """UPDATE qiita.test_table
                 SET int_column = %s, bool_column = %s
                 WHERE str_column = %s"""
        TRN.add(sql, [20, False, "insert2"])
        obs = TRN.execute()
        self.assertEqual(obs, [None, None, None, None])
        self._assert_sql_equal([])

    self._assert_sql_equal([("insert1", True, 1), ("insert3", True, 3),
                            ("insert2", False, 20)])
def test_execute_commit_false_rollback(self):
    with TRN:
        sql = """INSERT INTO qiita.test_table (str_column, int_column)
                 VALUES (%s, %s) RETURNING str_column, int_column"""
        args = [["insert1", 1], ["insert2", 2], ["insert3", 3]]
        TRN.add(sql, args, many=True)
        obs = TRN.execute()
        exp = [[["insert1", 1]], [["insert2", 2]], [["insert3", 3]]]
        self.assertEqual(obs, exp)

        self._assert_sql_equal([])

        TRN.rollback()

        self._assert_sql_equal([])
def add_filepath(self, filepath, fp_id=None):
    r"""Populates the DB tables for storing the filepath and connects the
    `self` object with this filepath"""
    with TRN:
        fp_id = self._fp_id if fp_id is None else fp_id

        try:
            fpp_id = insert_filepaths([(filepath, fp_id)], None,
                                      "templates", "filepath",
                                      move_files=False)[0]
            sql = """INSERT INTO qiita.{0} ({1}, filepath_id)
                     VALUES (%s, %s)""".format(self._filepath_table,
                                               self._id_column)
            TRN.add(sql, [self._id, fpp_id])
            TRN.execute()
        except Exception as e:
            LogEntry.create("Runtime", str(e),
                            info={self.__class__.__name__: self.id})
            raise e
def _to_dict(self):
    r"""Returns the categories and their values in a dictionary

    Returns
    -------
    dict of {str: str}
        A dictionary of the form {category: value}
    """
    with TRN:
        sql = "SELECT * FROM qiita.{0} WHERE sample_id=%s".format(
            self._dynamic_table)
        TRN.add(sql, [self._id])
        d = dict(TRN.execute_fetchindex()[0])

        # Remove the sample_id, as it is not part of the metadata
        del d["sample_id"]

        return d
def test_full_query_and_insertion(self):
    # let's archive different values from different jobs
    with TRN:
        # 3 - close reference picking
        # 3 - success
        sql = """SELECT processing_job_id
                 FROM qiita.processing_job
                 WHERE command_id = 3 AND processing_job_status_id = 3"""
        TRN.add(sql)
        jobs = TRN.execute_fetchflatten()

        # this is so we can also test the parent merging scheme
        # 1 - split libraries
        sql = """UPDATE qiita.command_parameter
                 SET check_biom_merge = True
                 WHERE command_id = 1
                    AND parameter_name = 'barcode_type'"""
        TRN.add(sql)
        TRN.execute()

        exp_all_features = {}
        for j in jobs:
            featureA = 'AA - %s' % j
            featureB = 'BB - %s' % j

            # testing that nothing is there
            data = {'job_id': j, 'features': [featureA, featureB]}
            obs = self.post(
                '/qiita_db/archive/observations/', headers=self.header,
                data=data)
            exp = {}
            self.assertEqual(obs.code, 200)
            self.assertEqual(loads(obs.body), exp)

            # inserting and testing insertion
            data = {'path': j,
                    'value': dumps({featureA: 'CA', featureB: 'CB'})}
            obs = self.patch(
                '/qiita_db/archive/observations/', headers=self.header,
                data=data)
            exp = {featureA: 'CA', featureB: 'CB'}
            self.assertEqual(obs.code, 200)
            self.assertEqual(loads(obs.body), exp)

            exp_all_features[featureA] = 'CA'
            exp_all_features[featureB] = 'CB'

        # testing retrieval of all features
        obs = Archive.retrieve_feature_values()
        self.assertEqual(obs, exp_all_features)

        # this doesn't exist so should be empty
        obs = Archive.retrieve_feature_values(archive_merging_scheme='')
        self.assertEqual(obs, {})

        obs = Archive.retrieve_feature_values(
            archive_merging_scheme='Pick closed-reference OTUs | Split '
                                   'libraries FASTQ (barcode_type: golay_12)')
        self.assertEqual(obs, exp_all_features)
def create_rarefied_biom_artifact(analysis, srare_cmd_id, biom_data, params,
                                  parent_biom_artifact_id,
                                  rarefaction_job_id, srare_cmd_out_id):
    """Creates the rarefied biom artifact

    Parameters
    ----------
    analysis : dict
        The analysis information
    srare_cmd_id : int
        The command id of "Single Rarefaction"
    biom_data : dict
        The biom information
    params : str
        The processing parameters
    parent_biom_artifact_id : int
        The parent biom artifact id
    rarefaction_job_id : str
        The job id of the rarefaction job
    srare_cmd_out_id : int
        The id of the single rarefaction output

    Returns
    -------
    int
        The artifact id
    """
    with TRN:
        # Transfer the file to an artifact
        # Magic number 7: artifact type -> biom
        artifact_id = transfer_file_to_artifact(
            analysis['analysis_id'], analysis['timestamp'], srare_cmd_id,
            biom_data['data_type_id'], params, 7, biom_data['filepath_id'])
        # Link the artifact with its parent
        sql = """INSERT INTO qiita.parent_artifact (artifact_id, parent_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [artifact_id, parent_biom_artifact_id])
        # Link the artifact as the job output
        sql = """INSERT INTO qiita.artifact_output_processing_job
                    (artifact_id, processing_job_id, command_output_id)
                 VALUES (%s, %s, %s)"""
        TRN.add(sql, [artifact_id, rarefaction_job_id, srare_cmd_out_id])
        return artifact_id
def is_test_environment():
    """Checks if Qiita is running in a test environment

    Returns
    -------
    bool
        Whether Qiita is running in a test environment or not

    Notes
    -----
    Qiita is running in a test environment if:
    - It is connected to a test database, AND
    - The config file indicates that this is a test environment
    """
    # Check that we are not in a production environment
    with TRN:
        TRN.add("SELECT test FROM settings")
        test_db = TRN.execute_fetchflatten()[0]
    return qiita_config.test_environment and test_db
def test_delete_study(self):
    # as samples have been submitted to EBI, this will fail
    job = self._create_job('delete_study', {'study': 1})
    private_task(job.id)
    self.assertEqual(job.status, 'error')
    self.assertIn(
        "Cannot delete artifact 2: Artifact 2 has been "
        "submitted to EBI", job.log.msg)
    # making sure the analysis, first thing to delete, still exists
    self.assertTrue(Analysis.exists(1))

    # delete everything from the EBI submissions and the processing job so
    # we can try again: test success (with tags)
    with TRN:
        sql = """DELETE FROM qiita.ebi_run_accession"""
        TRN.add(sql)
        sql = """DELETE FROM qiita.artifact_processing_job"""
        TRN.add(sql)
        TRN.execute()

        # adding tags
        Study(1).update_tags(self.user, ['my new tag!'])

        job = self._create_job('delete_study', {'study': 1})
        private_task(job.id)

        self.assertEqual(job.status, 'success')
        with self.assertRaises(QiitaDBUnknownIDError):
            Study(1)
def transfer_file_to_artifact(analysis_id, a_timestamp, command_id,
                              data_type_id, params, artifact_type_id,
                              filepath_id):
    """Creates a new artifact with the given filepath id

    Parameters
    ----------
    analysis_id : int
        The analysis id to attach the artifact
    a_timestamp : datetime.datetime
        The generated timestamp of the artifact
    command_id : int
        The command id of the artifact
    data_type_id : int
        The data type id of the artifact
    params : str
        The parameters of the artifact
    artifact_type_id : int
        The artifact type
    filepath_id : int
        The filepath id

    Returns
    -------
    int
        The artifact id
    """
    with TRN:
        # Add the row in the artifact table
        # Magic number 4: Visibility -> sandbox
        sql = """INSERT INTO qiita.artifact
                    (generated_timestamp, command_id, data_type_id,
                     command_parameters, visibility_id, artifact_type_id,
                     submitted_to_vamps)
                 VALUES (%s, %s, %s, %s, %s, %s, %s)
                 RETURNING artifact_id"""
        TRN.add(sql, [a_timestamp, command_id, data_type_id, params, 4,
                      artifact_type_id, False])
        artifact_id = TRN.execute_fetchlast()
        # Link the artifact with its file
        sql = """INSERT INTO qiita.artifact_filepath
                    (artifact_id, filepath_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [artifact_id, filepath_id])
        # Link the artifact with the analysis
        sql = """INSERT INTO qiita.analysis_artifact
                    (analysis_id, artifact_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [analysis_id, artifact_id])

        return artifact_id
def create_rarefaction_job(depth, biom_artifact_id, analysis, srare_cmd_id):
    """Create a new rarefaction job

    Parameters
    ----------
    depth : int
        The rarefaction depth
    biom_artifact_id : int
        The artifact id of the input rarefaction biom table
    analysis : dict
        Dictionary with the analysis information
    srare_cmd_id : int
        The command id of the single rarefaction command

    Returns
    -------
    job_id : str
        The job id
    params : str
        The job parameters
    """
    # Add the row in the processing job table
    params = ('{"depth":%d,"subsample_multinomial":false,"biom_table":%s}'
              % (depth, biom_artifact_id))
    with TRN:
        # magic number 3: status -> success
        sql = """INSERT INTO qiita.processing_job
                    (email, command_id, command_parameters,
                     processing_job_status_id)
                 VALUES (%s, %s, %s, %s)
                 RETURNING processing_job_id"""
        TRN.add(sql, [analysis['email'], srare_cmd_id, params, 3])
        job_id = TRN.execute_fetchlast()

        # Step 1.2.b: Link the job with the input artifact
        sql = """INSERT INTO qiita.artifact_processing_job
                    (artifact_id, processing_job_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [biom_artifact_id, job_id])
        TRN.execute()

    return job_id, params
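# Illustration only (hypothetical ids): the params string built above is a
# plain JSON document, so a depth of 1000 and an input biom artifact 5 would
# be stored as the following command_parameters value.
params_example = ('{"depth":%d,"subsample_multinomial":false,"biom_table":%s}'
                  % (1000, 5))
assert params_example == ('{"depth":1000,"subsample_multinomial":false,'
                          '"biom_table":5}')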
# 'prep_template_sample'
sql = """SELECT table_name, array_agg(column_name::text)
         FROM information_schema.columns
         WHERE column_name IN %s
            AND column_name != 'sample_id'
            AND table_name LIKE 'prep_%%'
            AND table_name NOT IN ('prep_template',
                                   'prep_template_sample')
         GROUP BY table_name"""
# note that we are looking for those columns with duplicated names in
# the headers
headers = set(PrepTemplate.metadata_headers()) & \
    set(SampleTemplate.metadata_headers())

if headers:
    TRN.add(sql, [tuple(headers)])
    overlapping = dict(TRN.execute_fetchindex())
else:
    overlapping = None

if overlapping is not None:
    # finding actual duplicates
    for table_name, cols in overlapping.items():
        # leaving print so when we patch in the main system we know that
        # nothing was renamed or deal with that
        print(table_name)
        with TRN:
            for c in cols:
                sql = ('ALTER TABLE qiita.%s '
                       'RENAME COLUMN %s TO %s_renamed' % (table_name, c, c))
                TRN.add(sql)
    if cols:
        to_fix.append((st, cols))

    for pt in s.prep_templates():
        if pt is None:
            continue
        cols = searcher(pt.to_dataframe())
        if cols:
            to_fix.append((pt, cols))

# now let's fix the database and regenerate the files
for infofile, cols in to_fix:
    with TRN:
        for col in cols:
            # removing tabs
            sql = """UPDATE qiita.{0}{1}
                     SET {2} = replace({2}, chr(9), ' ')""".format(
                infofile._table_prefix, infofile.id, col)
            TRN.add(sql)

            # removing newlines
            sql = """UPDATE qiita.{0}{1}
                     SET {2} = regexp_replace(
                        {2}, E'[\\n\\r\\u2028]+', ' ', 'g')""".format(
                infofile._table_prefix, infofile.id, col)
            TRN.add(sql)

        TRN.execute()
        infofile.generate_files()
def test_retrive_workflows(self):
    # we should see all 3 workflows
    DefaultWorkflow(2).active = False
    exp = deepcopy(WORKFLOWS)
    self.assertCountEqual(_retrive_workflows(False), exp)

    # validating that the params_name is not being used
    self.assertNotIn(
        'Split libraries | Defaults with Golay 12 barcodes',
        [x[2] for x in _retrive_workflows(False)[1]['nodes']])

    # now it should be there
    with TRN:
        # Hard-coded values; 19 -> barcode_type
        sql = """UPDATE qiita.command_parameter
                 SET name_order = 0
                 WHERE command_parameter_id = 19"""
        TRN.add(sql)
        TRN.execute()
    self.assertIn(
        'Split libraries | Defaults with Golay 12 barcodes',
        [x[2] for x in _retrive_workflows(False)[1]['nodes']])

    # and gone again
    with TRN:
        sql = """UPDATE qiita.command_parameter
                 SET name_order = NULL
                 WHERE command_parameter_id = 19"""
        TRN.add(sql)
        TRN.execute()
    self.assertNotIn(
        'Split libraries | Defaults with Golay 12 barcodes',
        [x[2] for x in _retrive_workflows(False)[1]['nodes']])

    # we should not see the middle one
    del exp[1]
    self.assertCountEqual(_retrive_workflows(True), exp)

    # let's create a couple of more complex scenarios so we touch all code
    # by adding multiple paths, that should connect and get separate
    # -- adds a new path that should be kept separate all the way; this is
    #    to emulate what happens with different trimming (different
    #    default parameter) and deblur (same for each of the previous steps)
    sql = """
        INSERT INTO qiita.default_workflow_node (
            default_workflow_id, default_parameter_set_id)
        VALUES (1, 2), (1, 10);
        INSERT INTO qiita.default_workflow_edge (parent_id, child_id)
        VALUES (7, 8);
        INSERT INTO qiita.default_workflow_edge_connections (
            default_workflow_edge_id, parent_output_id, child_input_id)
        VALUES (4, 1, 3)"""
    perform_as_transaction(sql)

    # -- adds a new path that should be kept together and then separate;
    #    this is to simulate what happens with MTX/WGS processing, one
    #    single QC step (together) and 2 separate profilers
    sql = """
        INSERT INTO qiita.default_parameter_set (
            command_id, parameter_set_name, parameter_set)
        VALUES (3, '100%',
                ('{"reference":1,"sortmerna_e_value":1,' ||
                 '"sortmerna_max_pos":' ||
                 '10000,"similarity":1.0,"sortmerna_coverage":1.00,' ||
                 '"threads":1}')::json);
        INSERT INTO qiita.default_workflow_node (
            default_workflow_id, default_parameter_set_id)
        VALUES (2, 17);
        INSERT INTO qiita.default_workflow_edge (parent_id, child_id)
        VALUES (3, 9);
        INSERT INTO qiita.default_workflow_edge_connections (
            default_workflow_edge_id, parent_output_id, child_input_id)
        VALUES (5, 1, 3)"""
    perform_as_transaction(sql)

    # adding new expected values
    exp = deepcopy(WORKFLOWS)
    obs = _retrive_workflows(False)
    exp[0]['nodes'].extend([
        ['params_7', 1, 'Split libraries FASTQ', 'Defaults with reverse '
         'complement mapping file barcodes', {
            'max_bad_run_length': '3',
            'min_per_read_length_fraction': '0.75',
            'sequence_max_n': '0',
            'rev_comp_barcode': 'False',
            'rev_comp_mapping_barcodes': 'True',
            'rev_comp': 'False',
            'phred_quality_threshold': '3',
            'barcode_type': 'golay_12',
            'max_barcode_errors': '1.5',
            'phred_offset': 'auto'}],
        ['output_params_7_demultiplexed | Demultiplexed', 1,
         'demultiplexed | Demultiplexed'],
        ['params_8', 3, 'Pick closed-reference OTUs', 'Defaults', {
            'reference': '1',
            'sortmerna_e_value': '1',
            'sortmerna_max_pos': '10000',
            'similarity': '0.97',
            'sortmerna_coverage': '0.97',
            'threads': '1'}],
        ['output_params_8_OTU table | BIOM', 3, 'OTU table | BIOM']])
    exp[0]['edges'].extend([
        ['input_params_1_FASTQ | per_sample_FASTQ', 'params_7'],
        ['params_7', 'output_params_7_demultiplexed | Demultiplexed'],
        ['output_params_7_demultiplexed | Demultiplexed', 'params_8'],
        ['params_8', 'output_params_8_OTU table | BIOM']])
    exp[1]['nodes'].extend([
        ['params_9', 3, 'Pick closed-reference OTUs', '100%', {
            'reference': '1',
            'sortmerna_e_value': '1',
            'sortmerna_max_pos': '10000',
            'similarity': '1.0',
            'sortmerna_coverage': '1.0',
            'threads': '1'}],
        ['output_params_9_OTU table | BIOM', 3, 'OTU table | BIOM']])
    exp[1]['edges'].extend([
        ['output_params_3_demultiplexed | Demultiplexed', 'params_9'],
        ['params_9', 'output_params_9_OTU table | BIOM']])

    self.assertCountEqual(obs, exp)
# October 30th, 2017
# A change introduced in July made all the parameters to be stored as strings
# The DB needs to be patched so all the artifacts follow this structure
from json import dumps

from qiita_db.sql_connection import TRN

with TRN:
    sql = """SELECT *
             FROM qiita.artifact
                JOIN qiita.artifact_output_processing_job
                    USING (artifact_id)
             WHERE command_id IS NOT NULL"""
    TRN.add(sql)

    sql_update_artifact = """UPDATE qiita.artifact
                             SET command_parameters = %s
                             WHERE artifact_id = %s"""
    sql_update_job = """UPDATE qiita.processing_job
                        SET command_parameters = %s
                        WHERE processing_job_id = %s"""

    for ainfo in TRN.execute_fetchindex():
        ainfo = dict(ainfo)
        params = dumps(
            {k: str(v) for k, v in ainfo['command_parameters'].items()})
        TRN.add(sql_update_artifact, [params, ainfo['artifact_id']])
        TRN.add(sql_update_job, [params, ainfo['processing_job_id']])
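# A small sketch (hypothetical parameter values) of the normalization applied
# above: every parameter value is coerced to str before the JSON is written
# back, so integers, booleans and None all end up as strings in the DB.
from json import dumps

original = {'depth': 100, 'subsample_multinomial': False}
normalized = dumps({k: str(v) for k, v in original.items()})
assert normalized == '{"depth": "100", "subsample_multinomial": "False"}'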
# helper function to calculate checksum and file size
def calculate(finfo):
    try:
        size = getsize(finfo['fullpath'])
    except (FileNotFoundError, PermissionError):
        return finfo, None, None

    checksum = compute_checksum(finfo['fullpath'])
    return finfo['filepath_id'], checksum, size


# get all filepaths and their filepath information; takes ~10 min
with TRN:
    TRN.add("SELECT filepath_id FROM qiita.filepath")
    files = []
    for fid in TRN.execute_fetchflatten():
        files.append(get_filepath_information(fid))

# just get the filepath ids that haven't been processed, the file format
# of this file is filepath_id[tab]checksum[tab]filesize
fpath = join(dirname(abspath(__file__)), '74.py.cache.tsv')
processed = []
if exists(fpath):
    with open(fpath, 'r') as f:
        processed = [int(line.split('\t')[0])
                     for line in f.read().split('\n') if line != '']

files_curr = [f for f in files if f['filepath_id'] not in processed]
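# Illustration only (hypothetical values): the cache file consumed above is a
# headerless TSV with one 'filepath_id<tab>checksum<tab>filesize' row per
# already-processed filepath; a single row would look like the string below.
line = '%d\t%s\t%d\n' % (123, '9a0364b9e99bb480dd25e1f0284c8555', 2048)
assert line.split('\t')[0] == '123'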
def create_non_rarefied_biom_artifact(analysis, biom_data, rarefied_table):
    """Creates the initial non-rarefied BIOM artifact of the analysis

    Parameters
    ----------
    analysis : dict
        Dictionary with the analysis information
    biom_data : dict
        Dictionary with the biom file information
    rarefied_table : biom.Table
        The rarefied BIOM table

    Returns
    -------
    int
        The id of the new artifact
    """
    # The non-rarefied biom artifact is the initial biom table of the
    # analysis. This table does not currently exist anywhere, so we need to
    # actually create the BIOM file. To create this BIOM file we need: (1)
    # the samples and artifacts they come from and (2) whether the samples
    # were renamed or not. (1) is on the database, but we need to infer (2)
    # from the existing rarefied BIOM table. Fun, fun...
    with TRN:
        # Get the samples included in the BIOM table grouped by artifact id
        # Note that the analysis contains a BIOM table per data type included
        # in it, and the table analysis_sample does not differentiate between
        # datatypes, so we need to check the data type in the artifact table
        sql = """SELECT artifact_id, array_agg(sample_id)
                 FROM qiita.analysis_sample
                    JOIN qiita.artifact USING (artifact_id)
                 WHERE analysis_id = %s AND data_type_id = %s
                 GROUP BY artifact_id"""
        TRN.add(sql, [analysis['analysis_id'], biom_data['data_type_id']])
        samples_by_artifact = TRN.execute_fetchindex()

        # Create an empty BIOM table to be the new master table
        new_table = Table([], [], [])
        ids_map = {}
        for a_id, samples in samples_by_artifact:
            # Get the filepath of the BIOM table from the artifact
            artifact = Artifact(a_id)
            biom_fp = None
            for _, fp, fp_type in artifact.filepaths:
                if fp_type == 'biom':
                    biom_fp = fp
            # Note that we are sure that the biom table exists for sure, so
            # no need to check if biom_fp is undefined
            biom_table = load_table(biom_fp)
            samples = set(samples).intersection(biom_table.ids())
            biom_table.filter(samples, axis='sample', inplace=True)
            # we need to check if the table has samples left before merging
            if biom_table.shape[0] != 0 and biom_table.shape[1] != 0:
                new_table = new_table.merge(biom_table)
                ids_map.update({sid: "%d.%s" % (a_id, sid)
                                for sid in biom_table.ids()})

        # Check if we need to rename the sample ids in the biom table
        new_table_ids = set(new_table.ids())
        if not new_table_ids.issuperset(rarefied_table.ids()):
            # We need to rename the sample ids
            new_table.update_ids(ids_map, 'sample', True, True)

        sql = """INSERT INTO qiita.artifact
                    (generated_timestamp, data_type_id, visibility_id,
                     artifact_type_id, submitted_to_vamps)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING artifact_id"""
        # Magic number 4 -> visibility sandbox
        # Magic number 7 -> biom artifact type
        TRN.add(sql, [analysis['timestamp'], biom_data['data_type_id'],
                      4, 7, False])
        artifact_id = TRN.execute_fetchlast()

        # Associate the artifact with the analysis
        sql = """INSERT INTO qiita.analysis_artifact
                    (analysis_id, artifact_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [analysis['analysis_id'], artifact_id])

        # Link the artifact with its file
        dd_id, mp = get_mountpoint('BIOM')[0]
        dir_fp = join(get_db_files_base_dir(), mp, str(artifact_id))
        if not exists(dir_fp):
            makedirs(dir_fp)
        new_table_fp = join(dir_fp, "biom_table.biom")
        with biom_open(new_table_fp, 'w') as f:
            new_table.to_hdf5(f, "Generated by Qiita")

        sql = """INSERT INTO qiita.filepath
                    (filepath, filepath_type_id, checksum,
                     checksum_algorithm_id, data_directory_id)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING filepath_id"""
        # Magic number 7 -> filepath_type_id = 'biom'
        # Magic number 1 -> the checksum algorithm id
        TRN.add(sql, [basename(new_table_fp), 7,
                      compute_checksum(new_table_fp), 1, dd_id])
        fp_id = TRN.execute_fetchlast()

        sql = """INSERT INTO qiita.artifact_filepath
                    (artifact_id, filepath_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [artifact_id, fp_id])
        TRN.execute()

    return artifact_id
import pandas as pd

from os.path import join, dirname, abspath, exists

from qiita_db.sql_connection import TRN

with TRN:
    sql = """SELECT filepath_id FROM qiita.filepath"""
    TRN.add(sql)
    fids = TRN.execute_fetchflatten()

fpath = join(dirname(abspath(__file__)), 'support_files', 'patches',
             'python_patches', '74.py.cache.tsv')
if not exists(fpath):
    raise ValueError("%s doesn't exist, have you run step 1?" % fpath)

df = pd.read_csv(fpath, sep='\t', index_col=0, dtype=str,
                 names=['filepath_id', 'checksum', 'fp_size'])
cache = df.to_dict('index')

args = []
for fid in fids:
    if fid not in cache:
        print('missing: %d' % fid)
    else:
        args.append([cache[fid]['fp_size'], cache[fid]['checksum'], fid])

with TRN:
    sql = """UPDATE qiita.filepath
             SET fp_size = %s, checksum = %s
             WHERE filepath_id = %s"""
    TRN.add(sql, args, many=True)
    TRN.execute()
# -----------------------------------------------------------------------------
# Copyright (c) 2014--, The Qiita Development Team.
#
# Distributed under the terms of the BSD 3-clause License.
#
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------
from random import SystemRandom
from string import ascii_letters, digits

from qiita_db.sql_connection import TRN

pool = ascii_letters + digits
client_id = ''.join([SystemRandom().choice(pool) for _ in range(50)])
client_secret = ''.join([SystemRandom().choice(pool) for _ in range(255)])

with TRN:
    sql = """INSERT INTO qiita.oauth_identifiers (client_id, client_secret)
             VALUES (%s, %s)"""
    TRN.add(sql, [client_id, client_secret])

    sql = """INSERT INTO qiita.oauth_software (software_id, client_id)
             VALUES (%s, %s)"""
    TRN.add(sql, [1, client_id])

    TRN.execute()
# change in the near future, we feel that the easiest way to transfer
# the current analyses results is by creating 3 different types of
# artifacts: (1) distance matrix -> which will include the distance matrix,
# the principal coordinates and the emperor plots; (2) rarefaction
# curves -> which will include all the files generated by alpha rarefaction
# and (3) taxonomy summary, which will include all the files generated
# by summarize_taxa_through_plots.py
with TRN:
    # Add the new artifact types
    sql = """INSERT INTO qiita.artifact_type (
                artifact_type, description, can_be_submitted_to_ebi,
                can_be_submitted_to_vamps)
             VALUES (%s, %s, %s, %s)
             RETURNING artifact_type_id"""
    TRN.add(sql, ['beta_div_plots', 'Qiime 1 beta diversity results',
                  False, False])
    dm_atype_id = TRN.execute_fetchlast()
    TRN.add(sql, ['rarefaction_curves', 'Rarefaction curves', False, False])
    rc_atype_id = TRN.execute_fetchlast()
    TRN.add(sql, ['taxa_summary', 'Taxa summary plots', False, False])
    ts_atype_id = TRN.execute_fetchlast()

    # Associate each artifact with the filetypes that it accepts
    # At this time we are going to add them as directories, just as it is
    # done right now. We can make it fancier with the new type system.
    # Magic number 8: the filepath_type_id for the directory
    sql = """INSERT INTO qiita.artifact_type_filepath_type
                (artifact_type_id, filepath_type_id, required)
             VALUES (%s, %s, %s)"""
    sql_args = [[dm_atype_id, 8, True],
                [rc_atype_id, 8, True],
                [ts_atype_id, 8, True]]
from qiita_db.util import (convert_to_id, get_mountpoint,
                           get_mountpoint_path_by_id)

pool = ascii_letters + digits
tgz_id = convert_to_id("tgz", "filepath_type")
_id, analysis_mp = get_mountpoint('analysis')[0]

with TRN:
    # 2 and 3 are the ids of the 2 new software rows, the BIOM and
    # target gene type plugins
    for i in [2, 3]:
        client_id = ''.join([SystemRandom().choice(pool)
                             for _ in range(50)])
        client_secret = ''.join([SystemRandom().choice(pool)
                                 for _ in range(255)])

        sql = """INSERT INTO qiita.oauth_identifiers
                    (client_id, client_secret)
                 VALUES (%s, %s)"""
        TRN.add(sql, [client_id, client_secret])

        sql = """INSERT INTO qiita.oauth_software (software_id, client_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [i, client_id])
        TRN.execute()

    #
    # Generating compressed files for picking failures -- artifact_type = BIOM
    #

    sql = """SELECT artifact_id
             FROM qiita.artifact
                JOIN qiita.artifact_type USING (artifact_type_id)
             WHERE artifact_type = 'BIOM'"""
    TRN.add(sql)
    for r in TRN.execute_fetchindex():
def create_command(software, name, description, parameters, outputs=None,
                   analysis_only=False):
    r"""Replicates the Command.create code at the time the patch was written"""
    # Perform some sanity checks in the parameters dictionary
    if not parameters:
        raise QiitaDBError(
            "Error creating command %s. At least one parameter should "
            "be provided." % name)
    sql_param_values = []
    sql_artifact_params = []
    for pname, vals in parameters.items():
        if len(vals) != 2:
            raise QiitaDBError(
                "Malformed parameters dictionary, the format should be "
                "{param_name: [parameter_type, default]}. Found: "
                "%s for parameter name %s" % (vals, pname))

        ptype, dflt = vals
        # Check that the type is one of the supported types
        supported_types = ['string', 'integer', 'float', 'reference',
                           'boolean', 'prep_template', 'analysis']
        if ptype not in supported_types and not ptype.startswith(
                ('choice', 'mchoice', 'artifact')):
            supported_types.extend(['choice', 'mchoice', 'artifact'])
            raise QiitaDBError(
                "Unsupported parameters type '%s' for parameter %s. "
                "Supported types are: %s"
                % (ptype, pname, ', '.join(supported_types)))

        if ptype.startswith(('choice', 'mchoice')) and dflt is not None:
            choices = set(loads(ptype.split(':')[1]))
            dflt_val = dflt
            if ptype.startswith('choice'):
                # In the choice case, the dflt value is a single string;
                # create a list with the string in it to use the
                # issuperset call below
                dflt_val = [dflt_val]
            else:
                # jsonize the list to store it in the DB
                dflt = dumps(dflt)
            if not choices.issuperset(dflt_val):
                raise QiitaDBError(
                    "The default value '%s' for the parameter %s is not "
                    "listed in the available choices: %s"
                    % (dflt, pname, ', '.join(choices)))

        if ptype.startswith('artifact'):
            atypes = loads(ptype.split(':')[1])
            sql_artifact_params.append([pname, 'artifact', atypes])
        else:
            if dflt is not None:
                sql_param_values.append([pname, ptype, False, dflt])
            else:
                sql_param_values.append([pname, ptype, True, None])

    with TRN:
        sql = """SELECT EXISTS(SELECT *
                               FROM qiita.software_command
                               WHERE software_id = %s AND name = %s)"""
        TRN.add(sql, [software.id, name])
        if TRN.execute_fetchlast():
            raise QiitaDBDuplicateError(
                "command", "software: %d, name: %s" % (software.id, name))
        # Add the command to the DB
        sql = """INSERT INTO qiita.software_command
                    (name, software_id, description, is_analysis)
                 VALUES (%s, %s, %s, %s)
                 RETURNING command_id"""
        sql_params = [name, software.id, description, analysis_only]
        TRN.add(sql, sql_params)
        c_id = TRN.execute_fetchlast()

        # Add the parameters to the DB
        sql = """INSERT INTO qiita.command_parameter
                    (command_id, parameter_name, parameter_type, required,
                     default_value)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING command_parameter_id"""
        sql_params = [[c_id, pname, p_type, reqd, default]
                      for pname, p_type, reqd, default in sql_param_values]
        TRN.add(sql, sql_params, many=True)
        TRN.execute()

        # Add the artifact parameters
        sql_type = """INSERT INTO qiita.parameter_artifact_type
                        (command_parameter_id, artifact_type_id)
                      VALUES (%s, %s)"""
        supported_types = []
        for pname, p_type, atypes in sql_artifact_params:
            sql_params = [c_id, pname, p_type, True, None]
            TRN.add(sql, sql_params)
            pid = TRN.execute_fetchlast()
            sql_params = [[pid, convert_to_id(at, 'artifact_type')]
                          for at in atypes]
            TRN.add(sql_type, sql_params, many=True)
            supported_types.extend([atid for _, atid in sql_params])

        # If the software type is 'artifact definition', there are a couple
        # of extra steps
        if software.type == 'artifact definition':
            # If supported types is not empty, link the software with these
            # types
            if supported_types:
                sql = """INSERT INTO qiita.software_artifact_type
                            (software_id, artifact_type_id)
                         VALUES (%s, %s)"""
                sql_params = [[software.id, atid]
                              for atid in supported_types]
                TRN.add(sql, sql_params, many=True)
            # If this is the validate command, we need to add the
            # provenance and name parameters. These are used internally,
            # that's why we are adding them here
            if name == 'Validate':
                sql = """INSERT INTO qiita.command_parameter
                            (command_id, parameter_name, parameter_type,
                             required, default_value)
                         VALUES (%s, 'name', 'string', 'False', 'dflt_name'),
                                (%s, 'provenance', 'string', 'False', NULL)
                      """
                TRN.add(sql, [c_id, c_id])

        # Add the outputs to the command
        if outputs:
            sql = """INSERT INTO qiita.command_output
                        (name, command_id, artifact_type_id)
                     VALUES (%s, %s, %s)"""
            sql_args = [[pname, c_id, convert_to_id(at, 'artifact_type')]
                        for pname, at in outputs.items()]
            TRN.add(sql, sql_args, many=True)
            TRN.execute()

    return Command(c_id)
from os.path import basename

from qiita_db.sql_connection import TRN
from qiita_db.study import Study

for study in Study.iter():
    for pt in study.prep_templates():
        filepaths = pt.get_filepaths()
        if filepaths:
            # filepaths are returned in order so we can take the
            # oldest and newest; then we get the filename and parse the
            # creation time. Note that the filename comes in one of these
            # formats: 1_prep_1_qiime_19700101-000000.txt or
            # 1_prep_1_19700101-000000.txt
            oldest = basename(filepaths[-1][1])[-19:-4].replace('-', ' ')
            newest = basename(filepaths[0][1])[-19:-4].replace('-', ' ')

            with TRN:
                sql = """UPDATE qiita.prep_template
                         SET creation_timestamp = %s,
                             modification_timestamp = %s
                         WHERE prep_template_id = %s"""
                TRN.add(sql, [oldest, newest, pt.id])
                TRN.execute()
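# A quick illustration (not part of the patch, hypothetical path) of the
# filename slicing used above: the timestamp occupies the 15 characters right
# before the '.txt' extension, and the dash between date and time is swapped
# for a space so PostgreSQL can parse it as a timestamp.
from os.path import basename

fp = '/some/dir/1_prep_1_qiime_19700101-000000.txt'
stamp = basename(fp)[-19:-4].replace('-', ' ')
assert stamp == '19700101 000000'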
        for vv in v:
            to_merge[vv] = k
    merge_fn = (lambda id_, x: to_merge[id_]
                if id_ in to_merge else id_)
    t = t.collapse(merge_fn, norm=False, min_group_size=1,
                   axis='observation', collapse_f=collapse_f)
else:
    ids_to_replace = {c: c.upper() for c in current if c != c.upper()}
    t.update_ids(ids_to_replace, axis='observation', strict=False,
                 inplace=True)

with biom_open(biom, 'w') as f:
    t.to_hdf5(f, t.generated_by)

checksum = compute_checksum(biom)
TRN.add(sql, [checksum, ftps['biom'][0]])

fna = ftps['preprocessed_fasta'][1]
tmp = fna + '.tmp'
with open(tmp, 'w') as out:
    for seq in t.ids('observation'):
        out.write('>%s\n%s\n' % (seq, seq))
rename(tmp, fna)

checksum = compute_checksum(fna)
TRN.add(sql, [checksum, ftps['preprocessed_fasta'][0]])

TRN.execute()
    updated = list(map(lambda x: x.upper(), current))
    if len(set(updated)) != len(updated):
        print('************>', a.id, fp, '<**************')
    if set(current) ^ set(updated):
        print('Changing biom: ', a.id, fp)
        t.update_ids({i: i.upper() for i in t.ids('observation')},
                     axis='observation', inplace=True)

        with biom_open(fp, 'w') as f:
            t.to_hdf5(f, t.generated_by)

        checksum = compute_checksum(fp)
elif fpt == 'preprocessed_fasta':
    changed = False
    tmp = fp + '.tmp'
    with open(tmp, 'w') as out:
        for seq in read(fp, format='fasta'):
            seq = str(seq)
            sequ = seq.upper()
            out.write('>%s\n%s\n' % (sequ, sequ))
            if seq != sequ:
                changed = True

    if changed:
        print('Changing fasta: ', a.id, fp)
        rename(tmp, fp)
        checksum = compute_checksum(fp)
    else:
        remove(tmp)

if checksum is not None:
    TRN.add(sql, [checksum, _id])
    TRN.execute()
import pandas as pd

from os.path import getsize, join, dirname, abspath, exists

from qiita_db.util import get_filepath_information, compute_checksum
from qiita_db.sql_connection import TRN

with TRN:
    sql = """SELECT filepath_id FROM qiita.filepath"""
    TRN.add(sql)
    fids = TRN.execute_fetchflatten()

fpath = join(dirname(abspath(__file__)), 'support_files', 'patches',
             'python_patches', '74.py.cache.tsv')
cache = dict()
if exists(fpath):
    df = pd.read_csv(fpath, sep='\t', index_col=0, dtype=str,
                     names=['filepath_id', 'checksum', 'fp_size'])
    cache = df.to_dict('index')

for fid in fids:
    if fid not in cache:
        finfo = get_filepath_information(fid)
        try:
            size = getsize(finfo['fullpath'])
        except FileNotFoundError:
            size = 0

        try:
if cols_sample:
    with TRN:
        # a few notes: just getting the samples with duplicated values;
        # ignoring column 'sample_id' and tables 'study_sample',
        # 'prep_template', 'prep_template_sample'
        sql = """SELECT table_name, array_agg(column_name::text)
                 FROM information_schema.columns
                 WHERE column_name IN %s
                    AND table_name LIKE 'sample_%%'
                    AND table_name NOT IN ('prep_template',
                                           'prep_template_sample')
                 GROUP BY table_name"""
        # note that we are looking for those columns with duplicated names
        # in the headers
        TRN.add(sql, [tuple(set(cols_sample))])

        for table, columns in dict(TRN.execute_fetchindex()).items():
            # [1] the format is table_# so taking the #
            st = SampleTemplate(int(table.split('_')[1]))

            # getting just the columns of interest
            st_df = st.to_dataframe()[columns]

            # converting to datetime
            for col in columns:
                st_df[col] = st_df[col].apply(transform_date)

            st.update(st_df)

if cols_prep:
    with TRN:
        # a few notes: just getting the preps with duplicated values;
        # ignoring column 'sample_id' and tables 'study_sample',
        # 'prep_template', 'prep_template_sample'
# a few notes: just getting the preps with duplicated values; ignoring
# column 'sample_id' and tables 'study_sample', 'prep_template',
# 'prep_template_sample'
sql = """SELECT table_name, array_agg(column_name::text)
         FROM information_schema.columns
         WHERE column_name IN %s
            AND column_name != 'sample_id'
            AND table_name LIKE 'prep_%%'
            AND table_name NOT IN ('prep_template',
                                   'prep_template_sample')
         GROUP BY table_name"""
# note that we are looking for those columns with duplicated names in
# the headers
TRN.add(sql, [tuple(set(PrepTemplate.metadata_headers()) &
                    set(SampleTemplate.metadata_headers()))])
overlapping = dict(TRN.execute_fetchindex())

# finding actual duplicates
for table_name, cols in viewitems(overlapping):
    # leaving print so when we patch in the main system we know that
    # nothing was renamed or deal with that
    print(table_name)
    with TRN:
        for c in cols:
            sql = ('ALTER TABLE qiita.%s '
                   'RENAME COLUMN %s TO %s_renamed' % (table_name, c, c))
            TRN.add(sql)
        TRN.execute()
# Nov 28, 2017 (only in py file)
# Adding a new command into Qiita/Alpha: delete_analysis
from qiita_db.software import Software, Command
from qiita_db.sql_connection import TRN

# Create the delete analysis command
Command.create(Software.from_name_and_version('Qiita', 'alpha'),
               'delete_analysis', 'Deletes a full analysis',
               {'analysis_id': ['integer', None]})

# Make sure that all validate commands have the "analysis" parameter
with TRN:
    # Get all validate commands that are missing the analysis parameter
    sql = """SELECT command_id
             FROM qiita.software_command sc
             WHERE name = 'Validate' AND NOT (
                SELECT EXISTS(SELECT *
                              FROM qiita.command_parameter
                              WHERE parameter_name = 'analysis'
                                AND command_id = sc.command_id));"""
    TRN.add(sql)

    sql = """INSERT INTO qiita.command_parameter
                (command_id, parameter_name, parameter_type, required,
                 default_value, name_order, check_biom_merge)
             VALUES (%s, %s, %s, %s, %s, %s, %s)"""
    sql_params = [[cmd_id, 'analysis', 'analysis', False, None, None, False]
                  for cmd_id in TRN.execute_fetchflatten()]
    TRN.add(sql, sql_params, many=True)
    TRN.execute()
def transfer_job(analysis, command_id, params, input_artifact_id, job_data,
                 cmd_out_id, biom_data, output_artifact_type_id):
    """Transfers the job from the old structure to the plugin structure

    Parameters
    ----------
    analysis : dict
        The analysis information
    command_id : int
        The id of the command executed
    params : str
        The parameters used in the job
    input_artifact_id : int
        The id of the input artifact
    job_data : dict
        The job information
    cmd_out_id : int
        The id of the command's output
    biom_data : dict
        The biom information
    output_artifact_type_id : int
        The type of the output artifact
    """
    with TRN:
        # Create the job
        # Add the row in the processing job table
        # Magic number 3: status -> success
        sql = """INSERT INTO qiita.processing_job
                    (email, command_id, command_parameters,
                     processing_job_status_id)
                 VALUES (%s, %s, %s, %s)
                 RETURNING processing_job_id"""
        TRN.add(sql, [analysis['email'], command_id, params, 3])
        job_id = TRN.execute_fetchlast()

        # Link the job with the input artifact
        sql = """INSERT INTO qiita.artifact_processing_job
                    (artifact_id, processing_job_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [input_artifact_id, job_id])

        # Check if the executed job has results and add them
        sql = """SELECT EXISTS(SELECT *
                               FROM qiita.job_results_filepath
                               WHERE job_id = %s)"""
        TRN.add(sql, [job_data['job_id']])
        if TRN.execute_fetchlast():
            # There are results for the current job.
            # Transfer the job files to a new artifact
            sql = """SELECT filepath_id
                     FROM qiita.job_results_filepath
                     WHERE job_id = %s"""
            TRN.add(sql, [job_data['job_id']])
            filepath_id = TRN.execute_fetchlast()
            artifact_id = transfer_file_to_artifact(
                analysis['analysis_id'], analysis['timestamp'], command_id,
                biom_data['data_type_id'], params, output_artifact_type_id,
                filepath_id)

            # Link the artifact with its parent
            sql = """INSERT INTO qiita.parent_artifact
                        (artifact_id, parent_id)
                     VALUES (%s, %s)"""
            TRN.add(sql, [artifact_id, input_artifact_id])
            # Link the artifact as the job output
            sql = """INSERT INTO qiita.artifact_output_processing_job
                        (artifact_id, processing_job_id, command_output_id)
                     VALUES (%s, %s, %s)"""
            TRN.add(sql, [artifact_id, job_id, cmd_out_id])
            TRN.execute()
        else:
            # There are no results on the current job, so mark it as error
            if job_data['log_id'] is None:
                # Magic number 2 - we are not using any other severity
                # level, so keep using number 2
                sql = """INSERT INTO qiita.logging (time, severity_id, msg)
                         VALUES (%s, %s, %s)
                         RETURNING logging_id"""
                TRN.add(sql, [analysis['timestamp'], 2,
                              "Unknown error - patch 47"])
                log_id = TRN.execute_fetchlast()
            else:
                log_id = job_data['log_id']

            # Magic number 4 -> status -> error
            sql = """UPDATE qiita.processing_job
                     SET processing_job_status_id = 4, logging_id = %s
                     WHERE processing_job_id = %s"""
            TRN.add(sql, [log_id, job_id])
from qiita_db.sql_connection import TRN

# Due to the size of these changes we will use more than one transaction
with TRN:
    # select all table names from all sample and prep templates
    sql = """SELECT DISTINCT table_name
             FROM information_schema.columns
             WHERE (table_name LIKE 'sample_%'
                    OR table_name LIKE 'prep_%')
                AND table_name NOT LIKE '%template%'"""
    TRN.add(sql)
    all_tables = TRN.execute_fetchflatten()

for table in all_tables:
    with TRN:
        sql = """SELECT column_name
                 FROM information_schema.columns
                 WHERE table_name = %s
                 ORDER BY column_name"""
        TRN.add(sql, [table])

        for column in TRN.execute_fetchflatten():
            sql = "ALTER TABLE qiita.%s ALTER COLUMN %s TYPE VARCHAR" % (
                table, column)
            TRN.add(sql)

        TRN.execute()
def postgres_test(**kwargs):
    """Open a connection and query postgres"""
    from qiita_db.sql_connection import TRN

    with TRN:
        TRN.add("SELECT 42")
        return TRN.execute_fetchflatten()[0]
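# A small usage sketch (assumed, not from the source): the helper simply
# round-trips a constant through the database, so a healthy connection
# returns 42 and any connectivity problem surfaces as an exception.
if __name__ == '__main__':
    print(postgres_test())  # expected output: 42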