def test_execute_fetchindex(self):
    with TRN:
        sql = """INSERT INTO qiita.test_table (str_column, int_column)
                 VALUES (%s, %s) RETURNING str_column, int_column"""
        args = [["insert1", 1], ["insert2", 2], ["insert3", 3]]
        TRN.add(sql, args, many=True)
        self.assertEqual(TRN.execute_fetchindex(), [["insert3", 3]])

        sql = """INSERT INTO qiita.test_table (str_column, int_column)
                 VALUES (%s, %s) RETURNING str_column, int_column"""
        args = [["insert4", 4], ["insert5", 5], ["insert6", 6]]
        TRN.add(sql, args, many=True)
        self.assertEqual(TRN.execute_fetchindex(3), [["insert4", 4]])
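# As the assertions above suggest, execute_fetchindex(idx) returns the result
# rows of the idx-th query added to the transaction (default -1, i.e. the
# last one). With many=True each parameter set is queued as its own indexed
# query, so the three inserts above occupy indices 0-2, the default fetch
# returns the rows of the last one ([["insert3", 3]]), and index 3 is the
# first insert of the second batch ([["insert4", 4]]).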
def update(self, md_template):
    r"""Update values in the template

    Parameters
    ----------
    md_template : DataFrame
        The metadata template file contents indexed by sample ids

    Raises
    ------
    QiitaDBError
        If md_template and db do not have the same sample ids
        If md_template and db do not have the same column headers
        If self.can_be_updated is not True
    """
    with TRN:
        # Clean and validate the metadata template given
        new_map = self._clean_validate_template(md_template, self.study_id,
                                                self.columns_restrictions)
        # Retrieving current metadata
        sql = "SELECT * FROM qiita.{0}".format(self._table_name(self.id))
        TRN.add(sql)
        current_map = self._transform_to_dict(TRN.execute_fetchindex())
        current_map = pd.DataFrame.from_dict(current_map, orient="index")

        # simple validations of sample ids and column names
        samples_diff = set(new_map.index).difference(current_map.index)
        if samples_diff:
            raise QiitaDBError(
                "The new template differs from what is stored "
                "in database by these sample names: %s"
                % ", ".join(samples_diff))

        columns_diff = set(new_map.columns).difference(current_map.columns)
        if columns_diff:
            raise QiitaDBError(
                "The new template differs from what is stored "
                "in database by these column names: %s"
                % ", ".join(columns_diff))

        # here we are comparing two dataframes following:
        # http://stackoverflow.com/a/17095620/4228285
        current_map.sort(axis=0, inplace=True)
        current_map.sort(axis=1, inplace=True)
        new_map.sort(axis=0, inplace=True)
        new_map.sort(axis=1, inplace=True)
        map_diff = (current_map != new_map).stack()
        map_diff = map_diff[map_diff]
        map_diff.index.names = ["id", "column"]
        changed_cols = map_diff.index.get_level_values("column").unique()

        if not self.can_be_updated(columns=set(changed_cols)):
            raise QiitaDBError(
                "The new template is modifying fields that cannot be "
                "modified. Try removing the target gene fields or "
                "deleting the processed data. You are trying to modify: %s"
                % ", ".join(changed_cols))

        for col in changed_cols:
            self.update_category(col, new_map[col].to_dict())

        self.generate_files()
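# A minimal, self-contained sketch of the dataframe-diff technique used by
# update() above (the stackoverflow approach referenced in its comments).
# The frames and values here are made up for illustration; note that modern
# pandas spells the sorting as sort_index, while the method above uses the
# pre-0.20 DataFrame.sort API.
import pandas as pd

current = pd.DataFrame({"ph": [7.0, 6.5], "depth": [1, 2]},
                       index=["sample1", "sample2"])
new = pd.DataFrame({"ph": [7.0, 8.0], "depth": [1, 2]},
                   index=["sample1", "sample2"])

# Align rows and columns the same way, then stack the boolean inequality
# mask so each True entry is a (sample, column) pair whose value changed
current = current.sort_index(axis=0).sort_index(axis=1)
new = new.sort_index(axis=0).sort_index(axis=1)
diff = (current != new).stack()
diff = diff[diff]
diff.index.names = ["id", "column"]
print(diff.index.get_level_values("column").unique())  # -> Index(['ph'])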
def status(self):
    """The status of the prep template

    Returns
    -------
    str
        The status of the prep template

    Notes
    -----
    The status of a prep template is inferred from the status of the
    processed data generated from this prep template. If no processed
    data has been generated with this prep template, then the status
    is 'sandbox'.
    """
    with TRN:
        sql = """SELECT processed_data_status
                 FROM qiita.processed_data_status pds
                   JOIN qiita.processed_data pd
                     USING (processed_data_status_id)
                   JOIN qiita.preprocessed_processed_data ppd_pd
                     USING (processed_data_id)
                   JOIN qiita.prep_template_preprocessed_data pt_ppd
                     USING (preprocessed_data_id)
                 WHERE pt_ppd.prep_template_id=%s"""
        TRN.add(sql, [self._id])

        return infer_status(TRN.execute_fetchindex())
def _generate_study_list_for_api(visibility, only_biom=True):
    """Get general study information

    Parameters
    ----------
    visibility : str
        The visibility of the studies to retrieve
    only_biom : bool, optional
        If True, restrict the artifacts to those of type BIOM; default True

    Returns
    -------
    dict of {int: list of int}
        The artifact ids of each study, keyed by study id
    """
    artifact_type = ''
    if only_biom:
        artifact_type = "AND artifact_type = 'BIOM'"

    sql = f"""
        SELECT study_id, array_agg(DISTINCT artifact_id)
        FROM qiita.study
        INNER JOIN qiita.study_artifact USING (study_id)
        INNER JOIN qiita.artifact USING (artifact_id)
        INNER JOIN qiita.artifact_type USING (artifact_type_id)
        INNER JOIN qiita.visibility USING (visibility_id)
        WHERE visibility = %s {artifact_type}
        GROUP BY study_id
    """
    with TRN:
        TRN.add(sql, [visibility])
        return dict(TRN.execute_fetchindex())
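# A hypothetical usage sketch (the study and artifact ids are made up):
#
#   >>> _generate_study_list_for_api('public')
#   {1: [4, 5, 6], 2: [8]}
#
# Because execute_fetchindex() yields (study_id, array_agg(...)) rows,
# dict() turns the two-column result directly into that mapping.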
def _update_accession_numbers(self, column, values):
    """Update accession numbers stored in `column` with the ones in `values`

    Parameters
    ----------
    column : str
        The column name where the accession numbers are stored
    values : dict of {str: str}
        The accession numbers keyed by sample id

    Raises
    ------
    QiitaDBError
        If a sample in `values` already has an accession number
    QiitaDBWarning
        If `values` is not updating any accession number
    """
    with TRN:
        sql = """SELECT sample_id, {0}
                 FROM qiita.{1}
                 WHERE {2}=%s AND {0} IS NOT NULL""".format(
            column, self._table, self._id_column)
        TRN.add(sql, [self.id])
        db_vals = {sample_id: accession
                   for sample_id, accession in TRN.execute_fetchindex()}
        common_samples = set(db_vals) & set(values)
        diff = [sample for sample in common_samples
                if db_vals[sample] != values[sample]]
        if diff:
            raise QiitaDBError(
                "The following samples already have an accession number: "
                "%s" % ', '.join(diff))

        # Remove the common samples from the values dictionary
        values = deepcopy(values)
        for sample in common_samples:
            del values[sample]

        if values:
            sql_vals = ', '.join(["(%s, %s)"] * len(values))
            sql = """UPDATE qiita.{0} AS t
                     SET {1}=c.{1}
                     FROM (VALUES {2}) AS c(sample_id, {1})
                     WHERE c.sample_id = t.sample_id
                        AND t.{3} = %s""".format(self._table, column,
                                                 sql_vals, self._id_column)
            sql_vals = list(chain.from_iterable(values.items()))
            sql_vals.append(self.id)
            TRN.add(sql, sql_vals)
            TRN.execute()
        else:
            warnings.warn("No new accession numbers to update",
                          QiitaDBWarning)
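# A minimal sketch of the single-statement bulk UPDATE pattern used above,
# written against plain psycopg2. The table, column, and connection string
# are hypothetical stand-ins, not part of the Qiita schema.
from itertools import chain

import psycopg2

values = {'sample1': 'ERS0001', 'sample2': 'ERS0002'}
placeholders = ', '.join(['(%s, %s)'] * len(values))
sql = """UPDATE my_schema.my_table AS t
         SET ebi_accession = c.ebi_accession
         FROM (VALUES {0}) AS c(sample_id, ebi_accession)
         WHERE c.sample_id = t.sample_id""".format(placeholders)

with psycopg2.connect("dbname=test") as conn, conn.cursor() as cur:
    # flatten {'sample1': 'ERS0001', ...} into [sample1, ERS0001, ...] so
    # each (%s, %s) pair in the VALUES list is bound in order
    cur.execute(sql, list(chain.from_iterable(values.items())))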
def raw_data(self):
    with TRN:
        sql = """SELECT raw_data_id
                 FROM qiita.prep_template
                 WHERE prep_template_id=%s"""
        TRN.add(sql, [self.id])
        result = TRN.execute_fetchindex()
        if result:
            # If there is any result, it will be in the first row
            # and in the first element of that row, thus [0][0]
            return result[0][0]
        return None
def __call__(self, searchstr, user):
    """Runs a Study query and returns matching studies and samples

    Parameters
    ----------
    searchstr : str
        Search string to use
    user : User object
        User making the search. Needed for permissions checks.

    Returns
    -------
    dict
        Found samples in format
        {study_id: [[samp_id1, meta1, meta2, ...],
                    [samp_id2, meta1, meta2, ...], ...]}
    list
        metadata column names searched for

    Notes
    -----
    Metadata information for each sample is in the same order as the
    metadata columns list returned

    Metadata column names and string searches are case-sensitive
    """
    with TRN:
        study_sql, sample_sql, meta_headers = \
            self._parse_study_search_string(searchstr, True)

        # get all studies containing the metadata headers requested
        TRN.add(study_sql)
        study_ids = set(TRN.execute_fetchflatten())

        # strip to only studies user has access to
        if user.level not in {'admin', 'dev', 'superuser'}:
            study_ids = study_ids.intersection(
                Study.get_by_status('public') | user.user_studies |
                user.shared_studies)

        results = {}
        # run search on each study to get out the matching samples
        for sid in study_ids:
            TRN.add(sample_sql.format(sid))
            study_res = TRN.execute_fetchindex()
            if study_res:
                # only add the study to results if it actually has
                # matching samples
                results[sid] = study_res
        self.results = results
        self.meta_headers = meta_headers
        return results, meta_headers
def _to_dict(self):
    r"""Returns the categories and their values in a dictionary

    Returns
    -------
    dict of {str: str}
        A dictionary of the form {category: value}
    """
    with TRN:
        sql = "SELECT * FROM qiita.{0} WHERE sample_id=%s".format(
            self._dynamic_table)
        TRN.add(sql, [self._id])
        d = dict(TRN.execute_fetchindex()[0])

        # Remove the sample_id; it is not part of the metadata
        del d["sample_id"]

        return d
def to_dataframe(self):
    """Returns the metadata template as a dataframe

    Returns
    -------
    pandas DataFrame
        The metadata in the template, indexed on sample id
    """
    with TRN:
        cols = sorted(get_table_cols(self._table_name(self._id)))
        # Get all metadata for the template; the table name is interpolated
        # here, so the query has no placeholders left to bind arguments to
        sql = "SELECT {0} FROM qiita.{1}".format(
            ", ".join(cols), self._table_name(self._id))
        TRN.add(sql)
        meta = TRN.execute_fetchindex()

        # Create the dataframe and clean it up a bit
        df = pd.DataFrame((list(x) for x in meta), columns=cols)
        df.set_index("sample_id", inplace=True, drop=True)

        return df
def _get_accession_numbers(self, column):
    """Return the accession numbers stored in `column`

    Parameters
    ----------
    column : str
        The column name where the accession number is stored

    Returns
    -------
    dict of {str: str}
        The accession numbers keyed by sample id
    """
    with TRN:
        sql = """SELECT sample_id, {0}
                 FROM qiita.{1}
                 WHERE {2}=%s""".format(column, self._table,
                                        self._id_column)
        TRN.add(sql, [self.id])
        result = {sample_id: accession
                  for sample_id, accession in TRN.execute_fetchindex()}
    return result
def qiime_map_fp(self):
    """The QIIME mapping filepath attached to the prep template

    Returns
    -------
    str
        The filepath of the QIIME mapping file
    """
    with TRN:
        sql = """SELECT filepath_id, filepath
                 FROM qiita.filepath
                   JOIN qiita.{0} USING (filepath_id)
                   JOIN qiita.filepath_type USING (filepath_type_id)
                 WHERE {1} = %s AND filepath_type = 'qiime_map'
                 ORDER BY filepath_id DESC""".format(self._filepath_table,
                                                     self._id_column)
        TRN.add(sql, [self._id])
        # We know that the good filepath is the one in the first row
        # because we sorted them in the SQL query
        fn = TRN.execute_fetchindex()[0][1]
        base_dir = get_mountpoint('templates')[0][1]
        return join(base_dir, fn)
def get_filepaths(self):
    r"""Retrieves the list of (filepath_id, filepath)"""
    with TRN:
        try:
            sql = """SELECT filepath_id, filepath
                     FROM qiita.filepath
                     WHERE filepath_id IN (
                         SELECT filepath_id FROM qiita.{0}
                         WHERE {1}=%s)
                     ORDER BY filepath_id DESC""".format(
                self._filepath_table, self._id_column)
            TRN.add(sql, [self.id])
            filepath_ids = TRN.execute_fetchindex()
        except Exception as e:
            LogEntry.create('Runtime', str(e),
                            info={self.__class__.__name__: self.id})
            raise e

        _, fb = get_mountpoint('templates')[0]
        base_fp = partial(join, fb)

        return [(fpid, base_fp(fp)) for fpid, fp in filepath_ids]
def create_non_rarefied_biom_artifact(analysis, biom_data, rarefied_table):
    """Creates the initial non-rarefied BIOM artifact of the analysis

    Parameters
    ----------
    analysis : dict
        Dictionary with the analysis information
    biom_data : dict
        Dictionary with the biom file information
    rarefied_table : biom.Table
        The rarefied BIOM table

    Returns
    -------
    int
        The id of the new artifact
    """
    # The non-rarefied biom artifact is the initial biom table of the
    # analysis. This table does not currently exist anywhere, so we need to
    # actually create the BIOM file. To create this BIOM file we need:
    # (1) the samples and artifacts they come from and (2) whether the
    # samples were renamed or not. (1) is in the database, but we need to
    # infer (2) from the existing rarefied BIOM table. Fun, fun...
    with TRN:
        # Get the samples included in the BIOM table grouped by artifact id
        # Note that the analysis contains a BIOM table per data type included
        # in it, and the table analysis_sample does not differentiate between
        # datatypes, so we need to check the data type in the artifact table
        sql = """SELECT artifact_id, array_agg(sample_id)
                 FROM qiita.analysis_sample
                   JOIN qiita.artifact USING (artifact_id)
                 WHERE analysis_id = %s AND data_type_id = %s
                 GROUP BY artifact_id"""
        TRN.add(sql, [analysis['analysis_id'], biom_data['data_type_id']])
        samples_by_artifact = TRN.execute_fetchindex()

        # Create an empty BIOM table to be the new master table
        new_table = Table([], [], [])
        ids_map = {}
        for a_id, samples in samples_by_artifact:
            # Get the filepath of the BIOM table from the artifact
            artifact = Artifact(a_id)
            biom_fp = None
            for _, fp, fp_type in artifact.filepaths:
                if fp_type == 'biom':
                    biom_fp = fp
            # Note that we are sure the biom table exists, so there is no
            # need to check whether biom_fp is undefined
            biom_table = load_table(biom_fp)
            samples = set(samples).intersection(biom_table.ids())
            biom_table.filter(samples, axis='sample', inplace=True)
            # we need to check if the table has samples left before merging
            if biom_table.shape[0] != 0 and biom_table.shape[1] != 0:
                new_table = new_table.merge(biom_table)
                ids_map.update({sid: "%d.%s" % (a_id, sid)
                                for sid in biom_table.ids()})

        # Check if we need to rename the sample ids in the biom table
        new_table_ids = set(new_table.ids())
        if not new_table_ids.issuperset(rarefied_table.ids()):
            # We need to rename the sample ids
            new_table.update_ids(ids_map, 'sample', True, True)

        sql = """INSERT INTO qiita.artifact
                    (generated_timestamp, data_type_id, visibility_id,
                     artifact_type_id, submitted_to_vamps)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING artifact_id"""
        # Magic number 4 -> visibility sandbox
        # Magic number 7 -> biom artifact type
        TRN.add(sql, [analysis['timestamp'], biom_data['data_type_id'],
                      4, 7, False])
        artifact_id = TRN.execute_fetchlast()

        # Associate the artifact with the analysis
        sql = """INSERT INTO qiita.analysis_artifact
                    (analysis_id, artifact_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [analysis['analysis_id'], artifact_id])

        # Link the artifact with its file
        dd_id, mp = get_mountpoint('BIOM')[0]
        dir_fp = join(get_db_files_base_dir(), mp, str(artifact_id))
        if not exists(dir_fp):
            makedirs(dir_fp)
        new_table_fp = join(dir_fp, "biom_table.biom")
        with biom_open(new_table_fp, 'w') as f:
            new_table.to_hdf5(f, "Generated by Qiita")

        sql = """INSERT INTO qiita.filepath
                    (filepath, filepath_type_id, checksum,
                     checksum_algorithm_id, data_directory_id)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING filepath_id"""
        # Magic number 7 -> filepath_type_id = 'biom'
        # Magic number 1 -> the checksum algorithm id
        TRN.add(sql, [basename(new_table_fp), 7,
                      compute_checksum(new_table_fp), 1, dd_id])
        fp_id = TRN.execute_fetchlast()
        sql = """INSERT INTO qiita.artifact_filepath
                    (artifact_id, filepath_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [artifact_id, fp_id])
        TRN.execute()

    return artifact_id
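# A small sketch of the biom.Table operations the function above relies on:
# filter to a sample subset, merge into an initially empty master table (as
# the patch does), and rename the sample ids with an artifact-id prefix.
# The counts and artifact ids here are invented for illustration.
import numpy as np
from biom import Table

t1 = Table(np.array([[1, 2], [3, 4]]), ['O1', 'O2'], ['sA', 'sB'])
t2 = Table(np.array([[5], [6]]), ['O1', 'O2'], ['sC'])

new_table = Table([], [], [])
ids_map = {}
for a_id, t in [(1, t1), (2, t2)]:
    t.filter({'sA', 'sB', 'sC'}, axis='sample', inplace=True)
    if t.shape[0] != 0 and t.shape[1] != 0:
        new_table = new_table.merge(t)
        ids_map.update({sid: "%d.%s" % (a_id, sid) for sid in t.ids()})

# 'sA' -> '1.sA', 'sB' -> '1.sB', 'sC' -> '2.sC'
new_table.update_ids(ids_map, 'sample', True, True)
print(list(new_table.ids()))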
'{"max_rare_depth": "Default", "tree": "", "num_steps": 10, ' '"min_rare_depth": 10, "metrics": ["chao1", "observed_otus"]}' ], [srare_cmd_id, 'Defaults', '{"subsample_multinomial": "False"}'] ] TRN.add(sql, sql_args, many=True) # At this point we are ready to start transferring the data from the old # structures to the new structures. Overview of the procedure: # Step 1: Add initial set of artifacts up to rarefied table # Step 2: Transfer the "analisys jobs" to processing jobs and create # the analysis artifacts db_dir = get_db_files_base_dir() with TRN: sql = "SELECT * FROM qiita.analysis" TRN.add(sql) analysis_info = TRN.execute_fetchindex() # Loop through all the analysis for analysis in analysis_info: # Step 1: Add the inital set of artifacts. An analysis starts with # a set of BIOM artifacts. sql = """SELECT * FROM qiita.analysis_filepath JOIN qiita.filepath USING (filepath_id) JOIN qiita.filepath_type USING (filepath_type_id) WHERE analysis_id = %s AND filepath_type = 'biom'""" TRN.add(sql, [analysis['analysis_id']]) analysis_bioms = TRN.execute_fetchindex() # Loop through all the biom tables associated with the current analysis # so we can create the initial set of artifacts
if cols_sample:
    with TRN:
        # a few notes: just getting the sample templates with duplicated
        # values; ignoring column 'sample_id' and tables 'study_sample',
        # 'prep_template', 'prep_template_sample'
        sql = """SELECT table_name, array_agg(column_name::text)
                 FROM information_schema.columns
                 WHERE column_name IN %s
                     AND table_name LIKE 'sample_%%'
                     AND table_name NOT IN ('prep_template',
                                            'prep_template_sample')
                 GROUP BY table_name"""
        # note that we are looking for those columns with duplicated names
        # in the headers
        TRN.add(sql, [tuple(set(cols_sample))])

        for table, columns in viewitems(dict(TRN.execute_fetchindex())):
            # [1] the format is table_# so taking the #
            st = SampleTemplate(int(table.split('_')[1]))

            # getting just the columns of interest
            st_df = st.to_dataframe()[columns]

            # converting to datetime
            for col in columns:
                st_df[col] = st_df[col].apply(transform_date)

            st.update(st_df)

if cols_prep:
    with TRN:
        # a few notes: just getting the preps with duplicated values;
        # ignoring column 'sample_id' and tables 'study_sample',
        # 'prep_template', 'prep_template_sample'
        sql = """SELECT table_name, array_agg(column_name::text)
pc_update_sql = """UPDATE qiita.prep_columns
                   SET column_type = 'bool'
                   WHERE prep_template_id = %s AND column_name = %s"""

for table in tables:
    table_id = table.split("_")[1]
    # Change NaN values to NULL in database
    TRN.add(cols_sql, [table])
    cols = TRN.execute_fetchflatten()
    for col in cols:
        TRN.add(null_sql.format(table, col), [nans])
    TRN.execute()

    # Now update the boolean columns to bool in the database
    TRN.add("SELECT {0} FROM qiita.{1}".format(",".join(cols), table))
    col_vals = zip(*TRN.execute_fetchindex())
    for col, vals in zip(cols, col_vals):
        if set(vals) == {None}:
            # Ignore columns that are all NULL
            continue
        if all([v in bool_vals for v in vals]):
            # Every value in the column is a bool, so convert it
            TRN.add(alter_sql.format(table, col),
                    [false_vals, true_vals])
            if "sample" in table:
                st_update.add(table_id)
                TRN.add(ssc_update_sql, [table_id, col])
            else:
                pr_update.add(table_id)
                TRN.add(pc_update_sql, [table_id, col])
    TRN.execute()
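# The column-wise inspection above hinges on transposing the fetched rows
# into columns with zip(*rows). A tiny standalone illustration with made-up
# data:
rows = [('true', '1'), ('false', '2'), ('true', '3')]
col_vals = zip(*rows)  # -> ('true', 'false', 'true'), ('1', '2', '3')
bool_vals = {'true', 'false', None}
for vals in col_vals:
    print(all(v in bool_vals for v in vals))  # True, then False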
TRN.add(sql, [client_id, client_secret])
sql = """INSERT INTO qiita.oauth_software (software_id, client_id)
         VALUES (%s, %s)"""
TRN.add(sql, [i, client_id])
TRN.execute()

#
# Generating compressed files for picking failures -- artifact_type = BIOM
#

sql = """SELECT artifact_id
         FROM qiita.artifact
           JOIN qiita.artifact_type USING (artifact_type_id)
         WHERE artifact_type = 'BIOM'"""
TRN.add(sql)
for r in TRN.execute_fetchindex():
    to_tgz = None
    a = Artifact(r[0])
    for x in a.filepaths:
        if x['fp_type'] == 'directory':
            # removing / from the path if it exists
            to_tgz = x['fp'][:-1] if x['fp'][-1] == '/' else x['fp']
            break

    if to_tgz is None:
        continue

    tgz = to_tgz + '.tgz'
    if not exists(tgz):
        with taropen(tgz, "w:gz") as tar:
            tar.add(to_tgz, arcname=basename(to_tgz))
# October 30th, 2017
# A change introduced in July made all the parameters be stored as strings;
# the DB needs to be patched so all the artifacts follow this structure

from json import dumps

from qiita_db.sql_connection import TRN

with TRN:
    sql = """SELECT *
             FROM qiita.artifact
               JOIN qiita.artifact_output_processing_job
                 USING (artifact_id)
             WHERE command_id IS NOT NULL"""
    TRN.add(sql)

    sql_update_artifact = """UPDATE qiita.artifact
                             SET command_parameters = %s
                             WHERE artifact_id = %s"""
    sql_update_job = """UPDATE qiita.processing_job
                        SET command_parameters = %s
                        WHERE processing_job_id = %s"""
    for ainfo in TRN.execute_fetchindex():
        ainfo = dict(ainfo)
        params = dumps(
            {k: str(v) for k, v in ainfo['command_parameters'].items()})
        TRN.add(sql_update_artifact, [params, ainfo['artifact_id']])
        TRN.add(sql_update_job, [params, ainfo['processing_job_id']])
def create_qiime_mapping_file(self):
    """This creates the QIIME mapping file and links it in the db.

    Returns
    -------
    filepath : str
        The filepath of the created QIIME mapping file

    Raises
    ------
    ValueError
        If the prep template is not a subset of the sample template
    QiitaDBWarning
        If the QIIME-required columns are not present in the template

    Notes
    -----
    We cannot ensure that the QIIME-required columns are present in the
    metadata map. However, we have to generate a QIIME-compliant mapping
    file. Since the user may need a QIIME mapping file, but not these
    QIIME-required columns, we are going to create them and populate them
    with the value XXQIITAXX.
    """
    with TRN:
        rename_cols = {
            'barcode': 'BarcodeSequence',
            'primer': 'LinkerPrimerSequence',
            'description': 'Description',
        }

        if 'reverselinkerprimer' in self.categories():
            rename_cols['reverselinkerprimer'] = 'ReverseLinkerPrimer'
            new_cols = ['BarcodeSequence', 'LinkerPrimerSequence',
                        'ReverseLinkerPrimer']
        else:
            new_cols = ['BarcodeSequence', 'LinkerPrimerSequence']

        # getting the latest sample template
        sql = """SELECT filepath_id, filepath
                 FROM qiita.filepath
                   JOIN qiita.sample_template_filepath USING (filepath_id)
                 WHERE study_id=%s
                 ORDER BY filepath_id DESC"""
        TRN.add(sql, [self.study_id])
        # We know that the good filepath is the one in the first row
        # because we sorted them in the SQL query
        sample_template_fname = TRN.execute_fetchindex()[0][1]
        _, fp = get_mountpoint('templates')[0]
        sample_template_fp = join(fp, sample_template_fname)

        # reading files via pandas
        st = load_template_to_dataframe(sample_template_fp)
        pt = self.to_dataframe()

        st_sample_names = set(st.index)
        pt_sample_names = set(pt.index)

        if not pt_sample_names.issubset(st_sample_names):
            raise ValueError(
                "Prep template is not a subset of the sample template, "
                "file: %s - samples: %s"
                % (sample_template_fp,
                   ', '.join(pt_sample_names - st_sample_names)))

        mapping = pt.join(st, lsuffix="_prep")
        mapping.rename(columns=rename_cols, inplace=True)

        # Pre-populate the QIIME-required columns with the value XXQIITAXX
        index = mapping.index
        placeholder = ['XXQIITAXX'] * len(index)
        missing = []
        for val in viewvalues(rename_cols):
            if val not in mapping:
                missing.append(val)
                mapping[val] = pd.Series(placeholder, index=index)

        if missing:
            warnings.warn(
                "Some columns required to generate a QIIME-compliant "
                "mapping file are not present in the template. A "
                "placeholder value (XXQIITAXX) has been used to populate "
                "these columns. Missing columns: %s" % ', '.join(missing),
                QiitaDBWarning)

        # Get the original mapping columns and readjust the order to
        # comply with QIIME requirements
        cols = mapping.columns.values.tolist()
        cols.remove('BarcodeSequence')
        cols.remove('LinkerPrimerSequence')
        cols.remove('Description')
        new_cols.extend(cols)
        new_cols.append('Description')
        mapping = mapping[new_cols]

        # figuring out the filepath for the QIIME map file
        _id, fp = get_mountpoint('templates')[0]
        filepath = join(fp, '%d_prep_%d_qiime_%s.txt'
                            % (self.study_id, self.id,
                               strftime("%Y%m%d-%H%M%S")))

        # Save the mapping file
        mapping.to_csv(filepath, index_label='#SampleID', na_rep='',
                       sep='\t', encoding='utf-8')

        # adding the fp to the object
        self.add_filepath(
            filepath, fp_id=convert_to_id("qiime_map", "filepath_type"))

        return filepath
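# A compact sketch of the column-reordering step above: QIIME mapping files
# must start with BarcodeSequence/LinkerPrimerSequence (right after
# #SampleID) and end with Description. Column names mirror the method; the
# data is made up.
import pandas as pd

mapping = pd.DataFrame(
    {'Description': ['d1'], 'BarcodeSequence': ['ACGT'],
     'LinkerPrimerSequence': ['GGCC'], 'ph': ['7.0']}, index=['sample1'])

cols = mapping.columns.values.tolist()
for c in ('BarcodeSequence', 'LinkerPrimerSequence', 'Description'):
    cols.remove(c)
new_cols = ['BarcodeSequence', 'LinkerPrimerSequence'] + cols + ['Description']
print(mapping[new_cols].columns.tolist())
# ['BarcodeSequence', 'LinkerPrimerSequence', 'ph', 'Description']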
TRN.add(sql, [client_id, client_secret])
sql = """INSERT INTO qiita.oauth_software (software_id, client_id)
         VALUES (%s, %s)"""
TRN.add(sql, [i, client_id])
TRN.execute()

#
# Generating compressed files for picking failures -- artifact_type = BIOM
#

sql = """SELECT artifact_id
         FROM qiita.artifact
           JOIN qiita.artifact_type USING (artifact_type_id)
         WHERE artifact_type = 'BIOM'"""
TRN.add(sql)
for r in TRN.execute_fetchindex():
    to_tgz = None
    a = Artifact(r[0])
    for _, fp, fp_type in a.filepaths:
        if fp_type == 'directory':
            # removing / from the path if it exists
            to_tgz = fp[:-1] if fp[-1] == '/' else fp
            break

    if to_tgz is None:
        continue

    tgz = to_tgz + '.tgz'
    if not exists(tgz):
        with taropen(tgz, "w:gz") as tar:
            tar.add(to_tgz, arcname=basename(to_tgz))
# Feb 11, 2015
# This changes all analysis files to be relative path instead of absolute

from os.path import basename, dirname

from qiita_db.util import get_mountpoint
from qiita_db.sql_connection import TRN

with TRN:
    sql = """SELECT f.*
             FROM qiita.filepath f
               JOIN qiita.analysis_filepath afp
                 ON f.filepath_id = afp.filepath_id"""
    TRN.add(sql)
    filepaths = TRN.execute_fetchindex()

    # retrieve the analysis mountpoints as a dictionary for matching
    mountpoints = {m[1].rstrip('/\\'): m[0]
                   for m in get_mountpoint('analysis', retrieve_all=True)}

    sql = """UPDATE qiita.filepath
             SET filepath = %s, data_directory_id = %s
             WHERE filepath_id = %s"""
    for filepath in filepaths:
        filename = basename(filepath['filepath'])
        # find the ID of the data directory (mountpoint) this file lives in
        mp_id = mountpoints[dirname(filepath['filepath']).rstrip('/\\')]
        TRN.add(sql, [filename, mp_id, filepath['filepath_id']])
    TRN.execute()
# -----------------------------------------------------------------------------
# Copyright (c) 2014--, The Qiita Development Team.
#
# Distributed under the terms of the BSD 3-clause License.
#
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------
from qiita_db.sql_connection import TRN

sql = """
    SELECT constraint_name AS cname, 'qiita.' || table_name AS tname
    FROM information_schema.table_constraints
    WHERE constraint_type = 'FOREIGN KEY' AND (
        (constraint_name LIKE 'fk_sample_%' AND table_name LIKE 'sample_%') OR
        (constraint_name LIKE 'fk_prep_%' AND table_name LIKE 'prep_%'))
        AND table_name NOT IN (
            'prep_template', 'prep_template_sample',
            'prep_template_filepath', 'prep_template_processing_job')"""

with TRN:
    TRN.add(sql)
    to_delete = TRN.execute_fetchindex()

for cname, tname in to_delete:
    with TRN:
        sql = "ALTER TABLE %s DROP CONSTRAINT %s" % (tname, cname)
        TRN.add(sql)
        TRN.execute()
sql = """SELECT table_name, array_agg(column_name::text) FROM information_schema.columns WHERE column_name IN %s AND column_name != 'sample_id' AND table_name LIKE 'prep_%%' AND table_name NOT IN ( 'prep_template', 'prep_template_sample') GROUP BY table_name""" # note that we are looking for those columns with duplicated names in # the headers headers = set(PrepTemplate.metadata_headers()) & \ set(SampleTemplate.metadata_headers()) if headers: TRN.add(sql, [tuple(headers)]) overlapping = dict(TRN.execute_fetchindex()) else: overlapping = None if overlapping is not None: # finding actual duplicates for table_name, cols in overlapping.items(): # leaving print so when we patch in the main system we know that # nothing was renamed or deal with that print(table_name) with TRN: for c in cols: sql = 'ALTER TABLE qiita.%s RENAME COLUMN %s TO %s_renamed' % ( table_name, c, c) TRN.add(sql) TRN.execute()
'{"max_rare_depth": "Default", "tree": "", "num_steps": 10, ' '"min_rare_depth": 10, "metrics": ["chao1", "observed_otus"]}'], [srare_cmd_id, 'Defaults', '{"subsample_multinomial": "False"}']] TRN.add(sql, sql_args, many=True) # At this point we are ready to start transferring the data from the old # structures to the new structures. Overview of the procedure: # Step 1: Add initial set of artifacts up to rarefied table # Step 2: Transfer the "analysis jobs" to processing jobs and create # the analysis artifacts db_dir = get_db_files_base_dir() with TRN: sql = "SELECT * FROM qiita.analysis" TRN.add(sql) analysis_info = TRN.execute_fetchindex() # Loop through all the analysis for analysis in analysis_info: # Step 1: Add the inital set of artifacts. An analysis starts with # a set of BIOM artifacts. sql = """SELECT * FROM qiita.analysis_filepath JOIN qiita.filepath USING (filepath_id) JOIN qiita.filepath_type USING (filepath_type_id) WHERE analysis_id = %s AND filepath_type = 'biom'""" TRN.add(sql, [analysis['analysis_id']]) analysis_bioms = TRN.execute_fetchindex() # Loop through all the biom tables associated with the current analysis # so we can create the initial set of artifacts
sql = """SELECT table_name, array_agg(column_name::text) FROM information_schema.columns WHERE column_name IN %s AND column_name != 'sample_id' AND table_name LIKE 'prep_%%' AND table_name NOT IN ( 'prep_template', 'prep_template_sample') GROUP BY table_name""" # note that we are looking for those columns with duplicated names in # the headers headers = set(PrepTemplate.metadata_headers()) & \ set(SampleTemplate.metadata_headers()) if headers: TRN.add(sql, [tuple(headers)]) overlapping = dict(TRN.execute_fetchindex()) else: overlapping = None if overlapping is not None: # finding actual duplicates for table_name, cols in viewitems(overlapping): # leaving print so when we patch in the main system we know that # nothing was renamed or deal with that print table_name with TRN: for c in cols: sql = 'ALTER TABLE qiita.%s RENAME COLUMN %s TO %s_renamed' % ( table_name, c, c) TRN.add(sql) TRN.execute()
if cols_sample:
    with TRN:
        # a few notes: just getting the sample templates with duplicated
        # values; ignoring column 'sample_id' and tables 'study_sample',
        # 'prep_template', 'prep_template_sample'
        sql = """SELECT table_name, array_agg(column_name::text)
                 FROM information_schema.columns
                 WHERE column_name IN %s
                     AND table_name LIKE 'sample_%%'
                     AND table_name NOT IN ('prep_template',
                                            'prep_template_sample')
                 GROUP BY table_name"""
        # note that we are looking for those columns with duplicated names
        # in the headers
        TRN.add(sql, [tuple(set(cols_sample))])

        for table, columns in dict(TRN.execute_fetchindex()).items():
            # [1] the format is table_# so taking the #
            st = SampleTemplate(int(table.split('_')[1]))

            # getting just the columns of interest
            st_df = st.to_dataframe()[columns]

            # converting to datetime
            for col in columns:
                st_df[col] = st_df[col].apply(transform_date)

            st.update(st_df)

if cols_prep:
    with TRN:
        # a few notes: just getting the preps with duplicated values;
        # ignoring column 'sample_id' and tables 'study_sample',
        # 'prep_template', 'prep_template_sample'
        sql = """SELECT table_name, array_agg(column_name::text)
# April 5, 2018
# Making sure that all parameters in the artifacts are strings

from json import dumps

from qiita_db.sql_connection import TRN

with TRN:
    # Get all the artifacts and their current parameters
    sql = """SELECT artifact_id, command_parameters
             FROM qiita.artifact"""
    TRN.add(sql)
    all_rows = TRN.execute_fetchindex()

sql = """UPDATE qiita.artifact
         SET command_parameters = %s
         WHERE artifact_id = %s"""

# taking the loop outside so we can have a TRN per change
for row in all_rows:
    aid, params = row

    if params is None:
        continue

    # skip artifacts whose values are already all strings; only those
    # with integer values need rewriting
    if not any([isinstance(v, int) for k, v in params.items()]):
        continue

    params = {k: str(v) if isinstance(v, int) else v
              for k, v in params.items()}

    with TRN:
        TRN.add(sql, [dumps(params), aid])
        TRN.execute()