def to_dataframe(self):
    """Returns the metadata template as a dataframe

    Returns
    -------
    pandas DataFrame
        The metadata in the template, indexed on sample id
    """
    conn_handler = SQLConnectionHandler()
    cols = get_table_cols(self._table, conn_handler)
    if 'study_id' in cols:
        cols.remove('study_id')
    dyncols = get_table_cols(self._table_name(self._id), conn_handler)
    # remove sample_id from dyncols so it is not repeated
    dyncols.remove('sample_id')

    # Get all metadata for the template
    sql = """SELECT {0}, {1}
             FROM qiita.{2} req
             INNER JOIN qiita.{3} dyn ON req.sample_id = dyn.sample_id
             WHERE req.{4} = %s""".format(
        ", ".join("req.%s" % c for c in cols),
        ", ".join("dyn.%s" % d for d in dyncols),
        self._table, self._table_name(self._id), self._id_column)
    meta = conn_handler.execute_fetchall(sql, [self._id])
    cols = cols + dyncols

    # Create the dataframe and clean it up a bit
    df = pd.DataFrame((list(x) for x in meta), columns=cols)
    df.set_index('sample_id', inplace=True, drop=True)
    # Turn id cols to value cols
    for col, value in viewitems(self.str_cols_handlers):
        df[col].replace(value, inplace=True)
    df.rename(columns=self.translate_cols_dict, inplace=True)

    return df
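# Usage sketch (hypothetical IDs; assumes a populated qiita schema and a
# SampleTemplate instance from qiita_db):
#
#     st = SampleTemplate(1)
#     df = st.to_dataframe()
#     df.loc['1.SKB8.640193', 'physical_location']
#
# The frame is indexed on sample_id, with one column per static (required)
# and one per dynamic (study-specific) metadata category.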
class SearchTerm(object):
    # column names from required_sample_info table
    required_cols = set(get_table_cols("required_sample_info"))
    # column names from study table
    study_cols = set(get_table_cols("study"))

    def __init__(self, tokens):
        self.term = tokens[0]
        # clean all the inputs
        for pos, term in enumerate(self.term):
            self.term[pos] = scrub_data(term)

    def generate_sql(self):
        # we can assume that the metadata is either in required_sample_info
        # or the study-specific table
        column_name, operator, argument = self.term
        argument_type = type(typecast_string(argument))
        allowable_types = {
            int: {'<', '<=', '=', '>=', '>'},
            float: {'<', '<=', '=', '>=', '>'},
            str: {'=', 'includes', 'startswith'}
        }
        if operator not in allowable_types[argument_type]:
            raise QiitaDBIncompatibleDatatypeError(operator, argument_type)

        if column_name in self.required_cols:
            column_name = "r.%s" % column_name.lower()
        elif column_name in self.study_cols:
            column_name = "st.%s" % column_name.lower()
        else:
            column_name = "sa.%s" % column_name.lower()

        if operator == "includes":
            # substring search, so create the proper LIKE query for it
            return "LOWER(%s) LIKE '%%%s%%'" % (column_name,
                                                argument.lower())
        else:
            # standard query, so just return it, adding quotes if string
            if argument_type == str:
                argument = ''.join(("'", argument, "'"))
            return ' '.join([column_name, operator, argument])

    def __repr__(self):
        column_name, operator, argument = self.term
        if operator == "includes":
            return "LOWER(%s) LIKE '%%%s%%'" % (column_name,
                                                argument.lower())
        else:
            return ' '.join(self.term)
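# Minimal sketch of how a parsed criterion becomes SQL (hypothetical column
# names and values; pyparsing normally builds the token group passed in):
#
#     term = SearchTerm([['ph', '>', '7.2']])
#     term.generate_sql()     # -> "sa.ph > 7.2"
#
#     term = SearchTerm([['env_biome', 'includes', 'soil']])
#     term.generate_sql()     # -> "LOWER(sa.env_biome) LIKE '%soil%'"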
def test_get_table_cols(self):
    obs = get_table_cols("qiita_user", self.conn_handler)
    exp = {"email", "user_level_id", "password", "name", "affiliation",
           "address", "phone", "user_verify_code", "pass_reset_code",
           "pass_reset_timestamp"}
    self.assertEqual(set(obs), exp)
def categories(self):
    """Identifies the metadata columns present in a template

    Returns
    -------
    cols : list
        The static and dynamic category fields
    """
    cols = get_table_cols(self._table_name(self._id))
    cols.extend(get_table_cols(self._table)[1:])

    for idx, c in enumerate(cols):
        if c in self.translate_cols_dict:
            cols[idx] = self.translate_cols_dict[c]

    return cols
def __getitem__(self, key):
    r"""Returns the value of the metadata category `key`

    Parameters
    ----------
    key : str
        The metadata category

    Returns
    -------
    obj
        The value of the metadata category `key`

    Raises
    ------
    KeyError
        If the metadata category `key` does not exist

    See Also
    --------
    get
    """
    conn_handler = SQLConnectionHandler()
    key = key.lower()
    if key in self._get_categories(conn_handler):
        # It's possible that the key is asking for one of the *_id columns
        # that we have to do the translation
        def handler(x):
            return x

        # prevent flake8 from complaining about the function not being
        # used and a redefinition happening in the next few lines
        handler(None)

        if key in self._md_template.translate_cols_dict.values():
            handler = (
                lambda x: self._md_template.str_cols_handlers[key][x])
            key = "%s_id" % key
        # Check if we have either to query the table with required columns
        # or the dynamic table
        if key in get_table_cols(self._table, conn_handler):
            result = conn_handler.execute_fetchone(
                "SELECT {0} FROM qiita.{1} WHERE {2}=%s AND "
                "sample_id=%s".format(key, self._table, self._id_column),
                (self._md_template.id, self._id))[0]
            return handler(result)
        else:
            return conn_handler.execute_fetchone(
                "SELECT {0} FROM qiita.{1} WHERE "
                "sample_id=%s".format(key, self._dynamic_table),
                (self._id, ))[0]
    else:
        # The key is not available for the sample, so raise a KeyError
        raise KeyError("Metadata category %s does not exist for sample %s"
                       " in template %d" % (key, self._id,
                                            self._md_template.id))
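# Usage sketch (hypothetical sample and categories; assumes a Sample bound
# to a SampleTemplate in a populated qiita database):
#
#     sample = Sample('1.SKB8.640193', SampleTemplate(1))
#     sample['latitude']              # dynamic-table category
#     sample['physical_location']     # required-table category
#     sample['not_a_category']        # raises KeyError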
def get(self, message="", msg_level=None):
    all_emails_except_current = yield Task(self._get_all_emails)
    all_emails_except_current.remove(self.current_user.id)
    avail_meta = SampleTemplate.metadata_headers() +\
        get_table_cols("study")
    self.render('list_studies.html',
                availmeta=avail_meta,
                all_emails_except_current=all_emails_except_current,
                message=message, msg_level=msg_level)
def categories(self):
    """Identifies the metadata columns present in a template

    Returns
    -------
    cols : list
        The category fields
    """
    cols = get_table_cols(self._table_name(self._id))
    cols.remove("sample_id")

    return cols
def get(self):
    userobj = self.current_user
    analysis = Analysis(int(self.get_argument("aid")))
    # make sure user has access to the analysis
    check_analysis_access(userobj, analysis)

    # get the dictionaries of selected samples and data types
    selproc_data, selsamples = self._selected_parser(analysis)

    self.render('search_studies.html', aid=analysis.id,
                selsamples=selsamples, selproc_data=selproc_data,
                counts={}, fullcounts={}, searchmsg="", query="",
                results={}, availmeta=SampleTemplate.metadata_headers() +
                get_table_cols("study"))
def _get_categories(self):
    r"""Returns all the available metadata categories for the sample

    Returns
    -------
    set of str
        The set of all available metadata categories
    """
    # Get all the columns
    cols = get_table_cols(self._dynamic_table)
    # Remove the sample_id column as this column is used internally for
    # data storage and it doesn't actually belong to the metadata
    cols.remove('sample_id')
    return set(cols)
def _get_categories(self, conn_handler):
    r"""Returns all the available metadata categories for the sample

    Parameters
    ----------
    conn_handler : SQLConnectionHandler
        The connection handler object connected to the DB

    Returns
    -------
    set of str
        The set of all available metadata categories
    """
    # Get all the required columns
    required_cols = get_table_cols(self._table, conn_handler)
    # Get all the columns in the dynamic table
    dynamic_cols = get_table_cols(self._dynamic_table, conn_handler)
    # Get the union of the two previous lists
    cols = set(required_cols).union(dynamic_cols)
    # Remove the sample_id column and the study_id/raw_data_id columns,
    # as these columns are used internally for data storage and they
    # don't actually belong to the metadata
    cols.remove('sample_id')
    cols.remove(self._id_column)
    try:
        # study_id could be potentially removed by _id_column, so wrap
        # in a try/except
        cols.remove('study_id')
    except KeyError:
        pass
    # Change the *_id columns, as this is for convenience internally,
    # and add the original categories
    for key, value in viewitems(self._md_template.translate_cols_dict):
        cols.remove(key)
        cols.add(value)

    return cols
def test_get_table_cols(self):
    obs = get_table_cols("qiita_user")
    exp = {"email", "user_level_id", "password", "name", "affiliation",
           "address", "phone", "user_verify_code", "pass_reset_code",
           "pass_reset_timestamp"}
    self.assertEqual(set(obs), exp)
def to_dataframe(self):
    """Returns the metadata template as a dataframe

    Returns
    -------
    pandas DataFrame
        The metadata in the template, indexed on sample id
    """
    conn_handler = SQLConnectionHandler()
    cols = sorted(get_table_cols(self._table_name(self._id)))
    # Get all metadata for the template; the query has no placeholders,
    # so no parameters are passed
    sql = "SELECT {0} FROM qiita.{1}".format(", ".join(cols),
                                             self._table_name(self.id))
    meta = conn_handler.execute_fetchall(sql)

    # Create the dataframe and clean it up a bit
    df = pd.DataFrame((list(x) for x in meta), columns=cols)
    df.set_index('sample_id', inplace=True, drop=True)

    return df
def to_dataframe(self):
    """Returns the metadata template as a dataframe

    Returns
    -------
    pandas DataFrame
        The metadata in the template, indexed on sample id
    """
    with TRN:
        cols = sorted(get_table_cols(self._table_name(self._id)))
        # Get all metadata for the template; the query has no
        # placeholders, so no arguments are passed
        sql = "SELECT {0} FROM qiita.{1}".format(
            ", ".join(cols), self._table_name(self.id))
        TRN.add(sql)
        meta = TRN.execute_fetchindex()

        # Create the dataframe and clean it up a bit
        df = pd.DataFrame((list(x) for x in meta), columns=cols)
        df.set_index("sample_id", inplace=True, drop=True)

        return df
def _get_categories(self, conn_handler):
    r"""Returns all the available metadata categories for the sample

    Parameters
    ----------
    conn_handler : SQLConnectionHandler
        The connection handler object connected to the DB

    Returns
    -------
    set of str
        The set of all available metadata categories
    """
    # Get all the columns
    cols = get_table_cols(self._dynamic_table)
    # Remove the sample_id column as this column is used internally for
    # data storage and it doesn't actually belong to the metadata
    cols.remove('sample_id')
    return set(cols)
def _add_common_extend_steps_to_queue(self, md_template, conn_handler,
                                      queue_name):
    r"""Adds the common extend steps to the queue in conn_handler

    Parameters
    ----------
    md_template : DataFrame
        The metadata template file contents indexed by sample ids
    conn_handler : SQLConnectionHandler
        The connection handler object connected to the DB
    queue_name : str
        The queue where the SQL statements will be added

    Raises
    ------
    QiitaDBError
        If no new samples or new columns are present in `md_template`
    """
    # Check if we are adding new samples
    sample_ids = md_template.index.tolist()
    curr_samples = set(self.keys())
    existing_samples = curr_samples.intersection(sample_ids)
    new_samples = set(sample_ids).difference(existing_samples)

    # Check if we are adding new columns, by getting all the columns from
    # the database
    table_name = self._table_name(self._id)
    db_cols = get_table_cols(self._table, conn_handler)
    db_cols.remove('sample_id')
    db_cols.remove(self._id_column)
    curr_cols = set(
        get_table_cols(table_name, conn_handler)).union(db_cols)
    headers = md_template.keys().tolist()
    existing_cols = curr_cols.intersection(headers)
    new_cols = set(headers).difference(existing_cols)

    if not new_cols and not new_samples:
        raise QiitaDBError(
            "No new samples or new columns found in the template. If you "
            "want to update existing values, you should use the 'update' "
            "functionality.")

    if new_cols:
        # If we are adding new columns, add them first (simplifies code)
        # Sorting the new columns to enforce an order
        new_cols = sorted(new_cols)
        datatypes = get_datatypes(md_template.ix[:, new_cols])
        sql_cols = """INSERT INTO qiita.{0} ({1}, column_name, column_type)
                      VALUES (%s, %s, %s)""".format(self._column_table,
                                                    self._id_column)
        sql_alter = """ALTER TABLE qiita.{0} ADD COLUMN {1} {2}"""
        for category, dtype in zip(new_cols, datatypes):
            conn_handler.add_to_queue(
                queue_name, sql_cols, (self._id, category, dtype))
            conn_handler.add_to_queue(
                queue_name, sql_alter.format(table_name, category, dtype))

        if existing_samples:
            warnings.warn(
                "No values have been modified for samples '%s'. However, "
                "the following columns have been added to them: '%s'"
                % (", ".join(existing_samples), ", ".join(new_cols)),
                QiitaDBWarning)
            # The values for the new columns are the only ones that get
            # added to the database. None of the existing values will be
            # modified (see update for that functionality)
            min_md_template = md_template[new_cols].loc[existing_samples]
            values = as_python_types(min_md_template, new_cols)
            values.append(existing_samples)
            # psycopg2 requires a list of tuples, in which each tuple is a
            # set of values to use in the string formatting of the query.
            # We have all the values in different lists (but in the same
            # order) so use zip to create the list of tuples that psycopg2
            # requires.
            values = [v for v in zip(*values)]
            set_str = ["{0} = %s".format(col) for col in new_cols]
            sql = """UPDATE qiita.{0}
                     SET {1}
                     WHERE sample_id=%s""".format(table_name,
                                                  ",".join(set_str))
            conn_handler.add_to_queue(queue_name, sql, values, many=True)
    elif existing_samples:
        warnings.warn(
            "The following samples already exist in the template and "
            "will be ignored: %s" % ", ".join(existing_samples),
            QiitaDBWarning)

    if new_samples:
        num_samples = len(new_samples)
        new_samples = sorted(new_samples)
        # At this point we only want the information from the new samples
        md_template = md_template.loc[new_samples]

        # Insert values on required columns
        values = as_python_types(md_template, db_cols)
        values.insert(0, new_samples)
        values.insert(0, [self._id] * num_samples)
        # psycopg2 requires a list of tuples, in which each tuple is a
        # tuple of values to use in the string formatting of the query. We
        # have all the values in different lists (but in the same order)
        # so use zip to create the list of tuples that psycopg2 requires.
        values = [v for v in zip(*values)]
        sql = """INSERT INTO qiita.{0} ({1}, sample_id, {2})
                 VALUES (%s, %s, {3})""".format(
            self._table, self._id_column, ', '.join(db_cols),
            ', '.join(['%s'] * len(db_cols)))
        conn_handler.add_to_queue(queue_name, sql, values, many=True)

        headers = sorted(set(headers).difference(db_cols))

        # Insert values on custom table
        values = as_python_types(md_template, headers)
        values.insert(0, new_samples)
        values = [v for v in zip(*values)]
        sql = """INSERT INTO qiita.{0} (sample_id, {1})
                 VALUES (%s, {2})""".format(
            table_name, ", ".join(headers),
            ', '.join(["%s"] * len(headers)))
        conn_handler.add_to_queue(queue_name, sql, values, many=True)
@classmethod
def _clean_validate_template(cls, md_template, study_id, obj,
                             conn_handler=None):
    """Takes care of all validation and cleaning of metadata templates

    Parameters
    ----------
    md_template : DataFrame
        The metadata template file contents indexed by sample ids
    study_id : int
        The study to which the metadata template belongs.
    obj : object
        Any extra object needed by the template to perform any extra check
    conn_handler : SQLConnectionHandler, optional
        The connection handler object connected to the DB

    Returns
    -------
    md_template : DataFrame
        Cleaned copy of the input md_template

    Raises
    ------
    QiitaDBColumnError
        If the sample names in md_template contain invalid names
    QiitaDBDuplicateHeaderError
        If md_template contains duplicate headers
    QiitaDBColumnError
        If md_template is missing a required column
    """
    cls._check_subclass()
    invalid_ids = get_invalid_sample_names(md_template.index)
    if invalid_ids:
        raise QiitaDBColumnError("The following sample names in the "
                                 "template contain invalid characters "
                                 "(only alphanumeric characters or "
                                 "periods are allowed): %s."
                                 % ", ".join(invalid_ids))
    # We are going to modify the md_template. We create a copy so
    # we don't modify the user's one
    md_template = deepcopy(md_template)

    # Prefix the sample names with the study_id
    prefix_sample_names_with_id(md_template, study_id)

    # In the database, all the column headers are lowercase
    md_template.columns = [c.lower() for c in md_template.columns]

    # Check that we don't have duplicate columns
    if len(set(md_template.columns)) != len(md_template.columns):
        raise QiitaDBDuplicateHeaderError(
            find_duplicates(md_template.columns))

    # We need to check for some special columns, that are not present on
    # the database, but depending on the data type are required.
    missing = cls._check_special_columns(md_template, obj)

    conn_handler = conn_handler if conn_handler else SQLConnectionHandler()

    # Get the required columns from the DB
    db_cols = get_table_cols(cls._table, conn_handler)

    # Remove the sample_id and study_id columns
    db_cols.remove('sample_id')
    db_cols.remove(cls._id_column)

    # Retrieve the headers of the metadata template
    headers = list(md_template.keys())

    # Check that md_template has the required columns
    remaining = set(db_cols).difference(headers)
    missing = missing.union(remaining)
    missing = missing.difference(cls.translate_cols_dict)
    if missing:
        raise QiitaDBColumnError("Missing columns: %s"
                                 % ', '.join(missing))
    return md_template
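# Illustrative call (hypothetical subclass and inputs; assumes
# SampleTemplate derives from this base, study 1 exists, and obj is the
# corresponding Study):
#
#     md = pd.DataFrame({'ph': [7.0]}, index=['Sample@1'])
#     SampleTemplate._clean_validate_template(md, 1, Study(1))
#     # -> QiitaDBColumnError: 'Sample@1' contains an invalid character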
def load_template_to_dataframe(fn, strip_whitespace=True):
    """Load a sample or a prep template into a data frame

    Parameters
    ----------
    fn : str or file-like object
        filename of the template to load, or an already open template file
    strip_whitespace : bool, optional
        Defaults to True. Whether or not to strip whitespace from values
        in the input file

    Returns
    -------
    DataFrame
        Pandas dataframe with the loaded information

    Raises
    ------
    ValueError
        Empty file passed
    QiitaDBColumnError
        If the sample_name column is not present in the template.
        If there's a value in one of the reserved columns that cannot be
        cast to the needed type.
    QiitaDBWarning
        When columns are dropped because they have no content for any
        sample.

    Notes
    -----
    The index attribute of the DataFrame will be forced to be
    'sample_name' and will be cast to a string. Additionally, rows that
    start with a '\t' character will be ignored and columns that are
    empty will be removed. Empty sample names will be removed from the
    DataFrame.

    The following table describes the data type per column that will be
    enforced in `fn`. Column names are case-insensitive but will be
    lowercased on addition to the database.

    +-----------------------+--------------+
    |      Column Name      |  Python Type |
    +=======================+==============+
    |      sample_name      |      str     |
    +-----------------------+--------------+
    |   physical_location   |      str     |
    +-----------------------+--------------+
    | has_physical_specimen |     bool     |
    +-----------------------+--------------+
    |  has_extracted_data   |     bool     |
    +-----------------------+--------------+
    |      sample_type      |      str     |
    +-----------------------+--------------+
    |    host_subject_id    |      str     |
    +-----------------------+--------------+
    |      description      |      str     |
    +-----------------------+--------------+
    |       latitude        |     float    |
    +-----------------------+--------------+
    |       longitude       |     float    |
    +-----------------------+--------------+
    """
    # Load in file lines
    holdfile = None
    with open_file(fn) as f:
        holdfile = f.readlines()
    if not holdfile:
        raise ValueError('Empty file passed!')

    # Strip all values in the cells of the input file, if requested
    if strip_whitespace:
        for pos, line in enumerate(holdfile):
            holdfile[pos] = '\t'.join(d.strip(" \r\x0b\x0c")
                                      for d in line.split('\t'))

    # get and clean the required columns
    reqcols = set(get_table_cols("required_sample_info"))
    reqcols.add('sample_name')
    reqcols.add('required_sample_info_status')
    reqcols.discard('required_sample_info_status_id')
    # clean all the column names
    cols = holdfile[0].split('\t')
    holdfile[0] = '\t'.join(c.lower() if c.lower() in reqcols else c
                            for c in cols)

    # index_col:
    #   is set as False, otherwise it is cast as a float and we want a
    #   string
    # keep_default_na:
    #   is set as False, to avoid inferring empty/NA values with the
    #   defaults that Pandas has.
    # na_values:
    #   the values that should be considered as empty, in this case only
    #   empty strings.
    # converters:
    #   ensure that sample names are not converted into any other types
    #   but strings and remove any trailing spaces. Don't let pandas try
    #   to guess the dtype of the other columns, force them to be a str.
    # comment:
    #   using the tab character as "comment" we remove rows that are
    #   constituted only by delimiters, i.e. empty rows.
    template = pd.read_csv(StringIO(''.join(holdfile)), sep='\t',
                           infer_datetime_format=True,
                           keep_default_na=False, na_values=[''],
                           parse_dates=True, index_col=False,
                           comment='\t', mangle_dupe_cols=False,
                           converters={
                               'sample_name': lambda x: str(x).strip(),
                               # required_sample_info
                               'physical_location': str,
                               'sample_type': str,
                               # collection_timestamp is not added here
                               'host_subject_id': str,
                               'description': str,
                               # common_prep_info
                               'center_name': str,
                               'center_project_name': str})

    # let pandas infer the dtypes of these columns; if the inference is
    # not correct, then we have to raise an error
    columns_to_dtype = [(['latitude', 'longitude'], (np.int, np.float),
                         'integer or decimal'),
                        (['has_physical_specimen', 'has_extracted_data'],
                         np.bool_, 'boolean')]
    for columns, c_dtype, english_desc in columns_to_dtype:
        for n in columns:
            if n in template.columns and not all(
                    [isinstance(val, c_dtype) for val in template[n]]):
                raise QiitaDBColumnError("The '%s' column includes values"
                                         " that cannot be cast into a %s "
                                         "value " % (n, english_desc))

    initial_columns = set(template.columns)

    if 'sample_name' not in template.columns:
        raise QiitaDBColumnError("The 'sample_name' column is missing "
                                 "from your template, this file cannot "
                                 "be parsed.")

    # remove rows that have no sample identifier but that may have other
    # data in the rest of the columns
    template.dropna(subset=['sample_name'], how='all', inplace=True)

    # set the sample name as the index
    template.set_index('sample_name', inplace=True)

    # it is not uncommon to find templates that have empty columns
    template.dropna(how='all', axis=1, inplace=True)

    initial_columns.remove('sample_name')
    dropped_cols = initial_columns - set(template.columns)
    if dropped_cols:
        warnings.warn('The following column(s) were removed from the '
                      'template because all their values are empty: %s'
                      % ', '.join(dropped_cols), QiitaDBWarning)

    return template
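# Usage sketch (hypothetical template contents; assumes the StringIO and
# qiita_db helpers already imported by this module):
#
#     tpl = StringIO("sample_name\tdescription\tlatitude\tlongitude\n"
#                    "S1\tsoil sample\t4.2\t1.1\n")
#     df = load_template_to_dataframe(tpl)
#     df.index.tolist()     # -> ['S1']
#     sorted(df.columns)    # -> ['description', 'latitude', 'longitude']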
def post(self):
    user = self.current_user
    action = self.get_argument("action")
    # set required template variables
    results = {}
    meta_headers = []
    counts = {}
    fullcounts = {}
    query = ""
    searchmsg = ""
    selsamples = {}
    selproc_data = {}
    # get analysis and selected samples if exists, or create if necessary
    if action == "create":
        name = self.get_argument('name')
        description = self.get_argument('description')
        analysis = Analysis.create(user, name, description)
        analysis_id = analysis.id
        # set to second step since this page is second step in workflow
        analysis.step = SELECT_SAMPLES
        # fill example studies by running query for specific studies
        search = QiitaStudySearch()
        def_query = 'study_id = 1 OR study_id = 2 OR study_id = 3'
        results, meta_headers = search(def_query, user)
        results, counts, fullcounts = self._parse_search_results(
            results, selsamples, meta_headers)
    else:
        analysis_id = int(self.get_argument("analysis-id"))
        analysis = Analysis(analysis_id)
        check_analysis_access(user, analysis)
        selproc_data, selsamples = self._selected_parser(analysis)

    # run through action requested
    if action == "search":
        search = QiitaStudySearch()
        query = str(self.get_argument("query"))
        try:
            results, meta_headers = search(query, user)
        except ParseException:
            searchmsg = "Malformed search query, please read search help."
        except QiitaDBIncompatibleDatatypeError as e:
            searchmsg = ''.join(e)

        if not results and not searchmsg:
            searchmsg = "No results found."
        else:
            results, counts, fullcounts = self._parse_search_results(
                results, selsamples, meta_headers)

    elif action == "select":
        analysis.add_samples(self._parse_form_select())
        # rebuild the selected from database to reflect changes
        selproc_data, selsamples = self._selected_parser(analysis)

    elif action == "deselect":
        proc_data, samples = self._parse_form_deselect()
        if proc_data:
            analysis.remove_samples(proc_data=proc_data)
        if samples:
            analysis.remove_samples(samples=samples)
        if not proc_data and not samples:
            searchmsg = "Must select samples to remove from analysis!"
        # rebuild the selected from database to reflect changes
        selproc_data, selsamples = self._selected_parser(analysis)

    self.render('search_studies.html', user=user, aid=analysis_id,
                results=results, meta_headers=meta_headers,
                selsamples=selsamples, selproc_data=selproc_data,
                counts=counts, fullcounts=fullcounts,
                searchmsg=searchmsg, query=query,
                availmeta=SampleTemplate.metadata_headers() +
                get_table_cols("study"))
class QiitaStudySearch(object):
    """QiitaStudySearch object to parse and run searches on studies."""

    # column names from required_sample_info table
    required_cols = set(get_table_cols("required_sample_info"))
    # column names from study table
    study_cols = set(get_table_cols("study"))

    def __call__(self, searchstr, user):
        """Runs a Study query and returns matching studies and samples

        Parameters
        ----------
        searchstr : str
            Search string to use
        user : str
            User making the search. Needed for permissions checks.

        Returns
        -------
        dict
            Found samples in format
            {study_id: [[samp_id1, meta1, meta2, ...],
                        [samp_id2, meta1, meta2, ...], ...]}
        list
            metadata column names searched for

        Notes
        -----
        Metadata information for each sample is in the same order as the
        metadata columns list returned

        Metadata column names and string searches are case-sensitive
        """
        study_sql, sample_sql, meta_headers = \
            self._parse_study_search_string(searchstr)
        conn_handler = SQLConnectionHandler()
        # get all studies containing the metadata headers requested
        study_ids = {x[0] for x in
                     conn_handler.execute_fetchall(study_sql)}
        # strip to only studies user has access to
        userobj = User(user)
        study_ids = study_ids.intersection(Study.get_public() +
                                           userobj.private_studies +
                                           userobj.shared_studies)

        results = {}
        # run search on each study to get out the matching samples
        for sid in study_ids:
            study_res = conn_handler.execute_fetchall(
                sample_sql.format(sid))
            if study_res:
                # only add study to results if it actually has samples
                # in results
                results[sid] = study_res
        return results, meta_headers

    def _parse_study_search_string(self, searchstr):
        """parses string into SQL query for study search

        Parameters
        ----------
        searchstr : str
            The string to parse

        Returns
        -------
        study_sql : str
            SQL query for selecting studies with the required metadata
            columns
        sample_sql : str
            SQL query for each study to get the sample ids that match
            the query
        meta_headers : list
            metadata categories in the query string in alphabetical order

        Notes
        -----
        All searches are case-sensitive

        References
        ----------
        .. [1] McGuire P (2007) Getting started with pyparsing.
        """
        # build the parse grammar
        category = Word(alphas + nums + "_")
        separator = oneOf("> < = >= <= !=") | \
            CaselessLiteral("includes") | \
            CaselessLiteral("startswith")
        value = Word(alphas + nums + "_" + ":" + ".") | \
            dblQuotedString().setParseAction(removeQuotes)
        criterion = Group(category + separator + value)
        criterion.setParseAction(SearchTerm)
        and_ = CaselessLiteral("and")
        or_ = CaselessLiteral("or")
        not_ = CaselessLiteral("not")
        optional_seps = Optional(and_ | or_ | not_)

        # create the grammar for parsing operators AND, OR, NOT
        search_expr = operatorPrecedence(
            criterion, [(not_, 1, opAssoc.RIGHT, SearchNot),
                        (and_, 2, opAssoc.LEFT, SearchAnd),
                        (or_, 2, opAssoc.LEFT, SearchOr)])

        # parse the search string to get out the SQL WHERE formatted query
        eval_stack = (search_expr + stringEnd).parseString(searchstr)[0]
        sql_where = eval_stack.generate_sql()

        # this lookup will be used to select only studies with columns
        # of the correct type
        type_lookup = {int: 'integer', float: 'float8', str: 'varchar'}

        # parse out all metadata headers we need to have in a study, and
        # their corresponding types
        all_headers = [c[0][0].term[0] for c in
                       (criterion + optional_seps).scanString(searchstr)]
        meta_headers = set(all_headers)
        all_types = [c[0][0].term[2] for c in
                     (criterion + optional_seps).scanString(searchstr)]
        all_types = [type_lookup[type(typecast_string(s))]
                     for s in all_types]

        # sort headers and types so they return in same order every time.
        # Should be a relatively short list so very quick
        # argsort implementation taken from
        # http://stackoverflow.com/questions/3382352/
        # equivalent-of-numpy-argsort-in-basic-python
        sort_order = sorted(range(len(all_headers)),
                            key=all_headers.__getitem__)
        all_types = [all_types[x] for x in sort_order]
        all_headers.sort()

        # At this point it is possible that a metadata header has been
        # referenced more than once in the query. If the types agree,
        # then we do not need to do anything. If the types do not agree
        # (specifically, if it appears to be numerical in one case and
        # string in another), then we need to give varchar the precedence.
        meta_header_type_lookup = dict()
        for header, header_type in zip(all_headers, all_types):
            if header not in meta_header_type_lookup:
                meta_header_type_lookup[header] = header_type
            else:
                if header_type == 'varchar' or \
                        meta_header_type_lookup[header] == 'varchar':
                    meta_header_type_lookup[header] = 'varchar'

        # create the study finding SQL
        # remove metadata headers that are in required_sample_info table
        meta_headers = meta_headers.difference(
            self.required_cols).difference(self.study_cols)

        # get all study ids that contain all metadata categories searched
        sql = []
        if meta_headers:
            # have study-specific metadata, so need to find specific
            # studies
            for meta in meta_headers:
                if meta_header_type_lookup[meta] in ('integer', 'float8'):
                    allowable_types = "('integer', 'float8')"
                else:
                    allowable_types = "('varchar')"

                sql.append("SELECT study_id FROM "
                           "qiita.study_sample_columns WHERE "
                           "lower(column_name) = lower('%s') and "
                           "column_type in %s"
                           % (scrub_data(meta), allowable_types))
        else:
            # no study-specific metadata, so need all studies
            sql.append("SELECT study_id FROM qiita.study_sample_columns")

        # combine the query
        study_sql = ' INTERSECT '.join(sql)

        # create the sample finding SQL, getting both sample id and values
        # build the sql formatted list of metadata headers
        header_info = []
        for meta in meta_header_type_lookup:
            if meta in self.required_cols:
                header_info.append("r.%s" % meta)
            elif meta in self.study_cols:
                header_info.append("st.%s" % meta)
            else:
                header_info.append("sa.%s" % meta)
        # build the SQL query
        sample_sql = ("SELECT r.sample_id,%s FROM "
                      "qiita.required_sample_info r JOIN "
                      "qiita.sample_{0} sa ON sa.sample_id = r.sample_id "
                      "JOIN qiita.study st ON st.study_id = r.study_id "
                      "WHERE %s" % (','.join(header_info), sql_where))
        return study_sql, sample_sql, meta_header_type_lookup.keys()
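# Usage sketch (hypothetical query and user; assumes a populated qiita
# database and that 'test@foo.bar' has access to the matching studies):
#
#     search = QiitaStudySearch()
#     results, meta_headers = search('ph > 7.2 and env_biome includes '
#                                    '"soil"', 'test@foo.bar')
#     # results -> {study_id: [[sample_id, env_biome, ph], ...], ...}
#     # meta_headers -> the categories searched, e.g. ['env_biome', 'ph']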
@classmethod
def _add_common_creation_steps_to_queue(cls, md_template, obj_id,
                                        conn_handler, queue_name):
    r"""Adds the common creation steps to the queue in conn_handler

    Parameters
    ----------
    md_template : DataFrame
        The metadata template file contents indexed by sample ids
    obj_id : int
        The id of the object being created
    conn_handler : SQLConnectionHandler
        The connection handler object connected to the DB
    queue_name : str
        The queue where the SQL statements will be added
    """
    cls._check_subclass()

    # Get some useful information from the metadata template
    sample_ids = md_template.index.tolist()
    num_samples = len(sample_ids)
    headers = list(md_template.keys())

    # Get the required columns from the DB
    db_cols = sorted(get_table_cols(cls._table, conn_handler))
    # Remove the sample_id and _id_column columns
    db_cols.remove('sample_id')
    db_cols.remove(cls._id_column)

    # Insert values on required columns
    values = as_python_types(md_template, db_cols)
    values.insert(0, sample_ids)
    values.insert(0, [obj_id] * num_samples)
    values = [v for v in zip(*values)]
    conn_handler.add_to_queue(
        queue_name,
        "INSERT INTO qiita.{0} ({1}, sample_id, {2}) "
        "VALUES (%s, %s, {3})".format(
            cls._table, cls._id_column, ', '.join(db_cols),
            ', '.join(['%s'] * len(db_cols))),
        values, many=True)

    # Insert rows on *_columns table
    headers = sorted(set(headers).difference(db_cols))
    datatypes = get_datatypes(md_template.ix[:, headers])
    # psycopg2 requires a list of tuples, in which each tuple is a set
    # of values to use in the string formatting of the query. We have all
    # the values in different lists (but in the same order) so use zip
    # to create the list of tuples that psycopg2 requires.
    values = [v for v in
              zip([obj_id] * len(headers), headers, datatypes)]
    conn_handler.add_to_queue(
        queue_name,
        "INSERT INTO qiita.{0} ({1}, column_name, column_type) "
        "VALUES (%s, %s, %s)".format(cls._column_table, cls._id_column),
        values, many=True)

    # Create table with custom columns
    table_name = cls._table_name(obj_id)
    column_datatype = ["%s %s" % (col, dtype)
                       for col, dtype in zip(headers, datatypes)]
    conn_handler.add_to_queue(
        queue_name,
        "CREATE TABLE qiita.{0} (sample_id varchar NOT NULL, "
        "{1})".format(table_name, ', '.join(column_datatype)))

    # Insert values on custom table
    values = as_python_types(md_template, headers)
    values.insert(0, sample_ids)
    values = [v for v in zip(*values)]
    conn_handler.add_to_queue(
        queue_name,
        "INSERT INTO qiita.{0} (sample_id, {1}) "
        "VALUES (%s, {2})".format(table_name, ", ".join(headers),
                                  ', '.join(["%s"] * len(headers))),
        values, many=True)
def post(self):
    user = self.current_user
    action = self.get_argument("action")
    # set required template variables
    results = {}
    meta_headers = []
    counts = {}
    fullcounts = {}
    query = ""
    searchmsg = ""
    selsamples = {}
    selproc_data = {}
    # get analysis and selected samples if exists, or create if necessary
    if action == "create":
        name = self.get_argument('name')
        description = self.get_argument('description')
        analysis = Analysis.create(User(user), name, description)
        analysis_id = analysis.id
        # set to second step since this page is second step in workflow
        analysis.step = SELECT_SAMPLES
        # fill example studies by running query for specific studies
        search = QiitaStudySearch()
        def_query = 'study_id = 1 OR study_id = 2 OR study_id = 3'
        results, meta_headers = search(def_query, user)
        results, counts, fullcounts = self._parse_search_results(
            results, selsamples, meta_headers)
    else:
        analysis_id = int(self.get_argument("analysis-id"))
        check_analysis_access(User(user), analysis_id)
        analysis = Analysis(analysis_id)
        selproc_data, selsamples = self._selected_parser(analysis)

    # run through action requested
    if action == "search":
        search = QiitaStudySearch()
        query = str(self.get_argument("query"))
        try:
            results, meta_headers = search(query, user)
        except ParseException:
            searchmsg = "Malformed search query, please read search help."
        except QiitaDBIncompatibleDatatypeError as e:
            searchmsg = ''.join(e)

        if not results and not searchmsg:
            searchmsg = "No results found."
        else:
            results, counts, fullcounts = self._parse_search_results(
                results, selsamples, meta_headers)

    elif action == "select":
        analysis.add_samples(self._parse_form_select())
        # rebuild the selected from database to reflect changes
        selproc_data, selsamples = self._selected_parser(analysis)

    elif action == "deselect":
        proc_data, samples = self._parse_form_deselect()
        if proc_data:
            analysis.remove_samples(proc_data=proc_data)
        if samples:
            analysis.remove_samples(samples=samples)
        if not proc_data and not samples:
            searchmsg = "Must select samples to remove from analysis!"
        # rebuild the selected from database to reflect changes
        selproc_data, selsamples = self._selected_parser(analysis)

    self.render('search_studies.html', user=user, aid=analysis_id,
                results=results, meta_headers=meta_headers,
                selsamples=selsamples, selproc_data=selproc_data,
                counts=counts, fullcounts=fullcounts,
                searchmsg=searchmsg, query=query,
                availmeta=SampleTemplate.metadata_headers() +
                get_table_cols("study"))