Example #1
0
    def to_dataframe(self):
        """Returns the metadata template as a dataframe

        The required columns (minus ``study_id``) are joined with the
        template-specific dynamic columns on ``sample_id``. The id-valued
        columns are then replaced by their values and renamed.

        Returns
        -------
        pandas DataFrame
            The metadata in the template, indexed on sample id
        """
        conn_handler = SQLConnectionHandler()
        dyn_table = self._table_name(self._id)
        cols = get_table_cols(self._table, conn_handler)
        if 'study_id' in cols:
            cols.remove('study_id')
        dyncols = get_table_cols(dyn_table, conn_handler)
        # remove sample_id from dyncols so not repeated
        dyncols.remove('sample_id')
        # Build a single select list so that an empty `dyncols` does not
        # leave a dangling comma in the generated SQL
        select_list = ", ".join(["req.%s" % c for c in cols] +
                                ["dyn.%s" % d for d in dyncols])
        # Get all metadata for the template
        sql = """SELECT {0} FROM qiita.{1} req
            INNER JOIN qiita.{2} dyn on req.sample_id = dyn.sample_id
            WHERE req.{3} = %s""".format(
            select_list, self._table, dyn_table, self._id_column)
        meta = conn_handler.execute_fetchall(sql, [self._id])
        cols = cols + dyncols

        # Create the dataframe and clean it up a bit
        df = pd.DataFrame((list(x) for x in meta), columns=cols)
        df.set_index('sample_id', inplace=True, drop=True)
        # Turn id cols to value cols. The result is assigned back because an
        # in-place replace on `df[col]` operates on a temporary object and is
        # not guaranteed to propagate to `df`
        for col, value in viewitems(self.str_cols_handlers):
            df[col] = df[col].replace(value)
        df.rename(columns=self.translate_cols_dict, inplace=True)

        return df
Example #2
0
class SearchTerm(object):
    """A single ``column operator argument`` comparison of a search query

    Parameters
    ----------
    tokens : sequence
        Parse results whose first element is the mutable
        ``[column_name, operator, argument]`` triplet. Every element of the
        triplet is passed through `scrub_data` on construction.
    """
    # column names from required_sample_info table
    required_cols = set(get_table_cols("required_sample_info"))
    # column names from study table
    study_cols = set(get_table_cols("study"))

    def __init__(self, tokens):
        self.term = tokens[0]
        # clean all the inputs
        for pos, term in enumerate(self.term):
            self.term[pos] = scrub_data(term)

    def generate_sql(self):
        """Generates the SQL condition that represents the term

        Returns
        -------
        str
            The SQL condition, with the column name prefixed by the alias of
            the table it belongs to (``r``, ``st`` or ``sa``)

        Raises
        ------
        QiitaDBIncompatibleDatatypeError
            If the operator cannot be applied to the type of the argument
        """
        # we can assume that the metadata is either in required_sample_info
        # or the study-specific table
        column_name, operator, argument = self.term
        argument_type = type(typecast_string(argument))

        allowable_types = {
            int: {'<', '<=', '=', '>=', '>'},
            float: {'<', '<=', '=', '>=', '>'},
            str: {'=', 'includes', 'startswith'}
        }

        if operator not in allowable_types[argument_type]:
            raise QiitaDBIncompatibleDatatypeError(operator, argument_type)

        if column_name in self.required_cols:
            column_name = "r.%s" % column_name.lower()
        elif column_name in self.study_cols:
            column_name = "st.%s" % column_name.lower()
        else:
            column_name = "sa.%s" % column_name.lower()

        if operator == "includes":
            # substring search, so create proper query for it
            return "LOWER(%s) LIKE '%%%s%%'" % (column_name, argument.lower())
        if operator == "startswith":
            # prefix search: 'startswith' is not an SQL operator, so it has
            # to be translated to a LIKE pattern (case-insensitive, to match
            # the 'includes' behavior)
            return "LOWER(%s) LIKE '%s%%'" % (column_name, argument.lower())

        # standard query so just return it, adding quotes if string
        if argument_type == str:
            argument = ''.join(("'", argument, "'"))
        return ' '.join([column_name, operator, argument])

    def __repr__(self):
        column_name, operator, argument = self.term
        if operator == "includes":
            return "LOWER(%s) LIKE '%%%s%%'" % (column_name, argument.lower())
        else:
            return ' '.join(self.term)
Example #3
0
 def test_get_table_cols(self):
     # Column names expected for the qiita_user table
     expected = {
         "email",
         "user_level_id",
         "password",
         "name",
         "affiliation",
         "address",
         "phone",
         "user_verify_code",
         "pass_reset_code",
         "pass_reset_timestamp",
     }
     observed = get_table_cols("qiita_user", self.conn_handler)
     # Only the set of names is compared, not their order
     self.assertEqual(set(observed), expected)
Example #4
0
    def categories(self):
        """Lists the metadata columns available in the template

        Returns
        -------
        list
            The columns of the template-specific table followed by the
            columns of the static table (its first column excluded), with
            the names present in ``translate_cols_dict`` translated
        """
        columns = get_table_cols(self._table_name(self._id))
        # The first column of the static table is left out
        columns.extend(get_table_cols(self._table)[1:])

        translations = self.translate_cols_dict
        return [translations[name] if name in translations else name
                for name in columns]
Example #5
0
    def __getitem__(self, key):
        r"""Retrieves the value stored for the metadata category `key`

        Parameters
        ----------
        key : str
            The metadata category (looked up in lowercase)

        Returns
        -------
        obj
            The value of the metadata category `key`

        Raises
        ------
        KeyError
            If the metadata category `key` does not exists

        See Also
        --------
        get
        """
        conn_handler = SQLConnectionHandler()
        key = key.lower()
        if key not in self._get_categories(conn_handler):
            # The key is not available for the sample, so raise a KeyError
            raise KeyError("Metadata category %s does not exists for sample %s"
                           " in template %d" %
                           (key, self._id, self._md_template.id))

        # The key may be the translated name of one of the *_id columns; in
        # that case the *_id column is the one queried and the stored id has
        # to be translated back through the handlers of that column
        translated = key in self._md_template.translate_cols_dict.values()
        if translated:
            key = "%s_id" % key

        # Columns that are not in the table with the required columns live
        # in the dynamic table
        if key not in get_table_cols(self._table, conn_handler):
            return conn_handler.execute_fetchone(
                "SELECT {0} FROM qiita.{1} WHERE "
                "sample_id=%s".format(key, self._dynamic_table),
                (self._id, ))[0]

        stored = conn_handler.execute_fetchone(
            "SELECT {0} FROM qiita.{1} WHERE {2}=%s AND "
            "sample_id=%s".format(key, self._table, self._id_column),
            (self._md_template.id, self._id))[0]
        if translated:
            return self._md_template.str_cols_handlers[key][stored]
        return stored
0
 def get(self, message="", msg_level=None):
     # Result of the _get_all_emails task, minus the id of the current user
     emails = yield Task(self._get_all_emails)
     emails.remove(self.current_user.id)
     # Sample template headers followed by the columns of the study table
     sample_headers = SampleTemplate.metadata_headers()
     avail_meta = sample_headers + get_table_cols("study")
     self.render('list_studies.html',
                 availmeta=avail_meta,
                 all_emails_except_current=emails,
                 message=message,
                 msg_level=msg_level)
Example #7
0
 def get(self, message="", msg_level=None):
     # Result of the _get_all_emails task, minus the id of the current user
     other_emails = yield Task(self._get_all_emails)
     other_emails.remove(self.current_user.id)
     # Sample template headers followed by the columns of the study table
     template_headers = SampleTemplate.metadata_headers()
     avail_meta = template_headers + get_table_cols("study")
     self.render('list_studies.html',
                 availmeta=avail_meta,
                 all_emails_except_current=other_emails,
                 message=message,
                 msg_level=msg_level)
Example #8
0
    def categories(self):
        """Lists the metadata columns stored for the template

        Returns
        -------
        list
            The columns of the template-specific table, without the
            ``sample_id`` column
        """
        template_table = self._table_name(self._id)
        columns = get_table_cols(template_table)
        # sample_id identifies the row; it is not reported as a category
        columns.remove("sample_id")
        return columns
Example #9
0
    def get(self):
        user = self.current_user
        analysis = Analysis(int(self.get_argument("aid")))
        # the user must have access to the requested analysis
        check_analysis_access(user, analysis)

        # dictionaries of the selected samples and data types
        selproc_data, selsamples = self._selected_parser(analysis)

        # nothing has been searched yet, so every search-related template
        # variable is rendered empty
        render_args = dict(
            aid=analysis.id,
            selsamples=selsamples,
            selproc_data=selproc_data,
            counts={},
            fullcounts={},
            searchmsg="",
            query="",
            results={},
            availmeta=(SampleTemplate.metadata_headers() +
                       get_table_cols("study")))
        self.render('search_studies.html', **render_args)
Example #10
0
    def _get_categories(self):
        r"""Collects the metadata categories available for the sample

        Returns
        -------
        set of str
            The columns of the dynamic table, without ``sample_id``
        """
        dynamic_cols = get_table_cols(self._dynamic_table)
        # sample_id is only used internally for data storage; it is not part
        # of the metadata
        dynamic_cols.remove('sample_id')
        categories = set(dynamic_cols)
        return categories
Example #11
0
    def _get_categories(self, conn_handler):
        r"""Collects the metadata categories available for the sample

        Parameters
        ----------
        conn_handler : SQLConnectionHandler
            The connection handler object connected to the DB

        Returns
        -------
        set of str
            The set of all available metadata categories
        """
        # Union of the required columns and the dynamic table columns
        categories = set(get_table_cols(self._table, conn_handler))
        categories.update(get_table_cols(self._dynamic_table, conn_handler))

        # sample_id and the study_id/raw_data_id columns are only used
        # internally for data storage; they are not metadata
        categories.remove('sample_id')
        categories.remove(self._id_column)
        # study_id may already be gone if it is the _id_column, so its
        # absence is not an error
        categories.discard('study_id')

        # Expose the *_id columns under their translated names
        for id_col, public_name in viewitems(
                self._md_template.translate_cols_dict):
            categories.remove(id_col)
            categories.add(public_name)

        return categories
Example #12
0
 def test_get_table_cols(self):
     # Column names expected for the qiita_user table
     expected = {
         "email", "user_level_id", "password", "name", "affiliation",
         "address", "phone", "user_verify_code", "pass_reset_code",
         "pass_reset_timestamp"
     }
     observed = get_table_cols("qiita_user")
     # Only the set of names is compared, not their order
     self.assertEqual(set(observed), expected)
Example #13
0
    def to_dataframe(self):
        """Builds a dataframe with the contents of the metadata template

        Returns
        -------
        pandas DataFrame
            The metadata in the template, indexed on sample id, with the
            columns in alphabetical order
        """
        conn_handler = SQLConnectionHandler()
        columns = sorted(get_table_cols(self._table_name(self._id)))
        # Fetch every row of the template table
        query = "SELECT {0} FROM qiita.{1}".format(
            ", ".join(columns), self._table_name(self.id))
        rows = conn_handler.execute_fetchall(query, (self._id,))

        # One list per row, labelled with the queried columns
        frame = pd.DataFrame([list(row) for row in rows], columns=columns)
        frame.set_index('sample_id', inplace=True, drop=True)

        return frame
Example #14
0
    def to_dataframe(self):
        """Builds a dataframe with the contents of the metadata template

        Returns
        -------
        pandas DataFrame
            The metadata in the template, indexed on sample id, with the
            columns in alphabetical order
        """
        with TRN:
            columns = sorted(get_table_cols(self._table_name(self._id)))
            # Fetch every row of the template table
            query = "SELECT {0} FROM qiita.{1}".format(
                ", ".join(columns), self._table_name(self.id))
            TRN.add(query, [self._id])
            rows = TRN.execute_fetchindex()

            # One list per row, labelled with the queried columns
            frame = pd.DataFrame([list(row) for row in rows],
                                 columns=columns)
            frame.set_index("sample_id", inplace=True, drop=True)

            return frame
Example #15
0
    def _get_categories(self, conn_handler):
        r"""Collects the metadata categories available for the sample

        Parameters
        ----------
        conn_handler : SQLConnectionHandler
            The connection handler object connected to the DB

        Returns
        -------
        set of str
            The columns of the dynamic table, without ``sample_id``
        """
        # NOTE(review): `conn_handler` is accepted but not used in this
        # body -- confirm whether it should be forwarded to get_table_cols
        dynamic_cols = get_table_cols(self._dynamic_table)
        # sample_id is only used internally for data storage; it is not part
        # of the metadata
        dynamic_cols.remove('sample_id')
        categories = set(dynamic_cols)

        return categories
Example #16
0
    def get(self):
        user = self.current_user
        analysis = Analysis(int(self.get_argument("aid")))
        # the user must have access to the requested analysis
        check_analysis_access(user, analysis)

        # dictionaries of the selected samples and data types
        selproc_data, selsamples = self._selected_parser(analysis)

        # nothing has been searched yet, so every search-related template
        # variable is rendered empty
        self.render(
            'search_studies.html', aid=analysis.id, selsamples=selsamples,
            selproc_data=selproc_data, counts={}, fullcounts={},
            searchmsg="", query="", results={},
            availmeta=(SampleTemplate.metadata_headers() +
                       get_table_cols("study")))
Example #17
0
    def _add_common_extend_steps_to_queue(self, md_template, conn_handler,
                                          queue_name):
        r"""Adds the common extend steps to the queue in conn_handler

        New columns found in `md_template` are registered and added to the
        template-specific table, their values are set for the samples that
        already exist, and the samples that are new are inserted in both the
        required-columns table and the template-specific table. Nothing is
        executed here: the statements are only added to `queue_name`.

        Parameters
        ----------
        md_template : DataFrame
            The metadata template file contents indexed by sample ids
        conn_handler : SQLConnectionHandler
            The connection handler object connected to the DB
        queue_name : str
            The queue where the SQL statements will be added

        Raises
        ------
        QiitaDBError
            If no new samples or new columns are present in `md_template`

        Warns
        -----
        QiitaDBWarning
            If new columns are added to already existing samples, or if
            already existing samples are ignored because no column is new
        """
        # Check if we are adding new samples
        sample_ids = md_template.index.tolist()
        curr_samples = set(self.keys())
        existing_samples = curr_samples.intersection(sample_ids)
        new_samples = set(sample_ids).difference(existing_samples)

        # Check if we are adding new columns, by getting all the columns from
        # the database
        table_name = self._table_name(self._id)
        db_cols = get_table_cols(self._table, conn_handler)
        # sample_id and the id column are written explicitly by the INSERT
        # below, so they are not part of the required metadata columns
        db_cols.remove('sample_id')
        db_cols.remove(self._id_column)
        curr_cols = set(
            get_table_cols(table_name, conn_handler)).union(db_cols)
        headers = md_template.keys().tolist()
        existing_cols = curr_cols.intersection(headers)
        new_cols = set(headers).difference(existing_cols)

        if not new_cols and not new_samples:
            raise QiitaDBError(
                "No new samples or new columns found in the template. If you "
                "want to update existing values, you should use the 'update' "
                "functionality.")

        if new_cols:
            # If we are adding new columns, add them first (simplifies code)
            # Sorting the new columns to enforce an order
            new_cols = sorted(new_cols)
            # NOTE(review): `DataFrame.ix` was removed in pandas 1.0 --
            # confirm the pandas version this code is expected to run on
            datatypes = get_datatypes(md_template.ix[:, new_cols])
            sql_cols = """INSERT INTO qiita.{0} ({1}, column_name, column_type)
                          VALUES (%s, %s, %s)""".format(self._column_table,
                                                        self._id_column)
            sql_alter = """ALTER TABLE qiita.{0} ADD COLUMN {1} {2}"""
            # For every new column: register it in the column table and add
            # it to the template-specific table
            for category, dtype in zip(new_cols, datatypes):
                conn_handler.add_to_queue(
                    queue_name, sql_cols, (self._id, category, dtype))
                conn_handler.add_to_queue(
                    queue_name, sql_alter.format(table_name, category, dtype))

            if existing_samples:
                warnings.warn(
                    "No values have been modified for samples '%s'. However, "
                    "the following columns have been added to them: '%s'"
                    % (", ".join(existing_samples), ", ".join(new_cols)),
                    QiitaDBWarning)
                # The values for the new columns are the only ones that get
                # added to the database. None of the existing values will be
                # modified (see update for that functionality)
                # NOTE(review): `existing_samples` is a set; recent pandas
                # versions reject sets as `.loc` indexers -- confirm
                min_md_template = md_template[new_cols].loc[existing_samples]
                values = as_python_types(min_md_template, new_cols)
                # The sample ids go last, matching the trailing
                # `WHERE sample_id=%s` placeholder of the UPDATE statement
                values.append(existing_samples)
                # psycopg2 requires a list of tuples, in which each tuple is a
                # set of values to use in the string formatting of the query.
                # We have all the values in different lists (but in the same
                # order) so use zip to create the list of tuples that psycopg2
                # requires.
                values = [v for v in zip(*values)]
                set_str = ["{0} = %s".format(col) for col in new_cols]
                sql = """UPDATE qiita.{0}
                         SET {1}
                         WHERE sample_id=%s""".format(table_name,
                                                      ",".join(set_str))
                conn_handler.add_to_queue(queue_name, sql, values, many=True)
        elif existing_samples:
            warnings.warn(
                "The following samples already exist in the template and "
                "will be ignored: %s" % ", ".join(existing_samples),
                QiitaDBWarning)

        if new_samples:
            num_samples = len(new_samples)
            new_samples = sorted(new_samples)
            # At this point we only want the information from the new samples
            md_template = md_template.loc[new_samples]

            # Insert values on required columns
            values = as_python_types(md_template, db_cols)
            # Prepend the sample ids and then the template id, so each tuple
            # follows the column order of the INSERT statement:
            # (id column, sample_id, *db_cols)
            values.insert(0, new_samples)
            values.insert(0, [self._id] * num_samples)
            # psycopg2 requires a list of tuples, in which each tuple is a
            # tuple of values to use in the string formatting of the query. We
            # have all the values in different lists (but in the same order) so
            # use zip to create the list of tuples that psycopg2 requires.
            values = [v for v in zip(*values)]
            sql = """INSERT INTO qiita.{0} ({1}, sample_id, {2})
                     VALUES (%s, %s, {3})""".format(
                self._table, self._id_column, ', '.join(db_cols),
                ', '.join(['%s'] * len(db_cols)))
            conn_handler.add_to_queue(queue_name, sql, values, many=True)

            # The headers that are not required columns are the ones stored
            # in the template-specific table
            headers = sorted(set(headers).difference(db_cols))

            # Insert values on custom table
            values = as_python_types(md_template, headers)
            values.insert(0, new_samples)
            values = [v for v in zip(*values)]
            sql = """INSERT INTO qiita.{0} (sample_id, {1})
                     VALUES (%s, {2})""".format(
                table_name, ", ".join(headers),
                ', '.join(["%s"] * len(headers)))
            conn_handler.add_to_queue(queue_name, sql, values, many=True)
Example #18
0
    def _clean_validate_template(cls, md_template, study_id, obj,
                                 conn_handler=None):
        """Validates a metadata template and returns a cleaned copy of it

        Parameters
        ----------
        md_template : DataFrame
            The metadata template file contents indexed by sample ids
        study_id : int
            The study to which the metadata template belongs to.
        obj : object
            Any extra object needed by the template to perform any extra check
        conn_handler : SQLConnectionHandler, optional
            The connection handler object connected to the DB. A new one is
            created if it is not provided.

        Returns
        -------
        md_template : DataFrame
            Cleaned copy of the input md_template

        Raises
        ------
        QiitaDBColumnError
            If the sample names in md_template contains invalid names
        QiitaDBDuplicateHeaderError
            If md_template contains duplicate headers
        QiitaDBColumnError
            If md_template is missing a required column
        """
        cls._check_subclass()

        bad_names = get_invalid_sample_names(md_template.index)
        if bad_names:
            message = ("The following sample names in the "
                       "template contain invalid characters "
                       "(only alphanumeric characters or periods"
                       " are allowed): %s." % ", ".join(bad_names))
            raise QiitaDBColumnError(message)

        # Work on a copy so the template provided by the user is untouched
        md_template = deepcopy(md_template)

        # Sample names get the study_id as prefix
        prefix_sample_names_with_id(md_template, study_id)

        # Column headers are stored in lowercase in the database
        md_template.columns = [name.lower() for name in md_template.columns]

        # Lowercasing may have collapsed two headers into the same name
        n_unique = len(set(md_template.columns))
        if n_unique != len(md_template.columns):
            raise QiitaDBDuplicateHeaderError(
                find_duplicates(md_template.columns))

        # Some special columns are not present on the database but, depending
        # on the data type, are required
        missing = cls._check_special_columns(md_template, obj)

        if not conn_handler:
            conn_handler = SQLConnectionHandler()

        # Required columns according to the DB, without the sample_id and
        # study_id columns
        required = get_table_cols(cls._table, conn_handler)
        required.remove('sample_id')
        required.remove(cls._id_column)

        # Required columns that are absent from the template headers, except
        # those listed in translate_cols_dict
        headers = list(md_template.keys())
        absent = set(required).difference(headers)
        missing = missing.union(absent).difference(cls.translate_cols_dict)
        if missing:
            raise QiitaDBColumnError("Missing columns: %s"
                                     % ', '.join(missing))
        return md_template
Example #19
0
File: util.py Project: RNAer/qiita
def load_template_to_dataframe(fn, strip_whitespace=True):
    """Load a sample or a prep template into a data frame

    Parameters
    ----------
    fn : str or file-like object
        filename of the template to load, or an already open template file
    strip_whitespace : bool, optional
        Defaults to True. Whether or not to strip whitespace from values in the
        input file

    Returns
    -------
    DataFrame
        Pandas dataframe with the loaded information

    Raises
    ------
    ValueError
        Empty file passed
    QiitaDBColumnError
        If the sample_name column is not present in the template.
        If there's a value in one of the reserved columns that cannot be cast
        to the needed type.
    QiitaDBWarning
        When columns are dropped because they have no content for any sample.

    Notes
    -----
    The index attribute of the DataFrame will be forced to be 'sample_name'
    and will be cast to a string. Additionally rows that start with a '\t'
    character will be ignored and columns that are empty will be removed. Empty
    sample names will be removed from the DataFrame.

    The following table describes the data type per column that will be
    enforced in `fn`. Column names are case-insensitive but will be lowercased
    on addition to the database.

    +-----------------------+--------------+
    |      Column Name      |  Python Type |
    +=======================+==============+
    |           sample_name |          str |
    +-----------------------+--------------+
    |     physical_location |          str |
    +-----------------------+--------------+
    | has_physical_specimen |         bool |
    +-----------------------+--------------+
    |    has_extracted_data |         bool |
    +-----------------------+--------------+
    |           sample_type |          str |
    +-----------------------+--------------+
    |       host_subject_id |          str |
    +-----------------------+--------------+
    |           description |          str |
    +-----------------------+--------------+
    |              latitude |        float |
    +-----------------------+--------------+
    |             longitude |        float |
    +-----------------------+--------------+
    """
    # Load in file lines
    with open_file(fn) as f:
        holdfile = f.readlines()
    if not holdfile:
        raise ValueError('Empty file passed!')

    # Strip all values in the cells in the input file, if requested. The
    # newline is not in the stripped set, so line endings are preserved.
    if strip_whitespace:
        for pos, line in enumerate(holdfile):
            holdfile[pos] = '\t'.join(d.strip(" \r\x0b\x0c")
                                      for d in line.split('\t'))

    # get and clean the required columns
    reqcols = set(get_table_cols("required_sample_info"))
    reqcols.add('sample_name')
    reqcols.add('required_sample_info_status')
    reqcols.discard('required_sample_info_status_id')
    # clean all the column names: only the reserved ones are lowercased
    cols = holdfile[0].split('\t')
    holdfile[0] = '\t'.join(c.lower() if c.lower() in reqcols else c
                            for c in cols)
    # index_col:
    #   is set as False, otherwise it is cast as a float and we want a string
    # keep_default_na:
    #   is set as False, to avoid inferring empty/NA values with the defaults
    #   that Pandas has.
    # na_values:
    #   the values that should be considered as empty, in this case only empty
    #   strings.
    # converters:
    #   ensure that sample names are not converted into any other types but
    #   strings and remove any trailing spaces. Don't let pandas try to guess
    #   the dtype of the other columns, force them to be a str.
    # comment:
    #   using the tab character as "comment" we remove rows that are
    #   constituted only by delimiters i. e. empty rows.
    # NOTE(review): `infer_datetime_format` and `mangle_dupe_cols` are
    # deprecated/removed in pandas 2.x -- confirm the supported pandas version
    template = pd.read_csv(StringIO(''.join(holdfile)), sep='\t',
                           infer_datetime_format=True,
                           keep_default_na=False, na_values=[''],
                           parse_dates=True, index_col=False, comment='\t',
                           mangle_dupe_cols=False, converters={
                               'sample_name': lambda x: str(x).strip(),
                               # required_sample_info
                               'physical_location': str,
                               'sample_type': str,
                               # collection_timestamp is not added here
                               'host_subject_id': str,
                               'description': str,
                               # common_prep_info
                               'center_name': str,
                               'center_project_name': str})

    # let pandas infer the dtypes of these columns, if the inference is
    # not correct, then we have to raise an error. The builtin types are used
    # together with the NumPy abstract scalar types because the `np.int` and
    # `np.float` aliases no longer exist in recent NumPy releases.
    columns_to_dtype = [(['latitude', 'longitude'],
                         (int, float, np.integer, np.floating),
                         'integer or decimal'),
                        (['has_physical_specimen', 'has_extracted_data'],
                         np.bool_, 'boolean')]
    for columns, c_dtype, english_desc in columns_to_dtype:
        for n in columns:
            if n in template.columns and not all(isinstance(val, c_dtype)
                                                 for val in template[n]):
                raise QiitaDBColumnError("The '%s' column includes values "
                                         "that cannot be cast into a %s "
                                         "value " % (n, english_desc))

    initial_columns = set(template.columns)

    if 'sample_name' not in template.columns:
        raise QiitaDBColumnError("The 'sample_name' column is missing from "
                                 "your template, this file cannot be parsed.")

    # remove rows that have no sample identifier but that may have other data
    # in the rest of the columns
    template.dropna(subset=['sample_name'], how='all', inplace=True)

    # set the sample name as the index
    template.set_index('sample_name', inplace=True)

    # it is not uncommon to find templates that have empty columns
    template.dropna(how='all', axis=1, inplace=True)

    # sample_name is now the index, so it must not be reported as dropped
    initial_columns.remove('sample_name')
    dropped_cols = initial_columns - set(template.columns)
    if dropped_cols:
        warnings.warn('The following column(s) were removed from the template '
                      'because all their values are empty: '
                      '%s' % ', '.join(dropped_cols), QiitaDBWarning)

    return template
Example #20
0
    def post(self):
        """Handles the actions submitted from the sample selection page

        The ``action`` request argument selects what is done:

        - ``create``: creates a new analysis from the ``name`` and
          ``description`` arguments and runs a default search
        - ``search``: runs the search in the ``query`` argument
        - ``select``: adds the samples selected in the form to the analysis
        - ``deselect``: removes the processed data and/or samples selected
          in the form from the analysis

        For every action other than ``create`` the analysis is loaded from
        the ``analysis-id`` argument and the user access to it is checked.
        The ``search_studies.html`` template is rendered in all cases.
        """
        user = self.current_user
        action = self.get_argument("action")
        # set required template variables
        results = {}
        meta_headers = []
        counts = {}
        fullcounts = {}
        query = ""
        searchmsg = ""
        selsamples = {}
        selproc_data = {}
        # get analysis and selected samples if exists, or create if necessary
        if action == "create":
            name = self.get_argument('name')
            description = self.get_argument('description')
            analysis = Analysis.create(user, name, description)
            analysis_id = analysis.id
            # set to second step since this page is second step in workflow
            analysis.step = SELECT_SAMPLES
            # fill example studies by running query for specific studies
            search = QiitaStudySearch()
            def_query = 'study_id = 1 OR study_id = 2 OR study_id = 3'
            results, meta_headers = search(def_query, user)
            results, counts, fullcounts = self._parse_search_results(
                results, selsamples, meta_headers)
        else:
            analysis_id = int(self.get_argument("analysis-id"))
            analysis = Analysis(analysis_id)
            check_analysis_access(user, analysis)
            selproc_data, selsamples = self._selected_parser(analysis)

        # run through action requested
        if action == "search":
            search = QiitaStudySearch()
            query = str(self.get_argument("query"))
            try:
                results, meta_headers = search(query, user)
            except ParseException:
                searchmsg = "Malformed search query, please read search help."
            except QiitaDBIncompatibleDatatypeError as e:
                # Exceptions are not iterable in Python 3, so the message is
                # rebuilt from `args`, which is what iterating the exception
                # yields in Python 2
                searchmsg = ''.join(e.args)

            if not results and not searchmsg:
                searchmsg = "No results found."
            else:
                results, counts, fullcounts = self._parse_search_results(
                    results, selsamples, meta_headers)

        elif action == "select":
            analysis.add_samples(self._parse_form_select())

            # rebuild the selected from database to reflect changes
            selproc_data, selsamples = self._selected_parser(analysis)

        elif action == "deselect":
            proc_data, samples = self._parse_form_deselect()
            if proc_data:
                analysis.remove_samples(proc_data=proc_data)
            if samples:
                analysis.remove_samples(samples=samples)
            if not proc_data and not samples:
                searchmsg = "Must select samples to remove from analysis!"

            # rebuild the selected from database to reflect changes
            selproc_data, selsamples = self._selected_parser(analysis)

        self.render('search_studies.html', user=user, aid=analysis_id,
                    results=results, meta_headers=meta_headers,
                    selsamples=selsamples, selproc_data=selproc_data,
                    counts=counts, fullcounts=fullcounts, searchmsg=searchmsg,
                    query=query, availmeta=SampleTemplate.metadata_headers() +
                    get_table_cols("study"))
Example #21
0
class QiitaStudySearch(object):
    """QiitaStudySearch object to parse and run searches on studies.

    Instances are callable: ``search(searchstr, user)`` parses the search
    string into SQL, selects the studies that hold the requested metadata
    columns, restricts them to the studies the user can access and runs the
    sample query on each remaining study.
    """

    # column names from required_sample_info table
    required_cols = set(get_table_cols("required_sample_info"))
    # column names from study table
    study_cols = set(get_table_cols("study"))

    def __call__(self, searchstr, user):
        """Runs a Study query and returns matching studies and samples

        Parameters
        ----------
        searchstr : str
            Search string to use
        user : str
            User making the search. Needed for permissions checks.

        Returns
        -------
        dict
            Found samples in format
            {study_id: [[samp_id1, meta1, meta2, ...],
                        [samp_id2, meta1, meta2, ...], ...]}
        list
            metadata column names searched for

        Notes
        -----
        Metadata information for each sample is in the same order as the
        metadata columns list returned

        Metadata column names and string searches are case-sensitive
        """
        study_sql, sample_sql, meta_headers = \
            self._parse_study_search_string(searchstr)
        conn_handler = SQLConnectionHandler()
        # get all studies containing the metadata headers requested
        study_ids = {x[0] for x in conn_handler.execute_fetchall(study_sql)}
        # strip to only studies user has access to
        userobj = User(user)
        study_ids = study_ids.intersection(Study.get_public() +
                                           userobj.private_studies +
                                           userobj.shared_studies)
        results = {}
        # run search on each study to get out the matching samples
        for sid in study_ids:
            # sample_sql carries a {0} placeholder for the study id, which
            # selects the study-specific sample table
            study_res = conn_handler.execute_fetchall(sample_sql.format(sid))
            if study_res:
                # only add study to results if actually has samples in results
                results[sid] = study_res
        return results, meta_headers

    def _parse_study_search_string(self, searchstr):
        """parses string into SQL query for study search

        Parameters
        ----------
        searchstr : str
            The string to parse

        Returns
        -------
        study_sql : str
            SQL query for selecting studies with the required metadata columns
        sample_sql : str
            SQL query for each study to get the sample ids that match the
            query. Contains a ``{0}`` placeholder for the study id.
        meta_headers : list
            metadata categories in the query string in alphabetical order

        Notes
        -----
        All searches are case-sensitive

        The columns selected by `sample_sql` (after the sample id) follow the
        same order as `meta_headers`.

        References
        ----------
        .. [1] McGuire P (2007) Getting started with pyparsing.
        """
        # build the parse grammar
        category = Word(alphas + nums + "_")
        separator = oneOf("> < = >= <= !=") | CaselessLiteral("includes") | \
            CaselessLiteral("startswith")
        value = Word(alphas + nums + "_" + ":" + ".") | \
            dblQuotedString().setParseAction(removeQuotes)
        criterion = Group(category + separator + value)
        criterion.setParseAction(SearchTerm)
        and_ = CaselessLiteral("and")
        or_ = CaselessLiteral("or")
        not_ = CaselessLiteral("not")
        optional_seps = Optional(and_ | or_ | not_)

        # create the grammar for parsing operators AND, OR, NOT
        search_expr = operatorPrecedence(criterion,
                                         [(not_, 1, opAssoc.RIGHT, SearchNot),
                                          (and_, 2, opAssoc.LEFT, SearchAnd),
                                          (or_, 2, opAssoc.LEFT, SearchOr)])

        # parse the search string to get out the SQL WHERE formatted query
        eval_stack = (search_expr + stringEnd).parseString(searchstr)[0]
        sql_where = eval_stack.generate_sql()

        # this lookup will be used to select only studies with columns
        # of the correct type
        type_lookup = {int: 'integer', float: 'float8', str: 'varchar'}

        # parse out, in a single scan, all metadata headers we need to have
        # in a study together with the value each one is compared against
        header_values = [
            (c[0][0].term[0], c[0][0].term[2])
            for c in (criterion + optional_seps).scanString(searchstr)
        ]

        # pair every header with the SQL type of its value and sort the pairs
        # by header (stable sort, so repeated headers keep their query order)
        header_types = sorted(
            ((header, type_lookup[type(typecast_string(val))])
             for header, val in header_values),
            key=lambda pair: pair[0])
        meta_headers = set(header for header, _ in header_types)

        # At this point it is possible that a metadata header has been
        # referenced more than once in the query. If the types agree, then we
        # do not need to do anything. If the types do not agree (specifically,
        # if it appears to be numerical in one case and string in another),
        # then we need to give varchar the precedence.
        meta_header_type_lookup = dict()
        for header, header_type in header_types:
            if header not in meta_header_type_lookup:
                meta_header_type_lookup[header] = header_type
            elif 'varchar' in (header_type, meta_header_type_lookup[header]):
                meta_header_type_lookup[header] = 'varchar'

        # explicit alphabetical list: dict ordering is not guaranteed on all
        # interpreters, and callers are promised a sorted list
        sorted_headers = sorted(meta_header_type_lookup)

        # create the study finding SQL
        # remove metadata headers that are in the required_sample_info or
        # study tables, leaving only the study-specific ones
        meta_headers = meta_headers.difference(self.required_cols).difference(
            self.study_cols)

        # get all study ids that contain all metadata categories searched for
        sql = []
        if meta_headers:
            # have study-specific metadata, so need to find specific studies
            # (sorted so the generated SQL text is deterministic)
            for meta in sorted(meta_headers):
                if meta_header_type_lookup[meta] in ('integer', 'float8'):
                    allowable_types = "('integer', 'float8')"
                else:
                    allowable_types = "('varchar')"

                sql.append("SELECT study_id FROM qiita.study_sample_columns "
                           "WHERE lower(column_name) = lower('%s') and "
                           "column_type in %s" %
                           (scrub_data(meta), allowable_types))
        else:
            # no study-specific metadata, so need all studies
            sql.append("SELECT study_id FROM qiita.study_sample_columns")
        # combine the query
        study_sql = ' INTERSECT '.join(sql)

        # create the sample finding SQL, getting both sample id and values
        # build the sql formatted list of metadata headers, in the same order
        # as the header list that is returned
        header_info = []
        for meta in sorted_headers:
            if meta in self.required_cols:
                header_info.append("r.%s" % meta)
            elif meta in self.study_cols:
                header_info.append("st.%s" % meta)
            else:
                header_info.append("sa.%s" % meta)
        # build the SQL query
        # NOTE(review): the result is later passed through str.format() to
        # fill in {0}; a literal brace inside sql_where would break that call.
        # Confirm that scrub_data / generate_sql never emit braces.
        sample_sql = ("SELECT r.sample_id,%s FROM qiita.required_sample_info "
                      "r JOIN qiita.sample_{0} sa ON sa.sample_id = "
                      "r.sample_id JOIN qiita.study st ON st.study_id = "
                      "r.study_id WHERE %s" %
                      (','.join(header_info), sql_where))
        return study_sql, sample_sql, sorted_headers
Example #22
0
    def _add_common_creation_steps_to_queue(cls, md_template, obj_id,
                                            conn_handler, queue_name):
        r"""Adds the common creation steps to the queue in conn_handler

        Four statements are queued, in this order: the insert of the
        required-column values into ``cls._table``, the insert of the
        remaining column names and types into ``cls._column_table``, the
        creation of the per-object table and the insert of the remaining
        column values into that table.

        Parameters
        ----------
        md_template : DataFrame
            The metadata template file contents indexed by sample ids
        obj_id : int
            The id of the object being created
        conn_handler : SQLConnectionHandler
            The connection handler object connected to the DB
        queue_name : str
            The queue where the SQL statements will be added

        Notes
        -----
        Column names from `md_template` are interpolated directly into the
        SQL text as identifiers; callers are presumably expected to have
        validated them beforehand (not visible from this method).
        """
        cls._check_subclass()
        # Get some useful information from the metadata template
        sample_ids = md_template.index.tolist()
        num_samples = len(sample_ids)
        headers = list(md_template.keys())

        # Get the required columns from the DB
        db_cols = sorted(get_table_cols(cls._table, conn_handler))
        # Remove the sample_id and _id_column columns
        db_cols.remove('sample_id')
        db_cols.remove(cls._id_column)

        # Insert values on required columns
        values = as_python_types(md_template, db_cols)
        values.insert(0, sample_ids)
        values.insert(0, [obj_id] * num_samples)
        values = list(zip(*values))
        # Join the complete column list (rather than appending to a fixed
        # prefix) so no dangling comma is produced if db_cols is empty
        required_cols = [cls._id_column, 'sample_id'] + db_cols
        conn_handler.add_to_queue(
            queue_name,
            "INSERT INTO qiita.{0} ({1}) VALUES ({2})".format(
                cls._table, ', '.join(required_cols),
                ', '.join(['%s'] * len(required_cols))),
            values, many=True)

        # Insert rows on *_columns table
        headers = sorted(set(headers).difference(db_cols))
        # .loc is the label-based indexer; .ix has been removed from pandas
        datatypes = get_datatypes(md_template.loc[:, headers])
        # psycopg2 requires a list of tuples, in which each tuple is a set
        # of values to use in the string formatting of the query. We have all
        # the values in different lists (but in the same order) so use zip
        # to create the list of tuples that psycopg2 requires.
        values = list(zip([obj_id] * len(headers), headers, datatypes))
        conn_handler.add_to_queue(
            queue_name,
            "INSERT INTO qiita.{0} ({1}, column_name, column_type) "
            "VALUES (%s, %s, %s)".format(cls._column_table, cls._id_column),
            values, many=True)

        # Create table with custom columns
        table_name = cls._table_name(obj_id)
        column_defs = ["sample_id varchar NOT NULL"]
        column_defs.extend("%s %s" % (col, dtype)
                           for col, dtype in zip(headers, datatypes))
        # The statement stays valid SQL even when the template has no custom
        # columns (the table then only holds sample_id)
        conn_handler.add_to_queue(
            queue_name,
            "CREATE TABLE qiita.{0} ({1})".format(
                table_name, ', '.join(column_defs)))

        # Insert values on custom table
        values = as_python_types(md_template, headers)
        values.insert(0, sample_ids)
        values = list(zip(*values))
        custom_cols = ['sample_id'] + headers
        conn_handler.add_to_queue(
            queue_name,
            "INSERT INTO qiita.{0} ({1}) VALUES ({2})".format(
                table_name, ", ".join(custom_cols),
                ', '.join(["%s"] * len(custom_cols))),
            values, many=True)
Example #23
0
    def post(self):
        """Handles the sample selection page actions and renders the page

        The ``action`` request argument drives the behavior:

        - ``create``: creates a new analysis from the ``name`` and
          ``description`` arguments, moves it to the sample selection step
          and pre-fills the results with a default study query.
        - ``search``: runs the ``query`` argument through QiitaStudySearch.
        - ``select``: adds the samples chosen in the form to the analysis.
        - ``deselect``: removes the chosen processed data and/or samples
          from the analysis.

        For every action other than ``create`` the analysis is loaded from
        the ``analysis-id`` argument after checking the user's access.
        The ``search_studies.html`` template is always rendered at the end.
        """
        user = self.current_user
        action = self.get_argument("action")
        # set required template variables
        results = {}
        meta_headers = []
        counts = {}
        fullcounts = {}
        query = ""
        searchmsg = ""
        selsamples = {}
        selproc_data = {}
        # get analysis and selected samples if exists, or create if necessary
        if action == "create":
            name = self.get_argument('name')
            description = self.get_argument('description')
            analysis = Analysis.create(User(user), name, description)
            analysis_id = analysis.id
            # set to second step since this page is second step in workflow
            analysis.step = SELECT_SAMPLES
            # fill example studies by running query for specific studies
            search = QiitaStudySearch()
            def_query = 'study_id = 1 OR study_id = 2 OR study_id = 3'
            results, meta_headers = search(def_query, user)
            results, counts, fullcounts = self._parse_search_results(
                results, selsamples, meta_headers)
        else:
            analysis_id = int(self.get_argument("analysis-id"))
            check_analysis_access(User(user), analysis_id)
            analysis = Analysis(analysis_id)
            selproc_data, selsamples = self._selected_parser(analysis)

        # run through action requested
        if action == "search":
            search = QiitaStudySearch()
            query = str(self.get_argument("query"))
            try:
                results, meta_headers = search(query, user)
            except ParseException:
                searchmsg = "Malformed search query, please read search help."
            except QiitaDBIncompatibleDatatypeError as e:
                # str() works on every Python version; exceptions are not
                # iterable on Python 3, so ''.join(e) would raise TypeError
                searchmsg = str(e)
            else:
                # only post-process when the search succeeded and actually
                # returned something; failures keep the empty defaults
                if results:
                    results, counts, fullcounts = \
                        self._parse_search_results(
                            results, selsamples, meta_headers)
                else:
                    searchmsg = "No results found."

        elif action == "select":
            analysis.add_samples(self._parse_form_select())

            # rebuild the selected from database to reflect changes
            selproc_data, selsamples = self._selected_parser(analysis)

        elif action == "deselect":
            proc_data, samples = self._parse_form_deselect()
            if proc_data:
                analysis.remove_samples(proc_data=proc_data)
            if samples:
                analysis.remove_samples(samples=samples)
            if not proc_data and not samples:
                searchmsg = "Must select samples to remove from analysis!"

            # rebuild the selected from database to reflect changes
            selproc_data, selsamples = self._selected_parser(analysis)

        self.render('search_studies.html',
                    user=user,
                    aid=analysis_id,
                    results=results,
                    meta_headers=meta_headers,
                    selsamples=selsamples,
                    selproc_data=selproc_data,
                    counts=counts,
                    fullcounts=fullcounts,
                    searchmsg=searchmsg,
                    query=query,
                    availmeta=SampleTemplate.metadata_headers() +
                    get_table_cols("study"))