Example #1
    def test_create_templates_from_qiime_mapping_file_reverse_linker(self):
        with TRN:
            TRN.add("SELECT last_value FROM "
                    "qiita.prep_template_prep_template_id_seq")
            curr_id = TRN.execute_fetchflatten()[0]
        obs_st, obs_pt = create_templates_from_qiime_mapping_file(
            StringIO(QIIME_MAP_WITH_REVERSE_LINKER_PRIMER), self.new_study,
            "16S")

        # Be green: clean the environment
        for template in [obs_st, obs_pt]:
            for _, fp in template.get_filepaths():
                self._clean_up_files.append(fp)

        self.assertEqual(obs_st.id, self.new_study.id)
        self.assertEqual(obs_pt.id, curr_id + 1)

        # Check that each template has the correct columns
        exp = {
            "physical_specimen_location", "physical_specimen_remaining",
            "dna_extracted", "sample_type", "host_subject_id", "latitude",
            "longitude", "taxon_id", "scientific_name", "collection_timestamp",
            "description"
        }
        self.assertEqual(set(obs_st.categories()), exp)

        exp = {
            "barcode", "primer", "center_name", "run_prefix", "platform",
            "library_construction_protocol", "instrument_model",
            "experiment_design_description", "reverselinkerprimer"
        }
        self.assertEqual(set(obs_pt.categories()), exp)
Example #2
    def status(self):
        """The status of the prep template

        Returns
        -------
        str
            The status of the prep template

        Notes
        -----
        The status of a prep template is inferred by the status of the
        processed data generated from this prep template. If no processed
        data has been generated with this prep template, then the status
        is 'sandbox'.
        """
        with TRN:
            sql = """SELECT processed_data_status
                    FROM qiita.processed_data_status pds
                      JOIN qiita.processed_data pd
                        USING (processed_data_status_id)
                      JOIN qiita.preprocessed_processed_data ppd_pd
                        USING (processed_data_id)
                      JOIN qiita.prep_template_preprocessed_data pt_ppd
                        USING (preprocessed_data_id)
                    WHERE pt_ppd.prep_template_id=%s"""
            TRN.add(sql, [self._id])

            return infer_status(TRN.execute_fetchindex())
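
The Notes above hinge on infer_status; a minimal sketch of the priority rule it plausibly applies (an assumption for illustration -- the real helper lives in qiita_db.util and may differ in detail):

def infer_status_sketch(results):
    """Pick the most visible status from the fetched rows, defaulting to
    'sandbox' when no processed data exists. Hypothetical re-implementation.
    """
    statuses = {row[0] for row in results}
    for status in ('public', 'private', 'awaiting_approval'):
        if status in statuses:
            return status
    return 'sandbox'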
Example #3
    def test_context_manager_multiple_2(self):
        self.assertEqual(TRN._contexts_entered, 0)

        def tester():
            self.assertEqual(TRN._contexts_entered, 1)
            with TRN:
                self.assertEqual(TRN._contexts_entered, 2)
                sql = """SELECT EXISTS(
                        SELECT * FROM qiita.test_table WHERE int_column=%s)"""
                TRN.add(sql, [2])
                self.assertTrue(TRN.execute_fetchlast())
            self.assertEqual(TRN._contexts_entered, 1)

        with TRN:
            self.assertEqual(TRN._contexts_entered, 1)
            sql = """INSERT INTO qiita.test_table (str_column, int_column)
                         VALUES (%s, %s) RETURNING str_column, int_column"""
            args = [["insert1", 1], ["insert2", 2], ["insert3", 3]]
            TRN.add(sql, args, many=True)
            tester()
            self.assertEqual(TRN._contexts_entered, 1)
            self._assert_sql_equal([])

        self.assertEqual(TRN._contexts_entered, 0)
        self._assert_sql_equal([("insert1", True, 1), ("insert2", True, 2), ("insert3", True, 3)])
        self.assertEqual(TRN._connection.get_transaction_status(), TRANSACTION_STATUS_IDLE)
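
The _contexts_entered assertions rely on TRN being a re-entrant singleton: every with TRN: increments a counter, and only the outermost exit commits or rolls back. A toy sketch of that pattern (illustrative only; the real class is qiita_db.sql_connection.Transaction):

class ReentrantTransaction:
    # Mirrors only the nesting bookkeeping exercised by the test above;
    # the commit/rollback and query-queue logic is omitted.
    def __init__(self):
        self._contexts_entered = 0

    def __enter__(self):
        self._contexts_entered += 1
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._contexts_entered -= 1
        if self._contexts_entered == 0:
            pass  # outermost exit: commit on success, roll back on error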
Example #4
def _generate_study_list_for_api(visibility, only_biom=True):
    """Get general study information

    Parameters
    ----------
    visibility : string
        The visibility level of the studies to retrieve
    only_biom : bool
        If True, restrict the results to BIOM artifacts (default: True)

    Returns
    -------
    dict of {int: list of int}
        The study ids mapped to the ids of their artifacts
    """
    artifact_type = ''
    if only_biom:
        artifact_type = "AND artifact_type = 'BIOM'"

    sql = f"""
        SELECT study_id, array_agg(DISTINCT artifact_id) FROM qiita.study
            INNER JOIN qiita.study_artifact USING (study_id)
            INNER JOIN qiita.artifact USING (artifact_id)
            INNER JOIN qiita.artifact_type USING (artifact_type_id)
            INNER JOIN qiita.visibility USING (visibility_id)
        WHERE visibility = %s
        {artifact_type}
        GROUP BY study_id
    """
    with TRN:
        TRN.add(sql, [visibility])
        return dict(TRN.execute_fetchindex())
Example #5
    def _check_id(self, id_):
        r"""Checks that the MetadataTemplate id_ exists in the database"""
        with TRN:
            sql = "SELECT EXISTS(SELECT * FROM qiita.{0} WHERE {1}=%s)".format(
                self._table, self._id_column)
            TRN.add(sql, [id_])
            return TRN.execute_fetchlast()
Example #6
    def test_create_templates_from_qiime_mapping_file_reverse_linker(self):
        with TRN:
            TRN.add("SELECT last_value FROM "
                    "qiita.prep_template_prep_template_id_seq")
            curr_id = TRN.execute_fetchflatten()[0]
        obs_st, obs_pt = create_templates_from_qiime_mapping_file(
            StringIO(QIIME_MAP_WITH_REVERSE_LINKER_PRIMER),
            self.new_study, "16S")

        # Be green: clean the environment
        for template in [obs_st, obs_pt]:
            for _, fp in template.get_filepaths():
                self._clean_up_files.append(fp)

        self.assertEqual(obs_st.id, self.new_study.id)
        self.assertEqual(obs_pt.id, curr_id + 1)

        # Check that each template has the correct columns
        exp = {"physical_specimen_location", "physical_specimen_remaining",
               "dna_extracted", "sample_type", "host_subject_id", "latitude",
               "longitude", "taxon_id", "scientific_name",
               "collection_timestamp", "description"}
        self.assertEqual(set(obs_st.categories()), exp)

        exp = {"barcode", "primer", "center_name", "run_prefix", "platform",
               "library_construction_protocol", "instrument_model",
               "experiment_design_description", "reverselinkerprimer"}
        self.assertEqual(set(obs_pt.categories()), exp)
Example #7
    def exists(cls, sample_id, md_template):
        r"""Checks whether a MetadataTemplate already exists for the provided
        object

        Parameters
        ----------
        sample_id : str
            The sample id
        md_template : MetadataTemplate
            The metadata template to which the sample belongs

        Returns
        -------
        bool
            True if already exists. False otherwise.
        """
        with TRN:
            cls._check_subclass()
            sql = """SELECT EXISTS(
                        SELECT * FROM qiita.{0}
                        WHERE sample_id=%s AND {1}=%s
                    )""".format(
                cls._table, cls._id_column
            )
            TRN.add(sql, [sample_id, md_template.id])
            return TRN.execute_fetchlast()
Example #8
    def update(self, md_template):
        r"""Update values in the template

        Parameters
        ----------
        md_template : DataFrame
            The metadata template file contents indexed by sample ids

        Raises
        ------
        QiitaDBError
            If md_template and db do not have the same sample ids
            If md_template and db do not have the same column headers
            If self.can_be_updated is not True
        """
        with TRN:
            # Clean and validate the metadata template given
            new_map = self._clean_validate_template(
                md_template, self.study_id, self.columns_restrictions)
            # Retrieving current metadata
            sql = "SELECT * FROM qiita.{0}".format(self._table_name(self.id))
            TRN.add(sql)
            current_map = self._transform_to_dict(TRN.execute_fetchindex())
            current_map = pd.DataFrame.from_dict(current_map, orient="index")

            # simple validations of sample ids and column names
            samples_diff = set(new_map.index).difference(current_map.index)
            if samples_diff:
                raise QiitaDBError(
                    "The new template differs from what is stored in the "
                    "database by these sample names: %s"
                    % ", ".join(samples_diff))
            columns_diff = set(new_map.columns).difference(current_map.columns)
            if columns_diff:
                raise QiitaDBError(
                    "The new template differs from what is stored in the "
                    "database by these column names: %s"
                    % ", ".join(columns_diff))

            # here we are comparing two dataframes following:
            # http://stackoverflow.com/a/17095620/4228285
            current_map.sort_index(axis=0, inplace=True)
            current_map.sort_index(axis=1, inplace=True)
            new_map.sort_index(axis=0, inplace=True)
            new_map.sort_index(axis=1, inplace=True)
            map_diff = (current_map != new_map).stack()
            map_diff = map_diff[map_diff]
            map_diff.index.names = ["id", "column"]
            changed_cols = map_diff.index.get_level_values("column").unique()

            if not self.can_be_updated(columns=set(changed_cols)):
                raise QiitaDBError(
                    "The new template is modifying fields that cannot be "
                    "modified. Try removing the target gene fields or "
                    "deleting the processed data. You are trying to modify: %s" % ", ".join(changed_cols)
                )

            for col in changed_cols:
                self.update_category(col, new_map[col].to_dict())

            self.generate_files()
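
The sort-and-stack comparison above is easier to see on toy data; a self-contained sketch (using sort_index, the modern pandas spelling) that recovers which cells changed:

import pandas as pd

old = pd.DataFrame({'color': ['red', 'blue'], 'depth': [1, 2]},
                   index=['s1', 's2'])
new = old.copy()
new.loc['s2', 'color'] = 'green'

# Align both frames, then stack the boolean inequality matrix so every
# True entry becomes a (sample, column) pair
old = old.sort_index(axis=0).sort_index(axis=1)
new = new.sort_index(axis=0).sort_index(axis=1)
diff = (old != new).stack()
diff = diff[diff]
diff.index.names = ['id', 'column']
print(diff.index.get_level_values('column').unique())  # Index(['color'])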
Example #9
    def preprocessed_data(self):
        with TRN:
            sql = """SELECT preprocessed_data_id
                     FROM qiita.prep_template_preprocessed_data
                     WHERE prep_template_id=%s"""
            TRN.add(sql, [self.id])
            return TRN.execute_fetchflatten()
Example #10
    def __getitem__(self, key):
        r"""Returns the value of the metadata category `key`

        Parameters
        ----------
        key : str
            The metadata category

        Returns
        -------
        obj
            The value of the metadata category `key`

        Raises
        ------
        KeyError
            If the metadata category `key` does not exist

        See Also
        --------
        get
        """
        with TRN:
            key = key.lower()
            if key not in self._get_categories():
                # The key is not available for the sample, so raise a KeyError
                raise KeyError(
                    "Metadata category %s does not exists for sample %s"
                    " in template %d" % (key, self._id, self._md_template.id))

            sql = """SELECT {0} FROM qiita.{1}
                     WHERE sample_id=%s""".format(key, self._dynamic_table)
            TRN.add(sql, [self._id])
            return TRN.execute_fetchlast()
Example #11
    def setitem(self, column, value):
        """Sets `value` as value for the given `column`

        Parameters
        ----------
        column : str
            The column to update
        value : str
            The value to set. This is expected to be a str on the assumption
            that psycopg2 will cast as necessary when updating.

        Raises
        ------
        QiitaDBColumnError
            If the column does not exist in the table
        """
        with TRN:
            # Check that the column exists in the table
            if column not in self._get_categories():
                raise QiitaDBColumnError("Column %s does not exist in %s" %
                                         (column, self._dynamic_table))

            sql = """UPDATE qiita.{0}
                     SET {1}=%s
                     WHERE sample_id=%s""".format(self._dynamic_table, column)
            TRN.add(sql, [value, self._id])
Example #12
    def _common_creation_steps(cls, md_template, obj_id):
        r"""Executes the common creation steps

        Parameters
        ----------
        md_template : DataFrame
            The metadata template file contents indexed by sample ids
        obj_id : int
            The id of the object being created
        """
        with TRN:
            cls._check_subclass()

            # Get some useful information from the metadata template
            sample_ids = md_template.index.tolist()
            headers = sorted(md_template.keys().tolist())

            # Insert values on template_sample table
            values = [[obj_id, s_id] for s_id in sample_ids]
            sql = """INSERT INTO qiita.{0} ({1}, sample_id)
                     VALUES (%s, %s)""".format(cls._table, cls._id_column)
            TRN.add(sql, values, many=True)

            # Insert rows on *_columns table
            datatypes = get_datatypes(md_template.loc[:, headers])
            # psycopg2 requires a list of tuples, in which each tuple is a set
            # of values to use in the string formatting of the query. We have
            # all the values in different lists (but in the same order) so use
            # zip to create the list of tuples that psycopg2 requires.
            values = [[obj_id, h, d] for h, d in zip(headers, datatypes)]
            sql = """INSERT INTO qiita.{0} ({1}, column_name, column_type)
                     VALUES (%s, %s, %s)""".format(cls._column_table,
                                                   cls._id_column)
            TRN.add(sql, values, many=True)

            # Create table with custom columns
            table_name = cls._table_name(obj_id)
            column_datatype = ["%s %s" % (col, dtype)
                               for col, dtype in zip(headers, datatypes)]
            sql = """CREATE TABLE qiita.{0} (
                        sample_id varchar NOT NULL, {1},
                        CONSTRAINT fk_{0} FOREIGN KEY (sample_id)
                            REFERENCES qiita.study_sample (sample_id)
                            ON UPDATE CASCADE
                     )""".format(table_name, ', '.join(column_datatype))
            TRN.add(sql)

            # Insert values on custom table
            values = as_python_types(md_template, headers)
            values.insert(0, sample_ids)
            values = [list(v) for v in zip(*values)]
            sql = """INSERT INTO qiita.{0} (sample_id, {1})
                     VALUES (%s, {2})""".format(
                table_name, ", ".join(headers),
                ', '.join(["%s"] * len(headers)))
            TRN.add(sql, values, many=True)

            # Execute all the steps
            TRN.execute()
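
The zip(*values) step above is the usual columns-to-rows transposition psycopg2 needs; in miniature:

sample_ids = ['s1', 's2']
col_a = [1, 2]
col_b = ['x', 'y']
values = [sample_ids, col_a, col_b]      # one list per column
rows = [list(v) for v in zip(*values)]   # one list per row
assert rows == [['s1', 1, 'x'], ['s2', 2, 'y']]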
Example #13
    def update_category(self, category, samples_and_values):
        """Update an existing column

        Parameters
        ----------
        category : str
            The category to update
        samples_and_values : dict
            A mapping of {sample_id: value}

        Raises
        ------
        QiitaDBUnknownIDError
            If a sample_id is included in values that is not in the template
        QiitaDBColumnError
            If the column does not exist in the table. This is implicit, and
            can be thrown by the contained Samples.
        ValueError
            If one of the new values cannot be inserted in the DB due to
            different types
        """
        with TRN:
            if not set(self.keys()).issuperset(samples_and_values):
                missing = set(samples_and_values) - set(self.keys())
                table_name = self._table_name(self._id)
                raise QiitaDBUnknownIDError(missing, table_name)

            for k, v in viewitems(samples_and_values):
                sample = self[k]
                sample.setitem(category, v)

            try:
                TRN.execute()
            except ValueError as e:
                # catching the error so we can check whether it is due to
                # a different column type or something else

                value_types = set(type_lookup(type(value))
                                  for value in viewvalues(samples_and_values))

                sql = """SELECT udt_name
                         FROM information_schema.columns
                         WHERE column_name = %s
                            AND table_schema = 'qiita'
                            AND (table_name = %s OR table_name = %s)"""
                TRN.add(sql, [category, self._table,
                              self._table_name(self._id)])
                column_type = TRN.execute_fetchlast()

                if any([column_type != vt for vt in value_types]):
                    value_str = ", ".join(
                        [str(value)
                         for value in viewvalues(samples_and_values)])
                    value_types_str = ", ".join(value_types)

                    raise ValueError(
                        'The new values being added to column: "%s" are "%s" '
                        '(types: "%s"). However, this column in the DB is of '
                        'type "%s". Please change the values in your updated '
                        'template or reprocess your template.'
                        % (category, value_str, value_types_str, column_type))

                raise e
Example #14
    def test_post_commit_funcs_error(self):
        def func():
            raise ValueError()

        with self.assertRaises(RuntimeError):
            with TRN:
                TRN.add("SELECT 42")
                TRN.add_post_commit_func(func)
Example #15
def _set_allocation(memory):
    with TRN:
        sql = """UPDATE qiita.processing_job_resource_allocation
                 SET allocation = '{0}'
                 WHERE name = 'build_analysis_files'""".format(
            '-q qiita -l mem=%s' % memory)
        TRN.add(sql)
        TRN.execute()
Example #16
    def tester():
        self.assertEqual(TRN._contexts_entered, 1)
        with TRN:
            self.assertEqual(TRN._contexts_entered, 2)
            sql = """SELECT EXISTS(
                    SELECT * FROM qiita.test_table WHERE int_column=%s)"""
            TRN.add(sql, [2])
            self.assertTrue(TRN.execute_fetchlast())
        self.assertEqual(TRN._contexts_entered, 1)
Example #17
    def test_add_many(self):
        with TRN:
            self.assertEqual(TRN._queries, [])

            sql = "INSERT INTO qiita.test_table (int_column) VALUES (%s)"
            args = [[1], [2], [3]]
            TRN.add(sql, args, many=True)

            exp = [(sql, [1]), (sql, [2]), (sql, [3])]
            self.assertEqual(TRN._queries, exp)
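
The expected _queries list shows what many=True does: the same SQL is queued once per argument set. A sketch of that bookkeeping, inferred from the test rather than taken from the real implementation:

def add(queries, sql, sql_args=None, many=False):
    # Hypothetical stand-in for TRN.add; the real method also validates
    # connection state and argument types.
    if not many:
        sql_args = [sql_args]
    for args in sql_args:
        queries.append((sql, args))

queries = []
add(queries, "INSERT INTO t (c) VALUES (%s)", [[1], [2]], many=True)
assert queries == [("INSERT INTO t (c) VALUES (%s)", [1]),
                   ("INSERT INTO t (c) VALUES (%s)", [2])]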
Example #18
    def test_execute_return(self):
        with TRN:
            sql = """INSERT INTO qiita.test_table (str_column, int_column)
                     VALUES (%s, %s) RETURNING str_column, int_column"""
            TRN.add(sql, ["test_insert", 2])
            sql = """UPDATE qiita.test_table SET bool_column = %s
                     WHERE str_column = %s RETURNING int_column"""
            TRN.add(sql, [False, "test_insert"])
            obs = TRN.execute()
            self.assertEqual(obs, [[["test_insert", 2]], [[2]]])
Example #19
    def test_context_manager_execute(self):
        with TRN:
            sql = """INSERT INTO qiita.test_table (str_column, int_column)
                     VALUES (%s, %s) RETURNING str_column, int_column"""
            args = [["insert1", 1], ["insert2", 2], ["insert3", 3]]
            TRN.add(sql, args, many=True)
            self._assert_sql_equal([])

        self._assert_sql_equal([("insert1", True, 1), ("insert2", True, 2), ("insert3", True, 3)])
        self.assertEqual(TRN._connection.get_transaction_status(), TRANSACTION_STATUS_IDLE)
Example #20
    def raw_data(self):
        with TRN:
            sql = """SELECT raw_data_id FROM qiita.prep_template
                     WHERE prep_template_id=%s"""
            TRN.add(sql, [self.id])
            result = TRN.execute_fetchindex()
            if result:
                # If there is any result, it will be in the first row
                # and in the first element of that row, thus [0][0]
                return result[0][0]
            return None
Example #21
    def test_execute_fetchlast(self):
        with TRN:
            sql = """INSERT INTO qiita.test_table (str_column, int_column)
                     VALUES (%s, %s) RETURNING str_column, int_column"""
            args = [["insert1", 1], ["insert2", 2], ["insert3", 3]]
            TRN.add(sql, args, many=True)

            sql = """SELECT EXISTS(
                        SELECT * FROM qiita.test_table WHERE int_column=%s)"""
            TRN.add(sql, [2])
            self.assertTrue(TRN.execute_fetchlast())
Example #22
    def _get_sample_ids(self):
        r"""Returns all the available samples for the metadata template

        Returns
        -------
        set of str
            The set of all available sample ids
        """
        with TRN:
            sql = "SELECT sample_id FROM qiita.{0} WHERE {1}=%s".format(self._table, self._id_column)
            TRN.add(sql, [self._id])
            return set(TRN.execute_fetchflatten())
Example #23
    def study_id(self):
        """Gets the study id with which this prep template is associated

        Returns
        -------
        int
            The ID of the study with which this prep template is associated
        """
        with TRN:
            sql = """SELECT study_id FROM qiita.study_prep_template
                     WHERE prep_template_id=%s"""
            TRN.add(sql, [self.id])
            return TRN.execute_fetchlast()
Example #24
    def preprocessing_status(self):
        r"""Tells if the data has been preprocessed or not

        Returns
        -------
        str
            One of {'not_preprocessed', 'preprocessing', 'success', 'failed'}
        """
        with TRN:
            sql = """SELECT preprocessing_status FROM qiita.prep_template
                     WHERE {0}=%s""".format(self._id_column)
            TRN.add(sql, [self.id])
            return TRN.execute_fetchlast()
Example #25
    def metadata_headers():
        """Returns metadata headers available

        Returns
        -------
        list
            Alphabetical list of all metadata headers available
        """
        with TRN:
            sql = """SELECT DISTINCT column_name
                     FROM qiita.study_sample_columns ORDER BY column_name"""
            TRN.add(sql)
            return TRN.execute_fetchflatten()
Example #26
    def test_execute(self):
        with TRN:
            sql = """INSERT INTO qiita.test_table (str_column, int_column)
                     VALUES (%s, %s)"""
            TRN.add(sql, ["test_insert", 2])
            sql = """UPDATE qiita.test_table
                     SET int_column = %s, bool_column = %s
                     WHERE str_column = %s"""
            TRN.add(sql, [20, False, "test_insert"])
            obs = TRN.execute()
            self.assertEqual(obs, [None, None])
            self._assert_sql_equal([])

        self._assert_sql_equal([("test_insert", False, 20)])
Example #27
    def test_context_manager_rollback(self):
        try:
            with TRN:
                sql = """INSERT INTO qiita.test_table (str_column, int_column)
                         VALUES (%s, %s) RETURNING str_column, int_column"""
                args = [["insert1", 1], ["insert2", 2], ["insert3", 3]]
                TRN.add(sql, args, many=True)

                TRN.execute()
                raise ValueError("Force exiting the context manager")
        except ValueError:
            pass
        self._assert_sql_equal([])
        self.assertEqual(TRN._connection.get_transaction_status(),
                         TRANSACTION_STATUS_IDLE)
Example #28
    def test_post_commit_funcs(self):
        fd, fp = mkstemp()
        close(fd)
        self._files_to_remove.append(fp)

        def func(fp):
            with open(fp, "w") as f:
                f.write("\n")

        with TRN:
            TRN.add("SELECT 42")
            TRN.add_post_commit_func(func, fp)

        self.assertTrue(exists(fp))
Example #29
    def test_execute_many(self):
        with TRN:
            sql = """INSERT INTO qiita.test_table (str_column, int_column)
                     VALUES (%s, %s)"""
            args = [["insert1", 1], ["insert2", 2], ["insert3", 3]]
            TRN.add(sql, args, many=True)
            sql = """UPDATE qiita.test_table
                     SET int_column = %s, bool_column = %s
                     WHERE str_column = %s"""
            TRN.add(sql, [20, False, "insert2"])
            obs = TRN.execute()
            self.assertEqual(obs, [None, None, None, None])

            self._assert_sql_equal([])

        self._assert_sql_equal([("insert1", True, 1), ("insert3", True, 3), ("insert2", False, 20)])
Example #30
    def test_execute_commit_false_rollback(self):
        with TRN:
            sql = """INSERT INTO qiita.test_table (str_column, int_column)
                     VALUES (%s, %s) RETURNING str_column, int_column"""
            args = [["insert1", 1], ["insert2", 2], ["insert3", 3]]
            TRN.add(sql, args, many=True)

            obs = TRN.execute()
            exp = [[["insert1", 1]], [["insert2", 2]], [["insert3", 3]]]
            self.assertEqual(obs, exp)

            self._assert_sql_equal([])

            TRN.rollback()

            self._assert_sql_equal([])
Example #31
    def add_filepath(self, filepath, fp_id=None):
        r"""Populates the DB tables for storing the filepath and connects the
        `self` objects with this filepath"""
        with TRN:
            fp_id = self._fp_id if fp_id is None else fp_id

            try:
                fpp_id = insert_filepaths(
                    [(filepath, fp_id)], None, "templates", "filepath",
                    move_files=False)[0]
                sql = """INSERT INTO qiita.{0} ({1}, filepath_id)
                         VALUES (%s, %s)""".format(
                    self._filepath_table, self._id_column)
                TRN.add(sql, [self._id, fpp_id])
                TRN.execute()
            except Exception as e:
                LogEntry.create("Runtime", str(e), info={self.__class__.__name__: self.id})
                raise e
Example #32
    def _to_dict(self):
        r"""Returns the categories and their values in a dictionary

        Returns
        -------
        dict of {str: str}
            A dictionary of the form {category: value}
        """
        with TRN:
            sql = "SELECT * FROM qiita.{0} WHERE sample_id=%s".format(self._dynamic_table)
            TRN.add(sql, [self._id])
            d = dict(TRN.execute_fetchindex()[0])

            # Remove the sample_id; it is not part of the metadata
            del d["sample_id"]

            return d
Example #33
    def test_full_query_and_insertion(self):
        # let's archive different values from different jobs
        with TRN:
            # 3 - close reference picking
            # 3 - success
            sql = """SELECT processing_job_id
                     FROM qiita.processing_job
                     WHERE command_id = 3 AND processing_job_status_id = 3"""
            TRN.add(sql)
            jobs = TRN.execute_fetchflatten()

            # this is so we can also test the parent merging scheme
            # 1 - split libraries
            sql = """UPDATE qiita.command_parameter
                     SET check_biom_merge = True
                     WHERE command_id = 1
                        and parameter_name = 'barcode_type'"""
            TRN.add(sql)
            TRN.execute()

            exp_all_features = {}
            for j in jobs:
                featureA = 'AA - %s' % j
                featureB = 'BB - %s' % j

                # testing that nothing is there
                data = {'job_id': j, 'features': [featureA, featureB]}
                obs = self.post(
                    '/qiita_db/archive/observations/', headers=self.header,
                    data=data)
                exp = {}
                self.assertEqual(obs.code, 200)
                self.assertEqual(loads(obs.body), exp)

                # inserting and testing insertion
                data = {'path': j,
                        'value': dumps({featureA: 'CA', featureB: 'CB'})}
                obs = self.patch(
                    '/qiita_db/archive/observations/', headers=self.header,
                    data=data)
                exp = {featureA: 'CA', featureB: 'CB'}
                self.assertEqual(obs.code, 200)
                self.assertEqual(loads(obs.body), exp)

                exp_all_features[featureA] = 'CA'
                exp_all_features[featureB] = 'CB'

            # testing retrieving all features
            obs = Archive.retrieve_feature_values()
            self.assertEqual(obs, exp_all_features)

            # this doesn't exist so should be empty
            obs = Archive.retrieve_feature_values(archive_merging_scheme='')
            self.assertEqual(obs, {})

            obs = Archive.retrieve_feature_values(
                archive_merging_scheme='Pick closed-reference OTUs | Split '
                'libraries FASTQ (barcode_type: golay_12)')
            self.assertEqual(obs, exp_all_features)
Example #34
def create_rarefied_biom_artifact(analysis, srare_cmd_id, biom_data, params,
                                  parent_biom_artifact_id, rarefaction_job_id,
                                  srare_cmd_out_id):
    """Creates the rarefied biom artifact

    Parameters
    ----------
    analysis : dict
        The analysis information
    srare_cmd_id : int
        The command id of "Single Rarefaction"
    biom_data : dict
        The biom information
    params : str
        The processing parameters
    parent_biom_artifact_id : int
        The parent biom artifact id
    rarefaction_job_id : str
        The job id of the rarefaction job
    srare_cmd_out_id : int
        The id of the single rarefaction output

    Returns
    -------
    int
        The artifact id
    """
    with TRN:
        # Transfer the file to an artifact
        # Magic number 7: artifact type -> biom
        artifact_id = transfer_file_to_artifact(
            analysis['analysis_id'], analysis['timestamp'], srare_cmd_id,
            biom_data['data_type_id'], params, 7, biom_data['filepath_id'])
        # Link the artifact with its parent
        sql = """INSERT INTO qiita.parent_artifact (artifact_id, parent_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [artifact_id, parent_biom_artifact_id])
        # Link the artifact as the job output
        sql = """INSERT INTO qiita.artifact_output_processing_job
                    (artifact_id, processing_job_id, command_output_id)
                 VALUES (%s, %s, %s)"""
        TRN.add(sql, [artifact_id, rarefaction_job_id, srare_cmd_out_id])
    return artifact_id
Example #35
def is_test_environment():
    """Checks if Qiita is running in a test environment

    Returns
    -------
    bool
        Whether Qiita is running in a test environment or not

    Notes
    -----
    Qiita is running in a test environment if:
        - It is connected to a test database, AND
        - The config file indicates that this is a test environment
    """
    # Check that we are not in a production environment
    with TRN:
        TRN.add("SELECT test FROM settings")
        test_db = TRN.execute_fetchflatten()[0]
    return qiita_config.test_environment and test_db
Example #37
    def test_delete_study(self):
        # as samples have been submitted to EBI, this will fail
        job = self._create_job('delete_study', {'study': 1})
        private_task(job.id)
        self.assertEqual(job.status, 'error')
        self.assertIn(
            "Cannot delete artifact 2: Artifact 2 has been "
            "submitted to EBI", job.log.msg)
        # making sure the analysis, first thing to delete, still exists
        self.assertTrue(Analysis.exists(1))

        # delete everything from the EBI submissions and the processing job so
        # we can try again: test success (with tags)
        with TRN:
            sql = """DELETE FROM qiita.ebi_run_accession"""
            TRN.add(sql)
            sql = """DELETE FROM qiita.artifact_processing_job"""
            TRN.add(sql)
            TRN.execute()

            # adding tags
            Study(1).update_tags(self.user, ['my new tag!'])

            job = self._create_job('delete_study', {'study': 1})
            private_task(job.id)

            self.assertEqual(job.status, 'success')
            with self.assertRaises(QiitaDBUnknownIDError):
                Study(1)
Example #38
def transfer_file_to_artifact(analysis_id, a_timestamp, command_id,
                              data_type_id, params, artifact_type_id,
                              filepath_id):
    """Creates a new artifact with the given filepath id

    Parameters
    ----------
    analysis_id : int
        The analysis id to attach the artifact
    a_timestamp : datetime.datetime
        The generated timestamp of the artifact
    command_id : int
        The command id of the artifact
    data_type_id : int
        The data type id of the artifact
    params : str
        The parameters of the artifact
    artifact_type_id : int
        The artifact type
    filepath_id : int
        The filepath id

    Returns
    -------
    int
        The artifact id
    """
    with TRN:
        # Add the row in the artifact table
        # Magic number 4: Visibility -> sandbox
        sql = """INSERT INTO qiita.artifact
                    (generated_timestamp, command_id, data_type_id,
                     command_parameters, visibility_id, artifact_type_id,
                     submitted_to_vamps)
                 VALUES (%s, %s, %s, %s, %s, %s, %s)
                 RETURNING artifact_id"""
        TRN.add(sql, [
            a_timestamp, command_id, data_type_id, params, 4, artifact_type_id,
            False
        ])
        artifact_id = TRN.execute_fetchlast()
        # Link the artifact with its file
        sql = """INSERT INTO qiita.artifact_filepath (artifact_id, filepath_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [artifact_id, filepath_id])
        # Link the artifact with the analysis
        sql = """INSERT INTO qiita.analysis_artifact
                    (analysis_id, artifact_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [analysis_id, artifact_id])

    return artifact_id
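
The magic numbers called out in the comments (visibility 4, artifact type 7) can also be resolved by name with convert_to_id from qiita_db.util, which Example #49 below already uses; a sketch (the exact table names passed here are assumptions):

from qiita_db.util import convert_to_id

visibility_id = convert_to_id('sandbox', 'visibility')
artifact_type_id = convert_to_id('BIOM', 'artifact_type')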
Example #39
def create_rarefaction_job(depth, biom_artifact_id, analysis, srare_cmd_id):
    """Create a new rarefaction job

    Parameters
    ----------
    depth : int
        The rarefaction depth
    biom_artifact_id : int
        The artifact id of the input rarefaction biom table
    analysis : dict
        Dictionary with the analysis information
    srare_cmd_id : int
        The command id of the single rarefaction command

    Returns
    -------
    job_id : str
        The job id
    params : str
        The job parameters
    """
    # Add the row in the processing job table
    params = ('{"depth":%d,"subsample_multinomial":false,"biom_table":%s}' %
              (depth, biom_artifact_id))
    with TRN:
        # magic number 3: status -> success
        sql = """INSERT INTO qiita.processing_job
                    (email, command_id, command_parameters,
                     processing_job_status_id)
                 VALUES (%s, %s, %s, %s)
                 RETURNING processing_job_id"""
        TRN.add(sql, [analysis['email'], srare_cmd_id, params, 3])
        job_id = TRN.execute_fetchlast()
        # Step 1.2.b: Link the job with the input artifact
        sql = """INSERT INTO qiita.artifact_processing_job
                    (artifact_id, processing_job_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [biom_artifact_id, job_id])
        TRN.execute()
    return job_id, params
Example #40
    # 'prep_template_sample'
    sql = """SELECT table_name, array_agg(column_name::text)
                FROM information_schema.columns
                WHERE column_name IN %s
                    AND column_name != 'sample_id'
                    AND table_name LIKE 'prep_%%'
                    AND table_name NOT IN (
                        'prep_template', 'prep_template_sample')
                GROUP BY table_name"""
    # note that we are looking for those columns with duplicated names in
    # the headers
    headers = set(PrepTemplate.metadata_headers()) & \
        set(SampleTemplate.metadata_headers())

    if headers:
        TRN.add(sql, [tuple(headers)])
        overlapping = dict(TRN.execute_fetchindex())
    else:
        overlapping = None

if overlapping is not None:
    # finding actual duplicates
    for table_name, cols in overlapping.items():
        # leaving the print so that when we patch the main system we know
        # whether anything was renamed and can deal with it
        print(table_name)
        with TRN:
            for c in cols:
                sql = 'ALTER TABLE qiita.%s RENAME COLUMN %s TO %s_renamed' % (
                    table_name, c, c)
                TRN.add(sql)
Example #41
    if cols:
        to_fix.append((st, cols))

    for pt in s.prep_templates():
        if pt is None:
            continue
        cols = searcher(pt.to_dataframe())
        if cols:
            to_fix.append((pt, cols))

# now let's fix the database and regenerate the files
for infofile, cols in to_fix:
    with TRN:
        for col in cols:
            # removing tabs
            sql = """UPDATE qiita.{0}{1}
                        SET {2} = replace({2}, chr(9), ' ')""".format(
                infofile._table_prefix, infofile.id, col)
            TRN.add(sql)

            # removing enters
            sql = """UPDATE qiita.{0}{1}
                        SET {2} = regexp_replace(
                            {2}, E'[\\n\\r\\u2028]+', ' ', 'g' )""".format(
                infofile._table_prefix, infofile.id, col)
            TRN.add(sql)

        TRN.execute()

    infofile.generate_files()
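
For intuition, the two UPDATEs above perform the equivalent of this Python cleanup on every affected cell:

import re

cell = 'line one\r\nline\ttwo\u2028three'
cell = cell.replace('\t', ' ')             # the chr(9) replacement
cell = re.sub('[\n\r\u2028]+', ' ', cell)  # the regexp_replace equivalent
assert cell == 'line one line two three'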
Example #42
    def test_retrive_workflows(self):
        # we should see all 3 workflows
        DefaultWorkflow(2).active = False
        exp = deepcopy(WORKFLOWS)
        self.assertCountEqual(_retrive_workflows(False), exp)

        # validating that the params_name is not being used
        self.assertNotIn(
            'Split libraries | Defaults with Golay 12 barcodes',
            [x[2] for x in _retrive_workflows(False)[1]['nodes']])
        # now it should be there
        with TRN:
            # Hard-coded values; 19 -> barcode_type
            sql = """UPDATE qiita.command_parameter
                     SET name_order = 0
                     WHERE command_parameter_id = 19"""
            TRN.add(sql)
            TRN.execute()
        self.assertIn(
            'Split libraries | Defaults with Golay 12 barcodes',
            [x[2] for x in _retrive_workflows(False)[1]['nodes']])
        # and gone again
        with TRN:
            sql = """UPDATE qiita.command_parameter
                     SET name_order = NULL
                     WHERE command_parameter_id = 19"""
            TRN.add(sql)
            TRN.execute()
        self.assertNotIn(
            'Split libraries | Defaults with Golay 12 barcodes',
            [x[2] for x in _retrive_workflows(False)[1]['nodes']])

        # we should not see the middle one
        del exp[1]
        self.assertCountEqual(_retrive_workflows(True), exp)

        # let's create a couple of more complex scenarios so we touch all code
        # by adding multiple paths, that should connect and get separate
        # -- adds a new path that should be kept separate all the way; this is
        #    to emulate what happens with different trimming (different
        #    default parameter) and deblur (same for each of the previous
        #    steps)
        sql = """
            INSERT INTO qiita.default_workflow_node (
                default_workflow_id, default_parameter_set_id)
            VALUES (1, 2), (1, 10);
            INSERT INTO qiita.default_workflow_edge (
                parent_id, child_id)
            VALUES (7, 8);
            INSERT INTO qiita.default_workflow_edge_connections (
                default_workflow_edge_id, parent_output_id, child_input_id)
            VALUES (4, 1, 3)"""
        perform_as_transaction(sql)
        # -- adds a new path that should be kept together and then separate;
        #    this is to simulate what happens with MTX/WGS processing, one
        #    single QC step (together) and 2 separate profilers
        sql = """
            INSERT INTO qiita.default_parameter_set (
                command_id, parameter_set_name, parameter_set)
            VALUES (3, '100%',
                    ('{"reference":1,"sortmerna_e_value":1,'
                     || '"sortmerna_max_pos":'
                     || '10000,"similarity":1.0,"sortmerna_coverage":1.00,'
                     || '"threads":1}')::json);
            INSERT INTO qiita.default_workflow_node (
                default_workflow_id, default_parameter_set_id)
            VALUES (2, 17);
            INSERT INTO qiita.default_workflow_edge (
                parent_id, child_id)
            VALUES (3, 9);
            INSERT INTO qiita.default_workflow_edge_connections (
                default_workflow_edge_id, parent_output_id, child_input_id)
            VALUES (5, 1, 3)"""
        perform_as_transaction(sql)

        # adding new expected values
        exp = deepcopy(WORKFLOWS)
        obs = _retrive_workflows(False)
        exp[0]['nodes'].extend([
            ['params_7', 1, 'Split libraries FASTQ', 'Defaults with reverse '
             'complement mapping file barcodes', {
                'max_bad_run_length': '3',
                'min_per_read_length_fraction': '0.75',
                'sequence_max_n': '0', 'rev_comp_barcode': 'False',
                'rev_comp_mapping_barcodes': 'True', 'rev_comp': 'False',
                'phred_quality_threshold': '3', 'barcode_type': 'golay_12',
                'max_barcode_errors': '1.5', 'phred_offset': 'auto'}],
            ['output_params_7_demultiplexed | Demultiplexed', 1,
             'demultiplexed | Demultiplexed'],
            ['params_8', 3, 'Pick closed-reference OTUs', 'Defaults', {
                'reference': '1', 'sortmerna_e_value': '1',
                'sortmerna_max_pos': '10000', 'similarity': '0.97',
                'sortmerna_coverage': '0.97', 'threads': '1'}],
            ['output_params_8_OTU table | BIOM', 3, 'OTU table | BIOM']])
        exp[0]['edges'].extend([
            ['input_params_1_FASTQ | per_sample_FASTQ', 'params_7'],
            ['params_7', 'output_params_7_demultiplexed | Demultiplexed'],
            ['output_params_7_demultiplexed | Demultiplexed', 'params_8'],
            ['params_8', 'output_params_8_OTU table | BIOM']])
        exp[1]['nodes'].extend([
            ['params_9', 3, 'Pick closed-reference OTUs', '100%', {
                'reference': '1', 'sortmerna_e_value': '1',
                'sortmerna_max_pos': '10000', 'similarity': '1.0',
                'sortmerna_coverage': '1.0', 'threads': '1'}],
            ['output_params_9_OTU table | BIOM', 3, 'OTU table | BIOM']])
        exp[1]['edges'].extend([
            ['output_params_3_demultiplexed | Demultiplexed', 'params_9'],
            ['params_9', 'output_params_9_OTU table | BIOM']
        ])
        self.assertCountEqual(obs, exp)
Example #43
# October 30th, 2017
# A change introduced in July made all the parameters to be stored as strings
# The DB needs to be patched so all the artifacts follow this structure

from json import dumps

from qiita_db.sql_connection import TRN

with TRN:
    sql = """SELECT *
                FROM qiita.artifact
                    JOIN qiita.artifact_output_processing_job
                        USING (artifact_id)
                WHERE command_id IS NOT NULL"""
    TRN.add(sql)

    sql_update_artifact = """UPDATE qiita.artifact
                             SET command_parameters = %s
                             WHERE artifact_id = %s"""
    sql_update_job = """UPDATE qiita.processing_job
                        SET command_parameters = %s
                        WHERE processing_job_id = %s"""
    for ainfo in TRN.execute_fetchindex():
        ainfo = dict(ainfo)
        params = dumps(
            {k: str(v)
             for k, v in ainfo['command_parameters'].items()})
        TRN.add(sql_update_artifact, [params, ainfo['artifact_id']])
        TRN.add(sql_update_job, [params, ainfo['processing_job_id']])
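
A micro-example of what the stringification stores for each parameters blob:

from json import dumps

params = {'max_bad_run_length': 3, 'rev_comp': False}
print(dumps({k: str(v) for k, v in params.items()}))
# {"max_bad_run_length": "3", "rev_comp": "False"}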
Example #44
# helper function to calculate checksum and file size
def calculate(finfo):
    try:
        size = getsize(finfo['fullpath'])
    except (FileNotFoundError, PermissionError):
        return finfo, None, None

    checksum = compute_checksum(finfo['fullpath'])

    return finfo['filepath_id'], checksum, size


# get all filepaths and their filepath information; takes ~10 min
with TRN:
    TRN.add("SELECT filepath_id FROM qiita.filepath")
    files = []
    for fid in TRN.execute_fetchflatten():
        files.append(get_filepath_information(fid))

# just get the filepath ids that haven't been processed, the file format
# of this file is filepath_id[tab]checksum[tab]filesize
fpath = join(dirname(abspath(__file__)), '74.py.cache.tsv')
processed = []
if exists(fpath):
    with open(fpath, 'r') as f:
        processed = [
            int(line.split('\t')[0]) for line in f.read().split('\n')
            if line != ''
        ]
files_curr = [f for f in files if f['filepath_id'] not in processed]
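
With the pending list in files_curr, the calculate helper above can be fanned out across processes while appending to the cache file incrementally; a sketch (the Pool usage is an assumption about how one might run it, not necessarily what the original patch did):

from multiprocessing import Pool

with Pool(processes=4) as pool, open(fpath, 'a') as cache:
    for fid, checksum, size in pool.imap_unordered(calculate, files_curr):
        if checksum is None:
            continue  # file was missing or unreadable
        cache.write('%d\t%s\t%d\n' % (fid, checksum, size))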
Example #45
def create_non_rarefied_biom_artifact(analysis, biom_data, rarefied_table):
    """Creates the initial non-rarefied BIOM artifact of the analysis

    Parameters
    ----------
    analysis : dict
        Dictionary with the analysis information
    biom_data : dict
        Dictionary with the biom file information
    rarefied_table : biom.Table
        The rarefied BIOM table

    Returns
    -------
    int
        The id of the new artifact
    """
    # The non-rarefied biom artifact is the initial biom table of the
    # analysis. This table does not currently exist anywhere, so we need to
    # actually create the BIOM file. To create this BIOM file we need: (1)
    # the samples and artifacts they come from and (2) whether the samples
    # were renamed or not. (1) is in the database, but we need to infer (2)
    # from the existing rarefied BIOM table. Fun, fun...

    with TRN:
        # Get the samples included in the BIOM table grouped by artifact id
        # Note that the analysis contains a BIOM table per data type included
        # in it, and the table analysis_sample does not differentiate between
        # datatypes, so we need to check the data type in the artifact table
        sql = """SELECT artifact_id, array_agg(sample_id)
                 FROM qiita.analysis_sample
                    JOIN qiita.artifact USING (artifact_id)
                 WHERE analysis_id = %s AND data_type_id = %s
                 GROUP BY artifact_id"""
        TRN.add(sql, [analysis['analysis_id'], biom_data['data_type_id']])
        samples_by_artifact = TRN.execute_fetchindex()

        # Create an empty BIOM table to be the new master table
        new_table = Table([], [], [])
        ids_map = {}
        for a_id, samples in samples_by_artifact:
            # Get the filepath of the BIOM table from the artifact
            artifact = Artifact(a_id)
            biom_fp = None
            for _, fp, fp_type in artifact.filepaths:
                if fp_type == 'biom':
                    biom_fp = fp
            # Note that we are sure the biom table exists, so there is no
            # need to check if biom_fp is undefined
            biom_table = load_table(biom_fp)
            samples = set(samples).intersection(biom_table.ids())
            biom_table.filter(samples, axis='sample', inplace=True)
            # we need to check if the table has samples left before merging
            if biom_table.shape[0] != 0 and biom_table.shape[1] != 0:
                new_table = new_table.merge(biom_table)
                ids_map.update(
                    {sid: "%d.%s" % (a_id, sid)
                     for sid in biom_table.ids()})

        # Check if we need to rename the sample ids in the biom table
        new_table_ids = set(new_table.ids())
        if not new_table_ids.issuperset(rarefied_table.ids()):
            # We need to rename the sample ids
            new_table.update_ids(ids_map, 'sample', True, True)

        sql = """INSERT INTO qiita.artifact
                    (generated_timestamp, data_type_id, visibility_id,
                     artifact_type_id, submitted_to_vamps)
            VALUES (%s, %s, %s, %s, %s)
            RETURNING artifact_id"""
        # Magic number 4 -> visibility sandbox
        # Magic number 7 -> biom artifact type
        TRN.add(
            sql,
            [analysis['timestamp'], biom_data['data_type_id'], 4, 7, False])
        artifact_id = TRN.execute_fetchlast()

        # Associate the artifact with the analysis
        sql = """INSERT INTO qiita.analysis_artifact
                    (analysis_id, artifact_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [analysis['analysis_id'], artifact_id])
        # Link the artifact with its file
        dd_id, mp = get_mountpoint('BIOM')[0]
        dir_fp = join(get_db_files_base_dir(), mp, str(artifact_id))
        if not exists(dir_fp):
            makedirs(dir_fp)
        new_table_fp = join(dir_fp, "biom_table.biom")
        with biom_open(new_table_fp, 'w') as f:
            new_table.to_hdf5(f, "Generated by Qiita")

        sql = """INSERT INTO qiita.filepath
                    (filepath, filepath_type_id, checksum,
                     checksum_algorithm_id, data_directory_id)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING filepath_id"""
        # Magic number 7 -> filepath_type_id = 'biom'
        # Magic number 1 -> the checksum algorithm id
        TRN.add(sql, [
            basename(new_table_fp), 7,
            compute_checksum(new_table_fp), 1, dd_id
        ])
        fp_id = TRN.execute_fetchlast()
        sql = """INSERT INTO qiita.artifact_filepath
                    (artifact_id, filepath_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [artifact_id, fp_id])
        TRN.execute()

    return artifact_id
Example #46
import pandas as pd
from os.path import join, dirname, abspath, exists
from qiita_db.sql_connection import TRN

with TRN:
    sql = """SELECT filepath_id
             FROM qiita.filepath"""
    TRN.add(sql)
    fids = TRN.execute_fetchflatten()

fpath = join(dirname(abspath(__file__)), 'support_files', 'patches',
             'python_patches', '74.py.cache.tsv')
if not exists(fpath):
    raise ValueError("%s doesn't exits, have you run step 1?" % fpath)
df = pd.read_csv(fpath,
                 sep='\t',
                 index_col=0,
                 dtype=str,
                 names=['filepath_id', 'checksum', 'fp_size'])
cache = df.to_dict('index')

args = []
for fid in fids:
    if fid not in cache:
        print('missing: %d' % fid)
    else:
        args.append([cache[fid]['fp_size'], cache[fid]['checksum'], fid])

with TRN:
    sql = """UPDATE qiita.filepath
            SET fp_size = %s, checksum = %s
Example #47
# -----------------------------------------------------------------------------
# Copyright (c) 2014--, The Qiita Development Team.
#
# Distributed under the terms of the BSD 3-clause License.
#
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------

from random import SystemRandom
from string import ascii_letters, digits

from qiita_db.sql_connection import TRN

pool = ascii_letters + digits
client_id = ''.join([SystemRandom().choice(pool) for _ in range(50)])
client_secret = ''.join([SystemRandom().choice(pool) for _ in range(255)])

with TRN:
    sql = """INSERT INTO qiita.oauth_identifiers (client_id, client_secret)
             VALUES (%s, %s)"""
    TRN.add(sql, [client_id, client_secret])

    sql = """INSERT INTO qiita.oauth_software (software_id, client_id)
             VALUES (%s, %s)"""
    TRN.add(sql, [1, client_id])
    TRN.execute()
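
SystemRandom draws from the OS CSPRNG, so the credentials above are cryptographically sound; on Python 3.6+ the same result reads more directly with the stdlib secrets module (an equivalent sketch, not what the patch used):

from string import ascii_letters, digits
import secrets

pool = ascii_letters + digits
client_id = ''.join(secrets.choice(pool) for _ in range(50))
client_secret = ''.join(secrets.choice(pool) for _ in range(255))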
Example #48
# change in the near future, we feel that the easiest way to transfer
# the current analyses results is by creating 3 different types of
# artifacts: (1) distance matrix -> which will include the distance matrix,
# the principal coordinates and the emperor plots; (2) rarefaction
# curves -> which will include all the files generated by alpha rarefaction
# and (3) taxonomy summary, which will include all the files generated
# by summarize_taxa_through_plots.py

with TRN:
    # Add the new artifact types
    sql = """INSERT INTO qiita.artifact_type (
                artifact_type, description, can_be_submitted_to_ebi,
                can_be_submitted_to_vamps)
             VALUES (%s, %s, %s, %s)
             RETURNING artifact_type_id"""
    TRN.add(sql,
            ['beta_div_plots', 'Qiime 1 beta diversity results', False, False])
    dm_atype_id = TRN.execute_fetchlast()
    TRN.add(sql, ['rarefaction_curves', 'Rarefaction curves', False, False])
    rc_atype_id = TRN.execute_fetchlast()
    TRN.add(sql, ['taxa_summary', 'Taxa summary plots', False, False])
    ts_atype_id = TRN.execute_fetchlast()

    # Associate each artifact with the filetypes that it accepts
    # At this time we are going to add them as directories, just as it is done
    # right now. We can make it fancier with the new type system.
    # Magic number 8: the filepath_type_id for the directory
    sql = """INSERT INTO qiita.artifact_type_filepath_type
                (artifact_type_id, filepath_type_id, required)
             VALUES (%s, %s, %s)"""
    sql_args = [[dm_atype_id, 8, True], [rc_atype_id, 8, True],
                [ts_atype_id, 8, True]]
Example #49
                           get_mountpoint_path_by_id)

pool = ascii_letters + digits
tgz_id = convert_to_id("tgz", "filepath_type")
_id, analysis_mp = get_mountpoint('analysis')[0]
with TRN:
    # 2 and 3 are the ids of the 2 new software rows, the BIOM and
    # target gene type plugins
    for i in [2, 3]:
        client_id = ''.join([SystemRandom().choice(pool) for _ in range(50)])
        client_secret = ''.join(
            [SystemRandom().choice(pool) for _ in range(255)])

        sql = """INSERT INTO qiita.oauth_identifiers (client_id, client_secret)
                VALUES (%s, %s)"""
        TRN.add(sql, [client_id, client_secret])

        sql = """INSERT INTO qiita.oauth_software (software_id, client_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [i, client_id])
        TRN.execute()

    #
    # Generating compressed files for picking failures -- artifact_type = BIOM
    #
    sql = """SELECT artifact_id FROM qiita.artifact
                JOIN qiita.artifact_type USING (artifact_type_id)
                WHERE artifact_type = 'BIOM'"""
    TRN.add(sql)

    for r in TRN.execute_fetchindex():
Example #50
def create_command(software, name, description, parameters, outputs=None,
                   analysis_only=False):
    r"""Replicates the Command.create code at the time the patch was written"""
    # Perform some sanity checks in the parameters dictionary
    if not parameters:
        raise QiitaDBError(
            "Error creating command %s. At least one parameter should "
            "be provided." % name)
    sql_param_values = []
    sql_artifact_params = []
    for pname, vals in parameters.items():
        if len(vals) != 2:
            raise QiitaDBError(
                "Malformed parameters dictionary, the format should be "
                "{param_name: [parameter_type, default]}. Found: "
                "%s for parameter name %s" % (vals, pname))

        ptype, dflt = vals
        # Check that the type is one of the supported types
        supported_types = ['string', 'integer', 'float', 'reference',
                           'boolean', 'prep_template', 'analysis']
        if ptype not in supported_types and not ptype.startswith(
                ('choice', 'mchoice', 'artifact')):
            supported_types.extend(['choice', 'mchoice', 'artifact'])
            raise QiitaDBError(
                "Unsupported parameters type '%s' for parameter %s. "
                "Supported types are: %s"
                % (ptype, pname, ', '.join(supported_types)))

        if ptype.startswith(('choice', 'mchoice')) and dflt is not None:
            choices = set(loads(ptype.split(':')[1]))
            dflt_val = dflt
            if ptype.startswith('choice'):
                # In the choice case, the dflt value is a single string;
                # wrap it in a list so we can use the issuperset call below
                dflt_val = [dflt_val]
            else:
                # jsonize the list to store it in the DB
                dflt = dumps(dflt)
            if not choices.issuperset(dflt_val):
                raise QiitaDBError(
                    "The default value '%s' for the parameter %s is not "
                    "listed in the available choices: %s"
                    % (dflt, pname, ', '.join(choices)))

        if ptype.startswith('artifact'):
            atypes = loads(ptype.split(':')[1])
            sql_artifact_params.append(
                [pname, 'artifact', atypes])
        else:
            if dflt is not None:
                sql_param_values.append([pname, ptype, False, dflt])
            else:
                sql_param_values.append([pname, ptype, True, None])

    with TRN:
        sql = """SELECT EXISTS(SELECT *
                               FROM qiita.software_command
                               WHERE software_id = %s AND name = %s)"""
        TRN.add(sql, [software.id, name])
        if TRN.execute_fetchlast():
            raise QiitaDBDuplicateError(
                "command", "software: %d, name: %s"
                           % (software.id, name))
        # Add the command to the DB
        sql = """INSERT INTO qiita.software_command
                        (name, software_id, description, is_analysis)
                 VALUES (%s, %s, %s, %s)
                 RETURNING command_id"""
        sql_params = [name, software.id, description, analysis_only]
        TRN.add(sql, sql_params)
        c_id = TRN.execute_fetchlast()

        # Add the parameters to the DB
        sql = """INSERT INTO qiita.command_parameter
                    (command_id, parameter_name, parameter_type, required,
                     default_value)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING command_parameter_id"""
        sql_params = [[c_id, pname, p_type, reqd, default]
                      for pname, p_type, reqd, default in sql_param_values]
        TRN.add(sql, sql_params, many=True)
        TRN.execute()

        # Add the artifact parameters
        sql_type = """INSERT INTO qiita.parameter_artifact_type
                        (command_parameter_id, artifact_type_id)
                      VALUES (%s, %s)"""
        supported_types = []
        for pname, p_type, atypes in sql_artifact_params:
            # reuse the command_parameter INSERT from above to register the
            # artifact parameter itself and retrieve its id
            sql_params = [c_id, pname, p_type, True, None]
            TRN.add(sql, sql_params)
            pid = TRN.execute_fetchlast()
            sql_params = [[pid, convert_to_id(at, 'artifact_type')]
                          for at in atypes]
            TRN.add(sql_type, sql_params, many=True)
            supported_types.extend([atid for _, atid in sql_params])

        # If the software type is 'artifact definition', there are a couple
        # of extra steps
        if software.type == 'artifact definition':
            # If supported types is not empty, link the software with these
            # types
            if supported_types:
                sql = """INSERT INTO qiita.software_artifact_type
                                (software_id, artifact_type_id)
                            VALUES (%s, %s)"""
                sql_params = [[software.id, atid]
                              for atid in supported_types]
                TRN.add(sql, sql_params, many=True)
            # If this is the validate command, we need to add the
            # provenance and name parameters. These are used internally,
            # that's why we are adding them here
            if name == 'Validate':
                sql = """INSERT INTO qiita.command_parameter
                            (command_id, parameter_name, parameter_type,
                             required, default_value)
                         VALUES (%s, 'name', 'string', 'False',
                                 'dflt_name'),
                                (%s, 'provenance', 'string', 'False', NULL)
                         """
                TRN.add(sql, [c_id, c_id])

        # Add the outputs to the command
        if outputs:
            sql = """INSERT INTO qiita.command_output
                        (name, command_id, artifact_type_id)
                     VALUES (%s, %s, %s)"""
            sql_args = [[pname, c_id, convert_to_id(at, 'artifact_type')]
                        for pname, at in outputs.items()]
            TRN.add(sql, sql_args, many=True)
            TRN.execute()

    return Command(c_id)
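
# For illustration only: a hypothetical call to the helper above. The
# software version, command name and parameters are invented, not taken
# from the patch; 'taxa_summary' is the artifact type created earlier.
#
#   cmd = create_command(
#       Software.from_name_and_version('QIIME', '1.9.1'),
#       'Summarize Taxa', 'Plots taxonomy summaries at multiple levels',
#       {'metadata_category': ['string', ''],
#        'sort': ['boolean', 'False'],
#        'biom_table': ['artifact:["BIOM"]', None]},
#       outputs={'taxa_summary': 'taxa_summary'})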
Exemple #51
0
from os.path import basename

from qiita_db.sql_connection import TRN
from qiita_db.study import Study

for study in Study.iter():
    for pt in study.prep_templates():
        filepaths = pt.get_filepaths()
        if filepaths:
            # filepaths are returned in order so we can take the
            # oldest and newest; then we get the filename and parse the
            # creation time. Note that the filename comes in one of these
            # formats: 1_prep_1_qiime_19700101-000000.txt or
            # 1_prep_1_19700101-000000.txt
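            # e.g. '1_prep_1_qiime_19700101-000000.txt'[-19:-4] yields
            # '19700101-000000', which becomes '19700101 000000'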
            oldest = basename(filepaths[-1][1])[-19:-4].replace('-', ' ')
            newest = basename(filepaths[0][1])[-19:-4].replace('-', ' ')

            with TRN:
                sql = """UPDATE qiita.prep_template
                         SET creation_timestamp = %s,
                             modification_timestamp = %s
                         WHERE prep_template_id = %s"""
                TRN.add(sql, [oldest, newest, pt.id])
                TRN.execute()
Exemple #52
0
                        for vv in v:
                            to_merge[vv] = k
                merge_fn = (lambda id_, x: to_merge[id_]
                            if id_ in to_merge else id_)
                t = t.collapse(merge_fn, norm=False, min_group_size=1,
                               axis='observation', collapse_f=collapse_f)
            else:
                ids_to_replace = {c: c.upper() for c in current
                                  if c != c.upper()}

            t.update_ids(ids_to_replace, axis='observation', strict=False,
                         inplace=True)

            with biom_open(biom, 'w') as f:
                t.to_hdf5(f, t.generated_by)
            checksum = compute_checksum(biom)

            TRN.add(sql, [checksum, ftps['biom'][0]])

            fna = ftps['preprocessed_fasta'][1]
            tmp = fna + '.tmp'
            with open(tmp, 'w') as out:
                for seq in t.ids('observation'):
                    out.write('>%s\n%s\n' % (seq, seq))
            rename(tmp, fna)
            checksum = compute_checksum(fna)

            TRN.add(sql, [checksum, ftps['preprocessed_fasta'][0]])

            TRN.execute()
Exemple #53
0
                updated = [x.upper() for x in current]
                if len(set(updated)) != len(updated):
                    print('************>', a.id, fp, '<**************')
                if set(current) ^ set(updated):
                    print('Changing biom: ', a.id, fp)
                    t.update_ids({i: i.upper() for i in t.ids('observation')},
                                 axis='observation', inplace=True)
                    with biom_open(fp, 'w') as f:
                        t.to_hdf5(f, t.generated_by)
                    checksum = compute_checksum(fp)
            elif fpt == 'preprocessed_fasta':
                changed = False
                tmp = fp + '.tmp'
                with open(tmp, 'w') as out:
                    for seq in read(fp, format='fasta'):
                        seq = str(seq)
                        sequ = seq.upper()
                        out.write('>%s\n%s\n' % (sequ, sequ))
                        if seq != sequ:
                            changed = True
                if changed:
                    print('Changing fasta: ', a.id, fp)
                    rename(tmp, fp)
                    checksum = compute_checksum(fp)
                else:
                    remove(tmp)

            if checksum is not None:
                TRN.add(sql, [checksum, _id])
                TRN.execute()
Exemple #54
0
import pandas as pd
from os.path import getsize, join, dirname, abspath, exists
from qiita_db.util import get_filepath_information, compute_checksum
from qiita_db.sql_connection import TRN

with TRN:
    sql = """SELECT filepath_id
             FROM qiita.filepath"""
    TRN.add(sql)
    fids = TRN.execute_fetchflatten()

fpath = join(dirname(abspath(__file__)), 'support_files', 'patches',
             'python_patches', '74.py.cache.tsv')
cache = dict()
if exists(fpath):
    # the cache, when present, is a headerless TSV with columns
    # filepath_id, checksum and fp_size
    df = pd.read_csv(fpath,
                     sep='\t',
                     index_col=0,
                     dtype=str,
                     names=['filepath_id', 'checksum', 'fp_size'])
    # the filepath ids coming from the DB are integers, while dtype=str
    # loads the index as strings; cast it so the lookups below can hit
    df.index = df.index.astype(int)
    cache = df.to_dict('index')

for fid in fids:
    if fid not in cache:
        finfo = get_filepath_information(fid)
        try:
            size = getsize(finfo['fullpath'])
        except FileNotFoundError:
            size = 0

        try:
Exemple #55
0
if cols_sample:
    with TRN:
        # a few notes: just getting the sample tables with duplicated
        # values; ignoring column 'sample_id' and tables 'study_sample',
        # 'prep_template' and 'prep_template_sample'
        sql = """SELECT table_name, array_agg(column_name::text)
                    FROM information_schema.columns
                    WHERE column_name IN %s
                        AND table_name LIKE 'sample_%%'
                        AND table_name NOT IN (
                            'prep_template', 'prep_template_sample')
                    GROUP BY table_name"""
        # note that we are looking for those columns with duplicated names in
        # the headers
        TRN.add(sql, [tuple(set(cols_sample))])
        for table, columns in dict(TRN.execute_fetchindex()).items():
            # the table name format is sample_<study_id>, so take the
            # numeric part to instantiate the template
            st = SampleTemplate(int(table.split('_')[1]))
            # getting just the columns of interest
            st_df = st.to_dataframe()[columns]
            # converting to datetime
            for col in columns:
                st_df[col] = st_df[col].apply(transform_date)
            st.update(st_df)

if cols_prep:
    with TRN:
        # a few notes: just getting the preps with duplicated values; ignoring
        # column 'sample_id' and tables 'study_sample', 'prep_template',
        # 'prep_template_sample'
Exemple #56
0
with TRN:
    # a few notes: just getting the preps with duplicated values; ignoring
    # column 'sample_id' and tables 'study_sample', 'prep_template',
    # 'prep_template_sample'
    sql = """SELECT table_name, array_agg(column_name::text)
                FROM information_schema.columns
                WHERE column_name IN %s
                    AND column_name != 'sample_id'
                    AND table_name LIKE 'prep_%%'
                    AND table_name NOT IN (
                        'prep_template', 'prep_template_sample')
                GROUP BY table_name"""
    # note that we are looking for those columns with duplicated names in
    # the headers
    TRN.add(sql, [
        tuple(
            set(PrepTemplate.metadata_headers())
            & set(SampleTemplate.metadata_headers()))
    ])
    overlapping = dict(TRN.execute_fetchindex())
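    # overlapping maps each dynamic prep table to its clashing columns,
    # e.g. {'prep_1': ['collection_timestamp']} (values illustrative)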

# finding actual duplicates
for table_name, cols in viewitems(overlapping):
    # leaving the print so that, when we patch the main system, we know
    # whether anything was renamed and can deal with it
    print(table_name)
    with TRN:
        for c in cols:
            sql = 'ALTER TABLE qiita.%s RENAME COLUMN %s TO %s_renamed' % (
                table_name, c, c)
            TRN.add(sql)
        TRN.execute()
Exemple #57
0
# Nov 28, 2017 (only in py file)
# Adding a new command into Qiita/Alpha: delete_analysis

from qiita_db.software import Software, Command
from qiita_db.sql_connection import TRN

# Create the delete_analysis command
Command.create(Software.from_name_and_version('Qiita',
                                              'alpha'), 'delete_analysis',
               'Deletes a full analysis', {'analysis_id': ['integer', None]})

# Make sure that all validate commands have the "analysis" parameter
with TRN:
    # Get all validate commands that are missing the analysis parameter
    sql = """SELECT command_id
             FROM qiita.software_command sc
             WHERE name = 'Validate' AND NOT (
                SELECT EXISTS(SELECT *
                              FROM qiita.command_parameter
                              WHERE parameter_name = 'analysis'
                                AND command_id = sc.command_id));"""
    TRN.add(sql)
    sql = """INSERT INTO qiita.command_parameter
                (command_id, parameter_name, parameter_type,
                 required, default_value, name_order, check_biom_merge)
             VALUES (6, 'analysis', 'analysis', false, NULL, NULL, false)"""
    sql_params = [[cmd_id, 'analysis', 'analysis', False, None, None, False]
                  for cmd_id in TRN.execute_fetchflatten()]
    TRN.add(sql, sql_params, many=True)
    TRN.execute()
Exemple #58
0
def transfer_job(analysis, command_id, params, input_artifact_id, job_data,
                 cmd_out_id, biom_data, output_artifact_type_id):
    """Transfers the job from the old structure to the plugin structure

    Parameters
    ----------
    analysis : dict
        The analysis information
    command_id : int
        The id of the command executed
    params : str
        The parameters used in the job
    input_artifact_id : int
        The id of the input artifact
    job_data : dict
        The job information
    cmd_out_id : int
        The id of the command's output
    biom_data : dict
        The biom information
    output_artifact_type_id : int
        The type of the output artifact
    """
    with TRN:
        # Create the job
        # Add the row in the processing job table
        # Magic number 3: status -> success
        sql = """INSERT INTO qiita.processing_job
                    (email, command_id, command_parameters,
                     processing_job_status_id)
                 VALUES (%s, %s, %s, %s)
                 RETURNING processing_job_id"""
        TRN.add(sql, [analysis['email'], command_id, params, 3])
        job_id = TRN.execute_fetchlast()

        # Link the job with the input artifact
        sql = """INSERT INTO qiita.artifact_processing_job
                    (artifact_id, processing_job_id)
                 VALUES (rarefied_biom_id, proc_job_id)"""
        TRN.add(sql, [input_artifact_id, job_id])

        # Check if the executed job has results and add them
        sql = """SELECT EXISTS(SELECT *
                               FROM qiita.job_results_filepath
                               WHERE job_id = %s)"""
        TRN.add(sql, [job_data['job_id']])
        if TRN.execute_fetchlast():
            # There are results for the current job.
            # Transfer the job files to a new artifact
            sql = """SELECT filepath_id
                     FROM qiita.job_results_filepath
                     WHERE job_id = %s"""
            TRN.add(sql, [job_data['job_id']])
            filepath_id = TRN.execute_fetchlast()
            artifact_id = transfer_file_to_artifact(
                analysis['analysis_id'], analysis['timestamp'], command_id,
                biom_data['data_type_id'], params, output_artifact_type_id,
                filepath_id)

            # Link the artifact with its parent
            sql = """INSERT INTO qiita.parent_artifact (artifact_id, parent_id)
                     VALUES (%s, %s)"""
            TRN.add(sql, [artifact_id, input_artifact_id])
            # Link the artifact as the job output
            sql = """INSERT INTO qiita.artifact_output_processing_job
                        (artifact_id, processing_job_id, command_output_id)
                     VALUES (%s, %s, %s)"""
            TRN.add(sql, [artifact_id, job_id, cmd_out_id])
            TRN.execute()
        else:
            # There are no results on the current job, so mark it as
            # error
            if job_data['log_id'] is None:
                # Magic number 2 - we are not using any other severity
                # level, so keep using number 2
                sql = """INSERT INTO qiita.logging (time, severity_id, msg)
                    VALUES (%s, %s, %s)
                    RETURNING logging_id"""
                TRN.add(sql,
                        [analysis['timestamp'], 2, "Unknown error - patch 47"])
                log_id = TRN.execute_fetchlast()
            else:
                log_id = job_data['log_id']

            # Magic number 4 -> status -> error
            sql = """UPDATE qiita.processing_job
                SET processing_job_status_id = 4, logging_id = %s
                WHERE processing_job_id = %s"""
            TRN.add(sql, [log_id, job_id])
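
# A hypothetical invocation, for illustration only; every id and dict
# below is invented (the real values come from the old analysis tables):
#
#   transfer_job(
#       analysis={'email': 'demo@example.com', 'analysis_id': 1,
#                 'timestamp': '2017-01-01 00:00:00'},
#       command_id=9, params='{"depth": 1000}', input_artifact_id=4,
#       job_data={'job_id': 7, 'log_id': None},
#       cmd_out_id=3, biom_data={'data_type_id': 2},
#       output_artifact_type_id=8)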
Exemple #59
0
from qiita_db.sql_connection import TRN


# Due to the size of these changes we will apply them table by table,
# each inside its own transaction
with TRN:
    # select the names of all dynamic sample and prep tables
    sql = """SELECT DISTINCT table_name FROM information_schema.columns
                WHERE (table_name LIKE 'sample_%'
                       OR table_name LIKE 'prep_%')
                    AND table_name NOT LIKE '%template%'"""
    TRN.add(sql)

    all_tables = TRN.execute_fetchflatten()

for table in all_tables:
    with TRN:
        sql = """SELECT column_name FROM information_schema.columns
                    WHERE table_name = %s
                    ORDER BY column_name"""
        TRN.add(sql, [table])

        for column in TRN.execute_fetchflatten():
            sql = "ALTER TABLE qiita.%s ALTER COLUMN %s TYPE VARCHAR" % (
                table, column)
            TRN.add(sql)

        TRN.execute()
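
# As a worked example (table and column names are illustrative), the loop
# above would emit statements such as:
#   ALTER TABLE qiita.sample_1 ALTER COLUMN season_environment TYPE VARCHAR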
Exemple #60
0
def postgres_test(**kwargs):
    """Open a connection and query postgres"""
    from qiita_db.sql_connection import TRN
    with TRN:
        TRN.add("SELECT 42")
        return TRN.execute_fetchflatten()[0]
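
# Sanity check (assumes a configured Qiita DB connection); the helper
# simply returns the selected literal:
assert postgres_test() == 42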