Example #1
    def test_execute_fetchindex(self):
        with TRN:
            sql = """INSERT INTO qiita.test_table (str_column, int_column)
                     VALUES (%s, %s) RETURNING str_column, int_column"""
            args = [["insert1", 1], ["insert2", 2], ["insert3", 3]]
            TRN.add(sql, args, many=True)
            self.assertEqual(TRN.execute_fetchindex(), [["insert3", 3]])

            sql = """INSERT INTO qiita.test_table (str_column, int_column)
                     VALUES (%s, %s) RETURNING str_column, int_column"""
            args = [["insert4", 4], ["insert5", 5], ["insert6", 6]]
            TRN.add(sql, args, many=True)
            self.assertEqual(TRN.execute_fetchindex(3), [["insert4", 4]])
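
As the assertions above suggest, execute_fetchindex() executes any pending
commands and returns the rows of just one of them: by default the last one
queued, otherwise the one at the given zero-based index (indexes accumulate
across the whole transaction, which is why index 3 above maps to "insert4").
A minimal standalone sketch, assuming the qiita_db TRN API exercised by this
test:

    with TRN:
        sql = """INSERT INTO qiita.test_table (int_column)
                 VALUES (%s) RETURNING int_column"""
        TRN.add(sql, [[1], [2], [3]], many=True)  # queued at indexes 0, 1, 2
        rows = TRN.execute_fetchindex(1)          # rows of the second command
        # rows == [[2]]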
Example #2
    def update(self, md_template):
        r"""Update values in the template

        Parameters
        ----------
        md_template : DataFrame
            The metadata template file contents indexed by sample ids

        Raises
        ------
        QiitaDBError
            If md_template and db do not have the same sample ids
            If md_template and db do not have the same column headers
            If self.can_be_updated is not True
        """
        with TRN:
            # Clean and validate the metadata template given
            new_map = self._clean_validate_template(
                md_template, self.study_id, self.columns_restrictions)
            # Retrieving current metadata
            sql = "SELECT * FROM qiita.{0}".format(self._table_name(self.id))
            TRN.add(sql)
            current_map = self._transform_to_dict(TRN.execute_fetchindex())
            current_map = pd.DataFrame.from_dict(current_map, orient="index")

            # simple validations of sample ids and column names
            samples_diff = set(new_map.index).difference(current_map.index)
            if samples_diff:
                raise QiitaDBError(
                    "The new template differs from what is stored "
                    "in database by these samples names: %s" % ", ".join(samples_diff)
                )
            columns_diff = set(new_map.columns).difference(current_map.columns)
            if columns_diff:
                raise QiitaDBError(
                    "The new template differs from what is stored "
                    "in database by these columns names: %s" % ", ".join(columns_diff)
                )

            # here we are comparing two dataframes following:
            # http://stackoverflow.com/a/17095620/4228285
            current_map.sort_index(axis=0, inplace=True)
            current_map.sort_index(axis=1, inplace=True)
            new_map.sort_index(axis=0, inplace=True)
            new_map.sort_index(axis=1, inplace=True)
            map_diff = (current_map != new_map).stack()
            map_diff = map_diff[map_diff]
            map_diff.index.names = ["id", "column"]
            changed_cols = map_diff.index.get_level_values("column").unique()

            if not self.can_be_updated(columns=set(changed_cols)):
                raise QiitaDBError(
                    "The new template is modifying fields that cannot be "
                    "modified. Try removing the target gene fields or "
                    "deleting the processed data. You are trying to modify: %s" % ", ".join(changed_cols)
                )

            for col in changed_cols:
                self.update_category(col, new_map[col].to_dict())

            self.generate_files()
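
The dataframe diff above, shown as a standalone sketch with current pandas
(sort_index stands in for the long-removed DataFrame.sort used by the linked
answer):

    import pandas as pd

    current = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]}, index=["s1", "s2"])
    new = pd.DataFrame({"a": [1, 9], "b": ["x", "y"]}, index=["s1", "s2"])
    for df in (current, new):
        df.sort_index(axis=0, inplace=True)
        df.sort_index(axis=1, inplace=True)
    diff = (current != new).stack()  # boolean Series indexed by (id, column)
    diff = diff[diff]                # keep only the cells that changed
    print(diff.index.get_level_values(1).unique())  # contains only 'a'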
Example #3
    def status(self):
        """The status of the prep template

        Returns
        -------
        str
            The status of the prep template

        Notes
        -----
        The status of a prep template is inferred from the status of the
        processed data generated from it. If no processed data has been
        generated with this prep template, the status is 'sandbox'.
        """
        with TRN:
            sql = """SELECT processed_data_status
                    FROM qiita.processed_data_status pds
                      JOIN qiita.processed_data pd
                        USING (processed_data_status_id)
                      JOIN qiita.preprocessed_processed_data ppd_pd
                        USING (processed_data_id)
                      JOIN qiita.prep_template_preprocessed_data pt_ppd
                        USING (preprocessed_data_id)
                    WHERE pt_ppd.prep_template_id=%s"""
            TRN.add(sql, [self._id])

            return infer_status(TRN.execute_fetchindex())
Example #4
def _generate_study_list_for_api(visibility, only_biom=True):
    """Get general study information

    Parameters
    ----------
    visibility : string
        The visibility to get studies

    Returns
    -------
    list of dict
        The list of studies and their information
    """
    artifact_type = ''
    if only_biom:
        artifact_type = "AND artifact_type = 'BIOM'"

    sql = f"""
        SELECT study_id, array_agg(DISTINCT artifact_id) FROM qiita.study
            INNER JOIN qiita.study_artifact USING (study_id)
            INNER JOIN qiita.artifact USING (artifact_id)
            INNER JOIN qiita.artifact_type USING (artifact_type_id)
            INNER JOIN qiita.visibility USING (visibility_id)
        WHERE visibility = %s
        {artifact_type}
        GROUP BY study_id
    """
    with TRN:
        TRN.add(sql, [visibility])
        return dict(TRN.execute_fetchindex())
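
Since each fetched row here is a (study_id, array_agg) pair, dict() on the
results yields the study-to-artifacts map directly; a toy illustration:

    rows = [(1, [10, 11]), (2, [12])]   # (study_id, [artifact_ids])
    print(dict(rows))                   # {1: [10, 11], 2: [12]}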
Example #5
    def _update_accession_numbers(self, column, values):
        """Update accession numbers stored in `column` with the ones in `values`

        Parameters
        ----------
        column : str
            The column name where the accession numbers are stored
        values : dict of {str: str}
            The accession numbers keyed by sample id

        Raises
        ------
        QiitaDBError
            If a sample in `values` already has an accession number
        QiitaDBWarning
            If `values` is not updating any accession number
        """
        with TRN:
            sql = """SELECT sample_id, {0}
                     FROM qiita.{1}
                     WHERE {2}=%s
                        AND {0} IS NOT NULL""".format(column, self._table,
                                                      self._id_column)
            TRN.add(sql, [self.id])
            db_vals = {sample_id: accession
                       for sample_id, accession in TRN.execute_fetchindex()}
            common_samples = set(db_vals) & set(values)
            diff = [sample for sample in common_samples
                    if db_vals[sample] != values[sample]]
            if diff:
                raise QiitaDBError(
                    "The following samples already have an accession number: "
                    "%s" % ', '.join(diff))

            # Remove the common samples from the values dictionary
            values = deepcopy(values)
            for sample in common_samples:
                del values[sample]

            if values:
                sql_vals = ', '.join(["(%s, %s)"] * len(values))
                sql = """UPDATE qiita.{0} AS t
                         SET {1}=c.{1}
                         FROM (VALUES {2}) AS c(sample_id, {1})
                         WHERE c.sample_id = t.sample_id
                            AND t.{3} = %s
                         """.format(self._table, column, sql_vals,
                                    self._id_column)
                sql_vals = list(chain.from_iterable(values.items()))
                sql_vals.append(self.id)
                TRN.add(sql, sql_vals)
                TRN.execute()
            else:
                warnings.warn("No new accession numbers to update",
                              QiitaDBWarning)
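
The UPDATE ... FROM (VALUES ...) pattern above needs its parameters flattened
into a single list, the (sample_id, accession) pairs first and the template id
last; a minimal sketch with a made-up values dict and template id:

    from itertools import chain

    values = {"1.S1": "ERS01", "1.S2": "ERS02"}      # made-up accessions
    sql_vals = ', '.join(["(%s, %s)"] * len(values))
    print(sql_vals)                                  # (%s, %s), (%s, %s)
    sql_args = list(chain.from_iterable(values.items()))
    sql_args.append(1)                               # made-up template id
    print(sql_args)           # ['1.S1', 'ERS01', '1.S2', 'ERS02', 1]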
Example #6
    def raw_data(self):
        with TRN:
            sql = """SELECT raw_data_id FROM qiita.prep_template
                     WHERE prep_template_id=%s"""
            TRN.add(sql, [self.id])
            result = TRN.execute_fetchindex()
            if result:
                # If there is any result, it will be in the first row
                # and in the first element of that row, thus [0][0]
                return result[0][0]
            return None
Example #7
    def __call__(self, searchstr, user):
        """Runs a Study query and returns matching studies and samples

        Parameters
        ----------
        searchstr : str
            Search string to use
        user : User object
            User making the search. Needed for permissions checks.

        Returns
        -------
        dict
            Found samples in format
            {study_id: [[samp_id1, meta1, meta2, ...],
                        [samp_id2, meta1, meta2, ...], ...]}
        list
            metadata column names searched for

        Notes
        -----
        Metadata information for each sample is in the same order as the
        metadata columns list returned

        Metadata column names and string searches are case-sensitive
        """
        with TRN:
            study_sql, sample_sql, meta_headers = \
                self._parse_study_search_string(searchstr, True)

            # get all studies containing the metadata headers requested
            TRN.add(study_sql)
            study_ids = set(TRN.execute_fetchflatten())
            # strip to only studies user has access to
            if user.level not in {'admin', 'dev', 'superuser'}:
                study_ids = study_ids.intersection(
                    Study.get_by_status('public') | user.user_studies |
                    user.shared_studies)

            results = {}
            # run search on each study to get out the matching samples
            for sid in study_ids:
                TRN.add(sample_sql.format(sid))
                study_res = TRN.execute_fetchindex()
                if study_res:
                    # only add study to results if actually has samples
                    # in results
                    results[sid] = study_res
            self.results = results
            self.meta_headers = meta_headers
            return results, meta_headers
Example #8
    def _to_dict(self):
        r"""Returns the categories and their values in a dictionary

        Returns
        -------
        dict of {str: str}
            A dictionary of the form {category: value}
        """
        with TRN:
            sql = "SELECT * FROM qiita.{0} WHERE sample_id=%s".format(self._dynamic_table)
            TRN.add(sql, [self._id])
            d = dict(TRN.execute_fetchindex()[0])

            # Remove the sample_id; it is not part of the metadata
            del d["sample_id"]

            return d
Example #9
    def to_dataframe(self):
        """Returns the metadata template as a dataframe

        Returns
        -------
        pandas DataFrame
            The metadata in the template, indexed on sample id
        """
        with TRN:
            cols = sorted(get_table_cols(self._table_name(self._id)))
            # Get all metadata for the template
            sql = "SELECT {0} FROM qiita.{1}".format(", ".join(cols), self._table_name(self.id))
            TRN.add(sql, [self._id])
            meta = TRN.execute_fetchindex()

            # Create the dataframe and clean it up a bit
            df = pd.DataFrame((list(x) for x in meta), columns=cols)
            df.set_index("sample_id", inplace=True, drop=True)

            return df
Example #10
    def _get_accession_numbers(self, column):
        """Return the accession numbers stored in `column`

        Parameters
        ----------
        column : str
            The column name where the accession number is stored

        Returns
        -------
        dict of {str: str}
            The accession numbers keyed by sample id
        """
        with TRN:
            sql = """SELECT sample_id, {0}
                     FROM qiita.{1}
                     WHERE {2}=%s""".format(column, self._table,
                                            self._id_column)
            TRN.add(sql, [self.id])
            result = {sample_id: accession
                      for sample_id, accession in TRN.execute_fetchindex()}
        return result
Example #11
    def qiime_map_fp(self):
        """The QIIME mapping filepath attached to the prep template

        Returns
        -------
        str
            The filepath of the QIIME mapping file
        """
        with TRN:
            sql = """SELECT filepath_id, filepath
                     FROM qiita.filepath
                        JOIN qiita.{0} USING (filepath_id)
                        JOIN qiita.filepath_type USING (filepath_type_id)
                     WHERE {1} = %s AND filepath_type = 'qiime_map'
                     ORDER BY filepath_id DESC""".format(self._filepath_table,
                                                         self._id_column)
            TRN.add(sql, [self._id])
            # We know that the good filepath is the one in the first row
            # because we sorted them in the SQL query
            fn = TRN.execute_fetchindex()[0][1]
            base_dir = get_mountpoint('templates')[0][1]
            return join(base_dir, fn)
Example #12
    def get_filepaths(self):
        r"""Retrieves the list of (filepath_id, filepath)"""
        with TRN:
            try:
                sql = """SELECT filepath_id, filepath
                         FROM qiita.filepath
                         WHERE filepath_id IN (
                            SELECT filepath_id FROM qiita.{0}
                            WHERE {1}=%s)
                         ORDER BY filepath_id DESC""".format(
                    self._filepath_table, self._id_column)

                TRN.add(sql, [self.id])
                filepath_ids = TRN.execute_fetchindex()
            except Exception as e:
                LogEntry.create('Runtime', str(e),
                                info={self.__class__.__name__: self.id})
                raise e

            _, fb = get_mountpoint('templates')[0]
            base_fp = partial(join, fb)

            return [(fpid, base_fp(fp)) for fpid, fp in filepath_ids]
Example #13
def create_non_rarefied_biom_artifact(analysis, biom_data, rarefied_table):
    """Creates the initial non-rarefied BIOM artifact of the analysis

    Parameters
    ----------
    analysis : dict
        Dictionary with the analysis information
    biom_data : dict
        Dictionary with the biom file information
    rarefied_table : biom.Table
        The rarefied BIOM table

    Returns
    -------
    int
        The id of the new artifact
    """
    # The non-rarefied biom artifact is the initial biom table of the
    # analysis. This table does not currently exist anywhere, so we need to
    # actually create the BIOM file. To create this BIOM file we need:
    # (1) the samples and artifacts they come from and (2) whether the
    # samples were renamed or not. (1) is in the database, but we need to
    # infer (2) from the existing rarefied BIOM table. Fun, fun...

    with TRN:
        # Get the samples included in the BIOM table grouped by artifact id
        # Note that the analysis contains a BIOM table per data type included
        # in it, and the table analysis_sample does not differentiate between
        # datatypes, so we need to check the data type in the artifact table
        sql = """SELECT artifact_id, array_agg(sample_id)
                 FROM qiita.analysis_sample
                    JOIN qiita.artifact USING (artifact_id)
                 WHERE analysis_id = %s AND data_type_id = %s
                 GROUP BY artifact_id"""
        TRN.add(sql, [analysis['analysis_id'], biom_data['data_type_id']])
        samples_by_artifact = TRN.execute_fetchindex()

        # Create an empty BIOM table to be the new master table
        new_table = Table([], [], [])
        ids_map = {}
        for a_id, samples in samples_by_artifact:
            # Get the filepath of the BIOM table from the artifact
            artifact = Artifact(a_id)
            biom_fp = None
            for _, fp, fp_type in artifact.filepaths:
                if fp_type == 'biom':
                    biom_fp = fp
            # Note that we are sure the biom table exists, so there is no
            # need to check whether biom_fp is None
            biom_table = load_table(biom_fp)
            samples = set(samples).intersection(biom_table.ids())
            biom_table.filter(samples, axis='sample', inplace=True)
            # we need to check if the table has samples left before merging
            if biom_table.shape[0] != 0 and biom_table.shape[1] != 0:
                new_table = new_table.merge(biom_table)
                ids_map.update(
                    {sid: "%d.%s" % (a_id, sid)
                     for sid in biom_table.ids()})

        # Check if we need to rename the sample ids in the biom table
        new_table_ids = set(new_table.ids())
        if not new_table_ids.issuperset(rarefied_table.ids()):
            # We need to rename the sample ids
            new_table.update_ids(ids_map, 'sample', True, True)

        sql = """INSERT INTO qiita.artifact
                    (generated_timestamp, data_type_id, visibility_id,
                     artifact_type_id, submitted_to_vamps)
            VALUES (%s, %s, %s, %s, %s)
            RETURNING artifact_id"""
        # Magic number 4 -> visibility sandbox
        # Magic number 7 -> biom artifact type
        TRN.add(
            sql,
            [analysis['timestamp'], biom_data['data_type_id'], 4, 7, False])
        artifact_id = TRN.execute_fetchlast()

        # Associate the artifact with the analysis
        sql = """INSERT INTO qiita.analysis_artifact
                    (analysis_id, artifact_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [analysis['analysis_id'], artifact_id])
        # Link the artifact with its file
        dd_id, mp = get_mountpoint('BIOM')[0]
        dir_fp = join(get_db_files_base_dir(), mp, str(artifact_id))
        if not exists(dir_fp):
            makedirs(dir_fp)
        new_table_fp = join(dir_fp, "biom_table.biom")
        with biom_open(new_table_fp, 'w') as f:
            new_table.to_hdf5(f, "Generated by Qiita")

        sql = """INSERT INTO qiita.filepath
                    (filepath, filepath_type_id, checksum,
                     checksum_algorithm_id, data_directory_id)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING filepath_id"""
        # Magic number 7 -> filepath_type_id = 'biom'
        # Magic number 1 -> the checksum algorithm id
        TRN.add(sql, [
            basename(new_table_fp), 7,
            compute_checksum(new_table_fp), 1, dd_id
        ])
        fp_id = TRN.execute_fetchlast()
        sql = """INSERT INTO qiita.artifact_filepath
                    (artifact_id, filepath_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [artifact_id, fp_id])
        TRN.execute()

    return artifact_id
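
The filter-and-merge loop above leans on the biom-format Table API; a small
sketch on toy data of merging two tables into an empty master and prefixing
sample ids the way ids_map does (the artifact id 7 is made up):

    import numpy as np
    from biom import Table

    t1 = Table(np.array([[1, 2]]), ['O1'], ['S1', 'S2'])
    t2 = Table(np.array([[3]]), ['O1'], ['S3'])
    master = Table([], [], []).merge(t1).merge(t2)
    # prefix each sample id with the (made up) artifact id, as ids_map does
    master.update_ids({s: "7.%s" % s for s in master.ids()},
                      'sample', True, True)
    print(list(master.ids()))  # ['7.S1', '7.S2', '7.S3']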
Example #14
            '{"max_rare_depth": "Default", "tree": "", "num_steps": 10, '
            '"min_rare_depth": 10, "metrics": ["chao1", "observed_otus"]}'
        ], [srare_cmd_id, 'Defaults', '{"subsample_multinomial": "False"}']
    ]
    TRN.add(sql, sql_args, many=True)

# At this point we are ready to start transferring the data from the old
# structures to the new structures. Overview of the procedure:
# Step 1: Add initial set of artifacts up to rarefied table
# Step 2: Transfer the "analysis jobs" to processing jobs and create
#         the analysis artifacts
db_dir = get_db_files_base_dir()
with TRN:
    sql = "SELECT * FROM qiita.analysis"
    TRN.add(sql)
    analysis_info = TRN.execute_fetchindex()

    # Loop through all the analyses
    for analysis in analysis_info:
        # Step 1: Add the initial set of artifacts. An analysis starts with
        # a set of BIOM artifacts.
        sql = """SELECT *
                 FROM qiita.analysis_filepath
                    JOIN qiita.filepath USING (filepath_id)
                    JOIN qiita.filepath_type USING (filepath_type_id)
                WHERE analysis_id = %s AND filepath_type = 'biom'"""
        TRN.add(sql, [analysis['analysis_id']])
        analysis_bioms = TRN.execute_fetchindex()

        # Loop through all the biom tables associated with the current analysis
        # so we can create the initial set of artifacts
Example #15
if cols_sample:
    with TRN:
        # a few notes: just getting the sample templates with duplicated
        # column names; ignoring column 'sample_id' and tables
        # 'study_sample', 'prep_template', 'prep_template_sample'
        sql = """SELECT table_name, array_agg(column_name::text)
                    FROM information_schema.columns
                    WHERE column_name IN %s
                        AND table_name LIKE 'sample_%%'
                        AND table_name NOT IN (
                            'prep_template', 'prep_template_sample')
                    GROUP BY table_name"""
        # note that we are looking for those columns with duplicated names in
        # the headers
        TRN.add(sql, [tuple(set(cols_sample))])
        for table, columns in viewitems(dict(TRN.execute_fetchindex())):
            # [1] the format is table_# so taking the #
            st = SampleTemplate(int(table.split('_')[1]))
            # getting just the columns of interest
            st_df = st.to_dataframe()[columns]
            # converting to datetime
            for col in columns:
                st_df[col] = st_df[col].apply(transform_date)
            st.update(st_df)

if cols_prep:
    with TRN:
        # a few notes: just getting the preps with duplicated column names;
        # ignoring column 'sample_id' and tables 'study_sample',
        # 'prep_template', 'prep_template_sample'
        sql = """SELECT table_name, array_agg(column_name::text)
Example #16
    pc_update_sql = """UPDATE qiita.prep_columns
                        SET column_type = 'bool'
                        WHERE prep_template_id = %s AND column_name = %s"""

    for table in tables:
        table_id = table.split("_")[1]
        # Change NaN values to NULL in database
        TRN.add(cols_sql, [table])
        cols = TRN.execute_fetchflatten()
        for col in cols:
            TRN.add(null_sql.format(table, col), [nans])
        TRN.execute()

        # Update now boolean columns to bool in database
        TRN.add("SELECT {0} FROM qiita.{1}".format(",".join(cols), table))
        col_vals = zip(*TRN.execute_fetchindex())
        for col, vals in zip(cols, col_vals):
            if set(vals) == {None}:
                # Ignore columns that are all NULL
                continue
            if all([v in bool_vals for v in vals]):
                # Every value in the column should be bool, so do it
                TRN.add(alter_sql.format(table, col), [false_vals, true_vals])
                if "sample" in table:
                    st_update.add(table_id)
                    TRN.add(ssc_update_sql, [table_id, col])
                else:
                    pr_update.add(table_id)
                    TRN.add(pc_update_sql, [table_id, col])

    TRN.execute()
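
The zip(*rows) trick above transposes the row-major results of
execute_fetchindex into one tuple per column; a toy illustration:

    rows = [('y', 1), ('n', 2), ('y', 3)]   # two columns, three rows
    print(list(zip(*rows)))                 # [('y', 'n', 'y'), (1, 2, 3)]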
Example #17
        TRN.add(sql, [client_id, client_secret])

        sql = """INSERT INTO qiita.oauth_software (software_id, client_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [i, client_id])
        TRN.execute()

    #
    # Generating compressed files for picking failures -- artifact_type = BIOM
    #
    sql = """SELECT artifact_id FROM qiita.artifact
                JOIN qiita.artifact_type USING (artifact_type_id)
                WHERE artifact_type = 'BIOM'"""
    TRN.add(sql)

    for r in TRN.execute_fetchindex():
        to_tgz = None
        a = Artifact(r[0])
        for x in a.filepaths:
            if x['fp_type'] == 'directory':
                # removing / from the path if it exists
                to_tgz = x['fp'][:-1] if x['fp'][-1] == '/' else x['fp']
                break

        if to_tgz is None:
            continue

        tgz = to_tgz + '.tgz'
        if not exists(tgz):
            with taropen(tgz, "w:gz") as tar:
                tar.add(to_tgz, arcname=basename(to_tgz))
Example #18
# October 30th, 2017
# A change introduced in July caused all parameters to be stored as strings
# The DB needs to be patched so all the artifacts follow this structure

from json import dumps

from qiita_db.sql_connection import TRN

with TRN:
    sql = """SELECT *
                FROM qiita.artifact
                    JOIN qiita.artifact_output_processing_job
                        USING (artifact_id)
                WHERE command_id IS NOT NULL"""
    TRN.add(sql)

    sql_update_artifact = """UPDATE qiita.artifact
                             SET command_parameters = %s
                             WHERE artifact_id = %s"""
    sql_update_job = """UPDATE qiita.processing_job
                        SET command_parameters = %s
                        WHERE processing_job_id = %s"""
    for ainfo in TRN.execute_fetchindex():
        ainfo = dict(ainfo)
        params = dumps(
            {k: str(v)
             for k, v in ainfo['command_parameters'].items()})
        TRN.add(sql_update_artifact, [params, ainfo['artifact_id']])
        TRN.add(sql_update_job, [params, ainfo['processing_job_id']])
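
The str() coercion above turns every parameter value into a string before
re-serializing, matching the shape the July change expects; a toy
illustration:

    from json import dumps

    params = {"depth": 1000, "tree": None}
    print(dumps({k: str(v) for k, v in params.items()}))
    # {"depth": "1000", "tree": "None"}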
Example #19
    def create_qiime_mapping_file(self):
        """This creates the QIIME mapping file and links it in the db.

        Returns
        -------
        filepath : str
            The filepath of the created QIIME mapping file

        Raises
        ------
        ValueError
            If the prep template is not a subset of the sample template
        QiitaDBWarning
            If the QIIME-required columns are not present in the template

        Notes
        -----
        We cannot ensure that the QIIME-required columns are present in the
        metadata map. However, we have to generate a QIIME-compliant mapping
        file. Since the user may need a QIIME mapping file, but not these
        QIIME-required columns, we are going to create them and
        populate them with the value XXQIITAXX.
        """
        with TRN:
            rename_cols = {
                'barcode': 'BarcodeSequence',
                'primer': 'LinkerPrimerSequence',
                'description': 'Description',
            }

            if 'reverselinkerprimer' in self.categories():
                rename_cols['reverselinkerprimer'] = 'ReverseLinkerPrimer'
                new_cols = ['BarcodeSequence', 'LinkerPrimerSequence',
                            'ReverseLinkerPrimer']
            else:
                new_cols = ['BarcodeSequence', 'LinkerPrimerSequence']

            # getting the latest sample template
            sql = """SELECT filepath_id, filepath
                     FROM qiita.filepath
                        JOIN qiita.sample_template_filepath
                        USING (filepath_id)
                     WHERE study_id=%s
                     ORDER BY filepath_id DESC"""
            TRN.add(sql, [self.study_id])
            # We know that the good filepath is the one in the first row
            # because we sorted them in the SQL query
            sample_template_fname = TRN.execute_fetchindex()[0][1]
            _, fp = get_mountpoint('templates')[0]
            sample_template_fp = join(fp, sample_template_fname)

            # reading files via pandas
            st = load_template_to_dataframe(sample_template_fp)
            pt = self.to_dataframe()

            st_sample_names = set(st.index)
            pt_sample_names = set(pt.index)

            if not pt_sample_names.issubset(st_sample_names):
                raise ValueError(
                    "Prep template is not a sub set of the sample template, "
                    "file: %s - samples: %s"
                    % (sample_template_fp,
                       ', '.join(pt_sample_names-st_sample_names)))

            mapping = pt.join(st, lsuffix="_prep")
            mapping.rename(columns=rename_cols, inplace=True)

            # Pre-populate the QIIME-required columns with the value XXQIITAXX
            index = mapping.index
            placeholder = ['XXQIITAXX'] * len(index)
            missing = []
            for val in viewvalues(rename_cols):
                if val not in mapping:
                    missing.append(val)
                    mapping[val] = pd.Series(placeholder, index=index)

            if missing:
                warnings.warn(
                    "Some columns required to generate a QIIME-compliant "
                    "mapping file are not present in the template. A "
                    "placeholder value (XXQIITAXX) has been used to populate "
                    "these columns. Missing columns: %s" % ', '.join(missing),
                    QiitaDBWarning)

            # Get the original mapping columns and readjust the order to
            # comply with QIIME requirements
            cols = mapping.columns.values.tolist()
            cols.remove('BarcodeSequence')
            cols.remove('LinkerPrimerSequence')
            cols.remove('Description')
            new_cols.extend(cols)
            new_cols.append('Description')
            mapping = mapping[new_cols]

            # figuring out the filepath for the QIIME map file
            _id, fp = get_mountpoint('templates')[0]
            filepath = join(fp, '%d_prep_%d_qiime_%s.txt' % (self.study_id,
                            self.id, strftime("%Y%m%d-%H%M%S")))

            # Save the mapping file
            mapping.to_csv(filepath, index_label='#SampleID', na_rep='',
                           sep='\t', encoding='utf-8')

            # adding the fp to the object
            self.add_filepath(
                filepath,
                fp_id=convert_to_id("qiime_map", "filepath_type"))

            return filepath
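
The reordering above enforces the QIIME mapping-file convention: the barcode
and primer columns lead (right after #SampleID) and Description goes last; a
toy sketch with a made-up extra column 'ph':

    cols = ['Description', 'BarcodeSequence', 'ph', 'LinkerPrimerSequence']
    new_cols = ['BarcodeSequence', 'LinkerPrimerSequence']
    rest = [c for c in cols if c not in new_cols + ['Description']]
    print(new_cols + rest + ['Description'])
    # ['BarcodeSequence', 'LinkerPrimerSequence', 'ph', 'Description']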
Example #20
        TRN.add(sql, [client_id, client_secret])

        sql = """INSERT INTO qiita.oauth_software (software_id, client_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [i, client_id])
        TRN.execute()

    #
    # Generating compressed files for picking failures -- artifact_type = BIOM
    #
    sql = """SELECT artifact_id FROM qiita.artifact
                JOIN qiita.artifact_type USING (artifact_type_id)
                WHERE artifact_type = 'BIOM'"""
    TRN.add(sql)

    for r in TRN.execute_fetchindex():
        to_tgz = None
        a = Artifact(r[0])
        for _, fp, fp_type in a.filepaths:
            if fp_type == 'directory':
                # removing / from the path if it exists
                to_tgz = fp[:-1] if fp[-1] == '/' else fp
                break

        if to_tgz is None:
            continue

        tgz = to_tgz + '.tgz'
        if not exists(tgz):
            with taropen(tgz, "w:gz") as tar:
                tar.add(to_tgz, arcname=basename(to_tgz))
Example #21
# Feb 11, 2015
# This changes all analysis files to use relative paths instead of absolute

from os.path import basename, dirname

from qiita_db.util import get_mountpoint
from qiita_db.sql_connection import TRN

with TRN:
    sql = """SELECT f.*
             FROM qiita.filepath f
                JOIN qiita.analysis_filepath afp
                    ON f.filepath_id = afp.filepath_id"""
    TRN.add(sql)
    filepaths = TRN.execute_fetchindex()

    # retrieve relative filepaths as dictionary for matching
    mountpoints = {m[1].rstrip('/\\'): m[0] for m in get_mountpoint(
        'analysis', retrieve_all=True)}

    sql = """UPDATE qiita.filepath SET filepath = %s, data_directory_id = %s
             WHERE filepath_id = %s"""
    for filepath in filepaths:
        filename = basename(filepath['filepath'])
        # find the ID of the analysis filepath used
        mp_id = mountpoints[dirname(filepath['filepath']).rstrip('/\\')]
        TRN.add(sql, [filename, mp_id, filepath['filepath_id']])

    TRN.execute()
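
The rewrite above splits each absolute path into the stored filename and the
directory used to look up the data_directory_id; a toy illustration with a
made-up path:

    from os.path import basename, dirname

    fp = '/opt/qiita/analysis/1_analysis_map.txt'   # made-up absolute path
    print(dirname(fp).rstrip('/\\'))  # /opt/qiita/analysis -> mountpoint key
    print(basename(fp))               # 1_analysis_map.txt  -> stored filepath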
Example #22
# -----------------------------------------------------------------------------
# Copyright (c) 2014--, The Qiita Development Team.
#
# Distributed under the terms of the BSD 3-clause License.
#
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------

from qiita_db.sql_connection import TRN

sql = """
    SELECT constraint_name AS cname, 'qiita.' || table_name AS tname
    FROM information_schema.table_constraints
    WHERE constraint_type ='FOREIGN KEY' AND (
        (constraint_name LIKE 'fk_sample_%' AND table_name LIKE 'sample_%') OR
        (constraint_name LIKE 'fk_prep_%' AND table_name LIKE 'prep_%')) AND
        table_name NOT IN (
            'prep_template', 'prep_template_sample', 'prep_template_filepath',
            'prep_template_processing_job')"""

with TRN:
    TRN.add(sql)
    to_delete = TRN.execute_fetchindex()

for cname, tname in to_delete:
    with TRN:
        sql = "ALTER TABLE %s DROP CONSTRAINT %s" % (tname, cname)
        TRN.add(sql)
        TRN.execute()
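
Note that the DDL above is composed with %-interpolation rather than bound
parameters: PostgreSQL placeholders only bind values, not identifiers such as
table or constraint names. A toy illustration of the generated statement, with
made-up names:

    cname, tname = 'fk_sample_1_study_id', 'qiita.sample_1'
    print("ALTER TABLE %s DROP CONSTRAINT %s" % (tname, cname))
    # ALTER TABLE qiita.sample_1 DROP CONSTRAINT fk_sample_1_study_id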
Example #23
    sql = """SELECT table_name, array_agg(column_name::text)
                FROM information_schema.columns
                WHERE column_name IN %s
                    AND column_name != 'sample_id'
                    AND table_name LIKE 'prep_%%'
                    AND table_name NOT IN (
                        'prep_template', 'prep_template_sample')
                GROUP BY table_name"""
    # note that we are looking for those columns with duplicated names in
    # the headers
    headers = set(PrepTemplate.metadata_headers()) & \
        set(SampleTemplate.metadata_headers())

    if headers:
        TRN.add(sql, [tuple(headers)])
        overlapping = dict(TRN.execute_fetchindex())
    else:
        overlapping = None

if overlapping is not None:
    # finding actual duplicates
    for table_name, cols in overlapping.items():
        # leaving the print so when we patch the main system we know
        # whether anything was renamed and can deal with it
        print(table_name)
        with TRN:
            for c in cols:
                sql = 'ALTER TABLE qiita.%s RENAME COLUMN %s TO %s_renamed' % (
                    table_name, c, c)
                TRN.add(sql)
            TRN.execute()
Example #29
# April 5, 2018
# Making sure that all parameters in the artifacts are strings

from json import dumps
from qiita_db.sql_connection import TRN

# Make sure that all artifact parameters are stored as strings
with TRN:
    # Get the command parameters of all artifacts
    sql = """SELECT artifact_id, command_parameters
             FROM qiita.artifact"""
    TRN.add(sql)

    all_rows = TRN.execute_fetchindex()

sql = """UPDATE qiita.artifact
         SET command_parameters = %s
         WHERE artifact_id = %s"""
# taking the loop outside so we can have a TRN per change
for row in all_rows:
    aid, params = row

    if params is None:
        continue

    if not any([isinstance(v, int) for k, v in params.items()]):
        # nothing to stringify for this artifact
        continue

    params = {k: str(v) if isinstance(v, int) else v
              for k, v in params.items()}

    with TRN:
        TRN.add(sql, [dumps(params), aid])
        TRN.execute()