Ejemplo n.º 1
0
def merge_meta(meta, target=None, commit=True, local_repo=None):
    """Merge the EMPD meta data of one file into another one

    All samples of `meta` are inserted into `target` (new samples are added,
    existing ones are overwritten) for those columns that exist in `target`.

    Parameters
    ----------
    meta: str
        The path of the file whose content shall be merged.
    target: str
        The name of the file (relative to `local_repo`) that receives the
        content of `meta`. If not given, the meta file of the `local_repo` is
        used, or ``'meta.tsv'`` if that file is `meta` itself.
    commit: bool
        If True, add and commit `target` in the git repository at
        `local_repo`
    local_repo: str
        The path to the local EMPD-data repository. If None, the directory of
        `meta` is used

    Returns
    -------
    str
        The `target`"""
    if local_repo is None:
        local_repo = osp.dirname(meta)

    if not target:
        repo_meta_name = osp.basename(get_meta_file(local_repo))
        # never merge a file into itself: fall back to the base meta.tsv
        is_repo_meta = osp.samefile(
            meta, osp.join(local_repo, repo_meta_name))
        target = 'meta.tsv' if is_repo_meta else repo_meta_name

    incoming = read_empd_meta(meta)

    target_path = osp.join(local_repo, target)
    # the outer join with the column-less frame only adds the new samples
    merged = read_empd_meta(target_path).join(incoming[[]], how='outer')

    shared = [name for name in incoming.columns if name in merged.columns]
    merged.loc[incoming.index, shared] = incoming

    dump_empd_meta(merged, target_path)

    if commit:
        repo = Repo(local_repo)
        repo.index.add([target])
        commit_message = "Merged {} into {} [skip ci]".format(
            osp.basename(meta), target)
        repo.index.commit(commit_message)

    return target
Ejemplo n.º 2
0
def query_meta(meta,
               query,
               columns='notnull',
               count=False,
               output=None,
               commit=False,
               local_repo=None,
               distinct=False):
    """Query the meta data of a data contribution

    This function uses the :func:`query_samples` function to return a subset
    of the EMPD metadata. The performed query is such as::

        SELECT columns FROM meta WHERE query

    Parameters
    ----------
    meta: str
        The path to the metadata that shall be queried (see
        :func:`~empd_admin.common.read_empd_meta`). If `local_repo` is given,
        this path is interpreted relative to `local_repo`.
    query: str
        The WHERE clause of the SQL query
    columns: str or list of str
        The columns that shall be returned. It can either be a list of columns,
        ``'all'`` to return all columns, or ``'notnull'`` (default) to return
        the non-empty columns
    count: bool
        If True, do not return the values per column but the number of valid
        entries per column (i.e. ``SELECT COUNT(*) FROM meta WHERE query``)
    output: str
        The file name for the tab-delimited result of the query, relative to
        the ``queries`` directory in `local_repo`. If None and `commit` is
        ``True``, ``query.tsv`` is used.
    commit: bool
        If True, commit the changes in the repository `local_repo`
    local_repo: str
        The path of the local EMPD-data repository. If None, it will be assumed
        to be the directory of the given `meta`.
    distinct: str or list of str
        If not empty, drop duplicated rows from the displayed table based on
        the columns listed in this parameter (``'all'`` uses every column).
        For example ``distinct=['Country', 'SampleContext']`` will result in
        ``SELECT DISTINCT ON ('Country', 'SampleContext') ...``. Note that
        this is applied to the returned table only, after `output` has been
        written.

    Returns
    -------
    str
        The `output` file name that has been used for saving the query (see
        `output` and `commit`) or None
    str
        The result of the query as a markdown table, at maximum 200 rows
    """
    max_rows = 200  # maximum number of data rows in the markdown table

    if local_repo is None:
        local_repo = osp.dirname(meta)
    else:
        meta = osp.join(local_repo, meta)
    meta_df = read_empd_meta(meta).replace('', np.nan)
    samples = query_samples(meta_df, query)

    sub = meta_df.loc[samples].reset_index()
    if isinstance(columns, str):
        columns = [columns]

    if 'notnull' in columns:
        missing = []
        notnull = sub.notnull().any(axis=0)
        columns = notnull[notnull].index
    elif 'all' in columns:
        missing = []
        columns = sub.columns
    else:
        # split the request into existing columns and unknown ones, the latter
        # are reported at the end of the returned message
        columns = np.array(columns)
        mask = np.isin(columns, sub.columns)
        missing = columns[~mask]
        columns = columns[mask]
    if count:
        sub = sub[columns].count().to_frame().reset_index().fillna('')
        sub.columns = ['Column', 'Count']
    else:
        sub = sub[columns].fillna('')
    if commit:
        output = output or 'query.tsv'
    if output:
        ofile = osp.join(local_repo, 'queries', output)
        os.makedirs(osp.dirname(ofile), exist_ok=True)
        dump_empd_meta(sub, ofile)

    if commit:
        repo = Repo(local_repo)
        repo.index.add([osp.join('queries', output)])
        repo.index.commit(f'Added {output} [skip ci]\n\n{query}')

    # prepend the row that separates header and body in a markdown table
    sub = pd.concat([
        pd.DataFrame([('---', ) * len(sub.columns)], columns=sub.columns), sub
    ],
                    ignore_index=True)

    if distinct:
        if isinstance(distinct, str):
            # avoid substring matches in the ``'all' in distinct`` test below
            distinct = [distinct]
        if 'all' in distinct:
            distinct = sub.columns
        sub.drop_duplicates(distinct, inplace=True)

    # the first row of `sub` is the markdown separator, so we need one row
    # more than `max_rows` to really display `max_rows` rows of data
    ret = f'<details><summary>{query}</summary>\n\n' + textwrap.indent(
        dump_empd_meta(sub.head(max_rows + 1), sep='|'), '| ')
    ret += '\n\nDisplaying %i of %i rows' % (min(len(sub) - 1, max_rows),
                                             len(sub) - 1)
    if len(missing):
        ret += '\n\nMissing columns ' + ', '.join(missing)
    return output, ret + '\n</details>'
Ejemplo n.º 3
0
def _read_diff_source(source, local_repo):
    """Read EMPD meta data from a url or a path relative to `local_repo`."""
    if url_regex.match(source):
        with tempfile.TemporaryDirectory() as tmpdir:
            download_target = osp.join(tmpdir, 'meta.tsv')
            request.urlretrieve(source, download_target)
            return read_empd_meta(download_target)
    return read_empd_meta(osp.join(local_repo, source))


def diff(meta, left=None, right=None, output=None, commit=False,
         maxdiff=200, *args, **kwargs):
    """Compute the diff between two EMPD metadata files

    This function computes the difference between two EMPD-data files using the
    :func:`compute_diff` function. It takes the meta data of an EMPD-data
    repository and compares it to another

    Parameters
    ----------
    meta: str
        The path to the tab-delimited meta data of a cloned EMPD-data
        repository
    left: str
        The path to the first meta data file, relative to the directory of
        `meta`. Alternatively it can also be a url. If `left` is None, the
        `meta` will be used
    right: str
        The path to the second meta data file, relative to the directory of
        `meta`. Alternatively it can also be a url. If `right` is None, the
        `meta` will be used, or (if `left` is the same as `meta` or None),
        ``'meta.tsv'`` in the directory of `meta`. If that would compare
        ``meta.tsv`` with itself, the meta data of the EMPD2/EMPD-data
        repository at
        https://raw.githubusercontent.com/EMPD2/EMPD-data/master/meta.tsv
        is used.
    output: str
        The filename to use for saving the diff. If set, it will be saved in
        the ``'queries'`` directory, relative to `meta`. If not set but
        `commit` is True, it will be saved to ``'queries/diff.tsv'``.
    commit: bool
        If True, commit the added `output` to the git repository of `meta`
    maxdiff: int
        The maximum number of lines for the diff
    ``*args,**kwargs``
        Any other parameter for the :func:`compute_diff` function

    Returns
    -------
    str
        The `output` file name that has been used for saving the diff (if
        `output` is set or `commit` is True), otherwise None
    str
        The computed difference as markdown table

    Examples
    --------
    For a data contribution, e.g. the test-data branch, you can compute the
    difference to the EMPD meta.tsv via::

        import git
        git.Repo.clone_from('https://github.com/EMPD2/EMPD-data',
                            branch='test-data')
        diff('EMPD-data/test.tsv')

    which is essentially the same as::

        diff('EMPD-data/test.tsv', 'test.tsv', 'meta.tsv')

    You will receive nothing, however, because `how` is set to ``'inner'`` and
    ``'test.tsv'`` contains new samples. Instead, you can set `how` to
    ``'left'`` to include the samples of ``'test.tsv'`` that are not in
    ``'meta.tsv'``::

        diff('EMPD-data/test.tsv', how='left')
    """
    local_repo = osp.dirname(meta)
    meta = osp.basename(meta)
    repo = Repo(local_repo)
    master_url = ('https://raw.githubusercontent.com/EMPD2/EMPD-data/'
                  'master/meta.tsv')
    if left is None:
        left = meta
    if right is None:
        if left == meta:
            base_meta = osp.join(local_repo, 'meta.tsv')
            # `meta` is only the file name at this point, so it has to be
            # resolved against the repository and not the working directory
            if osp.samefile(osp.join(local_repo, meta), base_meta):
                right = master_url
            else:
                right = 'meta.tsv'
        elif left == 'meta.tsv':
            right = master_url
        else:
            right = meta

    left_df = _read_diff_source(left, local_repo)
    right_df = _read_diff_source(right, local_repo)

    diff_df = compute_diff(left_df, right_df, *args, **kwargs)

    if commit and not output:
        output = 'diff.tsv'
    if output:
        target = osp.join(local_repo, 'queries', output)
        if not osp.exists(osp.dirname(target)):
            os.makedirs(osp.dirname(target))
        dump_empd_meta(diff_df, target)
    if commit:
        repo.index.add([osp.join('queries', output)])
        repo.index.commit(f"Added diff between {left} and {right}")

    diff_df.reset_index(inplace=True)

    # prepend the row that separates header and body in a markdown table
    diff_df = pd.concat([
        pd.DataFrame([('---', ) * len(diff_df.columns)],
                     columns=diff_df.columns),
        diff_df], ignore_index=True)

    # the first row is the markdown separator, so we need one row more than
    # `maxdiff` to really display `maxdiff` rows of the diff
    ret = f'<details><summary>{left}..{right}</summary>\n\n' + textwrap.indent(
        dump_empd_meta(diff_df.head(maxdiff + 1), sep='|'),
        '| ')
    ret += '\n\nDisplaying %i of %i rows' % (min(len(diff_df) - 1, maxdiff),
                                             len(diff_df) - 1)

    return output, ret
Ejemplo n.º 4
0
def handle_viewer_request(metadata,
                          submitter,
                          repo='EMPD2/EMPD-data',
                          branch='master',
                          meta='meta.tsv',
                          submitter_gh=None,
                          commit_msg=''):
    """Handle data contribution through the viewer

    Parameters
    ----------
    metadata: list of dict
        The meta data as JSON from the viewer, one dictionary per sample.
        Each dictionary must have a ``'SampleName'`` key. The dictionaries
        are not modified.
    submitter: str
        The name of the submitter
    repo: str
        The name of the repository ('EMPD2/EMPD-data')
    branch: str
        The branch of the repo
    meta: str
        The name of the meta file for the contribution
    submitter_gh: str
        The github username of the `submitter`
    commit_msg: str
        The message that shall be used for the commit

    Returns
    -------
    bool
        True, if everything went fine
    str
        a html-formatted report whether everything worked as expected
    """
    # read the meta data json. We build new dictionaries instead of popping
    # the SampleName to leave the input of the caller untouched
    records = {
        entry['SampleName']: {
            key: val for key, val in entry.items() if key != 'SampleName'}
        for entry in metadata}
    metadata = pd.DataFrame.from_dict(records, 'index')
    if 'Temperature' in metadata.columns:
        metadata['Temperature'] = metadata.Temperature.apply(transform_list)
    if 'Precipitation' in metadata.columns:
        metadata['Precipitation'] = metadata.Precipitation.apply(
            transform_list)
    metadata.index.name = 'SampleName'

    # write the data frame and load it again to have a consistent dump
    with tempfile.TemporaryDirectory() as d2:
        dump_empd_meta(metadata, osp.join(d2, 'tmp.tsv'))
        metadata = read_empd_meta(osp.join(d2, 'tmp.tsv'))

    if repo == 'EMPD2/EMPD-data' and branch == 'master':
        return create_new_pull_request(metadata, submitter, submitter_gh,
                                       commit_msg)
    # check if we can find an existing pull request for the given repository
    pulls = github.Github(
        os.environ['GH_TOKEN']).get_repo('EMPD2/EMPD-data').get_pulls()
    for pull in pulls:
        head_repo = pull.head.repo
        if head_repo is None:
            # the head repository (fork) of this pull request does not exist
            # anymore, so it cannot be the one we are looking for
            continue
        if (pull.state == 'open' and head_repo.full_name == repo
                and pull.head.label.split(':')[1] == branch):
            return edit_pull_request(pull, meta, metadata, submitter,
                                     submitter_gh, commit_msg)

    return False, f"Could not find an open pull request for {repo}:{branch}"
Ejemplo n.º 5
0
def edit_pull_request(pull,
                      meta,
                      metadata,
                      submitter,
                      submitter_gh=None,
                      commit_msg='',
                      commit=True):
    """Edit the meta data of an existing pull request

    The head branch of the pull request is cloned into a temporary directory,
    the rows of `metadata` are written into the `meta` file of this clone and
    the change is committed. Afterwards the branch is pushed and a comment is
    posted in the pull request.

    Parameters
    ----------
    pull: github.PullRequest
        The pull request on github. It is only edited if it has the
        ``'viewer-editable'`` label.
    meta: str
        The name of the meta file for the contribution
    metadata: pandas.DataFrame
        The meta data from the viewer, indexed by the sample name. Only
        columns that also exist in the `meta` file are used.
    submitter: str
        The name of the submitter
    submitter_gh: str
        The github username of the `submitter`
    commit_msg: str
        The message that shall be used for the commit
    commit: bool
        If True, push the changes and comment on the pull request. Note that
        the commit in the temporary clone is created in any case.

    Returns
    -------
    bool
        True if the meta data has been changed, False if the pull request is
        not marked as editable or if nothing has changed
    str
        a html-formatted report of what happened"""
    full_repo = pull.head.repo.full_name
    remote_url = f'https://github.com/{full_repo}.git'
    branch = pull.head.label.split(':')[1]
    # only pull requests that are explicitly labeled may be edited
    if not pull.labels or not any(l.name == 'viewer-editable'
                                  for l in pull.labels):
        return False, (
            f"Pull request {pull.number} for {full_repo}:{branch} is not "
            "marked as editable. To change this, post a new comment in the "
            f"<a href='{pull.html_url}' target='_blank'>PR</a> with "
            "<code>@EMPD-admin allow-edits</code></a>")

    with tempfile.TemporaryDirectory('_empd') as tmpdir:
        repo = Repo.clone_from(remote_url, tmpdir, branch=branch)
        old_meta = read_empd_meta(osp.join(tmpdir, meta))
        # keep an untouched copy to detect whether the edit changes anything
        save_meta = old_meta.copy(True)
        cols = [col for col in metadata.columns if col in old_meta.columns]
        old_meta.loc[metadata.index, cols] = metadata
        n = len(metadata)
        nsamples = '%i sample%s' % (n, 's' if n > 1 else '')
        if old_meta.shape == save_meta.shape and old_meta.equals(save_meta):
            return False, "No data has been edited."
        else:
            dump_empd_meta(old_meta, osp.join(tmpdir, meta))
            repo.index.add([meta])
            commit_msg += '\n\n' if commit_msg else ''
            repo.index.commit(commit_msg +
                              f"Updated {nsamples} in {meta} as requested by "
                              f"{submitter}")
            # NOTE(review): the credentials in this url look masked and the
            # literal contains no ``%s`` placeholder, so the ``%`` formatting
            # with the GH_TOKEN below would raise a TypeError. Presumably the
            # original was of the form ``https://<user>:%s@github.com/`` --
            # TODO confirm against the upstream source
            remote_url = ('https://*****:*****@github.com/'
                          f'{full_repo}.git')
            remote = repo.create_remote('push_remote',
                                        remote_url % os.environ['GH_TOKEN'])
            if commit:
                remote.push(branch)
    pr_owner = '@' + pull.user.login
    uri = pull.html_url
    # also address the submitter if it is not the owner of the pull request
    if submitter_gh and '@' + submitter_gh != pr_owner:
        pr_owner += ' and @' + submitter_gh
    pr_msg = (
        f"Dear {pr_owner}, I just updated {nsamples} in your {meta} file "
        f"as requested via [EMPD2.github.io](https://empd2.github.io/) by "
        f"{submitter}.\n"
        f"If you believe that this is a bug or has been a wrong edit: "
        f"Please ping `@Chilipp`.")
    if commit:
        comment = comment_on_pr('EMPD2',
                                'EMPD-data',
                                pull.number,
                                pr_msg,
                                force=True)
        # link to the new comment instead of the pull request itself
        uri = comment.html_url

    return True, (
        f'Successfully pushed {nsamples} into {full_repo}/{meta} '
        f'and PR <a href="{uri}" title="PR #{pull.number}: {pull.title}">'
        f'#{pull.number}</a>.')
Ejemplo n.º 6
0
def look_for_changed_fixed_tables(meta, pr_owner, pr_repo, pr_branch):
    """Check whether any of the fixed tables has been changed

    The import of the data contribution into the postgres database might add
    new entries into the postgres/scripts/tables files. This function checks
    for this and reports back to the PR. Changed tables are copied from the
    data contribution into the directory given by :func:`get_psql_scripts`.

    Parameters
    ----------
    meta: str
        The path to the meta file of the data contribution
    pr_owner: str
        The owner (github username) of the data contribution
    pr_repo: str
        The name of the repository
    pr_branch: str
        The branch of the data contribution

    Returns
    -------
    str
        The status message to report what happened with the fixed tables.
        It is empty if no table has changed."""
    fixed = [
        'Country', 'GroupID', 'SampleContext', 'SampleMethod', 'SampleType'
    ]
    # The template is filled with str.format (and not with %-formatting of an
    # f-string) such that a `%` in the branch or repository name cannot break
    # the formatting
    template = textwrap.dedent("""
        - postgres/scripts/tables/{table}.tsv - [Edit the file]({url})

          <details><summary>{nrows} changed rows:</summary>

          {rows}
          </details>
        """)
    msg = ''
    changed_tables = []
    local_tables = osp.join(osp.dirname(meta), 'postgres', 'scripts', 'tables')
    for table in fixed:
        fname = osp.join(get_psql_scripts(), 'tables', table + '.tsv')
        new_fname = osp.join(local_tables, table + '.tsv')
        old = pd.read_csv(fname, sep='\t')
        new = pd.read_csv(new_fname, sep='\t')
        # NaN is not equal to itself, so rows with empty cells would always
        # be reported as changed if we compared them without filling
        old_rows = set(map(tuple, old.fillna('').values))
        new_rows = map(tuple, new.fillna('').values)
        # dict.fromkeys removes duplicates but keeps the order of the file
        changed_rows = [
            row for row in dict.fromkeys(new_rows) if row not in old_rows]
        if changed_rows:
            shutil.copyfile(new_fname, fname)
            # the first row separates header and body in the markdown table
            changed = pd.DataFrame([('---', ) * len(new.columns)] +
                                   changed_rows,
                                   columns=new.columns)
            changed_tables.append(table)
            url = (f"https://github.com/{pr_owner}/{pr_repo}/edit/"
                   f"{pr_branch}/postgres/scripts/tables/{table}.tsv")
            msg += template.format(
                table=table, url=url, nrows=len(changed_rows),
                rows=textwrap.indent(dump_empd_meta(changed, sep='|'),
                                     '  | '))
    if changed_tables:
        if len(changed_tables) == 1:
            msg = ("**Note** that one of the fixed tables has been changed!"
                   "\n\n%s\n\nPlease review it. "
                   "") % msg
        else:
            msg = ("**Note** that some of the fixed tables have been changed!"
                   "\n\n%s\n\nPlease review them. ") % msg
        # only for these tables we ask for a rebuild command
        action_required = set(changed_tables) & {
            'GroupID', 'SampleType', 'Country'
        }
        if action_required:
            suffix = 's' if len(action_required) > 1 else ''
            # sorted to always generate the same command for the same tables
            msg += ("If you change the file%s, please tell me via\n"
                    "`@EMPD-admin rebuild %s`\n"
                    "to update the table%s in the database") % (
                        suffix, ' '.join(sorted(action_required)), suffix)
    return msg
Ejemplo n.º 7
0
def fill_repo(meta, db_url, root_db=None, dry_run=False,
              meta_data=True, count_data=True, keep=None,
              how='left', on=None, exclude=[], columns='left', atol=1e-3):
    """Fill the EMPD-data repo with the database in the given URL

    Parameters
    ----------
    meta: str
        The path where to save the data
    db_url: str
        The url where the postgres database can be accessed. Note that we
        expect this database to have a ``'metaViewer'`` table
    root_db: str
        The url where the EMPD2 postgres database can be accessed. This
        parameter is only necessary where ``how != 'left-only'`` and
        `count_data` is True
    dry_run: bool
        If True, do not create any file but only report what would have been
        saved
    meta_data: bool
        If True (default), dump the meta data into `meta`
    count_data: bool
        If True (default), dump the pollen counts in the corresponding file
        of the sample
    keep: list
        Columns to keep from the `root_df`, i.e. the ``meta.tsv`` file in the
        directory of `meta`
    how: str
        How to merge the `root` meta data into the new one. Use
        ``'left-only'`` to skip the comparison with the root meta data.
        Other possibilities are

        inner
            use intersection of samples from both frames, similar to a SQL
            inner join; preserve the order of the left keys.
        outer
            use union of samples from both frames, similar to a SQL full outer
            join; sort keys lexicographically.
        left (default)
            use only samples from the new frame, similar to a SQL left outer
            join; preserve key order.
        right
            use only samples from right frame, similar to a SQL right outer
            join; preserve key order.
    on: list of str
        The names of the columns to compute the diff on. If None, we use the
        intersection of columns between `left` and `right.`
    exclude: list of str
        Columns names that should be excluded in the diff. ``'var_'`` and
        ``'acc_var_'`` are always excluded.
    columns: str or list of str
        The columns of the returned dataframe. It can either be a list of
        column names to use or one of

        leftdiff
            To use the columns from `left` that differ from `right`
        left (default)
            To use all columns from `left`
        rightdiff
            To use the columns from `right` that differ from `left`
        right
            To use all columns from `right`
        inner
            To use the intersection of `left` and `right`
        bothdiff
            To use the differing columns from `right` and `left` (columns from
            `right` are suffixed with an ``'_r'``)
        both
            To use all columns from `left` and `right` (columns from `right`
            are suffixed with an ``'_r'``)

        In any of these cases (except if you specify the column names
        explicitly), the columns the data frame will include a ``diff`` column
        that contains for each sample the columns names of the differing cells.
    atol: float
        Absolute tolerance to use for numeric columns (see the
        :attr:`empd_admin.common.NUMERIC_COLS`).

    Returns
    -------
    str
        The markdown formatted report
    list
        The filenames that have changed (or would have been changed, if
        `dry_run` is True)"""
    engine = sqlalchemy.create_engine(
        db_url, poolclass=sqlalchemy.pool.NullPool)

    outdir = osp.dirname(meta)

    # this creates a copy, the (mutable) default argument is never modified
    exclude = list(exclude) + ['var_', 'acc_var_']

    meta_df = pd.read_sql('metaViewer', engine)

    # merge the monthly climate columns into one comma-separated string each
    climate = pd.read_sql('climate', engine)
    climate['Temperature'] = list(map(
        ','.join, climate.iloc[:, 1:18].values.astype(str)))
    climate['Precipitation'] = list(map(
        ','.join, climate.iloc[:, 18:-1].values.astype(str)))

    meta_df = meta_df.merge(
        climate[['samplename', 'Temperature', 'Precipitation']].rename(
            columns={'samplename': 'SampleName'}), on='SampleName', how='left')

    meta_df.set_index('SampleName', inplace=True)

    # save meta data and load it again to make sure we have a consistent table
    # We use a temporary directory because a NamedTemporaryFile cannot be
    # opened a second time by its name on every platform
    with tempfile.TemporaryDirectory(suffix='_empd') as tmpdir:
        tmp_meta = osp.join(tmpdir, 'meta.tsv')
        dump_empd_meta(meta_df, tmp_meta)
        meta_df = read_empd_meta(tmp_meta)

    if 'okexcept' not in meta_df:
        meta_df['okexcept'] = ''

    files = []
    message = ""

    if how != 'left-only':
        diff_kws = dict(how=how, on=on, exclude=exclude, columns=columns,
                        atol=atol)
        root_df = read_empd_meta(osp.join(outdir, 'meta.tsv'))
        meta_df = compute_diff(meta_df, root_df, **diff_kws)
        if keep:
            meta_df.loc[:, keep] = meta_df[[]].join(root_df[keep], how='left')
    if meta_data and len(meta_df):
        files += [meta]
        if not dry_run:
            dump_empd_meta(meta_df, meta)
        message = f"Dumped {meta_df.shape[0]} lines to {osp.basename(meta)}."
    else:
        message = "No meta data has changed."

    if count_data:
        # the engine for `db_url` from above is reused here
        counts = pd.read_sql_query(
            'SELECT * FROM p_counts LEFT JOIN p_vars USING (var_)', engine,
            index_col=['samplename', 'original_varname'])

        if how != 'left-only':
            root_engine = sqlalchemy.create_engine(
                root_db, poolclass=sqlalchemy.pool.NullPool)
            root_counts = pd.read_sql_query(
                'SELECT * FROM p_counts LEFT JOIN p_vars USING (var_)',
                root_engine, index_col=['samplename', 'original_varname'])
            diff = compute_diff(counts, root_counts, **diff_kws)
            # the first index level holds the names of the samples
            changed = np.unique(diff.index.get_level_values(0))
            files.extend(map('samples/{}.tsv'.format, changed))

            if not dry_run:
                for key, group in counts.reset_index(-1).loc[changed].groupby(
                        level=0):
                    target = osp.join(outdir, 'samples', f'{key}.tsv')
                    dump_empd_meta(group, target)
        else:
            changed = np.unique(counts.index.get_level_values(0))
            files.extend(map('samples/{}.tsv'.format, changed))
            if not dry_run:
                for key, group in counts.groupby(level=0):
                    target = osp.join(outdir, 'samples', f'{key}.tsv')
                    dump_empd_meta(group, target)

        message += f" Changed {len(changed)} count files."

    if dry_run:
        message += '\n\nNo action has been performed because it was a dry run.'

    return message, files
Ejemplo n.º 8
0
def unaccept_query(meta,
                   query,
                   columns,
                   commit=True,
                   skip_ci=False,
                   raise_error=False,
                   local_repo=None):
    """Reverse acceptance for failed meta data based on a SQL query

    This function reverses the acceptance made by the :func:`accept` or
    :func:`accept_query` function, based on a SQL query. The arguments are
    the same as for the :func:`accept_query` function.

    Parameters
    ----------
    meta: str
        The path to the metadata that shall be queried. If `local_repo` is
        given, this path is interpreted relative to `local_repo`.
    query: str
        The ``WHERE`` part of the query (see
        :func:`empd_admin.query.query_samples`).
    columns: list of str
        The columns that shall not be accepted any more. Use ``'all'`` to
        reset the acceptance for every column.
    commit: bool
        If True, commit the changes in the repository of `meta`
    skip_ci: bool
        If True and `commit`, then ``[skip ci]`` will be added to the commit
        message
    raise_error: bool
        If True, raise an error on Failure, otherwise return the error msg
    local_repo: str
        The path of the local EMPD-data repository. If None, it will be assumed
        to be the directory of the given `meta`.

    Returns
    -------
    str
        The status message. None if everything is allright.

    Raises
    ------
    ValueError
        If `raise_error` is True and the `query` does not select any sample

    See Also
    --------
    unaccept

    Examples
    --------
    Do not accept any failure for samples where the Country equals "Germany"::

        unaccept_query(meta, "Country = 'Germany'", ['Country'])
    """
    if local_repo is None:
        local_repo = osp.dirname(meta)
        base_meta = osp.basename(meta)
    else:
        base_meta = meta
        meta = osp.join(local_repo, meta)
    repo = Repo(local_repo)
    meta_df = read_empd_meta(meta)
    samples = query_samples(meta_df, query)

    if not len(samples):
        msg = "No samples selected with %r" % (query, )
        if raise_error:
            raise ValueError(msg)
        else:
            return msg
    if 'okexcept' not in meta_df.columns:
        meta_df['okexcept'] = ''
    else:
        meta_df['okexcept'] = meta_df.okexcept.fillna('')
    nsamples = len(samples)
    for column in columns:
        if column == 'all':
            meta_df.loc[samples, 'okexcept'] = ''
            message = (f"Do not accept any failure for {nsamples} samples\n\n"
                       f"based on '{query}'")
        else:
            # okexcept holds a list like ``'Country,Latitude,'``. We need the
            # substring replacement of the ``str`` accessor here because
            # Series.replace would only match cells that are exactly equal to
            # ``column + ','``
            meta_df.loc[samples, 'okexcept'] = meta_df.loc[
                samples, 'okexcept'].str.replace(
                    column + ',', '', regex=False)
            message = (
                f"Do not accept wrong {column} for {nsamples} samples\n\n"
                f"based on '{query}'")

    # the file is written in any case, `commit` only controls the git commit
    dump_empd_meta(meta_df, meta)
    if commit:
        repo.index.add([base_meta])
        repo.index.commit(message + ('\n\n[skip ci]' if skip_ci else ''))
        return None
    return ("Marked the fields as accepted but without having it "
            "commited. %i sample%s would have been affected.") % (
                nsamples, 's' if nsamples > 1 else '')
Ejemplo n.º 9
0
def unaccept(meta,
             what,
             commit=True,
             skip_ci=False,
             raise_error=False,
             exact=False,
             local_repo=None):
    """Reverse acceptance for failed meta data

    This function reverses the acceptance made by the :func:`accept` or
    :func:`accept_query` function. Arguments are the same as for the
    :func:`accept` function, despite the fact that the `column` part in `what`
    can also be `all`.

    Parameters
    ----------
    meta: str
        The path to the metadata. If `local_repo` is given, this path is
        interpreted relative to `local_repo`.
    what: list of tuple
        A list of ``(sample, column)`` pairs where `sample` is a regular
        expression (or the name of the sample if `exact`) and the `column` is
        the column for the corresponding sample that shall not be accepted
        anymore. Both, `sample` and `column`, can be ``'all'``.
    commit: bool
        If True, commit the changes in the repository of `meta`. One commit
        is made for each item in `what` that changed something.
    skip_ci: bool
        If True and `commit`, then ``[skip ci]`` will be added to the commit
        message
    raise_error: bool
        If True, raise an error on Failure, otherwise return the error msg
    exact: bool
        If True, samples must be equal to the `sample` part in `what`.
        Otherwise we use regular expressions
    local_repo: str
        The path of the local EMPD-data repository. If None, it will be assumed
        to be the directory of the given `meta`.

    Returns
    -------
    str
        The status message. None if everything is allright.

    Raises
    ------
    ValueError
        If `raise_error` is True and a sample in `what` cannot be found

    Examples
    --------
    Do not accept any failure for any column::

        unaccept(meta, [('all', 'all')])

    Do not accept any failure for latitudes or longitudes with samples that
    start with ``'Barboni'``::

        unaccept(meta, [('Barboni', 'Latitude'), ('Barboni', 'Longitude')])

    Do not accept wrong Temperature for the sample ``'Beaudouin_a1'``::

        unaccept(meta, [('Beaudouin_a1', 'Temperature')], exact=True)

    .. note::

        If you skip the `exact` parameter above, wrong temperatures would
        also be not accepted anymore for the sample ``Beaudouin_a10``!
    """
    if local_repo is None:
        local_repo = osp.dirname(meta)
        base_meta = osp.basename(meta)
    else:
        base_meta = meta
        meta = osp.join(local_repo, meta)
    repo = Repo(local_repo)
    meta_df = read_empd_meta(meta).reset_index()
    samples = np.unique([t[0] for t in what])

    valid = (samples == 'all')
    if exact:
        valid |= np.isin(samples, meta_df.SampleName.values)
    else:
        valid |= np.array(
            [meta_df.SampleName.str.contains(s).any() for s in samples])

    if not valid.all():
        msg = "Missing samples %s in %s" % (samples[~valid],
                                            osp.basename(meta))
        if raise_error:
            raise ValueError(msg)
        else:
            return msg
    if 'okexcept' not in meta_df.columns or not meta_df.okexcept.any():
        return  # nothing has been accepted, so there is nothing to revert
    # NaN is not equal to itself, so missing values would always be detected
    # as a change in the comparison with `old_okexcept` below
    meta_df['okexcept'] = meta_df.okexcept.fillna('')
    old_okexcept = meta_df.okexcept.copy(True)
    names = meta_df.SampleName
    messages = []
    for sample, column in what:
        # select the rows that are affected by this item
        if sample == 'all':
            selected = np.ones(len(meta_df), dtype=bool)
        elif exact:
            selected = names == sample
        else:
            selected = names.str.contains(sample)

        if column == 'all':
            meta_df.loc[selected, 'okexcept'] = ''
        else:
            # okexcept holds a list like ``'Country,Latitude,'``. We need the
            # substring replacement of the ``str`` accessor here because
            # Series.replace would only match cells that are exactly equal to
            # ``column + ','``
            meta_df.loc[selected, 'okexcept'] = meta_df.loc[
                selected, 'okexcept'].str.replace(
                    column + ',', '', regex=False)

        if sample == 'all' and column == 'all':
            message = 'Do not accept any failure'
        elif sample == 'all':
            message = f"Do not accept wrong {column} for all samples"
        elif column == 'all':
            message = f"Do not accept any failure for sample {sample}"
        else:
            message = f"Do not accept wrong {column} for sample {sample}"

        # report every item, also the ones for 'all' samples
        messages.append(message)

        if commit and (old_okexcept != meta_df['okexcept']).any():
            dump_empd_meta(meta_df, meta)
            repo.index.add([base_meta])
            repo.index.commit(message + ('\n\n[skip ci]' if skip_ci else ''))
        old_okexcept = meta_df['okexcept'].copy(True)
    if not commit:
        dump_empd_meta(meta_df, meta)
        return ("Reverted the acceptance of mentioned erroneous fields but "
                "did not commit.\n\n- " + "\n- ".join(messages))
Ejemplo n.º 10
0
def accept_query(meta,
                 query,
                 columns,
                 commit=True,
                 skip_ci=False,
                 raise_error=False,
                 local_repo=None):
    """Accept failed metadata based on a query for the pandas.DataFrame.query

    This function can accept failed `columns` for samples based on a `query`.
    The given `columns` are appended to the comma-separated ``okexcept``
    field of every selected sample (duplicates are removed and the entries
    are sorted).

    The sql expression would be something like::

        UPDATE meta SET okexcept = ','.join(columns) WHERE query

    Parameters
    ----------
    meta: str
        The path to the metadata that shall be queried
    query: str
        The ``WHERE`` part of the query (see
        :func:`empd_admin.query.query_samples`).
    columns: list of str
        The columns that shall be marked as accepted (they will be appended to
        the existing columns)
    commit: bool
        If True, commit the changes in the repository `local_repo`
    skip_ci: bool
        If True and `commit`, then ``[skip ci]`` will be added to the commit
        message
    raise_error: bool
        If True, raise an error on Failure, otherwise return the error msg
    local_repo: str
        The path of the local EMPD-data repository. If None, it will be assumed
        to be the directory of the given `meta`.

    Returns
    -------
    str
        The status message. None if everything is alright (i.e. the changes
        have been committed).

    Raises
    ------
    ValueError
        If `raise_error` is True and no `columns` are given or the `query`
        does not select any sample

    See Also
    --------
    accept

    Examples
    --------
    Accept wrong countries for all samples that have a missing latitude or
    longitude::

        accept_query(
            meta, "Latitude is NULL or Longitude is NULL", ['Country'])
    """
    # materialize the columns, we need them for the update *and* the message
    columns = list(columns)
    if not columns:
        # nothing to accept: report it as a failure instead of running into
        # an undefined commit message later on
        msg = "No columns specified that shall be accepted"
        if raise_error:
            raise ValueError(msg)
        else:
            return msg

    if local_repo is None:
        local_repo = osp.dirname(meta)
        base_meta = osp.basename(meta)
    else:
        # `meta` is given relative to the repository
        base_meta = meta
        meta = osp.join(local_repo, meta)
    repo = Repo(local_repo)
    meta_df = read_empd_meta(meta)
    samples = query_samples(meta_df, query)
    if not len(samples):
        msg = "No samples selected with %r" % (query, )
        if raise_error:
            raise ValueError(msg)
        else:
            return msg

    # make sure that we can append to the okexcept column
    if 'okexcept' not in meta_df.columns:
        meta_df['okexcept'] = ''
    else:
        meta_df['okexcept'] = meta_df.okexcept.fillna('')

    nsamples = len(samples)
    for column in columns:
        meta_df.loc[samples, 'okexcept'] += column + ','
        # drop the trailing comma, remove duplicates, sort and restore the
        # trailing comma
        meta_df.loc[samples, 'okexcept'] = meta_df.loc[
            samples, 'okexcept'].apply(
                lambda s: ','.join(sorted(set(s[:-1].split(',')))) + ',')

    dump_empd_meta(meta_df, meta)

    if commit:
        # mention every accepted column in the commit message, not only the
        # last one
        message = (f"Accept wrong {', '.join(columns)} for {nsamples} "
                   f"samples\n\nbased on '{query}'")
        repo.index.add([base_meta])
        repo.index.commit(message + ('\n\n[skip ci]' if skip_ci else ''))
        return None

    return ("Marked the fields as accepted but without having it "
            "commited. %i sample%s would have been affected.") % (
                nsamples, 's' if nsamples > 1 else '')
Ejemplo n.º 11
0
def accept(meta,
           what,
           commit=True,
           skip_ci=False,
           raise_error=False,
           exact=False,
           local_repo=None):
    """Accept failed metadata

    This function marks columns for specific cells as `okexcept`, such that it
    passes the EMPD-data tests

    Parameters
    ----------
    meta: str
        The path to the metadata
    what: list of str or list of tuples
        A list of strings like `sample:column` (or the corresponding
        ``(sample, column)`` pairs) where `sample` is a regular
        expression (or the name of the sample if `exact`) and the `column` is
        the column for the corresponding sample that shall be accepted. The
        `sample` can also be ``'all'`` to match all samples in the metadata
    commit: bool
        If True, commit the changes in the repository of `meta`. One commit
        is made for each item in `what` that changed the metadata
    skip_ci: bool
        If True and `commit`, then ``[skip ci]`` will be added to the commit
        message
    raise_error: bool
        If True, raise an error on Failure, otherwise return the error msg
    exact: bool
        If True, samples must be equal to the `sample` part in `what`.
        Otherwise we use regular expressions
    local_repo: str
        The path of the local EMPD-data repository. If None, it will be assumed
        to be the directory of the given `meta`.

    Returns
    -------
    str
        The status message. None if everything is alright (i.e. the changes
        have been committed).

    Raises
    ------
    ValueError
        If `raise_error` is True and a `sample` in `what` does not match any
        sample in the metadata

    Examples
    --------
    Accept wrong countries for all samples::

        accept(meta, ['all:Country'])

    Accept wrong latitudes and longitudes for all samples that start with
    ``'Barboni'``::

        accept(meta, ['Barboni:Latitude', 'Barboni:Longitude'])

    Accept wrong Temperature for the sample ``'Beaudouin_a1'`` and nothing
    else::

        accept(meta, ['Beaudouin_a1:Temperature'], exact=True)

    .. note::

        If you skip the `exact` parameter above, wrong temperatures would
        also be accepted for the sample ``Beaudouin_a10``!"""

    if local_repo is None:
        local_repo = osp.dirname(meta)
        base_meta = osp.basename(meta)
    else:
        # `meta` is given relative to the repository
        base_meta = meta
        meta = osp.join(local_repo, meta)
    repo = Repo(local_repo)
    meta_df = read_empd_meta(meta).reset_index()

    # Support the documented 'sample:column' strings as well as
    # (sample, column) pairs. We split at the last colon because the sample
    # part may be a regular expression that contains colons itself.
    pairs = [
        tuple(item.rsplit(':', 1))
        if isinstance(item, str) and ':' in item else tuple(item)
        for item in what]
    samples = np.unique([sample for sample, column in pairs])

    # every requested sample must be 'all' or match at least one SampleName
    valid = (samples == 'all')
    if exact:
        valid |= np.isin(samples, meta_df.SampleName.values)
    else:
        valid |= np.array(
            [meta_df.SampleName.str.contains(s).any() for s in samples])

    if not valid.all():
        msg = "Missing samples %s in %s" % (samples[~valid],
                                            osp.basename(meta))
        if raise_error:
            raise ValueError(msg)
        else:
            return msg

    # make sure that we can append to the okexcept column
    if 'okexcept' not in meta_df.columns:
        meta_df['okexcept'] = ''
    else:
        meta_df['okexcept'] = meta_df.okexcept.fillna('')
    names = meta_df.SampleName
    messages = []
    for sample, column in pairs:
        if sample == 'all':
            slicer = slice(None)
            message = f"Accept wrong {column} for all samples"
        else:
            if exact:
                slicer = names == sample
            else:
                slicer = names.str.contains(sample)
            message = f"Accept wrong {column} for sample {sample}"
        old_okexcept = meta_df['okexcept'].copy(True)
        meta_df.loc[slicer, 'okexcept'] += column + ','
        # drop the trailing comma, remove duplicates, sort and restore the
        # trailing comma
        meta_df.loc[slicer, 'okexcept'] = meta_df.loc[
            slicer, 'okexcept'].apply(
                lambda s: ','.join(sorted(set(s[:-1].split(',')))) + ',')
        messages.append(message)

        # only commit if this item really changed something to avoid empty
        # commits (e.g. when the column has already been accepted)
        if commit and (old_okexcept != meta_df['okexcept']).any():
            dump_empd_meta(meta_df, meta)
            repo.index.add([base_meta])
            repo.index.commit(message + ('\n\n[skip ci]' if skip_ci else ''))
    if not commit:
        dump_empd_meta(meta_df, meta)
        return ("Marked the fields as accepted but without having it "
                "commited\n\n- " + "\n- ".join(messages))
    return None