Example #1
def delete_analysis(job):
    """Deletes a full analysis

    Parameters
    ----------
    job : qiita_db.processing_job.ProcessingJob
        The processing job performing the task
    """
    with qdb.sql_connection.TRN:
        analysis_id = job.parameters.values['analysis_id']
        analysis = qdb.analysis.Analysis(analysis_id)

        # selecting the roots of the analysis; there can be more than one
        artifacts = [a for a in analysis.artifacts
                     if a.processing_parameters is None]
        # deleting each of the processing graphs
        for a in artifacts:
            to_delete = list(a.descendants.nodes())
            to_delete.reverse()
            for td in to_delete:
                qdb.artifact.Artifact.delete(td.id)
        qdb.analysis.Analysis.delete(analysis_id)

        r_client.delete('analysis_delete_%d' % analysis_id)

        job._set_status('success')
Example #2
    def test_get(self):
        # Create the usernames key so we can do autocomplete
        r_client.zadd('qiita-usernames', **{u: 0 for u in User.iter()})
        response = self.get(self.base_url % 't')
        self.assertEqual(response.code, 200)
        self.assertEqual(
            loads(response.body),
            {'results': [{
                "id": "*****@*****.**",
                "text": "*****@*****.**"
            }]})

        response = self.get(self.base_url % 'admi')
        self.assertEqual(response.code, 200)
        self.assertEqual(
            loads(response.body),
            {'results': [{
                "id": "*****@*****.**",
                "text": "*****@*****.**"
            }]})

        response = self.get(self.base_url % 'tesq')
        self.assertEqual(response.code, 200)
        self.assertEqual(loads(response.body), {'results': []})

        r_client.delete('qiita-usernames')
Example #3
    def test_get(self):
        base_url = '/study/sharing/autocomplete/?text=%s'

        r_client.zadd('qiita-usernames', {e: 0 for e, n in User.iter()})
        response = self.get(base_url % 't')
        self.assertEqual(response.code, 200)
        self.assertEqual(
            loads(response.body),
            {'results': [{
                "id": "*****@*****.**",
                "text": "*****@*****.**"
            }]})

        response = self.get(base_url % 'admi')
        self.assertEqual(response.code, 200)
        self.assertEqual(
            loads(response.body),
            {'results': [{
                "id": "*****@*****.**",
                "text": "*****@*****.**"
            }]})

        response = self.get(base_url % 'tesq')
        self.assertEqual(response.code, 200)
        self.assertEqual(loads(response.body), {'results': []})

        r_client.delete('qiita-usernames')
Example #4
def delete_analysis(job):
    """Deletes a full analysis

    Parameters
    ----------
    job : qiita_db.processing_job.ProcessingJob
        The processing job performing the task
    """
    with qdb.sql_connection.TRN:
        analysis_id = job.parameters.values['analysis_id']
        analysis = qdb.analysis.Analysis(analysis_id)

        # selecting the roots of the analysis; there can be more than one
        artifacts = [
            a for a in analysis.artifacts if a.processing_parameters is None
        ]
        # deleting each of the processing graphs
        for a in artifacts:
            to_delete = list(a.descendants.nodes())
            to_delete.reverse()
            for td in to_delete:
                qdb.artifact.Artifact.delete(td.id)
        qdb.analysis.Analysis.delete(analysis_id)

        r_client.delete('analysis_delete_%d' % analysis_id)

        job._set_status('success')
Example #5
    def test_post_select_samples(self):
        # just making sure that the key is not set in redis
        r_client.delete('maintenance')
        response = self.get('/auth/reset/')
        self.assertEqual(response.code, 200)
        self.assertIn(('<label for="newpass2" class="col-sm-2 '
                       'control-label">Repeat New Password'
                       '</label>'), response.body)

        # not displaying due to maintenance
        r_client.set('maintenance', 'This is my error message')
        response = self.get('/auth/reset/')
        self.assertEqual(response.code, 200)
        self.assertNotIn(('<label for="newpass2" class="col-sm-2 '
                          'control-label">Repeat New Password'
                          '</label>'), response.body)
        r_client.delete('maintenance')
Example #6
    def test_post_select_samples(self):
        # just making sure that the key is not set in redis
        r_client.delete('maintenance')
        response = self.get('/auth/reset/')
        self.assertEqual(response.code, 200)
        self.assertIn(('<label for="newpass2" class="col-sm-2 '
                       'control-label">Repeat New Password'
                       '</label>'), response.body)

        # not displaying due to maintenance
        r_client.set('maintenance', 'This is my error message')
        response = self.get('/auth/reset/')
        self.assertEqual(response.code, 200)
        self.assertNotIn(('<label for="newpass2" class="col-sm-2 '
                          'control-label">Repeat New Password'
                          '</label>'), response.body)
        r_client.delete('maintenance')
Example #7
def delete_analysis(job):
    """Deletes a full analysis

    Parameters
    ----------
    job : qiita_db.processing_job.ProcessingJob
        The processing job performing the task
    """
    with qdb.sql_connection.TRN:
        analysis_id = job.parameters.values['analysis_id']
        analysis = qdb.analysis.Analysis(analysis_id)

        _delete_analysis_artifacts(analysis)

        r_client.delete('analysis_delete_%d' % analysis_id)

        job._set_status('success')
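
Note: this variant delegates the cleanup to a _delete_analysis_artifacts helper that is not included in this listing. A minimal sketch of what such a helper could look like, reconstructed from the inline logic of Example #1 and Example #10 (a hypothetical reconstruction, not the actual Qiita implementation):

def _delete_analysis_artifacts(analysis):
    # hypothetical sketch: delete the analysis' artifacts from newest to
    # oldest (as in Example #10) so no artifact is removed before the
    # artifacts derived from it, then remove the analysis itself
    artifacts = sorted(analysis.artifacts, key=lambda a: a.id, reverse=True)
    for artifact in artifacts:
        qdb.artifact.Artifact.delete(artifact.id)
    qdb.analysis.Analysis.delete(analysis.id)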
Example #8
    def test_get(self):
        # Create the usernames key so we can do autocomplete
        r_client.zadd('qiita-usernames', **{e: 0 for e, n in User.iter()})
        response = self.get(self.base_url % 't')
        self.assertEqual(response.code, 200)
        self.assertEqual(loads(response.body),
                         {'results': [{"id": "*****@*****.**",
                                       "text": "*****@*****.**"}]})

        response = self.get(self.base_url % 'admi')
        self.assertEqual(response.code, 200)
        self.assertEqual(loads(response.body),
                         {'results': [{"id": "*****@*****.**",
                                       "text": "*****@*****.**"}]})

        response = self.get(self.base_url % 'tesq')
        self.assertEqual(response.code, 200)
        self.assertEqual(loads(response.body),
                         {'results': []})

        r_client.delete('qiita-usernames')
Example #9
    def test_get(self):
        base_url = '/study/sharing/autocomplete/?text=%s'

        r_client.zadd('qiita-usernames', {e: 0 for e, n in User.iter()})
        response = self.get(base_url % 't')
        self.assertEqual(response.code, 200)
        self.assertEqual(loads(response.body),
                         {'results': [{"id": "*****@*****.**",
                                       "text": "*****@*****.**"}]})

        response = self.get(base_url % 'admi')
        self.assertEqual(response.code, 200)
        self.assertEqual(loads(response.body),
                         {'results': [{"id": "*****@*****.**",
                                       "text": "*****@*****.**"}]})

        response = self.get(base_url % 'tesq')
        self.assertEqual(response.code, 200)
        self.assertEqual(loads(response.body),
                         {'results': []})

        r_client.delete('qiita-usernames')
Example #10
def delete_analysis(job):
    """Deletes a full analysis

    Parameters
    ----------
    job : qiita_db.processing_job.ProcessingJob
        The processing job performing the task
    """
    with qdb.sql_connection.TRN:
        analysis_id = job.parameters.values['analysis_id']
        analysis = qdb.analysis.Analysis(analysis_id)

        artifacts = sorted(
            analysis.artifacts, key=lambda a: a.id, reverse=True)

        for artifact in artifacts:
            qdb.artifact.Artifact.delete(artifact.id)

        qdb.analysis.Analysis.delete(analysis_id)

        r_client.delete('analysis_delete_%d' % analysis_id)

        job._set_status('success')
Example #11
def delete_analysis(job):
    """Deletes a full analysis

    Parameters
    ----------
    job : qiita_db.processing_job.ProcessingJob
        The processing job performing the task
    """
    with qdb.sql_connection.TRN:
        analysis_id = job.parameters.values['analysis_id']
        analysis = qdb.analysis.Analysis(analysis_id)

        artifacts = sorted(
            analysis.artifacts, key=lambda a: a.id, reverse=True)

        for artifact in artifacts:
            qdb.artifact.Artifact.delete(artifact.id)

        qdb.analysis.Analysis.delete(analysis_id)

        r_client.delete('analysis_delete_%d' % analysis_id)

        job._set_status('success')
Example #12
def correct_redis_data(key, cmd, values_dict, user):
    """Corrects the data stored in the redis DB

    Parameters
    ----------
    key : str
        The redis key to fix
    cmd : qiita_db.software.Command
        Command to use to create the processing job
    values_dict : dict
        Dictionary used to instantiate the parameters of the command
    user : qiita_db.user.User
        The user that will own the job
    """
    info = r_client.get(key)
    if info:
        info = loads(info)
        if info['job_id'] is not None:
            if 'is_qiita_job' in info:
                if info['is_qiita_job']:
                    try:
                        job = ProcessingJob(info['job_id'])
                        payload = {'job_id': info['job_id'],
                                   'alert_type': info['status'],
                                   'alert_msg': info['alert_msg']}
                        r_client.set(key, dumps(payload))
                    except (QiitaDBUnknownIDError, KeyError):
                        # We somehow lost the information of this job
                        # Simply delete the key
                        r_client.delete(key)
                else:
                    # These jobs don't contain any information on the live
                    # dump. We can safely delete the key
                    r_client.delete(key)
            else:
                # These jobs don't contain any information on the live
                # dump. We can safely delete the key
                r_client.delete(key)
        else:
            # Job is null, we have the information here
            if info['status'] == 'success':
                # In the success case no information is stored. We can
                # safely delete the key
                r_client.delete(key)
            elif info['status'] == 'warning':
                # In case of warning the key message stores the warning
                # message. We need to create a new job, mark it as
                # successful and store the warning message as expected by
                # the new structure
                params = Parameters.load(cmd, values_dict=values_dict)
                job = ProcessingJob.create(user, params)
                job._set_status('success')
                payload = {'job_id': job.id,
                           'alert_type': 'warning',
                           'alert_msg': info['message']}
                r_client.set(key, dumps(payload))
            else:
                # The status is error. The key message stores the error
                # message. We need to create a new job and mark it as
                # failed with the given error message
                params = Parameters.load(cmd, values_dict=values_dict)
                job = ProcessingJob.create(user, params)
                job._set_error(info['message'])
                payload = {'job_id': job.id}
                r_client.set(key, dumps(payload))
    else:
        # The key doesn't contain any information. Delete the key
        r_client.delete(key)
Example #13
    # Create the command to complete a job
    parameters = {'job_id': ['string', None], 'payload': ['string', None]}
    create_command(qiita_plugin, "complete_job", "Completes a given job",
                   parameters)

    # Assumptions on the structure of the data in the redis database have
    # changed; we need to fix it to avoid failures
    # Get all the sample template keys
    for key in r_client.keys('sample_template_[0-9]*'):
        try:
            study = Study(int(key.split('_')[-1]))
            user = study.owner
        except QiitaDBUnknownIDError:
            # This means that the study no longer exists - delete the key
            # and continue
            r_client.delete(key)
            continue
        values_dict = {'study': study.id, 'template_fp': 'ignored-patch58'}
        correct_redis_data(key, st_cmd, values_dict, user)

    # Get all the prep template keys
    for key in r_client.keys('prep_template_[0-9]*'):
        try:
            pt = PrepTemplate(int(key.split('_')[-1]))
            user = Study(pt.study_id).owner
        except QiitaDBUnknownIDError:
            # This means that the prep template no longer exists - delete the
            # key and continue
            r_client.delete(key)
            continue
        values_dict = {'prep_template': pt.id,
Example #14
def update_redis_stats():
    """Generate the system stats and save them in redis

    Returns
    -------
    list of str
        artifact filepaths that are not present in the file system
    """
    STUDY = qdb.study.Study
    studies = {
        'public': STUDY.get_by_status('public'),
        'private': STUDY.get_by_status('private'),
        'sandbox': STUDY.get_by_status('sandbox')
    }
    number_studies = {k: len(v) for k, v in viewitems(studies)}

    number_of_samples = {}
    ebi_samples_prep = {}
    num_samples_ebi = 0
    for k, sts in viewitems(studies):
        number_of_samples[k] = 0
        for s in sts:
            st = s.sample_template
            if st is not None:
                number_of_samples[k] += len(list(st.keys()))

            ebi_samples_prep_count = 0
            for pt in s.prep_templates():
                ebi_samples_prep_count += len([
                    1 for _, v in viewitems(pt.ebi_experiment_accessions)
                    if v is not None and v != ''
                ])
            ebi_samples_prep[s.id] = ebi_samples_prep_count

            if s.sample_template is not None:
                num_samples_ebi += len([
                    1 for _, v in viewitems(
                        s.sample_template.ebi_sample_accessions)
                    if v is not None and v != ''
                ])

    num_users = qdb.util.get_count('qiita.qiita_user')
    num_processing_jobs = qdb.util.get_count('qiita.processing_job')

    lat_longs = dumps(get_lat_longs())

    num_studies_ebi = len(
        [k for k, v in viewitems(ebi_samples_prep) if v >= 1])
    number_samples_ebi_prep = sum([v for _, v in viewitems(ebi_samples_prep)])

    # generating file size stats
    stats = []
    missing_files = []
    for k, sts in viewitems(studies):
        for s in sts:
            for a in s.artifacts():
                for x in a.filepaths:
                    try:
                        s = stat(x['fp'])
                        stats.append((x['fp_type'], s.st_size,
                                      strftime('%Y-%m',
                                               localtime(s.st_ctime))))
                    except OSError:
                        missing_files.append(x['fp'])

    summary = {}
    all_dates = []
    for ft, size, ym in stats:
        if ft not in summary:
            summary[ft] = {}
        if ym not in summary[ft]:
            summary[ft][ym] = 0
            all_dates.append(ym)
        summary[ft][ym] += size
    all_dates = sorted(set(all_dates))

    # sorting summaries
    rm_from_data = [
        'html_summary', 'tgz', 'directory', 'raw_fasta', 'log', 'biom',
        'raw_sff', 'raw_qual'
    ]
    ordered_summary = {}
    for dt in summary:
        if dt in rm_from_data:
            continue
        new_list = []
        current_value = 0
        for ad in all_dates:
            if ad in summary[dt]:
                current_value += summary[dt][ad]
            new_list.append(current_value)
        ordered_summary[dt] = new_list

    plot_order = sorted([(k, ordered_summary[k][-1]) for k in ordered_summary],
                        key=lambda x: x[1])

    # helper function to generate y axis, modified from:
    # http://stackoverflow.com/a/1094933
    def sizeof_fmt(value, position):
        number = None
        for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
            if abs(value) < 1024.0:
                number = "%3.1f%s" % (value, unit)
                break
            value /= 1024.0
        if number is None:
            number = "%.1f%s" % (value, 'Yi')
        return number

    all_dates_axis = range(len(all_dates))
    plt.locator_params(axis='y', nbins=10)
    plt.figure(figsize=(20, 10))
    for k, v in plot_order:
        plt.plot(all_dates_axis, ordered_summary[k], linewidth=2, label=k)

    plt.xticks(all_dates_axis, all_dates)
    plt.legend()
    plt.grid()
    ax = plt.gca()
    ax.yaxis.set_major_formatter(mpl.ticker.FuncFormatter(sizeof_fmt))
    plt.xticks(rotation=90)
    plt.xlabel('Date')
    plt.ylabel('Storage space per data type')

    plot = BytesIO()
    plt.savefig(plot, format='png')
    plot.seek(0)
    img = 'data:image/png;base64,' + quote(b64encode(plot.getbuffer()))

    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    portal = qiita_config.portal
    vals = [('number_studies', number_studies, r_client.hmset),
            ('number_of_samples', number_of_samples, r_client.hmset),
            ('num_users', num_users, r_client.set),
            ('lat_longs', (lat_longs), r_client.set),
            ('num_studies_ebi', num_studies_ebi, r_client.set),
            ('num_samples_ebi', num_samples_ebi, r_client.set),
            ('number_samples_ebi_prep', number_samples_ebi_prep, r_client.set),
            ('img', img, r_client.set), ('time', time, r_client.set),
            ('num_processing_jobs', num_processing_jobs, r_client.set)]
    for k, v, f in vals:
        redis_key = '%s:stats:%s' % (portal, k)
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)

    return missing_files
Example #15
def correct_redis_data(key, cmd, values_dict, user):
    """Corrects the data stored in the redis DB

    Parameters
    ----------
    key : str
        The redis key to fix
    cmd : qiita_db.software.Command
        Command to use to create the processing job
    values_dict : dict
        Dictionary used to instantiate the parameters of the command
    user : qiita_db.user.User
        The user that will own the job
    """
    info = r_client.get(key)
    if info:
        info = loads(info)
        if info['job_id'] is not None:
            if 'is_qiita_job' in info:
                if info['is_qiita_job']:
                    try:
                        job = ProcessingJob(info['job_id'])
                        payload = {
                            'job_id': info['job_id'],
                            'alert_type': info['status'],
                            'alert_msg': info['alert_msg']
                        }
                        r_client.set(key, dumps(payload))
                    except (QiitaDBUnknownIDError, KeyError):
                        # We somehow lost the information of this job
                        # Simply delete the key
                        r_client.delete(key)
                else:
                    # These jobs don't contain any information on the live
                    # dump. We can safely delete the key
                    r_client.delete(key)
            else:
                # These jobs don't contain any information on the live
                # dump. We can safely delete the key
                r_client.delete(key)
        else:
            # Job is null, we have the information here
            if info['status'] == 'success':
                # In the success case no information is stored. We can
                # safely delete the key
                r_client.delete(key)
            elif info['status'] == 'warning':
                # In case of warning the key message stores the warning
                # message. We need to create a new job, mark it as
                # successful and store the warning message as expected by
                # the new structure
                params = Parameters.load(cmd, values_dict=values_dict)
                job = ProcessingJob.create(user, params)
                job._set_status('success')
                payload = {
                    'job_id': job.id,
                    'alert_type': 'warning',
                    'alert_msg': info['message']
                }
                r_client.set(key, dumps(payload))
            else:
                # The status is error. The key message stores the error
                # message. We need to create a new job and mark it as
                # failed with the given error message
                params = Parameters.load(cmd, values_dict=values_dict)
                job = ProcessingJob.create(user, params)
                job._set_error(info['message'])
                payload = {'job_id': job.id}
                r_client.set(key, dumps(payload))
    else:
        # The key doesn't contain any information. Delete the key
        r_client.delete(key)
Example #16
def generate_plugin_releases():
    """Generate releases for plugins
    """
    ARCHIVE = qdb.archive.Archive
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir

    commands = [c for s in qdb.software.Software.iter(active=True)
                for c in s.commands if c.post_processing_cmd is not None]

    tnow = datetime.now()
    ts = tnow.strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases', 'archive')
    create_nested_path(tgz_dir)
    tgz_dir_release = join(tgz_dir, ts)
    create_nested_path(tgz_dir_release)
    for cmd in commands:
        cmd_name = cmd.name
        mschemes = [v for _, v in ARCHIVE.merging_schemes().items()
                    if cmd_name in v]
        for ms in mschemes:
            ms_name = sub('[^0-9a-zA-Z]+', '', ms)
            ms_fp = join(tgz_dir_release, ms_name)
            create_nested_path(ms_fp)

            pfp = join(ms_fp, 'archive.json')
            archives = {k: loads(v)
                        for k, v in ARCHIVE.retrieve_feature_values(
                              archive_merging_scheme=ms).items()
                        if v != ''}
            with open(pfp, 'w') as f:
                dump(archives, f)

            # now let's run the post_processing_cmd
            ppc = cmd.post_processing_cmd

            # concatenate any other parameters into a string
            params = ' '.join(["%s=%s" % (k, v) for k, v in
                              ppc['script_params'].items()])
            # append archives file and output dir parameters
            params = ("%s --fp_archive=%s --output_dir=%s" % (
                params, pfp, ms_fp))

            ppc_cmd = "%s %s %s" % (
                ppc['script_env'], ppc['script_path'], params)
            p_out, p_err, rv = qdb.processing_job._system_call(ppc_cmd)
            p_out = p_out.rstrip()
            if rv != 0:
                raise ValueError('Error %d: %s' % (rv, p_out))
            p_out = loads(p_out)

    # tgz-ing all files
    tgz_name = join(tgz_dir, 'archive-%s-building.tgz' % ts)
    tgz_name_final = join(tgz_dir, 'archive.tgz')
    with topen(tgz_name, "w|gz") as tgz:
        tgz.add(tgz_dir_release, arcname=basename(tgz_dir_release))
    # getting the release md5
    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)
    rename(tgz_name, tgz_name_final)
    vals = [
        ('filepath', tgz_name_final[len(working_dir):], r_client.set),
        ('md5sum', md5sum.hexdigest(), r_client.set),
        ('time', tnow.strftime('%m-%d-%y %H:%M:%S'), r_client.set)]
    for k, v, f in vals:
        redis_key = 'release-archive:%s' % k
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)
Example #17
def generate_biom_and_metadata_release(study_status='public'):
    """Generate a list of biom/meatadata filepaths and a tgz of those files

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having this exposed helps with testing. The other
        options are 'private' and 'sandbox'
    """
    studies = qdb.study.Study.get_by_status(study_status)
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir
    portal = qiita_config.portal
    bdir = qdb.util.get_db_files_base_dir()
    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    data = []
    for s in studies:
        # [0] latest is first, [1] only getting the filepath
        sample_fp = relpath(s.sample_template.get_filepaths()[0][1], bdir)

        for a in s.artifacts(artifact_type='BIOM'):
            if a.processing_parameters is None:
                continue

            cmd_name = a.processing_parameters.command.name

            # this loop is necessary as in theory an artifact can be
            # generated from multiple prep info files
            human_cmd = []
            for p in a.parents:
                pp = p.processing_parameters
                pp_cmd_name = pp.command.name
                if pp_cmd_name == 'Trimming':
                    human_cmd.append('%s @ %s' % (
                        cmd_name, str(pp.values['length'])))
                else:
                    human_cmd.append('%s, %s' % (cmd_name, pp_cmd_name))
            human_cmd = ', '.join(human_cmd)

            for _, fp, fp_type in a.filepaths:
                if fp_type != 'biom' or 'only-16s' in fp:
                    continue
                fp = relpath(fp, bdir)
                # format: (biom_fp, sample_fp, prep_fp, qiita_artifact_id,
                #          human readable name)
                for pt in a.prep_templates:
                    for _, prep_fp in pt.get_filepaths():
                        if 'qiime' not in prep_fp:
                            break
                    prep_fp = relpath(prep_fp, bdir)
                    data.append((fp, sample_fp, prep_fp, a.id, human_cmd))

    # writing text and tgz file
    ts = datetime.now().strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases')
    if not exists(tgz_dir):
        makedirs(tgz_dir)
    tgz_name = join(tgz_dir, '%s-%s-building.tgz' % (portal, study_status))
    tgz_name_final = join(tgz_dir, '%s-%s.tgz' % (portal, study_status))
    txt_hd = StringIO()
    with topen(tgz_name, "w|gz") as tgz:
        # writing header for txt
        txt_hd.write(
            "biom_fp\tsample_fp\tprep_fp\tqiita_artifact_id\tcommand\n")
        for biom_fp, sample_fp, prep_fp, artifact_id, human_cmd in data:
            txt_hd.write("%s\t%s\t%s\t%s\t%s\n" % (
                biom_fp, sample_fp, prep_fp, artifact_id, human_cmd))
            tgz.add(join(bdir, biom_fp), arcname=biom_fp, recursive=False)
            tgz.add(join(bdir, sample_fp), arcname=sample_fp, recursive=False)
            tgz.add(join(bdir, prep_fp), arcname=prep_fp, recursive=False)

        txt_hd.seek(0)
        info = TarInfo(name='%s-%s-%s.txt' % (portal, study_status, ts))
        info.size = len(txt_hd.buf)
        tgz.addfile(tarinfo=info, fileobj=txt_hd)

    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)

    rename(tgz_name, tgz_name_final)

    vals = [
        ('filepath', tgz_name_final[len(working_dir):], r_client.set),
        ('md5sum', md5sum.hexdigest(), r_client.set),
        ('time', time, r_client.set)]
    for k, v, f in vals:
        redis_key = '%s:release:%s:%s' % (portal, study_status, k)
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)
Example #18
def generate_biom_and_metadata_release(study_status='public'):
    """Generate a list of biom/meatadata filepaths and a tgz of those files

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having this exposed helps with testing. The other
        options are 'private' and 'sandbox'
    """
    studies = qdb.study.Study.get_by_status(study_status)
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir
    portal = qiita_config.portal
    bdir = qdb.util.get_db_files_base_dir()
    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    data = []
    for s in studies:
        # [0] latest is first, [1] only getting the filepath
        sample_fp = relpath(s.sample_template.get_filepaths()[0][1], bdir)

        for a in s.artifacts(artifact_type='BIOM'):
            if a.processing_parameters is None:
                continue

            processing_params = a.processing_parameters
            cmd_name = processing_params.command.name
            ms = processing_params.command.merging_scheme
            software = processing_params.command.software
            software = '%s v%s' % (software.name, software.version)

            # this loop is necessary as in theory an artifact can be
            # generated from multiple prep info files
            afps = [fp for _, fp, _ in a.filepaths if fp.endswith('biom')]
            merging_schemes = []
            parent_softwares = []
            for p in a.parents:
                pparent = p.processing_parameters
                # if parent is None, then it is a direct upload; for example
                # per_sample_FASTQ in shotgun data
                if pparent is None:
                    parent_cmd_name = None
                    parent_merging_scheme = None
                    parent_pp = None
                    parent_software = 'N/A'
                else:
                    parent_cmd_name = pparent.command.name
                    parent_merging_scheme = pparent.command.merging_scheme
                    parent_pp = pparent.values
                    psoftware = pparent.command.software
                    parent_software = '%s v%s' % (
                        psoftware.name, psoftware.version)

                merging_schemes.append(qdb.util.human_merging_scheme(
                    cmd_name, ms, parent_cmd_name, parent_merging_scheme,
                    processing_params.values, afps, parent_pp))
                parent_softwares.append(parent_software)
            merging_schemes = ', '.join(merging_schemes)
            parent_softwares = ', '.join(parent_softwares)

            for _, fp, fp_type in a.filepaths:
                if fp_type != 'biom' or 'only-16s' in fp:
                    continue
                fp = relpath(fp, bdir)
                for pt in a.prep_templates:
                    categories = pt.categories()
                    platform = ''
                    target_gene = ''
                    if 'platform' in categories:
                        platform = ', '.join(
                            set(pt.get_category('platform').values()))
                    if 'target_gene' in categories:
                        target_gene = ', '.join(
                            set(pt.get_category('target_gene').values()))
                    for _, prep_fp in pt.get_filepaths():
                        if 'qiime' not in prep_fp:
                            break
                    prep_fp = relpath(prep_fp, bdir)
                    # format: (biom_fp, sample_fp, prep_fp, qiita_artifact_id,
                    #          platform, target gene, merging schemes,
                    #          artifact software/version,
                    #          parent software/version)
                    data.append((fp, sample_fp, prep_fp, a.id, platform,
                                 target_gene, merging_schemes, software,
                                 parent_softwares))

    # writing text and tgz file
    ts = datetime.now().strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases')
    create_nested_path(tgz_dir)
    tgz_name = join(tgz_dir, '%s-%s-building.tgz' % (portal, study_status))
    tgz_name_final = join(tgz_dir, '%s-%s.tgz' % (portal, study_status))
    txt_hd = StringIO()
    with topen(tgz_name, "w|gz") as tgz:
        txt_hd.write(
            "biom fp\tsample fp\tprep fp\tqiita artifact id\tplatform\t"
            "target gene\tmerging scheme\tartifact software\t"
            "parent software\n")
        for biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv in data:
            txt_hd.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
                biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv))
            tgz.add(join(bdir, biom_fp), arcname=biom_fp, recursive=False)
            tgz.add(join(bdir, sample_fp), arcname=sample_fp, recursive=False)
            tgz.add(join(bdir, prep_fp), arcname=prep_fp, recursive=False)

        txt_hd.seek(0)
        info = TarInfo(name='%s-%s-%s.txt' % (portal, study_status, ts))
        info.size = len(txt_hd.buf)
        tgz.addfile(tarinfo=info, fileobj=txt_hd)

    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)

    rename(tgz_name, tgz_name_final)

    vals = [
        ('filepath', tgz_name_final[len(working_dir):], r_client.set),
        ('md5sum', md5sum.hexdigest(), r_client.set),
        ('time', time, r_client.set)]
    for k, v, f in vals:
        redis_key = '%s:release:%s:%s' % (portal, study_status, k)
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)
Example #19
def generate_biom_and_metadata_release(study_status='public'):
    """Generate a list of biom/meatadata filepaths and a tgz of those files

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having this exposed helps with testing. The other
        options are 'private' and 'sandbox'
    """
    studies = qdb.study.Study.get_by_status(study_status)
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir
    portal = qiita_config.portal
    bdir = qdb.util.get_db_files_base_dir()
    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    data = []
    for s in studies:
        # [0] latest is first, [1] only getting the filepath
        sample_fp = relpath(s.sample_template.get_filepaths()[0][1], bdir)

        for a in s.artifacts(artifact_type='BIOM'):
            if a.processing_parameters is None or a.visibility != study_status:
                continue

            merging_schemes, parent_softwares = a.merging_scheme
            software = a.processing_parameters.command.software
            software = '%s v%s' % (software.name, software.version)

            for x in a.filepaths:
                if x['fp_type'] != 'biom' or 'only-16s' in x['fp']:
                    continue
                fp = relpath(x['fp'], bdir)
                for pt in a.prep_templates:
                    categories = pt.categories()
                    platform = ''
                    target_gene = ''
                    if 'platform' in categories:
                        platform = ', '.join(
                            set(pt.get_category('platform').values()))
                    if 'target_gene' in categories:
                        target_gene = ', '.join(
                            set(pt.get_category('target_gene').values()))
                    for _, prep_fp in pt.get_filepaths():
                        if 'qiime' not in prep_fp:
                            break
                    prep_fp = relpath(prep_fp, bdir)
                    # format: (biom_fp, sample_fp, prep_fp, qiita_artifact_id,
                    #          platform, target gene, merging schemes,
                    #          artifact software/version,
                    #          parent software/version)
                    data.append(
                        (fp, sample_fp, prep_fp, a.id, platform, target_gene,
                         merging_schemes, software, parent_softwares))

    # writing text and tgz file
    ts = datetime.now().strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases')
    create_nested_path(tgz_dir)
    tgz_name = join(tgz_dir, '%s-%s-building.tgz' % (portal, study_status))
    tgz_name_final = join(tgz_dir, '%s-%s.tgz' % (portal, study_status))
    txt_lines = [
        "biom fp\tsample fp\tprep fp\tqiita artifact id\tplatform\t"
        "target gene\tmerging scheme\tartifact software\tparent software"
    ]
    with topen(tgz_name, "w|gz") as tgz:
        for biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv in data:
            txt_lines.append(
                "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %
                (biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv))
            tgz.add(join(bdir, biom_fp), arcname=biom_fp, recursive=False)
            tgz.add(join(bdir, sample_fp), arcname=sample_fp, recursive=False)
            tgz.add(join(bdir, prep_fp), arcname=prep_fp, recursive=False)
        info = TarInfo(name='%s-%s-%s.txt' % (portal, study_status, ts))
        txt_hd = BytesIO()
        txt_hd.write(bytes('\n'.join(txt_lines), 'ascii'))
        txt_hd.seek(0)
        info.size = len(txt_hd.read())
        txt_hd.seek(0)
        tgz.addfile(tarinfo=info, fileobj=txt_hd)

    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)

    rename(tgz_name, tgz_name_final)

    vals = [('filepath', tgz_name_final[len(working_dir):], r_client.set),
            ('md5sum', md5sum.hexdigest(), r_client.set),
            ('time', time, r_client.set)]
    for k, v, f in vals:
        redis_key = '%s:release:%s:%s' % (portal, study_status, k)
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)
Example #20
    # Create the command to complete a job
    parameters = {'job_id': ['string', None], 'payload': ['string', None]}
    create_command(qiita_plugin, "complete_job", "Completes a given job",
                   parameters)

    # Assumptions on the structure of the data in the redis database have
    # changed; we need to fix it to avoid failures
    # Get all the sample template keys
    for key in r_client.keys('sample_template_[0-9]*'):
        try:
            study = Study(int(key.split('_')[-1]))
            user = study.owner
        except QiitaDBUnknownIDError:
            # This means that the study no longer exists - delete the key
            # and continue
            r_client.delete(key)
            continue
        values_dict = {'study': study.id, 'template_fp': 'ignored-patch58'}
        correct_redis_data(key, st_cmd, values_dict, user)

    # Get all the prep template keys
    for key in r_client.keys('prep_template_[0-9]*'):
        try:
            pt = PrepTemplate(int(key.split('_')[-1]))
            user = Study(pt.study_id).owner
        except QiitaDBUnknownIDError:
            # This means that the prep template no longer exists - delete the
            # key and continue
            r_client.delete(key)
            continue
        values_dict = {
Example #21
def generate_biom_and_metadata_release(study_status='public'):
    """Generate a list of biom/meatadata filepaths and a tgz of those files

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having this exposed helps with testing. The other
        options are 'private' and 'sandbox'
    """
    studies = qdb.study.Study.get_by_status(study_status)
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir
    portal = qiita_config.portal
    bdir = qdb.util.get_db_files_base_dir()
    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    data = []
    for s in studies:
        # [0] latest is first, [1] only getting the filepath
        sample_fp = relpath(s.sample_template.get_filepaths()[0][1], bdir)

        for a in s.artifacts(artifact_type='BIOM'):
            if a.processing_parameters is None:
                continue

            cmd_name = a.processing_parameters.command.name

            # this loop is necessary as in theory an artifact can be
            # generated from multiple prep info files
            human_cmd = []
            for p in a.parents:
                pp = p.processing_parameters
                pp_cmd_name = pp.command.name
                if pp_cmd_name == 'Trimming':
                    human_cmd.append('%s @ %s' %
                                     (cmd_name, str(pp.values['length'])))
                else:
                    human_cmd.append('%s, %s' % (cmd_name, pp_cmd_name))
            human_cmd = ', '.join(human_cmd)

            for _, fp, fp_type in a.filepaths:
                if fp_type != 'biom' or 'only-16s' in fp:
                    continue
                fp = relpath(fp, bdir)
                # format: (biom_fp, sample_fp, prep_fp, qiita_artifact_id,
                #          human readable name)
                for pt in a.prep_templates:
                    for _, prep_fp in pt.get_filepaths():
                        if 'qiime' not in prep_fp:
                            break
                    prep_fp = relpath(prep_fp, bdir)
                    data.append((fp, sample_fp, prep_fp, a.id, human_cmd))

    # writing text and tgz file
    ts = datetime.now().strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases')
    if not exists(tgz_dir):
        makedirs(tgz_dir)
    tgz_name = join(tgz_dir, '%s-%s-building.tgz' % (portal, study_status))
    tgz_name_final = join(tgz_dir, '%s-%s.tgz' % (portal, study_status))
    txt_hd = StringIO()
    with topen(tgz_name, "w|gz") as tgz:
        # writing header for txt
        txt_hd.write(
            "biom_fp\tsample_fp\tprep_fp\tqiita_artifact_id\tcommand\n")
        for biom_fp, sample_fp, prep_fp, artifact_id, human_cmd in data:
            txt_hd.write("%s\t%s\t%s\t%s\t%s\n" %
                         (biom_fp, sample_fp, prep_fp, artifact_id, human_cmd))
            tgz.add(join(bdir, biom_fp), arcname=biom_fp, recursive=False)
            tgz.add(join(bdir, sample_fp), arcname=sample_fp, recursive=False)
            tgz.add(join(bdir, prep_fp), arcname=prep_fp, recursive=False)

        txt_hd.seek(0)
        info = TarInfo(name='%s-%s-%s.txt' % (portal, study_status, ts))
        info.size = len(txt_hd.buf)
        tgz.addfile(tarinfo=info, fileobj=txt_hd)

    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)

    rename(tgz_name, tgz_name_final)

    vals = [('filepath', tgz_name_final[len(working_dir):], r_client.set),
            ('md5sum', md5sum.hexdigest(), r_client.set),
            ('time', time, r_client.set)]
    for k, v, f in vals:
        redis_key = '%s:release:%s:%s' % (portal, study_status, k)
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)
Example #22
def update_redis_stats():
    """Generate the system stats and save them in redis

    Returns
    -------
    list of str
        artifact filepaths that are not present in the file system
    """
    STUDY = qdb.study.Study

    number_studies = {'public': 0, 'private': 0, 'sandbox': 0}
    number_of_samples = {'public': 0, 'private': 0, 'sandbox': 0}
    num_studies_ebi = 0
    num_samples_ebi = 0
    number_samples_ebi_prep = 0
    stats = []
    missing_files = []
    per_data_type_stats = Counter()
    for study in STUDY.iter():
        st = study.sample_template
        if st is None:
            continue

        # counting samples submitted to EBI-ENA
        len_samples_ebi = sum(
            [esa is not None for esa in st.ebi_sample_accessions.values()])
        if len_samples_ebi != 0:
            num_studies_ebi += 1
            num_samples_ebi += len_samples_ebi

        samples_status = defaultdict(set)
        for pt in study.prep_templates():
            pt_samples = list(pt.keys())
            pt_status = pt.status
            if pt_status == 'public':
                per_data_type_stats[pt.data_type()] += len(pt_samples)
            samples_status[pt_status].update(pt_samples)
            # counting experiments (samples in preps) submitted to EBI-ENA
            number_samples_ebi_prep += sum([
                esa is not None
                for esa in pt.ebi_experiment_accessions.values()
            ])

        # counting studies
        if 'public' in samples_status:
            number_studies['public'] += 1
        elif 'private' in samples_status:
            number_studies['private'] += 1
        else:
            # note that this is a catch-all for other statuses; at the time
            # of writing the only other one is awaiting_approval
            number_studies['sandbox'] += 1

        # counting samples; note that some of these lines could be merged with
        # the block above but I decided to split it in 2 for clarity
        if 'public' in samples_status:
            number_of_samples['public'] += len(samples_status['public'])
        if 'private' in samples_status:
            number_of_samples['private'] += len(samples_status['private'])
        if 'sandbox' in samples_status:
            number_of_samples['sandbox'] += len(samples_status['sandbox'])

        # processing filepaths
        for artifact in study.artifacts():
            for adata in artifact.filepaths:
                try:
                    s = stat(adata['fp'])
                except OSError:
                    missing_files.append(adata['fp'])
                else:
                    stats.append((adata['fp_type'], s.st_size,
                                  strftime('%Y-%m', localtime(s.st_mtime))))

    num_users = qdb.util.get_count('qiita.qiita_user')
    num_processing_jobs = qdb.util.get_count('qiita.processing_job')

    lat_longs = dumps(get_lat_longs())

    summary = {}
    all_dates = []
    # these are some filetypes that are too small to plot alone so we'll merge
    # them into 'other'
    group_other = {
        'html_summary', 'tgz', 'directory', 'raw_fasta', 'log', 'biom',
        'raw_sff', 'raw_qual', 'qza', 'html_summary_dir', 'qza', 'plain_text',
        'raw_barcodes'
    }
    for ft, size, ym in stats:
        if ft in group_other:
            ft = 'other'
        if ft not in summary:
            summary[ft] = {}
        if ym not in summary[ft]:
            summary[ft][ym] = 0
            all_dates.append(ym)
        summary[ft][ym] += size
    all_dates = sorted(set(all_dates))

    # sorting summaries
    ordered_summary = {}
    for dt in summary:
        new_list = []
        current_value = 0
        for ad in all_dates:
            if ad in summary[dt]:
                current_value += summary[dt][ad]
            new_list.append(current_value)
        ordered_summary[dt] = new_list

    plot_order = sorted([(k, ordered_summary[k][-1]) for k in ordered_summary],
                        key=lambda x: x[1])

    # helper function to generate y axis, modified from:
    # http://stackoverflow.com/a/1094933
    def sizeof_fmt(value, position):
        number = None
        for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
            if abs(value) < 1024.0:
                number = "%3.1f%s" % (value, unit)
                break
            value /= 1024.0
        if number is None:
            number = "%.1f%s" % (value, 'Yi')
        return number

    all_dates_axis = range(len(all_dates))
    plt.locator_params(axis='y', nbins=10)
    plt.figure(figsize=(20, 10))
    for k, v in plot_order:
        plt.plot(all_dates_axis, ordered_summary[k], linewidth=2, label=k)

    plt.xticks(all_dates_axis, all_dates)
    plt.legend()
    plt.grid()
    ax = plt.gca()
    ax.yaxis.set_major_formatter(mpl.ticker.FuncFormatter(sizeof_fmt))
    plt.xticks(rotation=90)
    plt.xlabel('Date')
    plt.ylabel('Storage space per data type')

    plot = BytesIO()
    plt.savefig(plot, format='png')
    plot.seek(0)
    img = 'data:image/png;base64,' + quote(b64encode(plot.getbuffer()))

    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    portal = qiita_config.portal
    # making sure per_data_type_stats has some data so hmset doesn't fail
    if per_data_type_stats == {}:
        per_data_type_stats['No data'] = 0

    vals = [('number_studies', number_studies, r_client.hmset),
            ('number_of_samples', number_of_samples, r_client.hmset),
            ('per_data_type_stats', dict(per_data_type_stats), r_client.hmset),
            ('num_users', num_users, r_client.set),
            ('lat_longs', (lat_longs), r_client.set),
            ('num_studies_ebi', num_studies_ebi, r_client.set),
            ('num_samples_ebi', num_samples_ebi, r_client.set),
            ('number_samples_ebi_prep', number_samples_ebi_prep, r_client.set),
            ('img', img, r_client.set), ('time', time, r_client.set),
            ('num_processing_jobs', num_processing_jobs, r_client.set)]
    for k, v, f in vals:
        redis_key = '%s:stats:%s' % (portal, k)
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)

    # preparing vals to insert into DB
    vals = dumps(dict([x[:-1] for x in vals]))
    sql = """INSERT INTO qiita.stats_daily (stats, stats_timestamp)
             VALUES (%s, NOW())"""
    qdb.sql_connection.perform_as_transaction(sql, [vals])

    return missing_files
Example #23
def generate_biom_and_metadata_release(study_status='public'):
    """Generate a list of biom/meatadata filepaths and a tgz of those files

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having this exposed helps with testing. The other
        options are 'private' and 'sandbox'
    """
    studies = qdb.study.Study.get_by_status(study_status)
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir
    portal = qiita_config.portal
    bdir = qdb.util.get_db_files_base_dir()
    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    data = []
    for s in studies:
        # [0] latest is first, [1] only getting the filepath
        sample_fp = relpath(s.sample_template.get_filepaths()[0][1], bdir)

        for a in s.artifacts(artifact_type='BIOM'):
            if a.processing_parameters is None or a.visibility != study_status:
                continue

            merging_schemes, parent_softwares = a.merging_scheme
            software = a.processing_parameters.command.software
            software = '%s v%s' % (software.name, software.version)

            for x in a.filepaths:
                if x['fp_type'] != 'biom' or 'only-16s' in x['fp']:
                    continue
                fp = relpath(x['fp'], bdir)
                for pt in a.prep_templates:
                    categories = pt.categories()
                    platform = ''
                    target_gene = ''
                    if 'platform' in categories:
                        platform = ', '.join(
                            set(pt.get_category('platform').values()))
                    if 'target_gene' in categories:
                        target_gene = ', '.join(
                            set(pt.get_category('target_gene').values()))
                    for _, prep_fp in pt.get_filepaths():
                        if 'qiime' not in prep_fp:
                            break
                    prep_fp = relpath(prep_fp, bdir)
                    # format: (biom_fp, sample_fp, prep_fp, qiita_artifact_id,
                    #          platform, target gene, merging schemes,
                    #          artifact software/version,
                    #          parent software/version)
                    data.append((fp, sample_fp, prep_fp, a.id, platform,
                                 target_gene, merging_schemes, software,
                                 parent_softwares))

    # writing text and tgz file
    ts = datetime.now().strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases')
    create_nested_path(tgz_dir)
    tgz_name = join(tgz_dir, '%s-%s-building.tgz' % (portal, study_status))
    tgz_name_final = join(tgz_dir, '%s-%s.tgz' % (portal, study_status))
    txt_lines = [
        "biom fp\tsample fp\tprep fp\tqiita artifact id\tplatform\t"
        "target gene\tmerging scheme\tartifact software\tparent software"]
    with topen(tgz_name, "w|gz") as tgz:
        for biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv in data:
            txt_lines.append("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv))
            tgz.add(join(bdir, biom_fp), arcname=biom_fp, recursive=False)
            tgz.add(join(bdir, sample_fp), arcname=sample_fp, recursive=False)
            tgz.add(join(bdir, prep_fp), arcname=prep_fp, recursive=False)
        info = TarInfo(name='%s-%s-%s.txt' % (portal, study_status, ts))
        txt_hd = BytesIO()
        txt_hd.write(bytes('\n'.join(txt_lines), 'ascii'))
        txt_hd.seek(0)
        info.size = len(txt_hd.read())
        txt_hd.seek(0)
        tgz.addfile(tarinfo=info, fileobj=txt_hd)

    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)

    rename(tgz_name, tgz_name_final)

    vals = [
        ('filepath', tgz_name_final[len(working_dir):], r_client.set),
        ('md5sum', md5sum.hexdigest(), r_client.set),
        ('time', time, r_client.set)]
    for k, v, f in vals:
        redis_key = '%s:release:%s:%s' % (portal, study_status, k)
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)
Example #24
def generate_plugin_releases():
    """Generate releases for plugins
    """
    ARCHIVE = qdb.archive.Archive
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir

    commands = [
        c for s in qdb.software.Software.iter(active=True) for c in s.commands
        if c.post_processing_cmd is not None
    ]

    tnow = datetime.now()
    ts = tnow.strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases', 'archive')
    create_nested_path(tgz_dir)
    tgz_dir_release = join(tgz_dir, ts)
    create_nested_path(tgz_dir_release)
    for cmd in commands:
        cmd_name = cmd.name
        mschemes = [
            v for _, v in ARCHIVE.merging_schemes().items() if cmd_name in v
        ]
        for ms in mschemes:
            ms_name = sub('[^0-9a-zA-Z]+', '', ms)
            ms_fp = join(tgz_dir_release, ms_name)
            create_nested_path(ms_fp)

            pfp = join(ms_fp, 'archive.json')
            archives = {
                k: loads(v)
                for k, v in ARCHIVE.retrieve_feature_values(
                    archive_merging_scheme=ms).items() if v != ''
            }
            with open(pfp, 'w') as f:
                dump(archives, f)

            # now let's run the post_processing_cmd
            ppc = cmd.post_processing_cmd

            # concatenate any other parameters into a string
            params = ' '.join(
                ["%s=%s" % (k, v) for k, v in ppc['script_params'].items()])
            # append archives file and output dir parameters
            params = ("%s --fp_archive=%s --output_dir=%s" %
                      (params, pfp, ms_fp))

            ppc_cmd = "%s %s %s" % (ppc['script_env'], ppc['script_path'],
                                    params)
            p_out, p_err, rv = qdb.processing_job._system_call(ppc_cmd)
            p_out = p_out.rstrip()
            if rv != 0:
                raise ValueError('Error %d: %s' % (rv, p_out))
            p_out = loads(p_out)

    # tgz-ing all files
    tgz_name = join(tgz_dir, 'archive-%s-building.tgz' % ts)
    tgz_name_final = join(tgz_dir, 'archive.tgz')
    with topen(tgz_name, "w|gz") as tgz:
        tgz.add(tgz_dir_release, arcname=basename(tgz_dir_release))
    # getting the release md5
    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)
    rename(tgz_name, tgz_name_final)
    vals = [('filepath', tgz_name_final[len(working_dir):], r_client.set),
            ('md5sum', md5sum.hexdigest(), r_client.set),
            ('time', tnow.strftime('%m-%d-%y %H:%M:%S'), r_client.set)]
    for k, v, f in vals:
        redis_key = 'release-archive:%s' % k
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)
Example #25
def update_redis_stats():
    """Generate the system stats and save them in redis

    Returns
    -------
    list of str
        artifact filepaths that are not present in the file system
    """
    STUDY = qdb.study.Study
    studies = {'public': STUDY.get_by_status('public'),
               'private': STUDY.get_by_status('private'),
               'sandbox': STUDY.get_by_status('sandbox')}
    number_studies = {k: len(v) for k, v in viewitems(studies)}

    number_of_samples = {}
    ebi_samples_prep = {}
    num_samples_ebi = 0
    for k, sts in viewitems(studies):
        number_of_samples[k] = 0
        for s in sts:
            st = s.sample_template
            if st is not None:
                number_of_samples[k] += len(list(st.keys()))

            ebi_samples_prep_count = 0
            for pt in s.prep_templates():
                ebi_samples_prep_count += len([
                    1 for _, v in viewitems(pt.ebi_experiment_accessions)
                    if v is not None and v != ''])
            ebi_samples_prep[s.id] = ebi_samples_prep_count

            if s.sample_template is not None:
                num_samples_ebi += len([
                    1 for _, v in viewitems(
                        s.sample_template.ebi_sample_accessions)
                    if v is not None and v != ''])

    num_users = qdb.util.get_count('qiita.qiita_user')

    lat_longs = get_lat_longs()

    num_studies_ebi = len([k for k, v in viewitems(ebi_samples_prep)
                           if v >= 1])
    number_samples_ebi_prep = sum([v for _, v in viewitems(ebi_samples_prep)])

    # generating file size stats
    stats = []
    missing_files = []
    for k, sts in viewitems(studies):
        for s in sts:
            for a in s.artifacts():
                for _, fp, dt in a.filepaths:
                    try:
                        s = stat(fp)
                        stats.append((dt, s.st_size, strftime('%Y-%m',
                                      localtime(s.st_ctime))))
                    except OSError:
                        missing_files.append(fp)

    summary = {}
    all_dates = []
    for ft, size, ym in stats:
        if ft not in summary:
            summary[ft] = {}
        if ym not in summary[ft]:
            summary[ft][ym] = 0
            all_dates.append(ym)
        summary[ft][ym] += size
    all_dates = sorted(set(all_dates))

    # sorting summaries
    rm_from_data = ['html_summary', 'tgz', 'directory', 'raw_fasta', 'log',
                    'biom', 'raw_sff', 'raw_qual']
    ordered_summary = {}
    for dt in summary:
        if dt in rm_from_data:
            continue
        new_list = []
        current_value = 0
        for ad in all_dates:
            if ad in summary[dt]:
                current_value += summary[dt][ad]
            new_list.append(current_value)
        ordered_summary[dt] = new_list

    plot_order = sorted([(k, ordered_summary[k][-1]) for k in ordered_summary],
                        key=lambda x: x[1])

    # helper function to generate y axis, modified from:
    # http://stackoverflow.com/a/1094933
    def sizeof_fmt(value, position):
        number = None
        for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
            if abs(value) < 1024.0:
                number = "%3.1f%s" % (value, unit)
                break
            value /= 1024.0
        if number is None:
            number = "%.1f%s" % (value, 'Yi')
        return number

    all_dates_axis = range(len(all_dates))
    plt.locator_params(axis='y', nbins=10)
    plt.figure(figsize=(20, 10))
    for k, v in plot_order:
        plt.plot(all_dates_axis, ordered_summary[k], linewidth=2, label=k)

    plt.xticks(all_dates_axis, all_dates)
    plt.legend()
    plt.grid()
    ax = plt.gca()
    ax.yaxis.set_major_formatter(mpl.ticker.FuncFormatter(sizeof_fmt))
    plt.xticks(rotation=90)
    plt.xlabel('Date')
    plt.ylabel('Storage space per data type')

    plot = StringIO()
    plt.savefig(plot, format='png')
    plot.seek(0)
    img = 'data:image/png;base64,' + quote(b64encode(plot.buf))

    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    portal = qiita_config.portal
    vals = [
        ('number_studies', number_studies, r_client.hmset),
        ('number_of_samples', number_of_samples, r_client.hmset),
        ('num_users', num_users, r_client.set),
        ('lat_longs', lat_longs, r_client.set),
        ('num_studies_ebi', num_studies_ebi, r_client.set),
        ('num_samples_ebi', num_samples_ebi, r_client.set),
        ('number_samples_ebi_prep', number_samples_ebi_prep, r_client.set),
        ('img', img, r_client.set),
        ('time', time, r_client.set)]
    for k, v, f in vals:
        redis_key = '%s:stats:%s' % (portal, k)
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)

    return missing_files