Example 1
File: upload.py Project: jlab/qiita
    def post(self, study_id):
        method = self.get_argument('remote-request-type')
        url = self.get_argument('inputURL')
        ssh_key = self.request.files['ssh-key'][0]['body']
        status = 'success'
        message = ''

        try:
            study = Study(int(study_id))
        except QiitaDBUnknownIDError:
            raise HTTPError(404, reason="Study %s does not exist" % study_id)
        check_access(self.current_user,
                     study,
                     no_public=True,
                     raise_error=True)

        _, upload_folder = get_mountpoint("uploads")[0]
        upload_folder = join(upload_folder, study_id)
        ssh_key_fp = join(upload_folder, '.key.txt')

        create_nested_path(upload_folder)

        with open(ssh_key_fp, 'wb') as f:
            f.write(ssh_key)
        chmod(ssh_key_fp, 0o600)

        qiita_plugin = Software.from_name_and_version('Qiita', 'alpha')
        if method == 'list':
            cmd = qiita_plugin.get_command('list_remote_files')
            params = Parameters.load(cmd,
                                     values_dict={
                                         'url': url,
                                         'private_key': ssh_key_fp,
                                         'study_id': study_id
                                     })
        elif method == 'transfer':
            cmd = qiita_plugin.get_command('download_remote_files')
            params = Parameters.load(cmd,
                                     values_dict={
                                         'url': url,
                                         'private_key': ssh_key_fp,
                                         'destination': upload_folder
                                     })
        else:
            status = 'error'
            message = 'Not a valid method'

        if status == 'success':
            job = ProcessingJob.create(self.current_user, params, True)
            job.submit()
            r_client.set(UPLOAD_STUDY_FORMAT % study_id,
                         dumps({'job_id': job.id}))

        self.write({'status': status, 'message': message})
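Every example in this listing relies on create_nested_path to make sure the target directory exists before anything is written into it. A minimal sketch of such a helper, assuming it is just a thin wrapper around os.makedirs (the real qiita_db implementation may differ):

from os import makedirs
from os.path import isdir


def create_nested_path(path):
    """Create `path` plus any missing parent directories."""
    if not isdir(path):
        # exist_ok guards against a race with a concurrent request creating
        # the same folder between the isdir check and makedirs
        makedirs(path, exist_ok=True)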
Example 2
    def post(self, study_id):
        method = self.get_argument('remote-request-type')
        url = self.get_argument('inputURL')
        ssh_key = self.request.files['ssh-key'][0]['body']
        status = 'success'
        message = ''

        try:
            study = Study(int(study_id))
        except QiitaDBUnknownIDError:
            raise HTTPError(404, reason="Study %s does not exist" % study_id)
        check_access(
            self.current_user, study, no_public=True, raise_error=True)

        _, upload_folder = get_mountpoint("uploads")[0]
        upload_folder = join(upload_folder, study_id)
        ssh_key_fp = join(upload_folder, '.key.txt')

        create_nested_path(upload_folder)

        # the uploaded key body is bytes, so the file must be opened in
        # binary mode
        with open(ssh_key_fp, 'wb') as f:
            f.write(ssh_key)

        qiita_plugin = Software.from_name_and_version('Qiita', 'alpha')
        if method == 'list':
            cmd = qiita_plugin.get_command('list_remote_files')
            params = Parameters.load(cmd, values_dict={
                'url': url, 'private_key': ssh_key_fp, 'study_id': study_id})
        elif method == 'transfer':
            cmd = qiita_plugin.get_command('download_remote_files')
            params = Parameters.load(cmd, values_dict={
                'url': url, 'private_key': ssh_key_fp,
                'destination': upload_folder})
        else:
            status = 'error'
            message = 'Not a valid method'

        if status == 'success':
            job = ProcessingJob.create(self.current_user, params, True)
            job.submit()
            r_client.set(
                UPLOAD_STUDY_FORMAT % study_id, dumps({'job_id': job.id}))

        self.write({'status': status, 'message': message})
Example 3
File: upload.py Project: jlab/qiita
    def post(self):
        resumable_identifier = self.get_argument('resumableIdentifier')
        resumable_filename = self.get_argument('resumableFilename')
        resumable_chunk_number = int(self.get_argument('resumableChunkNumber'))
        resumable_total_chunks = int(self.get_argument('resumableTotalChunks'))
        study_id = self.get_argument('study_id')
        data = self.request.files['file'][0]['body']

        check_access(self.current_user,
                     Study(int(study_id)),
                     no_public=True,
                     raise_error=True)

        self.validate_file_extension(resumable_filename)

        _, base_fp = get_mountpoint("uploads")[0]

        # create a temporary folder for the file upload
        temp_dir = join(base_fp, study_id, resumable_identifier)
        create_nested_path(temp_dir)

        # location of the file as it is transmitted
        temporary_location = join(temp_dir, resumable_filename)

        # an existing file here is left over from a previously failed upload
        if resumable_chunk_number == 1 and exists(temporary_location):
            remove(temporary_location)

        # append every transmitted chunk
        with open(temporary_location, 'ab') as tmp_file:
            tmp_file.write(bytes(data))

        if resumable_chunk_number == resumable_total_chunks:
            final_location = join(base_fp, study_id, resumable_filename)

            if exists(final_location):
                remove(final_location)

            move(temporary_location, final_location)
            rmtree(temp_dir)
            self.set_status(200)
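The handler above implements a resumable chunked upload: every chunk is appended to a temporary file keyed by resumableIdentifier, and once the last chunk arrives the assembled file is promoted to its final location. A hedged client-side sketch of how such an endpoint might be exercised; the URL, chunk size and identifier scheme are assumptions for illustration, not part of the project:

import math
import requests

CHUNK_SIZE = 1024 * 1024  # 1 MiB per chunk (assumed)


def upload_in_chunks(path, study_id, url='https://qiita.example.org/upload/'):
    with open(path, 'rb') as f:
        payload = f.read()
    total = max(1, math.ceil(len(payload) / CHUNK_SIZE))
    for i in range(total):
        chunk = payload[i * CHUNK_SIZE:(i + 1) * CHUNK_SIZE]
        # field names mirror the arguments read by the handler above
        requests.post(url, data={
            'resumableIdentifier': '%s-%s' % (study_id, path),
            'resumableFilename': path,
            'resumableChunkNumber': i + 1,
            'resumableTotalChunks': total,
            'study_id': study_id,
        }, files={'file': chunk})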
Example 4
    def post(self):
        resumable_identifier = self.get_argument('resumableIdentifier')
        resumable_filename = self.get_argument('resumableFilename')
        resumable_chunk_number = int(self.get_argument('resumableChunkNumber'))
        resumable_total_chunks = int(self.get_argument('resumableTotalChunks'))
        study_id = self.get_argument('study_id')
        data = self.request.files['file'][0]['body']

        check_access(self.current_user, Study(int(study_id)),
                     no_public=True, raise_error=True)

        self.validate_file_extension(resumable_filename)

        _, base_fp = get_mountpoint("uploads")[0]

        # create a temporary folder for the file upload
        temp_dir = join(base_fp, study_id, resumable_identifier)
        create_nested_path(temp_dir)

        # location of the file as it is transmitted
        temporary_location = join(temp_dir, resumable_filename)

        # an existing file here is left over from a previously failed upload
        if resumable_chunk_number == 1 and exists(temporary_location):
            remove(temporary_location)

        # append every transmitted chunk
        with open(temporary_location, 'ab') as tmp_file:
            tmp_file.write(bytes(data))

        if resumable_chunk_number == resumable_total_chunks:
            final_location = join(base_fp, study_id, resumable_filename)

            if exists(final_location):
                remove(final_location)

            move(temporary_location, final_location)
            rmtree(temp_dir)
            self.set_status(200)
Example 5
def generate_plugin_releases():
    """Generate releases for plugins
    """
    ARCHIVE = qdb.archive.Archive
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir

    commands = [
        c for s in qdb.software.Software.iter(active=True) for c in s.commands
        if c.post_processing_cmd is not None
    ]

    tnow = datetime.now()
    ts = tnow.strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases', 'archive')
    create_nested_path(tgz_dir)
    tgz_dir_release = join(tgz_dir, ts)
    create_nested_path(tgz_dir_release)
    for cmd in commands:
        cmd_name = cmd.name
        mschemes = [
            v for _, v in ARCHIVE.merging_schemes().items() if cmd_name in v
        ]
        for ms in mschemes:
            ms_name = sub('[^0-9a-zA-Z]+', '', ms)
            ms_fp = join(tgz_dir_release, ms_name)
            create_nested_path(ms_fp)

            pfp = join(ms_fp, 'archive.json')
            archives = {
                k: loads(v)
                for k, v in ARCHIVE.retrieve_feature_values(
                    archive_merging_scheme=ms).items() if v != ''
            }
            with open(pfp, 'w') as f:
                dump(archives, f)

            # now let's run the post_processing_cmd
            ppc = cmd.post_processing_cmd

            # concatenate any other parameters into a string
            params = ' '.join(
                ["%s=%s" % (k, v) for k, v in ppc['script_params'].items()])
            # append archives file and output dir parameters
            params = ("%s --fp_archive=%s --output_dir=%s" %
                      (params, pfp, ms_fp))

            ppc_cmd = "%s %s %s" % (ppc['script_env'], ppc['script_path'],
                                    params)
            p_out, p_err, rv = qdb.processing_job._system_call(ppc_cmd)
            p_out = p_out.rstrip()
            if rv != 0:
                raise ValueError('Error %d: %s' % (rv, p_out))
            p_out = loads(p_out)

    # tgz-ing all files
    tgz_name = join(tgz_dir, 'archive-%s-building.tgz' % ts)
    tgz_name_final = join(tgz_dir, 'archive.tgz')
    with topen(tgz_name, "w|gz") as tgz:
        tgz.add(tgz_dir_release, arcname=basename(tgz_dir_release))
    # getting the release md5
    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)
    rename(tgz_name, tgz_name_final)
    vals = [('filepath', tgz_name_final[len(working_dir):], r_client.set),
            ('md5sum', md5sum.hexdigest(), r_client.set),
            ('time', tnow.strftime('%m-%d-%y %H:%M:%S'), r_client.set)]
    for k, v, f in vals:
        redis_key = 'release-archive:%s' % k
        # important to delete ("flush") any stale value before setting the new one
        r_client.delete(redis_key)
        f(redis_key, v)
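The release functions close by checksumming the freshly built tarball in 4096-byte blocks, so even very large archives never have to be loaded into memory at once. The same pattern isolated into a standalone helper (a sketch, not project code):

from hashlib import md5


def file_md5(path, block_size=4096):
    """Return the hex MD5 of a file, read in fixed-size blocks."""
    checksum = md5()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(block_size), b""):
            checksum.update(block)
    return checksum.hexdigest()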
Example 6
def generate_biom_and_metadata_release(study_status='public'):
    """Generate a list of biom/meatadata filepaths and a tgz of those files

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having this exposed helps with testing. The other
        options are 'private' and 'sandbox'
    """
    studies = qdb.study.Study.get_by_status(study_status)
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir
    portal = qiita_config.portal
    bdir = qdb.util.get_db_files_base_dir()
    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    data = []
    for s in studies:
        # [0] latest is first, [1] only getting the filepath
        sample_fp = relpath(s.sample_template.get_filepaths()[0][1], bdir)

        for a in s.artifacts(artifact_type='BIOM'):
            if a.processing_parameters is None or a.visibility != study_status:
                continue

            merging_schemes, parent_softwares = a.merging_scheme
            software = a.processing_parameters.command.software
            software = '%s v%s' % (software.name, software.version)

            for x in a.filepaths:
                if x['fp_type'] != 'biom' or 'only-16s' in x['fp']:
                    continue
                fp = relpath(x['fp'], bdir)
                for pt in a.prep_templates:
                    categories = pt.categories()
                    platform = ''
                    target_gene = ''
                    if 'platform' in categories:
                        platform = ', '.join(
                            set(pt.get_category('platform').values()))
                    if 'target_gene' in categories:
                        target_gene = ', '.join(
                            set(pt.get_category('target_gene').values()))
                    for _, prep_fp in pt.get_filepaths():
                        if 'qiime' not in prep_fp:
                            break
                    prep_fp = relpath(prep_fp, bdir)
                    # format: (biom_fp, sample_fp, prep_fp, qiita_artifact_id,
                    #          platform, target gene, merging schemes,
                    #          artifact software/version,
                    #          parent software/version)
                    data.append(
                        (fp, sample_fp, prep_fp, a.id, platform, target_gene,
                         merging_schemes, software, parent_softwares))

    # writing text and tgz file
    ts = datetime.now().strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases')
    create_nested_path(tgz_dir)
    tgz_name = join(tgz_dir, '%s-%s-building.tgz' % (portal, study_status))
    tgz_name_final = join(tgz_dir, '%s-%s.tgz' % (portal, study_status))
    txt_lines = [
        "biom fp\tsample fp\tprep fp\tqiita artifact id\tplatform\t"
        "target gene\tmerging scheme\tartifact software\tparent software"
    ]
    with topen(tgz_name, "w|gz") as tgz:
        for biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv in data:
            txt_lines.append(
                "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %
                (biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv))
            tgz.add(join(bdir, biom_fp), arcname=biom_fp, recursive=False)
            tgz.add(join(bdir, sample_fp), arcname=sample_fp, recursive=False)
            tgz.add(join(bdir, prep_fp), arcname=prep_fp, recursive=False)
        info = TarInfo(name='%s-%s-%s.txt' % (portal, study_status, ts))
        txt_hd = BytesIO()
        txt_hd.write(bytes('\n'.join(txt_lines), 'ascii'))
        txt_hd.seek(0)
        info.size = len(txt_hd.read())
        txt_hd.seek(0)
        tgz.addfile(tarinfo=info, fileobj=txt_hd)

    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)

    rename(tgz_name, tgz_name_final)

    vals = [('filepath', tgz_name_final[len(working_dir):], r_client.set),
            ('md5sum', md5sum.hexdigest(), r_client.set),
            ('time', time, r_client.set)]
    for k, v, f in vals:
        redis_key = '%s:release:%s:%s' % (portal, study_status, k)
        # important to delete ("flush") any stale value before setting the new one
        r_client.delete(redis_key)
        f(redis_key, v)
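generate_biom_and_metadata_release also writes the tab-separated index straight into the tarball without touching disk: tarfile.addfile needs a TarInfo whose size is set explicitly plus a bytes file object positioned at the start. A reduced sketch of that in-memory member trick (names are illustrative):

from io import BytesIO
from tarfile import TarInfo
from tarfile import open as topen


def write_tgz_with_index(tgz_path, member_name, text):
    payload = BytesIO(text.encode('ascii'))
    info = TarInfo(name=member_name)
    # the member size must be declared before addfile is called
    info.size = len(payload.getvalue())
    with topen(tgz_path, "w|gz") as tgz:
        tgz.addfile(tarinfo=info, fileobj=payload)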
Example 7
    def generate_demultiplexed_fastq(self, rewrite_fastq=False, mtime=None):
        """Generates demultiplexed fastq

        Parameters
        ----------
        rewrite_fastq : bool, optional
            If true, it forces the rewrite of the fastq files
        mtime : float, optional
            The time to use when creating the gz files. If None, the current
            time will be used by gzip.GzipFile. This is useful for testing.

        Returns
        -------
        demux_samples
            List of successfully demultiplexed samples

        Notes
        -----
        - As a performance feature, this method checks whether
        self.full_ebi_dir already exists; if it does, it assumes that this
        step was completed correctly in a previous execution and simply
        reads the file names from self.full_ebi_dir
        - When the object is created (init), samples, samples_prep and
        sample_demux_fps hold values for all available samples in the database.
        Here some of those values are deleted (the del's within the loops) for
        the cases where the fastq.gz files weren't written or don't exist.
        This indicates that those samples had no sequences, and such files are
        not accepted by EBI

        Raises
        ------
        EBISubmissionError
            - The demux file couldn't be read
            - All samples are removed
        """
        dir_not_exists = not isdir(self.full_ebi_dir)
        missing_samples = []
        if dir_not_exists or rewrite_fastq:
            # if it exists, remove folder and start from scratch
            if isdir(self.full_ebi_dir):
                rmtree(self.full_ebi_dir)

            create_nested_path(self.full_ebi_dir)

            if self.artifact.artifact_type == 'per_sample_FASTQ':
                demux_samples, missing_samples = \
                    self._generate_demultiplexed_fastq_per_sample_FASTQ()
            else:
                demux_samples = self._generate_demultiplexed_fastq_demux(mtime)
        else:
            # reaching this else branch means the raw files were already
            # generated but the submission failed for some reason; rather than
            # regenerating them, just check which files exist on disk to build
            # the final list of samples
            demux_samples = set()
            extension = self.FWD_READ_SUFFIX
            extension_len = len(extension)
            all_missing_files = set()
            for f in listdir(self.full_ebi_dir):
                fpath = join(self.full_ebi_dir, f)
                if isfile(fpath) and f.endswith(extension):
                    demux_samples.add(f[:-extension_len])
                else:
                    all_missing_files.add(f[:-extension_len])
            # at this stage we have reviewed all the files and collected all
            # the sample names; however, we don't yet know whether we have
            # only forward reads or also reverse reads. The easiest way to
            # find out is to inspect all_missing_files
            missing_files = all_missing_files - demux_samples
            if missing_files != all_missing_files:
                self.per_sample_FASTQ_reverse = True

            missing_samples = set(
                self.samples.keys()).difference(demux_samples)

        if missing_samples:
            for ms in missing_samples:
                del self.samples[ms]
                del self.samples_prep[ms]
                del self.sample_demux_fps[ms]

        if not demux_samples:
            error_msg = ("All samples were removed from the submission "
                         "because the demux file is empty or the sample names "
                         "do not match.")
            LogEntry.create('Runtime', error_msg)
            raise EBISubmissionError(error_msg)

        return demux_samples
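In the recovery branch above, forward reads are recognized by FWD_READ_SUFFIX and every other basename is collected separately; if any of those extra basenames match a known sample, reverse reads must be present as well. The same set logic isolated into a small sketch, with an assumed suffix value (the real class defines its own constants):

from os import listdir
from os.path import isfile, join

FWD_READ_SUFFIX = '.R1.fastq.gz'  # assumption for illustration


def recover_samples(ebi_dir):
    demux_samples = set()
    other_basenames = set()
    for f in listdir(ebi_dir):
        if isfile(join(ebi_dir, f)) and f.endswith(FWD_READ_SUFFIX):
            demux_samples.add(f[:-len(FWD_READ_SUFFIX)])
        else:
            other_basenames.add(f[:-len(FWD_READ_SUFFIX)])
    # a non-forward file whose trimmed name matches a known sample is a
    # reverse-read file, so reverse reads are present
    has_reverse = bool(other_basenames & demux_samples)
    return demux_samples, has_reverse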
Example 8
File: ebi.py Project: mcmk3/qiita
    def generate_demultiplexed_fastq(self, rewrite_fastq=False, mtime=None):
        """Generates demultiplexed fastq

        Parameters
        ----------
        rewrite_fastq : bool, optional
            If true, it forces the rewrite of the fastq files
        mtime : float, optional
            The time to use when creating the gz files. If None, the current
            time will be used by gzip.GzipFile. This is useful for testing.

        Returns
        -------
        demux_samples
            List of successfully demultiplexed samples

        Notes
        -----
        - As a performance feature, this method checks whether
        self.full_ebi_dir already exists; if it does, it assumes that this
        step was completed correctly in a previous execution and simply
        reads the file names from self.full_ebi_dir
        - When the object is created (init), samples, samples_prep and
        sample_demux_fps hold values for all available samples in the database.
        Here some of those values are deleted (the del's within the loops) for
        the cases where the fastq.gz files weren't written or don't exist.
        This indicates that those samples had no sequences, and such files are
        not accepted by EBI

        Raises
        ------
        EBISubmissionError
            - The demux file couldn't be read
            - All samples are removed
        """
        dir_not_exists = not isdir(self.full_ebi_dir)
        missing_samples = []
        if dir_not_exists or rewrite_fastq:
            # if it exists, remove folder and start from scratch
            if isdir(self.full_ebi_dir):
                rmtree(self.full_ebi_dir)

            create_nested_path(self.full_ebi_dir)

            if self.artifact.artifact_type == 'per_sample_FASTQ':
                demux_samples, missing_samples = \
                    self._generate_demultiplexed_fastq_per_sample_FASTQ()
            else:
                demux_samples = self._generate_demultiplexed_fastq_demux(mtime)
        else:
            # reaching this else branch means the raw files were already
            # generated but the submission failed for some reason; rather than
            # regenerating them, just check which files exist on disk to build
            # the final list of samples
            demux_samples = set()
            extension = self.FWD_READ_SUFFIX
            extension_len = len(extension)
            all_missing_files = set()
            for f in listdir(self.full_ebi_dir):
                fpath = join(self.full_ebi_dir, f)
                if isfile(fpath) and f.endswith(extension):
                    demux_samples.add(f[:-extension_len])
                else:
                    all_missing_files.add(f[:-extension_len])
            # at this stage we have reviewed all the files and collected all
            # the sample names; however, we don't yet know whether we have
            # only forward reads or also reverse reads. The easiest way to
            # find out is to inspect all_missing_files
            missing_files = all_missing_files - demux_samples
            if missing_files != all_missing_files:
                self.per_sample_FASTQ_reverse = True

            missing_samples = set(
                self.samples.keys()).difference(demux_samples)

        if missing_samples:
            for ms in missing_samples:
                del self.samples[ms]
                del self.samples_prep[ms]
                del self.sample_demux_fps[ms]

        if not demux_samples:
            error_msg = ("All samples were removed from the submission "
                         "because the demux file is empty or the sample names "
                         "do not match.")
            LogEntry.create('Runtime', error_msg)
            raise EBISubmissionError(error_msg)

        return demux_samples
Example 9
def generate_plugin_releases():
    """Generate releases for plugins
    """
    ARCHIVE = qdb.archive.Archive
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir

    commands = [c for s in qdb.software.Software.iter(active=True)
                for c in s.commands if c.post_processing_cmd is not None]

    tnow = datetime.now()
    ts = tnow.strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases', 'archive')
    create_nested_path(tgz_dir)
    tgz_dir_release = join(tgz_dir, ts)
    create_nested_path(tgz_dir_release)
    for cmd in commands:
        cmd_name = cmd.name
        mschemes = [v for _, v in ARCHIVE.merging_schemes().items()
                    if cmd_name in v]
        for ms in mschemes:
            ms_name = sub('[^0-9a-zA-Z]+', '', ms)
            ms_fp = join(tgz_dir_release, ms_name)
            create_nested_path(ms_fp)

            pfp = join(ms_fp, 'archive.json')
            archives = {k: loads(v)
                        for k, v in ARCHIVE.retrieve_feature_values(
                              archive_merging_scheme=ms).items()
                        if v != ''}
            with open(pfp, 'w') as f:
                dump(archives, f)

            # now let's run the post_processing_cmd
            ppc = cmd.post_processing_cmd

            # concatenate any other parameters into a string
            params = ' '.join(["%s=%s" % (k, v) for k, v in
                              ppc['script_params'].items()])
            # append archives file and output dir parameters
            params = ("%s --fp_archive=%s --output_dir=%s" % (
                params, pfp, ms_fp))

            ppc_cmd = "%s %s %s" % (
                ppc['script_env'], ppc['script_path'], params)
            p_out, p_err, rv = qdb.processing_job._system_call(ppc_cmd)
            p_out = p_out.rstrip()
            if rv != 0:
                raise ValueError('Error %d: %s' % (rv, p_out))
            p_out = loads(p_out)

    # tgz-ing all files
    tgz_name = join(tgz_dir, 'archive-%s-building.tgz' % ts)
    tgz_name_final = join(tgz_dir, 'archive.tgz')
    with topen(tgz_name, "w|gz") as tgz:
        tgz.add(tgz_dir_release, arcname=basename(tgz_dir_release))
    # getting the release md5
    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)
    rename(tgz_name, tgz_name_final)
    vals = [
        ('filepath', tgz_name_final[len(working_dir):], r_client.set),
        ('md5sum', md5sum.hexdigest(), r_client.set),
        ('time', tnow.strftime('%m-%d-%y %H:%M:%S'), r_client.set)]
    for k, v, f in vals:
        redis_key = 'release-archive:%s' % k
        # important to delete ("flush") any stale value before setting the new one
        r_client.delete(redis_key)
        f(redis_key, v)
Example 10
def generate_biom_and_metadata_release(study_status='public'):
    """Generate a list of biom/meatadata filepaths and a tgz of those files

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having this exposed helps with testing. The other
        options are 'private' and 'sandbox'
    """
    studies = qdb.study.Study.get_by_status(study_status)
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir
    portal = qiita_config.portal
    bdir = qdb.util.get_db_files_base_dir()
    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    data = []
    for s in studies:
        # [0] latest is first, [1] only getting the filepath
        sample_fp = relpath(s.sample_template.get_filepaths()[0][1], bdir)

        for a in s.artifacts(artifact_type='BIOM'):
            if a.processing_parameters is None or a.visibility != study_status:
                continue

            merging_schemes, parent_softwares = a.merging_scheme
            software = a.processing_parameters.command.software
            software = '%s v%s' % (software.name, software.version)

            for x in a.filepaths:
                if x['fp_type'] != 'biom' or 'only-16s' in x['fp']:
                    continue
                fp = relpath(x['fp'], bdir)
                for pt in a.prep_templates:
                    categories = pt.categories()
                    platform = ''
                    target_gene = ''
                    if 'platform' in categories:
                        platform = ', '.join(
                            set(pt.get_category('platform').values()))
                    if 'target_gene' in categories:
                        target_gene = ', '.join(
                            set(pt.get_category('target_gene').values()))
                    for _, prep_fp in pt.get_filepaths():
                        if 'qiime' not in prep_fp:
                            break
                    prep_fp = relpath(prep_fp, bdir)
                    # format: (biom_fp, sample_fp, prep_fp, qiita_artifact_id,
                    #          platform, target gene, merging schemes,
                    #          artifact software/version,
                    #          parent software/version)
                    data.append((fp, sample_fp, prep_fp, a.id, platform,
                                 target_gene, merging_schemes, software,
                                 parent_softwares))

    # writing text and tgz file
    ts = datetime.now().strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases')
    create_nested_path(tgz_dir)
    tgz_name = join(tgz_dir, '%s-%s-building.tgz' % (portal, study_status))
    tgz_name_final = join(tgz_dir, '%s-%s.tgz' % (portal, study_status))
    txt_lines = [
        "biom fp\tsample fp\tprep fp\tqiita artifact id\tplatform\t"
        "target gene\tmerging scheme\tartifact software\tparent software"]
    with topen(tgz_name, "w|gz") as tgz:
        for biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv in data:
            txt_lines.append("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv))
            tgz.add(join(bdir, biom_fp), arcname=biom_fp, recursive=False)
            tgz.add(join(bdir, sample_fp), arcname=sample_fp, recursive=False)
            tgz.add(join(bdir, prep_fp), arcname=prep_fp, recursive=False)
        info = TarInfo(name='%s-%s-%s.txt' % (portal, study_status, ts))
        txt_hd = BytesIO()
        txt_hd.write(bytes('\n'.join(txt_lines), 'ascii'))
        txt_hd.seek(0)
        info.size = len(txt_hd.read())
        txt_hd.seek(0)
        tgz.addfile(tarinfo=info, fileobj=txt_hd)

    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)

    rename(tgz_name, tgz_name_final)

    vals = [
        ('filepath', tgz_name_final[len(working_dir):], r_client.set),
        ('md5sum', md5sum.hexdigest(), r_client.set),
        ('time', time, r_client.set)]
    for k, v, f in vals:
        redis_key = '%s:release:%s:%s' % (portal, study_status, k)
        # important to delete ("flush") any stale value before setting the new one
        r_client.delete(redis_key)
        f(redis_key, v)
Example 11
def generate_biom_and_metadata_release(study_status='public'):
    """Generate a list of biom/meatadata filepaths and a tgz of those files

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having this exposed helps with testing. The other
        options are 'private' and 'sandbox'
    """
    studies = qdb.study.Study.get_by_status(study_status)
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir
    portal = qiita_config.portal
    bdir = qdb.util.get_db_files_base_dir()
    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    data = []
    for s in studies:
        # [0] latest is first, [1] only getting the filepath
        sample_fp = relpath(s.sample_template.get_filepaths()[0][1], bdir)

        for a in s.artifacts(artifact_type='BIOM'):
            if a.processing_parameters is None:
                continue

            processing_params = a.processing_parameters
            cmd_name = processing_params.command.name
            ms = processing_params.command.merging_scheme
            software = processing_params.command.software
            software = '%s v%s' % (software.name, software.version)

            # this loop is necessary as in theory an artifact can be
            # generated from multiple prep info files
            afps = [fp for _, fp, _ in a.filepaths if fp.endswith('biom')]
            merging_schemes = []
            parent_softwares = []
            for p in a.parents:
                pparent = p.processing_parameters
                # if the parent has no processing parameters, it was a direct
                # upload; for example per_sample_FASTQ in shotgun data
                if pparent is None:
                    parent_cmd_name = None
                    parent_merging_scheme = None
                    parent_pp = None
                    parent_software = 'N/A'
                else:
                    parent_cmd_name = pparent.command.name
                    parent_merging_scheme = pparent.command.merging_scheme
                    parent_pp = pparent.values
                    psoftware = pparent.command.software
                    parent_software = '%s v%s' % (
                        psoftware.name, psoftware.version)

                merging_schemes.append(qdb.util.human_merging_scheme(
                    cmd_name, ms, parent_cmd_name, parent_merging_scheme,
                    processing_params.values, afps, parent_pp))
                parent_softwares.append(parent_software)
            merging_schemes = ', '.join(merging_schemes)
            parent_softwares = ', '.join(parent_softwares)

            for _, fp, fp_type in a.filepaths:
                if fp_type != 'biom' or 'only-16s' in fp:
                    continue
                fp = relpath(fp, bdir)
                for pt in a.prep_templates:
                    categories = pt.categories()
                    platform = ''
                    target_gene = ''
                    if 'platform' in categories:
                        platform = ', '.join(
                            set(pt.get_category('platform').values()))
                    if 'target_gene' in categories:
                        target_gene = ', '.join(
                            set(pt.get_category('target_gene').values()))
                    for _, prep_fp in pt.get_filepaths():
                        if 'qiime' not in prep_fp:
                            break
                    prep_fp = relpath(prep_fp, bdir)
                    # format: (biom_fp, sample_fp, prep_fp, qiita_artifact_id,
                    #          platform, target gene, merging schemes,
                    #          artifact software/version,
                    #          parent software/version)
                    data.append((fp, sample_fp, prep_fp, a.id, platform,
                                 target_gene, merging_schemes, software,
                                 parent_softwares))

    # writing text and tgz file
    ts = datetime.now().strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases')
    create_nested_path(tgz_dir)
    tgz_name = join(tgz_dir, '%s-%s-building.tgz' % (portal, study_status))
    tgz_name_final = join(tgz_dir, '%s-%s.tgz' % (portal, study_status))
    txt_hd = StringIO()
    with topen(tgz_name, "w|gz") as tgz:
        txt_hd.write(
            "biom fp\tsample fp\tprep fp\tqiita artifact id\tplatform\t"
            "target gene\tmerging scheme\tartifact software\t"
            "parent software\n")
        for biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv in data:
            txt_hd.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
                biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv))
            tgz.add(join(bdir, biom_fp), arcname=biom_fp, recursive=False)
            tgz.add(join(bdir, sample_fp), arcname=sample_fp, recursive=False)
            tgz.add(join(bdir, prep_fp), arcname=prep_fp, recursive=False)

        # tarfile needs a bytes buffer and an explicit member size; the
        # StringIO internal attribute used here previously is not reliable
        txt_bytes = BytesIO(txt_hd.getvalue().encode('ascii'))
        info = TarInfo(name='%s-%s-%s.txt' % (portal, study_status, ts))
        info.size = len(txt_bytes.getvalue())
        tgz.addfile(tarinfo=info, fileobj=txt_bytes)

    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)

    rename(tgz_name, tgz_name_final)

    vals = [
        ('filepath', tgz_name_final[len(working_dir):], r_client.set),
        ('md5sum', md5sum.hexdigest(), r_client.set),
        ('time', time, r_client.set)]
    for k, v, f in vals:
        redis_key = '%s:release:%s:%s' % (portal, study_status, k)
        # important to delete ("flush") any stale value before setting the new one
        r_client.delete(redis_key)
        f(redis_key, v)
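Each release function ends by publishing the tarball location, checksum and timestamp through redis, deleting every key before setting it so readers never pick up a stale value. A sketch of that delete-then-set pattern, assuming r_client is an ordinary redis client (the names below are placeholders, not project code):

from redis import StrictRedis

redis_client = StrictRedis()  # stand-in for qiita's shared r_client


def publish_release(prefix, filepath, md5sum, timestamp):
    for key, value in (('filepath', filepath),
                       ('md5sum', md5sum),
                       ('time', timestamp)):
        redis_key = '%s:%s' % (prefix, key)
        redis_client.delete(redis_key)  # drop any stale value first
        redis_client.set(redis_key, value)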