def post(self, study_id):
    method = self.get_argument('remote-request-type')
    url = self.get_argument('inputURL')
    ssh_key = self.request.files['ssh-key'][0]['body']
    status = 'success'
    message = ''

    try:
        study = Study(int(study_id))
    except QiitaDBUnknownIDError:
        raise HTTPError(404, reason="Study %s does not exist" % study_id)
    check_access(self.current_user, study, no_public=True, raise_error=True)

    _, upload_folder = get_mountpoint("uploads")[0]
    upload_folder = join(upload_folder, study_id)
    ssh_key_fp = join(upload_folder, '.key.txt')

    create_nested_path(upload_folder)

    with open(ssh_key_fp, 'wb') as f:
        f.write(ssh_key)
    chmod(ssh_key_fp, 0o600)

    qiita_plugin = Software.from_name_and_version('Qiita', 'alpha')
    if method == 'list':
        cmd = qiita_plugin.get_command('list_remote_files')
        params = Parameters.load(cmd, values_dict={
            'url': url, 'private_key': ssh_key_fp, 'study_id': study_id})
    elif method == 'transfer':
        cmd = qiita_plugin.get_command('download_remote_files')
        params = Parameters.load(cmd, values_dict={
            'url': url, 'private_key': ssh_key_fp,
            'destination': upload_folder})
    else:
        status = 'error'
        message = 'Not a valid method'

    if status == 'success':
        job = ProcessingJob.create(self.current_user, params, True)
        job.submit()
        r_client.set(UPLOAD_STUDY_FORMAT % study_id,
                     dumps({'job_id': job.id}))

    self.write({'status': status, 'message': message})
def post(self, study_id):
    method = self.get_argument('remote-request-type')
    url = self.get_argument('inputURL')
    ssh_key = self.request.files['ssh-key'][0]['body']
    status = 'success'
    message = ''

    try:
        study = Study(int(study_id))
    except QiitaDBUnknownIDError:
        raise HTTPError(404, reason="Study %s does not exist" % study_id)
    check_access(
        self.current_user, study, no_public=True, raise_error=True)

    _, upload_folder = get_mountpoint("uploads")[0]
    upload_folder = join(upload_folder, study_id)
    ssh_key_fp = join(upload_folder, '.key.txt')

    create_nested_path(upload_folder)

    with open(ssh_key_fp, 'w') as f:
        f.write(ssh_key)

    qiita_plugin = Software.from_name_and_version('Qiita', 'alpha')
    if method == 'list':
        cmd = qiita_plugin.get_command('list_remote_files')
        params = Parameters.load(cmd, values_dict={
            'url': url, 'private_key': ssh_key_fp, 'study_id': study_id})
    elif method == 'transfer':
        cmd = qiita_plugin.get_command('download_remote_files')
        params = Parameters.load(cmd, values_dict={
            'url': url, 'private_key': ssh_key_fp,
            'destination': upload_folder})
    else:
        status = 'error'
        message = 'Not a valid method'

    if status == 'success':
        job = ProcessingJob.create(self.current_user, params, True)
        job.submit()
        r_client.set(
            UPLOAD_STUDY_FORMAT % study_id, dumps({'job_id': job.id}))

    self.write({'status': status, 'message': message})
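# Both handler variants above read the same multipart form fields
# ('remote-request-type', 'inputURL' and the 'ssh-key' file). A minimal
# client-side sketch follows; only the field names come from the
# get_argument/request.files calls above, while the endpoint path and the
# requests dependency are assumptions for illustration, not the Qiita API.
import requests  # assumed client-side dependency


def request_remote_listing(base_url, study_id, remote_url, key_path):
    # 'list' asks the server to enumerate files at the remote URL;
    # 'transfer' would instead download them into the study upload folder
    with open(key_path, 'rb') as key:
        return requests.post(
            '%s/study/upload/remote/%s' % (base_url, study_id),  # assumed
            data={'remote-request-type': 'list', 'inputURL': remote_url},
            files={'ssh-key': key})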
def post(self):
    resumable_identifier = self.get_argument('resumableIdentifier')
    resumable_filename = self.get_argument('resumableFilename')
    resumable_chunk_number = int(self.get_argument('resumableChunkNumber'))
    resumable_total_chunks = int(self.get_argument('resumableTotalChunks'))
    study_id = self.get_argument('study_id')
    data = self.request.files['file'][0]['body']

    check_access(self.current_user, Study(int(study_id)), no_public=True,
                 raise_error=True)

    self.validate_file_extension(resumable_filename)

    _, base_fp = get_mountpoint("uploads")[0]

    # creating a temporary folder for the upload of the file
    temp_dir = join(base_fp, study_id, resumable_identifier)
    create_nested_path(temp_dir)

    # location of the file as it is transmitted
    temporary_location = join(temp_dir, resumable_filename)

    # this is the result of a failed upload
    if resumable_chunk_number == 1 and exists(temporary_location):
        remove(temporary_location)

    # append every transmitted chunk
    with open(temporary_location, 'ab') as tmp_file:
        tmp_file.write(bytes(data))

    if resumable_chunk_number == resumable_total_chunks:
        final_location = join(base_fp, study_id, resumable_filename)
        if exists(final_location):
            remove(final_location)
        move(temporary_location, final_location)
        rmtree(temp_dir)

    self.set_status(200)
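# The handler above appends resumable.js-style chunks to a per-upload temp
# file and moves it into place when the last chunk arrives. A client-side
# sketch under stated assumptions: the form field names match the
# get_argument calls above, but the endpoint path and the use of requests
# are illustrative only.
import os
import uuid
import requests  # assumed client-side dependency


def upload_in_chunks(base_url, study_id, fpath, chunk_size=1024 * 1024):
    total = os.path.getsize(fpath)
    n_chunks = max(1, -(-total // chunk_size))  # ceiling division
    identifier = uuid.uuid4().hex
    with open(fpath, 'rb') as f:
        for chunk_number in range(1, n_chunks + 1):
            chunk = f.read(chunk_size)
            requests.post('%s/upload/chunked/' % base_url,  # assumed path
                          data={'resumableIdentifier': identifier,
                                'resumableFilename': os.path.basename(fpath),
                                'resumableChunkNumber': chunk_number,
                                'resumableTotalChunks': n_chunks,
                                'study_id': study_id},
                          files={'file': chunk})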
def generate_plugin_releases():
    """Generate releases for plugins"""
    ARCHIVE = qdb.archive.Archive
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir

    commands = [c for s in qdb.software.Software.iter(active=True)
                for c in s.commands if c.post_processing_cmd is not None]

    tnow = datetime.now()
    ts = tnow.strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases', 'archive')
    create_nested_path(tgz_dir)
    tgz_dir_release = join(tgz_dir, ts)
    create_nested_path(tgz_dir_release)
    for cmd in commands:
        cmd_name = cmd.name
        mschemes = [v for _, v in ARCHIVE.merging_schemes().items()
                    if cmd_name in v]
        for ms in mschemes:
            ms_name = sub('[^0-9a-zA-Z]+', '', ms)
            ms_fp = join(tgz_dir_release, ms_name)
            create_nested_path(ms_fp)

            pfp = join(ms_fp, 'archive.json')
            archives = {k: loads(v)
                        for k, v in ARCHIVE.retrieve_feature_values(
                            archive_merging_scheme=ms).items()
                        if v != ''}
            with open(pfp, 'w') as f:
                dump(archives, f)

            # now let's run the post_processing_cmd
            ppc = cmd.post_processing_cmd

            # concatenate any other parameters into a string
            params = ' '.join(["%s=%s" % (k, v) for k, v in
                               ppc['script_params'].items()])
            # append archives file and output dir parameters
            params = ("%s --fp_archive=%s --output_dir=%s" % (
                params, pfp, ms_fp))

            ppc_cmd = "%s %s %s" % (
                ppc['script_env'], ppc['script_path'], params)

            p_out, p_err, rv = qdb.processing_job._system_call(ppc_cmd)
            p_out = p_out.rstrip()
            if rv != 0:
                raise ValueError('Error %d: %s' % (rv, p_out))

            p_out = loads(p_out)

    # tgz-ing all files
    tgz_name = join(tgz_dir, 'archive-%s-building.tgz' % ts)
    tgz_name_final = join(tgz_dir, 'archive.tgz')
    with topen(tgz_name, "w|gz") as tgz:
        tgz.add(tgz_dir_release, arcname=basename(tgz_dir_release))

    # getting the release md5
    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)

    rename(tgz_name, tgz_name_final)

    vals = [
        ('filepath', tgz_name_final[len(working_dir):], r_client.set),
        ('md5sum', md5sum.hexdigest(), r_client.set),
        ('time', tnow.strftime('%m-%d-%y %H:%M:%S'), r_client.set)]
    for k, v, f in vals:
        redis_key = 'release-archive:%s' % k
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)
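# generate_plugin_releases() (and the release functions below) checksum the
# release tarball by streaming 4 KiB blocks through hashlib.md5 rather than
# reading the whole file into memory. A minimal sketch of that pattern,
# isolated as a helper for illustration only:
from hashlib import md5


def file_md5(path, block_size=4096):
    checksum = md5()
    with open(path, 'rb') as fh:
        # iter() with a sentinel stops once read() returns b'' at EOF
        for block in iter(lambda: fh.read(block_size), b""):
            checksum.update(block)
    return checksum.hexdigest()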
def generate_biom_and_metadata_release(study_status='public'):
    """Generate a list of biom/metadata filepaths and a tgz of those files

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having this exposed helps with testing. The other
        options are 'private' and 'sandbox'
    """
    studies = qdb.study.Study.get_by_status(study_status)
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir
    portal = qiita_config.portal
    bdir = qdb.util.get_db_files_base_dir()
    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    data = []
    for s in studies:
        # [0] latest is first, [1] only getting the filepath
        sample_fp = relpath(s.sample_template.get_filepaths()[0][1], bdir)

        for a in s.artifacts(artifact_type='BIOM'):
            if a.processing_parameters is None or \
                    a.visibility != study_status:
                continue

            merging_schemes, parent_softwares = a.merging_scheme
            software = a.processing_parameters.command.software
            software = '%s v%s' % (software.name, software.version)

            for x in a.filepaths:
                if x['fp_type'] != 'biom' or 'only-16s' in x['fp']:
                    continue
                fp = relpath(x['fp'], bdir)
                for pt in a.prep_templates:
                    categories = pt.categories()
                    platform = ''
                    target_gene = ''
                    if 'platform' in categories:
                        platform = ', '.join(
                            set(pt.get_category('platform').values()))
                    if 'target_gene' in categories:
                        target_gene = ', '.join(
                            set(pt.get_category('target_gene').values()))
                    for _, prep_fp in pt.get_filepaths():
                        if 'qiime' not in prep_fp:
                            break
                    prep_fp = relpath(prep_fp, bdir)
                    # format: (biom_fp, sample_fp, prep_fp, qiita_artifact_id,
                    #          platform, target gene, merging schemes,
                    #          artifact software/version,
                    #          parent software/version)
                    data.append((fp, sample_fp, prep_fp, a.id, platform,
                                 target_gene, merging_schemes, software,
                                 parent_softwares))

    # writing text and tgz file
    ts = datetime.now().strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases')
    create_nested_path(tgz_dir)
    tgz_name = join(tgz_dir, '%s-%s-building.tgz' % (portal, study_status))
    tgz_name_final = join(tgz_dir, '%s-%s.tgz' % (portal, study_status))
    txt_lines = [
        "biom fp\tsample fp\tprep fp\tqiita artifact id\tplatform\t"
        "target gene\tmerging scheme\tartifact software\tparent software"]
    with topen(tgz_name, "w|gz") as tgz:
        for biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv in data:
            txt_lines.append("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv))
            tgz.add(join(bdir, biom_fp), arcname=biom_fp, recursive=False)
            tgz.add(join(bdir, sample_fp), arcname=sample_fp, recursive=False)
            tgz.add(join(bdir, prep_fp), arcname=prep_fp, recursive=False)
        info = TarInfo(name='%s-%s-%s.txt' % (portal, study_status, ts))
        txt_hd = BytesIO()
        txt_hd.write(bytes('\n'.join(txt_lines), 'ascii'))
        txt_hd.seek(0)
        info.size = len(txt_hd.read())
        txt_hd.seek(0)
        tgz.addfile(tarinfo=info, fileobj=txt_hd)

    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)

    rename(tgz_name, tgz_name_final)

    vals = [
        ('filepath', tgz_name_final[len(working_dir):], r_client.set),
        ('md5sum', md5sum.hexdigest(), r_client.set),
        ('time', time, r_client.set)]
    for k, v, f in vals:
        redis_key = '%s:release:%s:%s' % (portal, study_status, k)
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)
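# generate_biom_and_metadata_release() writes its tab-separated index
# directly into the tgz as an in-memory member by pairing tarfile.TarInfo
# with a BytesIO buffer, so the index never touches disk. A minimal sketch
# of that pattern with an illustrative archive path and member name:
from io import BytesIO
from tarfile import TarInfo, open as topen


def add_text_member(tgz_path, member_name, text):
    # creates a one-member gzip'd tarball whose content lives only in memory
    payload = text.encode('ascii')
    info = TarInfo(name=member_name)
    info.size = len(payload)  # size must be set before addfile()
    with topen(tgz_path, "w|gz") as tgz:
        tgz.addfile(tarinfo=info, fileobj=BytesIO(payload))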
def generate_demultiplexed_fastq(self, rewrite_fastq=False, mtime=None):
    """Generates demultiplexed fastq

    Parameters
    ----------
    rewrite_fastq : bool, optional
        If true, it forces the rewrite of the fastq files
    mtime : float, optional
        The time to use when creating the gz files. If None, the current
        time will be used by gzip.GzipFile. This is useful for testing.

    Returns
    -------
    demux_samples
        List of successful demultiplexed samples

    Notes
    -----
    - As a performance feature, this method will check if
      self.full_ebi_dir already exists and, if it does, the script will
      assume that in a previous execution this step was performed
      correctly and will simply read the file names from
      self.full_ebi_dir
    - When the object is created (init), samples, samples_prep and
      sample_demux_fps hold values for all available samples in the
      database. Here some of those values will be deleted (del's, within
      the loops) for those cases where the fastq.gz files weren't written
      or don't exist. This is an indication that they had no sequences,
      and such files are not accepted by EBI.

    Raises
    ------
    EBISubmissionError
        - The demux file couldn't be read
        - All samples are removed
    """
    dir_not_exists = not isdir(self.full_ebi_dir)
    missing_samples = []
    if dir_not_exists or rewrite_fastq:
        # if it exists, remove folder and start from scratch
        if isdir(self.full_ebi_dir):
            rmtree(self.full_ebi_dir)
        create_nested_path(self.full_ebi_dir)

        if self.artifact.artifact_type == 'per_sample_FASTQ':
            demux_samples, missing_samples = \
                self._generate_demultiplexed_fastq_per_sample_FASTQ()
        else:
            demux_samples = self._generate_demultiplexed_fastq_demux(mtime)
    else:
        # if we are within this else, it means that we already have
        # generated the raw files and for some reason the submission
        # failed so we don't need to generate the files again and just
        # check which files exist in the file path to create our final
        # list of samples
        demux_samples = set()
        extension = self.FWD_READ_SUFFIX
        extension_len = len(extension)
        all_missing_files = set()
        for f in listdir(self.full_ebi_dir):
            fpath = join(self.full_ebi_dir, f)
            if isfile(fpath) and f.endswith(extension):
                demux_samples.add(f[:-extension_len])
            else:
                all_missing_files.add(f[:-extension_len])
        # at this stage we have created/reviewed all the files and have
        # all the sample names; however, we are not sure if we are dealing
        # with just forward reads or also with reverse reads. The easiest
        # way to check is to review all_missing_files
        missing_files = all_missing_files - demux_samples
        if missing_files != all_missing_files:
            self.per_sample_FASTQ_reverse = True

        missing_samples = set(
            self.samples.keys()).difference(demux_samples)

    if missing_samples:
        for ms in missing_samples:
            del self.samples[ms]
            del self.samples_prep[ms]
            del self.sample_demux_fps[ms]

    if not demux_samples:
        error_msg = ("All samples were removed from the submission "
                     "because the demux file is empty or the sample names "
                     "do not match.")
        LogEntry.create('Runtime', error_msg)
        raise EBISubmissionError(error_msg)

    return demux_samples
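# The else branch above infers whether reverse reads are present via set
# arithmetic: any non-forward file whose stem matches a forward sample
# implies reverse reads exist. A tiny worked example with made-up suffixes
# and sample names (not the actual FWD_READ_SUFFIX value):
fwd_suffix, rev_suffix = '.R1.fastq.gz', '.R2.fastq.gz'
files = ['s1.R1.fastq.gz', 's1.R2.fastq.gz', 's2.R1.fastq.gz']

demux_samples = {f[:-len(fwd_suffix)] for f in files if f.endswith(fwd_suffix)}
others = {f[:-len(rev_suffix)] for f in files if not f.endswith(fwd_suffix)}

# 'others' overlaps the forward sample names -> reverse reads are present
has_reverse = (others - demux_samples) != others
assert has_reverse and demux_samples == {'s1', 's2'}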
def generate_biom_and_metadata_release(study_status='public'):
    """Generate a list of biom/metadata filepaths and a tgz of those files

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having this exposed helps with testing. The other
        options are 'private' and 'sandbox'
    """
    studies = qdb.study.Study.get_by_status(study_status)
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir
    portal = qiita_config.portal
    bdir = qdb.util.get_db_files_base_dir()
    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    data = []
    for s in studies:
        # [0] latest is first, [1] only getting the filepath
        sample_fp = relpath(s.sample_template.get_filepaths()[0][1], bdir)

        for a in s.artifacts(artifact_type='BIOM'):
            if a.processing_parameters is None:
                continue

            processing_params = a.processing_parameters
            cmd_name = processing_params.command.name
            ms = processing_params.command.merging_scheme
            software = processing_params.command.software
            software = '%s v%s' % (software.name, software.version)

            # this loop is necessary as in theory an artifact can be
            # generated from multiple prep info files
            afps = [fp for _, fp, _ in a.filepaths if fp.endswith('biom')]
            merging_schemes = []
            parent_softwares = []
            for p in a.parents:
                pparent = p.processing_parameters
                # if parent is None, then it is a direct upload; for example
                # per_sample_FASTQ in shotgun data
                if pparent is None:
                    parent_cmd_name = None
                    parent_merging_scheme = None
                    parent_pp = None
                    parent_software = 'N/A'
                else:
                    parent_cmd_name = pparent.command.name
                    parent_merging_scheme = pparent.command.merging_scheme
                    parent_pp = pparent.values
                    psoftware = pparent.command.software
                    parent_software = '%s v%s' % (
                        psoftware.name, psoftware.version)

                merging_schemes.append(qdb.util.human_merging_scheme(
                    cmd_name, ms, parent_cmd_name, parent_merging_scheme,
                    processing_params.values, afps, parent_pp))
                parent_softwares.append(parent_software)
            merging_schemes = ', '.join(merging_schemes)
            parent_softwares = ', '.join(parent_softwares)

            for _, fp, fp_type in a.filepaths:
                if fp_type != 'biom' or 'only-16s' in fp:
                    continue
                fp = relpath(fp, bdir)
                for pt in a.prep_templates:
                    categories = pt.categories()
                    platform = ''
                    target_gene = ''
                    if 'platform' in categories:
                        platform = ', '.join(
                            set(pt.get_category('platform').values()))
                    if 'target_gene' in categories:
                        target_gene = ', '.join(
                            set(pt.get_category('target_gene').values()))
                    for _, prep_fp in pt.get_filepaths():
                        if 'qiime' not in prep_fp:
                            break
                    prep_fp = relpath(prep_fp, bdir)
                    # format: (biom_fp, sample_fp, prep_fp, qiita_artifact_id,
                    #          platform, target gene, merging schemes,
                    #          artifact software/version,
                    #          parent software/version)
                    data.append((fp, sample_fp, prep_fp, a.id, platform,
                                 target_gene, merging_schemes, software,
                                 parent_softwares))

    # writing text and tgz file
    ts = datetime.now().strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases')
    create_nested_path(tgz_dir)
    tgz_name = join(tgz_dir, '%s-%s-building.tgz' % (portal, study_status))
    tgz_name_final = join(tgz_dir, '%s-%s.tgz' % (portal, study_status))
    txt_hd = StringIO()
    with topen(tgz_name, "w|gz") as tgz:
        txt_hd.write(
            "biom fp\tsample fp\tprep fp\tqiita artifact id\tplatform\t"
            "target gene\tmerging scheme\tartifact software\t"
            "parent software\n")
        for biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv in data:
            txt_hd.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
                biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv))
            tgz.add(join(bdir, biom_fp), arcname=biom_fp, recursive=False)
            tgz.add(join(bdir, sample_fp), arcname=sample_fp, recursive=False)
            tgz.add(join(bdir, prep_fp), arcname=prep_fp, recursive=False)

        txt_hd.seek(0)
        info = TarInfo(name='%s-%s-%s.txt' % (portal, study_status, ts))
        info.size = len(txt_hd.buf)
        tgz.addfile(tarinfo=info, fileobj=txt_hd)

    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)

    rename(tgz_name, tgz_name_final)

    vals = [
        ('filepath', tgz_name_final[len(working_dir):], r_client.set),
        ('md5sum', md5sum.hexdigest(), r_client.set),
        ('time', time, r_client.set)]
    for k, v, f in vals:
        redis_key = '%s:release:%s:%s' % (portal, study_status, k)
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)