def preconf(self, d):
    """Prepare the gcc pass-1 source tree (LFS-style).

    Unpacks mpfr, mpc and gmp into the gcc tree (gcc's build uses them
    automatically when present under plain in-tree names), then patches
    gcc's config headers so the resulting compiler looks in /tools
    instead of the host's /lib and /usr.

    Parameters
    ----------
    d : unused here; kept for interface compatibility with other hooks.
    """
    print("preconf")
    mpfr = CompileMPFR(self.lfs)
    mpc = CompileMPC(self.lfs)
    gmp = CompileGMP(self.lfs)
    # TODO parallel
    for pkg, target in ((mpfr, 'mpfr'), (mpc, 'mpc'), (gmp, 'gmp')):
        with topen(pkg.arc, 'r') as tar:
            tar.extractall(self.s)
        move(join(self.s, pkg.pkgd), join(self.s, target))
    # Redirect the default dynamic-linker path and the startfile
    # prefixes to the /tools prefix in each linux config header.
    for cfg in map(lambda s: join(self.s, 'gcc/config/%slinux%s.h' % s),
                   [('', ''), ('i386/', ''), ('i386/', '64')]):
        with open(cfg, 'r') as f:
            r = f.read()
        # raw string: the previous '\(' escapes in a plain string are
        # invalid-escape sequences (DeprecationWarning on py3)
        r = sub(r'/lib\(64\)\?\(32\)\?/ld', '/tools&', r)
        r = r.replace('/usr', '/tools')
        # This block's opening triple quote was missing in the original
        # source (syntax error); restored per the LFS gcc pass-1 recipe.
        r += '''
#undef STANDARD_STARTFILE_PREFIX_1
#undef STANDARD_STARTFILE_PREFIX_2
#define STANDARD_STARTFILE_PREFIX_1 "%s"
#define STANDARD_STARTFILE_PREFIX_2 ""''' % join(self.lfs.toolsym, 'lib')
        with open(cfg, 'w') as f:
            f.write(r)
    # Bug fix: was `uname()[4] in 'x86_64'`, a substring test that also
    # matched e.g. 'x86'; an exact comparison is intended.
    if uname()[4] == 'x86_64':
        t_linux64 = join(self.s, 'gcc/config/i386/t-linux64')
        with open(t_linux64, 'r') as f:
            lines = f.readlines()
        # use lib instead of lib64 for the 64-bit multilib os dir
        lines = [l.replace('lib64', 'lib') if 'm64=' in l else l
                 for l in lines]
        with open(t_linux64, 'w') as f:
            f.writelines(lines)
def preconf(self, d):
    """Unpack the mpfr, mpc and gmp source archives into self.s and
    rename each unpacked directory to its plain package name."""
    print("preconf")
    dependencies = (
        (CompileMPFR(self.lfs), 'mpfr'),
        (CompileMPC(self.lfs), 'mpc'),
        (CompileGMP(self.lfs), 'gmp'),
    )
    # TODO parallel
    for pkg, target in dependencies:
        with topen(pkg.arc, 'r') as tar:
            tar.extractall(self.s)
        move(join(self.s, pkg.pkgd), join(self.s, target))
def compilePackage(self, name, version, fmt, s, arc, is_separate_builddir, build):
    """Extract the archive *arc* into ``self.srcx`` and run *build*.

    Parameters
    ----------
    name, version, fmt : str
        Package identification; unused here but kept for interface
        compatibility with callers.
    s : str
        Path of the unpacked source tree (inside ``self.srcx``).
    arc : str
        Path of the source archive to extract.
    is_separate_builddir : bool
        When True, build in a fresh ``<s>/build`` directory instead of
        in the source tree itself.
    build : callable
        Invoked with the build directory once sources are in place.

    The source tree (and any separate build dir) is always removed
    afterwards, even when extraction or the build fails.
    """
    print("lfs compile package")
    d = join(s, 'build') if is_separate_builddir else s
    try:
        with topen(arc, 'r') as tar:
            tar.extractall(self.srcx)
        if is_separate_builddir:
            mkdir(d)
        build(d)
    finally:
        # ignore_errors: extraction may have failed before these
        # directories existed; cleanup must not mask the original
        # exception with its own FileNotFoundError.
        if is_separate_builddir:
            rmtree(d, ignore_errors=True)
        rmtree(s, ignore_errors=True)
def setup():
    """Interactively download and unpack the Wikipedia dump, then seed data.in.

    Side effects only: prompts on stdin, downloads to .\\data.tar.gz,
    extracts into the current directory, and may create data.in.
    """
    # Single source of truth for the download location (the URL literal
    # was previously duplicated for tqdm and urlretrieve).
    url = ('http://gfs270n122.userstorage.mega.co.nz/dl/'
           'RVLMhIxMApeyQQIxkFDWl-0aHd_iqz6yDLq4wI2brHvPeHgy_D9mdO7470RrDQwe'
           'k4XjBUzF0Nc8SlxvUWPoM8hjTJAFUQig6jgP8dP6udLY-3_4bwfmcMs7HcC_Pg')
    if not isdir('.\\data') and 'n' in input(
            'I\'m going to install Wikipedia, this may take a long time '
            '(this is 7 Gb) would you like to do in in another time? (y/n)\n'
    ).lower():
        with tqdm(url, desc='Installing Wikipedia') as t:
            reporthook = my_hook(t)
            urlretrieve(url, '.\\data.tar.gz', reporthook=reporthook)
        with topen('.\\data.tar.gz', 'r:gz') as tar:
            # materialize the member list once instead of scanning the
            # archive twice (iterable= and total= previously each called
            # getmembers())
            members = tar.getmembers()
            for member in tqdm(iterable=members, total=len(members),
                               desc='Extracting wikipedia'):
                tar.extract(member=member, path='.')
        remove('.\\data.tar.gz')
        _torch()
        print('DONE\n')
    elif not isdir('.\\data'):
        exit(
            print(
                'You chose not to install wikipedia, you can re-run it any '
                'time and install wikipedia'))
    if not isfile('data.in'):
        # Bug fix: the file handle from open(...).write(...) was leaked;
        # close it deterministically.
        with open('data.in', 'a') as fh:
            fh.write(
                input(
                    'Give me a candidate file, if you don\'t have any press '
                    'enter\n'))
def untar(src, dst):
    """
    Extract the archive at *src* into the folder *dst*.

    Any directories required for the extracted files are created
    automatically.

    :param str src: path of the archive to unpack
    :param str dst: path of the output folder
    """
    archive = topen(src)
    try:
        archive.extractall(path=dst)
    finally:
        archive.close()
def main():
    """Export USA orders shipped in May 2014: one PDF invoice per order
    plus a CSV index, all packed into usa_order_data.tar.gz in the
    current directory (the intermediate folder is removed afterwards)."""
    export_path = 'invoices'
    if not path.exists(export_path):
        makedirs(export_path)

    orders = Order.objects.filter(
        address__country__name__contains="United States",
        order_status=Order.SHIPPED,
        shipped_date__year=2014,
        shipped_date__month=5)
    print('Total orders matching query: ' + str(len(orders)) + '.')

    # clean up the export directory if the user interrupts mid-run
    signal(SIGINT, partial(trap_sigint, export_path))

    header = ';'.join([
        'Order ID', 'Full Name', 'Address', 'Tracking Number',
        'Invoice Number'
    ]) + '\n'
    usa_orders_count = 0
    with open(export_path + '/usa_shipped_orders.csv', 'w') as out:
        out.write(header)
        for order in orders:
            usa_orders_count += 1
            address_parts = [
                order.address.street, order.address.street2,
                order.address.street3, order.address.city,
                order.address.postal_code, order.address.state,
                order.address.country.name
            ]
            # drop empty components before joining
            address = ', '.join(filter(None, address_parts))
            invoice_no = ('P' + order.created.strftime('%y%m%d')
                          + str(order.id % 10000).zfill(4))
            if order.tracking_number is None or order.tracking_number == '':
                tracking_number = '--'
            else:
                tracking_number = order.tracking_number
            csv_row = ';'.join([
                str(order.id), order.address.full_name, address,
                tracking_number, invoice_no
            ]) + '\n'
            stdout.write(csv_row)
            pdf_from_string(order.get_invoice_as_string(),
                            export_path + '/invoice-' + invoice_no + '.pdf',
                            options={'quiet': ''})
            out.write(csv_row)
    print('\nTotal USA orders: ' + str(usa_orders_count))

    tarball = 'usa_order_data.tar.gz'
    with topen(tarball, "w:gz") as tar:
        tar.add(export_path, arcname=path.basename(export_path))
    rm_dir_and_contents(export_path)
    print('Added files to "' + tarball + '" archive in the current directory.')
def test_generate_plugin_releases(self):
    """The generated release archive must contain exactly one entry,
    named after the creation timestamp recorded in redis."""
    qdb.meta_util.generate_plugin_releases()

    relative_fp = r_client.get('release-archive:filepath').decode('ascii')
    archive_fp = join(qiita_config.working_dir, relative_fp)
    with topen(archive_fp, "r:gz") as archive:
        observed = [member.name for member in archive]

    # the expected folder/file in the tgz should be named as the time
    # when it was created so let's test that
    stamp = r_client.get('release-archive:time').decode('ascii')
    for old, new in (('-', ''), (':', ''), (' ', '-')):
        stamp = stamp.replace(old, new)
    self.assertEqual(observed, [stamp])
def CreateTar():
    """Archive every file below LOC_SRCS into /tmp/<BKP_FILENAME>.tar
    and return its md5; unreadable entries are reported on stderr and
    skipped."""
    archive_path = "/tmp/%s.tar" % (BKP_FILENAME)
    with topen(archive_path, "w") as tar_parse:
        for root, dirs, files in os.walk(LOC_SRCS):
            for entry in files:
                try:
                    tar_parse.add(os.path.join(root, entry))
                except PermissionError:
                    stderr.write(
                        "Permission refusé, ce fichier/dossier sera ignoré: %s/%s\n"
                        % (root, entry))
                except Exception as e:
                    stderr.write("Un problème est survenu: %s\n" % (e))
    return md5(archive_path)
def _open(self, reopen=False):
    """
    Open ``self.path`` on disk if not already open.

    If ``self.path`` is already open, simply return. Pass
    ``reopen=True`` to force a close-and-reopen, reloading the file
    from disk.
    """
    _tarfile = self._tarfile
    if not reopen and isinstance(_tarfile, TarFile) and not _tarfile.closed:
        # Checked first for lowest overhead possible
        return
    # The previous version wrapped close/open in try/except blocks that
    # only re-raised (no-ops); behavior is identical without them.
    if _tarfile is not None:
        # close any stale handle before reopening from disk
        _tarfile.close()
    self._tarfile = topen(self.path, mode='r')
def unTarOrSkip(self, tname, files):
    """Extract the requested members of BINSDIR/tname into self.workdir
    and verify their sha1 digests; skip the current test when the
    archive or a member is unavailable."""
    tfile = self.topdir / BINSDIR / tname
    try:
        with topen(tfile) as tar:
            for entry in files:
                member, want = entry[0], entry[1]
                try:
                    tar.extract(member, path=self.workdir)
                except KeyError:
                    self.skipTest("File (%s) not found in archive" % member)
                with open(self.workdir / member, "rb") as g:
                    got = sha1(g.read()).hexdigest()
                self.assertEqual(
                    want, got,
                    "Sha1sum mismatch file (%s), %s != %s"
                    % (member, want, got))
    except IOError:
        self.skipTest("Archive not found or unreadable(%s)" % tfile)
def extract(self) -> None:
    """Extract ``self.anki_source`` into ``self.source_dir``.

    Any previously extracted ``self.anki_dir`` is removed first so the
    extraction starts clean. No-op when the source archive is missing.
    """
    from tarfile import open as topen
    from os.path import exists
    from shutil import rmtree

    if exists(self.anki_source):
        if exists(self.anki_dir):
            rmtree(self.anki_dir)
        print(f'Extracting {self.anki_source}')
        # Bug fix: the archive was opened without a context manager, so
        # a failure inside extractall leaked the file handle.
        with topen(self.anki_source) as archive:
            archive.extractall(self.source_dir)
def generate_plugin_releases():
    """Generate releases for plugins

    For every active command that declares a post_processing_cmd, dump
    its merged archive feature values to an archive.json, run the
    command's post-processing script on it, then tar the whole release
    directory and record its path, md5 and creation time in redis under
    the ``release-archive:*`` keys.
    """
    ARCHIVE = qdb.archive.Archive
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir

    commands = [c for s in qdb.software.Software.iter(active=True)
                for c in s.commands if c.post_processing_cmd is not None]

    tnow = datetime.now()
    ts = tnow.strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases', 'archive')
    create_nested_path(tgz_dir)
    # each release lives in a folder named after its creation timestamp
    tgz_dir_release = join(tgz_dir, ts)
    create_nested_path(tgz_dir_release)
    for cmd in commands:
        cmd_name = cmd.name
        # merging schemes that mention this command by name
        mschemes = [v for _, v in ARCHIVE.merging_schemes().items()
                    if cmd_name in v]
        for ms in mschemes:
            # sanitized scheme name doubles as the per-scheme folder name
            ms_name = sub('[^0-9a-zA-Z]+', '', ms)
            ms_fp = join(tgz_dir_release, ms_name)
            create_nested_path(ms_fp)

            pfp = join(ms_fp, 'archive.json')
            archives = {k: loads(v)
                        for k, v in ARCHIVE.retrieve_feature_values(
                            archive_merging_scheme=ms).items()
                        if v != ''}
            with open(pfp, 'w') as f:
                dump(archives, f)

            # now let's run the post_processing_cmd
            ppc = cmd.post_processing_cmd

            # concatenate any other parameters into a string
            params = ' '.join(["%s=%s" % (k, v) for k, v in
                               ppc['script_params'].items()])
            # append archives file and output dir parameters
            params = ("%s --fp_archive=%s --output_dir=%s" % (
                params, pfp, ms_fp))

            ppc_cmd = "%s %s %s" % (
                ppc['script_env'], ppc['script_path'], params)

            p_out, p_err, rv = qdb.processing_job._system_call(ppc_cmd)
            p_out = p_out.rstrip()
            if rv != 0:
                raise ValueError('Error %d: %s' % (rv, p_out))
            # NOTE(review): the parsed output is not used afterwards —
            # presumably this only validates that the script emitted
            # valid JSON; confirm.
            p_out = loads(p_out)

    # tgz-ing all files
    tgz_name = join(tgz_dir, 'archive-%s-building.tgz' % ts)
    tgz_name_final = join(tgz_dir, 'archive.tgz')
    with topen(tgz_name, "w|gz") as tgz:
        tgz.add(tgz_dir_release, arcname=basename(tgz_dir_release))
    # getting the release md5
    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)
    # atomically publish: build under a -building name, then rename
    rename(tgz_name, tgz_name_final)
    vals = [
        ('filepath', tgz_name_final[len(working_dir):], r_client.set),
        ('md5sum', md5sum.hexdigest(), r_client.set),
        ('time', tnow.strftime('%m-%d-%y %H:%M:%S'), r_client.set)]
    for k, v, f in vals:
        redis_key = 'release-archive:%s' % k
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)
def generate_biom_and_metadata_release(study_status='public'):
    """Generate a list of biom/meatadata filepaths and a tgz of those files

    Builds a tgz bundling, for every visible BIOM artifact of every
    study with the given status, the biom file plus its sample and prep
    templates, and an index text file; the tgz path, md5 and creation
    time are recorded in redis.

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having this exposed helps with testing. The other
        options are 'private' and 'sandbox'
    """
    studies = qdb.study.Study.get_by_status(study_status)
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir
    portal = qiita_config.portal
    bdir = qdb.util.get_db_files_base_dir()
    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    data = []
    for s in studies:
        # [0] latest is first, [1] only getting the filepath
        sample_fp = relpath(s.sample_template.get_filepaths()[0][1], bdir)

        for a in s.artifacts(artifact_type='BIOM'):
            if a.processing_parameters is None or a.visibility != study_status:
                continue

            merging_schemes, parent_softwares = a.merging_scheme
            software = a.processing_parameters.command.software
            software = '%s v%s' % (software.name, software.version)

            for x in a.filepaths:
                # skip non-biom files and the 16s-only variants
                if x['fp_type'] != 'biom' or 'only-16s' in x['fp']:
                    continue
                fp = relpath(x['fp'], bdir)
                for pt in a.prep_templates:
                    categories = pt.categories()
                    platform = ''
                    target_gene = ''
                    if 'platform' in categories:
                        platform = ', '.join(
                            set(pt.get_category('platform').values()))
                    if 'target_gene' in categories:
                        target_gene = ', '.join(
                            set(pt.get_category('target_gene').values()))
                    # stop at the first non-qiime prep filepath; that is
                    # the one bundled below
                    for _, prep_fp in pt.get_filepaths():
                        if 'qiime' not in prep_fp:
                            break
                    prep_fp = relpath(prep_fp, bdir)
                    # format: (biom_fp, sample_fp, prep_fp,
                    #          qiita_artifact_id, platform, target gene,
                    #          merging schemes, artifact software/version,
                    #          parent sofware/version)
                    data.append(
                        (fp, sample_fp, prep_fp, a.id, platform, target_gene,
                         merging_schemes, software, parent_softwares))

    # writing text and tgz file
    ts = datetime.now().strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases')
    create_nested_path(tgz_dir)
    tgz_name = join(tgz_dir, '%s-%s-building.tgz' % (portal, study_status))
    tgz_name_final = join(tgz_dir, '%s-%s.tgz' % (portal, study_status))
    txt_lines = [
        "biom fp\tsample fp\tprep fp\tqiita artifact id\tplatform\t"
        "target gene\tmerging scheme\tartifact software\tparent software"]
    with topen(tgz_name, "w|gz") as tgz:
        for biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv in data:
            txt_lines.append(
                "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
                % (biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv))
            tgz.add(join(bdir, biom_fp), arcname=biom_fp, recursive=False)
            tgz.add(join(bdir, sample_fp), arcname=sample_fp, recursive=False)
            tgz.add(join(bdir, prep_fp), arcname=prep_fp, recursive=False)
        # append the index file directly into the tarball from memory
        info = TarInfo(name='%s-%s-%s.txt' % (portal, study_status, ts))
        txt_hd = BytesIO()
        txt_hd.write(bytes('\n'.join(txt_lines), 'ascii'))
        txt_hd.seek(0)
        info.size = len(txt_hd.read())
        txt_hd.seek(0)
        tgz.addfile(tarinfo=info, fileobj=txt_hd)

    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)
    # atomically publish: build under a -building name, then rename
    rename(tgz_name, tgz_name_final)
    vals = [('filepath', tgz_name_final[len(working_dir):], r_client.set),
            ('md5sum', md5sum.hexdigest(), r_client.set),
            ('time', time, r_client.set)]
    for k, v, f in vals:
        redis_key = '%s:release:%s:%s' % (portal, study_status, k)
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)
def test_generate_biom_and_metadata_release(self):
    """Run the private release twice (with and without a trailing '/'
    on base_data_dir) and verify the tgz members and the index text."""
    level = 'private'
    qdb.meta_util.generate_biom_and_metadata_release(level)
    portal = qiita_config.portal
    working_dir = qiita_config.working_dir

    vals = [('filepath', r_client.get), ('md5sum', r_client.get),
            ('time', r_client.get)]
    # we are storing the [0] filepath, [1] md5sum and [2] time but we are
    # only going to check the filepath contents so ignoring the others
    tgz = vals[0][1]('%s:release:%s:%s' % (portal, level, vals[0][0]))
    tgz = join(working_dir, tgz)
    self.files_to_remove.extend([tgz])

    tmp = topen(tgz, "r:gz")
    tgz_obs = [ti.name for ti in tmp]
    tmp.close()
    # files names might change due to updates and patches so just check
    # that the prefix exists.
    fn = 'processed_data/1_study_1001_closed_reference_otu_table.biom'
    self.assertTrue(fn in tgz_obs)
    tgz_obs.remove(fn)
    # yes, this file is there twice
    self.assertTrue(fn in tgz_obs)
    tgz_obs.remove(fn)
    # let's check the next biom
    fn = ('processed_data/1_study_1001_closed_reference_otu_table_Silva.'
          'biom')
    self.assertTrue(fn in tgz_obs)
    tgz_obs.remove(fn)
    # now let's check prep info files based on their suffix, just take
    # the first one and check/rm the occurances of that file
    fn_prep = [f for f in tgz_obs
               if f.startswith('templates/1_prep_1_')][0]
    # 3 times
    self.assertTrue(fn_prep in tgz_obs)
    tgz_obs.remove(fn_prep)
    self.assertTrue(fn_prep in tgz_obs)
    tgz_obs.remove(fn_prep)
    self.assertTrue(fn_prep in tgz_obs)
    tgz_obs.remove(fn_prep)
    fn_sample = [f for f in tgz_obs if f.startswith('templates/1_')][0]
    # 3 times
    self.assertTrue(fn_sample in tgz_obs)
    tgz_obs.remove(fn_sample)
    self.assertTrue(fn_sample in tgz_obs)
    tgz_obs.remove(fn_sample)
    self.assertTrue(fn_sample in tgz_obs)
    tgz_obs.remove(fn_sample)
    # now we should only have the text file
    txt = tgz_obs.pop()
    # now it should be empty
    self.assertEqual(tgz_obs, [])
    tmp = topen(tgz, "r:gz")
    fhd = tmp.extractfile(txt)
    # NOTE(review): extractfile().readlines() yields bytes on Python 3,
    # which would not compare equal to the str lines below — this test
    # presumably predates the py3 port; confirm.
    txt_obs = fhd.readlines()
    tmp.close()
    txt_exp = [
        'biom_fp\tsample_fp\tprep_fp\tqiita_artifact_id\tcommand\n',
        'processed_data/1_study_1001_closed_reference_otu_table.biom\t'
        '%s\t%s\t4\tPick closed-reference OTUs, Split libraries FASTQ\n'
        % (fn_sample, fn_prep),
        'processed_data/1_study_1001_closed_reference_otu_table.biom\t'
        '%s\t%s\t5\tPick closed-reference OTUs, Split libraries FASTQ\n'
        % (fn_sample, fn_prep),
        'processed_data/1_study_1001_closed_reference_otu_table_Silva.bio'
        'm\t%s\t%s\t6\tPick closed-reference OTUs, Split libraries FASTQ\n'
        % (fn_sample, fn_prep)
    ]
    self.assertEqual(txt_obs, txt_exp)

    # whatever the configuration was, we will change to settings so we can
    # test the other option when dealing with the end '/'
    with qdb.sql_connection.TRN:
        qdb.sql_connection.TRN.add("SELECT base_data_dir FROM settings")
        obdr = qdb.sql_connection.TRN.execute_fetchlast()
        if obdr[-1] == '/':
            bdr = obdr[:-1]
        else:
            bdr = obdr + '/'
        qdb.sql_connection.TRN.add(
            "UPDATE settings SET base_data_dir = '%s'" % bdr)
        bdr = qdb.sql_connection.TRN.execute()
    qdb.meta_util.generate_biom_and_metadata_release(level)
    # we are storing the [0] filepath, [1] md5sum and [2] time but we are
    # only going to check the filepath contents so ignoring the others
    tgz = vals[0][1]('%s:release:%s:%s' % (portal, level, vals[0][0]))
    tgz = join(working_dir, tgz)

    tmp = topen(tgz, "r:gz")
    tgz_obs = [ti.name for ti in tmp]
    tmp.close()
    # files names might change due to updates and patches so just check
    # that the prefix exists.
    fn = 'processed_data/1_study_1001_closed_reference_otu_table.biom'
    self.assertTrue(fn in tgz_obs)
    tgz_obs.remove(fn)
    # yes, this file is there twice
    self.assertTrue(fn in tgz_obs)
    tgz_obs.remove(fn)
    # let's check the next biom
    fn = ('processed_data/1_study_1001_closed_reference_otu_table_Silva.'
          'biom')
    self.assertTrue(fn in tgz_obs)
    tgz_obs.remove(fn)
    # now let's check prep info files based on their suffix, just take
    # the first one and check/rm the occurances of that file
    fn_prep = [f for f in tgz_obs
               if f.startswith('templates/1_prep_1_')][0]
    # 3 times
    self.assertTrue(fn_prep in tgz_obs)
    tgz_obs.remove(fn_prep)
    self.assertTrue(fn_prep in tgz_obs)
    tgz_obs.remove(fn_prep)
    self.assertTrue(fn_prep in tgz_obs)
    tgz_obs.remove(fn_prep)
    fn_sample = [f for f in tgz_obs if f.startswith('templates/1_')][0]
    # 3 times
    self.assertTrue(fn_sample in tgz_obs)
    tgz_obs.remove(fn_sample)
    self.assertTrue(fn_sample in tgz_obs)
    tgz_obs.remove(fn_sample)
    self.assertTrue(fn_sample in tgz_obs)
    tgz_obs.remove(fn_sample)
    # now we should only have the text file
    txt = tgz_obs.pop()
    # now it should be empty
    self.assertEqual(tgz_obs, [])
    tmp = topen(tgz, "r:gz")
    fhd = tmp.extractfile(txt)
    txt_obs = fhd.readlines()
    tmp.close()
    txt_exp = [
        'biom_fp\tsample_fp\tprep_fp\tqiita_artifact_id\tcommand\n',
        'processed_data/1_study_1001_closed_reference_otu_table.biom\t'
        '%s\t%s\t4\tPick closed-reference OTUs, Split libraries FASTQ\n'
        % (fn_sample, fn_prep),
        'processed_data/1_study_1001_closed_reference_otu_table.biom\t'
        '%s\t%s\t5\tPick closed-reference OTUs, Split libraries FASTQ\n'
        % (fn_sample, fn_prep),
        'processed_data/1_study_1001_closed_reference_otu_table_Silva.bio'
        'm\t%s\t%s\t6\tPick closed-reference OTUs, Split libraries FASTQ\n'
        % (fn_sample, fn_prep)
    ]
    self.assertEqual(txt_obs, txt_exp)

    # returning configuration
    with qdb.sql_connection.TRN:
        qdb.sql_connection.TRN.add(
            "UPDATE settings SET base_data_dir = '%s'" % obdr)
        bdr = qdb.sql_connection.TRN.execute()
def generate_biom_and_metadata_release(study_status='public'):
    """Generate a list of biom/meatadata filepaths and a tgz of those files

    For every BIOM artifact of every study with the given status, bundle
    the biom file plus its sample and prep templates into a tgz with an
    index text file, and record the tgz path, md5 and creation time in
    redis.

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having this exposed helps with testing. The other
        options are 'private' and 'sandbox'
    """
    studies = qdb.study.Study.get_by_status(study_status)
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir
    portal = qiita_config.portal
    bdir = qdb.util.get_db_files_base_dir()
    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    data = []
    for s in studies:
        # [0] latest is first, [1] only getting the filepath
        sample_fp = relpath(s.sample_template.get_filepaths()[0][1], bdir)

        for a in s.artifacts(artifact_type='BIOM'):
            if a.processing_parameters is None:
                continue

            processing_params = a.processing_parameters
            cmd_name = processing_params.command.name
            ms = processing_params.command.merging_scheme
            software = processing_params.command.software
            software = '%s v%s' % (software.name, software.version)

            # this loop is necessary as in theory an artifact can be
            # generated from multiple prep info files
            afps = [fp for _, fp, _ in a.filepaths if fp.endswith('biom')]
            merging_schemes = []
            parent_softwares = []
            for p in a.parents:
                pparent = p.processing_parameters
                # if parent is None, then is a direct upload; for example
                # per_sample_FASTQ in shotgun data
                if pparent is None:
                    parent_cmd_name = None
                    parent_merging_scheme = None
                    parent_pp = None
                    parent_software = 'N/A'
                else:
                    parent_cmd_name = pparent.command.name
                    parent_merging_scheme = pparent.command.merging_scheme
                    parent_pp = pparent.values
                    psoftware = pparent.command.software
                    parent_software = '%s v%s' % (
                        psoftware.name, psoftware.version)

                merging_schemes.append(qdb.util.human_merging_scheme(
                    cmd_name, ms, parent_cmd_name, parent_merging_scheme,
                    processing_params.values, afps, parent_pp))
                parent_softwares.append(parent_software)
            merging_schemes = ', '.join(merging_schemes)
            parent_softwares = ', '.join(parent_softwares)

            for _, fp, fp_type in a.filepaths:
                # skip non-biom files and the 16s-only variants
                if fp_type != 'biom' or 'only-16s' in fp:
                    continue
                fp = relpath(fp, bdir)
                for pt in a.prep_templates:
                    categories = pt.categories()
                    platform = ''
                    target_gene = ''
                    if 'platform' in categories:
                        platform = ', '.join(
                            set(pt.get_category('platform').values()))
                    if 'target_gene' in categories:
                        target_gene = ', '.join(
                            set(pt.get_category('target_gene').values()))
                    # stop at the first non-qiime prep filepath; that is
                    # the one bundled below
                    for _, prep_fp in pt.get_filepaths():
                        if 'qiime' not in prep_fp:
                            break
                    prep_fp = relpath(prep_fp, bdir)
                    # format: (biom_fp, sample_fp, prep_fp,
                    #          qiita_artifact_id, platform, target gene,
                    #          merging schemes, artifact software/version,
                    #          parent sofware/version)
                    data.append((fp, sample_fp, prep_fp, a.id, platform,
                                 target_gene, merging_schemes, software,
                                 parent_softwares))

    # writing text and tgz file
    ts = datetime.now().strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases')
    create_nested_path(tgz_dir)
    tgz_name = join(tgz_dir, '%s-%s-building.tgz' % (portal, study_status))
    tgz_name_final = join(tgz_dir, '%s-%s.tgz' % (portal, study_status))
    txt_hd = StringIO()
    with topen(tgz_name, "w|gz") as tgz:
        txt_hd.write(
            "biom fp\tsample fp\tprep fp\tqiita artifact id\tplatform\t"
            "target gene\tmerging scheme\tartifact software\t"
            "parent software\n")
        # note: the loop variable `ms` here shadows the merging_scheme
        # variable used earlier in this function
        for biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv in data:
            txt_hd.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
                biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv))
            tgz.add(join(bdir, biom_fp), arcname=biom_fp, recursive=False)
            tgz.add(join(bdir, sample_fp), arcname=sample_fp, recursive=False)
            tgz.add(join(bdir, prep_fp), arcname=prep_fp, recursive=False)
        txt_hd.seek(0)
        info = TarInfo(name='%s-%s-%s.txt' % (portal, study_status, ts))
        # NOTE(review): `.buf` is a Python 2 cStringIO attribute; on
        # Python 3 io.StringIO this raises AttributeError (and addfile
        # needs a bytes stream) — confirm target runtime.
        info.size = len(txt_hd.buf)
        tgz.addfile(tarinfo=info, fileobj=txt_hd)

    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)
    # atomically publish: build under a -building name, then rename
    rename(tgz_name, tgz_name_final)
    vals = [
        ('filepath', tgz_name_final[len(working_dir):], r_client.set),
        ('md5sum', md5sum.hexdigest(), r_client.set),
        ('time', time, r_client.set)]
    for k, v, f in vals:
        redis_key = '%s:release:%s:%s' % (portal, study_status, k)
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)
def generate_biom_and_metadata_release(study_status='public'):
    """Generate a list of biom/meatadata filepaths and a tgz of those files

    For every visible BIOM artifact of every study with the given
    status, bundle the biom file plus its sample and prep templates into
    a tgz with an index text file, and record the tgz path, md5 and
    creation time in redis.

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having this exposed helps with testing. The other
        options are 'private' and 'sandbox'
    """
    studies = qdb.study.Study.get_by_status(study_status)
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir
    portal = qiita_config.portal
    bdir = qdb.util.get_db_files_base_dir()
    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    data = []
    for s in studies:
        # [0] latest is first, [1] only getting the filepath
        sample_fp = relpath(s.sample_template.get_filepaths()[0][1], bdir)

        for a in s.artifacts(artifact_type='BIOM'):
            if a.processing_parameters is None or a.visibility != study_status:
                continue

            merging_schemes, parent_softwares = a.merging_scheme
            software = a.processing_parameters.command.software
            software = '%s v%s' % (software.name, software.version)

            for x in a.filepaths:
                # skip non-biom files and the 16s-only variants
                if x['fp_type'] != 'biom' or 'only-16s' in x['fp']:
                    continue
                fp = relpath(x['fp'], bdir)
                for pt in a.prep_templates:
                    categories = pt.categories()
                    platform = ''
                    target_gene = ''
                    if 'platform' in categories:
                        platform = ', '.join(
                            set(pt.get_category('platform').values()))
                    if 'target_gene' in categories:
                        target_gene = ', '.join(
                            set(pt.get_category('target_gene').values()))
                    # stop at the first non-qiime prep filepath; that is
                    # the one bundled below
                    for _, prep_fp in pt.get_filepaths():
                        if 'qiime' not in prep_fp:
                            break
                    prep_fp = relpath(prep_fp, bdir)
                    # format: (biom_fp, sample_fp, prep_fp,
                    #          qiita_artifact_id, platform, target gene,
                    #          merging schemes, artifact software/version,
                    #          parent sofware/version)
                    data.append((fp, sample_fp, prep_fp, a.id, platform,
                                 target_gene, merging_schemes, software,
                                 parent_softwares))

    # writing text and tgz file
    ts = datetime.now().strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases')
    create_nested_path(tgz_dir)
    tgz_name = join(tgz_dir, '%s-%s-building.tgz' % (portal, study_status))
    tgz_name_final = join(tgz_dir, '%s-%s.tgz' % (portal, study_status))
    txt_lines = [
        "biom fp\tsample fp\tprep fp\tqiita artifact id\tplatform\t"
        "target gene\tmerging scheme\tartifact software\tparent software"]
    with topen(tgz_name, "w|gz") as tgz:
        for biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv in data:
            txt_lines.append("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                biom_fp, sample_fp, prep_fp, aid, pform, tg, ms, asv, psv))
            tgz.add(join(bdir, biom_fp), arcname=biom_fp, recursive=False)
            tgz.add(join(bdir, sample_fp), arcname=sample_fp, recursive=False)
            tgz.add(join(bdir, prep_fp), arcname=prep_fp, recursive=False)
        # append the index file directly into the tarball from memory
        info = TarInfo(name='%s-%s-%s.txt' % (portal, study_status, ts))
        txt_hd = BytesIO()
        txt_hd.write(bytes('\n'.join(txt_lines), 'ascii'))
        txt_hd.seek(0)
        info.size = len(txt_hd.read())
        txt_hd.seek(0)
        tgz.addfile(tarinfo=info, fileobj=txt_hd)

    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)
    # atomically publish: build under a -building name, then rename
    rename(tgz_name, tgz_name_final)
    vals = [
        ('filepath', tgz_name_final[len(working_dir):], r_client.set),
        ('md5sum', md5sum.hexdigest(), r_client.set),
        ('time', time, r_client.set)]
    for k, v, f in vals:
        redis_key = '%s:release:%s:%s' % (portal, study_status, k)
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)
def generate_biom_and_metadata_release(study_status='public'):
    """Generate a list of biom/meatadata filepaths and a tgz of those files

    For every BIOM artifact of every study with the given status, bundle
    the biom file plus its sample and prep templates into a tgz with an
    index text file, and record the tgz path, md5 and creation time in
    redis.

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having this exposed helps with testing. The other
        options are 'private' and 'sandbox'
    """
    studies = qdb.study.Study.get_by_status(study_status)
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir
    portal = qiita_config.portal
    bdir = qdb.util.get_db_files_base_dir()
    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    data = []
    for s in studies:
        # [0] latest is first, [1] only getting the filepath
        sample_fp = relpath(s.sample_template.get_filepaths()[0][1], bdir)

        for a in s.artifacts(artifact_type='BIOM'):
            if a.processing_parameters is None:
                continue

            cmd_name = a.processing_parameters.command.name

            # this loop is necessary as in theory an artifact can be
            # generated from multiple prep info files
            human_cmd = []
            for p in a.parents:
                pp = p.processing_parameters
                pp_cmd_name = pp.command.name
                if pp_cmd_name == 'Trimming':
                    human_cmd.append('%s @ %s' % (
                        cmd_name, str(pp.values['length'])))
                else:
                    human_cmd.append('%s, %s' % (cmd_name, pp_cmd_name))
            human_cmd = ', '.join(human_cmd)

            for _, fp, fp_type in a.filepaths:
                # skip non-biom files and the 16s-only variants
                if fp_type != 'biom' or 'only-16s' in fp:
                    continue
                fp = relpath(fp, bdir)
                # format: (biom_fp, sample_fp, prep_fp, qiita_artifact_id,
                #          human readable name)
                for pt in a.prep_templates:
                    # stop at the first non-qiime prep filepath; that is
                    # the one bundled below
                    for _, prep_fp in pt.get_filepaths():
                        if 'qiime' not in prep_fp:
                            break
                    prep_fp = relpath(prep_fp, bdir)
                    data.append((fp, sample_fp, prep_fp, a.id, human_cmd))

    # writing text and tgz file
    ts = datetime.now().strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases')
    if not exists(tgz_dir):
        makedirs(tgz_dir)
    tgz_name = join(tgz_dir, '%s-%s-building.tgz' % (portal, study_status))
    tgz_name_final = join(tgz_dir, '%s-%s.tgz' % (portal, study_status))
    txt_hd = StringIO()
    with topen(tgz_name, "w|gz") as tgz:
        # writing header for txt
        txt_hd.write(
            "biom_fp\tsample_fp\tprep_fp\tqiita_artifact_id\tcommand\n")
        for biom_fp, sample_fp, prep_fp, artifact_id, human_cmd in data:
            txt_hd.write("%s\t%s\t%s\t%s\t%s\n" % (
                biom_fp, sample_fp, prep_fp, artifact_id, human_cmd))
            tgz.add(join(bdir, biom_fp), arcname=biom_fp, recursive=False)
            tgz.add(join(bdir, sample_fp), arcname=sample_fp, recursive=False)
            tgz.add(join(bdir, prep_fp), arcname=prep_fp, recursive=False)
        txt_hd.seek(0)
        info = TarInfo(name='%s-%s-%s.txt' % (portal, study_status, ts))
        # NOTE(review): `.buf` is a Python 2 cStringIO attribute; on
        # Python 3 io.StringIO this raises AttributeError (and addfile
        # needs a bytes stream) — confirm target runtime.
        info.size = len(txt_hd.buf)
        tgz.addfile(tarinfo=info, fileobj=txt_hd)

    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)
    # atomically publish: build under a -building name, then rename
    rename(tgz_name, tgz_name_final)
    vals = [
        ('filepath', tgz_name_final[len(working_dir):], r_client.set),
        ('md5sum', md5sum.hexdigest(), r_client.set),
        ('time', time, r_client.set)]
    for k, v, f in vals:
        redis_key = '%s:release:%s:%s' % (portal, study_status, k)
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)
def test_generate_biom_and_metadata_release(self):
    """Run generate_biom_and_metadata_release twice (flipping the
    trailing '/' of base_data_dir in between) and validate the tgz
    contents each time via a shared local helper.
    """
    level = 'private'
    qdb.meta_util.generate_biom_and_metadata_release(level)
    portal = qiita_config.portal
    working_dir = qiita_config.working_dir
    vals = [
        ('filepath', r_client.get),
        ('md5sum', r_client.get),
        ('time', r_client.get)]

    def _release_tgz():
        # we are storing the [0] filepath, [1] md5sum and [2] time but we
        # are only going to check the filepath contents so ignoring the
        # others
        tgz = vals[0][1]('%s:release:%s:%s' % (portal, level, vals[0][0]))
        return join(working_dir, tgz)

    def _check_tgz(tgz):
        # shared verification, identical for both release runs
        tmp = topen(tgz, "r:gz")
        tgz_obs = [ti.name for ti in tmp]
        tmp.close()
        # files names might change due to updates and patches so just
        # check that the prefix exists.
        fn = 'processed_data/1_study_1001_closed_reference_otu_table.biom'
        # yes, this file is there twice
        for _ in range(2):
            self.assertTrue(fn in tgz_obs)
            tgz_obs.remove(fn)
        # let's check the next biom
        fn = ('processed_data/1_study_1001_closed_reference_otu_table_'
              'Silva.biom')
        self.assertTrue(fn in tgz_obs)
        tgz_obs.remove(fn)
        # now let's check prep info files based on their suffix, just
        # take the first one and check/rm the occurances of that file:
        # 3 times
        fn_prep = [f for f in tgz_obs
                   if f.startswith('templates/1_prep_1_')][0]
        for _ in range(3):
            self.assertTrue(fn_prep in tgz_obs)
            tgz_obs.remove(fn_prep)
        # sample info file, also 3 times (preps are already removed so
        # the broader prefix is safe here)
        fn_sample = [f for f in tgz_obs if f.startswith('templates/1_')][0]
        for _ in range(3):
            self.assertTrue(fn_sample in tgz_obs)
            tgz_obs.remove(fn_sample)
        # now we should only have the text file
        txt = tgz_obs.pop()
        # now it should be empty
        self.assertEqual(tgz_obs, [])
        tmp = topen(tgz, "r:gz")
        fhd = tmp.extractfile(txt)
        txt_obs = fhd.readlines()
        tmp.close()
        txt_exp = [
            'biom_fp\tsample_fp\tprep_fp\tqiita_artifact_id\tcommand\n',
            'processed_data/1_study_1001_closed_reference_otu_table.biom\t'
            '%s\t%s\t4\tPick closed-reference OTUs, Split libraries FASTQ'
            '\n' % (fn_sample, fn_prep),
            'processed_data/1_study_1001_closed_reference_otu_table.biom\t'
            '%s\t%s\t5\tPick closed-reference OTUs, Split libraries FASTQ'
            '\n' % (fn_sample, fn_prep),
            'processed_data/1_study_1001_closed_reference_otu_table_Silva'
            '.biom\t%s\t%s\t6\tPick closed-reference OTUs, '
            'Split libraries FASTQ\n' % (fn_sample, fn_prep)]
        self.assertEqual(txt_obs, txt_exp)

    tgz = _release_tgz()
    self.files_to_remove.extend([tgz])
    _check_tgz(tgz)

    # whatever the configuration was, we will change to settings so we can
    # test the other option when dealing with the end '/'
    with qdb.sql_connection.TRN:
        qdb.sql_connection.TRN.add(
            "SELECT base_data_dir FROM settings")
        obdr = qdb.sql_connection.TRN.execute_fetchlast()
        if obdr[-1] == '/':
            bdr = obdr[:-1]
        else:
            bdr = obdr + '/'
        qdb.sql_connection.TRN.add(
            "UPDATE settings SET base_data_dir = '%s'" % bdr)
        bdr = qdb.sql_connection.TRN.execute()

    qdb.meta_util.generate_biom_and_metadata_release(level)
    _check_tgz(_release_tgz())

    # returning configuration
    with qdb.sql_connection.TRN:
        qdb.sql_connection.TRN.add(
            "UPDATE settings SET base_data_dir = '%s'" % obdr)
        bdr = qdb.sql_connection.TRN.execute()
else: print("%s: %s" % (FILE_NOT_FOUND, fn), file=stderr) exit(1) if not fn.lower().endswith('.tar.gz'): print("%s: %s" % (NOT_TAR_GZ, fn), file=stderr) exit(1) # load data to dict stats = dict() header_order = list() item_order = dict() item_set = dict() for full_fn in argv[1:]: # load tar.gz and check for validity try: tar = topen(full_fn, 'r:gz') except: warn("Unable to open tar: %s" % full_fn) fn = full_fn.split('/')[-1] html_fn = None for tar_fn in tar.getnames(): if tar_fn.split('/')[-1].lower() == QUALIMAP_REPORT_HTML.lower(): html_fn = tar_fn break if html_fn is None: print("%s: %s" % (HTML_NOT_FOUND, fn), file=stderr) exit(1) # prep for parsing this HTML file html_lines = [ l.strip()
def generate_plugin_releases():
    """Generate releases for plugins

    For every active plugin command with a post-processing step: dump its
    archived feature values per merging scheme as JSON, run the plugin's
    post-processing command, tgz the whole release directory, and record
    filepath/md5sum/time in redis.
    """
    archive_cls = qdb.archive.Archive
    config = ConfigurationManager()
    working_dir = config.working_dir

    # every command of an active plugin that declares a post-processing cmd
    commands = []
    for software in qdb.software.Software.iter(active=True):
        for command in software.commands:
            if command.post_processing_cmd is not None:
                commands.append(command)

    tnow = datetime.now()
    ts = tnow.strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases', 'archive')
    create_nested_path(tgz_dir)
    tgz_dir_release = join(tgz_dir, ts)
    create_nested_path(tgz_dir_release)

    for cmd in commands:
        # merging schemes that mention this command by name
        schemes = (v for _, v in archive_cls.merging_schemes().items()
                   if cmd.name in v)
        for ms in schemes:
            ms_fp = join(tgz_dir_release, sub('[^0-9a-zA-Z]+', '', ms))
            create_nested_path(ms_fp)

            # dump the non-empty archived feature values as JSON
            pfp = join(ms_fp, 'archive.json')
            raw_values = archive_cls.retrieve_feature_values(
                archive_merging_scheme=ms)
            archives = {}
            for key, value in raw_values.items():
                if value != '':
                    archives[key] = loads(value)
            with open(pfp, 'w') as f:
                dump(archives, f)

            # now let's run the post_processing_cmd; concatenate any other
            # parameters into a string, then append the archives file and
            # output dir parameters
            ppc = cmd.post_processing_cmd
            extra = ["%s=%s" % (k, v)
                     for k, v in ppc['script_params'].items()]
            params = ' '.join(extra)
            params = ("%s --fp_archive=%s --output_dir=%s" % (
                params, pfp, ms_fp))
            ppc_cmd = "%s %s %s" % (
                ppc['script_env'], ppc['script_path'], params)

            p_out, p_err, rv = qdb.processing_job._system_call(ppc_cmd)
            p_out = p_out.rstrip()
            if rv != 0:
                raise ValueError('Error %d: %s' % (rv, p_out))
            # parse stdout; also serves as a JSON validity check
            p_out = loads(p_out)

    # tgz-ing all files; build under a "-building" name, rename once done
    tgz_name = join(tgz_dir, 'archive-%s-building.tgz' % ts)
    tgz_name_final = join(tgz_dir, 'archive.tgz')
    with topen(tgz_name, "w|gz") as tgz:
        tgz.add(tgz_dir_release, arcname=basename(tgz_dir_release))

    # getting the release md5, streamed in 4 KiB chunks
    md5sum = md5()
    with open(tgz_name, "rb") as f:
        chunk = f.read(4096)
        while chunk != b"":
            md5sum.update(chunk)
            chunk = f.read(4096)

    rename(tgz_name, tgz_name_final)

    entries = (
        ('filepath', tgz_name_final[len(working_dir):], r_client.set),
        ('md5sum', md5sum.hexdigest(), r_client.set),
        ('time', tnow.strftime('%m-%d-%y %H:%M:%S'), r_client.set))
    for k, v, setter in entries:
        redis_key = 'release-archive:%s' % k
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        setter(redis_key, v)