def process_file(filename, folder=None):
    """Parse call records out of the tar archive *filename*.

    For every member of the archive, parse its lines into call-record
    dicts (tagging each with an HH:MM time derived from the member name
    and a ``failed`` flag) and yield the list of records for that member.

    Raises Exception when *filename* is empty/None.
    """
    if not filename:
        raise Exception("File not found")

    def _call_is_good(call):
        # A call with a non-zero duration succeeded outright.
        if call["duration"] and call["duration"] != "0":
            return True
        # Otherwise: not a known-failed status and long enough ringtone.
        return bool(call["status"] not in ["487", "402"]
                    and int(call["num_call_ringtone"]) > 500)

    current_name = None
    for archive, member in _extract_tar(filename):
        if current_name != member.name:
            current_name = member.name
            logger.info("Parsing: %s" % current_name)
        handle = archive.extractfile(member)
        records = []
        for raw_line in _extract_data(handle):
            record = _extract_multiple_data(raw_line)
            # The member name encodes the time, e.g. "...HHMM.ext".
            stem = current_name.split(".")[0]
            record["time"] = "{}:{}".format(stem[-4:-2], stem[-2:])
            record["failed"] = not _call_is_good(record)
            records.append(record)
        yield records
def load_chunk(tarfile, size=None):
    """Load a number of images from a single imagenet .tar file.

    This function also converts the image from grayscale to RGB if
    necessary.

    Args:
        tarfile (tarfile.TarFile): The archive from which the files get
            loaded.
        size (Optional[Tuple[int, int]]): Resize the image to this size
            if provided.

    Returns:
        numpy.ndarray: Contains the image data in format [batch, w, h, c]
    """
    result = []
    filenames = []
    for member in tarfile.getmembers():
        filename = member.path
        content = tarfile.extractfile(member)
        img = Image.open(content)
        # Paste onto a fresh RGB canvas: converts grayscale/palette
        # images to 3-channel RGB without touching true-color images.
        rgbimg = Image.new("RGB", img.size)
        rgbimg.paste(img)
        if size is not None:  # fix: compare to None with `is not`, not `!=`
            rgbimg = rgbimg.resize(size, Image.ANTIALIAS)
        # NOTE(review): np.array(rgbimg) is laid out (height, width, 3)
        # while rgbimg.size is (width, height); for non-square images this
        # reshape silently scrambles rows — confirm intended layout.
        result.append(np.array(rgbimg).reshape(1, rgbimg.size[0],
                                               rgbimg.size[1], 3))
        filenames.append(filename)
    return np.concatenate(result), filenames
def hashcalc(tarfile, filename, member, hashtype, extract, data):
    """Hash one archive member and append the hex digest to *data*.

    Args:
        tarfile: open tarfile.TarFile to read from.
        filename: member name inside the archive (also the output path
            prefix when extracting).
        member: the tarfile.TarInfo for *filename*.
        hashtype: 'md5', 'sha1', 'sha256' or 'sha512'.
        extract: 1 to also write the member to disk (as
            "<filename>_<digest>") with its original mode/owner.
        data: list that receives the digest ('0' for non-regular files).

    Returns:
        The same *data* list, for chaining.
    """
    hashtxt = 0
    if member.isdir():
        # Directories carry no content to hash; optionally recreate them.
        if extract == 1:
            if not os.path.exists(filename):
                os.makedirs(filename)
            os.chmod(filename, member.mode)
            os.chown(filename, member.uid, member.gid)
    if member.isfile():
        filecontents = tarfile.extractfile(filename).read()
        # Fix: hashlib.new() replaces the if/elif chain (which shadowed the
        # builtin `hash` and raised NameError for unknown algorithms) and
        # raises ValueError for unsupported hashtype values instead.
        digest = hashlib.new(hashtype, filecontents)
        hashtxt = digest.hexdigest()
        data.append(hashtxt)
        if extract == 1:
            filenamechk = filename + "_" + hashtxt
            if not os.path.isfile(filenamechk):
                # Fix: close the output handle deterministically.
                with open(filenamechk, 'wb') as fout:
                    fout.write(filecontents)
                os.chmod(filenamechk, member.mode)
                os.chown(filenamechk, member.uid, member.gid)
    else:
        # Non-regular member (directory, symlink, ...): record a placeholder.
        data.append('0')
    return data
def load_chunk(tarfile, size=None):
    """Load a number of images from a single imagenet .tar file.

    This function also converts the image from grayscale to RGB if
    necessary.

    Args:
        tarfile (tarfile.TarFile): The archive from which the files get
            loaded.
        size (Optional[Tuple[int, int]]): Resize the image to this size
            if provided.

    Returns:
        numpy.ndarray: Contains the image data in format [batch, w, h, c]
    """
    result = []
    filenames = []
    for member in tarfile.getmembers():
        filename = member.path
        content = tarfile.extractfile(member)
        img = Image.open(content)
        # Paste onto a fresh RGB canvas: converts grayscale/palette
        # images to 3-channel RGB without touching true-color images.
        rgbimg = Image.new("RGB", img.size)
        rgbimg.paste(img)
        if size is not None:  # fix: compare to None with `is not`, not `!=`
            rgbimg = rgbimg.resize(size, Image.ANTIALIAS)
        # NOTE(review): np.array(rgbimg) is laid out (height, width, 3)
        # while rgbimg.size is (width, height); for non-square images this
        # reshape silently scrambles rows — confirm intended layout.
        result.append(
            np.array(rgbimg).reshape(1, rgbimg.size[0], rgbimg.size[1], 3))
        filenames.append(filename)
    return np.concatenate(result), filenames
def read_sp_manifest_file(path):
    """Load the job manifest stored below *path* inside the archive.

    Returns the parsed JSON state point, or None when the archive holds
    no manifest entry at that location (KeyError from extractfile).
    """
    # Must use forward slashes, not os.path.sep.
    manifest_name = _tarfile_path_join(path, project.Job.FN_MANIFEST)
    try:
        with closing(tarfile.extractfile(manifest_name)) as manifest:
            return json.loads(manifest.read())
    except KeyError:
        pass
def __init__(self, tarfile, file):
    """Extract *file* from the archive and accumulate its MD5 digest.

    The member is streamed in 100 KiB chunks so large files never have
    to be held in memory at once. Leaves the open handle on ``self.fh``,
    the digest object on ``self.hsh`` and the (empty) final chunk on
    ``self.data``.
    """
    self.fh = tarfile.extractfile(file)
    self.hsh = hashlib.md5()
    while True:
        self.data = self.fh.read(100 * 1024)
        if not self.data:
            break
        self.hsh.update(self.data)
def read_sp_manifest_file(path):
    """Read the state-point manifest located at *path* inside the archive.

    Returns the decoded JSON document, or None when no manifest entry
    exists (KeyError from extractfile).
    """
    fn_manifest = os.path.join(path, project.Job.FN_MANIFEST)
    try:
        with closing(tarfile.extractfile(fn_manifest)) as manifest:
            raw = manifest.read()
            # json.loads only accepts bytes starting with Python 3.6.
            if sys.version_info < (3, 6):
                raw = raw.decode()
            return json.loads(raw)
    except KeyError:
        pass
def __getPostScript(self, tarfile, key, package):
    """
    Writes a post-install script to the file system and makes it
    ready to be executed.
    """
    # Script lands in /tmp as "<package>-<basename of key>".
    scriptname = '/tmp/%s-%s' % (package, key.split('/')[-1])
    script = open(scriptname, "w")
    script.write(tarfile.extractfile(key).read())
    script.close()
    # Owner read/write/execute only (0o700 == legacy 0700 literal).
    os.chmod(scriptname, 0o700)
    return scriptname
def extract_icon(tarfile, iconName, newIconName):
    """Extract the icon *iconName* from *tarfile* into *newIconName*.

    A leading '/' is stripped from the member name, since tar members are
    stored with relative paths.

    Returns True on success; logs an error and returns None otherwise.
    """
    extractName = iconName
    if iconName.startswith('/'):
        extractName = iconName[1:]
    try:
        iconFile = tarfile.extractfile(extractName)
        # Fix: icons are binary data, so write in binary mode, and use a
        # with-block so the handle is closed even if the write fails.
        with open(newIconName, "wb") as outicon:
            outicon.write(iconFile.read())
        iconFile.close()
        logging.debug("wrote iconfile '%s' (from '%s') " % (os.path.basename(outicon.name),iconName))
        return True
    except Exception as e:  # fix: py3-compatible except syntax
        # we may sometimes get very confusing errors from tarfile here
        # (like 'filename None not found' from xmms) - this usually means something is strange in the
        # tarball eg. xmms.xpm is a symlink to the (non-existant) xmms_mini.xpm
        logging.error("ERROR: Icon '%s' for could not be obtained: %s " % (iconName,e))
def extract_icon(tarfile, iconName, newIconName):
    """Extract the icon *iconName* from *tarfile* into *newIconName*.

    A leading '/' is stripped from the member name, since tar members are
    stored with relative paths.

    Returns True on success; logs an error and returns None otherwise.
    """
    extractName = iconName
    if iconName.startswith('/'):
        extractName = iconName[1:]
    try:
        iconFile = tarfile.extractfile(extractName)
        # Fix: icons are binary data, so write in binary mode, and use a
        # with-block so the handle is closed even if the write fails.
        with open(newIconName, "wb") as outicon:
            outicon.write(iconFile.read())
        iconFile.close()
        logging.debug("wrote iconfile '%s' (from '%s') " %
                      (os.path.basename(outicon.name), iconName))
        return True
    except Exception as e:  # fix: py3-compatible except syntax
        # we may sometimes get very confusing errors from tarfile here
        # (like 'filename None not found' from xmms) - this usually means something is strange in the
        # tarball eg. xmms.xpm is a symlink to the (non-existant) xmms_mini.xpm
        logging.error("ERROR: Icon '%s' for could not be obtained: %s " % (iconName, e))
def action(self, tarfile, tarinfo):
    """Translate one tar entry into the matching package action object.

    Regular files, directories, symlinks and hardlinks each map onto
    their dedicated action class; anything else becomes UnknownAction.
    """
    # Attributes shared by file and directory actions.
    perms = dict(mode=oct(stat.S_IMODE(tarinfo.mode)),
                 owner=tarinfo.uname,
                 group=tarinfo.gname,
                 path=tarinfo.name)
    if tarinfo.isreg():
        return file.FileAction(tarfile.extractfile(tarinfo),
            timestamp=misc.time_to_timestamp(tarinfo.mtime), **perms)
    if tarinfo.isdir():
        return directory.DirectoryAction(**perms)
    if tarinfo.issym():
        return link.LinkAction(path=tarinfo.name,
            target=tarinfo.linkname)
    if tarinfo.islnk():
        return hardlink.HardLinkAction(path=tarinfo.name,
            target=tarinfo.linkname)
    return unknown.UnknownAction(path=tarinfo.name)
def read_sp_manifest_file(path):
    """Read state point from the manifest file.

    Parameters
    ----------
    path : str
        Path to manifest file.

    Returns
    -------
    dict
        state point, or None when the archive has no manifest entry.

    """
    # Must use forward slashes, not os.path.sep.
    manifest_path = _tarfile_path_join(path, project.Job.FN_MANIFEST)
    try:
        with closing(tarfile.extractfile(manifest_path)) as fh:
            return json.loads(fh.read())
    except KeyError:
        pass
def stream_read_file(tarfile: Any, path: str, max_size: int) -> bytes:
    """
    Instead of reading everything in one go which is vulnerable to
    zip bombs, stream and accumulate the bytes
    :param tarfile: archive to read from
    :param path: path to file to read in tar file
    :param max_size: maximum allowed size
    :raises MaxFileSizeExceeded: if the maximum size was reached
    :return: the file as binary
    """
    file = tarfile.extractfile(path)
    size = 0
    result = b''
    while True:
        chunk = file.read(1024)
        if not chunk:
            break
        # Fix: count the bytes actually read. The old code added 1024
        # *before* reading, so a file smaller than max_size whose length
        # wasn't a multiple of 1024 could be falsely rejected (e.g. a
        # 900-byte file with max_size=1000 raised on the first chunk).
        size += len(chunk)
        if size > max_size:
            msg = 'file %s was bigger than allowed %i bytes' % (path, max_size)
            raise MaxFileSizeExceeded(msg)
        result += chunk
    return result
def _get_json_from_tarfile(tarfile, json_name):
    """Decode the archive member *json_name* as UTF-8 JSON and return it."""
    member = tarfile.getmember(json_name)
    raw = tarfile.extractfile(member).read()
    return json.loads(raw.decode("utf8"))
def _package_chart(self, tarfile, version=None, **kwargs):
    '''Internal Helper

    Internal method to make it easier to hanle closing the tarfile
    passed here automatically on exit.

    Rewrites the chart archive: stamps *version* into Chart.yaml,
    interpolates any configured ``values`` templates into values.yaml,
    and returns the bytes of the regenerated .tgz archive.
    '''
    def get_data(filename):
        # Members are stored below the chart name, e.g. "<name>/Chart.yaml".
        membername = os.path.join(self.name, filename)
        yaml = tarfile.extractfile(membername)
        # RoundTripLoader preserves comments/ordering on re-dump.
        return membername, ruamel.yaml.load(
            yaml, Loader=ruamel.yaml.RoundTripLoader)

    chart_file, chart_data = get_data('Chart.yaml')
    chart_data['version'] = version

    values_file, values_data = get_data('values.yaml')
    values = self.data.get('values', None)
    if values:
        # TODO(kerrin) expand the amount of data available
        # for users to control
        data = {
            'version': version,
            'name': self.name,
        }
        data.update(kwargs)

        def expand_values(source, expanded):
            # Recursively walk the user-supplied template dict and
            # str.format() each leaf against `data`, updating the parsed
            # values.yaml structure in place.
            for key, value in source.items():
                if isinstance(value, dict):
                    try:
                        expand_values(value, expanded[key])
                    except KeyError as e:
                        # Template refers to a key missing from values.yaml.
                        raise windlass.exc.MissingEntryInChartValues(
                            expected_source=source,
                            missing_key=e.args[0],
                            values_filename=values_file,
                            chart_name=self.name)
                else:
                    newvalue = value.format(**data)
                    expanded[key] = newvalue

        # Update by reference the values_data dictionary based on
        # the format of the supplied values field.
        expand_values(values, values_data)

    with tempfile.NamedTemporaryFile() as tmp_file:
        # NOTE(review): `tarfile` here is the open TarFile argument;
        # `.open` resolves to the TarFile.open classmethod via the
        # instance — confirm this shadowing is intentional.
        with tarfile.open(tmp_file.name, 'w:gz') as out:
            for member in tarfile.getmembers():
                if member.name == chart_file:
                    # Override the size of the file
                    datastr = ruamel.yaml.dump(
                        chart_data, Dumper=ruamel.yaml.RoundTripDumper)
                    databytes = datastr.encode('utf-8')
                    member.size = len(databytes)
                    out.addfile(member, io.BytesIO(databytes))
                elif member.name == values_file:
                    # Override the size of the file
                    datastr = ruamel.yaml.dump(
                        values_data, Dumper=ruamel.yaml.RoundTripDumper)
                    databytes = datastr.encode('utf-8')
                    member.size = len(databytes)
                    out.addfile(member, io.BytesIO(databytes))
                else:
                    # Everything else is copied through unchanged.
                    out.addfile(member, tarfile.extractfile(member.name))

        with open(tmp_file.name, 'rb') as fp:
            return fp.read()
def main():
    """Cron entry point: scan every mirrored package for manpages and
    index them (pages, symlinked pages and apropos lines) into SQLite."""
    logging.info('Beginning cron job')
    conn = sqlite3.connect(conf.DSN)
    c = conn.cursor()
    # Insert-or-lookup caches mapping natural keys to row ids.
    release_cache = DBCache(conn.cursor(), 'releases', 'name')
    package_cache = DBCache(conn.cursor(), 'packages', 'name')
    locale_cache = DBCache(conn.cursor(), 'locales', 'name')
    section_cache = DBCache(conn.cursor(), 'sections', 'section')
    for release, package in iter_packages():
        release_id = release_cache[release]
        package_id = package_cache[package['Package']]
        package_path = conf.MIRROR + '/' + package['Filename']
        if not os.path.exists(package_path):
            logging.error('File not found for package {0} ({1})'\
                .format(package['Package'], package['Filename']))
            continue
        try:
            # NOTE(review): shadows the stdlib `tarfile` module name.
            tarfile = get_tarfile(package_path)
        except CorruptArchiveException:
            continue
        for tarinfo in tarfile:
            match = MAN_REGEX.search(tarinfo.name)
            simple_match = SIMPLE_MAN_REGEX.search(tarinfo.name)
            if simple_match and not match:
                logging.info('Simple regex matched line but fancy didn\'t: '
                             '{0} in {1}'.format(tarinfo.name, package_path))
            if not match:
                continue
            section = match.group('section') + \
                match.group('extrasection')
            section_id = section_cache[section]
            name = match.group('manpage')
            if '/' in name:
                logging.error('Invalid manpage name in package {0}.'\
                    .format(package['Package']))
                continue
            if match.group('locale'):
                # strip leading /
                locale = match.group('locale')[1:]
            else:
                locale = 'DEFAULT_LOCALE'
            locale_id = locale_cache[locale]
            if tarinfo.issym():
                # Resolve the link target relative to the link's directory
                # and record it in the symlinks table instead of manpages.
                target = os.path.dirname(tarinfo.name)
                target = target + '/' + tarinfo.linkname
                target = './' + os.path.normpath(target)
                target_match = MAN_REGEX.search(target)
                if not target_match:
                    logging.error('The symlink for {0} in {1} is really '
                                  'broken.'.format(tarinfo.name, package_path))
                    continue
                if target_match.group('locale'):
                    target_locale = target_match.group('locale')[1:]
                else:
                    target_locale = 'DEFAULT_LOCALE'
                c.execute(
                    """INSERT INTO symlinks (link_release,
link_section, link_name, link_locale, target_release, target_section,
target_name, target_locale) VALUES (?,
?, ?, ?, ?, ?, ?, ?)""",
                    (release_id, section_id, name, locale_id,
                     release_id,
                     section_cache[target_match.group('section') +
                                   target_match.group('extrasection')],
                     target_match.group('manpage'),
                     locale_cache[target_locale]))
                continue
            try:
                contents = tarfile.extractfile(tarinfo.name)
            except KeyError:
                logging.error('Unable to find file {0} in {1}, possibly a '
                              'symlink to something in another package.'\
                    .format(tarinfo.name, package_path))
                continue
            if contents == None:
                logging.error('Didn\'t find {0} in {1}'.format(
                    tarinfo.name, package_path))
                continue
            contents = contents.read()
            try:
                apropos = get_apropos(contents, name, locale)
            except AproposException as e:
                #logging.info('Apropos error {0} for {1} ({2})'\
                #    .format(e.args, package['Package'], line.strip()))
                apropos = None
            if conf.COPY_MANPAGES:
                # cache the troff file and save its path
                cache_dir = get_path(release, package['Package'],
                                     package['Version'], locale, section)
                path = cache_dir + '/' + name + '.gz'
                with open(path, 'wb') as fd:
                    fd.write(contents)
            else:
                # save the deb
                path = package['Filename']
            try:
                c.execute(
                    'INSERT INTO manpages '
                    '(id, release, section, package, name, path, version, '
                    'locale) VALUES (NULL, ?, ?, ?, ?, ?, ?, ?)',
                    (release_id, section_id, package_id, name, path,
                     package['Version'], locale_id))
                manpage_id = c.lastrowid
            except sqlite3.IntegrityError as e:
                logging.error('Duplicate primary key: '
                              '(release: {0}, section: {1}, package: {2}, '
                              'name: {3}, locale: {4})'.format(
                                  release, section_id, package['Package'],
                                  name, locale))
                continue
            c.execute('INSERT INTO aproposes (docid, apropos) VALUES (?, ?)',
                      (manpage_id, apropos))
    conn.commit()
    conn.close()
def get_data(filename):
    """Load a YAML member of the chart archive.

    Returns the full member name ("<chart name>/<filename>") together
    with the round-trip-parsed YAML document.
    """
    member = os.path.join(self.name, filename)
    raw = tarfile.extractfile(member)
    parsed = ruamel.yaml.load(raw, Loader=ruamel.yaml.RoundTripLoader)
    return member, parsed
def get_attribute_from_tarfile(attribute, tarfile):
    """Return the contents of member *attribute*, surrounding whitespace stripped."""
    handle = tarfile.extractfile(attribute)
    return handle.read().strip()
def commit_logs(b, wd, *args, **kwargs):
    """Commit a set of GDB test logs (from a test directory or a tarfile)
    to the bunsen repo *b* using working directory *wd*.

    Recognized kwargs: opts, push (default True), year_month, tarfile,
    tarballname, osver, testdir, tmpdir.

    Returns the commit id, or None when the testrun is rejected
    (e.g. missing year_month).
    """
    # Idiom: kwargs.get() replaces the "x if 'x' in kwargs else ..." chains.
    opts = kwargs.get('opts')
    push = kwargs.get('push', True)
    alt_year_month = kwargs.get('year_month')
    tarfile = kwargs.get('tarfile')
    tarballname = kwargs.get('tarballname')
    osver = kwargs.get('osver')
    allowed_fields = opts.allowed_fields if opts is not None and \
        hasattr(opts, 'allowed_fields') else []
    # for error reporting:
    testdir = kwargs.get('testdir')
    if tarfile is not None and testdir is None:
        testdir = '<tarfile>'
    # TODOXXX also extract datestamp for bunsen-push upload
    # XXX tmpdir is required for unxzing
    tmpdir = kwargs.get('tmpdir')
    tmpdir_created = False
    if tmpdir is None:
        tmpdir_created = True
        tmpdir = tempfile.mkdtemp()
    # flatten list of args to list of (path, OPTIONAL tarfile.TarInfo)
    logfiles = flatten_logfiles(args)
    for logfile, tarinfo in logfiles:
        # Bookkeeping artifacts, not test logs — don't add to commit:
        if logfile in ('BUNSEN_COMMIT', 'year_month.txt'):
            continue
        if logfile.startswith(('index.html', 'baseline', 'xfail',
                               'previous_')):
            continue
        if tarinfo is not None:
            # Member comes from a tarfile: copy it into tmpdir first.
            t = tarfile.extractfile(tarinfo)
            logname = os.path.basename(logfile)
            with open(os.path.join(tmpdir, logname), 'wb') as f:
                f.write(t.read())  # TODOXXX read_decode utf-8?
            logpath = os.path.join(tmpdir, logname)
        else:
            logpath = os.path.join(testdir, logfile)
        if os.path.isdir(logpath):
            continue  # don't add to commit
        add_testlog_or_xz(b, tmpdir, logpath)
    testrun = Testrun()
    all_cases = []
    gdb_README = pick_testlog(testdir, tmpdir, 'README.txt')
    gdb_sum = pick_testlog(testdir, tmpdir, 'gdb.sum')  # XXX parser autodetects .xz
    gdb_log = pick_testlog(testdir, tmpdir, 'gdb.log')  # XXX parser autodetects .xz
    testrun = parse_README(testrun, gdb_README)
    testrun.osver = osver
    testrun = parse_dejagnu_sum(testrun, gdb_sum, all_cases=all_cases)
    testrun = annotate_dejagnu_log(testrun, gdb_log, all_cases, verbose=False)
    for field_name in allowed_fields:
        # BUG FIX: was hasattr(opts, fieldname) — `fieldname` is undefined
        # (NameError whenever allowed_fields is non-empty).
        if hasattr(opts, field_name):
            # <TODO> opts should support dict operations, here and elsewhere
            testrun[field_name] = getattr(opts, field_name)
    if testrun is None:
        b.reset_all()
        return None  # TODOXXX Pass error message?
    b.add_testrun(testrun)
    if testrun.year_month is None and alt_year_month is not None:
        testrun.year_month = alt_year_month
    # TODOXXX handle year_month from tarballname
    if testrun.year_month is None:
        print("WARNING: skipped {} due to missing year_month"\
            .format(testdir))
        b.reset_all()
        return None  # TODOXXX Pass error message?
    # XXX To avoid huge working copies, use branch_extra to split testruns
    # branches by source buildbot:
    if 'osver' in testrun:
        commit_id = b.commit(opts.tag, wd=wd, push=False,
                             allow_duplicates=False,
                             extra_label=testrun.osver)
    else:
        # TODOXXX Need to extract osver more diligently for tarfile submissions:
        commit_id = b.commit(opts.tag, wd=wd, push=False,
                             allow_duplicates=False)
    #commit_id = b.commit(opts.tag, wd=wd, push=False, allow_duplicates=True, wd_index=wd_index, wd_testruns=wd_testruns)
    if push:
        wd.push_all()
    if tmpdir_created:
        shutil.rmtree(tmpdir)
    return commit_id
def main():
    """Cron entry point: scan every mirrored package for manpages and
    index them (pages, symlinked pages and apropos lines) into SQLite."""
    logging.info('Beginning cron job')
    conn = sqlite3.connect(conf.DSN)
    c = conn.cursor()
    # Insert-or-lookup caches mapping natural keys to row ids.
    release_cache = DBCache(conn.cursor(), 'releases', 'name')
    package_cache = DBCache(conn.cursor(), 'packages', 'name')
    locale_cache = DBCache(conn.cursor(), 'locales', 'name')
    section_cache = DBCache(conn.cursor(), 'sections', 'section')
    for release, package in iter_packages():
        release_id = release_cache[release]
        package_id = package_cache[package['Package']]
        package_path = conf.MIRROR + '/' + package['Filename']
        if not os.path.exists(package_path):
            logging.error('File not found for package {0} ({1})'\
                .format(package['Package'], package['Filename']))
            continue
        try:
            # NOTE(review): shadows the stdlib `tarfile` module name.
            tarfile = get_tarfile(package_path)
        except CorruptArchiveException:
            continue
        for tarinfo in tarfile:
            match = MAN_REGEX.search(tarinfo.name)
            simple_match = SIMPLE_MAN_REGEX.search(tarinfo.name)
            if simple_match and not match:
                logging.info('Simple regex matched line but fancy didn\'t: '
                             '{0} in {1}'.format(tarinfo.name, package_path))
            if not match:
                continue
            section = match.group('section') + \
                match.group('extrasection')
            section_id = section_cache[section]
            name = match.group('manpage')
            if '/' in name:
                logging.error('Invalid manpage name in package {0}.'\
                    .format(package['Package']))
                continue
            if match.group('locale'):
                # strip leading /
                locale = match.group('locale')[1:]
            else:
                locale = 'DEFAULT_LOCALE'
            locale_id = locale_cache[locale]
            if tarinfo.issym():
                # Resolve the link target relative to the link's directory
                # and record it in the symlinks table instead of manpages.
                target = os.path.dirname(tarinfo.name)
                target = target + '/' + tarinfo.linkname
                target = './' + os.path.normpath(target)
                target_match = MAN_REGEX.search(target)
                if not target_match:
                    logging.error('The symlink for {0} in {1} is really '
                                  'broken.'.format(tarinfo.name, package_path))
                    continue
                if target_match.group('locale'):
                    target_locale = target_match.group('locale')[1:]
                else:
                    target_locale = 'DEFAULT_LOCALE'
                c.execute("""INSERT INTO symlinks (link_release,
link_section, link_name, link_locale, target_release, target_section,
target_name, target_locale) VALUES (?,
?, ?, ?, ?, ?, ?, ?)""",
                    (release_id, section_id, name, locale_id,
                     release_id,
                     section_cache[target_match.group('section') +
                                   target_match.group('extrasection')],
                     target_match.group('manpage'),
                     locale_cache[target_locale]))
                continue
            try:
                contents = tarfile.extractfile(tarinfo.name)
            except KeyError:
                logging.error('Unable to find file {0} in {1}, possibly a '
                              'symlink to something in another package.'\
                    .format(tarinfo.name, package_path))
                continue
            if contents == None:
                logging.error('Didn\'t find {0} in {1}'.format(tarinfo.name,
                                                               package_path))
                continue
            contents = contents.read()
            try:
                apropos = get_apropos(contents, name, locale)
            except AproposException as e:
                #logging.info('Apropos error {0} for {1} ({2})'\
                #    .format(e.args, package['Package'], line.strip()))
                apropos = None
            if conf.COPY_MANPAGES:
                # cache the troff file and save its path
                cache_dir = get_path(release, package['Package'],
                                     package['Version'], locale, section)
                path = cache_dir + '/' + name + '.gz'
                with open(path, 'wb') as fd:
                    fd.write(contents)
            else:
                # save the deb
                path = package['Filename']
            try:
                c.execute('INSERT INTO manpages '
                          '(id, release, section, package, name, path, version, '
                          'locale) VALUES (NULL, ?, ?, ?, ?, ?, ?, ?)',
                          (release_id, section_id, package_id, name, path,
                           package['Version'], locale_id))
                manpage_id = c.lastrowid
            except sqlite3.IntegrityError as e:
                logging.error('Duplicate primary key: '
                              '(release: {0}, section: {1}, package: {2}, '
                              'name: {3}, locale: {4})'.format(release,
                                  section_id, package['Package'], name,
                                  locale))
                continue
            c.execute('INSERT INTO aproposes (docid, apropos) VALUES (?, ?)',
                      (manpage_id, apropos))
    conn.commit()
    conn.close()
# Script fragment: combine several 10x "filtered gene/bc matrices" tarballs
# into one sparse matrix plus a combined barcode table.
# NOTE(review): `args`, and the imports of glob/tarfile/scipy/pd/datetime,
# are assumed to be defined earlier in the file — confirm.
os.makedirs(args.output_dir)  # fix: create before announcing success
print(datetime.now().strftime("%H:%M:%S>"),
      "created output directory called " + args.output_dir)
output_dir = args.output_dir + "/filtered_matrices_mex/hg19"

# %% Combine Inputs
combined_matrix = None
combined_barcodes = pd.DataFrame()
celltype_label = []

for filepath in glob.iglob(args.input_dir + "/*.tar.gz"):
    print(datetime.now().strftime("%H:%M:%S>"),
          "unpacking " + filepath[25:] + "...")
    # Fix: don't shadow the `tarfile` module with the open archive object.
    archive = tarfile.open(filepath, "r:gz")
    mtx_file = archive.extractfile("filtered_matrices_mex/hg19/matrix.mtx")
    # NOTE(review): [25:] assumes a fixed-length input_dir prefix — fragile.
    current_label = filepath[25:filepath.
                             find("_filtered_gene_bc_matrices.tar.gz")]
    current_matrix = scipy.io.mmread(mtx_file)
    # Fix: scipy.sparse.hstack cannot accept None as a block; seed the
    # accumulator with the first matrix instead.
    if combined_matrix is None:
        combined_matrix = current_matrix
    else:
        combined_matrix = scipy.sparse.hstack((combined_matrix,
                                               current_matrix))

    # also export the gene files. (will be overwritten each round but whatev)
    genes_file = archive.extractfile("filtered_matrices_mex/hg19/genes.tsv")
    genes = pd.read_csv(genes_file, header=None, sep="\t")

    # export the cell barcodes
    barcodes_file = archive.extractfile(
        "filtered_matrices_mex/hg19/barcodes.tsv")
    barcodes = pd.read_csv(barcodes_file, header=None)
    # Fix: DataFrame.append was removed in pandas 2.0; concat is equivalent.
    combined_barcodes = pd.concat([combined_barcodes, barcodes])