def read_cygport(dirpath, tf):
    try:
        with xtarfile.open(os.path.join(dirpath, tf), mode='r') as a:
            cygports = [m for m in a.getmembers() if m.name.endswith('.cygport')]

            if len(cygports) != 1:
                logging.info('srcpkg %s contains %d .cygport files' % (tf, len(cygports)))
                return None

            f = a.extractfile(cygports[0])
            content = f.read()

    except tarfile.ReadError:
        logging.error("srcpkg %s is not a valid compressed archive" % tf)
        return None

    try:
        content = content.decode()
    except UnicodeDecodeError:
        logging.error("utf8 decode error for .cygport in srcpkg %s" % tf)
        content = content.decode(errors='replace')

    # fold any line-continuations
    content = content.replace('\\\n', '')

    return content

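# Hedged usage sketch (not part of the original source): read_cygport() returns the
# decoded .cygport text, or None if the archive is unreadable or does not contain
# exactly one .cygport file. The directory and filename below are placeholders.
cygport_text = read_cygport('/path/to/uploads', 'example-1.0-1-src.tar.xz')
if cygport_text is not None:
    print(cygport_text)
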
def get_man_files(self, pkg, repo=None):
    if repo is None:
        repo = [db for db in self.sync_db.get_syncdbs() if db.get_pkg(pkg.name)][0].name
    local_db = self.files_db[repo]["path"]
    t = self._cached_tarfiles.setdefault(local_db, tarfile.open(str(local_db.resolve()), "r"))
    files = t.extractfile("{}-{}/files".format(pkg.name, pkg.version))

    for line in files.readlines():
        line = line.decode("utf-8").rstrip()
        if line.startswith(MANDIR) and not line.endswith("/"):
            yield line

def read_pkginfo_buildinfo(path):
    if not os.path.exists(path) or not os.path.isfile(path):
        raise Exception("This is not a valid path!")

    with xtarfile.open(path, "r") as fp:
        try:
            pkginfo = fp.extractfile(".PKGINFO").read()
            buildinfo = fp.extractfile(".BUILDINFO").read()
        except KeyError:
            # the archive is missing .PKGINFO or .BUILDINFO
            return "", ""

    return pkginfo.decode(), buildinfo.decode()

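# Hedged usage sketch (not in the original module): print both metadata files of a
# built Arch package; the path below is a placeholder.
pkginfo_text, buildinfo_text = read_pkginfo_buildinfo('/var/cache/pacman/pkg/example-1.0-1-x86_64.pkg.tar.zst')
print(pkginfo_text)
print(buildinfo_text)
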
def parse_repo(repopath):
    """
    Parses an Arch repo db file, and returns the repo name and the RepoPackage
    objects it contains.

    Arguments:
     repopath -- The path of a repository db file.
    """
    logger.info("Starting repo parsing")
    if not os.path.exists(repopath):
        logger.error("Could not read file %s", repopath)

    logger.info("Reading repo tarfile %s", repopath)
    filename = os.path.split(repopath)[1]

    m = re.match(r"^(.*)\.(db|files)\.tar(\..*)?$", filename)
    if m:
        reponame = m.group(1)
    else:
        logger.error("File does not have the proper extension")
        raise Exception("File does not have the proper extension")

    with tarfile.open(repopath, 'r') as repodb:
        logger.debug("Starting package parsing")
        newpkg = lambda: RepoPackage(reponame)
        pkgs = defaultdict(newpkg)

        for tarinfo in repodb.getmembers():
            if tarinfo.isreg():
                pkgid, fname = os.path.split(tarinfo.name)
                if fname == 'files':
                    # don't parse yet for speed and memory consumption reasons
                    files_data = repodb.extractfile(tarinfo)
                    pkgs[pkgid].files = files_data.read()
                    del files_data
                elif fname in ('desc', 'depends'):
                    data_file = repodb.extractfile(tarinfo)
                    data_file = io.TextIOWrapper(io.BytesIO(data_file.read()), encoding='UTF-8')
                    try:
                        pkgs[pkgid].populate(parse_info(pkgid, fname, data_file))
                    except UnicodeDecodeError:
                        logger.warning("Could not correctly decode %s, skipping file", tarinfo.name)
                    data_file.close()
                    del data_file

                logger.debug("Done parsing file %s/%s", pkgid, fname)

    logger.info("Finished repo parsing, %d total packages", len(pkgs))
    return (reponame, pkgs.values())

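# Hedged usage sketch (not part of the original module): parse a local copy of a
# repo database and report how many packages it describes; the path is a placeholder.
reponame, repo_packages = parse_repo('/srv/ftp/core/os/x86_64/core.db.tar.gz')
print('%s: %d packages' % (reponame, len(repo_packages)))
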
def get_man_contents(self, pkg):
    """
    Note: the content is yielded as `bytes`, its decoding is not a priori known
    """
    # first check if there are any man files at all to avoid useless downloads
    man_files = list(self.get_man_files(pkg))
    if not man_files:
        return

    # get the pkg tarball
    _pattern = "{}-{}-{}.pkg.tar.*".format(pkg.name, pkg.version, pkg.arch)
    if not list(f for f in self.cachedir.glob(_pattern) if not str(f).endswith(".part")):
        self._download_package(pkg)
    tarballs = sorted(f for f in self.cachedir.glob(_pattern) if not str(f).endswith(".part"))
    assert len(tarballs) > 0, _pattern
    tarball = tarballs[0]

    # extract man files
    with tarfile.open(str(tarball), "r") as t:
        hardlinks = []
        for file in man_files:
            info = t.getmember(file)
            # Hardlinks on the filesystem level are indistinguishable from normal files,
            # but in tar the first file added is a "file" and the subsequent ones are hardlinks.
            # To make sure that normal files are processed first, we postpone yielding of
            # the hardlinks.
            if info.islnk():
                if file.endswith(".gz"):
                    file = file[:-3]
                target = info.linkname
                if target.endswith(".gz"):
                    target = target[:-3]
                hardlinks.append(("hardlink", file, target))
            elif info.issym():
                if file.endswith(".gz"):
                    file = file[:-3]
                target = info.linkname
                if target.endswith(".gz"):
                    target = target[:-3]
                yield "symlink", file, target
            else:
                man = t.extractfile(file).read()
                if file.endswith(".gz"):
                    file = file[:-3]
                    man = gzip.decompress(man)
                yield "file", file, man
        yield from hardlinks

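# Hedged usage sketch (assumes 'finder' is an instance of the class defining
# get_man_files()/get_man_contents() and 'pkg' a package object from its sync db;
# neither name comes from the original source). It shows the three tuple shapes
# the generator yields.
for kind, name, data in finder.get_man_contents(pkg):
    if kind == "file":
        print(name, len(data), "bytes")
    else:
        # "symlink" and "hardlink" entries carry a target path instead of content
        print(kind, name, "->", data)
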
def extract_pkginfo(self, package):
    """Given a package (.tar.xz filename), extract and parse its .PKGINFO file as a dict"""
    with tarfile.open(package, mode='r') as tar:
        # Manual seeking to find .PKGINFO without having to uncompress the whole package
        while True:
            f = tar.next()
            if f is None:
                # reached the end of the archive without finding .PKGINFO
                raise KeyError('.PKGINFO not found in %s' % package)
            if f.name == '.PKGINFO':
                break

        pkginfo = tar.extractfile(f).readlines()

        # Parse .PKGINFO
        res = dict()
        for line in pkginfo:
            m = re.match(r'([^=]*) = (.*)', line.decode('utf8'))
            if m:
                # TODO: support multi-valued attributes
                key, value = m[1], m[2].strip()
                res[key] = value
        return res

def read_tar(f):
    result = {}

    try:
        with xtarfile.open(f, mode='r') as t:
            for m in t:
                if m.isfile():
                    # use a distinct name so the archive path 'f' is still available
                    # if a ReadError is raised while iterating
                    member_file = t.extractfile(m)
                    sha512 = sha512_file(member_file)
                else:
                    sha512 = None
                result[m.name] = TarMemberInfo(m, sha512)
    except tarfile.ReadError:
        # if we can't read the tar archive, we should never consider it to have
        # the same contents as another tar archive...
        result[f] = None

    return result

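# Hedged usage sketch (not in the original source): read_tar() maps member names to
# TarMemberInfo entries (assumed here to expose a 'sha512' attribute), so two source
# archives can be compared member-by-member. The paths are placeholders.
old_contents = read_tar('example-1.0-1-src.tar.xz')
new_contents = read_tar('example-1.0-2-src.tar.xz')
if old_contents.keys() == new_contents.keys():
    changed = [n for n, info in old_contents.items()
               if info is None or new_contents[n] is None
               or info.sha512 != new_contents[n].sha512]
    print('changed members: %s' % changed)
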
def scan(scandir, m, all_packages, arch, args):
    homedir = os.path.join(scandir, m.name)
    basedir = os.path.join(homedir, arch)

    packages = defaultdict(package.Package)
    move = MoveList(homedir)
    vault = MoveList()
    remove = []
    remove_success = []
    error = False
    mtimes = [('', 0)]
    ignored = 0

    logging.debug('reading packages from %s' % (basedir))

    # note mtime of any !ready file at top-level
    for ready in [os.path.join(basedir, '!ready'), os.path.join(basedir, 'release', '!ready')]:
        if os.path.exists(ready):
            mtime = os.path.getmtime(ready)
            mtimes.append(('', mtime))
            logging.debug('processing files with mtime older than %d' % (mtime))
            remove.append(ready)

    # we record a timestamp when 'ignoring as there is no !ready' warnings were
    # last emitted
    logging.debug("reminder-timestamp %d, interval %d, next reminder %d, current time %d" %
                  (m.reminder_time, REMINDER_INTERVAL, m.reminder_time + REMINDER_INTERVAL, time.time()))

    # scan package directories
    for (dirpath, _subdirs, files) in os.walk(os.path.join(basedir, 'release')):
        relpath = os.path.relpath(dirpath, homedir)
        removed_files = []

        # filter out files we don't need to consider
        for f in sorted(files):
            if f.endswith('.bak'):
                files.remove(f)

        # skip uninteresting directories
        if (not files) or (relpath == os.path.join(arch, 'release')):
            continue

        logging.debug('reading uploads from %s' % dirpath)

        # note the mtime of the !ready file
        if '!ready' in files:
            ready = os.path.join(dirpath, '!ready')
            mtime = os.path.getmtime(ready)
            mtimes.append((relpath + '/', mtime))
            remove.append(ready)
            files.remove('!ready')
            logging.debug("processing files below '%s' with mtime older than %d" % (relpath, mtime))
        else:
            # otherwise work back up a list of (path,mtimes) (which should be in
            # shortest-to-longest order, since os.walk() walks the tree
            # top-down), and use the mtime of the first (longest) matching path.
            while True:
                (path, mtime) = mtimes[-1]
                if relpath.startswith(path):
                    logging.debug("using mtime %d from subpath '%s' of '%s'" % (mtime, path, relpath))
                    break
                else:
                    mtimes.pop()

        # only process files newer than !ready
        for f in sorted(files):
            fn = os.path.join(dirpath, f)
            file_mtime = os.path.getmtime(fn)
            if file_mtime > mtime:
                if mtime == 0:
                    m.reminders_timestamp_checked = True

                    logging.debug("ignoring %s as there is no !ready" % fn)

                    # don't warn until file is at least REMINDER_GRACE old
                    if (file_mtime < (time.time() - REMINDER_GRACE)):
                        ignored += 1
                else:
                    logging.warning("ignoring %s as it is newer than !ready" % fn)

                files.remove(f)

        # any file remaining?
        if not files:
            continue

        # package doesn't appear in package list at all
        (_, _, pkgpath) = relpath.split(os.sep, 2)
        if not package.is_in_package_list(pkgpath, all_packages):
            logging.error("package '%s' is not in the package list" % relpath)
            continue

        # only process packages for which we are listed as a maintainer
        if not package.is_in_package_list(pkgpath, m.pkgs):
            logging.warning("package '%s' is not in the package list for maintainer '%s'" % (relpath, m.name))
            continue

        # see if we can fix-up any setup.hint files
        pvr = None
        ambiguous = False
        seen = False
        for f in sorted(files):
            # warn about legacy setup.hint uploads
            if f == 'setup.hint':
                logging.warning("'%s' seen, please update to cygport >= 0.23.0" % f)
                seen = True

            match = re.match(r'^([^-].*?)(-src|)\.tar' + common_constants.PACKAGE_COMPRESSIONS_RE + r'$', f)
            if match:
                if (pvr is not None) and (pvr != match.group(1)):
                    ambiguous = True
                pvr = match.group(1)

        if seen:
            if ambiguous or (pvr is None):
                error = True
                logging.error("'setup.hint' seen in %s, and couldn't determine what version it applies to", dirpath)
            else:
                old = "setup.hint"
                new = pvr + ".hint"
                logging.warning("renaming '%s' to '%s'" % (old, new))
                os.rename(os.path.join(dirpath, old), os.path.join(dirpath, new))
                files.remove(old)
                files.append(new)

        for f in sorted(files):
            match = re.match(r'^([^-].*)-src\.tar' + common_constants.PACKAGE_COMPRESSIONS_RE + r'$', f)
            if match:
                pvr = match.group(1)
                old = pvr + '.hint'
                new = pvr + '-src.hint'

                # see if we can fix-up missing -src.hint file
                if (old in files) and (new not in files):
                    logging.warning("copying '%s' to '%s'" % (old, new))
                    shutil.copy2(os.path.join(dirpath, old), os.path.join(dirpath, new))
                    files.append(new)

                    if f.replace('-src', '') not in files:
                        logging.info("discarding '%s'" % (old))
                        files.remove(old)
                        remove.append(os.path.join(dirpath, old))

                # see if we can fix-up missing homepage: in -src.hint file
                # check homepage: for liveliness and redirection
                # discard any keys which are invalid in a -src.hint
                if (new in files):
                    fixes.fix_hint(dirpath, new, f, ['homepage', 'invalid_keys'])

        # filter out files we don't need to consider
        for f in sorted(files):
            fn = os.path.join(dirpath, f)
            rel_fn = os.path.join(relpath, f)
            logging.debug("processing %s" % rel_fn)

            # ignore !packages (which we no longer use)
            # ignore !mail and !email (which we have already read)
            if f in ['!packages', '!mail', '!email']:
                files.remove(f)
                continue

            # ignore in-progress sftp uploads. Net::SFTP::SftpServer uses
            # temporary upload filenames ending with '.SftpXFR.<pid>'
            if re.search(r'\.SftpXFR\.\d*$', f):
                logging.debug("ignoring temporary upload file %s" % fn)
                files.remove(f)
                continue

            # a remove file, which indicates some other file should be removed
            if f.startswith('-'):
                if ('*' in f) or ('?' in f):
                    logging.error("remove file %s name contains metacharacters, which are no longer supported" % fn)
                    error = True
                elif os.path.getsize(fn) != 0:
                    logging.error("remove file %s is not empty" % fn)
                    error = True
                else:
                    vault.add(relpath, f[1:])
                    remove_success.append(fn)
                    removed_files.append(f[1:])

                files.remove(f)
                continue

            # verify compressed archive files are valid
            match = re.search(r'\.tar' + common_constants.PACKAGE_COMPRESSIONS_RE + r'$', f)
            if match:
                valid = True

                try:
                    # we need to extract all of an archive contents to validate it
                    with xtarfile.open(fn, mode='r') as a:
                        a.getmembers()
                except Exception as e:
                    valid = False
                    logging.error("exception %s while reading %s" % (type(e).__name__, fn))
                    logging.debug('', exc_info=True)

                if not valid:
                    files.remove(f)
                    continue

            # does file already exist in release area?
            dest = os.path.join(args.rel_area, relpath, f)
            if os.path.isfile(dest):
                if not f.endswith('.hint'):
                    if filecmp.cmp(dest, fn, shallow=False):
                        logging.info("discarding, identical %s is already in release area" % fn)
                        remove_success.append(fn)
                    else:
                        logging.error("discarding, different %s is already in release area (perhaps you should rebuild with a different version-release identifier?)" % fn)
                        remove.append(fn)
                        error = True
                    files.remove(f)
                else:
                    if filecmp.cmp(dest, fn, shallow=False):
                        logging.debug("identical %s is already in release area" % fn)
                    else:
                        logging.debug("different %s is already in release area" % fn)
                    # we always consider .hint files as needing to be moved, as
                    # we currently can't have a valid package without one
                    move.add(relpath, f)
            else:
                move.add(relpath, f)

        # read and validate package
        if files:
            if package.read_package_dir(packages, homedir, dirpath, files, remove=removed_files, upload=True):
                error = True

    # always consider timestamp as checked during a dry-run, so it is never
    # reset
    if args.dryrun:
        m.reminders_timestamp_checked = True

    # if files are being ignored, and more than REMINDER_INTERVAL has elapsed
    # since we warned about files being ignored, warn again
    if ignored > 0:
        if (time.time() > (m.reminder_time + REMINDER_INTERVAL)):
            logging.warning("ignored %d files in %s as there is no !ready" % (ignored, arch))

            if not args.dryrun:
                m.reminders_issued = True

    return ScanResult(error, packages, move, vault, remove, remove_success)

path = "/datos/ot/lbcajica/" # path that contains the actual folder folder = path + "datos/breast_data.gz" # initial .gz file count = 0 bad = list() # this list saves the non .gz file names print("Creating folders...", end = " ") try: os.mkdir(path + "file/") # creates the folder for the unzipped folders os.mkdir(path + "data/") # creates the folder where the data is going to be saved except OSError as error: print("the folders already exists.") print("finished.\nUnziping folder...", end = " ") f = tarfile.open(folder, 'r:gz') # unziping the .gz file f.extractall(path = path + "file/") # saving the data in the file/ folder print("finished.\nStarting extraction.") for dir in os.listdir(path + "file/"): # navigates through the folder of cases if not dir.endswith(".txt"): # ignores the txt files print("file " + str(count) + "...", end = " ") for gz in os.listdir(path + "file/" + dir + "/"): # navigates through the case files if not gz.endswith(".txt"): # ignores the txt files try: txt = open(path + "data/" + dir + ".txt", 'wb') # new .txt files gzfile = gzip.open(path + "file/" + dir + "/" + gz, 'rb') # zipped folder txt.writelines(gzfile) except OSError as e: # if something goes wrong print("Not a valid .gz file")
def write_arch_listing(args, packages, arch):
    update_summary = set()
    base = os.path.join(args.htdocs, arch)
    ensure_dir_exists(args, base)

    #
    # write base directory .htaccess, if needed
    #
    # force trying to access the base directory to redirect to the package list
    # page, as having the server index this directory containing lots of
    # subdirectories makes this URL very expensive to serve if someone stumbles
    # onto it by accident
    #

    htaccess = os.path.join(base, '.htaccess')
    if not os.path.exists(htaccess) or args.force:
        if not args.dryrun:
            with utils.open_amifc(htaccess) as f:
                print('Redirect temp /packages/%s/index.html https://cygwin.com/packages/package_list.html' % (arch), file=f)

    toremove = glob.glob(os.path.join(base, '*', '*')) + glob.glob(os.path.join(base, '*', '.*'))

    for p in packages:
        dirpath = os.path.join(base, p)
        ensure_dir_exists(args, dirpath)

        #
        # write .htaccess if needed
        #

        htaccess = os.path.join(dirpath, '.htaccess')
        if not os.path.exists(htaccess):
            if not args.dryrun or args.force:
                with utils.open_amifc(htaccess) as f:
                    # We used to allow access to the directory listing as a
                    # crude way of listing the versions of the package available
                    # for which file lists were available. Redirect that index
                    # page to the summary page, which now has that information
                    # (and more).
                    print('RedirectMatch temp /packages/%s/%s/$ /packages/summary/%s.html' % (arch, p, p), file=f)

                    # listing files don't have the extension, but are html
                    print('ForceType text/html', file=f)

        # this file should exist, so remove from the toremove list
        if htaccess in toremove:
            toremove.remove(htaccess)

        #
        # for each tarfile, write tarfile listing
        #

        if os.path.exists(dirpath):
            listings = os.listdir(dirpath)
            listings.remove('.htaccess')
        else:
            listings = []

        for to in packages[p].tarfiles.values():
            tn = to.fn
            fver = re.sub(r'\.tar.*$', '', tn)
            listing = os.path.join(dirpath, fver)

            # ... if it doesn't already exist, or --force --force
            if not os.path.exists(listing) or (args.force > 1):
                if not args.dryrun:
                    # versions are being added, so summary needs updating
                    update_summary.add(p)

                    with utils.open_amifc(listing) as f:
                        bv = packages[p].best_version
                        desc = sdesc(packages[p], bv)

                        if fver.endswith('-src'):
                            desc = desc + " (source)"

                        print(textwrap.dedent('''\
                            <!DOCTYPE html>
                            <html>
                            <head>
                            <title>%s: %s</title>
                            </head>
                            <body>
                            <h1><a href="/packages/summary/%s.html">%s</a>: %s</h1>
                            <pre>''' % (p, desc, p, p, desc)), file=f)

                        tf = os.path.join(args.rel_area, to.path, to.fn)
                        if not os.path.exists(tf):
                            # this shouldn't happen with a full mirror
                            logging.error("tarfile %s not found" % (tf))
                        elif os.path.getsize(tf) <= 32:
                            # compressed empty files aren't a valid tar file,
                            # but we can just ignore them
                            pass
                        else:
                            try:
                                with xtarfile.open(tf, mode='r') as a:
                                    for i in a:
                                        print(' %-16s%12d %s' % (time.strftime('%Y-%m-%d %H:%M', time.gmtime(i.mtime)), i.size, i.name), file=f, end='')
                                        if i.isdir():
                                            print('/', file=f, end='')
                                        if i.issym() or i.islnk():
                                            print(' -> %s' % i.linkname, file=f, end='')
                                        print('', file=f)
                            except Exception as e:
                                print('package is corrupted', file=f)
                                logging.error("exception %s while reading %s" % (type(e).__name__, tf))
                                logging.debug('', exc_info=True)

                        print(textwrap.dedent('''\
                            </pre>
                            </body>
                            </html>'''), file=f)
            else:
                logging.log(5, 'not writing %s, already exists' % listing)

            # this file should exist, so remove from the toremove list
            if listing in toremove:
                toremove.remove(listing)

            if fver in listings:
                listings.remove(fver)

        # some versions remain on toremove list, and will be removed, so summary
        # needs updating
        if listings:
            update_summary.add(p)

    #
    # remove any remaining files for which there was no corresponding package
    #

    for r in toremove:
        logging.debug('rm %s' % r)
        if not args.dryrun:
            os.unlink(r)

            #
            # remove any directories which are now empty
            #

            dirpath = os.path.dirname(r)
            if len(os.listdir(dirpath)) == 0:
                logging.debug('rmdir %s' % dirpath)
                os.rmdir(os.path.join(dirpath))

    return update_summary

def library_run(token, resume, auto_run):
    if path.exists('./.alectio') and not resume:
        termwarn(
            'An experiment is already initialized here. Use either alectio library-run --resume or start a new experiment with alectio clear'
        )
        sys.exit(0)
    elif path.exists('./.alectio') and resume:
        pass
    else:
        if token is None:
            termerror("Token is required.")
            sys.exit(1)

    _log("Experiment Initializing.")
    if not resume:
        payload = requests.post(
            'https://api.alectio.com/experiments/libraryRunFetch',
            json={'token': token}).json()
    else:
        payload_file = open('./.alectio/alectio_env.json')
        payload = json.load(payload_file)
        payload_file.close()

    if payload['status'] == 'error':
        termerror(payload['message'])
        sys.exit(1)

    if payload['data_url'] is None or payload['code_url'] is None:
        termerror('This dataset or model is not supported. Please check back')
        sys.exit(1)
    else:
        if not path.exists('./.alectio'):
            os.mkdir('./.alectio')
        with open('./.alectio/alectio_env.json', 'w') as fp:
            json.dump(payload, fp)

        while True:
            if path.exists('./' + payload['code_file']):
                break
            try:
                urllib.request.urlretrieve(payload['code_url'], payload['code_file'])
            except Exception:
                urllib.request.urlretrieve(payload['code_url'], payload['code_file'])
            else:
                break

        # while True:
        #     response = requests.get(payload['code_url'], verify=False, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}, stream=True)
        #     if response.status_code == 200:
        #         with open(payload['code_file'], 'wb') as f:
        #             f.write(response.raw.read())
        #     else:
        #         if download == 2:
        #             termerror('Unable to get model. Please contact [email protected] if this persists')
        #             sys.exit(1)
        #         else:
        #             termerror('Retrying to fetch the model')
        #             download += 1

        if not path.exists('./' + payload['code_file'].replace('.tar.gz', "")):
            with tarfile.open(payload['code_file'], 'r') as archive:
                archive.extractall()

        os.chdir(payload['code_file'].replace('.tar.gz', ''))
        if not path.exists('./data'):
            os.mkdir('data')
        if not path.exists('./log'):
            os.mkdir('log')
        if not path.exists('./weights'):
            os.mkdir('weights')
        if not path.exists('./weight'):
            os.mkdir('weight')

        if payload['data_url'] == 'Inplace':
            _log('Data will be downloaded on first run of the code')
        elif payload['data_url'] == 'Internal':
            pass
        else:
            while True:
                if path.exists('./data/' + payload['data_file']):
                    break
                try:
                    urllib.request.urlretrieve(payload['data_url'], "./data/" + payload['data_file'])
                except Exception:
                    urllib.request.urlretrieve(payload['data_url'], "./data/" + payload['data_file'])
                else:
                    break

            # downloader.download_file(payload['data_url'], './data' + payload['data_file'])

            _log('Extracting data into the data dir')
            os.chdir('./data')
            if not path.exists('./' + payload['data_file'].replace('.tar.gz', "")):
                with tarfile.open('./' + payload['data_file'], 'r') as archive:
                    archive.extractall()
            os.chdir('../')

        _log('All files are fetched.')
        if auto_run:
            pass
        else:
            print(
                'Run your experiment by installing all dependencies from %s/requirements.txt and then running python main.py %s at: %s'
                % (os.getcwd(), token, os.getcwd()))

def fix_one_hint(dirpath, hintfile, tf):
    pn = os.path.join(dirpath, hintfile)

    hints = hint.hint_file_parse(pn, hint.pvr)

    hints.pop('parse-warnings', None)
    if 'parse-errors' in hints:
        logging.error('invalid hints %s' % hintfile)
        return

    annotation = ''
    modified = False

    requires = hints.get('requires', '').split()
    if requires:
        # is a perl provide already present in requires?
        if any(r.startswith('perl5_') for r in requires):
            return

        # ... otherwise, add a perl annotation
        if ('perl_base' in requires) or ('perl' in requires):
            logging.info("%s has perl but no perl5_nnn in requires" % (hintfile))
            annotation = 'perl5_032'

    # if annotated, check if this package installs into vendor_perl, and if so,
    # add the annotated perl version to requires, if not already present
    if annotation:
        ivp = False
        exe = False

        try:
            with xtarfile.open(os.path.join(dirpath, tf), mode='r') as a:
                ivp = any(re.match(r'usr/(lib|share)/perl5/vendor_perl/', m) for m in a.getnames())
                exe = any(re.search(r'\.(exe|dll)$', m) for m in a.getnames())
        except tarfile.ReadError:
            pass

        knwn = any(hintfile.startswith(k) for k in known_packages)

        if ivp or knwn:
            requires = hints.get('requires', '').split()
            if annotation not in requires:
                requires.append(annotation)
                requires = sorted(requires)
                modified = True
                logging.warning("adding %s to requires in %s" % (annotation, hintfile))
                hints['requires'] = ' '.join(requires)
        else:
            if exe:
                logging.info("%s has perl in requires, and might have content linked to libperl" % (hintfile))
            else:
                logging.info("%s has perl in requires, assuming that's for a perl script" % (hintfile))

    if not modified:
        return

    # write updated hints
    shutil.copy2(pn, pn + '.bak')
    hint.hint_file_write(pn, hints)

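# Hedged usage sketch (not in the original script): apply the fix-up to one uploaded
# hint file that sits next to its binary tarball in the same directory; the directory
# and filenames below are placeholders.
fix_one_hint('/srv/upload/x86_64/release/example/example-1.0-1',
             'example-1.0-1.hint',
             'example-1.0-1.tar.xz')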