def _do_update(self, statepath: str, logger: Logger) -> bool:
    old_head = get_subprocess_output(['git', 'rev-parse', 'HEAD'], cwd=statepath, logger=logger).strip()

    run_subprocess(['timeout', str(self.fetch_timeout), 'git', 'fetch', '--progress', '--depth=1'], cwd=statepath, logger=logger)
    run_subprocess(['git', 'checkout'], cwd=statepath, logger=logger)  # needed for reset to not fail on changed sparse checkout
    self._setup_sparse_checkout(statepath, logger)
    run_subprocess(['git', 'reset', '--hard', 'origin/' + self.branch], cwd=statepath, logger=logger)
    run_subprocess(['git', 'reflog', 'expire', '--expire=0', '--all'], cwd=statepath, logger=logger)
    run_subprocess(['git', 'prune'], cwd=statepath, logger=logger)

    new_head = get_subprocess_output(['git', 'rev-parse', 'HEAD'], cwd=statepath, logger=logger).strip()

    if new_head == old_head:
        logger.log('HEAD has not changed: {}'.format(new_head))
        return False

    logger.log('HEAD was updated from {} to {}'.format(old_head, new_head))
    return True

def fetch(self, statepath: str, update: bool = True, logger: Logger = NoopLogger()) -> bool:
    if os.path.exists(statepath) and not update:
        logger.log('no update requested, skipping')
        return False

    args = [
        '--info=stats2',
        '--archive',
        '--compress',
        '--delete',
        '--delete-excluded',
        '--safe-links',
    ]

    if self.fetch_timeout is not None:
        args += ['--timeout', str(self.fetch_timeout)]

    if self.rsync_include is not None:
        args += ['--include', self.rsync_include]

    if self.rsync_exclude is not None:
        args += ['--exclude', self.rsync_exclude]

    run_subprocess(['rsync'] + args + [self.url, statepath], logger)

    return True

def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData, logger: Logger) -> bool:
    numpage = 0
    nextpageurl = self.url + 'Packages()?$filter=IsLatestVersion'

    while True:
        logger.log('getting ' + nextpageurl)

        text = self.do_http(nextpageurl).text
        with open(os.path.join(statedir.get_path(), '{}.xml'.format(numpage)), 'w', encoding='utf-8') as pagefile:
            pagefile.write(text)

        # parse next page
        logger.log('parsing ' + nextpageurl)
        root = xml.etree.ElementTree.fromstring(text)

        next_link = root.find('{http://www.w3.org/2005/Atom}link[@rel="next"]')
        if next_link is None:
            break

        nextpageurl = next_link.attrib['href']
        numpage += 1

    return True

def fetch(self, statepath: str, update: bool = True, logger: Logger = NoopLogger()) -> bool:
    if os.path.isfile(statepath) and not update:
        logger.log('no update requested, skipping')
        return False

    args = {'mode': 'wb'} if self.binary else {'mode': 'w', 'encoding': 'utf-8'}

    persdata: Dict[str, Any] = {}

    perspath = statepath + '.persdata'

    if os.path.exists(perspath):
        with open(perspath, 'rb') as rpersfile:
            persdata = pickle.load(rpersfile)

    with AtomicFile(statepath, **args) as statefile:
        have_changes = self._do_fetch(statefile, persdata, logger)

        if persdata:
            with AtomicFile(perspath, 'wb') as wpersfile:
                pickle.dump(persdata, wpersfile.get_file())

        if not have_changes:
            statefile.cancel()

        return have_changes

def fetch(self, statepath: str, update: bool = True, logger: Logger = NoopLogger()) -> bool:
    if os.path.isdir(statepath) and not update:
        logger.log('no update requested, skipping')
        return False

    persdata: Dict[str, Any] = {}

    perspath = statepath + '.persdata'

    try:
        with open(perspath, 'rb') as rpersfile:
            persdata = pickle.load(rpersfile)
    except (EOFError, FileNotFoundError, pickle.UnpicklingError):
        pass

    with AtomicDir(statepath) as statedir:
        have_changes = self._do_fetch(statedir, persdata, logger)

        if persdata:
            with AtomicFile(perspath, 'wb') as wpersfile:
                pickle.dump(persdata, wpersfile.get_file())
                wpersfile.get_file().flush()
                os.fsync(wpersfile.get_file().fileno())

        if not have_changes:
            statedir.cancel()

        return have_changes

def get_subprocess_output(command: List[str], logger: Logger, cwd: Optional[str] = None) -> str:
    message = 'running "{}"'.format(' '.join(command))
    if cwd is not None:
        message += ' in "{}"'.format(cwd)
    logger.log(message)

    res = ''

    with subprocess.Popen(command,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          universal_newlines=True,
                          encoding='utf-8',
                          errors='ignore',
                          cwd=cwd) as proc:
        for line in proc.stdout:
            res += line

        proc.wait()

        logger.log('command finished with code {}'.format(proc.returncode), logger.NOTICE if proc.returncode == 0 else logger.ERROR)

        if proc.returncode != 0:
            raise subprocess.CalledProcessError(cmd=command, returncode=proc.returncode)

    return res

def fetch(self, statepath: str, update: bool = True, logger: Logger = NoopLogger()) -> bool:
    if os.path.isdir(statepath) and not update:
        logger.log('no update requested, skipping')
        return False

    persdata: Dict[str, Any] = {}

    perspath = statepath + '.persdata'

    if os.path.exists(perspath):
        with open(perspath, 'rb') as rpersfile:
            persdata = pickle.load(rpersfile)

    with AtomicDir(statepath) as statedir:
        have_changes = self._do_fetch(statedir, persdata, logger)

        if persdata:
            with AtomicFile(perspath, 'wb') as wpersfile:
                pickle.dump(persdata, wpersfile.get_file())

        if not have_changes:
            statedir.cancel()

        return have_changes

def _iter_parse_all_sources(self, repository: RepositoryMetadata, transformer: Optional[PackageTransformer], logger: Logger) -> Iterator[Package]:
    for source in repository['sources']:
        logger.log('parsing source {} started'.format(source['name']))
        yield from self._iter_parse_source(repository, source, transformer, logger.get_indented())
        logger.log('parsing source {} complete'.format(source['name']))

def _iter_parse_all_sources(
    self,
    repository: Repository,
    transformer: PackageTransformer | None,
    maintainermgr: MaintainerManager | None,
    logger: Logger
) -> Iterator[Package]:
    for source in repository.sources:
        logger.log(f'parsing source {source.name} started')
        yield from self._iter_parse_source(repository, source, transformer, maintainermgr, logger.get_indented())
        logger.log(f'parsing source {source.name} complete')

def fetch(self, statepath: str, update: bool = True, logger: Logger = NoopLogger()) -> bool:
    if not os.path.isdir(statepath):
        with AtomicDir(statepath) as statedir:
            return self._do_fetch(statedir.get_path(), logger)
    elif update:
        return self._do_update(statepath, logger)
    else:
        logger.log('no update requested, skipping')
        return False

def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData, logger: Logger) -> bool:
    try:
        self._do_fetch_scroll(statedir, logger)
    except requests.exceptions.HTTPError as e:
        # show server reply as it contains the failure cause
        logger.log('request failed, server reply follows:', severity=Logger.ERROR)
        logger.log(e.response.text, severity=Logger.ERROR)
        raise

    return True

def iter_parsed(self, reponames: Optional[RepositoryNameList] = None, logger: Logger = NoopLogger()) -> Iterator[List[Package]]:
    sources: List[str] = []

    for repository in self.repomgr.get_repositories(reponames):
        repo_sources = self._get_parsed_chunk_paths(repository)

        if not repo_sources:
            logger.log('parsed packages for repository {} are missing, treating repository as empty'.format(repository['desc']), severity=Logger.WARNING)

        sources.extend(repo_sources)

    if sources:
        yield from map(packageset_deduplicate, heap_deserialize(sources))
    else:
        logger.log('no parsed packages found', severity=Logger.ERROR)

def _parse_descfile(path: str, logger: Logger) -> Dict[str, List[str]]:
    data: Dict[str, List[str]] = {}

    # http://t2sde.org/handbook/html/t2.package.desc.html
    tag_map = {
        'i': 'title',
        't': 'text',
        'u': 'url',
        'a': 'author',
        'm': 'maintainer',
        'c': 'category',
        'f': 'flag',
        'r': 'architecture',
        'arch': 'architecture',
        'k': 'kernel',
        'kern': 'kernel',
        'e': 'dependency',
        'dep': 'dependency',
        'l': 'license',
        's': 'status',
        'v': 'version',
        'ver': 'version',
        'p': 'priority',
        'pri': 'priority',
        'o': 'conf',
        'd': 'download',
        'down': 'download',
        #'s': 'source',  # duplicate - documentation is incorrect?
        'src': 'source',
    }

    with open(path, 'r', encoding='latin1') as descfile:
        for line in descfile:
            line = line.strip()
            if line.startswith('#'):
                continue

            match = re.fullmatch('\\[([^\\[\\]]+)\\]\\s*(.*?)', line, re.DOTALL)
            if match:
                tag = match.group(1).lower()
                tag = tag_map.get(tag, tag)
                data.setdefault(tag, []).append(match.group(2))
            elif line:
                logger.log('unexpected line "{}"'.format(line), Logger.WARNING)

    return data

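# Illustration only (not from the source): given the tag normalization above, a
# hypothetical foo.desc containing the lines '[I] Foo package' and '[V] 1.2.3'
# would be parsed into {'title': ['Foo package'], 'version': ['1.2.3']}, e.g.:
#
#   data = _parse_descfile('foo.desc', NoopLogger())
#   assert data.get('version') == ['1.2.3']
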
def _parse(self, repository: RepositoryMetadata, transformer: Optional[PackageTransformer], logger: Logger) -> None:
    logger.log('parsing started')

    if not os.path.isdir(self.parseddir):
        os.mkdir(self.parseddir)

    with AtomicDir(self._get_parsed_path(repository)) as state_dir:
        serializer = ChunkedSerializer(state_dir.get_path(), MAX_PACKAGES_PER_CHUNK)

        serializer.serialize(self._iter_parse_all_sources(repository, transformer, logger))

        if self.safety_checks and serializer.get_num_packages() < repository['minpackages']:
            raise TooLittlePackages(serializer.get_num_packages(), repository['minpackages'])

    logger.log('parsing complete, {} packages'.format(serializer.get_num_packages()))

def _fetch(self, repository: RepositoryMetadata, update: bool, logger: Logger) -> bool:
    logger.log('fetching started')

    if not os.path.isdir(self.statedir):
        os.mkdir(self.statedir)

    have_changes = False

    for source in repository['sources']:
        if not os.path.isdir(self._get_state_path(repository)):
            os.mkdir(self._get_state_path(repository))

        have_changes |= self._fetch_source(repository, update, source, logger.get_indented())

    logger.log('fetching complete' + ('' if have_changes else ' (no changes)'))

    return have_changes

def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData, logger: Logger) -> bool:
    numpage = 1
    while True:
        url = self.url + '?page={}&per_page={}&sort=alpha'.format(numpage, self.per_page)
        logger.log('getting ' + url)

        text = self.do_http(url).text
        with open(os.path.join(statedir.get_path(), '{}.json'.format(numpage)), 'w', encoding='utf-8') as pagefile:
            pagefile.write(text)

        # parse next page
        if not json.loads(text)['crates']:
            logger.log('last page detected')
            return True

        numpage += 1

def _fetch_source(self, repository: Repository, update: bool, source: Source, logger: Logger) -> bool:
    logger.log(f'fetching source {source.name} started')

    fetcher: Fetcher = self.fetcher_factory.spawn_with_known_args(
        source.fetcher['class'],
        source.fetcher
    )

    have_changes = fetcher.fetch(
        self._get_state_source_path(repository, source),
        update=update,
        logger=logger.get_indented()
    )

    logger.log(f'fetching source {source.name} complete' + ('' if have_changes else ' (no changes)'))

    return have_changes

def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData, logger: Logger) -> bool:
    page = 1

    while True:
        pageurl = self.apiurl + 'packages/?page={}'.format(page)
        logger.log('getting page {} from {}'.format(page, pageurl))
        pagedata = json.loads(do_http(pageurl).text)

        for package in pagedata['packages']:
            self._load_spec(package['name'], statedir, logger)

        page += 1

        if page > pagedata['page_total']:
            break

    return True

def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData, logger: Logger) -> bool:
    page_counter = count()
    query = '?per_page={}&sort=alpha'.format(self.per_page)

    while query:
        url = self.url + query
        logger.log('getting ' + url)

        text = self.do_http(url).text
        with open(os.path.join(statedir.get_path(), '{}.json'.format(next(page_counter))), 'w', encoding='utf-8') as pagefile:
            pagefile.write(text)
            pagefile.flush()
            os.fsync(pagefile.fileno())

        # parse next page
        query = json.loads(text)['meta']['next_page']

    logger.log('last page detected')
    return True

def _do_fetch(self, statefile: AtomicFile, persdata: PersistentData, logger: Logger) -> bool:
    fetching_what = [self.url]

    headers = self.headers.copy() if self.headers else {}

    if isinstance(self.post, dict):
        fetching_what.append('{} fields of form data'.format(len(self.post)))

    if headers:
        fetching_what.append('{} extra headers'.format(len(headers)))

    logger.log('fetching ' + ', with '.join(fetching_what))

    if 'last-modified' in persdata:
        headers['if-modified-since'] = persdata['last-modified']
        logger.log('using if-modified-since: {}'.format(headers['if-modified-since']))

    try:
        response = save_http_stream(self.url, statefile.get_file(), compression=self.compression, data=self.post, headers=headers, timeout=self.fetch_timeout)
    except NotModifiedException:
        logger.log('got 304 not modified')
        return False

    size = os.path.getsize(statefile.get_path())

    logger.log('size is {} byte(s)'.format(size))

    if size == 0 and not self.allow_zero_size:
        raise RuntimeError('refusing zero size file')

    if response.headers.get('last-modified'):
        persdata['last-modified'] = response.headers['last-modified']
        logger.log('storing last-modified: {}'.format(persdata['last-modified']))

    return True

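# A generic illustration of the conditional-request pattern used above (not the
# project's save_http_stream API): send If-Modified-Since and treat HTTP 304 as
# "no changes". The function name and return convention are assumptions for the
# sketch.
import requests


def _conditional_get_sketch(url: str, last_modified: str | None) -> bytes | None:
    headers = {'if-modified-since': last_modified} if last_modified else {}
    response = requests.get(url, headers=headers, timeout=60)
    if response.status_code == 304:
        return None  # remote file unchanged since the last fetch
    response.raise_for_status()
    return response.content
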
def _do_update(self, statepath: str, logger: Logger) -> bool:
    r = Runner(logger=logger, cwd=statepath)

    old_head = r.get('git', 'rev-parse', 'HEAD').strip()

    r.run('timeout', self._timeout_arg, 'git', 'fetch', '--progress', self._depth_arg)
    r.run('git', 'checkout')  # needed for reset to not fail on changed sparse checkout
    self._setup_sparse_checkout(statepath)
    r.run('git', 'reset', '--hard', f'origin/{self._branch}')
    r.run('git', 'reflog', 'expire', '--expire=0', '--all')
    r.run('git', 'prune')

    new_head = r.get('git', 'rev-parse', 'HEAD').strip()

    if new_head == old_head:
        logger.log('HEAD has not changed: {}'.format(new_head))
        return False

    logger.log('HEAD was updated from {} to {}'.format(old_head, new_head))
    return True

def fetch(self, statepath: str, update: bool = True, logger: Logger = NoopLogger()) -> bool:
    if os.path.isfile(statepath) and not update:
        logger.log('no update requested, skipping')
        return False

    args = {'mode': 'wb'} if self.binary else {'mode': 'w', 'encoding': 'utf-8'}

    persdata: dict[str, Any] = {}

    perspath = statepath + '.persdata'

    try:
        with open(perspath, 'rb') as rpersfile:
            persdata = pickle.load(rpersfile)
    except (EOFError, FileNotFoundError, pickle.UnpicklingError):
        pass

    with AtomicFile(statepath, **args) as statefile:
        have_changes = self._do_fetch(statefile, persdata, logger)

        if persdata:
            with AtomicFile(perspath, 'wb') as wpersfile:
                pickle.dump(persdata, wpersfile.get_file())
                wpersfile.get_file().flush()
                os.fsync(wpersfile.get_file().fileno())

        if not have_changes:
            statefile.cancel()

        statefile.get_file().flush()
        os.fsync(statefile.get_file().fileno())

        return have_changes

def run_subprocess(command: list[str], logger: Logger, cwd: str | None = None) -> None:
    message = 'running "{}"'.format(' '.join(command))
    if cwd is not None:
        message += ' in "{}"'.format(cwd)
    logger.log(message)

    with subprocess.Popen(command,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          universal_newlines=True,
                          encoding='utf-8',
                          errors='ignore',
                          cwd=cwd) as proc:
        assert proc.stdout
        for line in proc.stdout:
            logger.get_indented().log(line.strip())

        proc.wait()

        logger.log('command finished with code {}'.format(proc.returncode), logger.NOTICE if proc.returncode == 0 else logger.ERROR)

        if proc.returncode != 0:
            raise subprocess.CalledProcessError(cmd=command, returncode=proc.returncode)

def _do_update(self, statepath: str, logger: Logger) -> bool:
    old_rev = get_subprocess_output(['svn', 'info', '--show-item', 'revision', statepath], logger=logger).strip()

    run_subprocess(['timeout', str(self.fetch_timeout), 'svn', 'up', statepath], logger=logger)

    new_rev = get_subprocess_output(['svn', 'info', '--show-item', 'revision', statepath], logger=logger).strip()

    if new_rev == old_rev:
        logger.log('Revision has not changed: {}'.format(new_rev))
        return False

    logger.log('Revision was updated from {} to {}'.format(old_rev, new_rev))
    return True

def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData, logger: Logger) -> bool:
    for letter in ['0-9'] + list(ascii_uppercase):
        page = 1
        numpages = 1
        while True:
            logger.log('fetching {} page {}'.format(letter, page))

            pageurl = '{}/{}/page/{}/'.format(self.url, letter, page)

            # fetch HTML
            response = self.do_http(pageurl)
            response.encoding = 'utf-8'  # is not detected properly
            text = response.text

            # get number of pages, if there are more than 1 of them
            if numpages == 1:
                for pagebutton in lxml.html.document_fromstring(text).xpath('.//nav[@class="page-selector"]/a'):  # type: ignore
                    numpages = max(numpages, int(pagebutton.text))  # type: ignore

            # save HTML
            with open(os.path.join(statedir.get_path(), '{}-{}.html'.format(letter, page)), 'w', encoding='utf-8') as pagefile:
                pagefile.write(text)
                pagefile.flush()
                os.fsync(pagefile.fileno())

            # end if that was last (or only) page
            if page >= numpages:
                break

            # proceed with the next page
            page += 1

    return True

def _do_update(self, statepath: str, logger: Logger) -> bool:
    r = Runner(logger=logger, cwd=statepath)

    old_url = r.get('git', 'remote', 'get-url', 'origin').strip()
    if old_url != self._url:
        logger.log(f'repository URL has changed {old_url} -> {self._url}, will clone from scratch')
        shutil.rmtree(statepath)
        return self._do_fetch(statepath, logger)

    old_head = r.get('git', 'rev-parse', 'HEAD').strip()

    r.run('timeout', self._timeout_arg, 'git', 'fetch', '--progress', self._depth_arg)
    r.run('git', 'checkout')  # needed for reset to not fail on changed sparse checkout
    self._setup_sparse_checkout(statepath)
    r.run('git', 'reset', '--hard', f'origin/{self._branch}')
    r.run('git', 'reflog', 'expire', '--expire=0', '--all')
    r.run('git', 'prune')

    new_head = r.get('git', 'rev-parse', 'HEAD').strip()

    if new_head == old_head:
        logger.log('HEAD has not changed: {}'.format(new_head))
        return False

    logger.log('HEAD was updated from {} to {}'.format(old_head, new_head))
    return True

def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData, logger: Logger) -> bool:
    tarpath = os.path.join(statedir.get_path(), '.temporary.tar')

    headers = {}

    if persdata.get('last-modified'):
        headers['if-modified-since'] = persdata.get('last-modified')
        logger.log('using if-modified-since: {}'.format(headers['if-modified-since']))

    logger.log('fetching {}'.format(self.url))

    try:
        with open(tarpath, 'wb') as tarfile:
            response = save_http_stream(self.url, tarfile, headers=headers, timeout=self.fetch_timeout)
    except NotModifiedException:
        logger.log('got 304 not modified')
        return False

    # XXX: may be unportable, FreeBSD tar automatically handles compression type,
    # may not be the case on linuxes
    # XXX: this extracts tarball permissions, which is not desirable and it may
    # produce non-readable files and dirs (blackarch). GNU tar has --mode, BSD tar
    # lacks this. We should probably require GNU tar, and handle binary name which
    # may differ on BSD.
    run_subprocess(['tar', '-x', '-z', '-f', tarpath, '-C', statedir.get_path()], logger)

    os.remove(tarpath)

    if response.headers.get('last-modified'):
        persdata['last-modified'] = response.headers['last-modified']
        logger.log('storing last-modified: {}'.format(persdata['last-modified']))

    return True

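# A possible alternative to shelling out to tar, sketched only to illustrate the
# portability concerns raised in the XXX comments above (not part of the source):
# the standard-library tarfile module detects the compression type itself, and
# the 'data' extraction filter (Python 3.12+) normalizes unsafe permissions from
# the archive. The function name and arguments below are hypothetical.
import tarfile


def _extract_tarball_sketch(tarpath: str, destdir: str) -> None:
    # mode 'r' lets tarfile auto-detect gzip/bzip2/xz compression
    with tarfile.open(tarpath, 'r') as tar:
        # filter='data' refuses unsafe members and resets problematic permission bits
        tar.extractall(destdir, filter='data')
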
def _do_fetch_scroll(self, statedir: AtomicDir, logger: Logger) -> None:
    numpage = 0

    logger.log('getting page {}'.format(numpage))
    response = self._do_http('{}?scroll={}'.format(self._url, self._scroll), json=self._request_data).json()
    scroll_id = response['_scroll_id']

    while response['hits']['hits']:
        with open(os.path.join(statedir.get_path(), '{}.json'.format(numpage)), 'w', encoding='utf-8') as pagefile:
            json.dump(response['hits']['hits'], pagefile)
            pagefile.flush()
            os.fsync(pagefile.fileno())

        numpage += 1
        logger.log('getting page {}'.format(numpage))
        response = self._do_http('{}?scroll={}&scroll_id={}'.format(self._scroll_url, self._scroll, scroll_id)).json()

    try:
        self._do_http(self._scroll_url, method='DELETE', json={'scroll_id': scroll_id}).json()
    except requests.exceptions.HTTPError as e:
        # we don't care too much if removing the scroll fails, it'll time out anyway,
        # but log the server reply for diagnostics
        logger.log('failed to DELETE scroll, server reply follows:', severity=Logger.ERROR)
        logger.log(e.response.text, severity=Logger.ERROR)

def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData, logger: Logger) -> bool:
    tarpath = os.path.join(statedir.get_path(), '.temporary.tar')

    headers = {}

    if persdata.get('last-modified'):
        headers['if-modified-since'] = persdata.get('last-modified')
        logger.log('using if-modified-since: {}'.format(headers['if-modified-since']))

    logger.log('fetching {}'.format(self.url))

    try:
        with open(tarpath, 'wb') as tarfile:
            response = save_http_stream(self.url, tarfile, headers=headers, timeout=self.fetch_timeout)
    except NotModifiedException:
        logger.log('got 304 not modified')
        return False

    # XXX: may be unportable, FreeBSD tar automatically handles compression type,
    # may not be the case on linuxes
    run_subprocess(['tar', '-x', '-z', '-f', tarpath, '-C', statedir.get_path()], logger)

    os.remove(tarpath)

    if response.headers.get('last-modified'):
        persdata['last-modified'] = response.headers['last-modified']
        logger.log('storing last-modified: {}'.format(persdata['last-modified']))

    return True

def _fetch_source(self, repository: RepositoryMetadata, update: bool, source: RepositoryMetadata, logger: Logger) -> bool:
    if 'fetcher' not in source:
        logger.log('fetching source {} not supported'.format(source['name']))
        return False

    logger.log('fetching source {} started'.format(source['name']))

    fetcher: Fetcher = self.fetcher_factory.spawn_with_known_args(source['fetcher'], source)

    have_changes = fetcher.fetch(self._get_state_source_path(repository, source), update=update, logger=logger.get_indented())

    logger.log('fetching source {} complete'.format(source['name']) + ('' if have_changes else ' (no changes)'))

    return have_changes