def Fetch(self, statepath, update=True, logger=NoopLogger()):
    if os.path.isdir(statepath) and not update:
        logger.Log('no update requested, skipping')
        return

    command = [
        'rsync',
        '--verbose', '--archive', '--compress',
        '--delete', '--delete-excluded', '--timeout=60',
        self.url, statepath
    ]

    RunSubprocess(command, logger)

def Fetch(self, statepath, update=True, logger=NoopLogger()):
    if os.path.isdir(statepath) and not update:
        logger.Log('no update requested, skipping')
        return

    with StateDir(statepath) as statedir:
        for letter in ['0-9'] + list(ascii_uppercase):
            page = 1
            numpages = 1
            while True:
                logger.Log('fetching {} page {}'.format(letter, page))

                pageurl = '{}/{}/page/{}/'.format(self.url, letter, page)

                # fetch HTML
                response = Fetch(pageurl)
                response.encoding = 'utf-8'  # is not detected properly
                text = response.text

                # get number of pages, if there is more than one of them
                if numpages == 1:
                    for pagebutton in lxml.html.document_fromstring(text).xpath('.//nav[@class="page-selector"]/a'):
                        numpages = max(numpages, int(pagebutton.text))

                # save HTML
                with open(os.path.join(statedir, '{}-{}.html'.format(letter, page)), 'w', encoding='utf-8') as pagefile:
                    pagefile.write(text)

                # stop if that was the last (or only) page
                if page >= numpages:
                    break

                # proceed with the next page
                page += 1

def Parse(self, reponame, transformer, logger=NoopLogger()):
    repository = self.__GetRepository(reponame)

    packages = self.__Parse(repository, logger)
    packages = self.__Transform(packages, transformer, repository, logger)

    return packages

def Fetch(self, statepath, update=True, logger=NoopLogger()):
    if not os.path.isdir(statepath):
        RunSubprocess(
            [
                'git', 'clone', '--progress', '--no-checkout', '--depth=1',
                '--branch', self.branch, self.url, statepath
            ],
            logger=logger
        )
        self.__SetupSparseCheckout(statepath, logger)
        RunSubprocess(['git', 'checkout'], cwd=statepath, logger=logger)
    elif update:
        RunSubprocess(
            ['timeout', '10m', 'git', 'fetch', '--progress', '--depth=1'],
            cwd=statepath, logger=logger
        )
        # needed for reset to not fail on changed sparse checkout
        RunSubprocess(['git', 'checkout'], cwd=statepath, logger=logger)
        self.__SetupSparseCheckout(statepath, logger)
        RunSubprocess(['git', 'reset', '--hard', 'origin/' + self.branch], cwd=statepath, logger=logger)
        # expire the reflog and prune unreachable objects to keep the shallow clone small
        RunSubprocess(['git', 'reflog', 'expire', '--expire=0', '--all'], cwd=statepath, logger=logger)
        RunSubprocess(['git', 'prune'], cwd=statepath, logger=logger)
    else:
        logger.Log('no update requested, skipping')

def fetch(self, statepath: str, update: bool = True, logger: Logger = NoopLogger()) -> bool:
    if os.path.exists(statepath) and not update:
        logger.log('no update requested, skipping')
        return False

    args = [
        '--info=stats2',
        '--archive',
        '--compress',
        '--delete',
        '--delete-excluded',
        '--safe-links',
    ]

    if self.fetch_timeout is not None:
        args += ['--timeout', str(self.fetch_timeout)]

    if self.rsync_include is not None:
        args += ['--include', self.rsync_include]

    if self.rsync_exclude is not None:
        args += ['--exclude', self.rsync_exclude]

    run_subprocess(['rsync'] + args + [self.url, statepath], logger)

    return True

def fetch(self, statepath: str, update: bool = True, logger: Logger = NoopLogger()) -> bool:
    if os.path.isdir(statepath) and not update:
        logger.log('no update requested, skipping')
        return False

    persdata: Dict[str, Any] = {}

    perspath = statepath + '.persdata'

    if os.path.exists(perspath):
        with open(perspath, 'rb') as rpersfile:
            persdata = pickle.load(rpersfile)

    with AtomicDir(statepath) as statedir:
        have_changes = self._do_fetch(statedir, persdata, logger)

        if persdata:
            with AtomicFile(perspath, 'wb') as wpersfile:
                pickle.dump(persdata, wpersfile.get_file())

        if not have_changes:
            statedir.cancel()

        return have_changes

def StreamDeserializeMulti(self, processor, reponames=None, logger=NoopLogger()):
    deserializers = []
    for repo in self.__GetRepositories(reponames):
        deserializers.append(self.__StreamDeserializer(self.__GetSerializedPath(repo)))

    while deserializers:
        # find lowest key (effname)
        thiskey = deserializers[0].Peek().effname
        for ds in deserializers[1:]:
            thiskey = min(thiskey, ds.Peek().effname)

        # fetch all packages with given key from all deserializers
        packageset = []
        for ds in deserializers:
            while not ds.EOF() and ds.Peek().effname == thiskey:
                packageset.append(ds.Get())

        processor(packageset)

        # remove EOFed repos
        deserializers = [ds for ds in deserializers if not ds.EOF()]

def fetch(self, statepath: str, update: bool = True, logger: Logger = NoopLogger()) -> bool:
    if os.path.isdir(statepath) and not update:
        logger.log('no update requested, skipping')
        return False

    persdata: Dict[str, Any] = {}

    perspath = statepath + '.persdata'

    try:
        with open(perspath, 'rb') as rpersfile:
            persdata = pickle.load(rpersfile)
    except (EOFError, FileNotFoundError, pickle.UnpicklingError):
        pass

    with AtomicDir(statepath) as statedir:
        have_changes = self._do_fetch(statedir, persdata, logger)

        if persdata:
            with AtomicFile(perspath, 'wb') as wpersfile:
                pickle.dump(persdata, wpersfile.get_file())
                wpersfile.get_file().flush()
                os.fsync(wpersfile.get_file().fileno())

        if not have_changes:
            statedir.cancel()

        return have_changes

def fetch(self, statepath: str, update: bool = True, logger: Logger = NoopLogger()) -> bool:
    if os.path.isfile(statepath) and not update:
        logger.log('no update requested, skipping')
        return False

    args = {'mode': 'wb'} if self.binary else {'mode': 'w', 'encoding': 'utf-8'}

    persdata: Dict[str, Any] = {}

    perspath = statepath + '.persdata'

    if os.path.exists(perspath):
        with open(perspath, 'rb') as rpersfile:
            persdata = pickle.load(rpersfile)

    with AtomicFile(statepath, **args) as statefile:
        have_changes = self._do_fetch(statefile, persdata, logger)

        if persdata:
            with AtomicFile(perspath, 'wb') as wpersfile:
                pickle.dump(persdata, wpersfile.get_file())

        if not have_changes:
            statefile.cancel()

        return have_changes

def DeserializeMulti(self, reponames=None, logger=NoopLogger()):
    packages = []

    for repo in self.repoman.GetRepositories(reponames):
        packages += self.Deserialize(repo['name'], logger=logger.GetPrefixed(repo['name'] + ': '))

    return packages

def ParseMulti(self, reponames=None, transformer=None, logger=NoopLogger()):
    packages = []

    for repo in self.repoman.GetRepositories(reponames):
        packages += self.Parse(repo['name'], transformer=transformer, logger=logger.GetPrefixed(repo['name'] + ': '))

    return packages

def StreamDeserializeMulti(self, reponames=None, logger=NoopLogger()):
    deserializers = []
    for repo in self.repomgr.GetRepositories(reponames):
        deserializers.append(self.StreamDeserializer(self.__GetSerializedPath(repo), logger))

    while True:
        # remove EOFed repos
        deserializers = [ds for ds in deserializers if not ds.EOF()]

        # stop when all deserializers are empty
        if not deserializers:
            break

        # find lowest key (effname)
        thiskey = deserializers[0].Peek().effname
        for ds in deserializers[1:]:
            thiskey = min(thiskey, ds.Peek().effname)

        # fetch all packages with given key from all deserializers
        packageset = []
        for ds in deserializers:
            while not ds.EOF() and ds.Peek().effname == thiskey:
                packageset.append(ds.Get())

        yield packageset

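# A self-contained sketch of the k-way merge above, driven by a hypothetical
# in-memory deserializer with the same Peek()/Get()/EOF() interface. The names
# below (Pkg, FakeDeserializer) are illustrative assumptions, not part of the
# real codebase; the merge relies on each stream being pre-sorted by effname.
import collections

Pkg = collections.namedtuple('Pkg', ['effname'])


class FakeDeserializer:
    def __init__(self, effnames):
        self._items = [Pkg(effname) for effname in sorted(effnames)]
        self._pos = 0

    def Peek(self):
        return self._items[self._pos]

    def Get(self):
        self._pos += 1
        return self._items[self._pos - 1]

    def EOF(self):
        return self._pos >= len(self._items)


# two sorted streams merge into packagesets grouped by effname:
# ['bar'], ['baz'], ['foo', 'foo']
deserializers = [FakeDeserializer(['foo', 'bar']), FakeDeserializer(['foo', 'baz'])]
while True:
    deserializers = [ds for ds in deserializers if not ds.EOF()]
    if not deserializers:
        break

    thiskey = min(ds.Peek().effname for ds in deserializers)

    packageset = []
    for ds in deserializers:
        while not ds.EOF() and ds.Peek().effname == thiskey:
            packageset.append(ds.Get())

    print([pkg.effname for pkg in packageset])
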
def Fetch(self, statepath, update=True, logger=NoopLogger()):
    tmppath = statepath + '.tmp'

    if os.path.isfile(statepath) and not update:
        logger.Log('no update requested, skipping')
        return

    with open(tmppath, 'wb') as statefile:
        logger.Log('fetching ' + self.url)
        data = Get(self.url).content

        logger.GetIndented().Log('size is {} byte(s)'.format(len(data)))

        if self.compression == 'gz':
            logger.GetIndented().Log('decompressing with gzip')
            data = gzip.decompress(data)
        elif self.compression == 'bz2':
            logger.GetIndented().Log('decompressing with bz2')
            data = bz2.decompress(data)
        elif self.compression == 'xz':
            logger.GetIndented().Log('decompressing with xz')
            data = lzma.LZMADecompressor().decompress(data)

        if self.compression:
            logger.GetIndented().Log('size after decompression is {} byte(s)'.format(len(data)))

        logger.GetIndented().Log('saving')
        statefile.write(data)

    # publish the new state only after the file has been fully written and closed
    os.replace(tmppath, statepath)

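# The tmp-file-plus-os.replace sequence above is the standard atomic-write
# pattern: readers observe either the complete old file or the complete new
# one, never a partial write. Below is a minimal sketch of a context manager
# wrapping the same pattern; it is an assumption about what an atomic_file-style
# helper could look like, not the project's actual implementation.
import contextlib
import os


@contextlib.contextmanager
def atomic_write(path, **open_args):
    tmppath = path + '.tmp'
    with open(tmppath, **open_args) as tmpfile:
        yield tmpfile
        # flush to disk before the rename so a crash cannot publish a truncated file
        tmpfile.flush()
        os.fsync(tmpfile.fileno())
    os.replace(tmppath, path)  # atomic within a single filesystem

# usage: with atomic_write('state.json', mode='w', encoding='utf-8') as f:
#            f.write(text)
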
def test_all_fields(self):
    factory = PackageFactory(NoopLogger())

    maker = factory.begin()
    maker.set_name_and_version('foo-1.0')
    maker.set_origin('/foo')
    maker.set_summary('foo package')
    maker.add_maintainers(None, 'a@com', [None, ['b@com']], None, 'c@com')
    maker.add_maintainers('d@com')
    maker.add_categories(None, 'foo', 'bar')
    maker.add_categories('baz')
    maker.add_homepages('http://foo', 'http://bar')
    maker.add_licenses(['GPLv2', 'GPLv3'])
    maker.add_licenses('MIT')
    maker.add_downloads(None, [None, 'http://baz'], ['ftp://quux'])
    pkg = maker.unwrap()

    self.assertEqual(pkg.name, 'foo')
    self.assertEqual(pkg.version, '1.0')
    self.assertEqual(pkg.extrafields['origin'], '/foo')
    self.assertEqual(pkg.maintainers, ['a@com', 'b@com', 'c@com', 'd@com'])
    self.assertEqual(pkg.category, 'foo')  # XXX: convert to array
    self.assertEqual(pkg.homepage, 'http://foo')  # XXX: convert to array
    self.assertEqual(pkg.licenses, ['GPLv2', 'GPLv3', 'MIT'])
    self.assertEqual(pkg.downloads, ['http://baz', 'ftp://quux'])

def Fetch(self, statepath, update=True, logger=NoopLogger()):
    if os.path.isdir(statepath) and not update:
        logger.Log('no update requested, skipping')
        return

    with StateDir(statepath) as statedir:
        numpage = 1
        while True:
            url = self.url + '?page={}&per_page={}&sort=alpha'.format(numpage, self.per_page)
            logger.Log('getting ' + url)

            text = Fetch(url, timeout=self.fetch_timeout).text
            with open(os.path.join(statedir, '{}.json'.format(numpage)), 'w', encoding='utf-8') as pagefile:
                pagefile.write(text)

            # stop on the first empty page
            if not json.loads(text)['crates']:
                logger.Log('last page detected')
                return

            numpage += 1
            time.sleep(1)

def Fetch(self, statepath, update=True, logger=NoopLogger()):
    if os.path.isdir(statepath) and not update:
        logger.Log('no update requested, skipping')
        return

    with StateDir(statepath) as statedir:
        numpage = 0
        nextpageurl = self.url + 'Packages()?$filter=IsLatestVersion'
        while True:
            logger.Log('getting ' + nextpageurl)

            text = Fetch(nextpageurl, timeout=5).text
            with open(os.path.join(statedir, '{}.xml'.format(numpage)), 'w', encoding='utf-8') as pagefile:
                pagefile.write(text)

            # extract the link to the next page, if any
            logger.Log('parsing ' + nextpageurl)
            root = xml.etree.ElementTree.fromstring(text)

            next_link = root.find('{http://www.w3.org/2005/Atom}link[@rel="next"]')
            if next_link is None:
                break

            nextpageurl = next_link.attrib['href']
            numpage += 1

def Fetch(self, statepath, update=True, logger=NoopLogger()):
    if os.path.isfile(statepath) and not update:
        logger.Log('no update requested, skipping')
        return

    # Get and parse repomd.xml
    repomd_url = self.url + 'repodata/repomd.xml'
    logger.Log('fetching metadata from ' + repomd_url)
    repomd_content = Fetch(repomd_url, check_status=True).text
    repomd_xml = xml.etree.ElementTree.fromstring(repomd_content)

    repodata_url = self.url + repomd_xml.find(
        '{http://linux.duke.edu/metadata/repo}data[@type="primary"]/'
        '{http://linux.duke.edu/metadata/repo}location'
    ).attrib['href']

    logger.Log('fetching ' + repodata_url)
    data = Fetch(repodata_url).content

    logger.GetIndented().Log('size is {} byte(s)'.format(len(data)))

    if repodata_url.endswith('gz'):
        logger.GetIndented().Log('decompressing with gzip')
        data = gzip.decompress(data)
    elif repodata_url.endswith('xz'):
        logger.GetIndented().Log('decompressing with xz')
        data = lzma.LZMADecompressor().decompress(data)

    logger.GetIndented().Log('size after decompression is {} byte(s)'.format(len(data)))

    logger.GetIndented().Log('saving')
    with StateFile(statepath, 'wb') as statefile:
        statefile.write(data)

def fetch(self, statepath, update=True, logger=NoopLogger()):
    if os.path.isdir(statepath) and not update:
        logger.Log('no update requested, skipping')
        return

    with atomic_dir(statepath) as statedir:
        self.do_fetch(statedir, logger)

def test_strip(self):
    factory = PackageFactory(NoopLogger())

    maker = factory.begin()
    maker.set_summary(' some package foo ')
    pkg = maker.unwrap()

    self.assertEqual(pkg.comment, 'some package foo')

def fetch(self, statepath, update=True, logger=NoopLogger()):
    if not os.path.isdir(statepath):
        with atomic_dir(statepath) as statedir:
            self.do_fetch(statedir, logger)
    elif update:
        self.do_update(statepath, logger)
    else:
        logger.Log('no update requested, skipping')

def parse(
    self,
    reponames: RepositoryNameList,
    transformer: Optional[PackageTransformer] = None,
    logger: Logger = NoopLogger()
) -> None:
    for repository in self.repomgr.get_repositories(reponames):
        self._parse(repository, transformer, logger)

def Reprocess(self, reponame, transformer=None, logger=NoopLogger()):
    repository = self.repoman.GetRepository(reponame)

    packages = self.__Deserialize(self.__GetSerializedPath(repository), repository, logger)
    packages = self.__Transform(packages, transformer, repository, logger)
    self.__Serialize(packages, self.__GetSerializedPath(repository), repository, logger)

    return packages

def ParseAndSerialize(self, reponame, transformer, logger=NoopLogger()):
    repository = self.repoman.GetRepository(reponame)

    packages = self.__Parse(repository, logger)
    packages = self.__Transform(packages, transformer, repository, logger)
    self.__Serialize(packages, self.__GetSerializedPath(repository), repository, logger)

    return packages

def test_unicalization_with_order_preserved(self):
    factory = PackageFactory(NoopLogger())

    maker = factory.begin()
    maker.add_maintainers('z@com', 'y@com', 'x@com', 'z@com', 'y@com', 'x@com')
    maker.add_maintainers('z@com', 'y@com', 'x@com')
    pkg = maker.unwrap()

    self.assertEqual(pkg.maintainers, ['z@com', 'y@com', 'x@com'])

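# The order-preserving deduplication asserted above is commonly written with
# the dict.fromkeys() idiom: dicts keep insertion order (guaranteed since
# Python 3.7), so building one from a sequence keeps the first occurrence of
# each key. A sketch of the idiom, not necessarily how PackageFactory does it:
def unicalize(values):
    return list(dict.fromkeys(values))


assert unicalize(['z@com', 'y@com', 'x@com', 'z@com', 'y@com']) == ['z@com', 'y@com', 'x@com']
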
def iter_parse(
    self,
    reponames: RepositoryNameList,
    transformer: PackageTransformer | None = None,
    maintainermgr: MaintainerManager | None = None,
    logger: Logger = NoopLogger()
) -> Iterator[Package]:
    for repository in self.repomgr.get_repositories(reponames):
        yield from self._iter_parse_all_sources(repository, transformer, maintainermgr, logger)

def fetch(self, statepath, update=True, logger=NoopLogger()):
    if os.path.isfile(statepath) and not update:
        logger.Log('no update requested, skipping')
        return

    args = {'mode': 'wb'} if self.binary else {'mode': 'w', 'encoding': 'utf-8'}

    with atomic_file(statepath, **args) as statefile:
        self.do_fetch(statefile, logger)

def parse(
    self,
    reponames: RepositoryNameList,
    transformer: PackageTransformer | None = None,
    maintainermgr: MaintainerManager | None = None,
    logger: Logger = NoopLogger()
) -> None:
    for repository in self.repomgr.get_repositories(reponames):
        self._parse(repository, transformer, maintainermgr, logger)

def iter_parse(
    self,
    reponames: RepositoryNameList,
    transformer: Optional[PackageTransformer] = None,
    logger: Logger = NoopLogger()
) -> Iterator[Package]:
    for repository in self.repomgr.get_repositories(reponames):
        yield from self._iter_parse_all_sources(repository, transformer, logger)

def test_normalize_urls(self):
    factory = PackageFactory(NoopLogger())

    maker = factory.begin()
    maker.add_homepages('Http://Foo.coM')
    maker.add_downloads('Http://Foo.coM')
    pkg = maker.unwrap()

    self.assertEqual(pkg.homepage, 'http://foo.com/')
    self.assertEqual(pkg.downloads, ['http://foo.com/'])

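# A sketch of the normalization this test expects, built on the stdlib
# urllib.parse: lowercase the scheme and host, and give a bare authority a
# trailing slash. The helper name and the exact rule set are assumptions for
# illustration, not the factory's verified implementation.
from urllib.parse import urlsplit, urlunsplit


def normalize_url(url):
    parts = urlsplit(url)
    path = parts.path or '/'  # 'http://foo.com' -> 'http://foo.com/'
    return urlunsplit((parts.scheme.lower(), parts.netloc.lower(), path, parts.query, parts.fragment))


assert normalize_url('Http://Foo.coM') == 'http://foo.com/'
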
def fetch(self, reponames: RepositoryNameList, update: bool = True, logger: Logger = NoopLogger()) -> bool:
    have_changes = False

    for repository in self.repomgr.get_repositories(reponames):
        have_changes |= self._fetch(repository, update, logger)

    return have_changes