Example #1
    def do_fetch(self, statefile, logger):
        # Get and parse repomd.xml
        repomd_url = self.url + 'repodata/repomd.xml'
        logger.Log('fetching metadata from ' + repomd_url)
        repomd_content = do_http(repomd_url, check_status=True, timeout=self.fetch_timeout).text
        repomd_xml = xml.etree.ElementTree.fromstring(repomd_content)

        # locate the primary metadata file referenced from repomd.xml
        repodata_url = self.url + repomd_xml.find(
            '{http://linux.duke.edu/metadata/repo}data[@type="primary"]/'
            '{http://linux.duke.edu/metadata/repo}location'
        ).attrib['href']

        logger.Log('fetching ' + repodata_url)
        data = do_http(repodata_url, timeout=self.fetch_timeout).content

        logger.GetIndented().Log('size is {} byte(s)'.format(len(data)))

        if repodata_url.endswith('gz'):
            logger.GetIndented().Log('decompressing with gzip')
            data = gzip.decompress(data)
        elif repodata_url.endswith('xz'):
            logger.GetIndented().Log('decompressing with xz')
            data = lzma.LZMADecompressor().decompress(data)

        logger.GetIndented().Log('size after decompression is {} byte(s)'.format(len(data)))

        logger.GetIndented().Log('saving')

        statefile.write(data)
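
A note on the lookup above: xml.etree.ElementTree only matches namespaced elements when the namespace URI is spelled out in braces before each tag name. A small self-contained sketch of the same lookup, run against a made-up repomd.xml fragment (not a real file):

import xml.etree.ElementTree

REPOMD_SAMPLE = '''<repomd xmlns="http://linux.duke.edu/metadata/repo">
  <data type="primary">
    <location href="repodata/primary.xml.gz"/>
  </data>
</repomd>'''

root = xml.etree.ElementTree.fromstring(REPOMD_SAMPLE)

# the namespace URI must prefix every tag in the path
location = root.find(
    '{http://linux.duke.edu/metadata/repo}data[@type="primary"]/'
    '{http://linux.duke.edu/metadata/repo}location')

print(location.attrib['href'])  # repodata/primary.xml.gz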
Example #2
    def do_fetch(self, statefile, logger):
        fetching_what = [self.url]
        if isinstance(self.post, dict):
            fetching_what.append('{} fields of form data'.format(len(self.post)))
        elif self.post:
            fetching_what.append('{} bytes of post data'.format(len(self.post)))

        if self.headers:
            fetching_what.append('{} extra headers'.format(len(self.headers)))

        logger.Log('fetching ' + ', with '.join(fetching_what))

        data = do_http(self.url, data=self.post, headers=self.headers, timeout=self.fetch_timeout).content

        logger.GetIndented().Log('size is {} byte(s)'.format(len(data)))

        if self.compression == 'gz':
            logger.GetIndented().Log('decompressing with gzip')
            data = gzip.decompress(data)
        elif self.compression == 'bz2':
            logger.GetIndented().Log('decompressing with bz2')
            data = bz2.decompress(data)
        elif self.compression == 'xz':
            logger.GetIndented().Log('decompressing with xz')
            data = lzma.LZMADecompressor().decompress(data)

        if self.compression:
            logger.GetIndented().Log('size after decompression is {} byte(s)'.format(len(data)))

        logger.GetIndented().Log('saving')

        statefile.write(data)
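
The gz/bz2/xz branches above could equally be table-driven; a minimal sketch using only the standard library (decompress_payload is a made-up helper name, not part of the project):

import bz2
import gzip
import lzma
from typing import Optional

_DECOMPRESSORS = {
    'gz': gzip.decompress,
    'bz2': bz2.decompress,
    'xz': lzma.decompress,
}

def decompress_payload(data: bytes, compression: Optional[str] = None) -> bytes:
    # no (or unknown) compression: return the payload untouched
    if compression not in _DECOMPRESSORS:
        return data
    return _DECOMPRESSORS[compression](data)

assert decompress_payload(gzip.compress(b'hello'), 'gz') == b'hello'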
Example #3
    def do_fetch(self, statedir, logger):
        for letter in ['0-9'] + list(ascii_uppercase):
            page = 1
            numpages = 1
            while True:
                logger.Log('fetching {} page {}'.format(letter, page))

                pageurl = '{}/{}/page/{}/'.format(self.url, letter, page)

                # fetch HTML
                response = do_http(pageurl, timeout=self.fetch_timeout)
                response.encoding = 'utf-8'  # encoding is not detected properly, force utf-8
                text = response.text

                # get the number of pages, if there is more than one
                if numpages == 1:
                    for pagebutton in lxml.html.document_fromstring(text).xpath('.//nav[@class="page-selector"]/a'):
                        numpages = max(numpages, int(pagebutton.text))

                # save HTML
                with open(os.path.join(statedir, '{}-{}.html'.format(letter, page)), 'w', encoding='utf-8') as pagefile:
                    pagefile.write(text)

                # stop if that was the last (or only) page
                if page >= numpages:
                    break

                # proceed with the next page
                page += 1

                if self.fetch_delay:
                    time.sleep(self.fetch_delay)
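
The total page count above is read from the "page-selector" navigation of the first page fetched. A self-contained sketch of just that extraction, run against a hypothetical HTML fragment that only mimics the real markup:

import lxml.html

HTML_SAMPLE = '''<html><body>
<nav class="page-selector">
  <a href="/A/page/1/">1</a>
  <a href="/A/page/2/">2</a>
  <a href="/A/page/3/">3</a>
</nav>
</body></html>'''

numpages = 1
for pagebutton in lxml.html.document_fromstring(HTML_SAMPLE).xpath('.//nav[@class="page-selector"]/a'):
    numpages = max(numpages, int(pagebutton.text))

print(numpages)  # 3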
Example #4
    def _do_fetch(self, statefile: AtomicFile, persdata: PersistentData,
                  logger: Logger) -> bool:
        # fetch and parse repomd.xml
        repomd_url = self.url + 'repodata/repomd.xml'
        logger.log('fetching metadata from ' + repomd_url)
        repomd_content = do_http(repomd_url,
                                 check_status=True,
                                 timeout=self.fetch_timeout).text
        repomd = xml.etree.ElementTree.fromstring(repomd_content)
        repomd_elt_primary = repomd.find(
            '{http://linux.duke.edu/metadata/repo}data[@type="primary"]')
        if repomd_elt_primary is None:
            raise RuntimeError('Cannot find <primary> element in repomd.xml')

        repomd_elt_primary_location = repomd_elt_primary.find(
            './{http://linux.duke.edu/metadata/repo}location')
        repomd_elt_primary_checksum = repomd_elt_primary.find(
            './{http://linux.duke.edu/metadata/repo}open-checksum[@type="sha256"]'
        )

        if repomd_elt_primary_checksum is None:
            logger.log('no supported checksum', Logger.WARNING)
        elif repomd_elt_primary_checksum.text == persdata.get(
                'open-checksum-sha256'):
            logger.log('checksum not changed: {}'.format(
                repomd_elt_primary_checksum.text))
            return False

        if repomd_elt_primary_location is None:
            raise RuntimeError('Cannot find <location> element in repomd.xml')

        repodata_url = self.url + repomd_elt_primary_location.attrib['href']

        # fetch actual repo data
        compression = None
        if repodata_url.endswith('gz'):
            compression = 'gz'
        elif repodata_url.endswith('xz'):
            compression = 'xz'

        logger.log('fetching {}'.format(repodata_url))

        save_http_stream(repodata_url,
                         statefile.get_file(),
                         compression=compression,
                         timeout=self.fetch_timeout)

        if repomd_elt_primary_checksum is not None and repomd_elt_primary_checksum.text:
            persdata['open-checksum-sha256'] = repomd_elt_primary_checksum.text
            logger.log('saving checksum: {}'.format(
                persdata['open-checksum-sha256']))

        logger.log('size is {} byte(s)'.format(
            os.path.getsize(statefile.get_path())))

        return True
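
The early return above is what makes repeated runs cheap: the sha256 "open-checksum" from repomd.xml is stored in persdata and compared on the next run. A sketch of that rule with a plain dict standing in for PersistentData (should_refetch is a made-up helper name):

def should_refetch(new_checksum, persdata):
    if new_checksum is None:
        # no usable checksum published, cannot tell whether anything changed
        return True
    # skip the download when the stored checksum still matches
    return persdata.get('open-checksum-sha256') != new_checksum

persdata = {'open-checksum-sha256': 'abc123'}
print(should_refetch('abc123', persdata))  # False - metadata unchanged
print(should_refetch('def456', persdata))  # True - metadata changed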
Example #5
    def _load_spec(self, package: str, statedir: AtomicDir,
                   logger: Logger) -> None:
        specurl = self.giturl + '/{0}.git/plain/{0}.spec'.format(package)

        logger.get_indented().log('getting spec from {}'.format(specurl))

        r = do_http(specurl, check_status=False)
        if r.status_code != 200:
            deadurl = self.giturl + '/{0}.git/plain/dead.package'.format(
                package)
            dr = do_http(deadurl, check_status=False)
            if dr.status_code == 200:
                logger.get_indented(2).log('dead: ' +
                                           ';'.join(dr.text.split('\n')))
            else:
                logger.get_indented(2).log('failed: {}'.format(
                    r.status_code))  # XXX: check .dead.package, instead throw
            return

        with open(os.path.join(statedir.get_path(), package + '.spec'),
                  'wb') as file:
            file.write(r.content)
Example #6
    def fetch(self,
              statepath: str,
              update: bool = True,
              logger: Logger = NoopLogger()) -> bool:
        if os.path.isfile(statepath) and not update:
            logger.log('no update requested, skipping')
            return False

        state: Dict[str, Any] = {}

        if os.path.isfile(statepath):
            with open(statepath, 'r', encoding='utf-8') as oldstatefile:
                state = json.load(oldstatefile)
            logger.log('loaded old state, {} entries'.format(len(state)))
        else:
            logger.log('starting with empty state')

        newdata = json.loads(do_http(self.url).text)

        if not newdata['releases']:
            raise RuntimeError(
                'Empty freshcode package list received, refusing to go on')

        # merge new entries into the state; an existing entry is only
        # replaced when the incoming version is higher
        for entry in newdata['releases']:
            if 'name' not in entry:
                logger.log('skipping entry with no name')
                continue

            if entry['name'] in state:
                oldentry = state[entry['name']]

                if version_compare(entry['version'], oldentry['version']) > 0:
                    logger.log(
                        'replacing entry "{}", version changed {} -> {}'.
                        format(entry['name'], oldentry['version'],
                               entry['version']))
                    state[entry['name']] = entry
            else:
                logger.log('adding entry "{}", version {}'.format(
                    entry['name'], entry['version']))
                state[entry['name']] = entry

        with AtomicFile(statepath, 'w', encoding='utf-8') as statefile:
            json.dump(state, statefile.get_file())

        logger.log('saved new state, {} entries'.format(len(state)))

        return True
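
The merge above keeps whichever entry carries the higher version; version_compare itself is not shown in these examples, so the sketch below substitutes a deliberately naive numeric comparison (real package versions need a proper comparison library):

def naive_version_compare(a, b):
    # dot-separated numeric components only; a stand-in, not version_compare
    pa = [int(part) for part in a.split('.')]
    pb = [int(part) for part in b.split('.')]
    return (pa > pb) - (pa < pb)

state = {'foo': {'name': 'foo', 'version': '1.2'}}
entry = {'name': 'foo', 'version': '1.10'}

old = state.get(entry['name'])
if old is None or naive_version_compare(entry['version'], old['version']) > 0:
    state[entry['name']] = entry

print(state['foo']['version'])  # 1.10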
Example #7
    def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData,
                  logger: Logger) -> bool:
        page = 1

        while True:
            pageurl = self.apiurl + 'packages/?page={}'.format(page)
            logger.log('getting page {} from {}'.format(page, pageurl))
            pagedata = json.loads(do_http(pageurl).text)

            for package in pagedata['packages']:
                self._load_spec(package['name'], statedir, logger)

            page += 1

            if page > pagedata['page_total']:
                break

        return True
Example #8
    def do_fetch(self, statefile, logger):
        # fetch and parse repomd.xml
        repomd_url = self.url + 'repodata/repomd.xml'
        logger.Log('fetching metadata from ' + repomd_url)
        repomd_content = do_http(repomd_url, check_status=True, timeout=self.fetch_timeout).text
        repomd_xml = xml.etree.ElementTree.fromstring(repomd_content)

        # locate the primary metadata file referenced from repomd.xml
        repodata_url = self.url + repomd_xml.find(
            '{http://linux.duke.edu/metadata/repo}data[@type="primary"]/'
            '{http://linux.duke.edu/metadata/repo}location'
        ).attrib['href']

        # fetch actual repo data
        compression = None
        if repodata_url.endswith('gz'):
            compression = 'gz'
        elif repodata_url.endswith('xz'):
            compression = 'xz'

        logger.Log('fetching {}'.format(repodata_url))

        save_http_stream(repodata_url, statefile, compression=compression, timeout=self.fetch_timeout)

        logger.Log('size is {} byte(s)'.format(statefile.tell()))
Example #9
def fetch(*args, **kwargs):
    return do_http(*args, **kwargs)
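
do_http itself is not defined in any of these examples. Judging from the keyword arguments used above (check_status, timeout, data, headers), it behaves like a thin wrapper around requests; a minimal stand-in under that assumption, not the project's actual implementation:

import requests

def do_http(url, check_status=True, timeout=60, data=None, headers=None):
    # POST when a payload is supplied, plain GET otherwise
    method = 'POST' if data is not None else 'GET'
    response = requests.request(method, url, data=data, headers=headers, timeout=timeout)
    if check_status:
        response.raise_for_status()
    return response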