Example #1
    def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData,
                  logger: Logger) -> bool:
        numpage = 0
        nextpageurl = self.url + 'Packages()?$filter=IsLatestVersion'
        while True:
            logger.log('getting ' + nextpageurl)

            text = self.do_http(nextpageurl).text
            with open(os.path.join(statedir.get_path(),
                                   '{}.xml'.format(numpage)),
                      'w',
                      encoding='utf-8') as pagefile:
                pagefile.write(text)

            # parse the fetched page to find the link to the next one
            logger.log('parsing ' + nextpageurl)
            root = xml.etree.ElementTree.fromstring(text)

            next_link = root.find(
                '{http://www.w3.org/2005/Atom}link[@rel="next"]')
            if next_link is None:
                break

            nextpageurl = next_link.attrib['href']
            numpage += 1

        return True
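
For reference, here is a minimal standalone sketch (not part of the fetcher above) of the namespaced lookup it relies on: Atom elements live in the 'http://www.w3.org/2005/Atom' namespace, so ElementTree's find() needs the fully qualified '{namespace}tag' form to locate the rel="next" link. The feed string below is invented for illustration.

import xml.etree.ElementTree

ATOM = '{http://www.w3.org/2005/Atom}'

# a tiny hand-written feed standing in for a real paginated feed
feed = xml.etree.ElementTree.fromstring(
    '<feed xmlns="http://www.w3.org/2005/Atom">'
    '<link rel="next" href="https://example.com/Packages()?page=2"/>'
    '</feed>')

next_link = feed.find(ATOM + 'link[@rel="next"]')
if next_link is not None:
    print(next_link.attrib['href'])  # https://example.com/Packages()?page=2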
Example #2
    def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData, logger: Logger) -> bool:
        packages_url = self.url + 'packages.gz'
        logger.get_indented().log('fetching package list from ' + packages_url)
        data = self.do_http(packages_url).text  # autogunzipped?

        package_names = []

        for line in data.split('\n'):
            line = line.strip()
            if line.startswith('#') or line == '':
                continue
            package_names.append(line)

        if not package_names:
            raise RuntimeError('Empty package list received, refusing to continue')

        logger.get_indented().log('{} package name(s) parsed'.format(len(package_names)))

        for num_page, (url, num_packages) in enumerate(_split_names_into_urls(self.url + '/rpc/?v=5&type=info', package_names, self.max_api_url_length)):
            logger.get_indented().log('fetching page {} of {} package(s)'.format(num_page + 1, num_packages))

            with open(os.path.join(statedir.get_path(), '{}.json'.format(num_page)), 'wb') as statefile:
                statefile.write(self.do_http(url).content)

        return True
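
The helper _split_names_into_urls used above is not shown in this example. A hypothetical sketch of such a helper, which batches package names into as few request URLs as possible while keeping each URL under a maximum length, could look like this (the '&arg[]=' parameter format and quoting are assumptions, not necessarily the project's actual implementation):

from typing import Iterable, Iterator, Tuple
from urllib.parse import quote


def _split_names_into_urls(base_url: str, package_names: Iterable[str],
                           max_url_length: int) -> Iterator[Tuple[str, int]]:
    # hypothetical sketch: pack '&arg[]=<name>' parameters into each URL
    # until max_url_length would be exceeded, yielding (url, package_count)
    parts = [base_url]
    length = len(base_url)

    for name in package_names:
        arg = '&arg[]=' + quote(name)

        if length + len(arg) > max_url_length and len(parts) > 1:
            yield ''.join(parts), len(parts) - 1
            parts = [base_url]
            length = len(base_url)

        parts.append(arg)
        length += len(arg)

    if len(parts) > 1:
        yield ''.join(parts), len(parts) - 1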
Example #3
    def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData,
                  logger: Logger) -> bool:
        tarpath = os.path.join(statedir.get_path(), '.temporary.tar')

        headers = {}

        if persdata.get('last-modified'):
            headers['if-modified-since'] = persdata.get('last-modified')
            logger.log('using if-modified-since: {}'.format(
                headers['if-modified-since']))

        logger.log('fetching {}'.format(self.url))

        try:
            with open(tarpath, 'wb') as tarfile:
                response = save_http_stream(self.url,
                                            tarfile,
                                            headers=headers,
                                            timeout=self.fetch_timeout)
        except NotModifiedException:
            logger.log('got 304 not modified')
            return False

        # XXX: may be unportable: FreeBSD tar detects the compression type
        # automatically, which may not be the case on Linux
        # XXX: this preserves the permissions stored in the tarball, which is not
        # desirable and may produce non-readable files and dirs (blackarch). GNU tar
        # has --mode, BSD tar lacks it. We should probably require GNU tar, and
        # handle the binary name, which may differ on BSD.
        run_subprocess(
            ['tar', '-x', '-z', '-f', tarpath, '-C',
             statedir.get_path()], logger)
        os.remove(tarpath)

        if response.headers.get('last-modified'):
            persdata['last-modified'] = response.headers['last-modified']
            logger.log('storing last-modified: {}'.format(
                persdata['last-modified']))

        return True
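
save_http_stream and NotModifiedException are project helpers whose implementations are not shown here. As a rough standalone illustration of the same conditional-request pattern with plain requests (the function name and timeout value are made up for this sketch):

from typing import Optional

import requests


def fetch_if_modified(url: str, last_modified: Optional[str]) -> Optional[requests.Response]:
    # send If-Modified-Since when a cached Last-Modified value is available
    headers = {}
    if last_modified is not None:
        headers['if-modified-since'] = last_modified

    response = requests.get(url, headers=headers, stream=True, timeout=60)

    if response.status_code == 304:  # not modified since the last fetch
        return None

    response.raise_for_status()
    return response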
Example #4
    def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData,
                  logger: Logger) -> bool:
        tarpath = os.path.join(statedir.get_path(), '.temporary.tar')

        headers = {}

        if persdata.get('last-modified'):
            headers['if-modified-since'] = persdata.get('last-modified')
            logger.log('using if-modified-since: {}'.format(
                headers['if-modified-since']))

        logger.log('fetching {}'.format(self.url))

        try:
            with open(tarpath, 'wb') as tarfile:
                response = save_http_stream(self.url,
                                            tarfile,
                                            headers=headers,
                                            timeout=self.fetch_timeout)
        except NotModifiedException:
            logger.log('got 304 not modified')
            return False

        # XXX: may be unportable: FreeBSD tar detects the compression type
        # automatically, which may not be the case on Linux
        run_subprocess(
            ['tar', '-x', '-z', '-f', tarpath, '-C',
             statedir.get_path()], logger)
        os.remove(tarpath)

        if response.headers.get('last-modified'):
            persdata['last-modified'] = response.headers['last-modified']
            logger.log('storing last-modified: {}'.format(
                persdata['last-modified']))

        return True
Example #5
    def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData, logger: Logger) -> bool:
        numpage = 1
        while True:
            url = self.url + '?page={}&per_page={}&sort=alpha'.format(numpage, self.per_page)
            logger.log('getting ' + url)

            text = self.do_http(url).text
            with open(os.path.join(statedir.get_path(), '{}.json'.format(numpage)), 'w', encoding='utf-8') as pagefile:
                pagefile.write(text)

            # check whether this was the last page
            if not json.loads(text)['crates']:
                logger.log('last page detected')
                return True

            numpage += 1
Example #6
    def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData, logger: Logger) -> bool:
        page_counter = count()
        query = '?per_page={}&sort=alpha'.format(self.per_page)
        while query:
            url = self.url + query
            logger.log('getting ' + url)

            text = self.do_http(url).text
            with open(os.path.join(statedir.get_path(), '{}.json'.format(next(page_counter))), 'w', encoding='utf-8') as pagefile:
                pagefile.write(text)
                pagefile.flush()
                os.fsync(pagefile.fileno())

            # extract the query string for the next page (empty on the last page)
            query = json.loads(text)['meta']['next_page']

        logger.log('last page detected')
        return True
Example #7
    def _do_fetch(self, statedir: AtomicDir, persdata: PersistentData,
                  logger: Logger) -> bool:
        for letter in ['0-9'] + list(ascii_uppercase):
            page = 1
            numpages = 1
            while True:
                logger.log('fetching {} page {}'.format(letter, page))

                pageurl = '{}/{}/page/{}/'.format(self.url, letter, page)

                # fetch HTML
                response = self.do_http(pageurl)
                response.encoding = 'utf-8'  # is not detected properly
                text = response.text

                # get the number of pages, if there is more than one
                if numpages == 1:
                    document = lxml.html.document_fromstring(text)
                    for pagebutton in document.xpath('.//nav[@class="page-selector"]/a'):  # type: ignore
                        numpages = max(numpages, int(pagebutton.text))  # type: ignore

                # save HTML
                with open(os.path.join(statedir.get_path(),
                                       '{}-{}.html'.format(letter, page)),
                          'w',
                          encoding='utf-8') as pagefile:
                    pagefile.write(text)
                    pagefile.flush()
                    os.fsync(pagefile.fileno())

                # stop if that was the last (or only) page
                if page >= numpages:
                    break

                # proceed with the next page
                page += 1

        return True
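
A standalone illustration of the page-count extraction used above, assuming markup along the lines of <nav class="page-selector"><a>1</a><a>2</a>...</nav> (the HTML fragment below is invented for the example):

import lxml.html

html = '<nav class="page-selector"><a>1</a><a>2</a><a>3</a></nav>'

numpages = 1
for pagebutton in lxml.html.document_fromstring(html).xpath('.//nav[@class="page-selector"]/a'):
    numpages = max(numpages, int(pagebutton.text))

print(numpages)  # 3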
Example #8
    def _load_spec(self, package: str, statedir: AtomicDir,
                   logger: Logger) -> None:
        specurl = self.giturl + '/{0}.git/plain/{0}.spec'.format(package)

        logger.get_indented().log('getting spec from {}'.format(specurl))

        r = do_http(specurl, check_status=False)
        if r.status_code != 200:
            deadurl = self.giturl + '/{0}.git/plain/dead.package'.format(
                package)
            dr = do_http(deadurl, check_status=False)
            if dr.status_code == 200:
                logger.get_indented(2).log('dead: ' +
                                           ';'.join(dr.text.split('\n')))
            else:
                logger.get_indented(2).log('failed: {}'.format(
                    r.status_code))  # XXX: check .dead.package, instead throw
            return

        with open(os.path.join(statedir.get_path(), package + '.spec'),
                  'wb') as file:
            file.write(r.content)
Example #9
    def _do_fetch_scroll(self, statedir: AtomicDir, logger: Logger) -> None:
        numpage = 0

        logger.log('getting page {}'.format(numpage))
        response = self._do_http('{}?scroll={}'.format(self._url,
                                                       self._scroll),
                                 json=self._request_data).json()

        scroll_id = response['_scroll_id']

        while response['hits']['hits']:
            with open(os.path.join(statedir.get_path(),
                                   '{}.json'.format(numpage)),
                      'w',
                      encoding='utf-8') as pagefile:
                json.dump(response['hits']['hits'], pagefile)
                pagefile.flush()
                os.fsync(pagefile.fileno())

            numpage += 1

            logger.log('getting page {}'.format(numpage))
            response = self._do_http('{}?scroll={}&scroll_id={}'.format(
                self._scroll_url, self._scroll, scroll_id)).json()

        try:
            self._do_http(self._scroll_url,
                          method='DELETE',
                          json={
                              'scroll_id': scroll_id
                          }).json()
        except requests.exceptions.HTTPError as e:
            # we don't care too much if removing the scroll fails, it'll time out anyway,
            # but log the server's reply for diagnostics
            logger.log('failed to DELETE scroll, server reply follows:\n' +
                       e.response.text,
                       severity=Logger.ERROR)
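
For context, the requests above follow Elasticsearch's scroll API: the initial search with a scroll=<keep-alive> parameter returns a _scroll_id, subsequent requests to the scroll endpoint pass that id to fetch the next batch, and a DELETE on the scroll endpoint releases the context early. A rough standalone sketch with plain requests (URLs and the query body are invented; _do_http is a project helper not shown here):

import requests

search_url = 'https://example.com/packages/_search'
scroll_url = 'https://example.com/_search/scroll'

# initial search, keeping the scroll context alive for one minute
response = requests.post(search_url, params={'scroll': '1m'},
                         json={'size': 100, 'query': {'match_all': {}}}).json()
scroll_id = response['_scroll_id']

while response['hits']['hits']:
    # process response['hits']['hits'] here, then fetch the next batch
    response = requests.post(scroll_url,
                             json={'scroll': '1m', 'scroll_id': scroll_id}).json()

# release the scroll context early instead of waiting for the timeout
requests.delete(scroll_url, json={'scroll_id': scroll_id})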