Example 1
    async def test_http_retry_then_succeed(self):
        async def handle_post(_):
            return web.Response(text='the-response-data')

        app = web.Application()
        app.add_routes([web.post('/page', handle_post)])
        runner = web.AppRunner(app)

        request, close = Pool()
        self.add_async_cleanup(close)

        retriable_request = retry(
            request,
            exception_intervals=((HttpConnectionError, (0, 1, 2)), ),
        )

        async def delayed_start():
            await asyncio.sleep(0.5)
            await runner.setup()
            self.add_async_cleanup(runner.cleanup)
            site = web.TCPSite(runner, '0.0.0.0', 8080)
            await site.start()

        asyncio.ensure_future(delayed_start())
        _, _, body = await retriable_request(b'POST',
                                             'http://localhost:8080/page',
                                             body=streamed(b'some-data'),
                                             headers=((b'content-length',
                                                       b'9'), ))

        self.assertEqual(b'the-response-data', await buffered(body))
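
Outside a test case, the same retrying request could be wired up roughly as below. This is a minimal sketch reusing the Pool, retry, streamed, buffered and HttpConnectionError helpers from the example above; the module names in the imports are an assumption (the API matches the lowhaio and lowhaio-retry packages), and the URL is illustrative.

import asyncio

from lowhaio import Pool, HttpConnectionError, buffered, streamed  # assumed module name
from lowhaio_retry import retry  # assumed module name


async def main():
    request, close = Pool()
    retriable_request = retry(
        request,
        # Retry on connection errors, waiting 0, 1 and 2 seconds between attempts
        exception_intervals=((HttpConnectionError, (0, 1, 2)), ),
    )
    try:
        _, _, body = await retriable_request(
            b'POST',
            'http://localhost:8080/page',  # illustrative URL
            body=streamed(b'some-data'),
            headers=((b'content-length', b'9'), ),
        )
        print(await buffered(body))
    finally:
        await close()


asyncio.run(main())
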
Example 2
    async def test_http_post_small_buffered_streamed(self):
        request, close = Pool()
        self.add_async_cleanup(close)

        code, headers, body = await request(
            b'POST',
            'http://postman-echo.com/post',
            (),
            (
                (b'content-length', b'19'),
                (b'content-type', b'application/x-www-form-urlencoded'),
            ),
            streamed(b'some-data=something'),
        )
        body_bytes = await buffered(body)

        headers_dict = dict(headers)
        response_dict = json.loads(body_bytes)

        self.assertEqual(code, b'200')
        self.assertEqual(headers_dict[b'content-type'],
                         b'application/json; charset=utf-8')
        self.assertEqual(response_dict['headers']['host'], 'postman-echo.com')
        self.assertEqual(response_dict['headers']['content-length'], '19')
        self.assertEqual(response_dict['form'], {'some-data': 'something'})
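
Throughout these examples, request and response bodies are asynchronous streams of bytes chunks: streamed(...) builds a request body from an in-memory value, buffered(...) collects a response body back into bytes, blackhole(...) discards one, and empty_async_iterator stands in for an empty body. The definitions below are a rough model assumed from how the helpers are used here, not necessarily the library's actual implementations.

def streamed(data):
    # A request body appears to be a no-argument callable returning an async
    # iterator of bytes chunks: streamed(...) is passed interchangeably with
    # `lambda: body` and `empty_async_iterator` in the mirror examples below
    async def _body():
        yield data
    return _body


async def buffered(body):
    # Collect a response body (an async iterator of bytes chunks) into bytes
    return b''.join([chunk async for chunk in body])


async def blackhole(body):
    # Drain and discard a response body without buffering it
    async for _ in body:
        pass


async def empty_async_iterator():
    # A body that yields nothing, for requests without a payload
    return
    yield
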
Example 3
 async def list_packages():
     request_body = (
         b'<?xml version="1.0"?>'
         b"<methodCall><methodName>list_packages</methodName></methodCall>")
     _, _, body = await request(
         b"POST",
         source_base + "/pypi",
         body=streamed(request_body),
         headers=(
             (b"content-type", b"text/xml"),
             (b"content-length", str(len(request_body)).encode()),
         ),
     )
     return [
         package.text for package in ET.fromstring(await buffered(
             body)).findall("./params/param/value/array/data/value/string")
     ]
Example 4
 async def list_packages():
     request_body = (
         b'<?xml version="1.0"?>'
         b'<methodCall><methodName>list_packages</methodName></methodCall>')
     _, _, body = await request(
         b'POST',
         source_base + '/pypi',
         body=streamed(request_body),
         headers=(
             (b'content-type', b'text/xml'),
             (b'content-length', str(len(request_body)).encode()),
         ),
     )
     return [
         package.text for package in ET.fromstring(await buffered(
             body)).findall('./params/param/value/array/data/value/string')
     ]
Example 5
 async def changelog(sync_changes_after):
     request_body = (
         b'<?xml version="1.0"?>'
         b"<methodCall><methodName>changelog</methodName><params>"
         b"<param><value>"
         b"<int>" + str(sync_changes_after).encode() + b"</int>"
         b"</value></param>"
         b"</params></methodCall>")
     _, _, body = await request(
         b"POST",
         source_base + "/pypi",
         body=streamed(request_body),
         headers=(
             (b"content-type", b"text/xml"),
             (b"content-length", str(len(request_body)).encode()),
         ),
     )
     return [
         package.text
         for package in ET.fromstring(
             (await buffered(body)).replace(b"\x1b", b"")
         ).findall(
             "./params/param/value/array/data/value/array/data/value[1]/string"
         )
     ]
Example 6
 async def changelog(sync_changes_after):
     request_body = (
         b'<?xml version="1.0"?>'
         b'<methodCall><methodName>changelog</methodName><params>'
         b'<param><value>'
         b'<int>' + str(sync_changes_after).encode() + b'</int>'
         b'</value></param>'
         b'</params></methodCall>')
     _, _, body = await request(
         b'POST',
         source_base + '/pypi',
         body=streamed(request_body),
         headers=(
             (b'content-type', b'text/xml'),
             (b'content-length', str(len(request_body)).encode()),
         ),
     )
     return [
         package.text
         for package in ET.fromstring(await buffered(body)).findall(
             './params/param/value/array/data/value/array/data/value[1]/string'
         )
     ]
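
The long findall() path in changelog() selects the package name, i.e. the first value of each entry in the XML-RPC methodResponse. The response below is made up, purely to illustrate what that path matches.

import xml.etree.ElementTree as ET

sample_response = (
    b'<?xml version="1.0"?>'
    b'<methodResponse><params><param><value><array><data>'
    b'<value><array><data>'
    b'<value><string>example-package</string></value>'
    b'<value><string>1.0.0</string></value>'
    b'<value><int>1600000000</int></value>'
    b'</data></array></value>'
    b'</data></array></value></param></params></methodResponse>')

names = [
    value.text
    for value in ET.fromstring(sample_response).findall(
        './params/param/value/array/data/value/array/data/value[1]/string')
]
assert names == ['example-package']
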
Example 7
async def nltk_mirror(logger, request, s3_context, s3_prefix):
    base = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/"
    input_index_url = f"{base}/index.xml"

    # Fetch index file
    code, headers, body = await request(b"GET", input_index_url)
    if code != b"200":
        raise Exception("Unable to fetch index")
    headers_xml_lower = dict((key.lower(), value) for key, value in headers)
    index_xml_content_type = headers_xml_lower[b"content-type"]
    index_xml = ET.fromstring(await buffered(body))

    # Transfer package contents to S3 if they haven't already been transferred
    for package in index_xml.findall("./packages/package"):
        package_original_url = package.attrib["url"]
        package_checksum = package.attrib["checksum"]
        package_id = package.attrib["id"]

        # The NLTK downloader is sensitive to the extension of the file
        _, ext = os.path.splitext(package_original_url)
        new_path = f"/{s3_prefix}{package_id}-{package_checksum}{ext}"

        package.attrib[
            "url"
        ] = f"https://{s3_context.bucket.host}/{s3_context.bucket.name}{new_path}"

        # If the file already exists, don't re-upload. Named including the
        # checksum deliberately so we can do this
        code, _ = await s3_request_full(
            logger,
            s3_context,
            b"HEAD",
            new_path,
            (),
            headers,
            empty_async_iterator,
            "UNSIGNED-PAYLOAD",
        )
        logger.info("%s %s", f"{s3_context.bucket.name}{new_path}", code)
        if code == b"200":
            continue

        # Stream the file to S3
        code, headers, body = await request(b"GET", package_original_url)
        if code != b"200":
            await blackhole(body)
            raise Exception(f"{code} {package_original_url}")

        headers_lower = dict((key.lower(), value) for key, value in headers)
        headers = (
            (b"content-length", headers_lower[b"content-length"]),
            (b"content-type", headers_lower[b"content-type"]),
        )
        code, _ = await s3_request_full(
            logger,
            s3_context,
            b"PUT",
            new_path,
            (),
            headers,
            lambda: body,
            "UNSIGNED-PAYLOAD",
        )
        if code != b"200":
            raise Exception()

    # The ascii encoding is important: the nltk downloader seems to assume
    # attributes are ascii
    output_xml_str = ET.tostring(index_xml, encoding="ascii", method="xml")
    index_xml_content_length = str(len(output_xml_str)).encode("ascii")
    headers = (
        (b"content-length", index_xml_content_length),
        (b"content-type", index_xml_content_type),
    )
    code, _ = await s3_request_full(
        logger,
        s3_context,
        b"PUT",
        f"/{s3_prefix}index.xml",
        (),
        headers,
        streamed(output_xml_str),
        "UNSIGNED-PAYLOAD",
    )
    if code != b"200":
        raise Exception()
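
In isolation, the url rewrite that nltk_mirror applies to each package entry works as sketched below; the index XML, the prefix and the bucket host are made up for illustration.

import os
import xml.etree.ElementTree as ET

index_xml = ET.fromstring(
    b'<nltk_data><packages>'
    b'<package id="punkt" checksum="abc123" '
    b'url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip"/>'
    b'</packages></nltk_data>')

s3_prefix = "nltk/"  # illustrative
for package in index_xml.findall("./packages/package"):
    _, ext = os.path.splitext(package.attrib["url"])
    # Here new_path becomes "/nltk/punkt-abc123.zip", and the url now points at the mirror
    new_path = f"/{s3_prefix}{package.attrib['id']}-{package.attrib['checksum']}{ext}"
    package.attrib["url"] = f"https://bucket-host/bucket-name{new_path}"

# ascii output matters: the nltk downloader appears to assume ascii attributes
print(ET.tostring(index_xml, encoding="ascii", method="xml").decode("ascii"))
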
Example 8
async def conda_mirror(logger, request, s3_context, source_base_url,
                       s3_prefix):
    arch_dirs = ["noarch/", "linux-64/"]
    repodatas = []
    queue = asyncio.Queue()

    logger.info("Finding existing files")
    existing_files = {
        key
        async for key in s3_list_keys_relative_to_prefix(
            logger, s3_context, s3_prefix)
    }

    for arch_dir in arch_dirs:
        code, _, body = await request(
            b"GET", source_base_url + arch_dir + "repodata.json")
        if code != b"200":
            raise Exception()

        source_repodata_raw = await buffered(body)
        source_repodata = json.loads(source_repodata_raw)

        for package_suffix, _ in source_repodata["packages"].items():
            await queue.put(arch_dir + package_suffix)

        repodatas.append((arch_dir + "repodata.json", source_repodata_raw))

        code, _, body = await request(
            b"GET", source_base_url + arch_dir + "repodata.json.bz2")
        if code != b"200":
            raise Exception()
        repodatas.append((arch_dir + "repodata.json.bz2", await
                          buffered(body)))

    async def transfer_package(package_suffix):
        source_package_url = source_base_url + package_suffix
        target_package_key = s3_prefix + package_suffix

        exists = package_suffix in existing_files
        if exists:
            logger.debug("Skipping transfer of {}".format("/" +
                                                          target_package_key))
            return

        code, headers, body = await request(b"GET", source_package_url)
        if code != b"200":
            response = await buffered(body)
            raise Exception("Exception GET {} {} {}".format(
                source_package_url, code, response))
        headers_lower = dict((key.lower(), value) for key, value in headers)
        headers = ((b"content-length", headers_lower[b"content-length"]), )
        code, body = await s3_request_full(
            logger,
            s3_context,
            b"PUT",
            "/" + target_package_key,
            (),
            headers,
            lambda: body,
            "UNSIGNED-PAYLOAD",
        )
        if code != b"200":
            raise Exception(
                "Exception PUT {} {} {}".format("/" + target_package_key, code,
                                                body))

    async def transfer_task():
        while True:
            package_suffix = await queue.get()

            try:
                for _ in range(0, 10):
                    try:
                        await transfer_package(package_suffix)
                    except (HttpConnectionError, HttpDataError):
                        await asyncio.sleep(10)
                    else:
                        break
            except Exception:  # pylint: disable=broad-except
                logger.exception("Exception transferring %s", package_suffix)
            finally:
                queue.task_done()

    tasks = [asyncio.ensure_future(transfer_task()) for _ in range(0, 10)]
    try:
        await queue.join()
    finally:
        for task in tasks:
            task.cancel()
        await asyncio.sleep(0)

    for path, data in repodatas:
        target_repodata_key = s3_prefix + path
        headers = ((b"content-length", str(len(data)).encode("ascii")), )
        code, _ = await s3_request_full(
            logger,
            s3_context,
            b"PUT",
            "/" + target_repodata_key,
            (),
            headers,
            streamed(data),
            s3_hash(data),
        )
        if code != b"200":
            raise Exception()
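
conda_mirror, and cran_mirror and pypi_mirror below, all fan work out the same way: fill an asyncio.Queue, start a fixed number of worker tasks, wait for queue.join(), then cancel the workers. Stripped of the mirroring specifics, the pattern is roughly the following sketch.

import asyncio


async def run_with_workers(items, handle, concurrency=10):
    # Bounded concurrency: `concurrency` workers pull items from a shared queue
    queue = asyncio.Queue()
    for item in items:
        await queue.put(item)

    async def worker():
        while True:
            item = await queue.get()
            try:
                await handle(item)
            except Exception:  # keep the worker alive for the remaining items
                pass
            finally:
                queue.task_done()

    tasks = [asyncio.ensure_future(worker()) for _ in range(concurrency)]
    try:
        # Returns once task_done() has been called for every queued item
        await queue.join()
    finally:
        for task in tasks:
            task.cancel()
        await asyncio.sleep(0)  # let the cancellations propagate

# e.g. await run_with_workers(package_suffixes, transfer_package, concurrency=10)
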
Example 9
async def cran_mirror(logger, request, s3_context):
    source_base = "https://cran.ma.imperial.ac.uk/"
    source_base_url = source_base + "web/packages/available_packages_by_name.html"
    source_base_parsed = urllib.parse.urlparse(source_base_url)
    cran_prefix = "cran/"

    done = set()
    queue = asyncio.Queue()
    await queue.put(source_base_url)

    # Main package file. Maybe it would be better to parse this than to crawl the HTML?
    package_index = "src/contrib/PACKAGES"
    code, _, body = await request(b"GET", source_base + package_index)
    package_index_body = await buffered(body)
    if code != b"200":
        raise Exception()

    logger.info("Finding existing files")
    existing_files = {
        key
        async for key in s3_list_keys_relative_to_prefix(
            logger, s3_context, cran_prefix)
    }

    async def crawl(url):
        key_suffix = urllib.parse.urlparse(url).path[1:]  # Without leading /

        if key_suffix in existing_files and (key_suffix.endswith(".tar.gz")
                                             or key_suffix.endswith(".tgz")
                                             or key_suffix.endswith(".zip")
                                             or key_suffix.endswith(".pdf")):
            # The package files have a version in the file name. Other files like html and pdf
            # don't, but they don't appear to be used when installing packages from R
            return

        code, headers, body = await request(b"GET", url)
        if code != b"200":
            await blackhole(body)
            raise Exception()
        headers_lower = dict((key.lower(), value) for key, value in headers)
        content_type = headers_lower.get(b"content-type", None)
        content_length = headers_lower[b"content-length"]
        target_key = cran_prefix + key_suffix

        if content_type == b"text/html":
            data = await buffered(body)
            soup = BeautifulSoup(data, "html.parser")
            links = soup.find_all("a")
            for link in links:
                absolute = urllib.parse.urljoin(url, link.get("href"))
                absolute_no_frag = absolute.split("#")[0]
                should_crawl = (urllib.parse.urlparse(absolute_no_frag).netloc
                                == source_base_parsed.netloc
                                and absolute_no_frag not in done)
                if should_crawl:
                    done.add(absolute_no_frag)
                    await queue.put(absolute_no_frag)
            return

        if key_suffix in existing_files:
            await blackhole(body)
            return

        headers = ((b"content-length", content_length), )
        code, _ = await s3_request_full(
            logger,
            s3_context,
            b"PUT",
            "/" + target_key,
            (),
            headers,
            lambda: body,
            "UNSIGNED-PAYLOAD",
        )
        if code != b"200":
            raise Exception()

    async def transfer_task():
        while True:
            url = await queue.get()
            try:
                for _ in range(0, 10):
                    try:
                        await crawl(url)
                    except (HttpConnectionError, HttpDataError):
                        await asyncio.sleep(10)
                    else:
                        break
            except Exception:  # pylint: disable=broad-except
                logger.exception("Exception crawling %s", url)
            finally:
                queue.task_done()

    tasks = [asyncio.ensure_future(transfer_task()) for _ in range(0, 10)]
    try:
        await queue.join()
    finally:
        for task in tasks:
            task.cancel()
        await asyncio.sleep(0)

    headers = ((b"content-length",
                str(len(package_index_body)).encode("ascii")), )
    code, _ = await s3_request_full(
        logger,
        s3_context,
        b"PUT",
        "/" + cran_prefix + package_index,
        (),
        headers,
        streamed(package_index_body),
        s3_hash(package_index_body),
    )

    if code != b"200":
        raise Exception()
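
The crawl and transfer coroutines in these mirrors all inline the same retry policy: up to 10 attempts, sleeping 10 seconds after an HttpConnectionError or HttpDataError. Factored out it might look like the sketch below (not part of the original code; it assumes the same exception classes are importable, and unlike the inlined loops it re-raises if the last attempt also fails).

import asyncio


async def with_retries(make_attempt, attempts=10, interval=10):
    # Retry only on the transient HTTP errors the mirrors treat as retryable
    for attempt in range(attempts):
        try:
            return await make_attempt()
        except (HttpConnectionError, HttpDataError):
            if attempt == attempts - 1:
                raise
            await asyncio.sleep(interval)

# e.g. inside transfer_task:
#     await with_retries(lambda: crawl(url))
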
Example 10
    async def transfer_project(project_name, project_url):
        code, _, body = await request(b"GET", project_url)
        data = await buffered(body)
        if code != b"200":
            raise Exception("Failed GET {}".format(code))

        soup = BeautifulSoup(data, "html.parser")
        links = soup.find_all("a")
        link_data = []

        logger.info("Finding existing files")
        existing_project_filenames = {
            key
            async for key in s3_list_keys_relative_to_prefix(
                logger, s3_context, f"{pypi_prefix}{project_name}/")
        }

        for link in links:
            absolute = link.get("href")
            absolute_no_frag, frag = absolute.split("#")
            filename = str(link.string)
            python_version = link.get("data-requires-python")
            has_python_version = python_version is not None
            python_version_attr = (' data-requires-python="' +
                                   html.escape(python_version) +
                                   '"' if has_python_version else "")

            s3_path = f"/{pypi_prefix}{project_name}/{filename}"
            link_data.append((s3_path, filename, frag, python_version_attr))

            exists = filename in existing_project_filenames
            if exists:
                logger.debug("Skipping transfer of %s", s3_path)
                continue

            for _ in range(0, 10):
                try:
                    code, headers, body = await request(
                        b"GET", absolute_no_frag)
                    if code != b"200":
                        await blackhole(body)
                        raise Exception("Failed GET {}".format(code))

                    content_length = dict(
                        (key.lower(), value)
                        for key, value in headers)[b"content-length"]
                    headers = ((b"content-length", content_length), )
                    code, _ = await s3_request_full(
                        logger,
                        s3_context,
                        b"PUT",
                        s3_path,
                        (),
                        headers,
                        lambda: body,
                        "UNSIGNED-PAYLOAD",
                    )
                except (HttpConnectionError, HttpDataError):
                    await asyncio.sleep(10)
                else:
                    break
            if code != b"200":
                raise Exception("Failed PUT {}".format(code))

        html_str = ("<!DOCTYPE html>" + "<html>" + "<body>" + "".join([
            f'<a href="https://{s3_context.bucket.host}/{s3_context.bucket.name}{s3_path}'
            f'#{frag}"{python_version_attr}>{filename}</a>'
            for s3_path, filename, frag, python_version_attr in link_data
        ]) + "</body>" + "</html>")
        html_bytes = html_str.encode("ascii")
        s3_path = f"/{pypi_prefix}{project_name}/"
        headers = (
            (b"content-type", b"text/html"),
            (b"content-length", str(len(html_bytes)).encode("ascii")),
        )
        for _ in range(0, 5):
            try:
                code, _ = await s3_request_full(
                    logger,
                    s3_context,
                    b"PUT",
                    s3_path,
                    (),
                    headers,
                    streamed(html_bytes),
                    s3_hash(html_bytes),
                )
            except (HttpConnectionError, HttpDataError):
                await asyncio.sleep(10)
            else:
                break
        if code != b"200":
            raise Exception("Failed PUT {}".format(code))
Example 11
async def pypi_mirror(logger, request, s3_context):
    def normalise(name):
        return re.sub(r"[-_.]+", "-", name).lower()

    async def list_packages():
        request_body = (
            b'<?xml version="1.0"?>'
            b"<methodCall><methodName>list_packages</methodName></methodCall>")
        _, _, body = await request(
            b"POST",
            source_base + "/pypi",
            body=streamed(request_body),
            headers=(
                (b"content-type", b"text/xml"),
                (b"content-length", str(len(request_body)).encode()),
            ),
        )
        return [
            package.text for package in ET.fromstring(await buffered(
                body)).findall("./params/param/value/array/data/value/string")
        ]

    async def changelog(sync_changes_after):
        request_body = (
            b'<?xml version="1.0"?>'
            b"<methodCall><methodName>changelog</methodName><params>"
            b"<param><value>"
            b"<int>" + str(sync_changes_after).encode() + b"</int>"
            b"</value></param>"
            b"</params></methodCall>")
        _, _, body = await request(
            b"POST",
            source_base + "/pypi",
            body=streamed(request_body),
            headers=(
                (b"content-type", b"text/xml"),
                (b"content-length", str(len(request_body)).encode()),
            ),
        )
        return [
            package.text
            for package in ET.fromstring(
                (await buffered(body)).replace(b"\x1b", b"")
            ).findall(
                "./params/param/value/array/data/value/array/data/value[1]/string"
            )
        ]

    source_base = "https://pypi.python.org"

    pypi_prefix = "pypi/"

    # We may have overlap, but that's fine
    sync_changes_after_key = "__sync_changes_after"
    # Paranoia: the reference implementation at https://bitbucket.org/loewis/pep381client has -1
    started = int(time()) - 1

    # Determine the time after which to fetch changes. There is an eventual consistency issue
    # storing this on S3, but at worst we'll unnecessarily re-fetch updates, rather than miss them.
    # Plus, given how long a sync takes and how often it runs, this is unlikely anyway
    code, data = await s3_request_full(
        logger,
        s3_context,
        b"GET",
        "/" + pypi_prefix + sync_changes_after_key,
        (),
        (),
        empty_async_iterator,
        s3_hash(b""),
    )
    if code not in [b"200", b"404"]:
        raise Exception("Failed GET of __sync_changes_after {} {}".format(
            code, data))
    sync_changes_after = int(data) if code == b"200" else 0

    # changelog doesn't seem to have changes older than two years, so for all projects on initial
    # import, we need to call list_packages
    project_names_with_duplicates = ((await list_packages())
                                     if sync_changes_after == 0 else
                                     (await changelog(sync_changes_after)))

    project_names = sorted(list(set(project_names_with_duplicates)))

    queue = asyncio.Queue()

    for project_name in project_names:
        normalised_project_name = normalise(project_name)
        await queue.put((
            normalised_project_name,
            source_base + f"/simple/{normalised_project_name}/",
        ))

    async def transfer_project(project_name, project_url):
        code, _, body = await request(b"GET", project_url)
        data = await buffered(body)
        if code != b"200":
            raise Exception("Failed GET {}".format(code))

        soup = BeautifulSoup(data, "html.parser")
        links = soup.find_all("a")
        link_data = []

        logger.info("Finding existing files")
        existing_project_filenames = {
            key
            async for key in s3_list_keys_relative_to_prefix(
                logger, s3_context, f"{pypi_prefix}{project_name}/")
        }

        for link in links:
            absolute = link.get("href")
            absolute_no_frag, frag = absolute.split("#")
            filename = str(link.string)
            python_version = link.get("data-requires-python")
            has_python_version = python_version is not None
            python_version_attr = (' data-requires-python="' +
                                   html.escape(python_version) +
                                   '"' if has_python_version else "")

            s3_path = f"/{pypi_prefix}{project_name}/{filename}"
            link_data.append((s3_path, filename, frag, python_version_attr))

            exists = filename in existing_project_filenames
            if exists:
                logger.debug("Skipping transfer of %s", s3_path)
                continue

            for _ in range(0, 10):
                try:
                    code, headers, body = await request(
                        b"GET", absolute_no_frag)
                    if code != b"200":
                        await blackhole(body)
                        raise Exception("Failed GET {}".format(code))

                    content_length = dict(
                        (key.lower(), value)
                        for key, value in headers)[b"content-length"]
                    headers = ((b"content-length", content_length), )
                    code, _ = await s3_request_full(
                        logger,
                        s3_context,
                        b"PUT",
                        s3_path,
                        (),
                        headers,
                        lambda: body,
                        "UNSIGNED-PAYLOAD",
                    )
                except (HttpConnectionError, HttpDataError):
                    await asyncio.sleep(10)
                else:
                    break
            if code != b"200":
                raise Exception("Failed PUT {}".format(code))

        html_str = ("<!DOCTYPE html>" + "<html>" + "<body>" + "".join([
            f'<a href="https://{s3_context.bucket.host}/{s3_context.bucket.name}{s3_path}'
            f'#{frag}"{python_version_attr}>{filename}</a>'
            for s3_path, filename, frag, python_version_attr in link_data
        ]) + "</body>" + "</html>")
        html_bytes = html_str.encode("ascii")
        s3_path = f"/{pypi_prefix}{project_name}/"
        headers = (
            (b"content-type", b"text/html"),
            (b"content-length", str(len(html_bytes)).encode("ascii")),
        )
        for _ in range(0, 5):
            try:
                code, _ = await s3_request_full(
                    logger,
                    s3_context,
                    b"PUT",
                    s3_path,
                    (),
                    headers,
                    streamed(html_bytes),
                    s3_hash(html_bytes),
                )
            except (HttpConnectionError, HttpDataError):
                await asyncio.sleep(10)
            else:
                break
        if code != b"200":
            raise Exception("Failed PUT {}".format(code))

    async def transfer_task():
        while True:
            project_name, project_url = await queue.get()
            logger.info("Transferring project %s %s", project_name,
                        project_url)

            try:
                await transfer_project(project_name, project_url)
            except Exception:  # pylint: disable=broad-except
                logger.exception("Exception crawling %s", project_url)
            finally:
                queue.task_done()

    tasks = [asyncio.ensure_future(transfer_task()) for _ in range(0, 10)]
    try:
        await queue.join()
    finally:
        for task in tasks:
            task.cancel()
        await asyncio.sleep(0)

    started_bytes = str(started).encode("ascii")

    headers = ((b"content-length", str(len(started_bytes)).encode("ascii")), )
    for _ in range(0, 10):
        try:
            code, _ = await s3_request_full(
                logger,
                s3_context,
                b"PUT",
                "/" + pypi_prefix + sync_changes_after_key,
                (),
                headers,
                streamed(started_bytes),
                s3_hash(started_bytes),
            )
        except (HttpConnectionError, HttpDataError):
            pass
        else:
            break
    if code != b"200":
        raise Exception()
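
The normalise helper above applies the PEP 503 name normalisation used by the /simple/ index, for example:

import re


def normalise(name):
    return re.sub(r"[-_.]+", "-", name).lower()


assert normalise("Django") == "django"
assert normalise("zope.interface") == "zope-interface"
assert normalise("Flask_SQLAlchemy") == "flask-sqlalchemy"
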
Example 12
async def nltk_mirror(logger, request, s3_context, s3_prefix):
    base = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/'
    input_index_url = f'{base}/index.xml'

    # Fetch index file
    code, headers, body = await request(b'GET', input_index_url)
    if code != b'200':
        raise Exception('Unable to fetch index')
    headers_xml_lower = dict((key.lower(), value) for key, value in headers)
    index_xml_content_type = headers_xml_lower[b'content-type']
    index_xml = ET.fromstring(await buffered(body))

    # Transfer package contents to S3 if they haven't already been transferred
    for package in index_xml.findall('./packages/package'):
        package_original_url = package.attrib['url']
        package_checksum = package.attrib['checksum']
        package_id = package.attrib['id']

        # The NLTK downloader is sensitive to the extension of the file
        _, ext = os.path.splitext(package_original_url)
        new_path = f'/{s3_prefix}{package_id}-{package_checksum}{ext}'

        package.attrib[
            'url'
        ] = f'https://{s3_context.bucket.host}/{s3_context.bucket.name}{new_path}'

        # If the file already exists, don't re-upload. Named including the
        # checksum deliberately so we can do this
        code, _ = await s3_request_full(
            logger,
            s3_context,
            b'HEAD',
            new_path,
            (),
            headers,
            empty_async_iterator,
            'UNSIGNED-PAYLOAD',
        )
        logger.info('%s %s', f'{s3_context.bucket.name}{new_path}', code)
        if code == b'200':
            continue

        # Stream the file to S3
        code, headers, body = await request(b'GET', package_original_url)
        if code != b'200':
            await blackhole(body)
            raise Exception(f'{code} {package_original_url}')

        headers_lower = dict((key.lower(), value) for key, value in headers)
        headers = (
            (b'content-length', headers_lower[b'content-length']),
            (b'content-type', headers_lower[b'content-type']),
        )
        code, _ = await s3_request_full(
            logger,
            s3_context,
            b'PUT',
            new_path,
            (),
            headers,
            lambda: body,
            'UNSIGNED-PAYLOAD',
        )
        if code != b'200':
            raise Exception()

    # The ascii encoding is important: the nltk downloader seems to assume
    # attributes are ascii
    output_xml_str = ET.tostring(index_xml, encoding='ascii', method='xml')
    index_xml_content_length = str(len(output_xml_str)).encode('ascii')
    headers = (
        (b'content-length', index_xml_content_length),
        (b'content-type', index_xml_content_type),
    )
    code, _ = await s3_request_full(
        logger,
        s3_context,
        b'PUT',
        f'/{s3_prefix}index.xml',
        (),
        headers,
        streamed(output_xml_str),
        'UNSIGNED-PAYLOAD',
    )
    if code != b'200':
        raise Exception()
Example 13
async def conda_mirror(logger, request, s3_context, source_base_url, s3_prefix):
    arch_dirs = ['noarch/', 'linux-64/']
    repodatas = []
    queue = asyncio.Queue()

    logger.info('Finding existing files')
    existing_files = {
        key
        async for key in s3_list_keys_relative_to_prefix(logger, s3_context, s3_prefix)
    }

    for arch_dir in arch_dirs:
        code, _, body = await request(
            b'GET', source_base_url + arch_dir + 'repodata.json'
        )
        if code != b'200':
            raise Exception()

        source_repodata_raw = await buffered(body)
        source_repodata = json.loads(source_repodata_raw)

        for package_suffix, _ in source_repodata['packages'].items():
            await queue.put(arch_dir + package_suffix)

        repodatas.append((arch_dir + 'repodata.json', source_repodata_raw))

        code, _, body = await request(
            b'GET', source_base_url + arch_dir + 'repodata.json.bz2'
        )
        if code != b'200':
            raise Exception()
        repodatas.append((arch_dir + 'repodata.json.bz2', await buffered(body)))

    async def transfer_package(package_suffix):
        source_package_url = source_base_url + package_suffix
        target_package_key = s3_prefix + package_suffix

        exists = package_suffix in existing_files
        if exists:
            logger.debug('Skipping transfer of {}'.format('/' + target_package_key))
            return

        code, headers, body = await request(b'GET', source_package_url)
        if code != b'200':
            response = await buffered(body)
            raise Exception(
                'Exception GET {} {} {}'.format(source_package_url, code, response)
            )
        headers_lower = dict((key.lower(), value) for key, value in headers)
        headers = ((b'content-length', headers_lower[b'content-length']),)
        code, body = await s3_request_full(
            logger,
            s3_context,
            b'PUT',
            '/' + target_package_key,
            (),
            headers,
            lambda: body,
            'UNSIGNED-PAYLOAD',
        )
        if code != b'200':
            raise Exception(
                'Exception PUT {} {} {}'.format('/' + target_package_key, code, body)
            )

    async def transfer_task():
        while True:
            package_suffix = await queue.get()

            try:
                for _ in range(0, 10):
                    try:
                        await transfer_package(package_suffix)
                    except (HttpConnectionError, HttpDataError):
                        await asyncio.sleep(10)
                    else:
                        break
            except Exception:
                logger.exception('Exception transferring %s', package_suffix)
            finally:
                queue.task_done()

    tasks = [asyncio.ensure_future(transfer_task()) for _ in range(0, 10)]
    try:
        await queue.join()
    finally:
        for task in tasks:
            task.cancel()
        await asyncio.sleep(0)

    for path, data in repodatas:
        target_repodata_key = s3_prefix + path
        headers = ((b'content-length', str(len(data)).encode('ascii')),)
        code, _ = await s3_request_full(
            logger,
            s3_context,
            b'PUT',
            '/' + target_repodata_key,
            (),
            headers,
            streamed(data),
            s3_hash(data),
        )
        if code != b'200':
            raise Exception()
Example 14
    async def transfer_project(project_name, project_url):
        code, _, body = await request(b'GET', project_url)
        data = await buffered(body)
        if code != b'200':
            raise Exception('Failed GET {}'.format(code))

        soup = BeautifulSoup(data, 'html.parser')
        links = soup.find_all('a')
        link_data = []

        logger.info('Finding existing files')
        existing_project_filenames = {
            key
            async for key in s3_list_keys_relative_to_prefix(
                logger, s3_context, f'{pypi_prefix}{project_name}/'
            )
        }

        for link in links:
            absolute = link.get('href')
            absolute_no_frag, frag = absolute.split('#')
            filename = str(link.string)
            python_version = link.get('data-requires-python')
            has_python_version = python_version is not None
            python_version_attr = (
                ' data-requires-python="' + html.escape(python_version) + '"'
                if has_python_version
                else ''
            )

            s3_path = f'/{pypi_prefix}{project_name}/{filename}'
            link_data.append((s3_path, filename, frag, python_version_attr))

            exists = filename in existing_project_filenames
            if exists:
                logger.debug('Skipping transfer of %s', s3_path)
                continue

            for _ in range(0, 10):
                try:
                    code, headers, body = await request(b'GET', absolute_no_frag)
                    if code != b'200':
                        await blackhole(body)
                        raise Exception('Failed GET {}'.format(code))

                    content_length = dict(
                        (key.lower(), value) for key, value in headers
                    )[b'content-length']
                    headers = ((b'content-length', content_length),)
                    code, _ = await s3_request_full(
                        logger,
                        s3_context,
                        b'PUT',
                        s3_path,
                        (),
                        headers,
                        lambda: body,
                        'UNSIGNED-PAYLOAD',
                    )
                except (HttpConnectionError, HttpDataError):
                    await asyncio.sleep(10)
                else:
                    break
            if code != b'200':
                raise Exception('Failed PUT {}'.format(code))

        html_str = (
            '<!DOCTYPE html>'
            + '<html>'
            + '<body>'
            + ''.join(
                [
                    f'<a href="https://{s3_context.bucket.host}/{s3_context.bucket.name}{s3_path}'
                    f'#{frag}"{python_version_attr}>{filename}</a>'
                    for s3_path, filename, frag, python_version_attr in link_data
                ]
            )
            + '</body>'
            + '</html>'
        )
        html_bytes = html_str.encode('ascii')
        s3_path = f'/{pypi_prefix}{project_name}/'
        headers = (
            (b'content-type', b'text/html'),
            (b'content-length', str(len(html_bytes)).encode('ascii')),
        )
        for _ in range(0, 5):
            try:
                code, _ = await s3_request_full(
                    logger,
                    s3_context,
                    b'PUT',
                    s3_path,
                    (),
                    headers,
                    streamed(html_bytes),
                    s3_hash(html_bytes),
                )
            except (HttpConnectionError, HttpDataError):
                await asyncio.sleep(10)
            else:
                break
        if code != b'200':
            raise Exception('Failed PUT {}'.format(code))
Example 15
async def pypi_mirror(logger, request, s3_context):
    def normalise(name):
        return re.sub(r'[-_.]+', '-', name).lower()

    async def list_packages():
        request_body = (
            b'<?xml version="1.0"?>'
            b'<methodCall><methodName>list_packages</methodName></methodCall>'
        )
        _, _, body = await request(
            b'POST',
            source_base + '/pypi',
            body=streamed(request_body),
            headers=(
                (b'content-type', b'text/xml'),
                (b'content-length', str(len(request_body)).encode()),
            ),
        )
        return [
            package.text
            for package in ET.fromstring(await buffered(body)).findall(
                './params/param/value/array/data/value/string'
            )
        ]

    async def changelog(sync_changes_after):
        request_body = (
            b'<?xml version="1.0"?>'
            b'<methodCall><methodName>changelog</methodName><params>'
            b'<param><value>'
            b'<int>' + str(sync_changes_after).encode() + b'</int>'
            b'</value></param>'
            b'</params></methodCall>'
        )
        _, _, body = await request(
            b'POST',
            source_base + '/pypi',
            body=streamed(request_body),
            headers=(
                (b'content-type', b'text/xml'),
                (b'content-length', str(len(request_body)).encode()),
            ),
        )
        return [
            package.text
            for package in ET.fromstring(await buffered(body)).findall(
                './params/param/value/array/data/value/array/data/value[1]/string'
            )
        ]

    source_base = 'https://pypi.python.org'

    pypi_prefix = 'pypi/'

    # We may have overlap, but that's fine
    sync_changes_after_key = '__sync_changes_after'
    # Paranoia: the reference implementation at https://bitbucket.org/loewis/pep381client has -1
    started = int(time()) - 1

    # Determine the time after which to fetch changes. There is an eventual consistency issue
    # storing this on S3, but at worst we'll unnecessarily re-fetch updates, rather than miss them.
    # Plus, given how long a sync takes and how often it runs, this is unlikely anyway
    code, data = await s3_request_full(
        logger,
        s3_context,
        b'GET',
        '/' + pypi_prefix + sync_changes_after_key,
        (),
        (),
        empty_async_iterator,
        s3_hash(b''),
    )
    if code not in [b'200', b'404']:
        raise Exception('Failed GET of __sync_changes_after {} {}'.format(code, data))
    sync_changes_after = int(data) if code == b'200' else 0

    # changelog doesn't seem to have changes older than two years, so for all projects on initial
    # import, we need to call list_packages
    project_names_with_duplicates = (
        (await list_packages())
        if sync_changes_after == 0
        else (await changelog(sync_changes_after))
    )

    project_names = sorted(list(set(project_names_with_duplicates)))

    queue = asyncio.Queue()

    for project_name in project_names:
        normalised_project_name = normalise(project_name)
        await queue.put(
            (
                normalised_project_name,
                source_base + f'/simple/{normalised_project_name}/',
            )
        )

    async def transfer_project(project_name, project_url):
        code, _, body = await request(b'GET', project_url)
        data = await buffered(body)
        if code != b'200':
            raise Exception('Failed GET {}'.format(code))

        soup = BeautifulSoup(data, 'html.parser')
        links = soup.find_all('a')
        link_data = []

        logger.info('Finding existing files')
        existing_project_filenames = {
            key
            async for key in s3_list_keys_relative_to_prefix(
                logger, s3_context, f'{pypi_prefix}{project_name}/'
            )
        }

        for link in links:
            absolute = link.get('href')
            absolute_no_frag, frag = absolute.split('#')
            filename = str(link.string)
            python_version = link.get('data-requires-python')
            has_python_version = python_version is not None
            python_version_attr = (
                ' data-requires-python="' + html.escape(python_version) + '"'
                if has_python_version
                else ''
            )

            s3_path = f'/{pypi_prefix}{project_name}/{filename}'
            link_data.append((s3_path, filename, frag, python_version_attr))

            exists = filename in existing_project_filenames
            if exists:
                logger.debug('Skipping transfer of %s', s3_path)
                continue

            for _ in range(0, 10):
                try:
                    code, headers, body = await request(b'GET', absolute_no_frag)
                    if code != b'200':
                        await blackhole(body)
                        raise Exception('Failed GET {}'.format(code))

                    content_length = dict(
                        (key.lower(), value) for key, value in headers
                    )[b'content-length']
                    headers = ((b'content-length', content_length),)
                    code, _ = await s3_request_full(
                        logger,
                        s3_context,
                        b'PUT',
                        s3_path,
                        (),
                        headers,
                        lambda: body,
                        'UNSIGNED-PAYLOAD',
                    )
                except (HttpConnectionError, HttpDataError):
                    await asyncio.sleep(10)
                else:
                    break
            if code != b'200':
                raise Exception('Failed PUT {}'.format(code))

        html_str = (
            '<!DOCTYPE html>'
            + '<html>'
            + '<body>'
            + ''.join(
                [
                    f'<a href="https://{s3_context.bucket.host}/{s3_context.bucket.name}{s3_path}'
                    f'#{frag}"{python_version_attr}>{filename}</a>'
                    for s3_path, filename, frag, python_version_attr in link_data
                ]
            )
            + '</body>'
            + '</html>'
        )
        html_bytes = html_str.encode('ascii')
        s3_path = f'/{pypi_prefix}{project_name}/'
        headers = (
            (b'content-type', b'text/html'),
            (b'content-length', str(len(html_bytes)).encode('ascii')),
        )
        for _ in range(0, 5):
            try:
                code, _ = await s3_request_full(
                    logger,
                    s3_context,
                    b'PUT',
                    s3_path,
                    (),
                    headers,
                    streamed(html_bytes),
                    s3_hash(html_bytes),
                )
            except (HttpConnectionError, HttpDataError):
                await asyncio.sleep(10)
            else:
                break
        if code != b'200':
            raise Exception('Failed PUT {}'.format(code))

    async def transfer_task():
        while True:
            project_name, project_url = await queue.get()
            logger.info('Transferring project %s %s', project_name, project_url)

            try:
                await transfer_project(project_name, project_url)
            except Exception:
                logger.exception('Exception crawling %s', project_url)
            finally:
                queue.task_done()

    tasks = [asyncio.ensure_future(transfer_task()) for _ in range(0, 10)]
    try:
        await queue.join()
    finally:
        for task in tasks:
            task.cancel()
        await asyncio.sleep(0)

    started_bytes = str(started).encode('ascii')

    headers = ((b'content-length', str(len(started_bytes)).encode('ascii')),)
    for _ in range(0, 10):
        try:
            code, _ = await s3_request_full(
                logger,
                s3_context,
                b'PUT',
                '/' + pypi_prefix + sync_changes_after_key,
                (),
                headers,
                streamed(started_bytes),
                s3_hash(started_bytes),
            )
        except (HttpConnectionError, HttpDataError):
            pass
        else:
            break
    if code != b'200':
        raise Exception()