Example 1
    def test_capture_https_proxy_same_session(self):
        sesh = requests.session()
        with capture_http() as warc_writer:
            res = sesh.get("https://example.com/test", proxies=self.proxies, verify=False)
            res = sesh.get("https://example.com/foo", proxies=self.proxies, verify=False)

        # this *will* still be captured, because the session reuses the
        # pooled recording connection created inside the with-block... (fix this?)
        res = sesh.get("https://example.com/skip", proxies=self.proxies, verify=False)

        with capture_http(warc_writer):
            res = sesh.get("https://example.com/bar", proxies=self.proxies, verify=False)

        ai = ArchiveIterator(warc_writer.get_stream())
        response = next(ai)
        assert response.rec_type == 'response'
        assert response.rec_headers['WARC-Target-URI'] == "https://example.com/test"
        assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
        assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/test'

        request = next(ai)
        assert request.rec_type == 'request'
        assert request.rec_headers['WARC-Target-URI'] == "https://example.com/test"
        assert request.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)

        response = next(ai)
        assert response.rec_type == 'response'
        assert response.rec_headers['WARC-Target-URI'] == "https://example.com/foo"
        assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
        assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/foo'

        request = next(ai)
        assert request.rec_type == 'request'
        assert request.rec_headers['WARC-Target-URI'] == "https://example.com/foo"
        assert request.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)

        response = next(ai)
        assert response.rec_type == 'response'
        assert response.rec_headers['WARC-Target-URI'] == "https://example.com/skip"
        assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
        assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/skip'

        request = next(ai)
        assert request.rec_type == 'request'

        response = next(ai)
        assert response.rec_type == 'response'
        assert response.rec_headers['WARC-Target-URI'] == "https://example.com/bar"
        assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
        assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/bar'

        request = next(ai)
        assert request.rec_type == 'request'

        with raises(StopIteration):
            assert next(ai)
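These test methods come from a larger test class that is not shown here. Below is a minimal sketch of the imports they rely on, plus hypothetical stand-ins for the self.port / self.proxies fixtures (the port value and proxy endpoints are assumptions, not part of the original):

import json

import requests
from pytest import raises

from warcio.capture_http import capture_http
from warcio.warcwriter import BufferWARCWriter
from warcio.archiveiterator import ArchiveIterator

# Hypothetical fixture stand-ins: the proxy tests assume a local
# recording proxy is listening; 8080 is an assumed port.
port = 8080
proxies = {
    'http': 'http://localhost:{0}'.format(port),
    'https': 'https://localhost:{0}'.format(port),
}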
Example 2

import os

import requests
from warcio.capture_http import capture_http


def main():
    """
    Scrape locally rendered version of Liesbet's Atelier to compressed WARC,
    including all outcomes of "barbie" scripts
    """

    siteName = "ziklies.home.xs4all.nl"

    # Compressed WARC file for output
    warcOut = siteName + ".warc.gz"

    siteDir = "/var/www/" + siteName

    # List of URLs to scrape
    urls = []

    # First add domain root
    urls.append("http://" + siteName)

    # Add remaining files (and rewrite file paths as URLs)
    for root, dirs, files in os.walk(siteDir):
        for filename in files:
            # Full path
            file_path = os.path.join(root, filename)

            # Construct url and add to list
            url = file_path.replace("/var/www/", "http://")
            urls.append(url)

    # Start capturing stuff
    with capture_http(warcOut):
        # Iterate over URL list
        for url in urls:
            requests.get(url)

        # Iterate over all input combinations of "barbie" scripts
        # Note that we also account for cases where 1 or more
        # fields are not set!
        # ("barbie1.cgi" is the English-language version)
        for indexOnder in ["na", *range(1, 8)]:
            for indexMidden in ["na", *range(1, 8)]:
                for indexTop in ["na", *range(1, 8)]:
                    vOnder = str(indexOnder) + 'a'
                    vMidden = str(indexMidden) + 'b'
                    vTop = str(indexTop) + 'c'

                    scriptParams = {}
                    if indexOnder != "na":
                        scriptParams["onder"] = vOnder
                    if indexMidden != "na":
                        scriptParams["midden"] = vMidden
                    if indexTop != "na":
                        scriptParams["top"] = vTop

                    requests.post(
                        "http://ziklies.home.xs4all.nl/cgi-bin/barbie.cgi",
                        data=scriptParams)
                    requests.post(
                        "http://ziklies.home.xs4all.nl/cgi-bin/barbie1.cgi",
                        data=scriptParams)
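The script above only defines main(); a conventional entry point (not part of the original snippet) would be:

if __name__ == "__main__":
    main()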
Example 3
    def test_post_json(self):
        warc_writer = BufferWARCWriter(gzip=False)

        with capture_http(warc_writer):
            res = requests.post('http://localhost:{0}/post'.format(self.port),
                                headers={'Host': 'httpbin.org'},
                                json={'some': {
                                    'data': 'posted'
                                }})

        assert res.json()['json'] == {'some': {'data': 'posted'}}

        # response
        ai = ArchiveIterator(warc_writer.get_stream())
        response = next(ai)
        assert response.rec_type == 'response'

        assert res.json() == json.loads(
            response.content_stream().read().decode('utf-8'))

        # request
        request = next(ai)
        assert request.rec_type == 'request'
        assert request.http_headers['Content-Type'] == 'application/json'

        data = request.content_stream().read().decode('utf-8')
        assert data == '{"some": {"data": "posted"}}'
Example 4
    def test_remote(self):
        with capture_http(warc_version='1.1', gzip=True) as writer:
            requests.get('http://example.com/')
            requests.get('https://google.com/')

        expected = [('http://example.com/', 'response', True),
                    ('http://example.com/', 'request', True),
                    ('https://google.com/', 'response', True),
                    ('https://google.com/', 'request', True),
                    ('https://www.google.com/', 'response', True),
                    ('https://www.google.com/', 'request', True)]

        actual = [(record.rec_headers['WARC-Target-URI'], record.rec_type,
                   'WARC-IP-Address' in record.rec_headers)
                  for record in ArchiveIterator(writer.get_stream())]

        assert actual == expected
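Since no writer was passed in, capture_http() here yields an in-memory BufferWARCWriter. To keep the capture on disk afterwards, one option (a sketch; the output filename is an assumption) is to dump the buffer:

# write the in-memory capture out as a gzipped WARC file
with open('remote-capture.warc.gz', 'wb') as fh:
    fh.write(writer.get_contents())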
Example 5
    def test_capture_http_proxy(self):
        with capture_http() as warc_writer:
            res = requests.get("http://example.com/test", proxies=self.proxies, verify=False)

        ai = ArchiveIterator(warc_writer.get_stream())
        response = next(ai)
        assert response.rec_type == 'response'
        assert response.rec_headers['WARC-Target-URI'] == "http://example.com/test"
        assert response.content_stream().read().decode('utf-8') == 'Proxied: /http://example.com/test'
        assert response.rec_headers['WARC-Proxy-Host'] == 'http://localhost:{0}'.format(self.port)

        request = next(ai)
        assert request.rec_type == 'request'
        assert request.rec_headers['WARC-Target-URI'] == "http://example.com/test"
        assert request.rec_headers['WARC-Proxy-Host'] == 'http://localhost:{0}'.format(self.port)

        with raises(StopIteration):
            assert next(ai)
Example 6
    def test_post_chunked(self):
        warc_writer = BufferWARCWriter(gzip=False)

        def nop_filter(request, response, recorder):
            assert request
            assert response
            return request, response

        def gen():
            return iter([b'some', b'data', b'to', b'post'])

        #url = 'http://localhost:{0}/post'.format(self.port)
        url = 'https://httpbin.org/post'

        with capture_http(warc_writer, nop_filter, record_ip=False):
            res = requests.post(url,
                                data=gen(),
                                headers={'Content-Type': 'application/json'})

        # response
        ai = ArchiveIterator(warc_writer.get_stream())
        response = next(ai)
        assert response.rec_type == 'response'
        assert response.rec_headers['WARC-Target-URI'] == url
        assert 'WARC-IP-Address' not in response.rec_headers

        assert res.json() == json.loads(
            response.content_stream().read().decode('utf-8'))

        # request
        request = next(ai)
        assert request.rec_type == 'request'
        assert request.rec_headers['WARC-Target-URI'] == url
        assert 'WARC-IP-Address' not in request.rec_headers

        data = request.content_stream().read().decode('utf-8')
        assert data == 'somedatatopost'
Example 7

import logging
import time

import arrow
import requests

from warcio.capture_http import capture_http

# the original module presumably defines a module-level logger like this;
# ApiEntry (a small sku / valkyrie_url / chihiro_url record) is also
# defined elsewhere and not shown here
logger = logging.getLogger(__name__)


def do_warcio_scrape(parsed_args):

    TIME_SECONDS_TO_SLEEP_BETWEEN_FAILURES = 5

    KEY_TYPE = "type"
    KEY_INCLUDED = "included"
    KEY_THUMBNAIL_BASE = "thumbnail-url-base"
    KEY_PARENT = "parent"
    KEY_MEDIA_LIST = "media-list"
    KEY_PROMO = "promo"
    KEY_IMAGES = "images"
    KEY_VIDEOS = "videos"
    KEY_SCREENSHOTS = "screenshots"
    KEY_PREVIEW = "preview"
    KEY_URL = "url"
    KEY_THUMBNAIL = "thumbnail"
    KEY_ID = "id"
    KEY_NAME = "name"
    KEY_ATTRIBUTES = "attributes"
    KEY_DEFAULT_SKU = "default-sku-id"

    MAX_RETRIES = 5

    USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"

    HEADERS = {
        "Accept": "application/json",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-US,en;q=0.5",
        "Cache-Control": "no-cache",
        "Dnt": "1",
        "Host": "store.playstation.com",
        "User-Agent": USER_AGENT,
    }

    VALKYRIE_API_URL_FORMAT = "https://store.playstation.com/valkyrie-api/{}/{}/999/resolve/{}"
    CHIHIRO_API_URL_FORMAT = "https://store.playstation.com/store/api/chihiro/00_09_000/container/{}/{}/999/{}"

    api_entry_list = []
    discovered_media_url_list = []

    session = requests.Session()
    session.headers.update(HEADERS)

    start_time = arrow.utcnow()

    with open(parsed_args.sku_list, "r", encoding="utf-8") as file_list_fh:
        for sku in file_list_fh:
            sku_stripped = sku.strip()
            valkyrie_url = VALKYRIE_API_URL_FORMAT.format(
                parsed_args.region_lang, parsed_args.region_country,
                sku_stripped)
            chihiro_url = CHIHIRO_API_URL_FORMAT.format(
                parsed_args.region_country, parsed_args.region_lang,
                sku_stripped)
            api_entry_list.append(
                ApiEntry(sku=sku_stripped,
                         valkyrie_url=valkyrie_url,
                         chihiro_url=chihiro_url))

    api_entry_list_size = len(api_entry_list)
    logger.info("have `%s` urls to download", api_entry_list_size)
    logger.info("writing media urls to `%s`",
                parsed_args.media_files_output_file)

    # the capture_http method probably doesn't take a Path object, so
    # convert it to a str first
    with capture_http(str(parsed_args.warc_output_file), warc_version='1.1'):

        for idx, iter_api_entry in enumerate(api_entry_list):

            # prevent DDoS, hopefully...
            # time_to_sleep = random.random()
            # logger.info("[sleeping for `%s`]", time_to_sleep)
            # time.sleep(time_to_sleep)

            iter_discovered_media_list = []

            logger.info("`%s / %s`: url: `%s`", idx + 1, api_entry_list_size,
                        iter_api_entry)

            logger.debug("-- making valkyrie api request")
            # get normal valkyrie response
            success = False
            response = None
            response_json = None
            for i in range(MAX_RETRIES):

                try:

                    response = session.get(iter_api_entry.valkyrie_url)
                    logger.info("-- url `%s` - HTTP `%s`",
                                iter_api_entry.valkyrie_url,
                                response.status_code)

                    response.raise_for_status()
                    success = True
                    response_json = response.json()
                    break

                except Exception as e:
                    logger.error(
                        "-- error number `%s` when getting url `%s`: `%s`", i,
                        iter_api_entry.valkyrie_url, e)
                    time.sleep(TIME_SECONDS_TO_SLEEP_BETWEEN_FAILURES)
                    continue

            if not success:
                logger.error(
                    "-- hit `%s` retries when attempting to get URL `%s`, skipping",
                    MAX_RETRIES, iter_api_entry.valkyrie_url)

            else:

                # go through the json and get media urls

                included_json_list = response_json[KEY_INCLUDED]

                for iter_included_json_dict in included_json_list:

                    iter_attribute_json_dict = iter_included_json_dict[
                        KEY_ATTRIBUTES]

                    # get the thumbnail, but check that it's actually there;
                    # not all of the attribute dicts have it for some reason,
                    # e.g. with a bundle, one entry will have it and the other won't
                    if KEY_THUMBNAIL_BASE in iter_attribute_json_dict.keys():
                        thumbnail = iter_attribute_json_dict[
                            KEY_THUMBNAIL_BASE]
                        logger.debug("-- found media url (thumbnail): `%s`",
                                     thumbnail)
                        iter_discovered_media_list.append(thumbnail)

                    # see if there are other media urls

                    ## look in parent dict

                    if iter_attribute_json_dict.get(KEY_PARENT) is not None:
                        parent_json_dict = iter_attribute_json_dict[KEY_PARENT]

                        # log the parent id just in case
                        parent_id = parent_json_dict[KEY_ID]
                        logger.debug("-- parent id: `%s`", parent_id)

                        thumbnail = parent_json_dict[KEY_THUMBNAIL]

                        logger.debug(
                            "-- found media url (parent - thumbnail): `%s`",
                            thumbnail)
                        iter_discovered_media_list.append(thumbnail)

                    # media list stuff

                    if KEY_MEDIA_LIST in iter_attribute_json_dict.keys():

                        media_list_dict = iter_attribute_json_dict[
                            KEY_MEDIA_LIST]

                        for iter_preview_dict in media_list_dict[KEY_PREVIEW]:

                            preview_url = iter_preview_dict[KEY_URL]
                            logger.debug(
                                "-- found media url (media list - previews): `%s`",
                                preview_url)
                            iter_discovered_media_list.append(preview_url)

                        for iter_promos_images_dict in media_list_dict[
                                KEY_PROMO][KEY_IMAGES]:

                            image_url = iter_promos_images_dict[KEY_URL]
                            logger.debug(
                                "-- found media url (media list - promos - images): `%s`",
                                image_url)
                            iter_discovered_media_list.append(image_url)

                        for iter_promos_videos_dict in media_list_dict[
                                KEY_PROMO][KEY_VIDEOS]:

                            vid_url = iter_promos_videos_dict[KEY_URL]
                            logger.debug(
                                "-- found media url (media list - promos - videos): `%s`",
                                vid_url)
                            iter_discovered_media_list.append(vid_url)

                        for iter_screenshot_dict in media_list_dict[
                                KEY_SCREENSHOTS]:

                            ss_url = iter_screenshot_dict[KEY_URL]
                            logger.debug(
                                "-- found media url (media list - screenshots): `%s`",
                                ss_url)
                            iter_discovered_media_list.append(ss_url)

            logger.debug("-- making chihiro api request")
            success_two = False
            response_two = None
            response_json_two = None
            for i in range(MAX_RETRIES):

                try:

                    response_two = session.get(iter_api_entry.chihiro_url)
                    logger.info("-- url `%s` - HTTP `%s`",
                                iter_api_entry.chihiro_url,
                                response_two.status_code)

                    response_two.raise_for_status()
                    success_two = True
                    response_json_two = response_two.json()

                    break

                except Exception as e:
                    logger.error(
                        "-- error number `%s` when getting url `%s`: `%s`", i,
                        iter_api_entry.chihiro_url, e)
                    time.sleep(TIME_SECONDS_TO_SLEEP_BETWEEN_FAILURES)
                    continue

            if not success_two:
                logger.error(
                    "-- hit `%s` retries when attempting to get URL `%s`, skipping",
                    MAX_RETRIES, iter_api_entry.chihiro_url)
                continue

            else:

                # get media urls

                # only getting these for now, as they are higher resolution
                # than the ones the valkyrie api returns

                image_dict_list = response_json_two[KEY_IMAGES]

                for iter_image_dict in image_dict_list:

                    image_type = iter_image_dict[KEY_TYPE]
                    image_url = iter_image_dict[KEY_URL]
                    logger.debug(
                        "-- found media url (chihiro images): `%s` of type `%s`",
                        image_url, image_type)
                    iter_discovered_media_list.append(image_url)

            num_media_this_run = len(iter_discovered_media_list)
            logger.debug("-- discovered `%s` media urls for this URL",
                         num_media_this_run)
            discovered_media_url_list.extend(iter_discovered_media_list)
            logger.debug("-- discovered media list now has a size of `%s`",
                         len(discovered_media_url_list))

            # write out the new media discovered this iteration in case we crash
            with open(parsed_args.media_files_output_file,
                      "a",
                      encoding="utf-8",
                      newline="\n") as f:

                for iter_media_url in iter_discovered_media_list:
                    f.write("{}\n".format(iter_media_url))

    end_time = arrow.utcnow()

    elapsed_time = end_time - start_time

    logger.info("start time: `%s`, end time: `%s`, elapsed time: `%s`",
                start_time, end_time, elapsed_time)
    logger.info("discovered `%s` media urls", len(discovered_media_url_list))
Example 8
import sys
from urllib.parse import urlparse

import requests
from requests_html import HTML

from warcio.capture_http import capture_http

# find_js_urls, find_img_urls, find_css_urls and check_url_similarity are
# helpers defined elsewhere in the original script and not shown here


def get_all_static_urls(html):
    static_urls = set.union(set(find_js_urls(html)), set(find_img_urls(html)),
                            set(find_css_urls(html)))
    return static_urls


def filter_records(request, response, recorder):
    if response.http_headers.get_statuscode() != '200':
        print("skipping {}".format(request.http_headers.get_statuscode()))
        return None, None
    return request, response


if __name__ == "__main__":
    warc_file = '{}.warc.gz'.format(sys.argv[2])
    with capture_http(warc_file, filter_records):
        response = requests.get(sys.argv[1])
        html = HTML(html=response.text, url=response.url)
        static_resource_urls = get_all_static_urls(html)
        for static_resource in static_resource_urls:
            if not urlparse(static_resource).hostname:
                continue
            if check_url_similarity(static_resource, html.url):
                print('skipping: {}'.format(static_resource))
                continue
            if not urlparse(static_resource).scheme:
                static_resource = 'https:' + static_resource
            print('downloading resource: {}'.format(static_resource))
            requests.get(static_resource)

    print('Done getting archive of: {}'.format(sys.argv[1]))
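Invocation sketch for the script above (the script filename is hypothetical; sys.argv[1] is the page URL, sys.argv[2] the output basename):

# python capture_static.py https://example.com/ example-site
# -> writes example-site.warc.gz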