Esempio n. 1
0
def test_unmodified(writer):
    """
    Single request/response pair, no revisits

    One ``request`` and one ``response`` record for the same URL are written
    through ``writer``; the written file is then passed to ``mergeWarc`` and
    the merged output is compared with the golden records via
    ``recordsEqual``.
    """

    records = []
    # Both records receive the same (empty) WARC header dict.
    warcHeaders = {}

    requestHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True)
    records.append (writer.create_warc_record ('http://example.com/', 'request',
            payload=BytesIO(b'foobar'),
            warc_headers_dict=warcHeaders, http_headers=requestHeaders))

    responseHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
    records.append (writer.create_warc_record ('http://example.com/', 'response',
            payload=BytesIO(b'data'),
            warc_headers_dict=warcHeaders, http_headers=responseHeaders))

    for r in records:
        writer.write_record (r)

    # The context manager closes the temporary file even when the comparison
    # below raises, instead of leaving it open until garbage collection.
    with NamedTemporaryFile() as output:
        mergeWarc ([writer.out.name], output)

        output.seek(0)
        recordsEqual (makeGolden (writer, records), ArchiveIterator (output))
Esempio n. 2
0
def test_non_ascii_2():
    st = StatusAndHeaders(
        '200 OK',
        [('Custom-Header',
          u'value; filename="Éxamplè"; param; other=испытание; another')])
    res = st.to_ascii_bytes().decode('ascii')
    assert res == "\
Esempio n. 3
0
def test_different_payload(writer):
    """
    Duplicate URL, but different payload

    Two request/response pairs are written for the same URL; the response
    payloads are ``data0`` and ``data1``.  The written file is passed to
    ``mergeWarc`` and the merged output is compared with the golden records
    via ``recordsEqual``.
    """

    records = []
    for i in range (2):
        # Request and response of one round share the same (empty) dict.
        warcHeaders = {}

        requestHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True)
        records.append (writer.create_warc_record ('http://example.com/', 'request',
                payload=BytesIO(b'foobar'),
                warc_headers_dict=warcHeaders, http_headers=requestHeaders))

        responseHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
        records.append (writer.create_warc_record ('http://example.com/', 'response',
                payload=BytesIO(f'data{i}'.encode ('utf8')),
                warc_headers_dict=warcHeaders, http_headers=responseHeaders))

    for r in records:
        writer.write_record (r)

    # The context manager closes the temporary file even when the comparison
    # below raises, instead of leaving it open until garbage collection.
    with NamedTemporaryFile() as output:
        mergeWarc ([writer.out.name], output)

        output.seek(0)
        recordsEqual (makeGolden (writer, records), ArchiveIterator (output))
Esempio n. 4
0
    def load_resource(self, cdx, params):
        """Build a WARC ``metadata`` record from the info extracted for ``load_url``.

        Returns ``None`` when ``cdx`` has no ``load_url``, when
        ``params['content_type']`` differs from ``self.CONTENT_TYPE`` or when
        ``self.ydl`` is falsy.  Otherwise returns a
        ``(warc_headers, None, payload)`` tuple whose payload is a ``BytesIO``
        holding the UTF-8 encoded JSON dump of
        ``self.ydl.extract_info(load_url)``.
        """
        load_url = cdx.get('load_url')
        if (not load_url
                or params.get('content_type') != self.CONTENT_TYPE
                or not self.ydl):
            return None

        info_buff = json.dumps(self.ydl.extract_info(load_url)).encode('utf-8')

        # Only the part after the scheme is reused; the scheme itself is
        # replaced by ``metadata``.
        _, rest = load_url.split('://', 1)

        dt = timestamp_to_datetime(cdx['timestamp'])

        header_fields = {
            'WARC-Type': 'metadata',
            'WARC-Record-ID': self._make_warc_id(),
            'WARC-Target-URI': 'metadata://' + rest,
            'WARC-Date': datetime_to_iso_date(dt),
            'Content-Type': self.CONTENT_TYPE,
            'Content-Length': str(len(info_buff)),
        }

        return (StatusAndHeaders('WARC/1.0', header_fields.items()),
                None,
                BytesIO(info_buff))
Esempio n. 5
0
    def load_resource(self, cdx, params):
        """Return a WARC ``metadata`` record for the info extracted from ``load_url``.

        ``None`` is returned when ``cdx`` carries no ``load_url``, when the
        requested ``content_type`` is not ``self.CONTENT_TYPE`` or when
        ``self.ydl`` is falsy.  On success the result is the tuple
        ``(warc_headers, None, payload)``; the payload is the JSON dump of
        ``self.ydl.extract_info(load_url)``, UTF-8 encoded and wrapped in a
        ``BytesIO``.
        """
        load_url = cdx.get('load_url')
        if (not load_url
                or params.get('content_type') != self.CONTENT_TYPE
                or not self.ydl):
            return None

        extracted = self.ydl.extract_info(load_url)
        info_buff = json.dumps(extracted).encode('utf-8')

        # Keep everything after the scheme and prefix it with ``metadata://``.
        _, rest = load_url.split('://', 1)
        target_url = 'metadata://' + rest

        dt = timestamp_to_datetime(cdx['timestamp'])

        fields = {
            'WARC-Type': 'metadata',
            'WARC-Record-ID': self._make_warc_id(),
            'WARC-Target-URI': target_url,
            'WARC-Date': datetime_to_iso_date(dt),
            'Content-Type': self.CONTENT_TYPE,
            'Content-Length': str(len(info_buff)),
        }
        warc_headers = StatusAndHeaders('WARC/1.0', fields.items())

        return warc_headers, None, BytesIO(info_buff)
Esempio n. 6
0
def test_resp_revisit_same_url(writer):
    """
    Duplicate record for the same URL, creates a revisit

    Two identical request/response pairs are written.  For the golden
    records the last response is replaced by ``makeRevisit`` of the first
    response (``records[1]``) and that duplicate; the merged output of
    ``mergeWarc`` is then compared with them via ``recordsEqual``.
    """

    records = []
    for i in range (2):
        # Request and response of one round share the same (empty) dict.
        warcHeaders = {}

        requestHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True)
        records.append (writer.create_warc_record ('http://example.com/', 'request',
                payload=BytesIO(b'foobar'),
                warc_headers_dict=warcHeaders, http_headers=requestHeaders))

        responseHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
        records.append (writer.create_warc_record ('http://example.com/', 'response',
                payload=BytesIO(b'data'),
                warc_headers_dict=warcHeaders, http_headers=responseHeaders))

    for r in records:
        writer.write_record (r)

    # Expected output: the second response becomes a revisit of the first.
    dup = records.pop ()
    ref = records[1]
    records.append (makeRevisit (writer, ref, dup))

    # The context manager closes the temporary file even when the comparison
    # below raises, instead of leaving it open until garbage collection.
    with NamedTemporaryFile() as output:
        mergeWarc ([writer.out.name], output)

        output.seek(0)
        recordsEqual (makeGolden (writer, records), ArchiveIterator (output))
Esempio n. 7
0
    def write_request_response_pair(self,
                                    url,
                                    ip,
                                    req_headers,
                                    resp_headers,
                                    is_truncated,
                                    payload,
                                    digest=None,
                                    decompressed=False):
        """Write a WARC request/response record pair for ``url``.

        The writer is opened on demand via ``self.open()``.  The request
        record is always built with the status line ``GET / HTTP/1.1`` and
        the URI ``http://example.com/``; only ``req_headers`` come from the
        caller.  The response record uses ``200 OK`` with the headers
        returned by ``self._fake_resp_headers``.

        :param url: target URI of the response record
        :param ip: value for ``WARC-IP-Address``; when it is not a ``str``
            its first element is used; ``None`` omits the header
        :param req_headers: HTTP headers of the request record
        :param resp_headers: headers passed to ``self._fake_resp_headers``
        :param is_truncated: falsy for a complete payload, otherwise the
            ``WARC-Truncated`` reason; values not in ``valid_truncations``
            are logged and recorded as ``unspecified``
        :param payload: response body (``len()`` and ``BytesIO()`` are
            applied to it)
        :param digest: optional value for ``WARC-Payload-Digest``
        :param decompressed: forwarded to ``self._fake_resp_headers``
        """
        if self.writer is None:
            self.open()

        req_http_headers = StatusAndHeaders('GET / HTTP/1.1', req_headers)

        warc_headers_dict = OrderedDict()
        warc_headers_dict['WARC-Warcinfo-ID'] = self.warcinfo_id
        request = self.writer.create_warc_record(
            'http://example.com/',
            'request',
            warc_headers_dict=warc_headers_dict,
            http_headers=req_http_headers)

        fake_resp_headers = self._fake_resp_headers(resp_headers,
                                                    len(payload),
                                                    decompressed=decompressed)
        resp_http_headers = StatusAndHeaders('200 OK',
                                             fake_resp_headers,
                                             protocol='HTTP/1.1')

        warc_headers_dict = OrderedDict()
        warc_headers_dict['WARC-Warcinfo-ID'] = self.warcinfo_id
        if ip is not None:
            if not isinstance(ip, str):
                ip = ip[0]
            warc_headers_dict['WARC-IP-Address'] = ip
        if digest is not None:
            warc_headers_dict['WARC-Payload-Digest'] = digest
        if is_truncated:
            if is_truncated in valid_truncations:
                warc_headers_dict['WARC-Truncated'] = is_truncated
            else:
                # %s formatting instead of string concatenation: a non-str
                # value such as True must be logged, not raise TypeError.
                LOGGER.error('Invalid is_truncation of %s', is_truncated)
                warc_headers_dict['WARC-Truncated'] = 'unspecified'

        response = self.writer.create_warc_record(
            url,
            'response',
            payload=BytesIO(payload),
            length=len(payload),
            warc_headers_dict=warc_headers_dict,
            http_headers=resp_http_headers)

        self.writer.write_request_response_pair(request, response)
        self.maybe_close()
        LOGGER.debug('wrote warc request-response pair%s for url %s',
                     p(self.prefix), url)
        stats.stats_sum('warc r/r' + p(self.prefix), 1)
Esempio n. 8
0
    def process_document(self, doc):
        """Archive a document fetched with status 200 and enforce the crawl limits.

        Documents with any other status are ignored.  While
        ``self.concurrency_lock`` is held, a ``response`` record for
        ``doc.url`` is written through the ``writer`` name from the enclosing
        scope and ``self.crawlsize`` grows by ``sys.getsizeof(doc.text)``
        divided by 10**6.  When the size limit or the time limit is exceeded,
        ``self.interrupt`` is set and ``self.save_status()`` is called (once
        per exceeded limit).
        """
        if doc.status != 200:
            return

        self.concurrency_lock.acquire()
        try:
            response_headers = StatusAndHeaders('200 OK',
                                                doc.response.getheaders(),
                                                protocol='HTTP/1.0')
            warc_record = writer.create_warc_record(
                doc.url,
                'response',
                payload=io.BytesIO(doc.text),
                http_headers=response_headers)
            writer.write_record(warc_record)

            self.crawlsize += sys.getsizeof(doc.text) / 1000000.0

            size_exceeded = (self.sizelimit is not None
                             and self.crawlsize > self.sizelimit)
            if size_exceeded:
                self.interrupt = True
                self.save_status()

            # Evaluated after the size check on purpose: the clock is read
            # only once the size branch has finished.
            time_exceeded = (self.timelimit is not None
                             and time.time() - self.crawlstarts > self.timelimit)
            if time_exceeded:
                self.interrupt = True
                self.save_status()
        finally:
            self.concurrency_lock.release()
Esempio n. 9
0
    def create_redirect_record(self,
                               url,
                               redirect_url,
                               timestamp,
                               status='301'):
        """Write and return an empty-bodied redirect ``response`` record.

        The record for ``url`` carries the HTTP status line
        ``HTTP/1.0 <status> Redirect``, a ``Location`` header pointing at
        ``redirect_url``, ``Content-Length: 0`` and a ``WARC-Date`` derived
        from ``timestamp`` via ``timestamp_to_iso_date``.
        """
        warc_date = timestamp_to_iso_date(timestamp)

        # The redirect has no body.
        body = b''

        redirect_headers = StatusAndHeaders(
            status + ' Redirect',
            [('Content-Length', str(len(body))),
             ('Location', redirect_url)],
            protocol='HTTP/1.0')

        record = self.writer.create_warc_record(
            url,
            'response',
            payload=BytesIO(body),
            length=len(body),
            http_headers=redirect_headers,
            warc_headers_dict={'WARC-Date': warc_date})
        self.writer.write_record(record)

        return record
Esempio n. 10
0
    def run(self):
        """Consume ``self.out_queue`` forever, appending each item to ``self.warcfile``.

        Every queue item is read by index: ``data[0]`` is the HTTP header
        list, ``data[1]`` the payload, ``data[2]`` the URL, and ``data[3]``
        and ``data[4]`` are joined into the HTTP status line.  The SHA-1 of
        the first ``BLOCK_SIZE`` bytes of the record stream is looked up in
        ``self.dedup``: a known digest is written as a payload-less
        ``revisit`` record, an unknown one is saved and written as a full
        ``response`` record.
        """
        with open(self.warcfile, 'ab') as output:
            while True:
                # NOTE(review): the lock is taken before the blocking get(),
                # as in the original; confirm this ordering is intended.
                self.lock.acquire()
                try:
                    data = self.out_queue.get()
                    try:
                        url = data[2]
                        writer = WARCWriter(output, gzip=False)
                        http_headers = StatusAndHeaders(
                            '{} {}'.format(data[3], data[4]),
                            data[0],
                            protocol='HTTP/1.0')
                        record = writer.create_warc_record(
                            url,
                            'response',
                            payload=data[1],
                            http_headers=http_headers)

                        h = hashlib.sha1()
                        h.update(record.raw_stream.read(BLOCK_SIZE))
                        digest = h.hexdigest()

                        if self.dedup.lookup(digest):
                            record = writer.create_warc_record(
                                url, 'revisit', http_headers=http_headers)
                        else:
                            self.dedup.save(digest, url)
                            # Rewind what was consumed for hashing so the
                            # whole payload is written.
                            record.raw_stream.seek(0)
                        writer.write_record(record)
                    finally:
                        # Acknowledge the item even when writing failed, so
                        # out_queue.join() cannot block forever.
                        self.out_queue.task_done()
                finally:
                    # Never leave the lock held after an exception.
                    self.lock.release()
Esempio n. 11
0
def fetch_urls_to_warc(urls, warcfile_path):
    """Fetch urls and write to warc file

    Each URL is requested with ``Accept-Encoding: identity`` and streamed
    into a ``response`` record of a gzip-compressed WARC file.  The record's
    HTTP status line is taken from the actual response.

    :urls: list of urls to binary files
    :warcfile_path: path to a WARC file.

    """

    with open(warcfile_path, 'wb') as output:
        writer = WARCWriter(output, gzip=True)

        for url in urls:
            print(url)
            resp = requests.get(url,
                                headers={'Accept-Encoding': 'identity'},
                                stream=True)
            try:
                # Record the real status instead of a hard-coded "200 OK",
                # so error responses are not archived as successes.
                status_line = '{} {}'.format(resp.status_code,
                                             resp.reason or '').strip()
                http_headers = StatusAndHeaders(status_line,
                                                resp.raw.headers.items(),
                                                protocol='HTTP/1.0')
                record = writer.create_warc_record(url,
                                                   'response',
                                                   payload=resp.raw,
                                                   http_headers=http_headers)
                writer.write_record(record)
            finally:
                # A streamed response keeps its connection until closed.
                resp.close()
Esempio n. 12
0
    def _writeRequest(self, item):
        """Write the ``request`` record for ``item`` and return its WARC-Record-ID.

        The request line is built from the request method and the relative,
        fragment-less form of ``item.url``.  When the request claims to have
        post data but no body is available, an error is logged, the record is
        marked ``WARC-Truncated: unspecified`` and written without payload;
        otherwise the body is wrapped in ``BytesIO`` and
        ``X-Chrome-Base64Body`` records whether its type is exactly
        ``Base64Body``.
        """
        log = self.logger.bind(reqId=item.id)

        request = item.request
        target = item.url

        requestLine = f'{request.method} {target.relative().with_fragment(None)} HTTP/1.1'
        httpHeaders = StatusAndHeaders(requestLine,
                                       request.headers,
                                       protocol='HTTP/1.1',
                                       is_http_request=True)
        warcHeaders = {
            'X-Chrome-Initiator': json.dumps(request.initiator),
            'X-Chrome-Request-ID': item.id,
            'WARC-Date': datetime_to_iso_date(request.timestamp),
        }

        payload = item.request.body
        if not item.request.hasPostData or payload is not None:
            warcHeaders['X-Chrome-Base64Body'] = str(type(payload) is Base64Body)
            payload = BytesIO(payload)
        else:
            # post data was announced but never captured; cause unknown
            log.error('requestBody missing',
                      uuid='ee9adc58-e723-4595-9feb-312a67ead6a0')
            warcHeaders['WARC-Truncated'] = 'unspecified'

        written = self.writeRecord(target,
                                   'request',
                                   payload=payload,
                                   http_headers=httpHeaders,
                                   warc_headers_dict=warcHeaders)
        return written.rec_headers['WARC-Record-ID']
Esempio n. 13
0
    def _writeResponse(self, item, concurrentTo):
        """Write the ``response`` record for ``item``.

        ``concurrentTo`` is stored as ``WARC-Concurrent-To``.  The
        ``transfer-encoding`` and ``content-encoding`` HTTP headers are
        removed, ``Content-Type`` is replaced by the response's mime type
        (with ``charset=utf-8`` for ``UnicodeBody`` bodies) and
        ``Content-Length`` by the body length.  A missing body marks the
        record ``WARC-Truncated: unspecified``.  For items whose
        ``resourceType`` is ``Document`` the new record id is remembered in
        ``self.documentRecords`` under ``item.url``.
        """
        response = item.response

        warcHeaders = {
            'WARC-Concurrent-To': concurrentTo,
            'X-Chrome-Request-ID': item.id,
            'WARC-Date': datetime_to_iso_date(response.timestamp),
        }
        # optional WARC headers, added only for truthy values
        optionalHeaders = (
            ('WARC-IP-Address', item.remoteIpAddress),
            ('X-Chrome-Protocol', item.protocol),
        )
        for name, value in optionalHeaders:
            if value:
                warcHeaders[name] = value

        # status line: fall back to the standard phrase for the status code
        statusText = response.statusText
        if not statusText:
            fallback = ('No status text available', )
            statusText = BaseHTTPRequestHandler.responses.get (
                    response.status, fallback)[0]
        httpHeaders = StatusAndHeaders(f'{response.status} {statusText}',
                                       response.headers,
                                       protocol='HTTP/1.1')

        # The body is stored decompressed and decoded, so these headers
        # would no longer describe it.
        for name in ('transfer-encoding', 'content-encoding'):
            httpHeaders.remove_header(name)

        # chrome hands out text as utf8 only. HTTP headers take precedence
        # over the document's <meta>, so overriding the header is enough.
        contentType = response.mimeType
        if contentType:
            if isinstance(response.body, UnicodeBody):
                contentType += '; charset=utf-8'
            httpHeaders.replace_header('Content-Type', contentType)

        payload = response.body
        if payload is not None:
            httpHeaders.replace_header('Content-Length', str(len(payload)))
            warcHeaders['X-Chrome-Base64Body'] = str(type(payload) is Base64Body)
            payload = BytesIO(payload)
        else:
            warcHeaders['WARC-Truncated'] = 'unspecified'

        written = self.writeRecord(item.url,
                                   'response',
                                   warc_headers_dict=warcHeaders,
                                   payload=payload,
                                   http_headers=httpHeaders)

        if item.resourceType == 'Document':
            recordId = written.rec_headers.get_header('WARC-Record-ID')
            self.documentRecords[item.url] = recordId
Esempio n. 14
0
    def write_request_response_pair(self,
                                    url,
                                    req_headers,
                                    resp_headers,
                                    is_truncated,
                                    payload,
                                    digest=None):
        """Write a WARC request/response record pair for ``url``.

        The writer is opened on demand via ``self.open()``.  The request
        record is always built with the status line ``GET / HTTP/1.1`` and
        the URI ``http://example.com/``; the response record always uses
        ``200 OK``.  Both header sets pass through
        ``headers_to_str_headers``.

        :param url: target URI of the response record
        :param req_headers: HTTP headers of the request record
        :param resp_headers: HTTP headers of the response record
        :param is_truncated: falsy for a complete payload, otherwise the
            ``WARC-Truncated`` reason; values not in ``valid_truncations``
            are logged and recorded as ``unspecified``
        :param payload: response body (``len()`` and ``BytesIO()`` are
            applied to it)
        :param digest: optional value for ``WARC-Payload-Digest``
        """
        if self.writer is None:
            self.open()

        # XXX WARC-Identified-Payload-Type set from Apache Tika? (done by Common Crawl) (how expensive?)

        req_http_headers = StatusAndHeaders(
            'GET / HTTP/1.1', headers_to_str_headers(req_headers))

        request = self.writer.create_warc_record('http://example.com/',
                                                 'request',
                                                 http_headers=req_http_headers)

        resp_http_headers = StatusAndHeaders(
            '200 OK',
            headers_to_str_headers(resp_headers),
            protocol='HTTP/1.1')

        warc_headers_dict = {}
        if digest is not None:
            warc_headers_dict['WARC-Payload-Digest'] = digest
        if is_truncated:
            if is_truncated in valid_truncations:
                warc_headers_dict['WARC-Truncated'] = is_truncated
            else:
                # %s formatting instead of string concatenation: a non-str
                # value such as True must be logged, not raise TypeError.
                LOGGER.error('Invalid is_truncation of %s', is_truncated)
                warc_headers_dict['WARC-Truncated'] = 'unspecified'

        response = self.writer.create_warc_record(
            url,
            'response',
            payload=BytesIO(payload),
            length=len(payload),
            warc_headers_dict=warc_headers_dict,
            http_headers=resp_http_headers)

        self.writer.write_request_response_pair(request, response)
        self.maybe_close()
        LOGGER.debug('wrote warc request-response pair%s for url %s',
                     p(self.prefix), url)
        stats.stats_sum('warc r/r' + p(self.prefix), 1)
Esempio n. 15
0
def run(url, out_path, time_limit, agent, filetypes, warcfilename, wait):
    """Mirror ``url`` with wget and rewrite the produced WARC as ``.warc.gz``.

    A ``wget --mirror`` command line is assembled and executed through
    ``system_check``; a ``CalledProcessError`` only produces a warning.
    Afterwards ``<basename>.warc`` is copied record by record into
    ``<basename>.warc.gz`` and the uncompressed file is removed.  Records
    with ``Transfer-Encoding: chunked`` are skipped, and HTTP headers that
    cannot be encoded as ASCII are replaced by a status-code-only header.

    :param url: URL to mirror
    :param out_path: download directory (wget ``-P``)
    :param time_limit: if truthy, wget is wrapped in ``timeout <time_limit>``
    :param agent: optional user agent string
    :param filetypes: optional accept list (wget ``-A``)
    :param warcfilename: WARC file name; a trailing ``.warc.gz`` is stripped
        to obtain the base name.  ``None`` runs wget without WARC output and
        skips the recompression step.
    :param wait: optional value for wget ``--wait``
    """
    # Function-scope import: needed only for quoting the shell arguments.
    import shlex

    warc_suffix = ".warc.gz"

    cmd = ""
    if time_limit:
        cmd += "timeout {} ".format(time_limit)

    # Every user-supplied value is shell-quoted so that spaces, '&', '$' and
    # similar characters cannot break or alter the command.
    waitoption = ""
    if wait is not None:
        # str(): numeric waits used to raise TypeError on concatenation.
        waitoption = "--wait " + shlex.quote(str(wait))
    agentoption = ""
    if agent is not None:
        agentoption = "--user-agent " + shlex.quote(agent)

    filetypesoption = ""
    if filetypes is not None:
        filetypesoption = "-A " + shlex.quote(filetypes)

    warcoption = ""
    warcfilebasename = None
    if warcfilename is not None:
        # Strip the suffix only when present; str.find() returning -1 used
        # to chop off the last character of names without it.
        if warcfilename.endswith(warc_suffix):
            warcfilebasename = warcfilename[:-len(warc_suffix)]
        else:
            warcfilebasename = warcfilename
        warcoption = "--warc-file " + shlex.quote(warcfilebasename)

        if check_wget_compression("wget --help | grep 'no-warc-compression'"):
            warcoption += " --no-warc-compression"

    cmd += "wget --mirror {WAIT} {FILETYPES} -q -o /dev/null {URL} -P {DOWNLOAD_PATH} {AGENT} {WARC}".format(
        WAIT=waitoption,
        FILETYPES=filetypesoption,
        URL=shlex.quote(url),
        DOWNLOAD_PATH=shlex.quote(out_path),
        AGENT=agentoption,
        WARC=warcoption)
    try:
        system_check(cmd)
    except subprocess.CalledProcessError:
        sys.stderr.write(
            "Warning: Some files could not be downloaded with wget\n")

    if warcfilebasename is None:
        # No WARC was requested, so there is nothing to recompress or remove.
        return

    plain_warc = warcfilebasename + ".warc"
    with open(plain_warc, 'rb') as f_in:
        with open(warcfilebasename + ".warc.gz", 'wb') as f_out:
            writer = WARCWriter(f_out, gzip=True)
            try:
                for record in ArchiveIterator(f_in):
                    if record.http_headers:
                        if record.http_headers.get_header(
                                'Transfer-Encoding') == "chunked":
                            continue
                        try:
                            record.http_headers.to_ascii_bytes()
                        except UnicodeEncodeError:
                            # if header is non ascii, create a new header, with status code only
                            # content length and content type will be filled before writing
                            record.http_headers = StatusAndHeaders(
                                record.http_headers.get_statuscode(), [])
                    record.length = None
                    writer.write_record(record)
            except Exception as exc:
                # Still best-effort (records written so far are kept), but
                # the failure is reported instead of silently swallowed.
                sys.stderr.write(
                    "Warning: WARC recompression stopped early: {}\n".format(exc))

    system_check("rm {WARC}".format(WARC=shlex.quote(plain_warc)))
Esempio n. 16
0
    def bin_stream(stream, content_type, status='200 OK', headers=None):
        """Wrap ``stream`` in a ``WbResponse``.

        The response headers start with ``Content-Type: <content_type>``
        followed by any extra ``headers``; ``status`` is the status line.
        """
        response_headers = [('Content-Type', content_type)]
        if headers:
            response_headers.extend(headers)

        return WbResponse(StatusAndHeaders(status, response_headers),
                          value=stream)
Esempio n. 17
0
def warc_from_response(response, resolved_url):
    """Serialise ``response`` and its request as gzip-compressed WARC bytes.

    A ``response`` record is written first, then a ``request`` record whose
    ``WARC-Concurrent-To`` header holds the response's ``WARC-Record-ID``.
    Both use ``resolved_url`` as target URI.  Returns the bytes written to
    the in-memory buffer.
    """
    with BytesIO() as sink:
        warc_writer = WARCWriter(sink, gzip=True)

        # Response record
        # NOTE(review): the status line is always "200 OK", whatever the
        # actual response status was; confirm this is intended.
        resp_http = StatusAndHeaders(
            "200 OK",
            list(response.headers.to_unicode_dict().items()),
            protocol="HTTP/1.0")
        resp_record = warc_writer.create_warc_record(
            resolved_url,
            "response",
            payload=BytesIO(response.body),
            http_headers=resp_http)
        warc_writer.write_record(resp_record)

        # Request record
        # NOTE(review): the request headers also carry a "200 OK" status
        # line rather than a request line; confirm this is intended.
        req_http = StatusAndHeaders(
            "200 OK",
            list(response.request.headers.to_unicode_dict().items()),
            protocol="HTTP/1.0")
        req_record = warc_writer.create_warc_record(
            resolved_url,
            "request",
            payload=BytesIO(response.request.body),
            http_headers=req_http)
        req_record.rec_headers.add_header(
            "WARC-Concurrent-To",
            resp_record.rec_headers.get_header("WARC-Record-ID"))
        warc_writer.write_record(req_record)

        return sink.getvalue()
Esempio n. 18
0
    def parse_response(self, url, response, ip=None):
        """Build a WARC ``response`` record from a response dictionary.

        Keys read from ``response``: ``content`` (with optional ``text`` and
        ``encoding``), ``headers`` (a list of ``name``/``value`` dicts),
        ``status`` and ``statusText``.  A base64 ``encoding`` is decoded,
        any other text is UTF-8 encoded.  ``content-encoding`` and
        ``transfer-encoding`` headers are dropped and ``Content-Length`` is
        made to match the stored payload.

        :param url: target URI of the record
        :param response: response dictionary as described above
        :param ip: optional value for ``WARC-IP-Address``
        :return: the record created by ``self.writer.create_warc_record``
        """
        headers = []
        payload = BytesIO()
        content_info = response['content']
        # 'text' may be missing or null; both mean "no body".
        content = content_info.get('text') or ''
        # 'headers' may be missing or null as well; the check below already
        # anticipates that case, so the loop further down must survive it.
        raw_headers = response.get('headers') or []

        if not content and not raw_headers:
            self.logger.info('No headers or payload for: {0}'.format(url))
            headers.append(('Content-Length', '0'))
        if content_info.get('encoding') == 'base64':
            payload.write(base64.b64decode(content))
        else:
            payload.write(content.encode('utf-8'))

        length = payload.tell()
        payload.seek(0)

        SKIP_HEADERS = ('content-encoding', 'transfer-encoding')

        # Set but not used yet, see the TODO below.
        http2 = False

        for header in raw_headers:
            if header['name'].lower() not in SKIP_HEADERS:
                headers.append((header['name'], header['value']))

            #TODO: http2 detection -- write as same warc header?
            if (not http2 and
                header['name'] in (':method', ':scheme', ':path')):
                http2 = True

        status = response.get('status') or 204

        reason = response.get('statusText')
        if not reason:
            reason = http_status_names.get(status, 'No Reason')

        status_line = str(status) + ' ' + reason

        proto = self._get_http_version(response)

        http_headers = StatusAndHeaders(status_line, headers, protocol=proto)

        if not content:
            content_length = http_headers.get_header('Content-Length', '0')
            if content_length != '0':
                self.logger.info('No Content for length {0} {1}'.format(content_length, url))
                http_headers.replace_header('Content-Length', '0')
        else:
            http_headers.replace_header('Content-Length', str(length))

        warc_headers_dict = {}
        if ip:
            warc_headers_dict['WARC-IP-Address'] = ip

        record = self.writer.create_warc_record(url, 'response',
                                                http_headers=http_headers,
                                                payload=payload,
                                                length=length,
                                                warc_headers_dict=warc_headers_dict)

        return record
Esempio n. 19
0
    def do_rewrite(cls, statusline, headers):
        """Return ``cls.get_rwinfo`` for a sample ``response`` record.

        The record targets ``http://example.com/`` and carries the given
        ``statusline`` and ``headers`` as HTTP/1.0 headers.
        """
        builder = BufferWARCWriter()
        response_headers = StatusAndHeaders(statusline,
                                            headers,
                                            protocol='HTTP/1.0')

        return cls.get_rwinfo(
            builder.create_warc_record('http://example.com/',
                                       'response',
                                       http_headers=response_headers))
Esempio n. 20
0
    def _sample_request(self, writer):
        """Create a sample ``request`` record for ``http://example.com/`` with ``writer``."""
        request_headers = StatusAndHeaders(
            'GET / HTTP/1.0',
            [('User-Agent', 'foo'), ('Host', 'example.com')])

        return writer.create_warc_record('http://example.com/',
                                         'request',
                                         http_headers=request_headers)
Esempio n. 21
0
    def text_response(text,
                      status='200 OK',
                      content_type='text/plain; charset=utf-8'):
        """Build a ``WbResponse`` whose body is ``text`` encoded as UTF-8.

        ``Content-Length`` is the length of the encoded bytes, not of the
        original string.
        """
        body = text.encode('utf-8')
        header_list = [
            ('Content-Type', content_type),
            ('Content-Length', str(len(body))),
        ]

        return WbResponse(StatusAndHeaders(status, header_list), value=[body])
Esempio n. 22
0
    def default_http_headers(self, length, content_type=None):
        """Return ``HTTP/1.0 200 OK`` headers for a payload.

        ``Content-Type`` is included for a truthy ``content_type`` and
        ``Content-Length`` for a ``length`` that is not ``None`` and not
        negative.
        """
        header_list = [('Content-Type', content_type)] if content_type else []

        if length is not None and length >= 0:
            header_list.append(('Content-Length', str(length)))

        return StatusAndHeaders('200 OK',
                                headers=header_list,
                                protocol='HTTP/1.0')
Esempio n. 23
0
def _test_proxy_headers(http_cache=None):
    """Rewrite the headers from ``_make_cache_headers`` and return the result.

    The rewrite uses a ``UrlRewriter`` whose ``http_cache`` option is the
    given value; the rewritten ``status_headers`` are returned.
    """
    status_headers = StatusAndHeaders('200 OK', _make_cache_headers())
    url_rewriter = UrlRewriter('20131010/http://example.com/', '/pywb/',
                               rewrite_opts={'http_cache': http_cache})

    result = headerrewriter.rewrite(status_headers,
                                    url_rewriter,
                                    url_rewriter.get_cookie_rewriter())
    return result.status_headers
Esempio n. 24
0
def sample_request(builder):
    """Create a sample ``request`` record for ``http://example.com/`` with ``builder``."""
    request_headers = StatusAndHeaders(
        'GET / HTTP/1.0',
        [('User-Agent', 'foo'), ('Host', 'example.com')],
        is_http_request=True)

    record = builder.create_warc_record('http://example.com/',
                                        'request',
                                        http_headers=request_headers)
    return record
Esempio n. 25
0
    def write_memento(self, murl=None):
        """
        This is function to write memento in WARC format.

        When ``lookup_memento`` does not already know ``murl``, the memento
        is downloaded and stored as a gzip-compressed WARC file under
        ``<memento_dir>/<handle>/<domain>/<archive>/<wrep><lang>/``, named
        after the timestamp returned by ``Utils.get_murl_info``.  Errors are
        reported on stderr and never raised.

        Parameters:
            murl (str): URI-M

        Returns:
            (bool): True on Success and False on Failure
        """
        try:
            if self.lookup_memento(murl):
                return True

            response = Utils.get_murl_info(murl, self.__thandle)
            mpath = os.path.join(self.__memento_dir,
                                 response["handle"].lower(),
                                 response["domain"],
                                 response["archive"],
                                 response["wrep"] + response["lang"])
            # One race-free call instead of an exists()/mkdir() pair per
            # level, which could fail when two writers create the same
            # directory at the same time.
            os.makedirs(mpath, exist_ok=True)
            try:
                mpath = os.path.join(mpath, str(response["timestamp"]) + self.__constants.WARC_EXT)
                with open(mpath, "wb") as output:
                    writer = WARCWriter(output, gzip=True)
                    resp = requests.get(murl,
                                        headers={'Accept-Encoding': 'identity'},
                                        stream=True, timeout=120)
                    try:
                        # get raw headers from urllib3
                        headers_list = resp.raw.headers.items()
                        # NOTE(review): the status line is always "200 OK"
                        # and the record URI is the local file path rather
                        # than murl; confirm both are intended.
                        http_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.1')
                        record = writer.create_warc_record(mpath, 'response',
                                                           payload=resp.raw,
                                                           http_headers=http_headers)
                        writer.write_record(record)
                    finally:
                        # A streamed response keeps its connection until closed.
                        resp.close()
                return True
            except requests.exceptions.TooManyRedirects:
                sys.stderr.write(murl + "Too Many redirects" + "\n")
            except requests.exceptions.ConnectTimeout:
                sys.stderr.write(murl + "Connection Timeout" + "\n")
            except Exception as e:
                sys.stderr.write("Memento Write Error: " + str(e) + "URL:" + murl + "\n")
        except Exception as e:
            sys.stderr.write("Memento Write Error: " + murl + " " + str(e) + "\n")
        return False
Esempio n. 26
0
    def create_warcinfo_record(self, filename, info):
        """Create a ``warcinfo`` record describing this WARC.

        ``filename`` (when truthy) becomes the ``WARC-Filename`` header.
        Every entry of ``info`` with a truthy value is written to the
        payload as a ``name: value`` line terminated by CRLF and encoded as
        UTF-8.
        """
        fields = [('WARC-Type', 'warcinfo'),
                  ('WARC-Record-ID', self._make_warc_id())]
        if filename:
            fields.append(('WARC-Filename', filename))
        fields.append(('WARC-Date', self._make_warc_date()))

        warc_headers = StatusAndHeaders(self.warc_version, [])
        for field_name, field_value in fields:
            warc_headers.add_header(field_name, field_value)

        body = BytesIO()
        for key, val in six.iteritems(info):
            if val:
                body.write((key + ': ' + str(val) + '\r\n').encode('utf-8'))

        # The write position after the last line is the payload length.
        body_length = body.tell()
        body.seek(0)

        return self.create_warc_record('', 'warcinfo',
                                       warc_headers=warc_headers,
                                       payload=body,
                                       length=body_length)
Esempio n. 27
0
    def _create_response_record(self, url, headers, payload, warc_headers):
        """Create an ``HTTP/1.0 200 OK`` ``response`` record for ``url``.

        ``payload`` is a string and is stored UTF-8 encoded; a falsy
        ``warc_headers`` is replaced by an empty dict.
        """
        builder = BufferWARCWriter()

        if not warc_headers:
            warc_headers = {}

        body = payload.encode('utf-8')
        response_headers = StatusAndHeaders('200 OK',
                                            headers,
                                            protocol='HTTP/1.0')

        record = builder.create_warc_record(url, 'response',
                                            payload=BytesIO(body),
                                            length=len(body),
                                            http_headers=response_headers,
                                            warc_headers_dict=warc_headers)
        return record
Esempio n. 28
0
    def __call__(self):
        """Return a new ``StatusAndHeaders`` with every header rewritten.

        Each header of ``self.http_headers`` is passed to
        ``self.rewrite_header`` together with the rule registered for its
        lower-cased name.  Falsy results drop the header, list results
        contribute all their entries and any other result is added as one
        header.  Status line and protocol are copied unchanged.
        """
        rewritten = []
        for name, value in self.http_headers.headers:
            result = self.rewrite_header(name, value,
                                         self.header_rules.get(name.lower()))
            if not result:
                continue

            if isinstance(result, list):
                rewritten += result
            else:
                rewritten.append(result)

        source = self.http_headers
        return StatusAndHeaders(source.statusline,
                                headers=rewritten,
                                protocol=source.protocol)
Esempio n. 29
0
    def options_response(env):
        """Construct WbResponse for OPTIONS based on the WSGI env dictionary

        The response has an empty ``text/plain`` body description
        (``Content-Length: 0``) and receives the access control headers
        derived from ``env``.

        :param dict env: The WSGI environment dictionary
        :return: The WBResponse for the options request
        :rtype: WbResponse
        """
        header_list = [('Content-Type', 'text/plain'),
                       ('Content-Length', '0')]
        options = WbResponse(StatusAndHeaders('200 Ok', header_list))
        options.add_access_control_headers(env=env)
        return options
Esempio n. 30
0
    def redir_response(location, status='302 Redirect', headers=None):
        """Utility method for constructing redirection response.

        The response always carries ``Location`` and ``Content-Length: 0``;
        additional ``headers`` are appended after them.

        :param str location: The location of the resource redirecting to
        :param str status: The HTTP status line
        :param list[tuple[str, str]] headers: Additional headers for this response
        :return: WbResponse redirection response
        :rtype: WbResponse
        """
        header_list = [('Location', location), ('Content-Length', '0')]
        if headers:
            header_list.extend(headers)

        status_headers = StatusAndHeaders(status, header_list)
        return WbResponse(status_headers)
Esempio n. 31
0
def test_resp_3():
    """A default redirect response has an empty body and a 302 status line
    with ``Location`` and zero ``Content-Length`` headers."""
    redirect = WbResponse.redir_response('http://example.com/otherfile')

    expected_status_headers = StatusAndHeaders(
        protocol='',
        statusline='302 Redirect',
        headers=[('Location', 'http://example.com/otherfile'),
                 ('Content-Length', '0')])

    # compare the full attribute dict of the response object
    assert vars(redirect) == {
        'body': [],
        'status_headers': expected_status_headers,
    }
Esempio n. 32
0
    def load_resource(self, cdx, params):
        """Fetch the resource described by ``cdx`` from its ``load_url`` and
        wrap the upstream response with generated WARC response headers.

        Side effects visible in this method: ``cdx['timestamp']`` is updated
        when the upstream response carries a ``Memento-Datetime`` header,
        ``cdx['source']`` is set for ``Warcserver-Type: warc`` responses, and
        the request-header mapping returned by
        ``params['_input_req'].get_req_headers()`` is modified in place.

        :param cdx: index entry; keys read here are ``load_url``, ``is_live``,
            ``timestamp``, ``memento_url``, ``set_referrer`` and ``url``
        :param params: request parameters; ``content_type`` and ``_input_req``
            are read here, and the whole mapping is passed on to
            ``_do_request_with_redir_check``
        :return: ``None`` when there is no ``load_url``, when the requested
            content type is ``VideoLoader.CONTENT_TYPE``, or when
            ``memento_url`` is set but the upstream response has no
            ``Memento-Datetime`` header; ``(None, upstream headers, upstream
            response)`` when the upstream identifies itself with
            ``Warcserver-Type: warc``; otherwise
            ``(warc_headers, http_headers_buff, upstream_res)`` where
            ``warc_headers`` is a ``StatusAndHeaders`` with a ``WARC/1.0``
            status line and ``http_headers_buff`` is the serialized HTTP
            status line and headers
        :raises LiveResourceException: if ``load_url`` cannot be prepared as a
            request URL
        """
        load_url = cdx.get('load_url')
        if not load_url:
            return None

        if params.get('content_type') == VideoLoader.CONTENT_TYPE:
            return None

        # non-live loads are routed through the forward proxy prefix, if one is set
        if self.forward_proxy_prefix and not cdx.get('is_live'):
            load_url = self.forward_proxy_prefix + load_url

        input_req = params['_input_req']

        req_headers = input_req.get_req_headers()

        dt = timestamp_to_datetime(cdx['timestamp'])

        if cdx.get('memento_url'):
            req_headers['Accept-Datetime'] = datetime_to_http_date(dt)

        method = input_req.get_req_method()
        data = input_req.get_req_body()

        # PreparedRequest is used only to prepare the URL and to derive an
        # Authorization header from it; the request itself is sent further below
        p = PreparedRequest()
        try:
            p.prepare_url(load_url, None)
        except:
            # NOTE(review): bare except, any failure while preparing the URL is
            # reported as LiveResourceException
            raise LiveResourceException(load_url)
        p.prepare_headers(None)
        p.prepare_auth(None, load_url)

        auth = p.headers.get('Authorization')
        if auth:
            req_headers['Authorization'] = auth

        # continue with the URL as prepared by PreparedRequest
        load_url = p.url

        # host is set to the actual host for live loading
        # ensure it is set to the load_url host
        if not cdx.get('is_live'):
            #req_headers.pop('Host', '')
            req_headers['Host'] = urlsplit(p.url).netloc

            referrer = cdx.get('set_referrer')
            if referrer:
                req_headers['Referer'] = referrer

        upstream_res = self._do_request_with_redir_check(method, load_url,
                                                         data, req_headers,
                                                         params, cdx)

        memento_dt = upstream_res.headers.get('Memento-Datetime')
        if memento_dt:
            # the datetime reported by upstream replaces the one from the index entry
            dt = http_date_to_datetime(memento_dt)
            cdx['timestamp'] = datetime_to_timestamp(dt)
        elif cdx.get('memento_url'):
        # if 'memento_url' set and no Memento-Datetime header present
        # then its an error
            return None

        agg_type = upstream_res.headers.get('Warcserver-Type')
        if agg_type == 'warc':
            # the response is handed back as-is, without generated WARC headers
            # NOTE(review): unquote() receives None if 'Warcserver-Source-Coll'
            # is missing -- confirm upstream always sends it with this type
            cdx['source'] = unquote(upstream_res.headers.get('Warcserver-Source-Coll'))
            return None, upstream_res.headers, upstream_res

        # upstream_res.version == 11 is reported as HTTP/1.1, anything else as HTTP/1.0
        if upstream_res.version == 11:
            version = '1.1'
        else:
            version = '1.0'

        status = 'HTTP/{version} {status} {reason}\r\n'
        status = status.format(version=version,
                               status=upstream_res.status,
                               reason=upstream_res.reason)

        # the buffer starts with the status line; header lines are appended below
        http_headers_buff = status

        orig_resp = upstream_res._original_response

        # NOTE(review): the bare except below is used as the PY 3 -> PY 2
        # fallback, so any error in the PY 3 branch (not only a missing
        # attribute) falls through to the PY 2 branch with a possibly
        # partially filled buffer
        try:  #pragma: no cover
        #PY 3
            resp_headers = orig_resp.headers._headers
            for n, v in resp_headers:
                nl = n.lower()
                if nl in self.SKIP_HEADERS:
                    continue

                if nl in self.UNREWRITE_HEADERS:
                    v = self.unrewrite_header(cdx, v)

                # the original header name casing is kept in the output
                http_headers_buff += n + ': ' + v + '\r\n'

            # blank line terminating the header block
            http_headers_buff += '\r\n'

            try:
                # http headers could be encoded as utf-8 (though non-standard)
                # first try utf-8 encoding
                http_headers_buff = http_headers_buff.encode('utf-8')
            except:
                # then, fall back to latin-1
                http_headers_buff = http_headers_buff.encode('latin-1')

        except:  #pragma: no cover
        #PY 2
            resp_headers = orig_resp.msg.headers

            for line in resp_headers:
                n, v = line.split(':', 1)
                n = n.lower()
                v = v.strip()

                if n in self.SKIP_HEADERS:
                    continue

                new_v = v
                if n in self.UNREWRITE_HEADERS:
                    new_v = self.unrewrite_header(cdx, v)

                # only rebuild the line when the value changed (the rebuilt line
                # uses the lower-cased name); otherwise keep the raw header line
                if new_v != v:
                    http_headers_buff += n + ': ' + new_v + '\r\n'
                else:
                    http_headers_buff += line

            # if python2, already byte headers, so leave as is
            http_headers_buff += '\r\n'

        # best effort: reach through private attributes to the underlying
        # socket for the peer address; any failure leaves remote_ip as None
        try:
            fp = upstream_res._fp.fp
            if hasattr(fp, 'raw'):  #pragma: no cover
                fp = fp.raw
            remote_ip = fp._sock.getpeername()[0]
        except:  #pragma: no cover
            remote_ip = None

        warc_headers = {}

        warc_headers['WARC-Type'] = 'response'
        warc_headers['WARC-Record-ID'] = self._make_warc_id()
        warc_headers['WARC-Target-URI'] = cdx['url']
        warc_headers['WARC-Date'] = datetime_to_iso_date(dt)

        # non-live loads additionally record the URL they were loaded from
        # and the current UTC time
        if not cdx.get('is_live'):
            now = datetime.datetime.utcnow()
            warc_headers['WARC-Source-URI'] = cdx.get('load_url')
            warc_headers['WARC-Creation-Date'] = datetime_to_iso_date(now)

        if remote_ip:
            warc_headers['WARC-IP-Address'] = remote_ip

        ct = upstream_res.headers.get('Content-Type')
        if ct:
            metadata = self.get_custom_metadata(ct, dt)
            if metadata:
                warc_headers['WARC-JSON-Metadata'] = json.dumps(metadata)

        warc_headers['Content-Type'] = 'application/http; msgtype=response'

        if method == 'HEAD':
            content_len = 0
        else:
            # -1 when upstream sent no Content-Length; how _set_content_len
            # treats that value is not visible here
            content_len = upstream_res.headers.get('Content-Length', -1)

        self._set_content_len(content_len,
                              warc_headers,
                              len(http_headers_buff))

        # the plain dict is wrapped as StatusAndHeaders with a WARC/1.0 status line
        warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
        return (warc_headers, http_headers_buff, upstream_res)