Esempio n. 1
0
    def writeRecordToTransport(r, t):
        """Replay the HTTP response stored in record ``r`` onto transport ``t``.

        ``r.content[1]`` is parsed as an HTTP response.  The status line is
        written first, followed by the headers, a blank line and the parsed
        body, using four separate ``t.write`` calls.

        Headers whose lower-cased name appears in ``replaced`` below are not
        forwarded under their own name; a ``Content-Length`` computed from the
        parsed body and ``Connection: keep-alive`` are appended instead.  Every
        original header, replaced or not, is additionally sent with an
        ``X-Archive-Orig-`` prefix.
        """
        replaced = ("connection", "content-length",
                    "cache-control", "accept-ranges", "etag",
                    "last-modified", "transfer-encoding")

        response = ResponseMessage(RequestMessage())
        response.feed(r.content[1])
        response.close()
        payload = response.get_body()
        head = response.header

        # Snapshot the parsed headers once; both header lists derive from it.
        pairs = list(head.headers)
        forwarded = [(key, val) for key, val in pairs
                     if key.lower() not in replaced]
        archived = [("X-Archive-Orig-%s" % key, val) for key, val in pairs]

        # These two always reflect the replayed body, not the archived one.
        forwarded.append(("Content-Length", "%d" % len(payload)))
        forwarded.append(("Connection", "keep-alive"))

        # Status line, header block, separator, body.
        t.write("%s %d %s\r\n" % (head.version, head.code, head.phrase))
        t.write("\r\n".join("%s: %s" % pair for pair in forwarded + archived))
        t.write("\r\n\r\n")
        t.write(payload)
Esempio n. 2
0
def parse_http_response_charset(record):
    """Parses the payload of an HTTP 'response' record, returning code,
    content type, declared character set and body.

    Adapted from github's internetarchive/warctools hanzo/warcfilter.py,
    commit 1850f328e31e505569126b4739cec62ffa444223. MIT licenced."""
    message = ResponseMessage(RequestMessage())
    remainder = message.feed(record.content[1])
    message.close()
    if remainder or not message.complete():
        if remainder:
            print 'trailing data in http response for', record.url
        if not message.complete():
            print 'truncated http response for', record.url
    header = message.header

    mime_type = [v for k,v in header.headers if k.lower() == b'content-type']
    charset = None
    if mime_type:
        match = re.search(r'charset=(\S+)', mime_type[0], re.I)
        if match:
            charset = match.group(1).lower()
        mime_type = mime_type[0].split(b';')[0]
    else:
        mime_type = None

    return header.code, mime_type, charset, message.get_body()
Esempio n. 3
0
  def iter_zip(self):
    """Yield the chunks of a zip archive built from the WARC response records.

    A ``ZipFile`` is opened for writing on ``self``.  For every entry from
    ``self.archive.read_records(limit=None)`` that is a response record whose
    content type, with all whitespace removed, equals
    ``ResponseMessage.CONTENT_TYPE``, the parsed HTTP body is stored in the
    zip under ``self.url_to_filename(record.url)`` and the filename -> URL
    pair is remembered in ``self.files``.  Otherwise, if errors were reported
    for the entry, they are appended to ``self.errors``.

    After each stored record, and once more after the zip has been closed,
    everything accumulated in ``self.buffer`` is yielded and the buffer is
    reset.  The zip ends with a ``files.txt`` index and, when anything was
    logged, an ``errors.txt``.
    """
    # (start, stop) slices of record.date: year, month, day, hour, min, sec.
    # assumes record.date looks like YYYY-MM-DDTHH:MM:SS -- TODO confirm
    date_fields = ((0, 4), (5, 7), (8, 10), (11, 13), (14, 16), (17, 19))

    # presumably ZipFile's writes to ``self`` land in self.buffer; the
    # file-like methods are not visible here -- confirm
    with ZipFile(self, "w") as outzip:
      for (offset, record, errors) in self.archive.read_records(limit=None):
        is_http_response = (
            record
            and record.type == WarcRecord.RESPONSE
            and re.sub(r'\s+', '', record.content[0]) == ResponseMessage.CONTENT_TYPE)

        if is_http_response:
          response = ResponseMessage(RequestMessage())
          response.feed(record.content[1])
          response.close()

          filename = self.url_to_filename(record.url)
          stamp = record.date
          date_time = tuple(int(stamp[start:stop]) for start, stop in date_fields)

          outzip.writestr(ZipInfo(filename, date_time), response.get_body())
          self.files[filename] = record.url

          # Hand over whatever has been buffered so far, then start afresh.
          for chunk in self.buffer:
            yield chunk
          self.buffer = []

        elif errors:
          # NOTE(review): ``name`` is not defined anywhere in this method;
          # presumably a module-level global -- confirm, otherwise NameError.
          self.errors.append("warc errors at %s:%d" % (name, offset or 0))
          self.errors.extend(errors)

      index_lines = ["%s -> %s" % (url, stored_name)
                     for stored_name, url in self.files.iteritems()]
      outzip.writestr("files.txt", "\n".join(index_lines))
      if self.errors:
        outzip.writestr("errors.txt", "\n".join(self.errors))

    # Leaving the ``with`` block closed the zip; emit what that produced.
    for chunk in self.buffer:
      yield chunk

    self.buffer = []
Esempio n. 4
0
 def writeRecordToTransport(r, t):
     """Write the HTTP response archived in record ``r`` to transport ``t``.

     The response is parsed from ``r.content[1]``.  Output consists of the
     status line, the header block, an empty line and the parsed body, sent
     as four ``t.write`` calls.

     Original headers named in ``dropped`` (compared lower-cased) are left
     out of the forwarded set, which instead ends with a ``Content-Length``
     derived from the parsed body and ``Connection: keep-alive``.  All
     original headers are also echoed with an ``X-Archive-Orig-`` prefix.
     """
     dropped = ("connection", "content-length",
                "cache-control", "accept-ranges", "etag",
                "last-modified", "transfer-encoding")

     parsed = ResponseMessage(RequestMessage())
     parsed.feed(r.content[1])
     parsed.close()
     body = parsed.get_body()
     status = parsed.header

     # Take one snapshot of the headers and derive both lists from it.
     original = list(status.headers)
     kept = [(field, text) for field, text in original
             if field.lower() not in dropped]
     echoed = [("X-Archive-Orig-%s" % field, text) for field, text in original]

     # Length and connection handling describe the replayed response.
     kept.append(("Content-Length", "%d" % len(body)))
     kept.append(("Connection", "keep-alive"))

     # write the response
     t.write("%s %d %s\r\n" % (status.version, status.code, status.phrase))
     t.write("\r\n".join("%s: %s" % item for item in kept + echoed))
     t.write("\r\n\r\n")
     t.write(body)
Esempio n. 5
0
    def __call__(self, request):
        """Called by HTTPServer to execute the request."""
        web_match = re.match(self.WEB_RE, request.uri)
        if not web_match:
            web_match = re.match(self.WEB_VIA_PROXY_RE, request.uri)

        if web_match:
            request.host = "warc"
            request.uri = web_match.group("uri")
            request.path, sep, query = request.uri.partition("?")
            self.web_handler.__call__(request)

        else:
            with self.proxy_handler.warc_record_for_uri(
                    canonicalize_url(request.uri)) as record:
                if record:
                    print "Serving %s from WARC" % request.uri

                    # parse the response
                    message = ResponseMessage(RequestMessage())
                    message.feed(record[1].content[1])
                    message.close()

                    body = message.get_body()

                    # construct new headers
                    new_headers = []
                    old_headers = []
                    for k, v in message.header.headers:
                        if not k.lower() in ("connection", "content-length",
                                             "cache-control", "accept-ranges",
                                             "etag", "last-modified",
                                             "transfer-encoding"):
                            new_headers.append((k, v))
                        old_headers.append(("X-Archive-Orig-%s" % k, v))

                    new_headers.append(("Content-Length", "%d" % len(body)))
                    new_headers.append(("Connection", "keep-alive"))

                    # write the response
                    request.write("%s %d %s\r\n" %
                                  (message.header.version, message.header.code,
                                   message.header.phrase))
                    request.write("\r\n".join([
                        "%s: %s" % (k, v)
                        for k, v in (new_headers + old_headers)
                    ]))
                    request.write("\r\n\r\n")
                    request.write(body)

                else:
                    print "Could not find %s in WARC" % request.uri
                    request.write(
                        "HTTP/1.0 404 Not Found\r\nConnection: keep-alive\r\nContent-Type: text/plain\r\nContent-Length: 91\r\n\r\nThis URL is not in any of your archives. Close the WARC viewer to resume normal browsing.\r\n"
                    )
            request.finish()
Esempio n. 6
0
def dump_record(fh, outzip):
    for (offset, record, errors) in fh.read_records(limit=None):
        if record and record.type == WarcRecord.RESPONSE and record.content[0] == ResponseMessage.CONTENT_TYPE:
            message = ResponseMessage(RequestMessage())
            leftover = message.feed(record.content[1])
            message.close()

            outzip.writestr(re.sub(r'^https?://', '', record.url), message.get_body())
            print(record.url)
        elif errors:
            print >> sys.stderr, "warc errors at %s:%d"%(name, offset if offset else 0)
            for e in errors:
                print '\t', e
Esempio n. 7
0
    def extractPayload(record):
        """
        Return the HTTP body of a WARC record, gunzipped when possible.

        ``record.content[1]`` is parsed as an HTTP response and its body is
        passed through a zlib decompressor created with
        ``16 + zlib.MAX_WBITS``.  If zlib rejects the data, the body is
        returned exactly as the parser produced it.

        :type record: WarcRecord
        """
        response = ResponseMessage(RequestMessage())
        response.feed(record.content[1])
        response.close()
        raw_body = response.get_body()

        # Best effort: a body zlib cannot decode is passed through untouched.
        gunzip = zlib.decompressobj(16 + zlib.MAX_WBITS)
        try:
            return gunzip.decompress(raw_body)
        except zlib.error:
            return raw_body
Esempio n. 8
0
    def extractPayload(record):
        """
        Extract the HTTP response body from a WARC record.

        The response is parsed from ``record.content[1]``.  Decompression of
        the body is attempted with a zlib decompressor created with
        ``16 + zlib.MAX_WBITS``; when that raises ``zlib.error`` the
        undecoded body is returned instead.

        :type record: WarcRecord
        """
        parsed = ResponseMessage(RequestMessage())
        parsed.feed(record.content[1])
        parsed.close()
        body = parsed.get_body()

        inflater = zlib.decompressobj(16 + zlib.MAX_WBITS)
        try:
            decoded = inflater.decompress(body)
        except zlib.error:
            # Deliberate fallback: hand back the body as parsed.
            return body
        return decoded
Esempio n. 9
0
def dump_record(fh, outzip):
    for (offset, record, errors) in fh.read_records(limit=None):
        if record and record.type == WarcRecord.RESPONSE and record.content[
                0] == ResponseMessage.CONTENT_TYPE:
            message = ResponseMessage(RequestMessage())
            leftover = message.feed(record.content[1])
            message.close()

            outzip.writestr(re.sub(r'^https?://', '', record.url),
                            message.get_body())
            print(record.url)
        elif errors:
            print >> sys.stderr, "warc errors at %s:%d" % (name, offset
                                                           if offset else 0)
            for e in errors:
                print '\t', e
Esempio n. 10
0
  def __call__(self, request):
    """Called by HTTPServer to execute the request."""
    web_match = re.match(self.WEB_RE, request.uri)
    if not web_match:
      web_match = re.match(self.WEB_VIA_PROXY_RE, request.uri)

    if web_match:
      request.host = "warc"
      request.uri = web_match.group("uri")
      request.path, sep, query = request.uri.partition("?")
      self.web_handler.__call__(request)

    else:
      with self.proxy_handler.warc_record_for_uri(canonicalize_url(request.uri)) as record:
        if record:
          print "Serving %s from WARC" % request.uri

          # parse the response
          message = ResponseMessage(RequestMessage())
          message.feed(record[1].content[1])
          message.close()

          body = message.get_body()

          # construct new headers
          new_headers = []
          old_headers = []
          for k, v in message.header.headers:
            if not k.lower() in ("connection", "content-length", "cache-control", "accept-ranges", "etag", "last-modified", "transfer-encoding"):
              new_headers.append((k, v))
            old_headers.append(("X-Archive-Orig-%s" % k, v))

          new_headers.append(("Content-Length", "%d" % len(body)))
          new_headers.append(("Connection", "keep-alive"))

          # write the response
          request.write("%s %d %s\r\n" % (message.header.version, message.header.code, message.header.phrase))
          request.write("\r\n".join([ "%s: %s" % (k,v) for k,v in (new_headers + old_headers) ]))
          request.write("\r\n\r\n")
          request.write(body)

        else:
          print "Could not find %s in WARC" % request.uri
          request.write("HTTP/1.0 404 Not Found\r\nConnection: keep-alive\r\nContent-Type: text/plain\r\nContent-Length: 91\r\n\r\nThis URL is not in any of your archives. Close the WARC viewer to resume normal browsing.\r\n")
      request.finish()
Esempio n. 11
0
    def iter_zip(self):
        """Generate a zip archive of the WARC's HTTP response bodies, chunk by chunk.

        A ``ZipFile`` is opened for writing on ``self``.  Each entry from
        ``self.archive.read_records(limit=None)`` that is a response record
        whose content type, stripped of all whitespace, equals
        ``ResponseMessage.CONTENT_TYPE`` has its parsed HTTP body stored under
        ``self.url_to_filename(record.url)``, and the filename -> URL pair is
        recorded in ``self.files``.  Otherwise, if the entry carries errors,
        they are appended to ``self.errors``.

        The contents of ``self.buffer`` are yielded and the buffer reset after
        every stored record and again once the zip has been closed.  The zip
        ends with a ``files.txt`` index and, if anything was logged, an
        ``errors.txt``.
        """
        # (start, stop) slices of record.date: year, month, day, hour, min, sec.
        # assumes record.date looks like YYYY-MM-DDTHH:MM:SS -- TODO confirm
        date_slices = ((0, 4), (5, 7), (8, 10), (11, 13), (14, 16), (17, 19))

        # presumably ZipFile's writes to ``self`` land in self.buffer; the
        # file-like methods are not visible here -- confirm
        with ZipFile(self, "w") as outzip:
            for (offset, record,
                 errors) in self.archive.read_records(limit=None):
                storable = (
                    record
                    and record.type == WarcRecord.RESPONSE
                    and re.sub(r'\s+', '', record.content[0])
                    == ResponseMessage.CONTENT_TYPE)

                if storable:
                    parsed = ResponseMessage(RequestMessage())
                    parsed.feed(record.content[1])
                    parsed.close()

                    filename = self.url_to_filename(record.url)
                    raw_date = record.date
                    date_time = tuple(
                        int(raw_date[lo:hi]) for lo, hi in date_slices)

                    entry = ZipInfo(filename, date_time)
                    outzip.writestr(entry, parsed.get_body())
                    self.files[filename] = record.url

                    # Emit what has been buffered so far, then start over.
                    for chunk in self.buffer:
                        yield chunk
                    self.buffer = []

                elif errors:
                    # NOTE(review): ``name`` is not defined anywhere in this
                    # method; presumably a module-level global -- confirm,
                    # otherwise this raises NameError.
                    self.errors.append(
                        "warc errors at %s:%d" % (name, offset or 0))
                    self.errors.extend(errors)

            listing = "\n".join(
                "%s -> %s" % (url, stored_name)
                for stored_name, url in self.files.iteritems())
            outzip.writestr("files.txt", listing)
            if self.errors:
                outzip.writestr("errors.txt", "\n".join(self.errors))

        # The ``with`` block has closed the zip; emit what that produced.
        for chunk in self.buffer:
            yield chunk

        self.buffer = []
Esempio n. 12
0
def parse_http_response_charset(record):
    """Parses the payload of an HTTP 'response' record, returning code,
    content type and body.

    Adapted from github's internetarchive/warctools hanzo/warcfilter.py,
    commit 1850f328e31e505569126b4739cec62ffa444223. MIT licenced."""
    message = ResponseMessage(RequestMessage())
    remainder = message.feed(record.content[1])
    message.close()
    if remainder or not message.complete():
        if remainder:
            raise Exception('trailing data in http response for'+str(record.url))
        if not message.complete():
            print Exception('truncated http response for'+str(record.url))
    header = message.header

    mime_type = [v for k,v in header.headers if k.lower() == b'content-type']
    if mime_type:
        mime_type, charset = mime_type[0].split(b';')
    else:
        mime_type = None

    return header.code, mime_type, message.get_body()
Esempio n. 13
0
def parse_http_response(record):
    """Parses the payload of an HTTP 'response' record, returning code,
    content type and body.

    Adapted from github's internetarchive/warctools hanzo/warcfilter.py,
    commit 1850f328e31e505569126b4739cec62ffa444223. MIT licenced."""
    message = ResponseMessage(RequestMessage())
    remainder = message.feed(record.content[1])
    message.close()
    if remainder or not message.complete():
        if remainder:
            print 'trailing data in http response for', record.url
        if not message.complete():
            print 'truncated http response for', record.url
    header = message.header

    mime_type = [v for k, v in header.headers if k.lower() == b'content-type']
    if mime_type:
        mime_type = mime_type[0].split(b';')[0]
    else:
        mime_type = None

    return header.code, mime_type, message.get_body()
Esempio n. 14
0
    def _init_from_warc_record(self, warc_record):
        """Populate this reply from the HTTP response held in ``warc_record``.

        The record is kept on ``self._warc_record``, the reply is opened
        read-only and unbuffered, and its URL is set from the record's url.
        ``content[1]`` of the record is parsed as an HTTP response: each
        parsed header becomes a raw header, the status code and reason phrase
        are stored as request attributes, ``self._check_for_redirect`` is
        called with the status code, and the body is stored in ``self._data``.

        ``metaDataChanged``, ``readyRead`` and ``finished`` are emitted from
        zero-delay single-shot timers rather than directly.
        """
        self._warc_record = warc_record
        self.open(QNetworkReply.ReadOnly | QNetworkReply.Unbuffered)
        record = self._warc_record
        self.setUrl(QUrl(str_to_qstring(record.url)))

        # NOTE(review): the parser is fed but never close()d here -- confirm
        # get_body() is complete without it.
        response = ResponseMessage(RequestMessage())
        response.feed(record.content[1])
        head = response.header

        for header_name, header_value in head.headers:
            self.setRawHeader(header_name, header_value)

        status_code = head.code
        self.setAttribute(QNetworkRequest.HttpStatusCodeAttribute, status_code)
        self.setAttribute(QNetworkRequest.HttpReasonPhraseAttribute,
                          head.phrase)

        self._check_for_redirect(status_code)

        QTimer.singleShot(0, lambda: self.metaDataChanged.emit())

        self._data = response.get_body()

        QTimer.singleShot(0, lambda: self.readyRead.emit())
        QTimer.singleShot(0, lambda: self.finished.emit())
Esempio n. 15
0
    def _init_from_warc_record(self, warc_record):
        """Set this reply up from the HTTP response stored in ``warc_record``.

        Stores the record on ``self._warc_record``, opens the reply read-only
        and unbuffered, and sets the reply URL from the record's url.  The
        record's ``content[1]`` is parsed as an HTTP response; its headers are
        copied as raw headers, its status code and reason phrase are set as
        request attributes, the status code is passed to
        ``self._check_for_redirect``, and its body is stored in ``self._data``.

        The ``metaDataChanged``, ``readyRead`` and ``finished`` signals are
        scheduled through zero-delay single-shot timers.
        """
        self._warc_record = warc_record
        self.open(QNetworkReply.ReadOnly | QNetworkReply.Unbuffered)
        archived = self._warc_record
        self.setUrl(QUrl(str_to_qstring(archived.url)))

        # NOTE(review): feed() is not followed by close() here -- confirm the
        # parsed body is complete without it.
        parsed = ResponseMessage(RequestMessage())
        parsed.feed(archived.content[1])
        status = parsed.header

        for field, text in status.headers:
            self.setRawHeader(field, text)

        code = status.code
        self.setAttribute(QNetworkRequest.HttpStatusCodeAttribute, code)
        self.setAttribute(QNetworkRequest.HttpReasonPhraseAttribute,
                          status.phrase)

        self._check_for_redirect(code)

        QTimer.singleShot(0, lambda: self.metaDataChanged.emit())

        self._data = parsed.get_body()

        QTimer.singleShot(0, lambda: self.readyRead.emit())
        QTimer.singleShot(0, lambda: self.finished.emit())