def main(argv):
	(options, input_files) = parser.parse_args(args=argv[1:])

	if options.strip_404s and not options.decode_http:
		raise RuntimeError("--strip-404s requires --decode_http")

	with open(options.output, "wb") as out:
		if len(input_files) < 1:
			fh = WarcRecord.open_archive(file_handle=sys.stdin, gzip=None, mode="rb")
			try:
				previous_record = None
				for record in fh:
					process(record, previous_record, out, options)
					previous_record = record
			finally:
				fh.close()
		else:
			for name in input_files:
				previous_record = None
				fh = WarcRecord.open_archive(name, gzip="auto", mode="rb")
				try:
					for record in fh:
						process(record, previous_record, out, options)
						previous_record = record
				finally:
					fh.close()

	return 0
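The same read-and-copy loop recurs throughout the examples below: open an archive, iterate its records, and serialise each one to an output stream. A minimal sketch of that loop, assuming the hanzo warctools import path and hypothetical file names:

from hanzo.warctools import WarcRecord  # assumed import path

def copy_warc(in_name="in.warc.gz", out_name="out.warc.gz"):
    # Hypothetical file names; gzip="auto" guesses compression per file,
    # and write_to() re-serialises each record, gzipping one member per
    # record when the output name ends in .gz (as in the examples here).
    fh = WarcRecord.open_archive(in_name, gzip="auto", mode="rb")
    try:
        with open(out_name, "wb") as out:
            for record in fh:
                record.write_to(out, gzip=out_name.endswith(".gz"))
    finally:
        fh.close()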
Example #2
    def process(self, infn, outfn, delete=False):
        """Process a WARC at a given infn, producing plain text via Tika
        where suitable, and writing a new WARC file to outfn."""
        # These are objects of type RecordStream (or a subclass), unlike with
        # the IA library
        inwf = WarcRecord.open_archive(infn, mode='rb')
        outf = open(outfn, 'wb')
        self._openfiles.add(outfn)
#        try:
#            fcntl.lockf(inwf.file_handle, fcntl.LOCK_EX | fcntl.LOCK_NB)
#            fcntl.lockf(outf, fcntl.LOCK_EX | fcntl.LOCK_NB)
#            # Get locks on both files
#        except IOError:
#            print ("Unable to get file locks processing", infn, "so will "
#                   "try later")
#            return False
        print "Processing", infn
        for record in inwf:
            try:
                if record.type == WarcRecord.WARCINFO:
                    self.add_description_to_warcinfo(record)
                elif (record.type == WarcRecord.RESPONSE
                      or record.type == WarcRecord.RESOURCE):
                    if record.get_header('WARC-Segment-Number'):
                        raise WarcTikaException("Segmented response/resource "
                                                "record. Not processing.")
                    else:
                        record = self.generate_new_record(record)
                # If 'metadata', 'request', 'revisit', 'continuation',
                # 'conversion' or something exotic, we can't do anything more
                # interesting than immediately re-writing it to the new file

                newrecord = WarcRecord(headers=record.headers,
                        content=record.content)

            except Exception as e:
                print ("Warning: WARCTikaProcessor.process() failed on "+
                       record.url+": "+str(e.message)+
                       "\n\tWriting old record to new WARC.")
                traceback.print_exc()
                newrecord = record
            finally:
                newrecord.write_to(outf, gzip=outfn.endswith('.gz'))
        print "****Finished file. Tika status codes:", self.tikacodes.items()
        self.tikacodes = defaultdict(int)
        inwf.close()
        outf.close()
        self._openfiles.remove(outfn)

        # Check that the file has written correctly - for an excess of caution
        validrc = os.system("warcvalid "+outfn)

        if validrc:
            print "New file", outfn, "appears not to be valid. Deleting it." 
            os.unlink(outfn)
        if delete and not validrc:
            print "Deleting", infn
            os.unlink(infn)
        return True
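The processor above rebuilds each record from its headers and content before writing it out. A minimal sketch of the same pattern used as a filter, dropping request records while copying; file names are hypothetical and the import path is assumed:

from hanzo.warctools import WarcRecord  # assumed import path

def strip_requests(infn="in.warc.gz", outfn="out.warc.gz"):
    # Copy every record except requests, rebuilding each kept record from
    # its (headers, content) pair as process() does above.
    inwf = WarcRecord.open_archive(infn, gzip="auto", mode="rb")
    try:
        with open(outfn, "wb") as outf:
            for record in inwf:
                if record.type == WarcRecord.REQUEST:
                    continue  # drop request records
                newrecord = WarcRecord(headers=record.headers,
                                       content=record.content)
                newrecord.write_to(outf, gzip=outfn.endswith(".gz"))
    finally:
        inwf.close()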
def dump_payload_from_file(filename, offset=None, length=None, output_filename="/tmp/warc_dump"):
    with closing(WarcRecord.open_archive(filename=filename, gzip="auto", offset=offset, length=length)) as fh:
        return dump_payload_from_stream(fh)
Example #4
 def write_warcinfo_record(self, warc):
     """Writes the initial warcinfo record."""
     headers = [
         (WarcRecord.TYPE, WarcRecord.WARCINFO),
         (WarcRecord.DATE, warc_datetime_str(datetime.now())),
         (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()),
     ]
     data = "software=%s\nhostname=%s\nip=%s" % (self.software, self.hostname, self.ip)
     if self.description is not None:
         data += "\ndescription=%s" % self.description
     record = WarcRecord(headers=headers, content=("application/warc-fields", data))
     record.write_to(warc, gzip=self.gzip)
     warc.flush()
Example #6
  def __init__(self, url_or_io, bytes_range=None):
    if isinstance(url_or_io, str):
      self.archive = WarcRecord.open_archive(file_handle=response_as_file(url_or_io, bytes_range))
    elif isinstance(url_or_io, IterContentAsFile):
      self.archive = WarcRecord.open_archive(file_handle=url_or_io)
    else:
      self.archive = WarcRecord.open_archive(file_handle=stream_as_file("upload.warc.gz", url_or_io))

    self.path_types = {}

    self.files = {}
    self.errors = []

    self.offset = 0
    self.buffer = []
def main(argv):
    (options, args) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if options.output:
        if not os.path.exists(options.output):
            os.makedirs(options.output)
        output_dir =  options.output
    else:
        output_dir  = os.getcwd()

    collisions = 0


    if len(args) < 1:
        log_file = sys.stdout if not options.log_file else open(options.log_file, 'wb')
        log_headers(log_file)
        
        with closing(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)) as fh:
            collisions += unpack_records('<stdin>', fh, output_dir, options.default_name, log_file, options.wayback)
        
    else:
        for filename in args:
            
            log_file = os.path.join(output_dir, os.path.basename(filename)+ '.index.txt') if not options.log_file else options.log_file
            log_file = open(log_file, 'wb')
            log_headers(log_file)
            try:
                with closing(ArchiveRecord.open_archive(filename=filename, gzip="auto")) as fh:
                    collisions+=unpack_records(filename, fh, output_dir, options.default_name, log_file, options.wayback)

            except StandardError, e:
                print >> sys.stderr, "exception in handling", filename, e
Example #8
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(input_files) < 1:
        parser.error("no imput warc file(s)")

    total = 0
    #    print '#WARC filename offset warc-type warc-subject-uri warc-record-id content-type content-length'
    for name in expand_files(input_files):
        fh = WarcRecord.open_archive(name, gzip="auto")

        for (offset, record, errors) in fh.read_records(limit=None):
            if record:
                print name, offset, record.type, record.url, record.id, record.content_type, record.content_length
                total += record.content_length
            elif errors:
                pass
                # ignore
            else:
                pass
                # no errors at tail

        fh.close()
    print total

    return 0
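read_records(limit=None), used here and in several later examples, yields (offset, record, errors) tuples in which record is None when parsing failed at that offset. A minimal sketch that counts both outcomes, with a hypothetical file name and an assumed import path:

from hanzo.warctools import WarcRecord  # assumed import path

def count_records(name="example.warc.gz"):
    # Returns (parsed, failed) counts; errors are reported per offset.
    fh = WarcRecord.open_archive(name, gzip="auto")
    good, bad = 0, 0
    try:
        for offset, record, errors in fh.read_records(limit=None):
            if record:
                good += 1
            elif errors:
                bad += 1
    finally:
        fh.close()
    return good, bad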
 def build_from_warcs(self, warcs):
     for warc in warcs:
         fh = WarcRecord.open_archive(warc, gzip="auto")
         try:
             for (offset, record, errors) in fh.read_records(limit=None):
                 if record:
                     if record.type == WarcRecord.METADATA:
                         for line in StringIO(record.content[1]):
                             if line.startswith("outlink: "):
                                 outlink = line.strip().split()[1]
                                 self.inverted_index[outlink] = record.url
                     if record.type == WarcRecord.RESPONSE:
                         f = FileHTTPResponse(record.content_file)
                         f.begin()
                         if f.status == 200 and record.url.startswith(
                                 "http"):
                             self.crawled_uris.append(
                                 (record.url, f.getheader("content-type"),
                                  record.date, record.content_length))
                 elif errors:
                     pass
                 else:
                     pass
         finally:
             fh.close()
Example #12
    def __init__(self, url_or_io, bytes_range=None):
        if isinstance(url_or_io, str):
            self.archive = WarcRecord.open_archive(
                file_handle=response_as_file(url_or_io, bytes_range))
        elif isinstance(url_or_io, IterContentAsFile):
            self.archive = WarcRecord.open_archive(file_handle=url_or_io)
        else:
            self.archive = WarcRecord.open_archive(
                file_handle=stream_as_file("upload.warc.gz", url_or_io))

        self.path_types = {}

        self.files = {}
        self.errors = []

        self.offset = 0
        self.buffer = []
Example #13
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])

    if options.strip_404s and not options.decode_http:
        raise RuntimeError("--strip-404s requires --decode_http")

    if options.json_hrefs_file and not options.decode_http:
        raise RuntimeError("--json-hrefs-file requires --decode_http")

    if options.json_hrefs_file:
        found_hrefs = set()
    else:
        found_hrefs = None

    with open(options.output, "wb") as out:
        if len(input_files) < 1:
            fh = WarcRecord.open_archive(file_handle=sys.stdin,
                                         gzip=None,
                                         mode="rb")
            try:
                previous_record = None
                for record in fh:
                    process(record, previous_record, out, options, found_hrefs)
                    previous_record = record
            finally:
                fh.close()
        else:
            for name in input_files:
                previous_record = None
                fh = WarcRecord.open_archive(name, gzip="auto", mode="rb")
                try:
                    for record in fh:
                        process(record, previous_record, out, options,
                                found_hrefs)
                        previous_record = record
                finally:
                    fh.close()

    if found_hrefs is not None:
        fh = bz2.BZ2File(options.json_hrefs_file, "wb")
        try:
            fh.write("\n".join(sorted(found_hrefs)) + "\n")
        finally:
            fh.close()

    return 0
Example #14
    def run(self):
        path = self.path
        idx_file = "%s.idx" % path

        records = None

        if os.path.exists(idx_file) and os.path.getmtime(
                idx_file) >= os.path.getmtime(path):
            print "Loading " + path + " from cache"
            self.status = "loading-cache"
            with open(idx_file, "rb") as f:

                def update_progress():
                    self.bytes_read = f.tell()

                f_pr = IOWithProgress(f, update_progress)
                data = cPickle.load(f_pr)
            self.bytes_read = self.bytes_total

            if "version" in data and data["version"] == 1:
                records = data["records"]

        if not records:
            self.status = "indexing"
            self.bytes_total = os.path.getsize(self.path)

            print "Loading " + path
            records = OrderedDict()
            warc = WarcRecord.open_archive(path, gzip="auto")
            for (offset, record, errors) in warc.read_records(limit=None):
                if self.cancel:
                    raise Exception("Loading " + path + " canceled")

                if record and re.sub(
                        r"[^a-z;=/]+", "",
                        record.type) == WarcRecord.RESPONSE and re.sub(
                            r"[^a-z;=/]+", "",
                            record.content[0]) == ResponseMessage.CONTENT_TYPE:
                    http_response = parse_http_response(record)
                    records[canonicalize_url(record.url)] = {
                        "offset": offset,
                        "code": http_response[0],
                        "type": http_response[1]
                    }

                self.bytes_read = offset

            warc.close()

            with open(idx_file, "wb") as f:
                cPickle.dump({"version": 1, "records": records}, f)

        if self.cancel:
            raise Exception("Loading " + path + " canceled")

        print "Indexed " + path + ". Found " + str(len(records)) + " URLs"
        self.status = "indexed"
        self.records = records
Example #15
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(input_files) < 1:
        fh = WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)

        for record in fh:
            process(record, out, options)
    else:
        for name in input_files:
            fh = WarcRecord.open_archive(name, gzip="auto")
            for record in fh:
                process(record, out, options)

            fh.close()

    return 0
Example #16
def create_metadata_record_bytes(
    url='http://example.com/',
    content_type='image/png',
    date='2016-08-03T10:49:41Z',
    content=b'',
    include_block_digest=True):
    """Build WARC metadata record bits."""

    headers = {
        WarcRecord.TYPE: WarcRecord.METADATA,
        WarcRecord.URL: url.encode('utf-8'),
        WarcRecord.CONTENT_TYPE: content_type.encode('utf-8'),
        WarcRecord.DATE: date.encode('utf-8')
        }
    if include_block_digest:
        hasher = hashlib.sha1(content)
        block_digest = base64.b32encode(hasher.digest())
        headers[WarcRecord.BLOCK_DIGEST] = b'sha1:' + block_digest

    # XXX - I wish I could use WarcRecord. Current implementation of
    # WarcRecord.write_to() ignores Warc-Block-Digest passed and writes out
    # hex-encoded SHA256 calculated from the content.
    out = io.BytesIO()
    if False:
        rec = WarcRecord(
            headers=headers.items(),
            content=(content_type.encode('utf-8'), content)
            )
        out = io.BytesIO()
        rec.write_to(out, gzip=True)
        return out.getvalue()
    else:
        z = GzipFile(fileobj=out, mode='wb')
        z.write(b'WARC/1.0\r\n')
        for k, v in headers.items():
            z.write(b''.join((k, b': ', v, b'\r\n')))
        z.write('Content-Length: {}\r\n'.format(len(content)).encode('ascii'))
        z.write(b'\r\n')
        z.write(content)
        z.write(b'\r\n\r\n')
        z.flush()
        z.close()
        return out.getvalue()
Example #17
    def _load_warc_info(self):
        self._warc_file_read.seek(0)
        wrs = WarcRecord.open_archive(file_handle=self._warc_file_read, \
                gzip="record")
        temp = wrs.read_records(limit=1)

        if not temp or (temp[0].type != WarcRecord.WARCINFO):
            raise ValueError("WARC info not found")

        return temp[0]
Example #20
    def write_record(self, headers, mime, data):
        """Writes a WARC record.

        Arguments:
        headers -- Array of WARC headers.
        mime -- MIME type of the data.
        data -- the data block.

        """
        record = WarcRecord(headers=headers, content=(mime, data))
        logger.debug("Getting WARC: %s" % str(self.warcs.keys()))
        name = self.pool.get()
        logger.debug("Writing to: %s" % name)
        fh = self.warcs[name]
        record.write_to(fh, gzip=self.gzip)
        fh.flush()
        if not self.warc_reached_max_size(name):
            logger.debug("%s undersized; adding back to the pool." % name)
            self.pool.put(name)
Example #22
 def readRecord(filename, offset):
     """
     :type filename: str
     :type offset: int
     :rtype : WarcRecord
     """
     w = WarcRecord.open_archive(filename, offset=offset)
     g = w.read_records(limit=1)
     r = g.next()[1]
     w.close()
     return r
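readRecord() above shows the random-access pattern: open the archive at a known byte offset and read a single record. A sketch of the same idea that unpacks the (offset, record, errors) tuple instead of calling next() on the generator; the import path is assumed:

from hanzo.warctools import WarcRecord  # assumed import path

def record_at(filename, offset):
    # Open the archive positioned at a known offset and return the first
    # (and only) record read there.
    w = WarcRecord.open_archive(filename, gzip="auto", offset=offset)
    try:
        for _offset, record, _errors in w.read_records(limit=1):
            return record
    finally:
        w.close()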
Example #23
 def loadWarcFileRecords(name):
     """ Generator function for records from the file 'name' """
     f = WarcRecord.open_archive(name, gzip="auto")
     for (offset, r, err) in f.read_records(limit=None):
         if err:
             print "warc errors at %s:%d" % (name, offset or 0)
             for e in err:
                 print '\t', e
         if r:
             yield (r, offset)
     f.close()
Example #27
    def find_record(self, url):
        self._warc_file_read.seek(0)
        wrs = WarcRecord.open_archive(file_handle=self._warc_file_read, \
                gzip="record")

        for (offset, record, errors) in wrs.read_records(limit=None):
            if record and (record.type == WarcRecord.RESPONSE) \
                    and (record.content[0] == ResponseMessage.CONTENT_TYPE) \
                    and (record.url == url):
                return record

        return None
Example #29
    def _init_file(self):
        warcinfo_headers = [
            (WarcRecord.TYPE, WarcRecord.WARCINFO),
            (WarcRecord.ID, WarcRecord.random_warc_uuid()),
            (WarcRecord.DATE, warc.warc_datetime_str(datetime.utcnow())),
            (WarcRecord.FILENAME, os.path.basename(self._file_name)),
            (Warc.MAIN_URL, self._main_url),
        ]

        warcinfo_fields = "\r\n".join([
            "software: bardo",
            "format: WARC File Format 1.0",
            "conformsTo: " + CONFORMS_TO,
            "robots: unknown",
        ])

        warcinfo_content = ("application/warc-fields", warcinfo_fields)

        warcinfo_record = WarcRecord(headers=warcinfo_headers, \
                content=warcinfo_content)

        self.write_record(warcinfo_record)
Example #30
  def warc_record_for_uri(self, uri):
    found = False
    for (path, uris) in self.indices.iteritems():
      if uri in uris:
        warc = WarcRecord.open_archive(path, gzip="auto")
        warc.seek(uris[uri]["offset"])

        for record in warc.read_records(limit=1, offsets=uris[uri]["offset"]):
          found = True
          yield record

        warc.close()

    if not found:
      yield None
Example #31
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])

    try: # python3
        out = sys.stdout.buffer
    except AttributeError: # python2
        out = sys.stdout

    if len(input_files) < 1:
        fh = WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)

        for record in fh:
            process(record, out, options)
    else:
        for name in expand_files(input_files):
            fh = WarcRecord.open_archive(name, gzip="auto")
            for record in fh:
                process(record, out, options)

            fh.close()



    return 0
Example #32
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(input_files) < 1:
        dump_archive(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None), name="-",offsets=False)
        
    else:
        for name in input_files:
            fh = ArchiveRecord.open_archive(name, gzip="auto")
            dump_archive(fh,name)

            fh.close()


    return 0
Example #33
def doc_from_warc(infn, gzip='auto'):
    """Generator to process a WARC at a given infn."""
    # These are objects of type RecordStream (or a subclass), unlike with
    # the IA library
    inwf = WarcRecord.open_archive(infn, mode='rb', gzip=gzip)
    sys.stderr.write("Processing "+str(infn)+"\n")
    for record in inwf:
#                print "\nStarting record: "+str(record.url)
        try:
            if record.get_header('WARC-Segment-Number'):
                raise Exception("Segmented response/resource record "
                                "for "+record.url+". Not processing.")
            # We can process resource records (and conversion records,
            # which we assume are all of resource type (contain a document
            # rather than an HTTP transaction with nested document). This
            # may be unsafe, but conversion records are almost unknown in
            # the wild. The only ones we'll be handling here are those
            # output from WarcTika, which are in that format.
            # TODO: generalise this.
            # We also handle HTTP response records.
            if (record.type == WarcRecord.RESPONSE and
                  record.url.startswith('http')):
                httpcode, mimetype, charset, body = parse_http_response_charset(record)

            elif (record.type == WarcRecord.RESOURCE
                  or record.type == WarcRecord.CONVERSION):
                mimetype, body = record.content
                httpcode = 200 # "Success" for stored content
                charset = None # Not recorded
                
            # If 'metadata', 'request', 'revisit', 'continuation',
            # or something exotic, we can't do anything interesting
            elif (record.type == WarcRecord.METADATA
                  or record.type == WarcRecord.WARCINFO
                  or record.type == WarcRecord.REQUEST):
                continue
            else:
                sys.stderr.write("Can't handle "+str(record.type)+", "+str(record.url)+"\n")
                continue
            yield (record.url, mimetype, body, httpcode, charset)
        except Exception:
            # General catch to avoid multiprocessing taking down the whole job
            # for one bogus record
            sys.stderr.write("\n\n***** Uncaught exception reading "+record.url
                             +" from file "+infn+":\n")
            traceback.print_exc()
            sys.stderr.write("Continuing.\n\n\n")
    inwf.close()
Example #34
  def run(self):
    path = self.path
    idx_file = "%s.idx" % path

    records = None

    if os.path.exists(idx_file) and os.path.getmtime(idx_file) >= os.path.getmtime(path):
      print "Loading " + path + " from cache"
      self.status = "loading-cache"
      with open(idx_file, "rb") as f:
        def update_progress():
          self.bytes_read = f.tell()
        f_pr = IOWithProgress(f, update_progress)
        data = cPickle.load(f_pr)
      self.bytes_read = self.bytes_total

      if "version" in data and data["version"] == 1:
        records = data["records"]
    
    if not records:
      self.status = "indexing"
      self.bytes_total = os.path.getsize(self.path)

      print "Loading " + path
      records = OrderedDict()
      warc = WarcRecord.open_archive(path, gzip="auto")
      for (offset, record, errors) in warc.read_records(limit=None):
        if self.cancel:
          raise Exception("Loading " + path + " canceled")

        if record and re.sub(r"[^a-z;=/]+", "", record.type) == WarcRecord.RESPONSE and re.sub(r"[^a-z;=/]+", "", record.content[0]) == ResponseMessage.CONTENT_TYPE:
          http_response = parse_http_response(record)
          records[canonicalize_url(record.url)] = { "offset":offset, "code":http_response[0], "type":http_response[1] }

        self.bytes_read = offset

      warc.close()

      with open(idx_file, "wb") as f:
        cPickle.dump({ "version": 1, "records": records }, f)

    if self.cancel:
      raise Exception("Loading " + path + " canceled")

    print "Indexed "+path+". Found "+str(len(records))+" URLs"
    self.status = "indexed"
    self.records = records
Example #35
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(input_files) < 1:
        dump_archive(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None),
                     name="-",
                     offsets=False)

    else:
        for name in input_files:
            fh = ArchiveRecord.open_archive(name, gzip="auto")
            dump_archive(fh, name)

            fh.close()

    return 0
def warcinfo_record(warc_filename):
    """Return warcinfo WarcRecord.
    Required to write in the beginning of a WARC file.
    """
    warc_date = warc_datetime_str(datetime.utcnow())
    metadata = "\r\n".join((
        "format: WARC File Format 1.0",
        "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf"
    ))
    return WarcRecord(headers=[
        (WarcRecord.TYPE, WarcRecord.WARCINFO),
        (WarcRecord.CONTENT_TYPE, b'application/warc-fields'),
        (WarcRecord.ID, warc_uuid(metadata + warc_date)),
        (WarcRecord.DATE, warc_date), (WarcRecord.FILENAME, warc_filename)
    ],
                      content=(b'application/warc-fields', metadata + "\r\n"),
                      version=b"WARC/1.0")
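warcinfo_record() above only builds the record; a minimal sketch of how it would typically be used, writing it as the first record of a fresh WARC file (the file name is hypothetical):

def start_warc(path="new.warc.gz"):
    # Write the warcinfo record first, then hand back the open file so
    # response/resource records can be appended after it.
    out = open(path, "wb")
    warcinfo_record(path).write_to(out, gzip=path.endswith(".gz"))
    return out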
  def process_file(self, filename):
    f = WarcRecord.open_archive(filename, gzip="auto")

    for (offset, record, errors) in f.read_records(limit=None):
      if record:
        if record.type=="response":
          self._process_response(record)
        elif record.type=="request":
          self._process_request(record)
        elif record.type=="resource":
          self._process_resource(record)
      elif errors:
        raise WarcException, "Cannot decode WARC: %s" % errors

    self.current_request = None

    f.close()
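process_file() above dispatches on record.type and delegates responses to _process_response(). A sketch of the kind of inspection such a handler can do, assuming (as the other examples on this page do) that a response record carries the raw HTTP message in record.content[1], so the status line is its first CRLF-terminated line; the file name and import path are assumptions:

from hanzo.warctools import WarcRecord  # assumed import path

def response_status_lines(name="example.warc.gz"):
    # Yield (url, status line) for every response record in the archive.
    fh = WarcRecord.open_archive(name, gzip="auto")
    try:
        for _offset, record, _errors in fh.read_records(limit=None):
            if record and record.type == WarcRecord.RESPONSE:
                status_line = record.content[1].split(b"\r\n", 1)[0]
                yield record.url, status_line
    finally:
        fh.close()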
Example #38
def main(argv):
    (options, args) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(args) < 1:
        # dump the first record on stdin
        with closing(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)) as fh:
            dump_record(fh)
        
    else:
        filename = args[0]
        zipfilename = args[1]

        with ZipFile(zipfilename, "w") as outzip:
            with closing(ArchiveRecord.open_archive(filename=filename, gzip="auto")) as fh:
                dump_record(fh, outzip)


    return 0
Example #39
def main(argv):
    (options, args) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(args) < 1:
        # dump the first record on stdin
        with closing(WarcRecord.open_archive(file_handle=sys.stdin,
                                             gzip=None)) as fh:
            dump_record(fh)

    else:
        filename = args[0]
        zipfilename = args[1]

        with ZipFile(zipfilename, "w") as outzip:
            with closing(
                    ArchiveRecord.open_archive(filename=filename,
                                               gzip="auto")) as fh:
                dump_record(fh, outzip)

    return 0
 def build_from_warcs(self, warcs):
     for warc in warcs:
         fh = WarcRecord.open_archive(warc, gzip="auto")
         try:
             for (offset, record, errors) in fh.read_records(limit=None):
                 if record:
                     if record.type == WarcRecord.METADATA:
                         for line in StringIO(record.content[1]):
                             if line.startswith("outlink: "):
                                 outlink = line.strip().split()[1]
                                 self.inverted_index[outlink] = record.url
                     if record.type == WarcRecord.RESPONSE:
                         f = FileHTTPResponse(record.content_file)
                         f.begin()
                         if f.status == 200 and record.url.startswith("http"):
                             self.crawled_uris.append((record.url, f.getheader("content-type"), record.date, record.content_length))
                 elif errors:
                     pass
                 else:
                     pass
         finally:
             fh.close()
Example #41
def read_record(path, num_pages=10):
    warcr = WarcRecord.open_archive(path, gzip='auto')
    i = 0
    documents = []
    urls = []
    for record in warcr:
        if i >= num_pages:
            break
        if record.type == b'response' and record.content[
                0] == b'application/http; msgtype=response':
            url = ""
            for (h, v) in record.headers:
                if h == b'WARC-Target-URI':
                    url = str(v, errors="ignore")
            # domain = re.sub(r'^(www\.)?','',urlparse(url.decode("ISO-8859-1"))[1].lower())
            # urls.append(url.decode("ISO-8859-1").lower())
            urls.append(url)
            # documents.append(extract_text(record.content[1].decode("ISO-8859-1")))
            documents.append(
                extract_text(str(record.content[1], errors="ignore")))
            i += 1
    return documents, urls
Example #43
def main(argv):
    (options, args) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(args) < 1:
        # dump the first record on stdin
        with closing(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)) as fh:
            dump_record(fh)
        
    else:
        # dump a record from the filename, with optional offset
        filename = args[0]
        if len(args) > 1:
            offset = int(args[1])
        else:
            offset = 0

        with closing(ArchiveRecord.open_archive(filename=filename, gzip="auto")) as fh:
            fh.seek(offset)
            dump_record(fh)


    return 0
Example #44
def tweet_warc_record(tweet_json):
    """Parse Tweet JSON and return WarcRecord.
    """
    try:
        tweet = json.loads(tweet_json)
        # skip deleted tweet
        if 'user' not in tweet:
            return
        url = "https://twitter.com/%s/status/%s" % (
            tweet['user']['screen_name'], tweet['id'])
    except Exception as ex:
        logging.error('error in tweet_warc_record', exc_info=1)
        return None

    warc_date = warc_datetime_str(
        datetime.utcfromtimestamp(float(tweet['timestamp_ms']) / 1000.0))
    return WarcRecord(headers=[(WarcRecord.TYPE, WarcRecord.RESOURCE),
                               (WarcRecord.CONTENT_TYPE, b'application/json'),
                               (WarcRecord.ID, warc_uuid(url + warc_date)),
                               (WarcRecord.URL, url),
                               (WarcRecord.DATE, warc_date)],
                      content=(b'application/json', tweet_json + "\r\n"),
                      version=b"WARC/1.0")
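tweet_warc_record() builds a resource record from scratch; a minimal sketch of the same construction for an arbitrary payload, appended straight to a WARC file. The import paths, file name, URL and payload are assumptions; the header constants and version string are the ones used in the examples here:

from datetime import datetime

from hanzo.warctools import WarcRecord, warc  # assumed import paths

def append_resource(out_path="out.warc.gz",
                    url="http://example.com/resource",
                    payload=b"hello"):
    # Build a minimal resource record and append it as one gzip member.
    record = WarcRecord(
        headers=[(WarcRecord.TYPE, WarcRecord.RESOURCE),
                 (WarcRecord.ID, WarcRecord.random_warc_uuid()),
                 (WarcRecord.URL, url),
                 (WarcRecord.DATE, warc.warc_datetime_str(datetime.utcnow()))],
        content=(b"text/plain", payload),
        version=b"WARC/1.0")
    with open(out_path, "ab") as fh:
        record.write_to(fh, gzip=True)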
Example #45
def main(argv):
    (options, args) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(args) < 1:
        # dump the first record on stdin
        with closing(WarcRecord.open_archive(file_handle=sys.stdin,
                                             gzip=None)) as fh:
            dump_record(fh)

    else:
        # dump a record from the filename, with optional offset
        filename = args[0]
        if len(args) > 1:
            offset = int(args[1])
        else:
            offset = 0

        with closing(ArchiveRecord.open_archive(filename=filename,
                                                gzip="auto")) as fh:
            fh.seek(offset)
            dump_record(fh)

    return 0
r.seed(1818118181) # Arbitrary

content = []
rejects = defaultdict(int)

#Load all the objects into memory first
try:
    with open(picklefn, "rb") as fh:
        print "Unpickling selected sample."
        content = pickle.load(fh)
except IOError:
    print "Pickled file does not appear to exist. Loading content."
    for fn in os.listdir(dirname):
        if not fn.endswith('.warc.gz'):
            continue
        wf = WarcRecord.open_archive(dirname+'/'+fn, mode='rb')
        try:
            print fn
            for record in wf:
                if not record.type in [WarcRecord.RESPONSE,
                                       WarcRecord.RESOURCE,
                                       WarcRecord.CONVERSION]:
                    continue
                if (record.type == WarcRecord.RESPONSE
                        and record.url.startswith('http')):
                    ccode, cmime, cbody = parse_http_response(record)
                    if ccode not in successcodes:
                        continue
                else:
                    ccode = None
                    cmime = record.content[0]
Example #47
r.seed(1818118181)  # Arbitrary

content = []
rejects = defaultdict(int)

#Load all the objects into memory first
try:
    with open(picklefn, "rb") as fh:
        print "Unpickling selected sample."
        content = pickle.load(fh)
except IOError:
    print "Pickled file does not appear to exist. Loading content."
    for fn in os.listdir(dirname):
        if not fn.endswith('.warc.gz'):
            continue
        wf = WarcRecord.open_archive(dirname + '/' + fn, mode='rb')
        try:
            print fn
            for record in wf:
                if not record.type in [
                        WarcRecord.RESPONSE, WarcRecord.RESOURCE,
                        WarcRecord.CONVERSION
                ]:
                    continue
                if (record.type == WarcRecord.RESPONSE
                        and record.url.startswith('http')):
                    ccode, cmime, cbody = parse_http_response(record)
                    if ccode not in successcodes:
                        continue
                else:
                    ccode = None
Example #48
    webbase_header = "==P=>>>>=i===<<<<=T===>=A===<=!Junghoo!==>"
    content = ""
    headers = [("WARC-Filename", filename), ("WARC-Type", "response")]
    finished_headers = False
    first_line = fh.readline()
    assert first_line.startswith(webbase_header)
    for line in fh:
        if line.startswith(webbase_header):
            yield headers, ("text/html", content)
            content = ""
        else:
            if finished_headers:
                content += line
            elif "" == line.strip():
                finished_headers = True
            else:
                add_header(headers, line)


i = 0
warc_out = open("out.warc.gz", "w")
for headers, content in get_wb_record("2pages"):
    print i
    i += 1
    # print headers
    # print content
    record = WarcRecord(headers=headers, content=content)
    record.write_to(warc_out, gzip=True)
    record.dump()
    print "_" * 80
Example #49
    def _reply_finished(self):
        self._network_reply.readyRead.disconnect(self._reply_ready_read)
        self._network_reply.finished.disconnect(self._reply_finished)
        self._network_reply.error.disconnect(self._reply_error)

        status_code = self._network_reply.attribute(QNetworkRequest \
                .HttpStatusCodeAttribute)

        if not status_code.isValid():
            self._temp_data.close()
            self._temp_data = None
            self._network_reply = None

            QTimer.singleShot(0, lambda: self.finished.emit())

            return

        headers = dict()

        for header in self._network_reply.rawHeaderList():
            temp = str(self._network_reply.rawHeader(header))
            headers[str(header)] = re.sub("\s", " ", temp)

        elements = []

        for name, value in headers.iteritems():
            elements.append(name + ": " + value)

        elements.append("")

        url = qstring_to_str(self._network_reply.url().toString())

        status_msg = self._network_reply.attribute(QNetworkRequest \
                .HttpReasonPhraseAttribute)

        assert (status_msg.isValid())

        self._temp_data.seek(0)

        # XXX: we can't get HTTP version from Qt webkit, assumes 1.1
        h_status = "HTTP/1.1 " + str(status_code.toString()) + " " \
                + str(status_msg.toString())

        content_data = h_status + "\r\n" \
                + "\r\n".join(elements) + "\r\n" \
                + self._temp_data.read()

        content_type = ResponseMessage.CONTENT_TYPE

        content = (content_type, content_data)

        wr = warc.make_response(WarcRecord.random_warc_uuid(),
                                warc.warc_datetime_str(datetime.utcnow()), url,
                                content, None)

        self._temp_data.close()
        self._temp_data = None

        self.manager().current_warc.write_record(wr)

        self._init_from_warc_record(wr)

        self._network_reply = None
Example #50
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if options.output:
        out = open(options.output, 'wb')
        if options.output.endswith('.gz'):
            options.gzip = True
    if len(input_files) < 1:
        parser.error("no imput warc file(s)")
        
    for name in input_files:
        fh = ArcRecord.open_archive(name, gzip="auto")

        filedesc = None

        warcinfo_id = None
        for record in fh:
            version = "WARC/1.0"

            warc_id = make_warc_uuid(record.url+record.date)
            headers = [
                (WarcRecord.ID, warc_id),
            ]
            if record.date:
                date = datetime.datetime.strptime(record.date,'%Y%m%d%H%M%S')
                headers.append((WarcRecord.DATE, warc_datetime_str(date)))


            if record.type == 'filedesc':
                warcinfo_id = warc_id

                warcinfo_headers = list(headers)
                warcinfo_headers.append((WarcRecord.FILENAME, record.url[11:]))
                warcinfo_headers.append((WarcRecord.TYPE, WarcRecord.WARCINFO))

                warcinfo_content = ('application/warc-fields', 'software: hanzo.arc2warc\r\n')

                warcrecord = WarcRecord(headers=warcinfo_headers, content=warcinfo_content, version=version)
                warcrecord.write_to(out, gzip=options.gzip)

                warc_id = make_warc_uuid(record.url+record.date+"-meta")
                warcmeta_headers = [
                    (WarcRecord.TYPE, WarcRecord.METADATA),
                    (WarcRecord.CONCURRENT_TO, warcinfo_id),
                    (WarcRecord.ID, warc_id),
                    (WarcRecord.URL, record.url),
                    (WarcRecord.DATE, warcrecord.date),
                    (WarcRecord.WARCINFO_ID, warcinfo_id),
                ]
                warcmeta_content =('application/arc', record.raw())

                warcrecord = WarcRecord(headers=warcmeta_headers, content=warcmeta_content, version=version)
                warcrecord.write_to(out, gzip=options.gzip)
            else:
                content_type, content = record.content
                if record.url.startswith('http'):
                    # don't promote content-types for http urls,
                    # they contain headers in the body.
                    content_type="application/http;msgtype=response"

                headers.extend([
                    (WarcRecord.TYPE, WarcRecord.RESPONSE ),
                    (WarcRecord.URL,record.url),
                    (WarcRecord.WARCINFO_ID, warcinfo_id),
                ])
            
                warcrecord = WarcRecord(headers=headers, content=(content_type, content), version=version)

                warcrecord.write_to(out, gzip=options.gzip)


        fh.close()



    return 0
Example #51
uuidsexcluded = set()

exclist = parse_exc_args(args.pattern)

# In theory this could be agnostic as to whether the stream is compressed or
# not. In practice, the gzip guessing code reads the stream for marker bytes
# and then attempts to rewind, which fails for stdin unless an elaborate
# stream wrapping class is set up.
gzi = 'auto'
if args.gzipped_input:
    gzi = 'record'
elif args.plain_input:
    gzi = False

if args.in_filename is None:
    inwf = WarcRecord.open_archive(file_handle=sys.stdin,
                                   mode='rb', gzip=gzi)
else:
    inwf = WarcRecord.open_archive(filename=args.in_filename,
                                   mode='rb', gzip=gzi)

#####
#MAIN
#####

outf = sys.stdout
if args.out_filename is not None:
    outf = open(args.out_filename, 'wb')

for record in inwf:
    # How many matches constitutes failure?
    write = len(exclist)
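The comment at the top of the previous example explains why compression guessing cannot work on stdin. Across this page, open_archive() is called with gzip="auto" (guess per file), gzip="record" (treat the input as per-record gzip members), and gzip=None or False (plain, uncompressed input). A small sketch of that choice, mirroring the argument handling above; the name and import path are assumptions:

import sys

from hanzo.warctools import WarcRecord  # assumed import path

def open_warc(name=None, gzip_mode="auto"):
    # No filename: read stdin and disable guessing, since stdin cannot be
    # rewound after the marker bytes have been inspected.
    if name is None:
        return WarcRecord.open_archive(file_handle=sys.stdin, gzip=None, mode="rb")
    return WarcRecord.open_archive(filename=name, gzip=gzip_mode, mode="rb")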
Example #52
    def _reply_finished(self):
        self._network_reply.readyRead.disconnect(self._reply_ready_read)
        self._network_reply.finished.disconnect(self._reply_finished)
        self._network_reply.error.disconnect(self._reply_error)

        status_code = self._network_reply.attribute(QNetworkRequest \
                .HttpStatusCodeAttribute)

        if not status_code.isValid():
            self._temp_data.close()
            self._temp_data = None
            self._network_reply = None

            QTimer.singleShot(0, lambda: self.finished.emit())

            return

        headers = dict()

        for header in self._network_reply.rawHeaderList():
            temp = str(self._network_reply.rawHeader(header))
            headers[str(header)] = re.sub("\s", " ", temp)

        elements = []

        for name, value in headers.iteritems():
            elements.append(name + ": " + value)

        elements.append("")

        url = qstring_to_str(self._network_reply.url().toString())

        status_msg = self._network_reply.attribute(QNetworkRequest \
                .HttpReasonPhraseAttribute)

        assert(status_msg.isValid())

        self._temp_data.seek(0)

        # XXX: we can't get HTTP version from Qt webkit, assumes 1.1
        h_status = "HTTP/1.1 " + str(status_code.toString()) + " " \
                + str(status_msg.toString())

        content_data = h_status + "\r\n" \
                + "\r\n".join(elements) + "\r\n" \
                + self._temp_data.read()

        content_type = ResponseMessage.CONTENT_TYPE

        content = (content_type, content_data)

        wr = warc.make_response(WarcRecord.random_warc_uuid(),
                warc.warc_datetime_str(datetime.utcnow()), url, content, None)

        self._temp_data.close()
        self._temp_data = None

        self.manager().current_warc.write_record(wr)

        self._init_from_warc_record(wr)

        self._network_reply = None
	def __init__( self, warc ):
		self.warc = warc
		logger.debug( "Mounting %s" % self.warc )
		self.fh = WarcRecord.open_archive( warc, gzip="auto", mode="rb" )
		self.tree = Tree()
		self._get_records()
Example #54
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if options.output:
        out = open(options.output, 'ab')
        if options.output.endswith('.gz'):
            options.gzip = True
    if len(input_files) < 1:
        parser.error("no imput warc file(s)")

    for name in input_files:
        fh = ArcRecord.open_archive(name, gzip="auto")

        filedesc = None

        warcinfo_id = None
        for record in fh:
            version = "WARC/1.0"

            warc_id = make_warc_uuid(record.url + record.date)
            headers = [
                (WarcRecord.ID, warc_id),
            ]
            if record.date:
                date = datetime.datetime.strptime(record.date, '%Y%m%d%H%M%S')
                headers.append((WarcRecord.DATE, warc_datetime_str(date)))

            if record.type == 'filedesc':
                warcinfo_id = warc_id

                warcinfo_headers = list(headers)
                warcinfo_headers.append((WarcRecord.FILENAME, record.url[11:]))
                warcinfo_headers.append((WarcRecord.TYPE, WarcRecord.WARCINFO))

                warcinfo_content = ('application/warc-fields',
                                    'software: hanzo.arc2warc\r\n')

                warcrecord = WarcRecord(headers=warcinfo_headers,
                                        content=warcinfo_content,
                                        version=version)
                warcrecord.write_to(out, gzip=options.gzip)

                warc_id = make_warc_uuid(record.url + record.date + "-meta")
                warcmeta_headers = [
                    (WarcRecord.TYPE, WarcRecord.METADATA),
                    (WarcRecord.CONCURRENT_TO, warcinfo_id),
                    (WarcRecord.ID, warc_id),
                    (WarcRecord.URL, record.url),
                    (WarcRecord.DATE, warcrecord.date),
                    (WarcRecord.WARCINFO_ID, warcinfo_id),
                ]
                warcmeta_content = ('application/arc', record.raw())

                warcrecord = WarcRecord(headers=warcmeta_headers,
                                        content=warcmeta_content,
                                        version=version)
                warcrecord.write_to(out, gzip=options.gzip)
            else:
                content_type, content = record.content
                if record.url.startswith('http'):
                    # don't promote content-types for http urls,
                    # they contain headers in the body.
                    content_type = "application/http;msgtype=response"

                headers.extend([
                    (WarcRecord.TYPE, WarcRecord.RESPONSE),
                    (WarcRecord.URL, record.url),
                    (WarcRecord.WARCINFO_ID, warcinfo_id),
                ])

                warcrecord = WarcRecord(headers=headers,
                                        content=(content_type, content),
                                        version=version)

                warcrecord.write_to(out, gzip=options.gzip)

        fh.close()

    return 0