Esempio n. 1
0
    def ShrinkRay():
        """Transcode the sampled video to a small WebM and archive the result.

        Runs ffmpeg on ``samplethis.flv`` with ``FFREPORT`` pointing at
        ``ffmpeg-shrinking.log``, then appends two records to
        ``new_warc_file``: a ``resource`` record carrying the ffmpeg log and a
        ``conversion`` record carrying ``shrunken-to-webm.webm``.

        ``warcinfo_record_ID``, ``metadata_record_ID``, ``truncated_record_ID``
        and ``new_warc_file`` are not defined here; they must be provided by
        an enclosing or global scope.
        """

        # TODO:
        # figure out length of video and develop number of frames to
        # drop out of every FPS interval.

        print(
            "********************* \n\n Shrinking Video. (This will take a while) \n\n*********************"
        )

        os.environ["FFREPORT"] = "file=ffmpeg-shrinking.log"

        # shrink; using the webm format at this resolution cuts the file size by
        # *about* an order of magnitude, while still maintaining more-or-less
        # perfectly crisp detail and motion. I'm thinking we don't need to drop
        # frames, and that cutting the resolution down to this ~240P-level
        # resolution is good enough.

        # We really need to check for resolution and select an output resolution
        # appropriately; this one-liner only works for 16:9 inputs

        ffmpegshrinkargs = shlex.split(
            "ffmpeg -i samplethis.flv -c:v libvpx -b:v 500K -c:a libvorbis -s 432x243 shrunken-to-webm.webm"
        )
        call(ffmpegshrinkargs)

        # The final size of snapshots and shrunken video is anywhere from a fifth to
        # a seventh of the original file size.

        os.environ["FFREPORT"] = ""

        # add ffmpeg log record
        ffmpegshrinkheader = warc.WARCHeader({
            "WARC-Type":
            "resource",
            "WARC-Warcinfo-ID":
            warcinfo_record_ID,
            "Content-Type":
            "text/plain",
            "WARC-Concurrent-To":
            metadata_record_ID
        })
        # Read through a context manager so the handle is closed promptly.
        with open("ffmpeg-shrinking.log") as logfile:
            ffmpegshrinkpayload = logfile.read()
        ffmpegshrinkrecord = warc.WARCRecord(headers=ffmpegshrinkheader,
                                             payload=ffmpegshrinkpayload)
        new_warc_file.write_record(ffmpegshrinkrecord)

        # add actual shrunken webm record; the video is binary data, so it is
        # read in binary mode
        with open("shrunken-to-webm.webm", "rb") as webmfile:
            webmpayload = webmfile.read()
        shrinkrecord = warc.WARCRecord(
            headers=warc.WARCHeader({
                "WARC-Type": "conversion",
                "Content-Type": "video/webm",
                "WARC-Refers-To": truncated_record_ID
            }),
            payload=webmpayload)
        new_warc_file.write_record(shrinkrecord)

        # remove log file
        # NOTE(review): this also names snapshots.tar.gz and leaves
        # shrunken-to-webm.webm behind -- looks like a copy/paste from the
        # snapshot step; confirm which files should be removed here.
        call(shlex.split("rm snapshots.tar.gz ffmpeg-shrinking.log"))
Esempio n. 2
0
    def create(self, filename, fileobj=None, operator=None):
        """
        Start a new WARC stream and write its leading ``warcinfo`` record.

        The stream is backed by ``fileobj`` (a fresh ``io.BytesIO`` when it is
        omitted). The backing object is stored on ``self.fileobj`` and the
        ``warc.WARCFile`` wrapping it on ``self.warc``; nothing is returned.

        :param filename: value recorded in the ``WARC-Filename`` header
        :param fileobj: writable binary file object, or None
        :param operator: optional text added as an ``operator:`` line
        """
        assert useragent.POLICY is not None

        backing = io.BytesIO() if fileobj is None else fileobj
        self.fileobj = backing
        self.warc = warc.WARCFile(fileobj=backing)

        info_header = warc.WARCHeader(
            {"WARC-Type": "warcinfo", "WARC-Filename": filename},
            defaults=True,
        )

        software_line = b"software: owlbot/" + bytes(version.STR, "ascii")
        # policy from .OWLBOT_POLICY or os.environ["OWLBOT_POLICY"]
        robots_line = b"robots: " + bytes(useragent.POLICY, "ascii")
        fields = [software_line, b"format: WARC File Format 1.0", robots_line]
        if operator is not None:
            fields.append(b"operator: " + operator.encode("utf-8"))

        info_record = warc.WARCRecord(info_header, payload=b"\r\n".join(fields))
        self.warc.write_record(info_record)
Esempio n. 3
0
    def write_response(self, response):
        '''Write a Scrapy `response` object as a WARC ``response`` record.

        The record payload is a reconstructed HTTP/1.1 message: status line,
        one line per response header, a blank line, then the body. URLs that
        are already present in ``self.db`` are skipped, so each URL is stored
        at most once.
        '''
        # Avoid duplicated entries
        response_url = w3lib.url.safe_download_url(response.url)
        if response_url in self.db:
            log.msg('Ignored already stored response: %s' % response_url,
                    level=log.DEBUG)
            return
        self.db[response_url] = '1'

        # Create the payload string
        payload = StringIO.StringIO()
        status_reason = httplib.responses.get(response.status, '-')
        payload.write('HTTP/1.1 %d %s\r\n' % (response.status, status_reason))
        for h_name in response.headers:
            # Header lines must end in CRLF just like the status line; a bare
            # LF yields a malformed HTTP message inside the record.
            payload.write('%s: %s\r\n' % (h_name, response.headers[h_name]))

        payload.write('\r\n')
        payload.write(response.body)

        headers = {
            'WARC-Type': 'response',
            'WARC-Date': WarcWriter.now_iso_format(),
            # tell() after the last write is the total payload size
            'Content-Length': str(payload.tell()),
            'Content-Type': str(response.headers.get('Content-Type', '')),

            # Optional headers
            'WARC-Target-URI': response_url
        }
        record = warc.WARCRecord(payload=payload.getvalue(), headers=headers)

        self._write_record(record)
Esempio n. 4
0
def make_req_dummy(req, record, http_ver="1.1"):
    """Build a WARC ``request`` record that mirrors the request *req*.

    The payload is a reconstructed HTTP request: request line, one line per
    header, a blank line, then the body (if any).

    :param req: object exposing ``url``, ``method``, ``headers`` and ``body``
    :param record: the record this request belongs to; its ``WARC-Date`` is
        reused and its ``WARC-Record-ID`` becomes ``WARC-Concurrent-To``
    :param http_ver: HTTP version written in the request line
    :return: the new ``warc.WARCRecord``
    """
    o = urlparse(req.url)
    # The request-target is path plus params/query; dropping the query would
    # archive a different request from the one that was sent.
    target = o.path or "/"
    if o.params:
        target += ";" + o.params
    if o.query:
        target += "?" + o.query

    lines = [
        bytes("{} {} HTTP/{}".format(req.method, target, http_ver), "ascii")
    ]
    for key in req.headers:
        lines.append(bytes("{}: {}".format(key, req.headers[key]), "utf-8"))

    # An empty element followed by the body (possibly empty) guarantees the
    # header section always ends with the blank line, body or not.
    body = req.body or b""
    if isinstance(body, str):
        # the join below needs bytes throughout
        body = body.encode("utf-8")
    lines.append(b"")
    lines.append(body)
    dummy = b"\r\n".join(lines)

    header = warc.WARCHeader({
        "WARC-Type": "request",
        "WARC-Target-URI": req.url,
        # ISO 28500 Section 5.4 WARC-Date
        # > Multiple records written as part of a single capture event (see section 5.7)
        # > shall use the same WARC-Date, even though the times of their writing
        # > will not be exactly synchronized.
        "WARC-Date": record.header["WARC-Date"],
        "WARC-Concurrent-To": record.header["WARC-Record-ID"],
    }, defaults=True)
    return warc.WARCRecord(header, payload=dummy)
Esempio n. 5
0
def create_warc_from_corpus(documents, filename=None):
    """ Used mainly in tests to generate small .warc files

    Each element of *documents* is a mapping with ``url`` and ``content``
    keys and an optional ``headers`` mapping of HTTP header names to values.
    Every document becomes one ``response`` record whose payload is an
    ``HTTP/1.1 200 OK`` message.

    When *filename* is None a temporary ``.warc`` file is created. Returns
    the path of the file that was written.
    """

    if filename is None:
        fd, filename = tempfile.mkstemp(suffix=".warc")
        os.close(fd)

    f = warc.open(filename, "w")
    try:
        for doc in documents:

            if "headers" in doc:
                # items() works on both Python 2 and Python 3 mappings
                headers = "\r\n".join(
                    "%s: %s" % (k, v) for k, v in doc["headers"].items())
            else:
                headers = "Connection: close\r\nContent-Type: text/html"

            payload = ("HTTP/1.1 200 OK\r\n" + headers + "\r\n\r\n" +
                       doc["content"])

            record = warc.WARCRecord(payload=payload,
                                     headers={
                                         "Content-Type":
                                         "application/http; msgtype=response",
                                         "WARC-Type": "response",
                                         "WARC-Target-URI": doc["url"]
                                     })
            f.write_record(record)
    finally:
        # close even when a document is malformed, so the handle never leaks
        f.close()

    return filename
Esempio n. 6
0
def make_resp_dummy(resp, date, http_ver="1.1"):
    """Build a WARC ``response`` record from the response *resp*.

    The payload is a reconstructed HTTP message whose body is
    ``resp.raw.data``. ``Transfer-Encoding`` headers are dropped, and a
    ``Content-Length`` header that disagrees with the body size is replaced
    by one carrying the actual size.

    :param resp: object exposing ``raw.data``, ``status_code``, ``headers``
        and ``url``
    :param date: datetime-like object formatted into ``WARC-Date``
    :param http_ver: HTTP version written in the status line
    :return: the new ``warc.WARCRecord``
    """
    body = resp.raw.data
    status_line = "HTTP/{} {} {}".format(
        http_ver, resp.status_code, RESPONSES[resp.status_code])
    lines = [bytes(status_line, "ascii")]

    kept_content_length = False
    for name in resp.headers:
        lowered = name.lower()
        if lowered == "transfer-encoding":
            continue
        if lowered == "content-length":
            if resp.headers["content-length"] != str(len(body)):
                # stale value: the real size is appended after the loop
                continue
            kept_content_length = True
        lines.append(bytes("{}: {}".format(name, resp.headers[name]), "utf-8"))

    if not kept_content_length:
        lines.append(bytes("content-length: {}".format(len(body)), "ascii"))

    # blank separator line, then the body
    lines += [b"", body]

    warc_header = warc.WARCHeader(
        {
            "WARC-Type": "response",
            "WARC-Target-URI": resp.url,
            "WARC-Date": date.strftime("%Y-%m-%dT%H:%M:%SZ"),
        },
        defaults=True,
    )
    return warc.WARCRecord(warc_header, payload=b"\r\n".join(lines))
Esempio n. 7
0
    def deduplicate_record(self, record):
        """Return a copy of *record*, rewritten as a ``revisit`` if duplicated.

        ``self.check_record(record)`` supplies the earlier record's fields
        when *record* is a duplicate. In that case the header is rewritten in
        place to point at the earlier record, an entry is appended to
        ``self.output_log``, and the returned record has an empty payload.
        Otherwise the record is copied with its payload unchanged.
        """
        earlier = self.check_record(record)

        if not earlier:
            # not a duplicate: plain copy
            return warc.WARCRecord(
                header=record.header,
                payload=record.payload.read(),
                defaults=False
            )

        header = record.header
        header['Content-Length'] = '0'
        header['WARC-Refers-To'] = earlier['WARC-Record-ID']
        header['WARC-Refers-To-Date'] = earlier['WARC-Date']
        header['WARC-Refers-To-Target-URI'] = earlier['WARC-Target-URI']
        header['WARC-Type'] = 'revisit'
        header['WARC-Truncated'] = 'length'
        header['WARC-Profile'] = \
            'http://netpreserve.org/warc/1.0/revisit/identical-payload-digest'
        # the former block digest becomes the payload digest
        header['WARC-Payload-Digest'] = header['WARC-Block-Digest']
        del header['WARC-Block-Digest']

        duplicate_of = {
            'WARC-Record-ID': earlier['WARC-Record-ID'],
            'WARC-Target-URI': earlier['WARC-Target-URI'],
            'WARC-Date': earlier['WARC-Date'],
            'Content-Length': earlier['Content-Length']
        }
        log_entry = {
            'WARC-Record-ID': header['WARC-Record-ID'],
            'WARC-Target-URI': header['WARC-Target-URI'],
            'WARC-Date': header['WARC-Date'],
            # length comes from the earlier record; this one is now '0'
            'Content-Length': earlier['Content-Length'],
            'Duplicate-Of': duplicate_of
        }
        self.output_log.append(log_entry)

        return warc.WARCRecord(header=header, payload='', defaults=False)
Esempio n. 8
0
def createWarcInfoReacord(filename):
    """Return the ``warcinfo`` record describing the merged file *filename*."""
    info_header = warc.WARCHeader(
        {"WARC-Type": "warcinfo", "WARC-Filename": filename},
        defaults=True)
    # every field line, including the last, is terminated by CRLF
    info_lines = (
        "software: WARCMerge/1.0",
        "format: WARC File Format 1.0",
        "description:  Merging WARC files into a single one ",
        "robots: ignore",
    )
    info_body = "".join(line + "\r\n" for line in info_lines)
    return warc.WARCRecord(info_header, info_body)
Esempio n. 9
0
    def deduplicate(self):
        """Copy ``self.input_file`` to ``self.output_file``, deduplicating
        ``resource`` records on the way.

        The first record read has its ``WARC-Filename`` pointed at the output
        file and is copied first. Each remaining record is passed through
        ``deduplicate_record`` (``resource`` type) or copied unchanged. A log
        record is appended, both files are closed and the log is dumped as
        JSON. Depending on ``self.double_check`` either the input file or the
        freshly written output and log files are removed.
        """
        first = self.input_file.read_record()
        first_header = first.header
        first_header['WARC-Filename'] = self.output_filename
        warc_info_id = first_header['WARC-Warcinfo-ID']

        first_copy = warc.WARCRecord(
            header=first_header,
            payload=first.payload.read(),
            defaults=False
        )
        self.output_file.write_record(first_copy)

        # keep restarting the iteration until the whole input was consumed
        while self.input_file.tell() < self.input_file_size:
            for record in self.input_file:
                if record.type != 'resource':
                    record = warc.WARCRecord(
                        header=record.header,
                        payload=record.payload.read(),
                        defaults=False)
                else:
                    record = self.deduplicate_record(record)
                self.output_file.write_record(record)

        log_record = self.record_log(warc_info_id)
        self.output_file.write_record(log_record)

        self.input_file.close()
        self.output_file.close()

        with codecs.open(self.output_log_filename, 'w') as output_log_file:
            json.dump(self.output_log, output_log_file, ensure_ascii=False,
                indent=4)

        if not self.double_check(self.input_filename):
            # check failed: discard what was just written
            os.remove(self.output_filename)
            os.remove(self.output_log_filename)
        else:
            os.remove(self.input_filename)

        self.dump_records()
Esempio n. 10
0
    def write(self, data, headers={}):
        """Append *data* as a ``resource`` record to the next WARC file.

        ``subject_uri`` (default ``'xxx'``) and ``mimetype`` (default
        ``'application/octet-stream'``) are taken out of a copy of *headers*;
        whatever remains is passed on as the record's headers.

        Returns a locator string ``"<warc filename>:<offset>:<len(data)>"``.
        """
        warcfilename = self.get_next_warcfile()
        target_path = self.get_path(warcfilename, create_dirs=True)
        writer = warc.WARCWriter(open(target_path, 'a'))

        # work on a copy so the caller's mapping is left untouched
        extra_headers = dict(headers)
        subject_uri = extra_headers.pop('subject_uri', 'xxx')
        mimetype = extra_headers.pop('mimetype', 'application/octet-stream')

        record = warc.WARCRecord(
            'resource', subject_uri, mimetype, extra_headers, data)
        offset = writer.write(record)
        writer.close()

        return '%s:%d:%d' % (warcfilename, offset, len(data))
Esempio n. 11
0
def new_warc(kind):
    """Return a new WARCRecord of the given flavor, without a payload.

    @arg kind: the WARC-Type to create; must be a key of
        `warc.WARCHeader.CONTENT_TYPES`, which supplies the Content-Type
    """
    # Same fields as WARCHeader.init_defaults(), filled in by hand because
    # the header and the record are both created with defaults=False.
    record_id = "<urn:uuid:%s>" % uuid.uuid1()
    content_type = warc.WARCHeader.CONTENT_TYPES[kind]
    created_at = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())

    header = warc.WARCHeader(
        {
            'WARC-Type': kind,
            'WARC-Record-ID': record_id,
            'Content-Type': content_type,
            'WARC-Date': created_at,
        },
        defaults=False)
    return warc.WARCRecord(header=header, defaults=False)
Esempio n. 12
0
 def process_document(self, doc):
   """Write a status-200 document to stdout as a WARC record.

   While holding ``self.concurrency_lock`` the record is written,
   ``self.crawlsize`` grows by ``sys.getsizeof(doc.text)`` divided by one
   million, and the size and time limits are checked; exceeding either one
   sets ``self.interrupt`` and calls ``self.save_status()``. Documents with
   any other status are ignored.
   """
   if doc.status != 200:
     return

   with self.concurrency_lock:
     record = warc.WARCRecord(payload=doc.text,
                              headers={"WARC-Target-URI": doc.url})
     writer = warc.WARCFile(fileobj=sys.stdout.buffer)
     writer.write_record(record)

     self.crawlsize += sys.getsizeof(doc.text) / 1000000.0

     if self.sizelimit is not None and self.crawlsize > self.sizelimit:
       self.interrupt = True
       self.save_status()

     if self.timelimit is not None:
       elapsed = time.time() - self.crawlstarts
       if elapsed > self.timelimit:
         self.interrupt = True
         self.save_status()
Esempio n. 13
0
    def record_log(self, warc_info_id):
        """Build a ``resource`` record holding ``self.output_log`` as JSON.

        :param warc_info_id: value stored in the ``WARC-Warcinfo-ID`` header
        :return: a ``warc.WARCRecord`` created with ``defaults=False``, so
            every header written is exactly one of those listed below
        """
        log_payload = json.dumps(self.output_log, ensure_ascii=False)
        # Content-Length and the digest are defined over bytes, and hashlib
        # rejects non-ASCII text, so normalise to UTF-8 bytes first.
        if not isinstance(log_payload, bytes):
            log_payload = log_payload.encode('utf-8')

        block_digest = base64.b32encode(
            hashlib.sha1(log_payload).digest()).decode()

        log_header = {
            'Content-Length': str(len(log_payload)),
            'WARC-Target-URI': 'urn:X-archive-team-ftp-gov-deduplicate:log',
            # The trailing "Z" claims UTC, so format gmtime() explicitly;
            # strftime() without a time tuple would use local time.
            'WARC-Date': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()),
            'WARC-Block-Digest': "sha1:{}".format(block_digest),
            'WARC-Record-ID': '<{}>'.format(uuid.uuid4().urn),
            'WARC-Warcinfo-ID': warc_info_id,
            'Content-Type': 'application/json',
            'WARC-Type': 'resource'
        }

        return warc.WARCRecord(
            header=warc.WARCHeader(log_header, defaults=False),
            payload=log_payload,
            defaults=False
        )
Esempio n. 14
0
    def process_record(self, record):
        """Yield ``(domain, base64-encoded WARC)`` for matching HTML responses.

        Only ``response`` records are considered. The payload is split into
        HTTP headers and body at the first blank line. A record is emitted
        when the headers contain ``Content-Type: text/html``, the body is not
        blank, and either ``ENDPOINT_RE`` matches the headers or
        ``INDIEWEB_RE`` matches the body. Payloads without a header/body
        separator are skipped.
        """
        if record['WARC-Type'] != 'response':
            return

        # The HTTP response is defined by a specification: first part is headers
        # (metadata) and then following two CRLFs (newlines) has the response
        payload = record.payload.read()

        # partition() never raises; unpacking split() would raise ValueError
        # on a payload that has no blank line.
        http_headers, separator, body = payload.partition('\r\n\r\n')
        if not separator:
            return

        if 'Content-Type: text/html' not in http_headers or not body.strip():
            return
        if not (ENDPOINT_RE.search(http_headers) or INDIEWEB_RE.search(body)):
            return

        warcstr = StringIO()
        warcfile = warc.WARCFile(fileobj=warcstr, mode='w')
        warcfile.write_record(
            warc.WARCRecord(payload=payload, header=record.header))
        # the buffer is read before close(), same ordering as before
        warcbuf = base64.b64encode(warcstr.getvalue())
        warcfile.close()

        domain = urlparse.urlparse(
            record['WARC-Target-URI']).netloc.lower()
        # domain = headers['Host']
        yield domain, warcbuf
Esempio n. 15
0
    def resolve_dns(self, hostname, date):
        """Resolve *hostname* and return one of its A records as a string.

        The lookup goes through ``self.robot.ctx``. When ``check_ttl``
        returns a truthy value, the answers are also written to ``self.warc``
        as a ``text/dns`` response record dated *date*. The returned address
        is picked with ``secrets.choice``, which raises ``IndexError`` when
        there is no A record to choose from.
        """
        ctx = self.robot.ctx
        ttl = ctx.check_ttl(hostname)
        cache = ctx.resolve_dns(hostname)

        if ttl:
            dns_header = warc.WARCHeader(
                {
                    "WARC-Type": "response",
                    "WARC-Target-URI": "dns:{}".format(hostname),
                    "WARC-Date": date.strftime("%Y-%m-%dT%H:%M:%SZ"),
                    "Content-Type": "text/dns",
                },
                defaults=True,
            )
            # RFC 2540 section 2.2 Text Format
            lines = [cache.created_at.strftime("%Y%m%d%H%M%S")]
            lines.extend(answer.to_text() for answer in cache.answers)
            payload = bytes("\r\n".join(lines), "ascii")
            self.warc.write_record(warc.WARCRecord(dns_header, payload=payload))

        a_records = [
            rdata
            for answer in cache.answers
            for rdata in answer.items
            if rdata.rdtype == dns.rdatatype.A
        ]
        return str(secrets.choice(a_records))
Esempio n. 16
0
    def SnapShot():
        """Sample still frames from the video and archive them as a tarball.

        Runs ffmpeg on ``samplethis.flv`` (one frame every 15 seconds, with
        ``FFREPORT`` pointing at ``ffmpeg-snapshots.log``), packs every
        ``*.jpg`` in the working directory into ``snapshots.tar.gz`` and
        deletes the images. It then appends two records to
        ``new_warc_file``: a ``resource`` record carrying the ffmpeg log and
        a ``conversion`` record carrying the tarball. Finally the tarball and
        the log are removed.

        ``warcinfo_record_ID``, ``metadata_record_ID``, ``truncated_record_ID``
        and ``new_warc_file`` are not defined here; they must be provided by
        an enclosing or global scope.
        """

        # TODO:
        # figure out length of video and develop native-resolution frame
        # sampling rate based off of this length.

        print(
            "********************* \n\n Getting snapshots. \n\n*********************"
        )

        os.environ["FFREPORT"] = "file=ffmpeg-snapshots.log"

        # snapshot
        # This is the "proper" way to handle complex command lines with lots of args
        # https://stackoverflow.com/questions/8581140/python-subprocess-call-with-arguments-having-multiple-quotations
        ffmpegsnapshotargs = shlex.split(
            "ffmpeg -i samplethis.flv -vf fps=fps=1/15 -f image2 -q:v 1 images%05d.jpg"
        )
        call(ffmpegsnapshotargs)

        print(
            "********************* \n\n Compressing snapshots. \n\n*********************"
        )

        imagelist = glob.glob("*.jpg")

        # compress all the snapshots; file names are passed as separate
        # arguments so names with spaces or quotes cannot break the command
        call(["tar", "-czvf", "snapshots.tar.gz"] + imagelist)

        # delete jpgs (rm with no operands is an error, so skip when empty)
        if imagelist:
            call(["rm"] + imagelist)

        os.environ["FFREPORT"] = ""

        # Add ffmpeg log record
        ffmpegsampleheader = warc.WARCHeader({
            "WARC-Type":
            "resource",
            "WARC-Warcinfo-ID":
            warcinfo_record_ID,
            "Content-Type":
            "text/plain",
            "WARC-Concurrent-To":
            metadata_record_ID
        })
        # Read through a context manager so the handle is closed promptly.
        with open("ffmpeg-snapshots.log") as logfile:
            ffmpegsamplepayload = logfile.read()
        ffmpegsamplerecord = warc.WARCRecord(headers=ffmpegsampleheader,
                                             payload=ffmpegsamplepayload)
        new_warc_file.write_record(ffmpegsamplerecord)

        # Add the actual snapshot record; the tarball is binary data, so it
        # is read in binary mode
        with open("snapshots.tar.gz", "rb") as tarball:
            snapshotpayload = tarball.read()
        snapshotrecord = warc.WARCRecord(
            headers=warc.WARCHeader({
                "WARC-Type": "conversion",
                "Content-Type": "application/x-gtar",
                "WARC-Refers-To": truncated_record_ID
            }),
            payload=snapshotpayload)
        new_warc_file.write_record(snapshotrecord)

        # remove snapshots and log
        call(shlex.split("rm snapshots.tar.gz ffmpeg-snapshots.log"))
Esempio n. 17
0
    def process(self, item):
        """Copy the item's WARC into a ``-deduplicated`` WARC file.

        ``item`` must provide ``item_dir`` and ``warc_file_base``. The first
        record read gets a new ``WARC-Filename`` and loses its
        ``WARC-Block-Digest``. Every later ``response`` record whose
        ``WARC-Payload-Digest`` was already seen is rewritten as a
        ``revisit`` record that keeps only its HTTP header lines and refers
        to the first record with that digest. All other records are copied
        unchanged.

        NOTE(review): neither WARC file is closed in the code visible here --
        confirm whether that happens elsewhere.
        """
        # payload digest -> (record id, date, target URI) of its first record
        hashes = {}
        input_filename = "%(item_dir)s/%(warc_file_base)s.warc.gz" % item
        output_filename = "%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz" % item

        warc_input = warc.WARCFile(input_filename)
        warc_input_size = os.path.getsize(input_filename)
        warc_output = warc.WARCFile(output_filename, 'w')

        # The first record is handled separately from the main loop.
        info_record = warc_input.read_record()
        info_record.header[
            'WARC-Filename'] = "%(warc_file_base)s-deduplicated.warc.gz" % item
        del info_record.header['WARC-Block-Digest']
        warc_output.write_record(
            warc.WARCRecord(payload=info_record.payload.read(),
                            header=info_record.header))

        # Restart the iteration until the read position reaches the file size.
        while warc_input_size > warc_input.tell():
            for record in warc_input:
                if record.type == 'response':
                    # keep only the part after the "algorithm:" prefix
                    hash_ = record.header.get('WARC-Payload-Digest').split(
                        ':', 1)[1]
                    if hash_ in hashes:
                        # Duplicate payload: collect the lines up to the
                        # first empty one (the HTTP header section).
                        headers = []
                        payload_ = record.payload.read()
                        for line in payload_.splitlines():
                            if line in ['\r\n', '\n', '']:
                                break
                            headers.append(line.strip())
                        payload = '\r\n'.join(headers) + '\r\n' * 2
                        # Responses that declare an empty body are copied
                        # as-is instead of being turned into a revisit.
                        if not ('Content-Length: 0' in payload or \
                              'content-length: 0' in payload):
                            record.header['Content-Length'] = str(len(payload))
                            record.header['WARC-Refers-To'] = hashes[hash_][0]
                            record.header['WARC-Refers-To-Date'] = hashes[
                                hash_][1]
                            record.header['WARC-Refers-To-Target-URI'] = \
                                hashes[hash_][2]
                            record.header['WARC-Type'] = 'revisit'
                            record.header['WARC-Truncated'] = 'length'
                            record.header['WARC-Profile'] = 'http://netpreserve' \
                                '.org/warc/1.0/revisit/identical-payload-digest'
                            del record.header['WARC-Block-Digest']
                            record = warc.WARCRecord(header=record.header,
                                                     payload=payload,
                                                     defaults=False)
                        else:
                            record = warc.WARCRecord(header=record.header,
                                                     payload=payload_,
                                                     defaults=False)
                    else:
                        # First time this digest is seen: remember where it
                        # lives and copy the record unchanged.
                        hashes[hash_] = (record.header.get('WARC-Record-ID'),
                                         record.header.get('WARC-Date'),
                                         record.header.get('WARC-Target-URI'))
                        record = warc.WARCRecord(header=record.header,
                                                 payload=record.payload.read(),
                                                 defaults=False)
                else:
                    # Non-response records are copied unchanged.
                    record = warc.WARCRecord(header=record.header,
                                             payload=record.payload.read(),
                                             defaults=False)
                warc_output.write_record(record)
Esempio n. 18
0
import warc
import uuid
import sys
import os
import gzip
os.chdir('/home/eckel/')
# Load and preprocess data: copy each input shard, giving every record a
# freshly generated WARC-Record-ID.

print('preprocessing')
filenameIn = sys.argv[1]
max_range = int(sys.argv[2])
for i in range(max_range):
    shard_name = filenameIn + str(i)
    print(shard_name)
    fw = warc.open('dataset_id/' + shard_name + '.warc.gz', 'wb')
    source_path = 'dataset/' + filenameIn + '.com' + str(i) + '.warc.gz'
    with gzip.open(source_path, mode='rb') as gzf:
        for record in warc.WARCFile(fileobj=gzf):
            record['WARC-Record-ID'] = str(uuid.uuid4())
            renumbered = warc.WARCRecord(payload=record.payload.read(),
                                         headers=record.header)
            fw.write_record(renumbered)
    fw.close()
Esempio n. 19
0
import warc
import uuid
import os
os.chdir('/home/eckel/')

# Build a small test file in which each of two selected records appears
# four times.
f = warc.open("samples/overstock_sample.warc.gz", "rb")
fw = warc.open("overstock_test.warc.gz", "wb")
count = 0
wanted_ids = (
    '2dd726fe-5f11-43c3-a02c-47860e668cac',
    '4b3e1e5f-9ac3-4619-b784-a093a1d1ac0d',
)
for record in f:
    if record['WARC-Record-ID'] in wanted_ids:
        payload = record.payload.read()
        record_header = record.header
        for _ in range(4):
            fw.write_record(
                warc.WARCRecord(payload=payload, headers=record_header))
    # A disabled branch used to copy the first two other records as well,
    # counting them in `count`.
f.close()
fw.close()
Esempio n. 20
0
    # For each object in the bucket: keep its own id, then add or drop every
    # other candidate depending on its distance relative to `threshold`.
    for query_object in bucket:
        neighbours = lsh.query(query_object[0], distance_func='cosine')
        own_key = query_object[1]
        dedup.add(own_key)
        for neighbour in neighbours:
            # warc id is appended as extra data in lsh.index()
            neighbour_key = neighbour[0][1]
            if neighbour_key == own_key:
                continue
            if float(neighbour[1]) >= threshold:
                dedup.add(neighbour_key)
                continue
            if neighbour_key in dedup:
                dedup.remove(neighbour_key)

# Write every record whose id survived deduplication to a single output file.
file = warc.open(filenameIn + '_dedup.warc.gz', 'wb')
numSingle = len(dedup)
for i in range(max_files):
    with gzip.open(datasetPath + filenameIn + str(i) + '.warc.gz',
                   mode='rb') as gzf:
        for record in warc.WARCFile(fileobj=gzf):
            record_id = record['WARC-Record-ID']
            if record_id not in dedup:
                continue
            # The write belongs inside the membership check: written
            # unconditionally, it emitted dropped records with the payload of
            # the previous kept one (or failed with NameError on the first).
            payload = record.payload.read()
            file.write_record(
                warc.WARCRecord(payload=payload, headers=record.header))

print('Total pages: ' + str(doc_count))
print('Pages after deduplication: ' + str(numSingle))

file.close()
Esempio n. 21
0
    ]

    # Download every URL and store the raw bytes as one response record each.
    f = warc.open("test.warc.gz", "w")
    try:
        for u in urls:
            # the context manager closes the connection even if read() fails
            with urllib.request.urlopen(u) as fp:
                mybytes = fp.read()

            header = warc.WARCHeader({"WARC-Type": "response"}, defaults=True)
            # the field name must not carry a trailing colon; the colon is
            # the separator added when the header is serialised
            header['WARC-Target-URI'] = u

            record = warc.WARCRecord(header, mybytes)
            f.write_record(record)
    finally:
        f.close()

    # Second pass: the output file is reopened in append mode for every URL.
    # NOTE(review): this loop is cut off in the visible source -- the record
    # is never written and `f` is never closed here; confirm against the
    # full file.
    for u in urls:
        f = warc.open("test_trozos.warc.gz", "a")
        fp = urllib.request.urlopen(u)

        mybytes = fp.read()
        mystr = mybytes.decode("utf8")

        fp.close()

        header = h = warc.WARCHeader({"WARC-Type": "response"}, defaults=True)
        # the field name must not carry a trailing colon
        header['WARC-Target-URI'] = u
Esempio n. 22
0
oparser = argparse.ArgumentParser(
    description=
    "Script that takes a list of file paths from HTTrack crawled folder")
options = oparser.parse_args()

reader = sys.stdin

# A single writer over stdout serves every record.
f = warc.WARCFile(fileobj=sys.stdout.buffer)

for line in reader:
    filepath = line.strip()
    if not filepath:
        # blank input lines name no file
        continue
    url = None
    with open(filepath, 'rb') as content_file:
        content = content_file.read()

    # HTTrack stamps mirrored pages with a comment naming the original URL.
    for html_line in content.split(b"\n"):
        if re.search(rb'<!-- Mirrored from ', html_line):
            url = re.sub(rb'.*<!-- Mirrored from ', b'',
                         re.sub(rb' by HTTrack Website Copier.*', b'',
                                html_line))
            break

    # Files without the marker are stored under the placeholder "unknown".
    # The record is built exactly once: a second, unconditional construction
    # used to call url.decode() even when url was None.
    target_uri = "unknown" if url is None else url.decode("utf8")
    warc_record = warc.WARCRecord(payload=content,
                                  headers={"WARC-Target-URI": target_uri})
    f.write_record(warc_record)
Esempio n. 23
0
                flag = 0

            # Copy every record of this input file into the merged output.
            # The merged file's own warcinfo record is written once, just
            # before the first record copied while `flag` is still 0.
            f = warc.WARCFile(warcFile, "rb")
            try:
                for record in f:
                    if flag == 0:
                        R = createWarcInfoReacord(newFile)
                        filePtr.write_record(R)
                        flag = 1
                    if ("warcinfo" in record['WARC-Type']):
                        # stamp the original warcinfo payload with the merge
                        # time and fix up its length accordingly
                        New_Payload = record.payload.read().strip(
                        ) + "\r\n" + "WARC-appended-by-WARCMerge: " + datetime.datetime.utcnow(
                        ).strftime('%Y-%m-%dT%H:%M:%SZ') + "\r\n"
                        record['Content-Length'] = str(len(New_Payload))
                        R = warc.WARCRecord(record.header,
                                            New_Payload,
                                            defaults=False)
                    else:
                        R = warc.WARCRecord(payload=record.payload.read(),
                                            headers=record.header,
                                            defaults=False)
                    filePtr.write_record(R)
                if quietMode == False:
                    print('[Yes]' + warcFile)
            except Exception:
                # best effort: a file that cannot be read is reported and
                # skipped so the remaining files are still merged
                if quietMode == False:
                    print('[No]' + warcFile)
            finally:
                # release the input handle whether or not the copy succeeded
                f.close()
        filePtr.close()
        outputFileSize = os.path.getsize(newFileFullPath) / forConvertToMB
Esempio n. 24
0
    def process(self, item):

        # assert that this item is flagged for sampling. If not,
        # return immediately. We don't want to butcher uploads that
        # have been determined to be worth saving in their original
        # state.
        #
        # Presumably, the tracker is tagging these items as something
        # appropriate. Alternately, one could create a "Phase 3" grab
        # and know for a fact that we are only receiving videos that
        # should be sampled. In which case, one may skip the item_type
        # check and proceed directly to sampling.

        item_name = item['item_name']
        item_type, item_value = item_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        assert item_type in ('video-bulk', 'url-bulk')

        # Item type is not marked as "video-bulk" from tracker.
        # Carry on. Nothing to do here.
        if item_type != 'video-bulk' or 'url-bulk':
            return

        # ok. This is an item that needs to be sampled.

        # remember where we started from so we can get back there and
        # not mess up the expectations for the rest of stages in the
        # pipeline
        original_path = os.getcwd()

        # get to item_dir ; begin work
        os.chdir(item['item_dir'])

        # we will need some data from the warcfile
        warcinfo_record_ID = ""
        metadata_record_ID = ""
        truncated_record_ID = ""

        # set up old and new warc files for reading and writing, respectively.
        # If a file ends in *.gz for writing, the warc library handles gz
        # compression transparently.
        old_warc_file = warc.open("%(warc_file_base)s.warc.gz" % item)
        new_warc_file = warc.open(
            "%(warc_file_base)s-POSTPROCESSED.warc.gz" % item, "w")

        # ------------------------ Start of main for loop -------------------#

        # and here... we... go
        for record in old_warc_file:

            # Firstly, we detect whether the record we're iterating over holds
            # data we'll need later. If so, behave appropriately. After the
            # if-elif-elif dance, we proceed to copy each record into a new
            # record in the %(warc_file_base)s-POSTPROCESSED.warc.gz file,
            # modifying as necesary (truncated long records, etc)

            # ------------------------ Check for data -------------------------#

            # Grab the lengthy payload (the flv file); if the content-length is
            # longer than ~5MiB, and the record is of the "response" type, then
            # this record *probably* has the flv file.
            if ((long(record['Content-Length']) >= 5000000)
                    and record['WARC-Type'] == "response"):

                # need the record id of the original flv record. Will refernece
                # it in truncated record.
                truncated_record_id = record['warc-record-id']

                # add "WARC-Truncated" to this record, indicating that it has
                # been truncated due to length.
                record['warc-truncated'] = "length"

                # extract the payload
                tempfile = open("intermediate.int", 'wb')
                for line in record.payload:
                    tempfile.write(line)
                tempfile.close()

                # put the payload back; iterating through record.payload
                # invokes a generator on the payload that seems to
                # "eat up" the payload in the original file. I say so because
                # attempting to, say, write the payload out twice (to TWO files)
                # will fail, as will any attempt to read out the payload again
                # without first "putting it back." (I'd love an explanation for
                # just what's going on here; but for now, this hack works)
                # (for the record with the long content-length, we end up reading
                # the payload twice; once here, to get it to a separate file, and
                # once again, in COPY PAYLOAD, to write out a truncated version to
                # the new warc file)
                stream = StringIO(open("intermediate.dat", 'rb').read())
                stream.seek(0, os.SEEK_END)
                streamlength = stream.tell()
                stream.seek(0)
                record.payload = warc.utils.FilePart(fileobj=stream,
                                                     length=streamlength)

                # can't close the stream yet for some reason. This might
                # introduce leaks of some sort, so keep an eye on it.
                # The relevant error: "IO Operation on a closed file."
                # I suspect this operation occurs somewhere in the warc library,
                # and i'm hoping that the stream object just falls out of scope
                # at some point other than when the entire pipeline shuts down.
                # stream.close()

            # Adjust the warcinfo record to note that we also utilized ffmpeg
            elif (record['WARC-Type'] == "warcinfo"):

                # grab the record-id for later use in resource records
                warcinfo_record_ID = record['warc-record-id']

                # gotta add another "software" key to the content-block of the
                # warcinfo record that indicates the use of ffmpeg.
                warcinfo_stream = StringIO()
                for line in record.payload:
                    warcinfo_stream.write(line)

                # trailing \r\n\r\n is already present in the payload; just seek back
                # two bytes (yes, the second \r\n will get clobbered; potential unicode
                # byte-length issues here) and then tack on the additional lines you
                # need to like so:
                warcinfo_stream.seek(-2, os.SEEK_END)
                warcinfo_stream.write("software: ffmpeg/2.3.1\r\n\r\n")
                warcinfo_stream.seek(0, os.SEEK_END)
                warcinfo_stream_len = warcinfo_stream.tell()
                warcinfo_stream.seek(0)
                record.payload = warc.utils.FilePart(
                    fileobj=warcinfo_stream, length=warcinfo_stream_len)

            # Get the metadata record's warc-record-id for later resource
            # records.
            elif (record['WARC-Type'] == "metadata"):

                metadata_record_ID = record['warc-record-id']

            # End of conditionals. Proceed to write the new record to the
            # post-processed warcfile.

            # ------------------------ Copy Record -------------------------#

            # COPY HEADER

            # Should we add defaults=False ? It seems that some additional headers
            # are added in WARCHeader as well as WARCRecord. However, they don't
            # seem harmful: digests and timestamps.
            new_header = warc.WARCHeader(record.header)

            # COPY PAYLOAD

            # if the current record gets truncated, then set the content-length
            # to the new, truncated length as per spec.
            truncated_flag = None

            # SHORT record payloads
            if long(record['content-length']) < 500000:

                #print "Copying payload..."
                new_payload = StringIO()
                for line in record.payload:
                    new_payload.write(line)
                #if we don't seek back to 0, new_payload.read() is empty
                new_payload.seek(0)
                #print "Done copying payload."

            # LONG record payloads (the one that probably has video data)
            else:

                #print "Found long content-length. Truncating..."
                new_payload = StringIO()
                decrement = 25
                #Grab some lines
                #print "Gonna grab some lines. Decrement: ", decrement
                for line in record.payload:
                    #print "Grabbing a line."
                    new_payload.write(line)
                    decrement -= 1
                    #print "Decrement: ", decrement
                    if decrement == 0:
                        break
                # be kind: rewind
                new_payload.seek(0)
                truncated_flag = True

                #print "Done truncating."

            # CREATE RECORD FROM HEADER AND PAYLOAD

            new_rec = warc.WARCRecord(payload=new_payload.read(),
                                      headers=new_header,
                                      defaults=False)

            # if this record happened to be one that got truncated, then we
            # need to adjust its content-length header.
            if truncated_flag:

                #print "Adjusting content-length header"

                # From page 9 of the ISO WARC Standard:
                #
                # "The WARC-Truncated field may be used on any WARC record. The WARC
                # field Content-Length shall still report the actual truncated size of
                # the record block."

                # Get the length of the truncated content-block and set
                # Content-Length header appropriately
                new_payload.seek(0)
                new_payload.seek(0, os.SEEK_END)
                thelength = new_payload.tell()
                new_rec['content-length'] = str(thelength)
                new_payload.seek(0)

            # WRITE THE NEW RECORD OUT TO THE NEW WARCFILE

            # (the warc library handles the gz-compression and putting each record
            # in a separate gz "member" transparently; no need to muck with the gzip
            # library ourselves)

            #print "Copying record to new .warc.gz"
            new_warc_file.write_record(new_rec)
            #print "Done copying record to new .warc.gz"
            #print "\n\n"

        #------------------------ END OF MAIN FOR LOOP ------------------------#

        # at this point, we have a new warcfile with copied and truncated
        # records; now, we need to sample the content and add these "conversion"
        # records to the warc file.

        # Should probably delete old warc at this point, since new warcfile has all
        # of the old records, and we've already got another copy of the main
        # payload. If we proceed to write out the full newfile with the shrunken
        # payload before deleting the old warc, we'll basically be using nearly
        # 3x the interim diskspace rather than 2x. (Don't get me wrong, I'd love
        # to have more of a generator-like setup that negates the need to use
        # twice the disk space, but it's beyond the scope of my abilities at the
        # moment and I don't think I'd be able to get up to speed before the
        # deadline for this project drops (August 27 2014) Update: LOL Twitch is
        # already deleting things on August 26; oh well, I suppose this code
        # could come in handy if the IA suddenly needs to compress lots of
        # material)

        # Now, we need to convert the flv, and add conversion records

        # Our "payload.flv" is not quite an flv yet; the payload still includes the
        # HTTP Response headers. We need to grep for "CRLFCRLF" and then chop off
        # anything prior to it, including it, leaving nothing but the flv file for
        # ffmpeg to work with.
        thefile = open("intermediate.int").read()  # NOT A FILE; just a "str"
        theflv = thefile.split('\r\n\r\n')[1]
        writetheflv = open("samplethis.flv", "w")
        writetheflv.write(theflv)
        writetheflv.close()

        # Get Snapshots
        SnapShot()

        # Get shrinked video
        ShrinkRay()

        # Clean up
        print(
            "********************* \n\n Removing temporary files; cleaning up \n\n*********************"
        )
        # remove original file intermediates: "intermediate.int" and "samplethis.flv"
        rmargs = shlex.split("rm intermediate.int samplethis.flv")
        call(rmargs)

        # And we're done!
        new_warc_file.close()
        os.chdir(original_path)
Esempio n. 25
0
# Name of the operator running the download. `echo` terminates its output
# with a bare "\n", so the raw value must be stripped before it is embedded
# in the warcinfo body below.
user = subprocess.check_output("echo $USER", shell=True).decode('utf-8')

# Build the warcinfo payload as "name: value" fields, each terminated by
# CRLF. The hostname and operator values are stripped of surrounding
# whitespace so that a trailing newline from a shell command cannot leave
# a field ending in a bare LF (or glued to the following field name).
body = "".join(
    "%s: %s\r\n" % (name, value)
    for name, value in (
        ("robots", "classic"),
        ("hostname", str(hostname).strip()),
        ("software", "page_downloader.py"),
        ("isPartOf", "Cs_media"),
        ("operator", str(user).strip()),
        ("description", "Downloading pages"),
        ("publisher", "KNOT (https://knot.fit.vutbr.cz/)"),
        ("format", "WARC File Format 1.0"),
        ("conformsTo",
         "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf"),
    ))

# Write the leading warcinfo record describing this archive.
warc_header = warc.WARCHeader(
    {
        "WARC-Type": "warcinfo",
        "WARC-Filename": settings['output'][0]
    },
    defaults=True)
warc_record = warc.WARCRecord(warc_header, body.encode())
warc_record.write_to(out)

# Build one "response" record per downloaded page.
for page in generator:
    warc_header = warc.WARCHeader(
        {
            "WARC-Type": "response",
            "WARC-Target-URI": page['url']
        },
        defaults=True)
    response = page['response']
    # The response header block must be separated from the content by an
    # empty line; add the CRLFCRLF terminator if it is missing.
    if not (response.endswith('\r\n\r\n')):
        response += '\r\n\r\n'
    # Characters that cannot be encoded are substituted rather than raising.
    warc_record = warc.WARCRecord(warc_header,
                                  (response + page['content']).encode(
                                      'utf-8', 'replace'))
    # NOTE(review): no write of this record to `out` is visible in this
    # part of the script -- confirm it is written further on.
Esempio n. 26
0
import gzip

# Command-line arguments: the domain name, then the integer `max` that
# bounds the sampling loop and divides the 100-record budget.
domain, max = str(sys.argv[1]), int(sys.argv[2])
# Record budget per pass of the sampling loop.
per_warc = 100 / max
# Path of the archive that receives the sampled records.
filename_sample = '../%s_sample.warc.gz' % domain

for i in range(0, max):
    count = 0
    #	filename = '../dataset/'+domain+'.com'+str(i)+'.warc.gz'
    filename = '../samples/' + domain + '_sample.warc.gz'
    print 'Load' + filename
    try:
        with gzip.open(filename, 'rb') as gfz:
            ''' Load file '''
            contents = [(warc.WARCRecord(payload=record.payload.read(),
                                         headers=record.header))
                        for record in warc.WARCFile(fileobj=gfz)]
            l = len(contents)
    except:
        continue
    ''' select records randomly '''
    print 'select'
    f_sample = warc.open(filename_sample, 'a')
    while count < per_warc:
        rand = random.randint(0, l - 1)
        sys.stdout.write("\rRecord count %i" % count)
        sys.stdout.flush()
        r = contents[rand]
        #pre = preprocessing.HTMLPreprocessing(r.payload)
        payload = r.payload
        r['Content-Length'] = str(len(payload))