Beispiel #1
0
    def ShrinkRay():
        """Transcode samplethis.flv to a small WebM and archive the result.

        Writes two records to new_warc_file (outer-scope name): the ffmpeg
        report log as a 'resource' record, and the shrunken video as a
        'conversion' record referring back to the truncated original.
        Deletes its own temporary files when done.

        NOTE(review): relies on outer names: os, shlex, call, warc, StringIO,
        new_warc_file, warcinfo_record_ID, metadata_record_ID,
        truncated_record_ID.
        """

        # TODO:
        # figure out length of video and develop number of frames to
        # drop out of every FPS interval.

        print(
            "********************* \n\n Shrinking Video. (This will take a while) \n\n*********************"
        )

        # Route ffmpeg's report output into a log file we archive below.
        os.environ["FFREPORT"] = "file=ffmpeg-shrinking.log"

        # shrink; using the webm format at this resolution cuts the file size by
        # *about* an order of magnitude, while still maintaining more-or-less
        # perfectly crisp detail and motion. I'm thinking we don't need to drop
        # frames, and that cutting the resolution down to this ~240P-level
        # resolution is good enough.

        # We really need to check for resolution and select an output resolution
        # appropriately; this one-liner only works for 16:9 inputs

        ffmpegshrinkargs = shlex.split(
            "ffmpeg -i samplethis.flv -c:v libvpx -b:v 500K -c:a libvorbis -s 432x243 shrunken-to-webm.webm"
        )
        call(ffmpegshrinkargs)

        # The final size of snapshots and shrunken video is anywhere from a fifth to
        # a seventh of the original file size.

        os.environ["FFREPORT"] = ""

        # add ffmpeg log record
        ffmpegshrinkheader = warc.WARCHeader({
            "WARC-Type": "resource",
            "WARC-Warcinfo-ID": warcinfo_record_ID,
            "Content-Type": "text/plain",
            "WARC-Concurrent-To": metadata_record_ID
        })
        # (was StringIO(open(...).read()).getvalue(), which is just the
        # file's contents round-tripped through a buffer)
        ffmpegshrinkpayload = open("ffmpeg-shrinking.log").read()
        ffmpegshrinkrecord = warc.WARCRecord(headers=ffmpegshrinkheader,
                                             payload=ffmpegshrinkpayload)
        new_warc_file.write_record(ffmpegshrinkrecord)

        # add actual shrunken webm record
        shrinkrecord = warc.WARCRecord(
            headers=warc.WARCHeader({
                "WARC-Type": "conversion",
                "Content-Type": "video/webm",
                "WARC-Refers-To": truncated_record_ID
            }),
            payload=open("shrunken-to-webm.webm").read())
        new_warc_file.write_record(shrinkrecord)

        # remove this stage's temporaries.
        # BUG FIX: this previously ran "rm snapshots.tar.gz ffmpeg-shrinking.log":
        # snapshots.tar.gz was already deleted at the end of SnapShot() (so rm
        # errored), and shrunken-to-webm.webm was left behind on disk.
        call(shlex.split("rm shrunken-to-webm.webm ffmpeg-shrinking.log"))
Beispiel #2
0
def make_resp_dummy(resp, date, http_ver="1.1"):
    """Build a WARC 'response' record wrapping *resp* as a raw HTTP message."""
    payload = resp.raw.data
    lines = [
        bytes("HTTP/{} {} {}".format(http_ver, resp.status_code,
                                     RESPONSES[resp.status_code]), "ascii"),
    ]
    seen = set()
    for name in resp.headers:
        lowered = name.lower()
        # The body is already decoded, so transfer-encoding no longer applies.
        if lowered == "transfer-encoding":
            continue
        # A content-length that disagrees with the decoded body is stale;
        # drop it here and recalculate below.
        if lowered == "content-length" and resp.headers[name] != str(len(payload)):
            continue
        lines.append(bytes("{}: {}".format(name, resp.headers[name]), "utf-8"))
        seen.add(lowered)
    if "content-length" not in seen:
        lines.append(bytes("content-length: {}".format(len(payload)), "ascii"))
    lines.append(b"")
    lines.append(payload)
    header = warc.WARCHeader({
        "WARC-Type": "response",
        "WARC-Target-URI": resp.url,
        "WARC-Date": date.strftime("%Y-%m-%dT%H:%M:%SZ"),
    }, defaults=True)
    return warc.WARCRecord(header, payload=b"\r\n".join(lines))
Beispiel #3
0
def make_req_dummy(req, record, http_ver="1.1"):
    """Build a WARC 'request' record paired with *record* (its response)."""
    parsed = urlparse(req.url)
    request_path = parsed.path or "/"
    lines = [
        bytes("{} {} HTTP/{}".format(req.method, request_path, http_ver),
              "ascii")
    ]
    lines.extend(bytes("{}: {}".format(name, req.headers[name]), "utf-8")
                 for name in req.headers)
    lines.append(b"")
    if req.body:
        lines.append(req.body)
    header = warc.WARCHeader({
        "WARC-Type": "request",
        "WARC-Target-URI": req.url,
        # ISO 28500 Section 5.4 WARC-Date
        # > Multiple records written as part of a single capture event (see section 5.7)
        # > shall use the same WARC-Date, even though the times of their writing
        # > will not be exactly synchronized.
        "WARC-Date": record.header["WARC-Date"],
        "WARC-Concurrent-To": record.header["WARC-Record-ID"],
    }, defaults=True)
    return warc.WARCRecord(header, payload=b"\r\n".join(lines))
Beispiel #4
0
    def create(self, filename, fileobj=None, operator=None):
        """Open a new WARC file and write its leading warcinfo record.

        If *fileobj* is None an in-memory BytesIO buffer is used.
        *operator* (optional str) is recorded as the warcinfo operator field.

        :rtype: warc.WARCFile
        """
        assert useragent.POLICY is not None

        if fileobj is None:
            fileobj = io.BytesIO()

        self.fileobj = fileobj
        self.warc = warc.WARCFile(fileobj=fileobj)

        info_header = warc.WARCHeader({
            "WARC-Type": "warcinfo",
            "WARC-Filename": filename,
        }, defaults=True)
        info_lines = [
            b"software: owlbot/" + bytes(version.STR, "ascii"),
            b"format: WARC File Format 1.0",
            # policy from .OWLBOT_POLICY or os.environ["OWLBOT_POLICY"]
            b"robots: " + bytes(useragent.POLICY, "ascii"),
        ]
        if operator is not None:
            info_lines.append(b"operator: " + operator.encode("utf-8"))

        info_record = warc.WARCRecord(info_header,
                                      payload=b"\r\n".join(info_lines))
        self.warc.write_record(info_record)
Beispiel #5
0
def createWarcInfoReacord(filename):
    """Return a 'warcinfo' WARC record describing a WARCMerge output file."""
    header = warc.WARCHeader(
        {"WARC-Type": "warcinfo", "WARC-Filename": filename},
        defaults=True,
    )
    content = (
        "software: WARCMerge/1.0\r\n"
        "format: WARC File Format 1.0\r\n"
        "description:  Merging WARC files into a single one \r\n"
        "robots: ignore\r\n"
    )
    return warc.WARCRecord(header, content)
Beispiel #6
0
def update_warc_metadata_from_item(record, item):
    """update a WARC metadata record from a scrapy Item"""

    # An empty WARCHeader doubles as the field container for the payload.
    # XXX WARCHeader messes up capitalization here
    fields = warc.WARCHeader({}, defaults=False)
    for field_name, item_key in (
        ('x-crawl-depth', 'depth'),
        ('hopsFromSeed', 'hops_from_seed'),
        ('x-source-anchor', 'source_anchor'),
        ('x-source-url', 'source_url'),
    ):
        fields[field_name] = item[item_key]

    stream = BytesIO()
    fields.write_to(stream, version_line=False, extra_crlf=False)
    record.update_payload(stream.getvalue())
Beispiel #7
0
def update_warc_info_from_spider(record, spider):
    """update a WARC warcinfo record from a scrapy Spider"""

    # An empty WARCHeader doubles as the field container for the payload.
    # XXX WARCHeader messes up capitalization here
    fields = warc.WARCHeader({}, defaults=False)
    fields["software"] = "osp_scraper"
    fields["hostname"] = socket.getfqdn()
    fields["x-spider-name"] = spider.name
    fields["x-spider-run-id"] = spider.run_id
    fields["x-spider-revision"] = git_revision
    fields["x-spider-parameters"] = json.dumps(spider.get_parameters())

    stream = BytesIO()
    fields.write_to(stream, version_line=False, extra_crlf=False)
    record.update_payload(stream.getvalue())
Beispiel #8
0
def new_warc(kind):
    """return a new WARCRecord

    @arg kind: what flavor of WARC to create; see `warc.WarcHeader.CONTENT_TYPES` for flavors
    """

    # Same fields WARCHeader.init_defaults() would fill in, built by hand.
    timestamp = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
    header = warc.WARCHeader(
        {
            'WARC-Type': kind,
            'WARC-Record-ID': "<urn:uuid:%s>" % uuid.uuid1(),
            'Content-Type': warc.WARCHeader.CONTENT_TYPES[kind],
            'WARC-Date': timestamp,
        },
        defaults=False,
    )
    return warc.WARCRecord(header=header, defaults=False)
Beispiel #9
0
    def record_log(self, warc_info_id):
        """Return a WARC 'resource' record holding self.output_log as JSON.

        warc_info_id: the WARC-Record-ID of the warcinfo record this log
        belongs to (stored in WARC-Warcinfo-ID).
        """
        log_payload = json.dumps(self.output_log, ensure_ascii=False)
        # BUG FIX: the digest and Content-Length must be computed over the
        # encoded bytes. hashlib.sha1() rejects str on Python 3, and with
        # ensure_ascii=False the character count can differ from the UTF-8
        # byte count whenever the log contains non-ASCII text.
        payload_bytes = log_payload.encode('utf-8')

        log_header = {
            'Content-Length': str(len(payload_bytes)),
            'WARC-Target-URI': 'urn:X-archive-team-ftp-gov-deduplicate:log',
            'WARC-Date': time.strftime('%Y-%m-%dT%H:%M:%SZ'),
            'WARC-Block-Digest': "sha1:{}".format(
                base64.b32encode(hashlib.sha1(payload_bytes).digest()).decode()),
            'WARC-Record-ID': '<{}>'.format(uuid.uuid4().urn),
            'WARC-Warcinfo-ID': warc_info_id,
            'Content-Type': 'application/json',
            'WARC-Type': 'resource'
        }

        return warc.WARCRecord(
            header=warc.WARCHeader(log_header, defaults=False),
            payload=payload_bytes,
            defaults=False
        )
Beispiel #10
0
    def resolve_dns(self, hostname, date):
        """Resolve *hostname* through the cached resolver, archiving the
        answer as a 'response' record when the TTL check passes, and return
        one A-record address chosen at random from the cached answers.
        """
        ttl = self.robot.ctx.check_ttl(hostname)
        cache = self.robot.ctx.resolve_dns(hostname)

        if ttl:
            header = warc.WARCHeader({
                "WARC-Type": "response",
                "WARC-Target-URI": "dns:{}".format(hostname),
                "WARC-Date": date.strftime("%Y-%m-%dT%H:%M:%SZ"),
                "Content-Type": "text/dns",
            }, defaults=True)
            # RFC 2540 section 2.2 Text Format: timestamp line, then answers.
            body_lines = [cache.created_at.strftime("%Y%m%d%H%M%S")]
            body_lines += [answer.to_text() for answer in cache.answers]
            self.warc.write_record(
                warc.WARCRecord(header,
                                payload=bytes("\r\n".join(body_lines),
                                              "ascii")))

        a_records = []
        for answer in cache.answers:
            a_records.extend(entry for entry in answer.items
                             if entry.rdtype == dns.rdatatype.A)
        return str(secrets.choice(a_records))
Beispiel #11
0
    urls = [
        'https://elpais.com/', 'https://elpais.com/tag/gente/a',
        'https://politica.elpais.com/', 'https://elpais.com/internacional/'
    ]

    f = warc.open("test.warc.gz", "w")

    for u in urls:
        fp = urllib.request.urlopen(u)

        mybytes = fp.read()
        mystr = mybytes.decode("utf8")

        fp.close()

        header = h = warc.WARCHeader({"WARC-Type": "response"}, defaults=True)
        # BUG FIX: the key previously had a stray trailing colon
        # ('WARC-Target-URI:'), which produced a malformed WARC field name.
        header['WARC-Target-URI'] = u

        record = warc.WARCRecord(header, mybytes)
        f.write_record(record)

    f.close()

    for u in urls:
        f = warc.open("test_trozos.warc.gz", "a")
        fp = urllib.request.urlopen(u)

        mybytes = fp.read()
        mystr = mybytes.decode("utf8")

        fp.close()
Beispiel #12
0
    def SnapShot():
        """Extract periodic JPEG snapshots from samplethis.flv and archive
        them in new_warc_file.

        Writes two records: the ffmpeg report log as a 'resource' record and
        the gzipped tarball of snapshots as a 'conversion' record referring
        back to the truncated original; then deletes the tarball and the log.

        NOTE(review): relies on outer names: os, shlex, call, glob, warc,
        StringIO, new_warc_file, warcinfo_record_ID, metadata_record_ID,
        truncated_record_ID.
        """

        # TODO:
        # figure out length of video and develop native-resolution frame
        # sampling rate based off of this length.

        print(
            "********************* \n\n Getting snapshots. \n\n*********************"
        )

        # Route ffmpeg's report output into a log file we archive below.
        os.environ["FFREPORT"] = "file=ffmpeg-snapshots.log"

        # snapshot: fps=1/15 samples one frame every 15 seconds.
        # This is the "proper" way to handle complex command lines with lots of args
        # https://stackoverflow.com/questions/8581140/python-subprocess-call-with-arguments-having-multiple-quotations
        ffmpegsnapshotargs = shlex.split(
            "ffmpeg -i samplethis.flv -vf fps=fps=1/15 -f image2 -q:v 1 images%05d.jpg"
        )
        call(ffmpegsnapshotargs)

        print(
            "********************* \n\n Compressing snapshots. \n\n*********************"
        )

        imagelist = glob.glob("*.jpg")
        imageliststring = ' '.join(imagelist)
        tarcommand = "tar -czvf snapshots.tar.gz " + imageliststring

        # compress all the snapshots
        tarargs = shlex.split(tarcommand)
        call(tarargs)

        # delete jpgs
        rmcommand = "rm " + imageliststring
        rmargs = shlex.split(rmcommand)
        call(rmargs)

        # stop logging before building records from the finished log file
        os.environ["FFREPORT"] = ""

        # Add ffmpeg log record
        ffmpegsampleheader = warc.WARCHeader({
            "WARC-Type":
            "resource",
            "WARC-Warcinfo-ID":
            warcinfo_record_ID,
            "Content-Type":
            "text/plain",
            "WARC-Concurrent-To":
            metadata_record_ID
        })
        ffmpegsamplepayload = StringIO(
            open("ffmpeg-snapshots.log").read()).getvalue()
        ffmpegsamplerecord = warc.WARCRecord(headers=ffmpegsampleheader,
                                             payload=ffmpegsamplepayload)
        new_warc_file.write_record(ffmpegsamplerecord)

        # Add the actual snapshot record
        snapshotrecord = warc.WARCRecord(
            headers=warc.WARCHeader({
                "WARC-Type": "conversion",
                "Content-Type": "application/x-gtar",
                "WARC-Refers-To": truncated_record_ID
            }),
            payload=StringIO(open("snapshots.tar.gz").read()).getvalue())
        new_warc_file.write_record(snapshotrecord)

        # remove snapshots and log
        call(shlex.split("rm snapshots.tar.gz ffmpeg-snapshots.log"))
Beispiel #13
0
    def process(self, item):
        """Post-process a downloaded item's WARC file.

        Copies every record of %(warc_file_base)s.warc.gz into a new
        %(warc_file_base)s-POSTPROCESSED.warc.gz, truncating the large video
        ('response') record, then calls SnapShot() and ShrinkRay() to append
        'conversion' records for snapshots and a shrunken WebM.

        Expects *item* to carry item_name, item_dir and warc_file_base.
        """

        # assert that this item is flagged for sampling. If not,
        # return immediately. We don't want to butcher uploads that
        # have been determined to be worth saving in their original
        # state.
        #
        # Presumably, the tracker is tagging these items as something
        # appropriate. Alternately, one could create a "Phase 3" grab
        # and know for a fact that we are only receiving videos that
        # should be sampled. In which case, one may skip the item_type
        # check and proceed directly to sampling.

        item_name = item['item_name']
        item_type, item_value = item_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        assert item_type in ('video-bulk', 'url-bulk')

        # Item type is not marked for sampling by the tracker.
        # Carry on. Nothing to do here.
        # BUG FIX: this previously read
        #     if item_type != 'video-bulk' or 'url-bulk':
        # which is always true (the literal 'url-bulk' is truthy), so this
        # method returned immediately for every item.
        if item_type not in ('video-bulk', 'url-bulk'):
            return

        # ok. This is an item that needs to be sampled.

        # remember where we started from so we can get back there and
        # not mess up the expectations for the rest of stages in the
        # pipeline
        original_path = os.getcwd()

        # get to item_dir ; begin work
        os.chdir(item['item_dir'])

        # we will need some data from the warcfile
        warcinfo_record_ID = ""
        metadata_record_ID = ""
        truncated_record_ID = ""

        # set up old and new warc files for reading and writing, respectively.
        # If a file ends in *.gz for writing, the warc library handles gz
        # compression transparently.
        old_warc_file = warc.open("%(warc_file_base)s.warc.gz" % item)
        new_warc_file = warc.open(
            "%(warc_file_base)s-POSTPROCESSED.warc.gz" % item, "w")

        # ------------------------ Start of main for loop -------------------#

        # and here... we... go
        for record in old_warc_file:

            # Firstly, we detect whether the record we're iterating over holds
            # data we'll need later. If so, behave appropriately. After the
            # if-elif-elif dance, we proceed to copy each record into a new
            # record in the %(warc_file_base)s-POSTPROCESSED.warc.gz file,
            # modifying as necessary (truncated long records, etc)

            # ------------------------ Check for data -------------------------#

            # Grab the lengthy payload (the flv file); if the content-length is
            # longer than ~5MiB, and the record is of the "response" type, then
            # this record *probably* has the flv file.
            if ((long(record['Content-Length']) >= 5000000)
                    and record['WARC-Type'] == "response"):

                # need the record id of the original flv record. Will reference
                # it in the truncated record.
                # BUG FIX: this previously assigned to a differently-cased
                # local (truncated_record_id), leaving truncated_record_ID
                # empty for the WARC-Refers-To headers in SnapShot/ShrinkRay.
                truncated_record_ID = record['warc-record-id']

                # add "WARC-Truncated" to this record, indicating that it has
                # been truncated due to length.
                record['warc-truncated'] = "length"

                # extract the payload
                tempfile = open("intermediate.int", 'wb')
                for line in record.payload:
                    tempfile.write(line)
                tempfile.close()

                # put the payload back; iterating through record.payload
                # invokes a generator on the payload that seems to
                # "eat up" the payload in the original file. I say so because
                # attempting to, say, write the payload out twice (to TWO files)
                # will fail, as will any attempt to read out the payload again
                # without first "putting it back." (I'd love an explanation for
                # just what's going on here; but for now, this hack works)
                # (for the record with the long content-length, we end up reading
                # the payload twice; once here, to get it to a separate file, and
                # once again, in COPY PAYLOAD, to write out a truncated version to
                # the new warc file)
                # BUG FIX: this previously re-opened "intermediate.dat", a file
                # that is never written; the payload was saved above as
                # "intermediate.int".
                stream = StringIO(open("intermediate.int", 'rb').read())
                stream.seek(0, os.SEEK_END)
                streamlength = stream.tell()
                stream.seek(0)
                record.payload = warc.utils.FilePart(fileobj=stream,
                                                     length=streamlength)

                # can't close the stream yet for some reason. This might
                # introduce leaks of some sort, so keep an eye on it.
                # The relevant error: "IO Operation on a closed file."
                # I suspect this operation occurs somewhere in the warc library,
                # and i'm hoping that the stream object just falls out of scope
                # at some point other than when the entire pipeline shuts down.
                # stream.close()

            # Adjust the warcinfo record to note that we also utilized ffmpeg
            elif (record['WARC-Type'] == "warcinfo"):

                # grab the record-id for later use in resource records
                warcinfo_record_ID = record['warc-record-id']

                # gotta add another "software" key to the content-block of the
                # warcinfo record that indicates the use of ffmpeg.
                warcinfo_stream = StringIO()
                for line in record.payload:
                    warcinfo_stream.write(line)

                # trailing \r\n\r\n is already present in the payload; just seek back
                # two bytes (yes, the second \r\n will get clobbered; potential unicode
                # byte-length issues here) and then tack on the additional lines you
                # need to like so:
                warcinfo_stream.seek(-2, os.SEEK_END)
                warcinfo_stream.write("software: ffmpeg/2.3.1\r\n\r\n")
                warcinfo_stream.seek(0, os.SEEK_END)
                warcinfo_stream_len = warcinfo_stream.tell()
                warcinfo_stream.seek(0)
                record.payload = warc.utils.FilePart(
                    fileobj=warcinfo_stream, length=warcinfo_stream_len)

            # Get the metadata record's warc-record-id for later resource
            # records.
            elif (record['WARC-Type'] == "metadata"):

                metadata_record_ID = record['warc-record-id']

            # End of conditionals. Proceed to write the new record to the
            # post-processed warcfile.

            # ------------------------ Copy Record -------------------------#

            # COPY HEADER

            # Should we add defaults=False ? It seems that some additional headers
            # are added in WARCHeader as well as WARCRecord. However, they don't
            # seem harmful: digests and timestamps.
            new_header = warc.WARCHeader(record.header)

            # COPY PAYLOAD

            # if the current record gets truncated, then set the content-length
            # to the new, truncated length as per spec.
            truncated_flag = None

            # SHORT record payloads
            if long(record['content-length']) < 500000:

                new_payload = StringIO()
                for line in record.payload:
                    new_payload.write(line)
                # if we don't seek back to 0, new_payload.read() is empty
                new_payload.seek(0)

            # LONG record payloads (the one that probably has video data)
            else:

                # copy only the first few chunks of the payload; the rest is
                # truncated away (the full payload is already on disk as
                # intermediate.int)
                new_payload = StringIO()
                decrement = 25
                for line in record.payload:
                    new_payload.write(line)
                    decrement -= 1
                    if decrement == 0:
                        break
                # be kind: rewind
                new_payload.seek(0)
                truncated_flag = True

            # CREATE RECORD FROM HEADER AND PAYLOAD

            new_rec = warc.WARCRecord(payload=new_payload.read(),
                                      headers=new_header,
                                      defaults=False)

            # if this record happened to be one that got truncated, then we
            # need to adjust its content-length header.
            if truncated_flag:

                # From page 9 of the ISO WARC Standard:
                #
                # "The WARC-Truncated field may be used on any WARC record. The WARC
                # field Content-Length shall still report the actual truncated size of
                # the record block."

                # Get the length of the truncated content-block and set
                # Content-Length header appropriately
                new_payload.seek(0, os.SEEK_END)
                thelength = new_payload.tell()
                new_rec['content-length'] = str(thelength)
                new_payload.seek(0)

            # WRITE THE NEW RECORD OUT TO THE NEW WARCFILE

            # (the warc library handles the gz-compression and putting each record
            # in a separate gz "member" transparently; no need to muck with the gzip
            # library ourselves)

            new_warc_file.write_record(new_rec)

        # ------------------------ END OF MAIN FOR LOOP ------------------------#

        # reading is finished; release the source file handle
        old_warc_file.close()

        # at this point, we have a new warcfile with copied and truncated
        # records; now, we need to sample the content and add these "conversion"
        # records to the warc file.

        # Should probably delete old warc at this point, since new warcfile has all
        # of the old records, and we've already got another copy of the main
        # payload. If we proceed to write out the full newfile with the shrunken
        # payload before deleting the old warc, we'll basically be using nearly
        # 3x the interim diskspace rather than 2x. (Don't get me wrong, I'd love
        # to have more of a generator-like setup that negates the need to use
        # twice the disk space, but it's beyond the scope of my abilities at the
        # moment and I don't think I'd be able to get up to speed before the
        # deadline for this project drops (August 27 2014) Update: LOL Twitch is
        # already deleting things on August 26; oh well, I suppose this code
        # could come in handy if the IA suddenly needs to compress lots of
        # material)

        # Now, we need to convert the flv, and add conversion records

        # Our "payload.flv" is not quite an flv yet; the payload still includes the
        # HTTP Response headers. We need to grep for "CRLFCRLF" and then chop off
        # anything prior to it, including it, leaving nothing but the flv file for
        # ffmpeg to work with.
        thefile = open("intermediate.int").read()  # NOT A FILE; just a "str"
        theflv = thefile.split('\r\n\r\n')[1]
        writetheflv = open("samplethis.flv", "w")
        writetheflv.write(theflv)
        writetheflv.close()

        # Get Snapshots
        SnapShot()

        # Get shrinked video
        ShrinkRay()

        # Clean up
        print(
            "********************* \n\n Removing temporary files; cleaning up \n\n*********************"
        )
        # remove original file intermediates: "intermediate.int" and "samplethis.flv"
        rmargs = shlex.split("rm intermediate.int samplethis.flv")
        call(rmargs)

        # And we're done!
        new_warc_file.close()
        os.chdir(original_path)
                           wait=wait,
                           iterable=download_links,
                           threads=threads)
# BUG FIX: check_output leaves a trailing newline on both values; previously
# the un-stripped strings were pasted into the warcinfo body, so the
# "hostname:" and "operator:" fields ran into the next field with a bare "\n"
# instead of the WARC-style "\r\n" separator.
hostname = subprocess.check_output("hostname -f",
                                   shell=True).decode('utf-8').strip()
user = subprocess.check_output("echo $USER", shell=True).decode('utf-8').strip()

# warcinfo content block: one "name: value" field per CRLF-terminated line.
body = "robots: classic\r\nhostname: " + str(
    hostname) + "\r\nsoftware: page_downloader.py\r\nisPartOf: Cs_media\r\n"
body += "operator: " + str(
    user
) + "\r\ndescription: Downloading pages\r\npublisher: KNOT (https://knot.fit.vutbr.cz/)\r\n"
body += "format: WARC File Format 1.0\r\nconformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n"

warc_header = warc.WARCHeader(
    {
        "WARC-Type": "warcinfo",
        "WARC-Filename": settings['output'][0]
    },
    defaults=True)
warc_record = warc.WARCRecord(warc_header, body.encode())
warc_record.write_to(out)

for page in generator:
    warc_header = warc.WARCHeader(
        {
            "WARC-Type": "response",
            "WARC-Target-URI": page['url']
        },
        defaults=True)
    response = page['response']
    if not (response.endswith('\r\n\r\n')):
        response += '\r\n\r\n'