def ShrinkRay():
    """Shrink ``samplethis.flv`` to a small WebM and archive the result.

    Runs ffmpeg with its report written to ``ffmpeg-shrinking.log``, then
    writes two records to ``new_warc_file``: a "resource" record holding the
    ffmpeg log and a "conversion" record holding the WebM file.

    This function reads ``new_warc_file``, ``warcinfo_record_ID``,
    ``metadata_record_ID`` and ``truncated_record_ID``, none of which are
    defined here; they must be resolvable from the enclosing/module scope.
    """
    # TODO:
    # figure out length of video and develop number of frames to
    # drop out of every FPS interval.
    print(
        "********************* \n\n Shrinking Video. (This will take a while) \n\n*********************"
    )
    os.environ["FFREPORT"] = "file=ffmpeg-shrinking.log"

    # shrink; using the webm format at this resolution cuts the file size by
    # *about* an order of magnitude, while still maintaining more-or-less
    # perfectly crisp detail and motion. I'm thinking we don't need to drop
    # frames, and that cutting the resolution down to this ~240P-level
    # resolution is good enough.
    # We really need to check for resolution and select an output resolution
    # appropriately; this one-liner only works for 16:9 inputs
    ffmpegshrinkargs = shlex.split(
        "ffmpeg -i samplethis.flv -c:v libvpx -b:v 500K -c:a libvorbis -s 432x243 shrunken-to-webm.webm"
    )
    call(ffmpegshrinkargs)
    # The final size of snapshots and shrunken video is anywhere from a fifth
    # to a seventh of the original file size.
    os.environ["FFREPORT"] = ""

    # add ffmpeg log record
    ffmpegshrinkheader = warc.WARCHeader({
        "WARC-Type": "resource",
        "WARC-Warcinfo-ID": warcinfo_record_ID,
        "Content-Type": "text/plain",
        "WARC-Concurrent-To": metadata_record_ID
    })
    # Read through a context manager so the handle is closed promptly.
    with open("ffmpeg-shrinking.log") as logfile:
        ffmpegshrinkpayload = logfile.read()
    ffmpegshrinkrecord = warc.WARCRecord(headers=ffmpegshrinkheader,
                                         payload=ffmpegshrinkpayload)
    new_warc_file.write_record(ffmpegshrinkrecord)

    # add actual shrunken webm record; the video is binary, so read it as
    # bytes rather than in text mode.
    with open("shrunken-to-webm.webm", "rb") as webmfile:
        webmpayload = webmfile.read()
    shrinkrecord = warc.WARCRecord(
        headers=warc.WARCHeader({
            "WARC-Type": "conversion",
            "Content-Type": "video/webm",
            "WARC-Refers-To": truncated_record_ID
        }),
        payload=webmpayload)
    new_warc_file.write_record(shrinkrecord)

    # remove log file
    # NOTE(review): this also names snapshots.tar.gz and leaves
    # shrunken-to-webm.webm on disk -- confirm which files should be removed.
    call(shlex.split("rm snapshots.tar.gz ffmpeg-shrinking.log"))
def create(self, filename, fileobj=None, operator=None):
    """Start a new WARC stream and write its leading "warcinfo" record.

    :param filename: value stored in the record's WARC-Filename header.
    :param fileobj: binary file object to write to; a fresh ``io.BytesIO``
        is used when omitted.
    :param operator: optional text appended as an ``operator:`` line.

    The chosen file object is kept on ``self.fileobj`` and the
    ``warc.WARCFile`` wrapping it on ``self.warc``.
    """
    assert useragent.POLICY is not None

    target = io.BytesIO() if fileobj is None else fileobj
    self.fileobj = target
    self.warc = warc.WARCFile(fileobj=target)

    info_lines = [
        b"software: owlbot/" + bytes(version.STR, "ascii"),
        b"format: WARC File Format 1.0",
        # policy from .OWLBOT_POLICY or os.environ["OWLBOT_POLICY"]
        b"robots: " + bytes(useragent.POLICY, "ascii"),
    ]
    if operator is not None:
        info_lines.append(b"operator: " + operator.encode("utf-8"))

    info_header = warc.WARCHeader(
        {"WARC-Type": "warcinfo", "WARC-Filename": filename},
        defaults=True,
    )
    info_record = warc.WARCRecord(info_header, payload=b"\r\n".join(info_lines))
    self.warc.write_record(info_record)
def write_response(self, response):
    '''Write a Scrapy `response` object as a WARC "response" record.

    The URL is normalised with ``w3lib.url.safe_download_url``; if it is
    already a key of ``self.db`` the response is logged and skipped,
    otherwise the URL is recorded in ``self.db`` and a record containing the
    reconstructed HTTP message (status line, headers, blank line, body) is
    passed to ``self._write_record``.
    '''
    # Avoid duplicated entries
    response_url = w3lib.url.safe_download_url(response.url)
    if response_url in self.db:
        log.msg('Ignored already stored response: %s' % response_url,
                level=log.DEBUG)
        return
    self.db[response_url] = '1'

    # Create the payload string
    payload = StringIO.StringIO()
    status_reason = httplib.responses.get(response.status, '-')
    payload.write('HTTP/1.1 %d %s\r\n' % (response.status, status_reason))
    for h_name in response.headers:
        # Header lines must end in CRLF; with a bare "\n" the header block
        # never ends in "\r\n\r\n" and readers that split on that sequence
        # cannot find the body.
        payload.write('%s: %s\r\n' % (h_name, response.headers[h_name]))
    payload.write('\r\n')
    payload.write(response.body)

    headers = {
        'WARC-Type': 'response',
        'WARC-Date': WarcWriter.now_iso_format(),
        'Content-Length': str(payload.tell()),
        # NOTE(review): this is the HTTP entity's Content-Type; WARC response
        # records normally use "application/http; msgtype=response" -- confirm.
        'Content-Type': str(response.headers.get('Content-Type', '')),
        # Optional headers
        'WARC-Target-URI': response_url
    }
    record = warc.WARCRecord(payload=payload.getvalue(), headers=headers)
    self._write_record(record)
def make_req_dummy(req, record, http_ver="1.1"):
    """Build a WARC "request" record that reconstructs the HTTP request.

    :param req: request object exposing ``url``, ``method``, ``headers`` and
        ``body``.
    :param record: the already-built record for the same capture; its
        WARC-Date and WARC-Record-ID are copied into the new header.
    :param http_ver: HTTP version string used in the request line.
    :return: ``warc.WARCRecord`` whose payload is the serialised request.
    """
    parsed = urlparse(req.url)
    target = parsed.path or "/"
    if parsed.query:
        # The request target includes the query string, not just the path.
        target += "?" + parsed.query

    lines = [
        bytes("{} {} HTTP/{}".format(req.method, target, http_ver), "ascii")
    ]
    for key in req.headers:
        lines.append(bytes("{}: {}".format(key, req.headers[key]), "utf-8"))

    body = req.body or b""
    if isinstance(body, str):
        # A text body cannot be joined with the bytes lines below.
        body = body.encode("utf-8")

    # Always emit the blank line that ends the header block, even when there
    # is no body: joining [..., b"", body] yields "...\r\n\r\n<body>".
    lines.append(b"")
    lines.append(body)
    dummy = b"\r\n".join(lines)

    header = warc.WARCHeader({
        "WARC-Type": "request",
        "WARC-Target-URI": req.url,
        # ISO 28500 Section 5.4 WARC-Date
        # > Multiple records written as part of a single capture event (see section 5.7)
        # > shall use the same WARC-Date, even though the times of their writing
        # > will not be exactly synchronized.
        "WARC-Date": record.header["WARC-Date"],
        "WARC-Concurrent-To": record.header["WARC-Record-ID"],
    }, defaults=True)
    return warc.WARCRecord(header, payload=dummy)
def create_warc_from_corpus(documents, filename=None):
    """Write ``documents`` to a .warc file and return its path.

    Used mainly in tests to generate small .warc files.

    :param documents: iterable of dicts with ``"url"`` and ``"content"``
        keys and an optional ``"headers"`` dict of HTTP header name/value
        pairs.
    :param filename: destination path; a temporary ``.warc`` file is created
        when omitted.
    :return: the path that was written.
    """
    if filename is None:
        fd, filename = tempfile.mkstemp(suffix=".warc")
        os.close(fd)

    f = warc.open(filename, "w")
    try:
        for doc in documents:
            headers = "Connection: close\r\nContent-Type: text/html"
            if "headers" in doc:
                # .items() exists on both Python 2 and 3 dicts.
                headers = "\r\n".join(
                    ["%s: %s" % (k, v) for k, v in doc["headers"].items()])

            payload = "HTTP/1.1 200 OK\r\n" + headers + "\r\n\r\n" + doc["content"]

            record = warc.WARCRecord(payload=payload, headers={
                "Content-Type": "application/http; msgtype=response",
                "WARC-Type": "response",
                "WARC-Target-URI": doc["url"]
            })
            f.write_record(record)
    finally:
        # Close the output even if a document is malformed.
        f.close()

    return filename
def make_resp_dummy(resp, date, http_ver="1.1"):
    """Build a WARC "response" record that reconstructs the HTTP response.

    :param resp: response object exposing ``raw.data``, ``status_code``,
        ``headers`` and ``url``.
    :param date: datetime formatted into the WARC-Date header.
    :param http_ver: HTTP version string used in the status line.
    :return: ``warc.WARCRecord`` whose payload is the serialised response.

    ``Transfer-Encoding`` is dropped, and a ``Content-Length`` that does not
    match ``len(resp.raw.data)`` is replaced by the actual length.
    """
    body = resp.raw.data

    # Non-standard status codes are absent from RESPONSES; fall back rather
    # than raise KeyError.
    # presumably RESPONSES is a dict such as http.client.responses -- confirm
    reason = (RESPONSES.get(resp.status_code)
              or getattr(resp, "reason", None)
              or "Unknown")
    temp = [
        bytes("HTTP/{} {} {}".format(http_ver, resp.status_code, reason), "ascii"),
    ]

    applied_keys = []
    for key in resp.headers:
        lowered = key.lower()
        if lowered == "transfer-encoding":
            continue
        if lowered == "content-length" and resp.headers["content-length"] != str(len(body)):
            # recalculate decoded size below
            continue
        temp.append(bytes("{}: {}".format(key, resp.headers[key]), "utf-8"))
        applied_keys.append(lowered)

    if "content-length" not in applied_keys:
        temp.append(bytes("content-length: {}".format(len(body)), "ascii"))

    temp.append(b"")
    temp.append(body)
    dummy = b"\r\n".join(temp)

    header = warc.WARCHeader({
        "WARC-Type": "response",
        "WARC-Target-URI": resp.url,
        "WARC-Date": date.strftime("%Y-%m-%dT%H:%M:%SZ"),
    }, defaults=True)
    return warc.WARCRecord(header, payload=dummy)
def deduplicate_record(self, record):
    """Return ``record`` unchanged, or as an empty "revisit" record.

    ``self.check_record(record)`` decides: a falsy result means the record
    is copied with its payload; otherwise the header is rewritten to refer
    to the earlier record, an entry is appended to ``self.output_log`` and a
    record with an empty payload is returned.
    """
    earlier = self.check_record(record)
    header = record.header

    if not earlier:
        return warc.WARCRecord(
            header=header,
            payload=record.payload.read(),
            defaults=False
        )

    # Rewrite the header in place so it points at the earlier copy.
    header['Content-Length'] = '0'
    header['WARC-Refers-To'] = earlier['WARC-Record-ID']
    header['WARC-Refers-To-Date'] = earlier['WARC-Date']
    header['WARC-Refers-To-Target-URI'] = earlier['WARC-Target-URI']
    header['WARC-Type'] = 'revisit'
    header['WARC-Truncated'] = 'length'
    header['WARC-Profile'] = \
        'http://netpreserve.org/warc/1.0/revisit/identical-payload-digest'
    header['WARC-Payload-Digest'] = header['WARC-Block-Digest']
    del header['WARC-Block-Digest']

    duplicate_of = {
        'WARC-Record-ID': earlier['WARC-Record-ID'],
        'WARC-Target-URI': earlier['WARC-Target-URI'],
        'WARC-Date': earlier['WARC-Date'],
        'Content-Length': earlier['Content-Length']
    }
    self.output_log.append({
        'WARC-Record-ID': header['WARC-Record-ID'],
        'WARC-Target-URI': header['WARC-Target-URI'],
        'WARC-Date': header['WARC-Date'],
        'Content-Length': earlier['Content-Length'],
        'Duplicate-Of': duplicate_of
    })

    return warc.WARCRecord(header=header, payload='', defaults=False)
def createWarcInfoReacord(filename):
    """Return a "warcinfo" record describing a file merged by WARCMerge.

    ``filename`` becomes the WARC-Filename header; the payload is a fixed
    set of CRLF-terminated ``name: value`` lines.
    """
    info_fields = [
        "software: WARCMerge/1.0",
        "format: WARC File Format 1.0",
        "description:  Merging WARC files into a single one ",
        "robots: ignore",
    ]
    Content = "".join(field + "\r\n" for field in info_fields)
    H = warc.WARCHeader(
        {"WARC-Type": "warcinfo", "WARC-Filename": filename},
        defaults=True)
    return warc.WARCRecord(H, Content)
def deduplicate(self):
    """Copy ``self.input_file`` to ``self.output_file``, deduplicating.

    The first record is rewritten with the output filename, every
    "resource" record goes through ``self.deduplicate_record`` and all other
    records are copied as-is. A log record is appended, both files are
    closed and ``self.output_log`` is dumped as JSON. Depending on
    ``self.double_check`` either the input or the outputs are deleted, and
    ``self.dump_records`` is called last.
    """
    source = self.input_file
    sink = self.output_file

    first = source.read_record()
    first.header['WARC-Filename'] = self.output_filename
    warc_info_id = first.header['WARC-Warcinfo-ID']
    sink.write_record(warc.WARCRecord(
        payload=first.payload.read(),
        header=first.header,
        defaults=False
    ))

    while self.input_file_size > source.tell():
        for entry in source:
            if entry.type != 'resource':
                copied = warc.WARCRecord(
                    header=entry.header,
                    payload=entry.payload.read(),
                    defaults=False)
            else:
                copied = self.deduplicate_record(entry)
            sink.write_record(copied)

    sink.write_record(self.record_log(warc_info_id))
    source.close()
    sink.close()

    with codecs.open(self.output_log_filename, 'w') as log_out:
        json.dump(self.output_log, log_out, ensure_ascii=False, indent=4)

    verified = self.double_check(self.input_filename)
    if not verified:
        os.remove(self.output_filename)
        os.remove(self.output_log_filename)
    else:
        os.remove(self.input_filename)
    self.dump_records()
def write(self, data, headers=None):
    """Append ``data`` as a "resource" WARC record and return its locator.

    :param data: record payload; its ``len()`` is part of the locator.
    :param headers: optional mapping of extra headers. The keys
        ``subject_uri`` (default ``'xxx'``) and ``mimetype`` (default
        ``'application/octet-stream'``) are taken out and passed to the
        record separately. The caller's mapping is not modified.
    :return: string ``"<warcfilename>:<offset>:<len(data)>"`` where offset
        is whatever the writer's ``write`` returned.
    """
    warcfilename = self.get_next_warcfile()
    path = self.get_path(warcfilename, create_dirs=True)

    # Work on a copy; None replaces the former mutable default argument.
    headers = dict(headers or {})
    subject_uri = headers.pop('subject_uri', 'xxx')
    mimetype = headers.pop('mimetype', 'application/octet-stream')
    warc_record = warc.WARCRecord('resource', subject_uri, mimetype, headers, data)

    w = warc.WARCWriter(open(path, 'a'))
    try:
        offset = w.write(warc_record)
    finally:
        # Close the writer even when the write fails.
        # presumably closing the writer closes the underlying file -- confirm
        w.close()

    return '%s:%d:%d' % (warcfilename, offset, len(data))
def new_warc(kind):
    """Create an empty WARCRecord of the given type.

    @arg kind: what flavor of WARC to create; it is used as the WARC-Type
               value and as the key into `warc.WARCHeader.CONTENT_TYPES`
               for the Content-Type value.
    """
    # The fields below mirror WARCHeader.init_defaults(), which is why both
    # the header and the record are built with defaults=False.
    record_id = "<urn:uuid:%s>" % uuid.uuid1()
    content_type = warc.WARCHeader.CONTENT_TYPES[kind]
    utc_now = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())

    header = warc.WARCHeader(
        {
            'WARC-Type': kind,
            'WARC-Record-ID': record_id,
            'Content-Type': content_type,
            'WARC-Date': utc_now,
        },
        defaults=False)
    return warc.WARCRecord(header=header, defaults=False)
def process_document(self, doc):
    """Write a successfully fetched document to stdout as a WARC record.

    Documents whose ``status`` is not 200 are ignored. For the rest, the
    record is written while holding ``self.concurrency_lock``,
    ``self.crawlsize`` is increased, and when the size or time limit is
    exceeded ``self.interrupt`` is set and ``self.save_status()`` is called.
    """
    if doc.status != 200:
        return

    # presumably concurrency_lock is a threading lock (context manager) -- confirm
    with self.concurrency_lock:
        warc_record = warc.WARCRecord(payload=doc.text,
                                      headers={"WARC-Target-URI": doc.url})
        f = warc.WARCFile(fileobj=sys.stdout.buffer)
        f.write_record(warc_record)

        # sys.getsizeof reports the in-memory size of the text object.
        self.crawlsize += sys.getsizeof(doc.text) / 1000000.0
        if self.sizelimit is not None and self.crawlsize > self.sizelimit:
            self.interrupt = True
            self.save_status()
        if (self.timelimit is not None
                and time.time() - self.crawlstarts > self.timelimit):
            self.interrupt = True
            self.save_status()
def record_log(self, warc_info_id):
    """Return a "resource" record containing ``self.output_log`` as JSON.

    :param warc_info_id: value stored in the WARC-Warcinfo-ID header.
    :return: ``warc.WARCRecord`` built with ``defaults=False``; every header
        is set explicitly here.
    """
    log_payload = json.dumps(self.output_log, ensure_ascii=False)
    if not isinstance(log_payload, bytes):
        # ensure_ascii=False can yield text; Content-Length and the digest
        # must be computed over the encoded bytes.
        log_payload = log_payload.encode('utf-8')

    digest = base64.b32encode(hashlib.sha1(log_payload).digest()).decode()
    log_header = {
        'Content-Length': str(len(log_payload)),
        'WARC-Target-URI': 'urn:X-archive-team-ftp-gov-deduplicate:log',
        # The trailing "Z" means UTC, so format gmtime(), not local time.
        'WARC-Date': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()),
        'WARC-Block-Digest': "sha1:{}".format(digest),
        'WARC-Record-ID': '<{}>'.format(uuid.uuid4().urn),
        'WARC-Warcinfo-ID': warc_info_id,
        'Content-Type': 'application/json',
        'WARC-Type': 'resource'
    }
    return warc.WARCRecord(
        header=warc.WARCHeader(log_header, defaults=False),
        payload=log_payload,
        defaults=False
    )
def process_record(self, record):
    """Yield ``(domain, base64-encoded WARC record)`` for matching pages.

    Only "response" records are considered. A record matches when its HTTP
    headers contain ``Content-Type: text/html``, its body is non-blank, and
    either ``ENDPOINT_RE`` matches the headers or ``INDIEWEB_RE`` matches
    the body. Records without a header/body separator are skipped.
    """
    if record['WARC-Type'] != 'response':
        return

    # The HTTP response is defined by a specification: first part is headers
    # (metadata) and then following two CRLFs (newlines) has the response
    payload = record.payload.read()
    http_headers, separator, body = payload.partition('\r\n\r\n')
    if not separator:
        # No blank line, so no body; previously this raised ValueError while
        # unpacking the result of split().
        return

    if 'Content-Type: text/html' not in http_headers or not body.strip():
        return
    if not (ENDPOINT_RE.search(http_headers) or INDIEWEB_RE.search(body)):
        return

    # Serialise the record into memory and base64-encode the result.
    warcstr = StringIO()
    warcfile = warc.WARCFile(fileobj=warcstr, mode='w')
    warcfile.write_record(
        warc.WARCRecord(payload=payload, header=record.header))
    warcbuf = base64.b64encode(warcstr.getvalue())
    warcfile.close()

    domain = urlparse.urlparse(record['WARC-Target-URI']).netloc.lower()
    # domain = headers['Host']
    yield domain, warcbuf
def resolve_dns(self, hostname, date):
    """Resolve ``hostname`` and return one of its A records as a string.

    When ``ctx.check_ttl(hostname)`` is truthy, a "response" record with
    Content-Type ``text/dns`` describing the answers is written to
    ``self.warc``. The returned address is picked with ``secrets.choice``
    from all A-type items across the answers.
    """
    ttl = self.robot.ctx.check_ttl(hostname)
    cache = self.robot.ctx.resolve_dns(hostname)

    if ttl:
        # RFC 2540 section 2.2 Text Format
        dns_lines = [cache.created_at.strftime("%Y%m%d%H%M%S")]
        dns_lines += [answer.to_text() for answer in cache.answers]

        dns_header = warc.WARCHeader({
            "WARC-Type": "response",
            "WARC-Target-URI": "dns:{}".format(hostname),
            "WARC-Date": date.strftime("%Y-%m-%dT%H:%M:%SZ"),
            "Content-Type": "text/dns",
        }, defaults=True)
        dns_record = warc.WARCRecord(
            dns_header, payload=bytes("\r\n".join(dns_lines), "ascii"))
        self.warc.write_record(dns_record)

    a_records = [
        rdata
        for answer in cache.answers
        for rdata in answer.items
        if rdata.rdtype == dns.rdatatype.A
    ]
    return str(secrets.choice(a_records))
def SnapShot():
    """Sample still frames from ``samplethis.flv`` and archive them.

    Runs ffmpeg (report written to ``ffmpeg-snapshots.log``) to emit one
    JPEG per 15 seconds, packs every ``*.jpg`` in the current directory into
    ``snapshots.tar.gz``, deletes the JPEGs, then writes two records to
    ``new_warc_file``: a "resource" record with the ffmpeg log and a
    "conversion" record with the tarball.

    This function reads ``new_warc_file``, ``warcinfo_record_ID``,
    ``metadata_record_ID`` and ``truncated_record_ID``, none of which are
    defined here; they must be resolvable from the enclosing/module scope.
    """
    # TODO:
    # figure out length of video and develop native-resolution frame
    # sampling rate based off of this length.
    print(
        "********************* \n\n Getting snapshots. \n\n*********************"
    )
    os.environ["FFREPORT"] = "file=ffmpeg-snapshots.log"

    # snapshot
    # This is the "proper" way to handle complex command lines with lots of args
    # https://stackoverflow.com/questions/8581140/python-subprocess-call-with-arguments-having-multiple-quotations
    ffmpegsnapshotargs = shlex.split(
        "ffmpeg -i samplethis.flv -vf fps=fps=1/15 -f image2 -q:v 1 images%05d.jpg"
    )
    call(ffmpegsnapshotargs)

    print(
        "********************* \n\n Compressing snapshots. \n\n*********************"
    )
    imagelist = glob.glob("*.jpg")

    # compress all the snapshots; pass the file names as separate argv
    # entries instead of joining and re-splitting a command string, which
    # mangles names containing spaces or quotes.
    call(["tar", "-czvf", "snapshots.tar.gz"] + imagelist)

    # delete jpgs
    for imagename in imagelist:
        os.remove(imagename)

    os.environ["FFREPORT"] = ""

    # Add ffmpeg log record
    ffmpegsampleheader = warc.WARCHeader({
        "WARC-Type": "resource",
        "WARC-Warcinfo-ID": warcinfo_record_ID,
        "Content-Type": "text/plain",
        "WARC-Concurrent-To": metadata_record_ID
    })
    with open("ffmpeg-snapshots.log") as logfile:
        ffmpegsamplepayload = logfile.read()
    ffmpegsamplerecord = warc.WARCRecord(headers=ffmpegsampleheader,
                                         payload=ffmpegsamplepayload)
    new_warc_file.write_record(ffmpegsamplerecord)

    # Add the actual snapshot record; the tarball is binary data.
    with open("snapshots.tar.gz", "rb") as tarball:
        snapshotpayload = tarball.read()
    snapshotrecord = warc.WARCRecord(
        headers=warc.WARCHeader({
            "WARC-Type": "conversion",
            "Content-Type": "application/x-gtar",
            "WARC-Refers-To": truncated_record_ID
        }),
        payload=snapshotpayload)
    new_warc_file.write_record(snapshotrecord)

    # remove snapshots and log
    call(shlex.split("rm snapshots.tar.gz ffmpeg-snapshots.log"))
def process(self, item):
    """Copy the item's WARC to a "-deduplicated" WARC, collapsing repeats.

    ``item`` supplies ``item_dir`` and ``warc_file_base`` for the file
    names. "response" records whose WARC-Payload-Digest was already seen are
    rewritten as "revisit" records carrying only the HTTP header lines; all
    other records are copied with their payload.
    """
    # digest -> (WARC-Record-ID, WARC-Date, WARC-Target-URI) of first sighting
    hashes = {}
    input_filename = "%(item_dir)s/%(warc_file_base)s.warc.gz" % item
    output_filename = "%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz" % item
    warc_input = warc.WARCFile(input_filename)
    warc_input_size = os.path.getsize(input_filename)
    warc_output = warc.WARCFile(output_filename, 'w')

    # The first record is copied with the new file name and without its
    # block digest.
    # NOTE(review): the del raises KeyError if the header has no
    # WARC-Block-Digest -- confirm the input always carries one.
    info_record = warc_input.read_record()
    info_record.header[
        'WARC-Filename'] = "%(warc_file_base)s-deduplicated.warc.gz" % item
    del info_record.header['WARC-Block-Digest']
    warc_output.write_record(
        warc.WARCRecord(payload=info_record.payload.read(),
                        header=info_record.header))

    # Re-enter the iterator until the reported position reaches the file size.
    while warc_input_size > warc_input.tell():
        for record in warc_input:
            if record.type == 'response':
                # NOTE(review): .get() returns None when the digest header is
                # absent, and .split would then raise AttributeError.
                hash_ = record.header.get('WARC-Payload-Digest').split(
                    ':', 1)[1]
                if hash_ in hashes:
                    # Keep only the lines up to the first blank line.
                    headers = []
                    payload_ = record.payload.read()
                    for line in payload_.splitlines():
                        if line in ['\r\n', '\n', '']:
                            break
                        headers.append(line.strip())
                    payload = '\r\n'.join(headers) + '\r\n' * 2
                    if not ('Content-Length: 0' in payload or \
                            'content-length: 0' in payload):
                        # Turn the record into a revisit of the first sighting.
                        record.header['Content-Length'] = str(len(payload))
                        record.header['WARC-Refers-To'] = hashes[hash_][0]
                        record.header['WARC-Refers-To-Date'] = hashes[
                            hash_][1]
                        record.header['WARC-Refers-To-Target-URI'] = \
                            hashes[hash_][2]
                        record.header['WARC-Type'] = 'revisit'
                        record.header['WARC-Truncated'] = 'length'
                        record.header['WARC-Profile'] = 'http://netpreserve' \
                            '.org/warc/1.0/revisit/identical-payload-digest'
                        del record.header['WARC-Block-Digest']
                        record = warc.WARCRecord(header=record.header,
                                                 payload=payload,
                                                 defaults=False)
                    else:
                        # Header text contains a zero Content-Length: copy
                        # the original payload unchanged.
                        record = warc.WARCRecord(header=record.header,
                                                 payload=payload_,
                                                 defaults=False)
                else:
                    # First sighting of this digest: remember it and copy.
                    hashes[hash_] = (record.header.get('WARC-Record-ID'),
                                     record.header.get('WARC-Date'),
                                     record.header.get('WARC-Target-URI'))
                    record = warc.WARCRecord(header=record.header,
                                             payload=record.payload.read(),
                                             defaults=False)
            else:
                record = warc.WARCRecord(header=record.header,
                                         payload=record.payload.read(),
                                         defaults=False)
            warc_output.write_record(record)
import warc
import uuid
import sys
import os
import gzip

os.chdir('/home/eckel/')

# Load and preprocess data: copy each numbered input WARC into dataset_id/,
# giving every record a freshly generated WARC-Record-ID.
print('preprocessing')
filenameIn = sys.argv[1]
max_range = int(sys.argv[2])

for i in range(0, max_range):
    print(filenameIn + str(i))
    source_path = 'dataset/' + filenameIn + '.com' + str(i) + '.warc.gz'
    fw = warc.open('dataset_id/' + filenameIn + str(i) + '.warc.gz', 'wb')
    with gzip.open(source_path, mode='rb') as gzf:
        for record in warc.WARCFile(fileobj=gzf):
            record['WARC-Record-ID'] = str(uuid.uuid4())
            renumbered = warc.WARCRecord(payload=record.payload.read(),
                                         headers=record.header)
            fw.write_record(renumbered)
    fw.close()
import warc
import uuid
import os

os.chdir('/home/eckel/')

# Build a small test WARC: each of the two selected records is written to
# the output four times.
WANTED_IDS = (
    '2dd726fe-5f11-43c3-a02c-47860e668cac',
    '4b3e1e5f-9ac3-4619-b784-a093a1d1ac0d',
)

f = warc.open("samples/overstock_sample.warc.gz", "rb")
fw = warc.open("overstock_test.warc.gz", "wb")
count = 0
for record in f:
    if record['WARC-Record-ID'] in WANTED_IDS:
        payload = record.payload.read()
        record_header = record.header
        for _ in range(4):
            fw.write_record(
                warc.WARCRecord(payload=payload, headers=record.header))
    # A disabled branch used to copy the first two other records as well:
    #elif count < 2:
    #    payload = record.payload.read()
    #    record_header = record.header
    #    fw.write_record(warc.WARCRecord(payload=payload,headers=record.header))
    #    count += 1
f.close()
fw.close()
# Decide which record ids to keep. Each bucket entry is indexed as
# [0] = the object handed to lsh.query and [1] = its WARC record id.
for query_object in bucket:
    candidates = lsh.query(query_object[0], distance_func='cosine')
    dedup.add(query_object[1])
    for c in candidates:
        candidate_key = c[0][
            1]  # warc id is appended as extra data in lsh.index()
        if candidate_key == query_object[1]:
            # The query matched itself.
            continue
        candidate_distance = c[1]
        # Candidates at or beyond the threshold are kept; closer ones are
        # dropped from the keep-set if they were added earlier.
        if float(candidate_distance) >= threshold:
            dedup.add(candidate_key)
        elif candidate_key in dedup:
            dedup.remove(candidate_key)

# Second pass: copy every record whose id survived into one output file.
# NOTE(review): `file` shadows the Python 2 builtin of the same name.
file = warc.open(filenameIn + '_dedup.warc.gz', 'wb')
numSingle = len(dedup)
for i in range(0, max_files):
    with gzip.open(datasetPath + filenameIn + str(i) + '.warc.gz',
                   mode='rb') as gzf:
        for record in warc.WARCFile(fileobj=gzf):
            record_id = record['WARC-Record-ID']
            if record_id in dedup:
                payload = record.payload.read()
                file.write_record(
                    warc.WARCRecord(payload=payload, headers=record.header))
print 'Total pages: ' + str(doc_count)
print 'Pages after deduplication: ' + str(numSingle)
file.close()
] f = warc.open("test.warc.gz", "w") for u in urls: fp = urllib.request.urlopen(u) mybytes = fp.read() mystr = mybytes.decode("utf8") fp.close() header = h = warc.WARCHeader({"WARC-Type": "response"}, defaults=True) header['WARC-Target-URI:'] = u record = warc.WARCRecord(header, mybytes) f.write_record(record) f.close() for u in urls: f = warc.open("test_trozos.warc.gz", "a") fp = urllib.request.urlopen(u) mybytes = fp.read() mystr = mybytes.decode("utf8") fp.close() header = h = warc.WARCHeader({"WARC-Type": "response"}, defaults=True) header['WARC-Target-URI:'] = u
# Read file paths from stdin (one per line) and write each file to stdout as
# a WARC record. The target URI is taken from the file's
# "<!-- Mirrored from ... by HTTrack Website Copier" marker when present,
# and is "unknown" otherwise.
oparser = argparse.ArgumentParser(
    description=
    "Script that takes a list of file paths from HTTrack crawled folder")
options = oparser.parse_args()

reader = sys.stdin
for line in reader:
    filepath = line.strip()
    content = None
    url = None
    with open(filepath, 'rb') as content_file:
        content = content_file.read()

    # A separate name keeps the outer loop variable `line` intact.
    for content_line in content.split(b"\n"):
        if re.search(rb'<!-- Mirrored from ', content_line):
            url = re.sub(rb'.*<!-- Mirrored from ', b'',
                         re.sub(rb' by HTTrack Website Copier.*', b'',
                                content_line))
            break

    # Build the record exactly once. An extra unconditional rebuild used to
    # follow this if/else and crashed on url.decode when url was None.
    if url is None:
        warc_record = warc.WARCRecord(payload=content,
                                      headers={"WARC-Target-URI": "unknown"})
    else:
        warc_record = warc.WARCRecord(
            payload=content,
            headers={"WARC-Target-URI": url.decode("utf8")})

    f = warc.WARCFile(fileobj=sys.stdout.buffer)
    f.write_record(warc_record)
# Copy every record of `warcFile` into the already-open output `filePtr`.
# A new warcinfo record (from createWarcInfoReacord) is written once, just
# before the first copied record.
flag = 0
f = warc.WARCFile(warcFile, "rb")
try:
    for record in f:
        if flag == 0:
            R = createWarcInfoReacord(newFile)
            filePtr.write_record(R)
            flag = 1
        if ("warcinfo" in record['WARC-Type']):
            # Existing warcinfo payloads get an extra line stamped with the
            # current UTC time, so Content-Length is recomputed.
            New_Payload = record.payload.read().strip(
            ) + "\r\n" + "WARC-appended-by-WARCMerge: " + datetime.datetime.utcnow(
            ).strftime('%Y-%m-%dT%H:%M:%SZ') + "\r\n"
            record['Content-Length'] = str(len(New_Payload))
            R = warc.WARCRecord(record.header, New_Payload, defaults=False)
        else:
            R = warc.WARCRecord(payload=record.payload.read(),
                                headers=record.header,
                                defaults=False)
        filePtr.write_record(R)
    if quietMode == False:
        print '[Yes]' + warcFile
except Exception as e:
    # NOTE(review): every error is swallowed here; records written before
    # the failure stay in the output and `e` is never reported.
    #print("Exceptionq: %s"%(str(e)))
    if quietMode == False:
        print '[No]' + warcFile
    pass
# NOTE(review): `f` is not closed anywhere in this fragment.
filePtr.close()
outputFileSize = os.path.getsize(newFileFullPath) / forConvertToMB
def process(self, item):
    """Post-process an item's WARC: truncate the video record and add samples.

    Copies every record of ``<warc_file_base>.warc.gz`` into
    ``<warc_file_base>-POSTPROCESSED.warc.gz``, cutting long payloads down to
    their first 25 lines, then extracts the saved payload to
    ``samplethis.flv`` and calls ``SnapShot()`` and ``ShrinkRay()`` to add
    conversion records.

    ``item`` must provide ``item_name`` (``"<type>:<value>"``), ``item_dir``
    and ``warc_file_base``; ``item_type`` and ``item_value`` are written
    back into it.

    NOTE(review): as written, the guard below returns for every item, so
    none of the sampling code runs. See the review notes in the body before
    enabling it.
    """
    # assert that this item is flagged for sampling. If not,
    # return immediately. We don't want to butcher uploads that
    # have been determined to be worth saving in their original
    # state.
    #
    # Presumably, the tracker is tagging these items as something
    # appropriate. Alternately, one could create a "Phase 3" grab
    # and know for a fact that we are only receiving videos that
    # should be sampled. In which case, one may skip the item_type
    # check and proceed directly to sampling.
    item_name = item['item_name']
    item_type, item_value = item_name.split(':', 1)
    item['item_type'] = item_type
    item['item_value'] = item_value

    assert item_type in ('video-bulk', 'url-bulk')

    # Item type is not marked as "video-bulk" from tracker.
    # Carry on. Nothing to do here.
    # NOTE(review): `x != 'video-bulk' or 'url-bulk'` is always true because
    # the non-empty string 'url-bulk' is truthy, so this returns for every
    # item. Decide whether the intent was `item_type != 'video-bulk'` or
    # `item_type not in (...)` before fixing.
    if item_type != 'video-bulk' or 'url-bulk':
        return

    # ok. This is an item that needs to be sampled.

    # remember where we started from so we can get back there and
    # not mess up the expectations for the rest of stages in the
    # pipeline
    original_path = os.getcwd()

    # get to item_dir ; begin work
    os.chdir(item['item_dir'])

    # we will need some data from the warcfile
    # NOTE(review): these are locals of this method, yet SnapShot() and
    # ShrinkRay() are called without arguments further down -- confirm how
    # they are meant to see these values and new_warc_file.
    warcinfo_record_ID = ""
    metadata_record_ID = ""
    truncated_record_ID = ""

    # set up old and new warc files for reading and writing, respectively.
    # If a file ends in *.gz for writing, the warc library handles gz
    # compression transparently.
    old_warc_file = warc.open("%(warc_file_base)s.warc.gz" % item)
    new_warc_file = warc.open(
        "%(warc_file_base)s-POSTPROCESSED.warc.gz" % item, "w")

    # ------------------------ Start of main for loop -------------------#

    # and here... we... go
    for record in old_warc_file:

        # Firstly, we detect whether the record we're iterating over holds
        # data we'll need later. If so, behave appropriately. After the
        # if-elif-elif dance, we proceed to copy each record into a new
        # record in the %(warc_file_base)s-POSTPROCESSED.warc.gz file,
        # modifying as necessary (truncated long records, etc)

        # ------------------------ Check for data -------------------------#

        # Grab the lengthy payload (the flv file); if the content-length is
        # longer than ~5MiB, and the record is of the "response" type, then
        # this record *probably* has the flv file.
        if ((long(record['Content-Length']) >= 5000000)
                and record['WARC-Type'] == "response"):

            # need the record id of the original flv record. Will reference
            # it in truncated record.
            # NOTE(review): this assigns truncated_record_id (lowercase
            # "id"); the variable initialised above is truncated_record_ID,
            # which therefore stays "".
            truncated_record_id = record['warc-record-id']

            # add "WARC-Truncated" to this record, indicating that it has
            # been truncated due to length.
            record['warc-truncated'] = "length"

            # extract the payload
            tempfile = open("intermediate.int", 'wb')
            for line in record.payload:
                tempfile.write(line)
            tempfile.close()

            # put the payload back; iterating through record.payload
            # consumes it, so a second read (the COPY PAYLOAD step below)
            # would find nothing unless a fresh file-like object is
            # installed. For the long record we therefore read the payload
            # twice: once here to save it to a separate file, and once
            # below to write the truncated copy to the new warc file.
            # NOTE(review): the payload was written to "intermediate.int"
            # but is read back from "intermediate.dat" -- the names differ.
            stream = StringIO(open("intermediate.dat", 'rb').read())
            stream.seek(0, os.SEEK_END)
            streamlength = stream.tell()
            stream.seek(0)
            record.payload = warc.utils.FilePart(fileobj=stream,
                                                 length=streamlength)

            # The stream is deliberately left open: record.payload still
            # reads from it in the copy step below, and closing it here
            # produced "IO Operation on a closed file."
            # stream.close()

        # Adjust the warcinfo record to note that we also utilized ffmpeg
        elif (record['WARC-Type'] == "warcinfo"):

            # grab the record-id for later use in resource records
            warcinfo_record_ID = record['warc-record-id']

            # gotta add another "software" key to the content-block of the
            # warcinfo record that indicates the use of ffmpeg.
            warcinfo_stream = StringIO()
            for line in record.payload:
                warcinfo_stream.write(line)

            # trailing \r\n\r\n is already present in the payload; just seek back
            # two bytes (yes, the second \r\n will get clobbered; potential unicode
            # byte-length issues here) and then tack on the additional lines you
            # need to like so:
            warcinfo_stream.seek(-2, os.SEEK_END)
            warcinfo_stream.write("software: ffmpeg/2.3.1\r\n\r\n")
            warcinfo_stream.seek(0, os.SEEK_END)
            warcinfo_stream_len = warcinfo_stream.tell()
            warcinfo_stream.seek(0)
            record.payload = warc.utils.FilePart(
                fileobj=warcinfo_stream, length=warcinfo_stream_len)

        # Get the metadata record's warc-record-id for later resource
        # records.
        elif (record['WARC-Type'] == "metadata"):
            metadata_record_ID = record['warc-record-id']

        # End of conditionals. Proceed to write the new record to the
        # post-processed warcfile.

        # ------------------------ Copy Record -------------------------#

        # COPY HEADER
        # Should we add defaults=False ? It seems that some additional headers
        # are added in WARCHeader as well as WARCRecord. However, they don't
        # seem harmful: digests and timestamps.
        new_header = warc.WARCHeader(record.header)

        # COPY PAYLOAD
        # if the current record gets truncated, then set the content-length
        # to the new, truncated length as per spec.
        truncated_flag = None

        # SHORT record payloads
        # NOTE(review): this threshold is 500000 while the detection above
        # uses 5000000, so records in between are truncated here without
        # being saved or marked WARC-Truncated -- confirm the intended value.
        if long(record['content-length']) < 500000:
            #print "Copying payload..."
            new_payload = StringIO()
            for line in record.payload:
                new_payload.write(line)
            #if we don't seek back to 0, new_payload.read() is empty
            new_payload.seek(0)
            #print "Done copying payload."

        # LONG record payloads (the one that probably has video data)
        else:
            #print "Found long content-length. Truncating..."
            new_payload = StringIO()
            decrement = 25
            # Keep only the first 25 lines of the payload.
            #print "Gonna grab some lines. Decrement: ", decrement
            for line in record.payload:
                #print "Grabbing a line."
                new_payload.write(line)
                decrement -= 1
                #print "Decrement: ", decrement
                if decrement == 0:
                    break
            # be kind: rewind
            new_payload.seek(0)
            truncated_flag = True
            #print "Done truncating."

        # CREATE RECORD FROM HEADER AND PAYLOAD
        new_rec = warc.WARCRecord(payload=new_payload.read(),
                                  headers=new_header,
                                  defaults=False)

        # if this record happened to be one that got truncated, then we
        # need to adjust its content-length header.
        if truncated_flag:
            #print "Adjusting content-length header"

            # From page 9 of the ISO WARC Standard:
            #
            # "The WARC-Truncated field may be used on any WARC record. The WARC
            # field Content-Length shall still report the actual truncated size of
            # the record block."

            # Get the length of the truncated content-block and set
            # Content-Length header appropriately
            new_payload.seek(0)
            new_payload.seek(0, os.SEEK_END)
            thelength = new_payload.tell()
            new_rec['content-length'] = str(thelength)
            new_payload.seek(0)

        # WRITE THE NEW RECORD OUT TO THE NEW WARCFILE

        # (the warc library handles the gz-compression and putting each record
        # in a separate gz "member" transparently; no need to muck with the gzip
        # library ourselves)

        #print "Copying record to new .warc.gz"
        new_warc_file.write_record(new_rec)
        #print "Done copying record to new .warc.gz"
        #print "\n\n"

    #------------------------ END OF MAIN FOR LOOP ------------------------#

    # at this point, we have a new warcfile with copied and truncated
    # records; now, we need to sample the content and add these "conversion"
    # records to the warc file.

    # Should probably delete old warc at this point, since new warcfile has all
    # of the old records, and we've already got another copy of the main
    # payload. If we proceed to write out the full newfile with the shrunken
    # payload before deleting the old warc, we'll basically be using nearly
    # 3x the interim diskspace rather than 2x. A streaming, generator-like
    # setup would avoid holding two copies on disk, but was out of scope for
    # the original August 2014 deadline.

    # Now, we need to convert the flv, and add conversion records

    # Our "payload.flv" is not quite an flv yet; the payload still includes the
    # HTTP Response headers. We need to grep for "CRLFCRLF" and then chop off
    # anything prior to it, including it, leaving nothing but the flv file for
    # ffmpeg to work with.
    # NOTE(review): split() without a maxsplit keeps only the text between
    # the first and second CRLFCRLF, so a video containing that byte
    # sequence is cut short; both files are also opened in text mode.
    thefile = open("intermediate.int").read()  # NOT A FILE; just a "str"
    theflv = thefile.split('\r\n\r\n')[1]
    writetheflv = open("samplethis.flv", "w")
    writetheflv.write(theflv)
    writetheflv.close()

    # Get Snapshots
    SnapShot()

    # Get shrinked video
    ShrinkRay()

    # Clean up
    print(
        "********************* \n\n Removing temporary files; cleaning up \n\n*********************"
    )
    # remove original file intermediates: "intermediate.int" and "samplethis.flv"
    rmargs = shlex.split("rm intermediate.int samplethis.flv")
    call(rmargs)

    # And we're done!
    # NOTE(review): old_warc_file is never closed.
    new_warc_file.close()
    os.chdir(original_path)
# Build the warcinfo payload: CRLF-terminated "name: value" lines.
# `echo` output ends with a newline, so strip it and terminate each field
# explicitly; the hostname and operator lines used to rely on whatever
# trailing newline their values happened to carry.
user = subprocess.check_output("echo $USER", shell=True).decode('utf-8').strip()
body = "robots: classic\r\nhostname: " + str(
    hostname).strip() + "\r\nsoftware: page_downloader.py\r\nisPartOf: Cs_media\r\n"
body += "operator: " + str(
    user
) + "\r\ndescription: Downloading pages\r\npublisher: KNOT (https://knot.fit.vutbr.cz/)\r\n"
body += "format: WARC File Format 1.0\r\nconformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n"

warc_header = warc.WARCHeader(
    {
        "WARC-Type": "warcinfo",
        "WARC-Filename": settings['output'][0]
    },
    defaults=True)
warc_record = warc.WARCRecord(warc_header, body.encode())
warc_record.write_to(out)

# One "response" record per downloaded page: the stored response text,
# terminated by a blank line, followed by the page content.
for page in generator:
    warc_header = warc.WARCHeader(
        {
            "WARC-Type": "response",
            "WARC-Target-URI": page['url']
        },
        defaults=True)
    response = page['response']
    if not (response.endswith('\r\n\r\n')):
        response += '\r\n\r\n'
    warc_record = warc.WARCRecord(warc_header,
                                  (response + page['content']).encode(
                                      'utf-8', 'replace'))
import gzip

# Command line: <domain> <max>. The visible part of the loop loads all
# records of the domain's sample file and starts picking records at random.
domain = str(sys.argv[1])
# NOTE(review): `max` shadows the builtin of the same name.
max = int(sys.argv[2])
# NOTE(review): under Python 2 (this file uses print statements) this is
# integer division.
per_warc = 100 / max
filename_sample = '../' + domain + '_sample.warc.gz'
for i in range(0, max):
    count = 0
    # filename = '../dataset/'+domain+'.com'+str(i)+'.warc.gz'
    # NOTE(review): the active filename does not depend on `i`, and it is the
    # same path as filename_sample, which is opened for appending below.
    filename = '../samples/' + domain + '_sample.warc.gz'
    print 'Load' + filename
    try:
        with gzip.open(filename, 'rb') as gfz:
            ''' Load file '''
            # Materialise every record with its payload read into memory.
            contents = [(warc.WARCRecord(payload=record.payload.read(),
                                         headers=record.header))
                        for record in warc.WARCFile(fileobj=gfz)]
            l = len(contents)
    # NOTE(review): bare except skips the file silently on any error.
    except:
        continue
    ''' select records randomly '''
    print 'select'
    f_sample = warc.open(filename_sample, 'a')
    while count < per_warc:
        rand = random.randint(0, l - 1)
        sys.stdout.write("\rRecord count %i" % count)
        sys.stdout.flush()
        r = contents[rand]
        #pre = preprocessing.HTMLPreprocessing(r.payload)
        payload = r.payload
        r['Content-Length'] = str(len(payload))