def mapper(self, _, line):
    # Connect to Amazon S3 using anonymous credentials and stream the WARC file
    conn = boto.connect_s3(anon=True)
    pds = conn.get_bucket('aws-publicdatasets')
    k = Key(pds, line)
    f = warc.WARCFile(fileobj=GzipStreamFile(k))
    for i, record in enumerate(f):
        if record['Content-Type'] == 'application/http; msgtype=response':
            payload = record.payload.read()
            # HTTP responses separate the headers from the body with a blank line
            headers, body = payload.split('\r\n\r\n', 1)
            tech = []
            tech += Detector().check_script(body)
            tech += Detector().check_html(body)
            data = {
                "tech": tech,
                "url": record.url,
                "date": record.date,
                "domain": urlparse(record.url).netloc
            }
            yield data, 1
def mapper(self, _, line): f = None """ if self.options.runner in ['inline']: print self.options.runner + "lol" print 'Loading local file {}'.format(line) f = warc.WARCFile(fileobj=gzip.open(line)) else: """ conn = boto.connect_s3(anon=True) pds = conn.get_bucket('aws-publicdatasets') k = Key(pds, line) f = warc.WARCFile(fileobj=GzipStreamFile(k)) for i, record in enumerate(f): if record['Content-Type'] == 'application/http; msgtype=response': payload = record.payload.read() headers, body = payload.split('\r\n\r\n', 1) email = "email" p = re.compile(EMAIL_REGEX) emails = [i for i in re.findall(p, body) if len(i) < 50] emails = set(emails) domain = urlparse(record.url).netloc for email in emails: yield { "url": record.url, "date": record.date, "email": email, "domain": domain }, 1
def open_dump(self):
    """ Returns a file-like object for the dump """
    if config["TESTDATA"] == "1":
        return open(self.dump_testdata, "rb")
    else:
        hdr = {
            'User-Agent': 'Mozilla/5.0 (compatible; commonBot; +https://about.commonsearch.org)'
        }
        req = urllib2.Request(self.dump_url, headers=hdr)
        f = urllib2.urlopen(req)
        if self.dump_compression == "zip":
            file_name = self.dump_compression_params[0]
            # TODO: is there a more efficient way of doing this? the file object
            # passed to ZipFile needs to support .seek()
            zfile = zipfile.ZipFile(StringIO.StringIO(f.read()))
            return StringIO.StringIO(zfile.read(file_name))
        elif self.dump_compression == "gz":
            f.__dict__["closed"] = False  # Hack for GzipStreamFile
            return GzipStreamFile(f)
        else:
            return f
def mapWat(self, _, line):
    ''' Takes partial WARC paths and produces (hostname, {links}) pairs '''
    if self.options.localsource:
        # Stream data from local file
        # this lets us use pre-downloaded *.gz files for testing rather than
        # hammering the amazon servers.
        fpath = os.path.abspath(os.path.join(self.options.localsource, line))
        print('Loading local file: ' + fpath)
        rawstream = open(fpath, 'rb')
    else:
        # Stream data from common crawl servers
        conn = boto.connect_s3(anon=True, host='s3.amazonaws.com')
        pds = conn.get_bucket('commoncrawl')
        rawstream = boto.s3.key.Key(pds, line)

    # iterate through records in warc.wat.gz file
    warcstream = warc.WARCFile(fileobj=GzipStreamFile(rawstream))
    for i, record in enumerate(warcstream):
        if record['Content-Type'] == 'application/json':
            payload = record.payload.read()
            jsonPayload = json.loads(payload)
            hostlinks = self.watHostLinks(jsonPayload)
            if hostlinks:
                yield hostlinks
        if self.options.localsource and i % 10000 == 0:
            print('Record %5dk' % (i / 1000))
        self.increment_counter('commoncrawl', 'processed_records', 1)
    rawstream.close()
def open_warc_file(filename, from_commoncrawl=True):
    """ Opens a WARC file from local-data or S3 for Common Crawl files """
    local_data_file = os.path.join(config["PATH_BACK"], 'local-data/%s' % filename)

    if not from_commoncrawl:
        filereader = open(filename, "rb")
    elif os.path.isfile(local_data_file):
        filereader = open(local_data_file, "rb")
    else:
        conn = boto.s3.connect_to_region(
            "us-east-1",
            anon=True,
            calling_format=boto.s3.connection.OrdinaryCallingFormat(),
            is_secure=False)
        pds = conn.get_bucket('aws-publicdatasets')
        filereader = Key(pds)
        filereader.key = filename

    if filename.endswith(".warc"):
        return warc.WARCFile(fileobj=filereader)
    else:
        # TODO: investigate how we could use cloudflare's zlib
        return warc.WARCFile(fileobj=GzipStreamFile(filereader))
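# A minimal usage sketch for open_warc_file() above; the segment path is
# illustrative only (take a real key from a current warc.paths listing):
warc_path = 'crawl-data/CC-MAIN-2016-50/segments/.../warc/....warc.gz'
warc_file = open_warc_file(warc_path)
for record in warc_file:
    if record['WARC-Type'] == 'response':
        print(record['WARC-Target-URI'])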
def get_records(id_, iterator):
    conn = boto.connect_s3(anon=True, host='s3.amazonaws.com')
    bucket = conn.get_bucket('commoncrawl')
    for uri in iterator:
        key_ = Key(bucket, uri)
        _file = warc.WARCFile(fileobj=GzipStreamFile(key_))
        for record in _file:
            if record['Content-Type'] == 'application/json':
                record = json.loads(record.payload.read())
                try:
                    def cc_filter(x):
                        return "creativecommons.org" in x['url']

                    cc_links = filter(
                        cc_filter,
                        list(record['Envelope']['Payload-Metadata']
                             ['HTTP-Response-Metadata']['HTML-Metadata']
                             ['Links']))
                    if len(cc_links) > 0:
                        yield record
                except KeyError:
                    pass
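# A sketch of how get_records() above might be wired into PySpark; the
# application name and the local 'wat.paths' input file are assumptions.
from pyspark import SparkContext

sc = SparkContext(appName='cc-license-scan')
# one WAT key per line, e.g. crawl-data/CC-MAIN-.../wat/....warc.wat.gz
wat_paths = sc.textFile('wat.paths')
# mapPartitionsWithIndex passes (partition_index, iterator),
# matching get_records' (id_, iterator) signature
cc_records = wat_paths.mapPartitionsWithIndex(get_records)
print(cc_records.count())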
def map_warc_files(self, _, line):
    """Mapper function to process each WARC file.

    Args:
        line: Each line is a path to a WARC gz file to be processed.

    Returns:
        Generator of (key, value) tuples.
    """
    f = None
    # If we are on EC2 or running on a Hadoop cluster, pull files via S3
    if self.options.runner in ['emr', 'hadoop']:
        # Connect to Amazon S3.
        s3 = boto3.resource('s3')
        obj = s3.Object('commoncrawl', line)
        # Hack to get the raw stream out of obj:
        # http://stackoverflow.com/questions/7624900/how-can-i-use-boto-to-stream-a-file-out-of-amazon-s3-to-rackspace-cloudfiles
        f = warc.WARCFile(fileobj=GzipStreamFile(obj.get()['Body']._raw_stream))
    # If we are local, use files on the local file system
    else:
        line = Path.join(Path.abspath(Path.dirname(__file__)), line)
        print 'Loading local file {}'.format(line)
        f = warc.WARCFile(fileobj=gzip.open(line))

    # For each WARC record:
    for i, record in enumerate(f):
        for key, value in self.process_warc_record(record):
            yield key, value
        self.increment_counter('commoncrawl', 'num-files', 1)
def _warc_reader_from_file(self, filereader, filepath):
    """ Creates a WARC record iterator from a file reader """
    if filepath.endswith(".warc"):
        return warc.WARCFile(fileobj=filereader)
    else:
        # TODO: investigate how we could use cloudflare's zlib
        return warc.WARCFile(fileobj=GzipStreamFile(filereader))
def mapper(self, _, line): """ The map will download the file from commoncrawl, parse the file into multiple records, and process each record """ self.start_time = time.time() # Connect to Amazon S3 using anonymous credentials boto_config = botocore.client.Config( signature_version=botocore.UNSIGNED, read_timeout=180, retries={'max_attempts': 20}) s3client = boto3.client('s3', config=boto_config) # Check bucket existence try: s3client.head_bucket(Bucket='commoncrawl') except botocore.exceptions.ClientError as exception: LOG.error('Failed to access bucket "commoncrawl": %s', exception) return # Check if the input exists try: s3client.head_object(Bucket='commoncrawl', Key=line) except botocore.client.ClientError as exception: LOG.error('Input not found: %s', line) return # Download input sys.stderr.write("Downloading s3://commoncrawl/{}\n".format(line)) sys.stderr.write( time.strftime( "Download [START]. Distance from initial time: %Hh:%Mm:%Ss\n", time.gmtime(time.time() - self.start_time))) try: temp = TemporaryFile(mode='w+b', dir=self.options.s3_local_temp_dir) s3client.download_fileobj('commoncrawl', line, temp) except botocore.client.ClientError as exception: LOG.error('Failed to download %s: %s', line, exception) return sys.stderr.write( time.strftime( "Download [FINISHED]. Distance from initial time: %Hh:%Mm:%Ss\n", time.gmtime(time.time() - self.start_time))) temp.seek(0) ccfile = warc.WARCFile(fileobj=(GzipStreamFile(temp))) sys.stderr.write('Attempting MapReduce Job......\n') sys.stderr.write( time.strftime( "Processing [START]. Distance from initial time: %Hh:%Mm:%Ss\n", time.gmtime(time.time() - self.start_time))) for _i, record in enumerate(ccfile): for key, value in self.process_record(record): yield key, value self.increment_counter('commoncrawl', 'processed_records', 1) sys.stderr.write( time.strftime( "Processing [FINISHED]. Distance from initial time: %Hh:%Mm:%Ss\n", time.gmtime(time.time() - self.start_time)))
def mapper(self, _, line): """ The Map of MapReduce If you're using Hadoop or EMR, it pulls the Common Crawl files from S3, otherwise it pulls from the local filesystem. Dispatches each file to `process_record`. """ # If we're on EC2 or running on a Hadoop cluster, pull files via S3 if self.options.runner in ['emr', 'hadoop']: # Connect to Amazon S3 using anonymous credentials boto_config = botocore.client.Config( signature_version=botocore.UNSIGNED, read_timeout=180, retries={'max_attempts': 20}) if self.options.bucket != 'commoncrawl': # use defaults if data is read from a custom bucket boto_config = botocore.client.Config() s3client = boto3.client('s3', config=boto_config) # Verify bucket try: s3client.head_bucket(Bucket=self.options.bucket) except botocore.exceptions.ClientError as exception: LOG.error('Failed to access bucket "%s": %s', self.options.bucket, exception) return # Check whether WARC/WAT/WET input exists try: s3client.head_object(Bucket=self.options.bucket, Key=line) except botocore.client.ClientError as exception: LOG.error('Input not found: %s', line) return # Start a connection to one of the WARC/WAT/WET files LOG.info('Loading s3://%s/%s', self.options.bucket, line) try: temp = TemporaryFile(mode='w+b', dir=self.options.s3_local_temp_dir) s3client.download_fileobj(self.options.bucket, line, temp) except botocore.client.ClientError as exception: LOG.error('Failed to download %s: %s', line, exception) return temp.seek(0) ccfile = warc.WARCFile(fileobj=(GzipStreamFile(temp))) # If we're local, use files on the local file system else: line = Path.join(Path.abspath(Path.dirname(__file__)), line) LOG.info('Loading local file %s', line) ccfile = warc.WARCFile(fileobj=gzip.open(line)) for _i, record in enumerate(ccfile): for key, value in self.process_record(record): yield key, value self.increment_counter('commoncrawl', 'processed_records', 1)
def process_paths(self, id_, paths):
    ''' connect to s3 and get the data '''
    conn = boto.connect_s3(anon=True, host='s3.amazonaws.com')
    bucket = conn.get_bucket('commoncrawl')
    for uri in paths:
        key_ = Key(bucket, uri)
        archive_iterator = warc.WARCFile(fileobj=GzipStreamFile(key_))
        for record in archive_iterator:
            for res in self.process_record(record):
                yield res
def parse_archive(self, line):
    # Connect to Amazon S3 using anonymous credentials
    conn = boto.connect_s3(anon=True)
    pds = conn.get_bucket('aws-publicdatasets')
    # Start a connection to one of the WARC files
    k = Key(pds, line)
    f = warc.WARCFile(fileobj=GzipStreamFile(k))
    for record in f:
        if record['Content-Type'] != 'application/http; msgtype=response':
            continue
        self.doc_q.put(record.payload.read())
        self.count += 1
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description=__name__)
    parser.add_argument('--source-url', '-u',
                        help='Remote URL to read input WARC file from.')
    parser.add_argument('--source-file', '-f',
                        help='Local path to read input WARC file from.')
    args = parser.parse_args()

    # Validate arguments
    if not (args.source_file or args.source_url):
        parser.error("--source-file or --source-url argument must be provided.")

    if args.source_file is not None:
        source_string = args.source_file
        cf = open(args.source_file)
    elif args.source_url is not None:
        source_string = args.source_url
        # Open a connection pool
        http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                                   ca_certs=certifi.where())
        # Open a streaming connection to the specified URL
        cf = http.request('GET', args.source_url, preload_content=False)

    # Wrap the filestream in a streamable unzipper
    f = warc.WARCFile(fileobj=GzipStreamFile(cf))

    warc_responses = 0
    start_time = arrow.utcnow()

    for record in f:
        if record['WARC-Type'] == 'response':
            warc_responses = warc_responses + 1

    end_time = arrow.utcnow()
    elapsed_time = end_time - start_time
    print("{} response records in file {} ({})".format(
        warc_responses, source_string, elapsed_time))
def mapper(self, _, line):
    f = None
    ## If we're on EC2 or running on a Hadoop cluster, pull files via S3
    if self.options.runner in ['emr', 'hadoop']:
        # Connect to Amazon S3 using anonymous credentials
        conn = boto.connect_s3(anon=True)
        pds = conn.get_bucket('aws-publicdatasets')
        # Start a connection to one of the WARC files
        k = Key(pds, line)
        f = warc.WARCFile(fileobj=GzipStreamFile(k))
    ## If we're local, use files on the local file system
    else:
        print 'Loading local file {}'.format(line)
        f = warc.WARCFile(fileobj=gzip.open(line))
    ###
    for i, record in enumerate(f):
        for key, value in self.process_record(record):
            yield key, value
        self.increment_counter('commoncrawl', 'processed_records', 1)
def get_partial_warc_file(url):
    """
    We use the incredible gzipstream module because of limitations with the
    WARC Python module. Seriously, if this module didn't exist, the task
    would have been impossible. Thanks a lot to the creator
    https://github.com/commoncrawl/gzipstream
    return: warc.WARCFile instance
    """
    conn = boto.connect_s3(anon=True)
    pds = conn.get_bucket('aws-publicdatasets')
    # Start a connection to one of the WARC files
    k = Key(pds)
    k.key = url
    wf = warc.WARCFile(fileobj=GzipStreamFile(k))

    for num, record in enumerate(wf):
        try:
            print 'On Record {0}'.format(num)
            payload = record.payload.read()
            if payload[0] == "{":
                r = json.loads(payload)
                description = ''
                title = r['Envelope']['Payload-Metadata'][
                    'HTTP-Response-Metadata']['HTML-Metadata']['Head'][
                        'Title'].encode('utf-8')
                for x in r['Envelope']['Payload-Metadata'][
                        'HTTP-Response-Metadata']['HTML-Metadata']['Head'][
                            'Metas']:
                    if x['name'] == 'description':
                        description = x['content']
                uri = r['Envelope']['WARC-Header-Metadata']['WARC-Target-URI']
                with open('output.csv', 'ab') as f:
                    writer = csv.writer(f)
                    writer.writerow([title, description, uri])
        except Exception:
            # Skip records with missing or malformed metadata
            pass
def mapper(self, _, line):
    ## Connect to Amazon S3 using anonymous credentials
    conn = boto.connect_s3(anon=True)
    pds = conn.get_bucket('aws-publicdatasets')
    ## Start a connection to one of the WARC files
    k = Key(pds, line)
    f = warc.WARCFile(fileobj=GzipStreamFile(k))
    ###
    for i, record in enumerate(f):
        # WARC records have three different types:
        #   ["application/warc-fields", "application/http; msgtype=request",
        #    "application/http; msgtype=response"]
        # We're only interested in the HTTP responses
        if record['Content-Type'] == 'application/http; msgtype=response':
            payload = record.payload.read()
            # The HTTP response is defined by a specification: first part is
            # headers (metadata) and then following two CRLFs (newlines) has
            # the data for the response
            headers, body = payload.split('\r\n\r\n', 1)
            if 'Content-Type: text/html' in headers:
                # We avoid creating a new Counter for each page as that's
                # actually quite slow
                tag_count = get_tag_count(body)
                for tag, count in tag_count.items():
                    yield tag, count
                self.increment_counter('commoncrawl', 'processed_pages', 1)
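# `get_tag_count` is not shown above; a minimal hypothetical version could
# count opening HTML tag names with a regex and collections.Counter (the
# pattern below is an assumption, not the original implementation):
import re
from collections import Counter

HTML_TAG_PATTERN = re.compile(r'<([a-zA-Z][a-zA-Z0-9]*)\b')

def get_tag_count(body):
    # Return a mapping of tag name -> number of occurrences in the page body
    return Counter(tag.lower() for tag in HTML_TAG_PATTERN.findall(body))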
def mapper(self, _, line): """ Override default mapper. Not yielding anything """ # Connect to Amazon S3 using anonymous credentials boto_config = botocore.client.Config( signature_version=botocore.UNSIGNED, read_timeout=180, retries={'max_attempts': 20}) s3client = boto3.client('s3', config=boto_config) # Check if the bucket exist try: s3client.head_bucket(Bucket='commoncrawl') except botocore.exceptions.ClientError as exception: LOG.error('Failed to access bucket "commoncrawl": %s', exception) return # Check if the input exists try: s3client.head_object(Bucket='commoncrawl', Key=line) except botocore.client.ClientError as exception: LOG.error('Input not found: %s', line) return # Download input files LOG.info('Downloading s3://commoncrawl/%s', line) try: temp = TemporaryFile(mode='w+b', dir=self.options.s3_local_temp_dir) s3client.download_fileobj('commoncrawl', line, temp) except botocore.client.ClientError as exception: LOG.error('Failed to download %s: %s', line, exception) return temp.seek(0) ccfile = warc.WARCFile(fileobj=(GzipStreamFile(temp))) LOG.info('Attempting MapReduce Job......') for _i, record in enumerate(ccfile): #don't yield the result self.process_record(record) self.increment_counter('commoncrawl', 'processed_records', 1)
def mapper(self, _, line):
    f = None
    ## If we're on EC2 or running on a Hadoop cluster, pull files via S3
    if line.startswith("s3://"):
        print('Downloading ...', file=sys.stderr)
        key = None
        # Connect to Amazon S3 using anonymous credentials
        conn = boto.connect_s3(anon=True)
        if line.startswith("s3://"):
            pathStart = line.index('/', 5)
            bucketName = line[5:pathStart]
            keyPath = line[pathStart + 1:]
            print("Bucket: " + bucketName, file=sys.stderr)
            print("Key: " + keyPath, file=sys.stderr)
            bucket = conn.get_bucket(bucketName)
            key = Key(bucket, keyPath)
        else:
            print("Bucket: aws-publicdatasets", file=sys.stderr)
            print("Key: " + line, file=sys.stderr)
            bucket = conn.get_bucket("aws-publicdatasets")
            key = Key(bucket, line)
        # Start a connection to one of the WARC files
        f = warc.WARCFile(fileobj=GzipStreamFile(key))
    ## If we're local, use files on the local file system
    else:
        if line.startswith("file:///"):
            line = line[7:]
        print("Local: {}".format(line), file=sys.stderr)
        f = warc.WARCFile(fileobj=gzip.open(line))
    ###
    for i, record in enumerate(f):
        for key, value in self.process_record(record):
            yield key, value
        self.increment_counter('commoncrawl', 'processed_records', 1)
def open_dump(self):
    """ Returns a file-like object for the dump """
    if config["TESTDATA"] == "1":
        return open(self.dump_testdata, "rb")
    else:
        f = urllib2.urlopen(self.dump_url)
        if self.dump_compression == "zip":
            file_name = self.dump_compression_params[0]
            # TODO: is there a more efficient way of doing this? the file object
            # passed to ZipFile needs to support .seek()
            zfile = zipfile.ZipFile(StringIO.StringIO(f.read()))
            return StringIO.StringIO(zfile.read(file_name))
        elif self.dump_compression == "gz":
            f.__dict__["closed"] = False  # Hack for GzipStreamFile
            return GzipStreamFile(f)
        else:
            return f
conn = boto.connect_s3(anon=True, debug=2)
bucket = conn.get_bucket('commoncrawl')
list1 = bucket.list(prefix="crawl-data/CC-MAIN")
lookup = raw_input("Enter Lookup")
for key in list1:
    if "wet" in key.name and lookup in key.name:
        print key
        if key.name in dicta:
            continue
        try:
            # Stream the gzipped WET file line by line and pull out matches
            for l in GzipStreamFile(key):
                result = prog.findall(l)
                for r in result:
                    domain = r[0].split("@")[1]
                    table.insert(
                        dict(domain=unidecode(domain),
                             email=unidecode(r[0]),
                             text=unidecode(l)))
        except:
            # Back off and reconnect if the S3 stream fails
            time.sleep(60)
            conn = boto.connect_s3(anon=True, debug=2)
            bucket = conn.get_bucket('commoncrawl')
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description=__name__)
    parser.add_argument('--source-url', '-u',
                        help='Remote URL to read input WARC file from.')
    parser.add_argument('--source-file', '-f',
                        help='Local path to read input WARC file from.')
    parser.add_argument('--output-dir', '-o',
                        help='Directory to write processed web pages to.')
    parser.add_argument(
        '--max-pages', '-m', type=int,
        help='Maximum number of web pages to process from WARC file.')
    args = parser.parse_args()

    # Validate arguments
    if not (args.source_file or args.source_url):
        parser.error("--source-file or --source-url argument must be provided.")

    # Make sure output directories exist
    original_pages_dir = os.path.join(args.output_dir, 'original')
    readable_pages_dir = os.path.join(args.output_dir, 'readable')
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if not os.path.exists(original_pages_dir):
        os.makedirs(original_pages_dir)
    if not os.path.exists(readable_pages_dir):
        os.makedirs(readable_pages_dir)

    if args.source_file is not None:
        cf = open(args.source_file)
    elif args.source_url is not None:
        # Open a connection pool
        http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                                   ca_certs=certifi.where())
        # Open a streaming connection to the specified URL
        cf = http.request('GET', args.source_url, preload_content=False)

    # Wrap the filestream in a streamable unzipper
    f = warc.WARCFile(fileobj=GzipStreamFile(cf))

    warc_responses = 0
    readable_pages = 0
    report_interval = 100
    start_time = arrow.utcnow()

    for record in f:
        if record['WARC-Type'] == 'response':
            if args.max_pages and warc_responses >= args.max_pages:
                print("Reached maximum WARC responses ({})".format(args.max_pages))
                break
            warc_responses = warc_responses + 1
            try:
                id = record.header["WARC-Record-ID"][10:-1]
                fp = record.payload
                # Open file using WARC Record ID as filename
                original_page_path = os.path.join(original_pages_dir,
                                                  "{}.txt".format(id))
                readable_page_path = os.path.join(readable_pages_dir,
                                                  "{}.txt".format(id))
                with open(original_page_path, 'w') as fout:
                    while True:
                        # Discard header rows
                        line = fp.readline()
                        # Header rows are separated from page contents by a blank line
                        if line == "\r\n":
                            break
                    # Write page contents to file
                    fout.write(fp.read())
                # Process page with readability script
                subprocess.check_call([
                    'node', 'page_to_readable_page.js', original_page_path,
                    readable_page_path
                ])
                readable_pages = readable_pages + 1
                # TODO: Persist file to blob storage and remove readable file
            except:
                pass
            # Clean up files created during processing
            try:
                os.remove(original_page_path)
            except:
                pass
            if warc_responses % report_interval == 0:
                print("Processed {} WARC pages ({} readable pages)".format(
                    warc_responses, readable_pages))

    end_time = arrow.utcnow()
    elapsed_time = end_time - start_time
    print("Processed {} WARC pages ({} readable pages) in {}".format(
        warc_responses, readable_pages, elapsed_time))
filepath = 'crawl-data/CC-MAIN-2016-50/segments/1480698540409.8/warc/CC-MAIN-20161202170900-00000-ip-10-31-129-80.ec2.internal.warc.gz'

# establish anonymous connection to commoncrawl warc file bucket
conn = boto.s3.connect_to_region(
    "us-east-1",
    anon=True,
    calling_format=boto.s3.connection.OrdinaryCallingFormat(),
    is_secure=False)
bucket = conn.get_bucket('commoncrawl')

filereader = boto.s3.key.Key(bucket)
filereader.key = filepath

# Stream and decompress the WARC file directly from S3
warc_file = warc.WARCFile(fileobj=GzipStreamFile(filereader))
for record in warc_file:
    print(record.payload.read())
try:
    s3client.head_object(Bucket='commoncrawl', Key=line)
except botocore.exceptions.ClientError as exception:
    LOG.error('Input not found: %s', line)

# Start a connection to one of the WARC/WAT/WET files
LOG.info('Loading s3://commoncrawl/%s', line)
try:
    # Use a raw string so the Windows path's backslashes are not treated as escapes
    temp = TemporaryFile(
        mode='w+b',
        dir=r'C:\Users\Aditya\Documents\demonstrational\Radii Corporation\common-crawl-extractor')
    s3client.download_fileobj('commoncrawl', line, temp)
except botocore.exceptions.ClientError as exception:
    LOG.error('Failed to download %s: %s', line, exception)
temp.seek(0)

# The warc library accepts file-like objects, so let's use GzipStreamFile
ccfile = warc.WARCFile(fileobj=GzipStreamFile(temp))
for num, record in enumerate(ccfile):
    if record['WARC-Type'] == 'response':
        # Imagine we're interested in the URL, the length of content, and any
        # Content-Type strings in there
        print(record['WARC-Target-URI'], record['Content-Length'])
        print('\n'.join(x for x in record.payload.read().replace(
            '\r', '').split('\n\n')[0].split('\n')
            if 'content-type:' in x.lower()))
        print('=-=-' * 10)
    if num > 100:
        break
def getHeaders(id_, iterator):
    conn = S3Connection(host="s3.amazonaws.com")
    bucket = conn.get_bucket("commoncrawl")
    for uri in iterator:
        key_ = Key(bucket, uri)
        file_ = warc.WARCFile(fileobj=GzipStreamFile(key_))
        for line in file_:
            try:
                data = json.loads(line.payload.read())
                #------------------------ BUILD DICTIONARY ------------------------------+
                # Purpose: FOR EVERY RESPONSE RECORD IN THE CURRENT WAT FILE,
                #          CODE BLOCK CREATES A DICTIONARY OBJECT retArray CONTAINING
                #          TWO ELEMENTS:
                #            - MD5 HASH OUTPUT OF HOSTNAME
                #            - AN INTEGER, WHICH WHEN DISPLAYED IN BINARY HAS ONE BIT
                #              REPRESENTATIVE OF THE PRESENCE OF EACH PARTICULAR HEADER
                #          IF ANY EXCEPTIONS ARE THROWN, DISREGARD AND CONTINUE.
                #
                # Parameters:
                #   - HTTP RESPONSE SECURITY HEADERS FROM CURRENT WAT RECORD
                #   - FLAG BIT VARIABLES REPRESENTATIVE OF EACH HEADER
                #
                # Result: DICTIONARY OBJECT REPRESENTING ONE WAT RECORD
                #------------------------------------------------------------------------+
                retArray = [None, 0b000000000000000000000]
                if data["Envelope"]["WARC-Header-Metadata"]["WARC-Type"] == "response":
                    retArray[0] = hashlib.md5(
                        urlparse(data["Envelope"]["WARC-Header-Metadata"].get(
                            "WARC-Target-URI", "")).hostname).digest()
                    headers = data["Envelope"]["Payload-Metadata"][
                        "HTTP-Response-Metadata"]["Headers"]
                    # OR in the bit flag for every security header that is
                    # present with a non-empty value
                    header_flags = [
                        ("X-XSS-Protection", X_XSS_Protection_FLAG),
                        ("Content-Security-Policy", Content_Security_Policy_FLAG),
                        ("X-Content-Security-Policy", X_Content_Security_Policy_FLAG),
                        ("X-Frame-Options", X_Frame_Options_FLAG),
                        ("Strict-Transport-Security", Strict_Transport_Security_FLAG),
                        ("X-Content-Type-Options", X_Content_Type_Options_FLAG),
                        ("X-Download-Options", X_Download_Options_FLAG),
                        ("X-Permitted-Cross-Domain-Policies", X_Permitted_Cross_Domain_Policies_FLAG),
                        ("Expect-CT", Expect_CT_FLAG),
                        ("Feature-Policy", Feature_Policy_FLAG),
                        ("Referrer-Policy", Referrer_Policy_FLAG),
                        ("X-Public-Key-Pins", X_Public_Key_Pins_FLAG),
                        ("X-Public-Key-Pins-Report-Only", X_Public_Key_Pins_Report_Only_FLAG),
                        ("Public-Key-Pins", Public_Key_Pins_FLAG),
                        ("Public-Key-Pins-Report-Only", Public_Key_Pins_Report_Only_FLAG),
                        ("Access-Control-Allow-Origin", Access_Control_Allow_Origin_FLAG),
                        ("Access-Control-Allow-Credentials", Access_Control_Allow_Credentials_FLAG),
                        ("Access-Control-Allow-Methods", Access_Control_Allow_Methods_FLAG),
                        ("Access-Control-Allow-Headers", Access_Control_Allow_Headers_FLAG),
                        ("Access-Control-Expose-Headers", Access_Control_Expose_Headers_FLAG),
                        ("Access-Control-Max-Age", Access_Control_Max_Age_FLAG),
                    ]
                    for header_name, flag in header_flags:
                        if headers.get(header_name, "") != "":
                            retArray[1] = retArray[1] | flag
                    yield retArray
            except ValueError:
                continue
            except KeyError:
                continue
            except UnboundLocalError:
                continue
import boto
from boto.s3.key import Key
from gzipstream import GzipStreamFile
import warc

if __name__ == '__main__':
    # Let's use a random gzipped web archive (WARC) file from the 2014-15 Common Crawl dataset
    ## Connect to Amazon S3 using anonymous credentials
    conn = boto.connect_s3(anon=True)
    pds = conn.get_bucket('aws-publicdatasets')
    ## Start a connection to one of the WARC files
    k = Key(pds)
    k.key = 'common-crawl/crawl-data/CC-MAIN-2014-15/segments/1397609521512.15/warc/CC-MAIN-20140416005201-00000-ip-10-147-4-33.ec2.internal.warc.gz'
    # The warc library accepts file like objects, so let's use GzipStreamFile
    f = warc.WARCFile(fileobj=GzipStreamFile(k))
    for num, record in enumerate(f):
        if record['WARC-Type'] == 'response':
            # Imagine we're interested in the URL, the length of content, and any Content-Type strings in there
            print record['WARC-Target-URI'], record['Content-Length']
            print '\n'.join(x for x in record.payload.read().replace(
                '\r', '').split('\n\n')[0].split('\n')
                if 'content-type:' in x.lower())
            print '=-=-' * 10
        if num > 100:
            break
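# The same record loop also works against a WARC file that has already been
# downloaded, as several of the mappers above do; a minimal sketch (the local
# filename below is an assumption):
import gzip
import warc

f = warc.WARCFile(fileobj=gzip.open('CC-MAIN-20140416005201-00000.warc.gz'))
for record in f:
    if record['WARC-Type'] == 'response':
        print('{} {}'.format(record['WARC-Target-URI'], record['Content-Length']))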