def run(self):
    """Drain ``self.out_queue`` forever, appending one WARC record per item.

    Each queue item is read positionally: ``data[0]`` is the HTTP header
    list, ``data[1]`` the payload stream, ``data[2]`` the target URI and
    ``data[3]``/``data[4]`` the two halves of the HTTP status line.

    The first ``BLOCK_SIZE`` bytes of the record stream are hashed with
    SHA-1; if ``self.dedup`` already knows that digest a ``revisit`` record
    is written instead of the full ``response`` record.

    The lock is released and ``task_done()`` is called even when building or
    writing a record raises, so a failing item can no longer leave the lock
    held or make ``out_queue.join()`` hang.
    """
    with open(self.warcfile, 'ab') as output:
        # The writer only wraps the output handle, so one instance is enough.
        writer = WARCWriter(output, gzip=False)
        while True:
            # NOTE(review): the lock is taken *before* the blocking get(), as
            # in the original code; confirm no producer needs this lock.
            with self.lock:
                data = self.out_queue.get()
                try:
                    http_headers = StatusAndHeaders(
                        '{} {}'.format(data[3], data[4]),
                        data[0],
                        protocol='HTTP/1.0')
                    record = writer.create_warc_record(
                        data[2], 'response',
                        payload=data[1],
                        http_headers=http_headers)

                    # Only the leading block of the stream is fingerprinted.
                    h = hashlib.sha1()
                    h.update(record.raw_stream.read(BLOCK_SIZE))
                    digest = h.hexdigest()

                    if self.dedup.lookup(digest):
                        record = writer.create_warc_record(
                            data[2], 'revisit', http_headers=http_headers)
                    else:
                        self.dedup.save(digest, data[2])
                        # Rewind so the full payload is written, not just
                        # what is left after hashing.
                        record.raw_stream.seek(0)
                    writer.write_record(record)
                finally:
                    self.out_queue.task_done()
def process(self, item):
    """Move an item's WARC into the data directory and write its tail WARC.

    The WARC and its ``_data.txt`` companion are renamed from ``item_dir``
    to ``data_dir``. The WARC is then scanned: the record id and the
    ``wget-arguments`` line are taken from ``warcinfo`` records, and the
    presence of any ``resource`` record means the metadata is already there.

    If no ``resource`` record exists, a ``-tail.warc.gz`` file is written
    with three resource records (manifest, wget arguments, wget log);
    otherwise an empty tail file is created. Finally ``item_dir`` is removed.

    Raises:
        ValueError: metadata has to be generated but the WARC has no
            ``warcinfo`` record or no ``wget-arguments`` line. This used to
            surface as an UnboundLocalError after the tail file had already
            been created.
    """
    warc_path = '%(data_dir)s/%(warc_file_base)s.warc.gz' % item
    tail_path = '%(data_dir)s/%(warc_file_base)s-tail.warc.gz' % item

    os.rename('%(item_dir)s/%(warc_file_base)s.warc.gz' % item, warc_path)
    os.rename('%(item_dir)s/%(warc_file_base)s_data.txt' % item,
              '%(data_dir)s/%(warc_file_base)s_data.txt' % item)

    info_id = None
    wget_arguments = None
    has_metadata = False
    with open(warc_path, 'rb') as f:
        for record in ArchiveIterator(f):
            if record.rec_type == 'warcinfo':
                info_id = record.rec_headers.get_header('WARC-Record-ID')
                for l in record.content_stream().read().split(b'\r\n'):
                    if l.startswith(b'wget-arguments'):
                        wget_arguments = l.split(b':', 1)[1].strip()
            if record.rec_type == 'resource':
                has_metadata = True

    if not has_metadata:
        # Fail before touching the tail file so no half-written output stays.
        if info_id is None or wget_arguments is None:
            raise ValueError(
                'cannot build metadata records for %s: no warcinfo record '
                'or no wget-arguments line found' % warc_path)

        with open(tail_path, 'wb') as f:
            writer = WARCWriter(f, gzip=True)

            record = writer.create_warc_record(
                'metadata://gnu.org/software/wget/warc/MANIFEST.txt',
                'resource',
                payload=io.BytesIO(bytes(info_id, 'utf8') + b'\n'),
                warc_headers_dict={
                    'WARC-Warcinfo-ID': info_id,
                    'Content-Type': 'text/plain'
                })
            # The two following records point back at the manifest record.
            manifest_id = record.rec_headers.get_header('WARC-Record-ID')
            writer.write_record(record)

            record = writer.create_warc_record(
                'metadata://gnu.org/software/wget/warc/wget_arguments.txt',
                'resource',
                payload=io.BytesIO(wget_arguments + b'\n'),
                warc_headers_dict={
                    'WARC-Warcinfo-ID': info_id,
                    'WARC-Concurrent-To': manifest_id,
                    'Content-Type': 'text/plain'
                })
            writer.write_record(record)

            with open('%(item_dir)s/wget.log' % item, 'rb') as f_log:
                record = writer.create_warc_record(
                    'metadata://gnu.org/software/wget/warc/wget.log',
                    'resource',
                    payload=f_log,
                    warc_headers_dict={
                        'WARC-Warcinfo-ID': info_id,
                        'WARC-Concurrent-To': manifest_id,
                        'Content-Type': 'text/plain'
                    })
                writer.write_record(record)
    else:
        # Metadata already present: leave an empty tail file.
        open(tail_path, 'w').close()

    shutil.rmtree('%(item_dir)s' % item)
def fetch_urls_to_warc(urls, warcfile_path):
    """Fetch urls and write to warc file

    Every URL is requested with ``Accept-Encoding: identity`` and streamed
    into one gzipped ``response`` record. The status line stored in the
    record is the one the server actually returned (it used to be a
    hard-coded ``200 OK``), and each response is closed once its record has
    been written.

    :urls: list of urls to binary files
    :warcfile_path: path to a WARC file.
    """
    with open(warcfile_path, 'wb') as output:
        writer = WARCWriter(output, gzip=True)

        for url in urls:
            print(url)
            resp = requests.get(url,
                                headers={'Accept-Encoding': 'identity'},
                                stream=True)
            try:
                status_line = '{} {}'.format(resp.status_code,
                                             resp.reason).strip()
                # Raw urllib3 headers, so the record reflects what was sent
                # on the wire.
                http_headers = StatusAndHeaders(status_line,
                                                resp.raw.headers.items(),
                                                protocol='HTTP/1.0')
                record = writer.create_warc_record(url, 'response',
                                                   payload=resp.raw,
                                                   http_headers=http_headers)
                writer.write_record(record)
            finally:
                # A streamed response keeps its connection until closed.
                resp.close()
def warc_from_response(response, resolved_url):
    """Return gzipped WARC bytes holding a response record and its request record.

    Both records use ``resolved_url`` as target URI. The request record is
    linked to the response record through ``WARC-Concurrent-To``.
    """
    buffer = BytesIO()
    warc_writer = WARCWriter(buffer, gzip=True)

    # --- response record ---
    resp_http = StatusAndHeaders(
        "200 OK",
        list(response.headers.to_unicode_dict().items()),
        protocol="HTTP/1.0",
    )
    resp_rec = warc_writer.create_warc_record(
        resolved_url,
        "response",
        payload=BytesIO(response.body),
        http_headers=resp_http,
    )
    warc_writer.write_record(resp_rec)

    # --- request record ---
    # NOTE(review): the request record carries a "200 OK" status line rather
    # than a request line; confirm whether that is intended.
    req_http = StatusAndHeaders(
        "200 OK",
        list(response.request.headers.to_unicode_dict().items()),
        protocol="HTTP/1.0",
    )
    req_rec = warc_writer.create_warc_record(
        resolved_url,
        "request",
        payload=BytesIO(response.request.body),
        http_headers=req_http,
    )
    resp_id = resp_rec.rec_headers.get_header("WARC-Record-ID")
    req_rec.rec_headers.add_header("WARC-Concurrent-To", resp_id)
    warc_writer.write_record(req_rec)

    warc_bytes = buffer.getvalue()
    buffer.close()
    return warc_bytes
def write_memento(self, murl=None):
    """
    This is function to write memento in WARC format.

    The target directory is
    ``<memento_dir>/<handle>/<domain>/<archive>/<wrep><lang>`` and the file
    name is the memento timestamp plus the configured WARC extension.

    The memento is requested before the output file is opened, so a failed
    request no longer leaves an empty WARC behind. The stored status line is
    the one actually returned by the server, and the streamed response is
    always closed.

    Parameters:
        murl (str): URI-M

    Returns:
        (bool): True on Success and False on Failure
    """
    try:
        if self.lookup_memento(murl):
            return True

        response = Utils.get_murl_info(murl, self.__thandle)
        mdir = os.path.join(self.__memento_dir,
                            response["handle"].lower(),
                            response["domain"],
                            response["archive"],
                            response["wrep"] + response["lang"])
        # Creates every missing level in one go and tolerates a concurrent
        # writer creating the same directory.
        os.makedirs(mdir, exist_ok=True)

        try:
            mpath = os.path.join(
                mdir, str(response["timestamp"]) + self.__constants.WARC_EXT)
            resp = requests.get(murl,
                                headers={'Accept-Encoding': 'identity'},
                                stream=True, timeout=120)
            try:
                with open(mpath, "wb") as output:
                    writer = WARCWriter(output, gzip=True)
                    # get raw headers from urllib3
                    headers_list = resp.raw.headers.items()
                    status_line = '{} {}'.format(resp.status_code,
                                                 resp.reason).strip()
                    http_headers = StatusAndHeaders(status_line, headers_list,
                                                    protocol='HTTP/1.1')
                    # NOTE(review): the record's target URI is the local file
                    # path, not the URI-M; kept as in the original.
                    record = writer.create_warc_record(
                        mpath, 'response',
                        payload=resp.raw,
                        http_headers=http_headers)
                    writer.write_record(record)
            finally:
                # A streamed response keeps its connection until closed.
                resp.close()
            return True
        except requests.exceptions.TooManyRedirects:
            sys.stderr.write(murl + "Too Many redirects" + "\n")
        except requests.exceptions.ConnectTimeout:
            sys.stderr.write(murl + "Connection Timeout" + "\n")
        except Exception as e:
            sys.stderr.write("Memento Write Error: " + str(e) + "URL:" + murl + "\n")
    except Exception as e:
        sys.stderr.write("Memento Write Error: " + murl + " " + str(e) + "\n")
    return False
def run(self):
    """Drain ``self.out_queue`` forever, appending one WARC record per item.

    Each queue item is read positionally: ``data[0]`` is the HTTP header
    list, ``data[1]`` the payload stream, ``data[2]`` the target URI and
    ``data[3]``/``data[4]`` the two halves of the HTTP status line.

    The first ``BLOCK_SIZE`` bytes of the record stream are hashed with
    SHA-1; if ``self.dedup`` already knows that digest a ``revisit`` record
    is written instead of the full ``response`` record.

    The lock is released and ``task_done()`` is called even when building or
    writing a record raises, so a failing item can no longer leave the lock
    held or make ``out_queue.join()`` hang.
    """
    with open(self.warcfile, 'ab') as output:
        # The writer only wraps the output handle, so one instance is enough.
        writer = WARCWriter(output, gzip=False)
        while True:
            # NOTE(review): the lock is taken *before* the blocking get(), as
            # in the original code; confirm no producer needs this lock.
            with self.lock:
                data = self.out_queue.get()
                try:
                    http_headers = StatusAndHeaders(
                        '{} {}'.format(data[3], data[4]),
                        data[0],
                        protocol='HTTP/1.0')
                    record = writer.create_warc_record(
                        data[2], 'response',
                        payload=data[1],
                        http_headers=http_headers)

                    # Only the leading block of the stream is fingerprinted.
                    h = hashlib.sha1()
                    h.update(record.raw_stream.read(BLOCK_SIZE))
                    digest = h.hexdigest()

                    if self.dedup.lookup(digest):
                        record = writer.create_warc_record(
                            data[2], 'revisit', http_headers=http_headers)
                    else:
                        self.dedup.save(digest, data[2])
                        # Rewind so the full payload is written, not just
                        # what is left after hashing.
                        record.raw_stream.seek(0)
                    writer.write_record(record)
                finally:
                    self.out_queue.task_done()
def convert_to_warc(website, filename):
    """Fetch ``website`` and store the response in ``<filename>.warc.gz``.

    The page is requested with ``Accept-Encoding: identity`` and streamed
    into a single gzipped ``response`` record. The status line stored in the
    record is the one the server actually returned (it used to be a
    hard-coded ``200 OK``), and the response is closed after writing.
    """
    with open(filename + '.warc.gz', 'wb') as output:
        writer = WARCWriter(output, gzip=True)

        resp = requests.get(website,
                            headers={'Accept-Encoding': 'identity'},
                            stream=True)
        try:
            # get raw headers from urllib3
            headers_list = resp.raw.headers.items()
            status_line = '{} {}'.format(resp.status_code, resp.reason).strip()
            http_headers = StatusAndHeaders(status_line, headers_list,
                                            protocol='HTTP/1.0')
            record = writer.create_warc_record(website, 'response',
                                               payload=resp.raw,
                                               http_headers=http_headers)
            writer.write_record(record)
        finally:
            # A streamed response keeps its connection until closed.
            resp.close()
def errataFix (args):
    """Copy the input WARC to the output, passing every record through the errata fix.

    A ``warcinfo`` record naming the tool and the applied erratum is written
    first. Afterwards ``errata.stats`` is dumped as one JSON line on stdout.
    """
    errata = args.errata

    with args.input as infd, args.output as outfd:
        writer = WARCWriter (outfd, gzip=True)

        # Leading warcinfo record describing this transformation.
        description = dict (
                software=getSoftwareInfo (),
                tool='crocoite-errata', # not the name of the cli tool
                parameters={'errata': [errata.uuid]},
                )
        encoded = json.dumps (description, indent=2).encode ('utf-8')
        contentType = makeContentType (jsonMime, 'utf-8')
        infoRecord = writer.create_warc_record ('', 'warcinfo',
                payload=BytesIO (encoded),
                warc_headers_dict={'Content-Type': contentType})
        writer.write_record (infoRecord)

        for original in ArchiveIterator (infd):
            writer.write_record (errata.applyFix (original))

    out = sys.stdout
    json.dump (errata.stats, out)
    out.write ('\n')
    out.flush ()
def _fetch_warc(self, action_result, url, out_path):
    """Fetch ``url`` and store the response as a gzipped WARC at ``out_path``.

    The page is requested with ``Accept-Encoding: identity`` and streamed
    into a single ``response`` record. The status line stored in the record
    is the one the server actually returned (it used to be a hard-coded
    ``200 OK``), and the response is closed after writing.

    ``action_result`` is accepted but not used by this method.

    Returns:
        ``out_path``, unchanged.
    """
    with open(out_path, "wb") as output:
        writer = WARCWriter(output, gzip=True)

        resp = requests.get(url,
                            headers={"Accept-Encoding": "identity"},
                            stream=True)
        try:
            # get raw headers from urllib3
            headers_list = resp.raw.headers.items()
            status_line = "{} {}".format(resp.status_code, resp.reason).strip()
            http_headers = StatusAndHeaders(status_line, headers_list,
                                            protocol="HTTP/1.0")
            record = writer.create_warc_record(url, "response",
                                               payload=resp.raw,
                                               http_headers=http_headers)
            writer.write_record(record)
        finally:
            # A streamed response keeps its connection until closed.
            resp.close()

    return out_path
class WarcDownloader:
    """
        Download URL with HTTP GET, save to a WARC file and return the decoded text
    """
    def __init__(self, expected_filename, _logger, warcinfo_record_data=None, program_name='WebArticleCurator',
                 user_agent=None, overwrite_warc=True, err_threshold=10, known_bad_urls=None,
                 max_no_of_calls_in_period=2, limit_period=1, proxy_url=None, allow_cookies=False,
                 verify_request=True, stay_offline=False):
        """Open the target WARC file, set up the HTTP session and write the warcinfo record.

        :param expected_filename: wanted WARC file name; a numbered variant is used when
            ``overwrite_warc`` is false and the file already exists
        :param _logger: object with a ``log(level, *args, **kwargs)`` method
        :param warcinfo_record_data: dict for the warcinfo record; built from ``program_name``
            and ``sys.argv`` when omitted
        :param user_agent: value of the ``User-agent`` request header
        :param err_threshold: number of failed requests after which downloading aborts
        :param known_bad_urls: optional path of a UTF-8 file with one URL per line to skip
        :param max_no_of_calls_in_period: rate limit, calls per ``limit_period``
        :param proxy_url: optional proxy used for both http and https
        :param allow_cookies: keep session cookies between requests when true
        :param verify_request: passed to the session as ``verify``
        :param stay_offline: when true ``download_url`` raises NotImplementedError
        """
        # Store variables
        self._logger = _logger
        self._req_headers = {'Accept-Encoding': 'identity', 'User-agent': user_agent}

        self._error_count = 0
        self._error_threshold = err_threshold  # Set the error threshold which cause aborting to prevent denial

        # Setup download function
        if not stay_offline:
            self.download_url = self._download_url
        else:
            self.download_url = self._dummy_download_url

        if known_bad_urls is not None:  # Setup the list of cached bad URLs to prevent trying to download them again
            with open(known_bad_urls, encoding='UTF-8') as fh:
                self.bad_urls = {line.strip() for line in fh}
        else:
            self.bad_urls = set()

        self.good_urls = set()

        # Setup target file handle
        filename = self._set_target_filename(expected_filename, overwrite_warc)
        self._logger.log('INFO', 'Creating archivefile:', filename)
        self._output_file = open(filename, 'wb')

        self._session = Session()  # Setup session for speeding up downloads
        if proxy_url is not None:  # Set socks proxy if provided
            self._session.proxies['http'] = proxy_url
            self._session.proxies['https'] = proxy_url

        self._allow_cookies = allow_cookies
        self._verify_request = verify_request
        if not self._verify_request:
            disable_warnings(InsecureRequestWarning)

        # Setup rate limiting to prevent hammering the server
        self._requests_get = sleep_and_retry(limits(calls=max_no_of_calls_in_period,
                                                    period=limit_period)(self._http_get_w_cookie_handling))

        self._writer = WARCWriter(self._output_file, gzip=True, warc_version='WARC/1.1')
        if warcinfo_record_data is None:  # Or use the parsed else custom headers will not be copied
            # INFO RECORD
            # Some custom information about the warc writer program and its settings
            warcinfo_record_data = {'software': program_name, 'arguments': ' '.join(sys.argv[1:]),
                                    'format': 'WARC File Format 1.1',
                                    'conformsTo':
                                        'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1-1_latestdraft.pdf'}
        info_record = self._writer.create_warcinfo_record(filename, warcinfo_record_data)
        self._writer.write_record(info_record)

    @staticmethod
    def _set_target_filename(filename, overwrite_warc):
        """Return the file name to write to.

        With ``overwrite_warc`` true, or when ``filename`` does not exist yet, the name is
        returned unchanged. Otherwise the first non-existing ``<base>-NNNNN<ext>`` is
        returned, where ``<ext>`` is ``.warc.gz`` for such names and the plain extension
        otherwise. The counter is always appended to the original base name, so the
        suffixes no longer pile up (``a-00000-00001.warc.gz``).
        """
        if overwrite_warc or not os.path.exists(filename):
            return filename

        base, ext = os.path.splitext(filename)  # Should be filename.warc.gz
        if ext == '.gz' and base.endswith('.warc'):
            base, ext2 = os.path.splitext(base)  # Should be filename.warc
            ext = ext2 + ext  # Should be .warc.gz

        # Find out next nonexisting warc filename
        num = 0
        candidate = '{0}-{1:05d}{2}'.format(base, num, ext)
        while os.path.exists(candidate):
            num += 1
            candidate = '{0}-{1:05d}{2}'.format(base, num, ext)
        return candidate

    def __del__(self):
        if hasattr(self, '_output_file'):  # If the program opened a file, then it should gracefully close it on exit!
            self._output_file.close()

    def _http_get_w_cookie_handling(self, *args, **kwargs):
        """
            Extend requests.get with optional cookie purging
        """
        if not self._allow_cookies:
            self._session.cookies.clear()
        return self._session.get(*args, **kwargs)

    def _handle_request_exception(self, url, msg):
        """Log a failed download and raise NameError once the error threshold is reached."""
        self._logger.log('WARNING', url, msg, sep='\t')

        self._error_count += 1
        if self._error_count >= self._error_threshold:
            raise NameError('Too many error happened! Threshold exceeded! See log for details!')

    @staticmethod
    def _get_peer_name(resp):
        """Return the remote IP address of the response's socket, or the string 'None'."""
        # Must get peer_name before the content is read
        # It has no official API for that:
        # https://github.com/kennethreitz/requests/issues/2158
        # https://github.com/urllib3/urllib3/issues/1071
        # So workaround to be compatible with windows:
        # https://stackoverflow.com/questions/22492484/how-do-i-get-the-ip-address-from-a-http-request-using-the-\
        # requests-library/22513161#22513161
        try:
            peer_name = resp.raw._connection.sock.getpeername()[0]  # Must get peer_name before the content is read
        except AttributeError:  # On Windows there is no getpeername() Attribute of the class...
            try:
                peer_name = resp.raw._connection.sock.socket.getpeername()[0]
            except AttributeError:
                peer_name = 'None'  # Socket closed and could not determine peername...
        return peer_name

    def _dummy_download_url(self, _):
        """Stand-in for ``download_url`` when the instance was created with ``stay_offline``."""
        raise NotImplementedError

    def _download_url(self, url):
        """Download ``url``, write a request/response record pair and return the decoded text.

        Returns None (after logging) for known bad URLs, URLs already downloaded in this
        session, request errors, non-200 answers and empty bodies.
        """
        if url in self.bad_urls:
            self._logger.log('DEBUG', 'Not downloading known bad URL:', url)
            return None

        if url in self.good_urls:  # This should not happen!
            self._logger.log('ERROR', 'Not downloading URL, because it is already downloaded in this session:', url)
            return None

        scheme, netloc, path, params, query, fragment = urlparse(url)
        # For safety urlencode the generated URL... (The URL might by modified in this step.)
        path = quote(path, safe='/%')
        url_reparsed = urlunparse((scheme, netloc, path, params, query, fragment))

        try:  # The actual request (on the reparsed URL, everything else is made on the original URL)
            resp = self._requests_get(url_reparsed, headers=self._req_headers, stream=True,
                                      verify=self._verify_request)
        # UnicodeError is originated from idna codec error, LocationParseError is originated from URLlib3 error
        except (UnicodeError, RequestException, LocationParseError) as err:
            self._handle_request_exception(url, 'RequestException happened during downloading: {0} \n\n'
                                                ' The program ignores it and jumps to the next one.'.format(err))
            return None

        if resp.status_code != 200:  # Not HTTP 200 OK
            status_code, reason = resp.status_code, resp.reason
            # The body is never read on this path: close the streamed response before the
            # handler runs (it may raise) so the connection is released.
            resp.close()
            self._handle_request_exception(url, 'Downloading failed with status code: {0} {1}'.format(status_code,
                                                                                                      reason))
            return None

        # REQUEST (build headers for warc)
        reqv_headers = resp.request.headers
        reqv_headers['Host'] = netloc

        proto = 'HTTP/{0}'.format(respv_str[resp.raw.version])  # Friendly protocol name
        reqv_http_headers = StatusAndHeaders('GET {0} {1}'.format(urlunparse(('', '', path, params, query, fragment)),
                                                                  proto), reqv_headers.items(), is_http_request=True)
        reqv_record = self._writer.create_warc_record(url, 'request', http_headers=reqv_http_headers)

        # RESPONSE
        # resp_status need to be stripped else warcio strips the spaces and digest verification will fail!
        resp_status = '{0} {1}'.format(resp.status_code, resp.reason).strip()
        resp_headers_list = resp.raw.headers.items()  # get raw headers from urllib3
        # Must get peer_name before the content is read
        peer_name = self._get_peer_name(resp)

        try:
            data = resp.raw.read()  # To be able to return decoded and also write warc
        except ProtocolError as err:
            self._handle_request_exception(url, 'RequestException happened during downloading: {0} \n\n'
                                                ' The program ignores it and jumps to the next one.'.format(err))
            return None

        if not data:
            err = 'Response data has zero length!'
            self._handle_request_exception(url, 'RequestException happened during downloading: {0} \n\n'
                                                ' The program ignores it and jumps to the next one.'.format(err))
            return None

        # warcio hack as \r\n is the record separator and trailing ones will be split and digest will eventually fail!
        if data.endswith(b'\r\n'):  # TODO: Warcio bugreport!
            data = data.rstrip()

        enc = resp.encoding  # Get or detect encoding to decode the bytes of the text to str
        if enc is None:
            enc = detect(data)['encoding']
        if enc is None:
            # Detection can come back empty; decode(None) would raise TypeError, so fall back to
            # UTF-8 and let the 'ignore' retry below deal with undecodable bytes.
            enc = 'UTF-8'
        try:
            text = data.decode(enc)  # Normal decode process
        except UnicodeDecodeError:
            self._logger.log('WARNING', 'DECODE ERROR RETRYING IN \'IGNORE\' MODE:', url, enc, sep='\t')
            text = data.decode(enc, 'ignore')
        data_stream = BytesIO(data)  # Need the original byte stream to write the payload to the warc file

        resp_http_headers = StatusAndHeaders(resp_status, resp_headers_list, protocol=proto)
        # Add extra headers like encoding because it is not stored any other way...
        resp_record = self._writer.create_warc_record(url, 'response', payload=data_stream,
                                                      http_headers=resp_http_headers,
                                                      warc_headers_dict={'WARC-IP-Address': peer_name,
                                                                         'WARC-X-Detected-Encoding': enc})

        # Everything is OK, write the two WARC records
        self.write_record(reqv_record, url)
        self.write_record(resp_record, url)

        return text

    def write_record(self, record, url):
        """Write ``record`` to the WARC file and remember ``url`` as downloaded."""
        self.good_urls.add(url)
        self._writer.write_record(record)
def mergeWarc (files, output):
    """Merge several WARC files into ``output``, replacing duplicate payloads by revisit records.

    ``resource`` and ``response`` records are keyed by their
    ``WARC-Payload-Digest``: the first record with a given digest is copied,
    later ones are written as ``revisit`` records pointing at the first. All
    other records are copied unchanged. Records without a payload digest are
    never deduplicated (they used to be treated as duplicates of each other).

    When done, record/byte statistics are written to stdout as one JSON
    line. A ratio whose denominator is zero is reported as ``null`` instead
    of raising ZeroDivisionError.

    :param files: paths of the input WARC files
    :param output: binary file object the merged WARC is written to
    """
    # stats
    unique = 0
    revisit = 0
    uniqueLength = 0
    revisitLength = 0

    payloadMap = {}
    writer = WARCWriter (output, gzip=True)

    # Add an additional warcinfo record, describing the transformations. This
    # is not ideal, since
    #   “A ‘warcinfo’ record describes the records that
    #   follow it […] until next ‘warcinfo’”
    #   -- https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo
    # A warcinfo record is expected at the beginning of every file. But it
    # might have written by a different software, so we don’t want to
    # strip/replace that information, but supplement it.
    warcinfo = {
            'software': getSoftwareInfo (),
            'tool': 'crocoite-merge', # not the name of the cli tool
            'parameters': {'inputs': files},
            }
    payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
    record = writer.create_warc_record ('', 'warcinfo',
            payload=payload,
            warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')})
    writer.write_record (record)

    for l in files:
        with open (l, 'rb') as fd:
            for record in ArchiveIterator (fd):
                if record.rec_type in {'resource', 'response'}:
                    headers = record.rec_headers
                    rid = headers.get_header('WARC-Record-ID')
                    csum = headers.get_header('WARC-Payload-Digest')
                    length = int (headers.get_header ('Content-Length'))
                    # Without a digest there is nothing to compare against.
                    dup = payloadMap.get (csum) if csum is not None else None
                    if dup is None:
                        if csum is not None:
                            payloadMap[csum] = {'uri': headers.get_header('WARC-Target-URI'),
                                    'id': rid, 'date': headers.get_header('WARC-Date')}
                        unique += 1
                        uniqueLength += length
                    else:
                        logging.debug (f'Record {rid} is duplicate of {dup["id"]}')
                        # Payload may be identical, but HTTP headers are
                        # (probably) not. Include them.
                        record = writer.create_revisit_record (
                                headers.get_header('WARC-Target-URI'), digest=csum,
                                refers_to_uri=dup['uri'], refers_to_date=dup['date'],
                                http_headers=record.http_headers)
                        record.rec_headers.add_header ('WARC-Truncated', 'length')
                        record.rec_headers.add_header ('WARC-Refers-To', dup['id'])
                        revisit += 1
                        revisitLength += length
                else:
                    unique += 1
                writer.write_record (record)

    totalRecords = unique + revisit
    totalLength = uniqueLength + revisitLength
    json.dump (dict (
            unique=dict (records=unique, bytes=uniqueLength),
            revisit=dict (records=revisit, bytes=revisitLength),
            ratio=dict (
                    records=unique/totalRecords if totalRecords else None,
                    bytes=uniqueLength/totalLength if totalLength else None
                    ),
            ),
            sys.stdout,
            cls=StrJsonEncoder)
    sys.stdout.write ('\n')
class HarParser(object):
    """Convert a HAR log into WARC request/response record pairs."""

    logger = logging.getLogger(__name__)

    def __init__(self, reader, writer, gzip=True):
        """Load the HAR data and set up the WARC writer.

        :param reader: path of a UTF-8 HAR file, an object with ``read()``, or an
            already parsed HAR dict
        :param writer: a ``BaseWARCWriter`` instance, an output path, or an object
            with ``write()``
        :param gzip: passed to ``WARCWriter`` when this class creates the writer
        :raises Exception: when reader or writer is none of the accepted kinds
        """
        if isinstance(reader, str):
            with codecs.open(reader, encoding='utf-8') as fh:
                self.har = json.loads(fh.read())
        elif hasattr(reader, 'read'):
            self.har = json.loads(reader.read())
        elif isinstance(reader, dict):
            self.har = reader
        else:
            raise Exception('reader is in an unknown format')

        # Only set when this object opened the output file itself.
        self.fh = None
        if isinstance(writer, BaseWARCWriter):
            self.writer = writer
        elif isinstance(writer, str):
            self.fh = open(writer, 'wb')
            self.writer = WARCWriter(self.fh, gzip=gzip)
        elif hasattr(writer, 'write'):
            self.writer = WARCWriter(writer, gzip=gzip)
        else:
            raise Exception('writer is in an unknown format')

    def parse(self, out_filename=None, rec_title=None):
        """Write the warcinfo record and one request/response pair per HAR entry.

        An output file opened by ``__init__`` is closed afterwards, also when
        writing fails part-way.
        """
        out_filename = out_filename or 'har.warc.gz'
        rec_title = rec_title or 'HAR Recording'
        try:
            metadata = self.create_wr_metadata(self.har['log'], rec_title)
            self.write_warc_info(self.har['log'], out_filename, metadata)

            for entry in self.har['log']['entries']:
                self.parse_entry(entry)
        finally:
            if self.fh:
                self.fh.close()

    def parse_entry(self, entry):
        """Write the request/response record pair for one HAR entry."""
        url = entry['request']['url']

        response = self.parse_response(url,
                                       entry['response'],
                                       entry.get('serverIPAddress'))

        #TODO: support WARC/1.1 arbitrary precision dates!
        warc_date = entry['startedDateTime'][:19] + 'Z'

        response.rec_headers.replace_header('WARC-Date', warc_date)

        request = self.parse_request(entry['request'])

        self.writer.write_request_response_pair(request, response)

    def create_wr_metadata(self, log, rec_title):
        """Build the recording metadata dict; pages whose title is not a URL are skipped.

        A HAR log without a ``pages`` list is treated as having no pages.
        """
        pagelist = []

        for page in log.get('pages') or []:
            if not page['title'].startswith(('http:', 'https:')):
                continue

            pagelist.append(dict(title=page['title'],
                                 url=page['title'],
                                 timestamp=iso_date_to_timestamp(page['startedDateTime'])))

        metadata = {"title": rec_title,
                    "type": "recording",
                   }

        if pagelist:
            metadata["pages"] = pagelist

        return metadata

    def write_warc_info(self, log, filename, metadata):
        """Write the warcinfo record describing the HAR creator and this tool."""
        creator = '{0} {1}'.format(log['creator']['name'],
                                   log['creator']['version'])

        source = 'HAR Format {0}'.format(log['version'])

        software = 'har2warc ' + str(__version__)

        params = OrderedDict([('software', software),
                              ('creator', creator),
                              ('source', source),
                              ('format', 'WARC File Format 1.0'),
                              ('json-metadata', json.dumps(metadata))])

        record = self.writer.create_warcinfo_record(filename, params)
        self.writer.write_record(record)

    def _get_http_version(self, entry):
        """Return the entry's ``httpVersion`` if it is HTTP/1.0 or HTTP/1.1, else 'HTTP/1.1'."""
        http_version = entry.get('httpVersion')
        if not http_version or http_version.upper() not in ('HTTP/1.1', 'HTTP/1.0'):
            http_version = 'HTTP/1.1'

        return http_version

    def parse_response(self, url, response, ip=None):
        """Build the WARC response record for a HAR response.

        The body is base64-decoded when the HAR marks it so, otherwise UTF-8
        encoded. ``Content-Encoding``/``Transfer-Encoding`` headers are dropped
        and ``Content-Length`` is rewritten to the stored body length. A
        response without a ``headers`` key is treated as having no headers.
        """
        headers = []
        payload = BytesIO()
        content = response['content'].get('text', '')
        har_headers = response.get('headers') or ()

        if not content and not har_headers:
            self.logger.info('No headers or payload for: {0}'.format(url))
            headers.append(('Content-Length', '0'))
        if response['content'].get('encoding') == 'base64':
            payload.write(base64.b64decode(content))
        else:
            payload.write(content.encode('utf-8'))

        length = payload.tell()
        payload.seek(0)

        SKIP_HEADERS = ('content-encoding', 'transfer-encoding')

        #TODO: http2 detection -- write as same warc header?
        for header in har_headers:
            if header['name'].lower() not in SKIP_HEADERS:
                headers.append((header['name'], header['value']))

        status = response.get('status') or 204

        reason = response.get('statusText')
        if not reason:
            reason = http_status_names.get(status, 'No Reason')

        status_line = str(status) + ' ' + reason

        proto = self._get_http_version(response)

        http_headers = StatusAndHeaders(status_line, headers, protocol=proto)

        if not content:
            content_length = http_headers.get_header('Content-Length', '0')
            if content_length != '0':
                self.logger.info('No Content for length {0} {1}'.format(content_length, url))
                http_headers.replace_header('Content-Length', '0')
        else:
            http_headers.replace_header('Content-Length', str(length))

        warc_headers_dict = {}
        if ip:
            warc_headers_dict['WARC-IP-Address'] = ip

        record = self.writer.create_warc_record(url, 'response',
                                                http_headers=http_headers,
                                                payload=payload,
                                                length=length,
                                                warc_headers_dict=warc_headers_dict)

        return record

    def parse_request(self, request):
        """Build the WARC request record for a HAR request.

        The request line uses the URL path plus the HAR ``queryString`` list.
        Repeated parameter names are kept (they were collapsed to the last
        value before).
        """
        parts = urlsplit(request['url'])

        path = parts.path
        query = request.get('queryString')
        if query:
            # A list of pairs keeps repeated names; a dict would keep one.
            path += '?' + urlencode([(p['name'], p['value'])
                                     for p in query])

        headers = []
        http2 = False

        for header in request['headers']:
            headers.append((header['name'], header['value']))

            #TODO: http2 detection -- write as same warc header?
            if (not http2 and
                header['name'] in (':method', ':scheme', ':path')):
                http2 = True

        if http2:
            headers.append(('Host', parts.netloc))

        http_version = self._get_http_version(request)

        status_line = request['method'] + ' ' + path + ' ' + http_version
        http_headers = StatusAndHeaders(status_line, headers)

        payload = None
        length = 0

        if request['bodySize'] > 0:
            payload = BytesIO()
            payload.write(request['postData']['text'].encode('utf-8'))
            length = payload.tell()
            payload.seek(0)

        record = self.writer.create_warc_record(request['url'], 'request',
                                                http_headers=http_headers,
                                                payload=payload,
                                                length=length)

        return record
def facebook_user_bio(self, username):
    """Scrapes Facebook bio and writes the information contained on the about page
    (e.g. https://www.facebook.com/pg/SPD/about/?ref=page_internal) to a WARC file.

    Steps: fetch the profile page (creation date, likes), fetch the about
    page (labelled fields, cover/profile photo links handed to
    ``_harvest_media_url``), log in through a remote Selenium Chrome session
    to read the page transparency box, then write everything collected as one
    JSON ``metadata`` record into a new ``.warc.gz`` under ``self.warc_temp_dir``.

    The Selenium session is closed even when one of the browser steps fails.

    @param username: Facebook username or full facebook.com profile URL
    @return: None. The collected attributes are written to the WARC file, not
        returned. The method returns early, writing nothing, when the profile
        or about page cannot be fetched.
    """
    user_email_fb = self.message['credentials']['user_email_fb']
    user_password_fb = self.message['credentials']['user_password_fb']

    # ensure username is clean and can be accessed
    if username.startswith(("https://www.facebook.com/", "http://www.facebook.com/")):
        username = re.sub(r'^.+facebook\.com\/', '', username)
        # possibly also remove trailing /
        username = re.sub(r'\/$', '', username)

    # created at field
    fb_general = base_fb_url + username
    # bio info
    fb_about = base_fb_url + username + "/about/?ref=page_internal"
    # site transparency (e.g. admins)
    m_fb_general = "http://m.facebook.com/" + username

    # request the html
    r = requests.get(fb_general)
    # ensure no 404's
    if not r:
        log.debug("Couldn't access profile site: %s", fb_general)
        return

    soup = BeautifulSoup(r.content, "html.parser")

    # scrape creation date
    created_at = soup.find('div', {"class": "_3qn7"})
    created_at = created_at.select_one("span").text
    created_at = re.sub(r"(Seite erstellt)", "", created_at)
    created_at = created_at[3:]

    # scrape n of likes
    # find span with like number
    spans = soup.find('span', {"class": "_52id _50f5 _50f7"})
    # isolate likes via regex
    likes = re.search(r'^[\d]+.[^\s]+', spans.text).group()

    bio_dict = {
        "username": fb_general,
        "n_likes": likes,
        "created_at": created_at
    }

    # request about html
    r_about = requests.get(fb_about)
    # ensure no 404's
    if not r_about:
        log.debug("Couldn't access username/about site: %s", fb_about)
        return

    about_soup = BeautifulSoup(r_about.content, "html.parser")
    mission_text = about_soup.find_all('div', {'class': "_4bl9"})

    for divs in mission_text:
        describing_div = divs.find('div', {'class': '_50f4'})
        content_div = divs.find('div', {'class': '_3-8w'})
        if describing_div and content_div:
            bio_dict[describing_div.text] = content_div.text

    # photos
    # Retrieves profile and cover photo of public facebook page
    # bio going to the 'about' page, parsing html and getting
    # the links to photos from script tag, these can then be passed
    # harvest_media
    # this is not affected by the harvest_media options but will always happen
    all_scripts = about_soup.find_all('script')
    for js in all_scripts:
        for content in js.contents:
            if 'cover_photo' in content:
                # isolate relevant links
                links = re.findall(r'https\:\\/\\/scontent[^"]*', content)
                # remove escaped front slashes
                for val, link in enumerate(links):
                    links[val] = re.sub(r'\\', "", link)
                    self._harvest_media_url(links[val])

    if m_fb_general:
        user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
        site_transparency_class_selector = "._a58._a5o._9_7._2rgt._1j-g._2rgt._86-3._2rgt._1j-g._2rgt"
        site_transparency_detail_id = "u_0_d"

        # NOTE(review): these options are built but never handed to the
        # driver below; confirm whether they should be.
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('headless')
        chrome_options.add_argument('start-maximised')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--window-size=1200x800')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument(f"user-agent={user_agent}")

        # this will connect to the selenium container starting scraping
        driver = webdriver.Remote("host.docker.internal:4444/wd/hub",
                                  {'browserName': 'chrome'})
        try:
            driver.get("http://m.facebook.com")
            driver.maximize_window()
            # accept cookies
            cookies = driver.find_element_by_id('accept-cookie-banner-label')
            # more or less random wait to replicate user behavior, ensure politeness
            time.sleep(random.uniform(3, 9))
            cookies.click()

            # Search & Enter the Email or Phone field & Enter Password
            username_fb = driver.find_element_by_id("m_login_email")
            password_fb = driver.find_element_by_id("m_login_password")
            submit = driver.find_element_by_css_selector("._56b_")

            # send keys and make sure not prepopulated
            # 2fa has to be deactivated
            username_fb.clear()
            password_fb.clear()
            username_fb.send_keys(user_email_fb)
            password_fb.send_keys(user_password_fb)
            time.sleep(random.uniform(3, 9))

            # Step 4) Click Login
            submit.click()
            time.sleep(random.uniform(3, 9))

            # navigate to site
            driver.get(m_fb_general)
            time.sleep(random.uniform(3, 9))
            driver.execute_script("window.scrollTo(0, 800)")
            # site info only loads on scroll
            # use class name and div content (todo)
            time.sleep(random.uniform(20, 25))
            WebDriverWait(driver, 20).until(
                ec.presence_of_element_located(
                    (By.CSS_SELECTOR, site_transparency_class_selector)))
            site_transparency = driver.find_elements_by_css_selector(
                site_transparency_class_selector)
            # site transparency should always be below about
            site_transparency[1].click()
            time.sleep(random.uniform(20, 15))

            # simply get the whole text of the transparency box of site
            # the exact info can be extracted ex-post
            WebDriverWait(driver, 20).until(
                ec.presence_of_element_located(
                    (By.ID, site_transparency_detail_id)))
            time.sleep(random.uniform(3, 9))
            site_transparency_text = driver.find_element_by_id(
                site_transparency_detail_id).text
            time.sleep(random.uniform(3, 9))
        finally:
            # Close the browser window even when a step above raised, so the
            # remote Selenium node is not left with an orphaned session.
            driver.close()

        log.info("Finished scraping transparency box")
        bio_dict['transparency_text'] = site_transparency_text

    # ensure that only warc will be written if sites were found
    # else nothing will happen
    if r_about or r:
        # filename will later be converted to path
        # replicating pattern from https://github.com/internetarchive/warcprox/blob/f19ead00587633fe7e6ba6e3292456669755daaf/warcprox/writer.py#L69
        # create random token for filename
        random_token = ''.join(
            random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
        serial_no = '00000'
        file_name = safe_string(
            self.message["id"]) + "-" + warcprox.timestamp17(
            ) + "-" + serial_no + "-" + random_token

        with open(os.path.join(self.warc_temp_dir, file_name + ".warc.gz"),
                  "wb") as result_warc_file:
            log.info("Writing json-timeline result to path %s",
                     self.warc_temp_dir)
            writer = WARCWriter(result_warc_file, gzip=True)

            def json_date_converter(o):
                """ Converts datetime.datetime items in facebook_scraper result
                to format suitable for json.dumps"""
                if isinstance(o, datetime.datetime):
                    return o.__str__()

            json_payload = json.dumps(bio_dict,
                                      default=json_date_converter,
                                      ensure_ascii=False).encode("utf-8")

            record = writer.create_warc_record(
                "https://m.facebook.com/" + username,
                'metadata',
                payload=BytesIO(json_payload),
                warc_content_type="application/json")
            writer.write_record(record)
            log.info("Writing scraped results to %s", self.warc_temp_dir)
def generate_current(urim):
    """Download a memento and write a fixity manifest for it.

    The memento is fetched once, stored as an uncompressed WARC
    (``raw.warc``) below ``/data/Fixity/verification/<md5(urim)>/<timestamp>``,
    then read back from that WARC to compute MD5 and SHA-256 digests over the
    entity body plus the selected header values. The manifest is written as
    ``<sha256-of-manifest>.json`` next to a ``.json.time`` file holding the
    timing information.

    Args:
        urim: URI-M of the memento to process.

    Returns:
        Path of the manifest JSON file, or ``None`` when the download failed
        or the WARC contains no response record for the requested URI.
    """
    tic_all = timeit.default_timer()
    time_json = {
        'date': strftime("%Y%m%d%H%M%S", gmtime()),
        'time_in_seconds_to_download_memento': 0,
        'time_in_seconds_to_generate_fixity': 0
    }

    urimid_, mdatetime, urir = convert_to_original_link(urim)
    manif = {
        "@context": "http://manifest.ws-dl.cs.odu.edu/terms.json",
        "uri-r": urir,
        "uri-m": urim,
        "memento-datetime": datetime.datetime.strptime(
            mdatetime, '%Y%m%d%H%M%S').strftime('%a, %d %b %Y %H:%M:%S GMT')
    }

    urimh = hashlib.md5(urim.encode()).hexdigest()
    downloadtime = strftime("%Y%m%d%H%M%S", gmtime())
    manif["created"] = datetime.datetime.strptime(
        downloadtime, '%Y%m%d%H%M%S').strftime('%a, %d %b %Y %H:%M:%S GMT')

    outMainDir = '/data/Fixity/verification/' + urimh + '/' + downloadtime
    warc_file = outMainDir + '/raw.warc'

    tic0 = timeit.default_timer()
    os.makedirs(outMainDir, exist_ok=True)

    headers = {
        'User-Agent': 'Web Science and Digital Libraries Group (@WebSciDL); Project/archives_fixity; Contact/Mohamed Aturban ([email protected])',
        'Accept-Encoding': None
    }
    # A failed download used to be swallowed by a bare ``except`` and then
    # crashed on the unbound ``resp``; report it and give up instead.
    try:
        resp = requests.get(urimid_,
                            headers=headers,
                            timeout=180,
                            allow_redirects=True,
                            stream=True)
    except requests.exceptions.RequestException as e:
        print(str(e))
        return None

    with open(warc_file, 'wb') as poutput:
        writer = WARCWriter(poutput, gzip=False)
        cont = resp.content
        headers_list = resp.headers.items()
        http_headers = StatusAndHeaders(str(resp.status_code),
                                        headers_list,
                                        protocol='HTTP/1.0')
        record = writer.create_warc_record(urimid_,
                                           'response',
                                           payload=BytesIO(cont),
                                           http_headers=http_headers)
        # Best effort: a failed write is reported and surfaces below as
        # "no matching record".
        try:
            writer.write_record(record)
        except Exception as e:
            print(str(e))
    toc0 = timeit.default_timer()

    if not os.path.exists(warc_file):
        return None

    with open(warc_file, 'rb') as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type != 'response':
                continue
            if record.rec_headers.get_header('WARC-Target-URI') != urimid_:
                continue

            status_code = record.http_headers.statusline.split()[0]
            entity = record.content_stream().read()
            hdrs, hdrs_values, hdrs_keys = extrcated_headers_from_warc_record(
                record, status_code)
            hdrs["Preference-Applied"] = "original-links, original-content"

            digest_input = entity + hdrs_values.encode()
            md5h = hashlib.md5(digest_input).hexdigest()
            sha256h = hashlib.sha256(digest_input).hexdigest()
            hash_v = "md5:{} sha256:{}".format(md5h, sha256h)
            hash_constructor = "(curl -s '$uri-m' && echo -n '" + hdrs_keys + "') | tee >(sha256sum) >(md5sum) >/dev/null | cut -d ' ' -f 1 | paste -d':' <(echo -e 'md5\nsha256') - | paste -d' ' - -"

            manif["http-headers"] = hdrs
            manif["hash"] = hash_v
            manif["hash-constructor"] = hash_constructor
            # The manifest id embeds the hash of the manifest itself, so the
            # hash is taken over a version with a "/ /" placeholder that is
            # substituted afterwards.
            manif["@id"] = "http://manifest.ws-dl.cs.odu.edu/manifest/" + downloadtime + '/ /' + urim
            manif_file = json.dumps(manif, indent=4)
            self_hash = hashlib.sha256(manif_file.encode()).hexdigest()
            manif["@id"] = manif["@id"].replace("/ /", "/" + self_hash + "/")

            manifest_path = outMainDir + '/' + self_hash + '.json'
            with open(manifest_path, 'w') as outfile:
                json.dump(manif, outfile, indent=4)

            toc_all = timeit.default_timer()
            time_json['time_in_seconds_to_download_memento'] = toc0 - tic0
            time_json['time_in_seconds_to_generate_fixity'] = (
                toc_all - tic_all) - time_json['time_in_seconds_to_download_memento']
            with open(manifest_path + '.time', 'w') as outfile:
                json.dump(time_json, outfile, indent=4)

            return manifest_path

    # No response record for the requested URI was found in the WARC.
    return None
def synthesize_warc(urim, session, output_directory):
    """Synthesize a gzipped WARC holding the raw content of one memento.

    The memento's response headers supply the original resource URI (from the
    ``Link`` header) and the ``Memento-Datetime``; the payload comes from the
    raw memento URI generated by ``otmt``. Nothing is written when a WARC for
    this URI-M already exists in ``output_directory`` or when either piece of
    metadata is missing.

    Args:
        urim: URI-M of the memento.
        session: requests-compatible session used for both downloads.
        output_directory: directory receiving ``<md5(urim)>-<timestamp>.warc.gz``.

    Raises:
        requests.HTTPError: if the memento or its raw version returns an
            error status.
    """
    import otmt
    import glob
    from warcio.warcwriter import WARCWriter
    from warcio.statusandheaders import StatusAndHeaders
    from hashlib import md5
    from datetime import datetime

    urlhash = md5(urim.encode('utf8')).hexdigest()

    if glob.glob('{}/{}*.warc.gz'.format(output_directory, urlhash)):
        module_logger.warning("Detected existing WARC for URI-M, skipping {}".format(urim))
        return

    resp = session.get(urim, stream=True)
    try:
        resp.raise_for_status()

        # Copy the headers so they remain usable after the response is closed.
        headers_list = list(resp.raw.headers.items())

        warc_target_uri = None
        # we have to implement this construct in case the archive combines
        # original with other relations
        for link in resp.links:
            if 'original' in link:
                warc_target_uri = resp.links[link]['url']

        if warc_target_uri is None:
            module_logger.warning("could not find this memento's original resource, skipping {}".format(urim))
            return

        try:
            mdt = resp.headers['Memento-Datetime']
        except KeyError:
            module_logger.warning("could not find this memento's memento-datetime, skipping {}".format(urim))
            return

        # we use response.url instead of urim to (hopefully) avoid raw redirects
        raw_urim = otmt.generate_raw_urim(resp.url)
    finally:
        # The body of this streamed response is never read; release the connection.
        resp.close()

    warc_date = datetime.strptime(
        mdt, "%a, %d %b %Y %H:%M:%S GMT"
    ).strftime('%Y-%m-%dT%H:%M:%SZ')
    module_logger.debug("mdt formatted by strptime and converted by strftime: {}".format(warc_date))

    http_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.0')
    warc_headers_dict = {'WARC-Date': warc_date}

    # Fetch the raw memento only once all metadata is known, so skipped
    # mementos cost a single request.
    raw_response = session.get(raw_urim, stream=True)
    try:
        # The record is written with a fixed "200 OK" status line, so an
        # error page must not be stored as the payload.
        raw_response.raise_for_status()

        output_path = "{}/{}-{}.warc.gz".format(
            output_directory, urlhash, datetime.now().strftime('%Y%m%d%H%M%S'))
        with open(output_path, 'wb') as output:
            writer = WARCWriter(output, gzip=True)
            record = writer.create_warc_record(
                warc_target_uri, 'response',
                payload=raw_response.raw,
                http_headers=http_headers,
                warc_headers_dict=warc_headers_dict
            )
            writer.write_record(record)
    finally:
        raw_response.close()
# Recover the original URL and the mirror date from the HTTrack footer comment
# ("<!-- Mirrored from <url> by HTTrack Website Copier ... [..], <date> -->")
# of the file at ``filepath``, then store the file as a WARC response record.
url = None
date = None
with open(filepath, 'rb') as content_file:
    content = content_file.read()

for line in content.split(b"\n"):
    if re.search(rb'<!-- Mirrored from .* by HTTrack Website Copier.*\[.*\],', line):
        url = re.sub(rb'.*<!-- Mirrored from ', b'',
                     re.sub(rb' by HTTrack Website Copier.*', b'', line))
        date = re.sub(rb'.+by HTTrack Website.+\[.+\][^,]*, ', b'',
                      re.sub(rb' -->.*', b'', line))
        break

# Use the mirror date when it parses, otherwise fall back to "now".
dvalue = None
if date is not None:
    try:
        dvalue = parse(date.decode("utf8")).strftime('%Y-%m-%dT%H:%M:%SZ')
    except (ValueError, OverflowError):
        # ValueError also covers UnicodeDecodeError; the date parser can
        # additionally raise OverflowError for out-of-range values.
        dvalue = None
if dvalue is None:
    dvalue = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')

if url is None:
    urlStr = "unknown"
else:
    try:
        urlStr = url.decode("utf8")
    except UnicodeDecodeError:
        urlStr = "unknown-encoding"

with open(filepath, 'rb') as content_file:
    record = writer.create_warc_record(urlStr, 'response', payload=content_file)
    writer.write_record(record)
class CCWARCWriter:
    """Write DNS answers and HTTP request/response pairs to WARC files.

    A file is opened lazily on the first write. After each request/response
    pair the file is closed once it has grown beyond ``max_size`` bytes, and
    the next write opens a new file with the next serial number.

    Args:
        prefix: first component of every file name; also stored as
            ``isPartOf`` in the warcinfo record.
        max_size: size in bytes after which the current file is closed.
        subprefix: optional second component of the file name.
        gzip: whether records are gzip-compressed (adds ``.gz`` to the name).
        get_serial: optional callable ``filename -> str`` supplying the serial
            part of the file name; an internal counter is used when omitted.
    """

    def __init__(self, prefix, max_size, subprefix=None, gzip=True, get_serial=None):
        self.writer = None
        self.prefix = prefix
        self.subprefix = subprefix
        self.max_size = max_size
        self.gzip = gzip
        self.hostname = socket.gethostname()
        self.external_get_serial = get_serial
        self.serial = 0

    def __del__(self):
        # ``writer`` is only set while a file is open.
        if self.writer is not None:
            self.f.close()

    def create_default_info(self, version, ip, description=None, creator=None, operator=None):
        '''
        Build, remember (as ``self.info``) and return the warcinfo fields.

        creator:  # person, organization, service
        operator:  # person, if creator is an organization
        isPartOf:  # name of the crawl
        '''
        info = OrderedDict()

        info['software'] = 'cocrawler/' + version
        info['hostname'] = self.hostname
        info['ip'] = ip
        if description:
            info['description'] = description
        if creator:
            info['creator'] = creator
        if operator:
            info['operator'] = operator
        info['isPartOf'] = self.prefix  # intentionally does not include subprefix
        info['format'] = 'WARC file version 1.0'
        self.info = info
        return info

    def open(self):
        """Open the next WARC file and write its warcinfo record.

        Uses ``self.info``, so ``create_default_info`` must have been called
        (or ``info`` assigned) beforehand.
        """
        filename = self.prefix
        if self.subprefix:
            filename += '-' + self.subprefix
        serial = self.get_serial(filename)
        filename += '-' + serial + '-' + self.hostname + '.warc'
        if self.gzip:
            filename += '.gz'
        self.filename = filename
        self.f = open(filename, 'wb')
        self.writer = WARCWriter(self.f, gzip=self.gzip)
        record = self.writer.create_warcinfo_record(self.filename, self.info)
        self.writer.write_record(record)

    def get_serial(self, filename):
        """Return the serial string for ``filename``.

        Delegates to the ``get_serial`` callable given to the constructor when
        there is one; otherwise returns a zero-padded six-digit counter that
        starts at ``000000``.
        """
        if self.external_get_serial is not None:
            return self.external_get_serial(filename)
        self.serial += 1
        return '{:06}'.format(self.serial - 1)

    def maybe_close(self):
        '''
        Close the current file once it is larger than ``max_size``.

        TODO: always close/reopen if subprefix is not None; minimizes open filehandles
        '''
        fsize = os.fstat(self.f.fileno()).st_size
        if fsize > self.max_size:
            self.f.close()
            self.writer = None

    def write_dns(self, dns, expires, url):
        """Write one ``text/dns`` resource record for the host of ``url``.

        Args:
            dns: iterable of answers; each is indexed with ``['host']``.
            expires: absolute expiry time (epoch seconds) used for the TTL.
            url: object exposing ``hostname``.
        """
        # write it out even if empty
        # TODO: we filter the addresses early, should we warc the unfiltered dns repsonse?

        # the response object doesn't contain the query type 'A' or 'AAAA'
        # but it has family=2 AF_INET (ipv4) and flags=4 AI_NUMERICHOST -- that's 'A'
        kind = 'A'  # fixme IPV6

        ttl = int(expires - time.time())
        host = url.hostname

        if self.writer is None:
            self.open()

        lines = [timestamp_now()]
        for r in dns:
            try:
                lines.append(host + '.\t' + str(ttl) + '\tIN\t' + kind + '\t' + r['host'])
            except Exception as e:
                # Best effort: skip answers that cannot be rendered. The
                # placeholders are required; passing extra arguments without
                # them makes the logging module report a formatting error.
                LOGGER.info('problem converting dns reply for warcing: %s %s %s', host, r, e)
        payload = ('\r\n'.join(lines) + '\r\n').encode('utf-8')

        record = self.writer.create_warc_record('dns:' + host, 'resource',
                                                payload=BytesIO(payload),
                                                warc_content_type='text/dns',
                                                length=len(payload))

        self.writer.write_record(record)
        LOGGER.debug('wrote warc dns response record%s for host %s', p(self.prefix), host)
        stats.stats_sum('warc dns' + p(self.prefix), 1)

    def write_request_response_pair(self, url, req_headers, resp_headers,
                                    is_truncated, payload, digest=None):
        """Write a request record and its response record, then rotate if needed.

        Args:
            url: target URI of the response record.
            req_headers: request headers, converted with ``headers_to_str_headers``.
            resp_headers: response headers, converted the same way.
            is_truncated: falsy, or the ``WARC-Truncated`` reason; values not
                in ``valid_truncations`` are recorded as ``unspecified``.
            payload: response body as bytes.
            digest: optional value for ``WARC-Payload-Digest``.
        """
        if self.writer is None:
            self.open()

        # XXX WARC-Identified-Payload-Type set from Apache Tika? (done by Common Crawl) (how expensive?)

        # NOTE(review): the request line and request target URI are fixed
        # placeholders rather than derived from ``url`` - confirm this is intended.
        req_http_headers = StatusAndHeaders('GET / HTTP/1.1',
                                            headers_to_str_headers(req_headers))

        request = self.writer.create_warc_record('http://example.com/', 'request',
                                                 http_headers=req_http_headers)

        resp_http_headers = StatusAndHeaders('200 OK',
                                             headers_to_str_headers(resp_headers),
                                             protocol='HTTP/1.1')

        warc_headers_dict = {}
        if digest is not None:
            warc_headers_dict['WARC-Payload-Digest'] = digest
        if is_truncated:
            if is_truncated in valid_truncations:
                warc_headers_dict['WARC-Truncated'] = is_truncated
            else:
                # %-style argument: a non-string value must not raise here.
                LOGGER.error('Invalid is_truncation of %s', is_truncated)
                warc_headers_dict['WARC-Truncated'] = 'unspecified'

        response = self.writer.create_warc_record(url, 'response',
                                                  payload=BytesIO(payload),
                                                  length=len(payload),
                                                  warc_headers_dict=warc_headers_dict,
                                                  http_headers=resp_http_headers)

        self.writer.write_request_response_pair(request, response)
        self.maybe_close()
        LOGGER.debug('wrote warc request-response pair%s for url %s', p(self.prefix), url)
        stats.stats_sum('warc r/r' + p(self.prefix), 1)
]: payload[input.get('name')] = input.get('value') payload["ctl00$ctl00$bodyContent$mainContent$ddlYears"] = str(year) return payload def post_url(url, headers): return url + '?' + functools.reduce( lambda acc, k: acc + '&' + k + '=' + headers[k], headers.keys()) if __name__ == "__main__": config.initalize_project_root() today = time.strftime("%Y%m%d%H%M%S", time.gmtime()) #config.initalize_record() r = requests.get(config.AGENDA_URL) session_headers = parse_session(r.text, 2018) fileName = 'rec-' + today + '-psuedos-MacBook-Pro.local.warc.gz' with open(fileName, 'wb') as output: writer = WARCWriter(output, gzip=True) response = requests.post(config.AGENDA_URL, data=session_headers) headers_list = response.raw.headers.items() http_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.0') record = writer.create_warc_record(config.AGENDA_URL, 'response', payload=response.raw, http_headers=http_headers) writer.write_record(record)
# Convert every non-redirect HTML article of the ZIM dump into a WARC
# "conversion" record that holds the article's plain text.
with open('example.warc.wet.gz', 'wb') as output:
    writer = WARCWriter(output, gzip=True)
    with ZIMFile("data/wikipedia_en_simple_all_nopic_2020-12.zim") as reader:
        for uid in range(reader.article_count):
            # progress report every 10,000 articles
            if not uid % 10_000:
                print("{} out of {}".format(uid, reader.article_count))

            article = reader.get_article_by_id(uid)

            # Reading the mimetype may raise RuntimeError; such entries are skipped.
            try:
                is_html = article.mimetype == "text/html"
            except RuntimeError:
                continue
            if not is_html or article.is_redirect:
                continue

            target_uri = 'https://simple.wikipedia.org/wiki/{}'.format(quote(article.url))
            plain_text = html2text(bytes(article.content).decode('utf8'))
            record = writer.create_warc_record(
                target_uri,
                'conversion',
                payload=BytesIO(plain_text.encode('utf8')),
            )
            writer.write_record(record)
class WarcDownloader:
    """
        Download URL with HTTP GET, save to a WARC file and return the decoded text

        Args:
            filename: path of the WARC file to create.
            logger_: object exposing ``log(level, message)``.
            program_name: value of the ``software`` field of the warcinfo record.
            user_agent: value of the ``User-agent`` request header.
            overwrite_warc: when false, an unused ``<name>-NNNNN<ext>`` file
                name is chosen instead of overwriting an existing file.
            err_threshold: number of download errors after which
                ``NameError`` is raised to abort.
            warcinfo_record_data: optional ``(warc_headers, custom_headers)``
                pair used to recreate an existing warcinfo record.
            known_bad_urls: optional path of a file listing URLs (one per
                line) that must not be downloaded.
            max_no_of_calls_in_period, limit_period: rate limit settings.
            proxy_url: optional proxy used for both http and https.
            allow_cookies: when false, session cookies are cleared before
                every request.
    """
    def __init__(self, filename, logger_, program_name='corpusbuilder 1.0', user_agent=None, overwrite_warc=True,
                 err_threshold=10, warcinfo_record_data=None, known_bad_urls=None, max_no_of_calls_in_period=2,
                 limit_period=1, proxy_url=None, allow_cookies=False):
        if known_bad_urls is not None:  # Setup the list of cached bad URLs to prevent trying to download them again
            with open(known_bad_urls, encoding='UTF-8') as fh:
                self.bad_urls = {line.strip() for line in fh}
        else:
            self.bad_urls = set()

        if not overwrite_warc:  # Find out next nonexisting warc filename
            filename = self._next_free_filename(filename)

        logger_.log('INFO', 'Creating archivefile: {0}'.format(filename))
        self._output_file = open(filename, 'wb')
        self._logger_ = logger_
        self._req_headers = {'Accept-Encoding': 'identity', 'User-agent': user_agent}

        self._session = Session()  # Setup session for speeding up downloads

        if proxy_url is not None:  # Set socks proxy if provided
            self._session.proxies['http'] = proxy_url
            self._session.proxies['https'] = proxy_url

        self._allow_cookies = allow_cookies

        # Setup rate limiting to prevent hammering the server
        self._requests_get = sleep_and_retry(limits(calls=max_no_of_calls_in_period,
                                                    period=limit_period)(self._http_get_w_cookie_handling))
        self._error_count = 0
        self._error_threshold = err_threshold  # Set the error threshold which cause aborting to prevent denial

        self._writer = WARCWriter(self._output_file, gzip=True)
        if warcinfo_record_data is None:
            # INFO RECORD
            # Some custom information about the warc writer program and its settings
            info_headers = {'software': program_name,
                            'arguments': ' '.join(sys.argv[1:]),
                            'format': 'WARC File Format 1.0',
                            'conformsTo': 'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'}
            info_record = self._writer.create_warcinfo_record(filename, info_headers)
        else:  # Must recreate custom headers else they will not be copied
            custom_headers = ''.join('{0}: {1}\r\n'.format(k, v) for k, v in warcinfo_record_data[1].items()).\
                             encode('UTF-8')
            info_record = self._writer.create_warc_record('', 'warcinfo', warc_headers=warcinfo_record_data[0],
                                                          payload=BytesIO(custom_headers),
                                                          length=len(custom_headers))
        self._writer.write_record(info_record)

    def __del__(self):
        if hasattr(self, '_output_file'):  # If the program opened a file, then it should gracefully close it on exit!
            self._output_file.close()

    @staticmethod
    def _next_free_filename(filename):
        """Return ``filename`` or the first unused ``<base>-NNNNN<ext>`` variant.

        Base name and extension are split once, so the counter replaces the
        previous candidate's suffix instead of being appended to it
        (``a-00001.warc.gz`` rather than ``a-00000-00001.warc.gz``).
        """
        if not os.path.exists(filename):
            return filename
        base, ext = os.path.splitext(filename)  # Should be filename.warc.gz
        if ext == '.gz' and base.endswith('.warc'):
            base, ext2 = os.path.splitext(base)  # Should be filename.warc
            ext = ext2 + ext  # Should be .warc.gz
        num = 0
        candidate = '{0}-{1:05d}{2}'.format(base, num, ext)
        while os.path.exists(candidate):
            num += 1
            candidate = '{0}-{1:05d}{2}'.format(base, num, ext)
        return candidate

    def _http_get_w_cookie_handling(self, *args, **kwargs):
        """
            Extend requests.get with optional cookie purging
        """
        if not self._allow_cookies:
            self._session.cookies.clear()
        return self._session.get(*args, **kwargs)

    def _handle_request_exception(self, url, msg):
        """Log the problem and raise ``NameError`` once the error threshold is reached."""
        self._logger_.log('WARNING', '\t'.join((url, msg)))

        self._error_count += 1
        if self._error_count >= self._error_threshold:
            raise NameError('Too many error happened! Threshold exceeded! See log for details!')

    def _report_download_error(self, url, err):
        """Report a failed download with the standard "ignored" message."""
        self._handle_request_exception(url, 'RequestException happened during downloading: {0} \n\n'
                                            ' The program ignores it and jumps to the next one.'.format(err))

    def download_url(self, url):
        """Download ``url``, archive the request/response pair and return the text.

        Returns:
            The decoded response body, or ``None`` when the URL is a known bad
            URL, the request failed, the status was not 200 or the body was empty.

        Raises:
            NameError: when the number of failed downloads reaches the threshold.
        """
        scheme, netloc, path, params, query, fragment = urlparse(url)
        path = quote(path)  # For safety urlencode the generated URL...
        url = urlunparse((scheme, netloc, path, params, query, fragment))

        if url in self.bad_urls:
            self._logger_.log('INFO', 'Not downloading known bad URL: {0}'.format(url))
            return None

        try:  # The actual request
            resp = self._requests_get(url, headers=self._req_headers, stream=True)
        except RequestException as err:
            self._report_download_error(url, err)
            return None

        if resp.status_code != 200:  # Not HTTP 200 OK
            status_code, reason = resp.status_code, resp.reason
            resp.close()  # The streamed body is not read; release the connection
            self._handle_request_exception(url, 'Downloading failed with status code: {0} {1}'.format(status_code,
                                                                                                      reason))
            return None

        # REQUEST
        reqv_headers = resp.request.headers
        reqv_headers['Host'] = netloc

        proto = 'HTTP/{0}'.format(respv_str[resp.raw.version])  # Friendly protocol name
        reqv_http_headers = StatusAndHeaders('GET {0} {1}'.format(urlunparse(('', '', path, params, query, fragment)),
                                                                  proto), reqv_headers.items(), is_http_request=True)
        reqv_record = self._writer.create_warc_record(url, 'request', http_headers=reqv_http_headers)

        # RESPONSE
        resp_status = '{0} {1}'.format(resp.status_code, resp.reason)
        resp_headers_list = resp.raw.headers.items()  # get raw headers from urllib3
        # Must get peer_name before the content is read
        # It has no official API for that:
        # https://github.com/kennethreitz/requests/issues/2158
        # https://github.com/urllib3/urllib3/issues/1071
        # So workaround to be compatible with windows:
        # https://stackoverflow.com/questions/22492484/how-do-i-get-the-ip-address-from-a-http-request-using-the-\
        # requests-library/22513161#22513161
        try:
            peer_name = resp.raw._connection.sock.getpeername()[0]  # Must get peer_name before the content is read
        except AttributeError:  # On Windows there is no getpeername() Attribute of the class...
            try:
                peer_name = resp.raw._connection.sock.socket.getpeername()[0]
            except AttributeError:
                peer_name = 'None'  # Socket closed and could not determine peername...

        try:
            data = resp.raw.read()  # To be able to return decoded and also write warc
        except ProtocolError as err:
            resp.close()
            self._report_download_error(url, err)
            return None

        if len(data) == 0:
            self._report_download_error(url, 'Response data has zero length!')
            return None

        enc = resp.encoding  # Get or detect encoding to decode the bytes of the text to str
        if enc is None:
            enc = detect(data)['encoding']
        if enc is None:
            # Detection can fail as well; decoding with ``None`` would raise TypeError
            enc = 'UTF-8'
        try:
            text = data.decode(enc)  # Normal decode process
        except UnicodeDecodeError:
            self._logger_.log('WARNING', '\t'.join(('DECODE ERROR RETRYING IN \'IGNORE\' MODE:', url, enc)))
            text = data.decode(enc, 'ignore')
        except LookupError:
            # The server declared a charset Python does not know
            self._logger_.log('WARNING', '\t'.join(('DECODE ERROR RETRYING IN \'IGNORE\' MODE:', url, enc)))
            enc = 'UTF-8'
            text = data.decode(enc, 'ignore')
        data_stream = BytesIO(data)  # Need the original byte stream to write the payload to the warc file

        resp_http_headers = StatusAndHeaders(resp_status, resp_headers_list, protocol=proto)
        # Add extra headers like encoding because it is not stored any other way...
        resp_record = self._writer.create_warc_record(url, 'response', payload=data_stream,
                                                      http_headers=resp_http_headers,
                                                      warc_headers_dict={'WARC-IP-Address': peer_name,
                                                                         'WARC-X-Detected-Encoding': enc})

        # Everything is OK, write the two WARC records
        self._writer.write_record(reqv_record)
        self._writer.write_record(resp_record)

        return text

    def write_record(self, record):
        """Write an already built WARC record to the archive."""
        self._writer.write_record(record)
def facebook_user_timeline(self,
                           seed_id,
                           username,
                           nsid,
                           sleep_between_harvests=True):
    """Scrape a user timeline and store the posts as a WARC metadata record.

    The Facebook id is taken from ``nsid``, from the state store, or looked
    up through ``get_fbid`` (in that order). All scraped posts are written as
    one JSON payload to a new ``.warc.gz`` file in ``self.warc_temp_dir``.
    With the ``incremental`` option scraping stops at the post id remembered
    from the previous harvest and the newest post id is stored afterwards.
    With the ``harvest_media`` option the low quality image links of the
    posts are requested through ``_harvest_media_url``.

    Args:
        seed_id: id of the seed, used for logging and warnings.
        username: Facebook username (may be empty when ``nsid`` is given).
        nsid: Facebook id (may be empty when ``username`` is given).
        sleep_between_harvests: sleep roughly 15-40 minutes before returning.
    """
    log.debug("Harvesting user %s with seed_id %s.", username, seed_id)

    # make sure either username or nsid is present to start scraping
    assert username or nsid

    # possibly get fbid from state.json
    if not nsid:
        nsid = self.state_store.get_state(
            __name__, u"timeline.{}.fbid".format(username))
        log.info("Trying to retrieve FB-ID from state store")
        log.info("Found FB-ID from state store is %s", nsid)

    # Possibly look up fbid if not supplied and not already in state.json
    if username and not nsid:
        log.debug("No FB userid, retrieving it")
        nsid = self.get_fbid(username)
        # write id to state.json if not already there
        key = "timeline.{}.fbid".format(username)
        self.state_store.set_state(__name__, key, nsid)
        log.info("Writing fbid to state store")

    if nsid:
        # report back whether user id was found
        log.info("FB userid %s", nsid)
        # todo - need to add timeout and what to do if blocked
        options = self.message.get("options", {})
        incremental = options.get("incremental", False)
        harvest_media = options.get("harvest_media", False)

        if harvest_media:
            # disable verbose logging of the state store to avoid overlong log messages
            # (the state store holds the list of all harvested media URLs)
            self.state_store.verbose = False

        since_id = None
        if incremental:
            # search for since_id of post
            since_id = self.state_store.get_state(
                __name__, u"timeline.{}.since_id".format(nsid))

        scrape_result = []

        # check if blocked, usually lasts 24 hours
        if "Temporarily Blocked" in requests.get(
                "https://m.facebook.com/" + nsid).text:
            # sleep 24 hours
            log.debug("Temporarily blocked - waiting 24 hours")
            time.sleep(86429)

        # check for cookies. otherwise don't use
        if os.path.isfile("/tmp/cookies.json"):
            fb_cookies = "/tmp/cookies.json"
        else:
            fb_cookies = None

        for post in facebook_scraper.get_posts(nsid,
                                               pages=self.pages,
                                               options={
                                                   "allow_extra_requests": False,
                                                   "posts_per_page": 200
                                               },
                                               timeout=30,
                                               cookies=fb_cookies):
            scrape_result.append(post)
            self.result.harvest_counter["posts"] += 1
            self.result.increment_stats("posts")

            # for very long harvests, try to avoid blocking by sleeping after a
            # certain amount of posts
            if self.result.harvest_counter["posts"] in [
                    2000, 4000, 6000, 8000
            ]:
                log.info(
                    "Waiting a few minutes to avoid block bc of too many requests"
                )
                time.sleep(random.uniform(100, 650))

            if incremental and post["post_id"] == since_id and post[
                    "post_id"]:
                log.info(
                    "Stopping, found last post that was previously harvested with id: %s",
                    post["post_id"])
                break

        # harvesting media (images_lowquality links!)
        # doing this after post scrape to avoid potential blocks
        if harvest_media:
            img_counter = 1
            for post in scrape_result:
                # first condition avoids parsing empty lists (i.e. no media);
                # beyond 1000 posts with images only a shrinking random
                # sample is harvested
                if post['images_lowquality'] and (
                        img_counter <= 1000 or
                        (1000 / img_counter) >= random.random()):
                    log.info("Harvesting media from post")
                    self.result.harvest_counter["images"] += 1
                    self.result.increment_stats("images")
                    img_counter += 1
                    # get media content from links - should automatically be caught within warc stream
                    # all photos on fb are jpgs, anything else (e.g. video) is not harvested
                    for media_url in post['images_lowquality']:
                        if 'jpg' in media_url:
                            self._harvest_media_url(media_url)

        # filename will later be converted to path
        # replicating pattern from https://github.com/internetarchive/warcprox/blob/f19ead00587633fe7e6ba6e3292456669755daaf/warcprox/writer.py#L69
        # create random token for filename
        random_token = ''.join(
            random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
        serial_no = '00000'
        file_name = safe_string(
            self.message["id"]) + "-" + warcprox.timestamp17(
            ) + "-" + serial_no + "-" + random_token

        with open(os.path.join(self.warc_temp_dir, file_name + ".warc.gz"),
                  "wb") as result_warc_file:
            log.info("Writing json-timeline result to path %s",
                     self.warc_temp_dir)
            writer = WARCWriter(result_warc_file, gzip=True)

            def json_date_converter(o):
                """ Converts datetime.datetime items in facebook_scraper result
                to formate suitable for json.dumps"""
                if isinstance(o, datetime.datetime):
                    return o.__str__()

            json_payload = json.dumps(scrape_result,
                                      default=json_date_converter,
                                      ensure_ascii=False).encode("utf-8")

            record = writer.create_warc_record(
                username,
                'metadata',
                payload=BytesIO(json_payload),
                warc_content_type="application/json")
            writer.write_record(record)
            log.info("Writing scraped results to %s", self.warc_temp_dir)

        if incremental:
            # some posts will have post["post_id"] None
            # I take the latest post without a None id
            # if no post with a post_id is found nothing will
            # be written to the state store
            latest_post = next(
                (candidate for candidate in scrape_result
                 if candidate["post_id"] is not None), None)

            if latest_post:
                max_post_time = latest_post.get("time")
                max_post_id = latest_post.get("post_id")

                assert max_post_time and max_post_id

                # write most recent post ID to state store
                key = "timeline.{}.since_id".format(nsid)
                self.state_store.set_state(__name__, key, max_post_id)
                log.info(
                    "Wrote first scraped post to state_store: %s (state: %s)",
                    max_post_id, key)

    else:
        msg = "NSID not found for user {}".format(username)
        # No exception is being handled here, so log.exception() would attach
        # a meaningless "NoneType: None" traceback; log at the same level.
        log.error(msg)
        self.result.warnings.append(
            Msg(CODE_UID_NOT_FOUND, msg, seed_id=seed_id))

    # sleep approx. 15 min before starting next harvest to ensure politness
    if sleep_between_harvests:
        log.info("Waiting approx. 15 before next harvest")
        time.sleep(random.randint(850, 2500))
def facebook_user_bio(self, username):
    """Scrapes Facebook bio and returns info on the information contained on
    the about page (e.g. https://www.facebook.com/pg/SPD/about/?ref=page_internal)

    The like/follower count and the page transparency box are read with a
    selenium webdriver from the mobile site; the about page is then requested
    and parsed for the bio fields and the profile/cover photo links. The
    collected values are written as one JSON metadata record to a new
    ``.warc.gz`` file in ``self.warc_temp_dir``.

    The webdriver is always quit, also when scraping raises. When the page
    layout is not recognised the like count and transparency text are stored
    as ``None`` instead of failing on an unbound variable.

    @param username: Facebook username
    @return: None; the bio is only written to the WARC file
    """
    # ensure username is clean and can be accessed
    if username.startswith(("https://www.facebook.com/",
                            "http://www.facebook.com/",
                            "www.facebook.com/")):
        username = re.sub(r'^.+facebook\.com\/', '', username)
        # possibly also remove trailing /
        username = re.sub(r'\/$', '', username)

    # created at field
    fb_general = base_fb_url + username
    # bio info
    fb_about = base_fb_url + username + "/about/?ref=page_internal"
    # site transparency (e.g. admins)
    m_fb_general = "https://m.facebook.com/" + username
    site_transparency_detail_id_selector = "//*[contains(text(), 'Page history')]/ancestor::div/ancestor::div"
    site_transparency_class_selector = "._a58._a5o._9_7._2rgt._1j-g._2rgt._86-3._2rgt._1j-g._2rgt"

    n_likes = None
    site_transparency_text = None

    driver = self.initiate_selenium_webdriver()
    try:
        # check whether cookies are present, otherwise try to log in
        if os.path.isfile("/tmp/cookies.json"):
            # first navigate to fb, otherwise
            # selenium does not accept the cookies
            driver.get("https://m.facebook.com/")
            # load cookies
            with open("/tmp/cookies.json") as f:
                cookies = json.load(f)
            # add to driver
            for cookie in cookies:
                driver.add_cookie(cookie)
        # if no cookies, try to login
        else:
            self.fb_login(driver=driver)
            time.sleep(random.uniform(3, 9))
            # then write cookies
            cookies = driver.get_cookies()
            with open("/tmp/cookies.json", "w") as f:
                json.dump(cookies, f)

        time.sleep(random.uniform(3, 9))
        # navigate to site
        driver.get(m_fb_general)
        time.sleep(random.uniform(3, 9))
        # site info only loads on scroll
        driver.execute_script("window.scrollTo(0, 800)")

        # extract likes
        site_likes_xpath = "//div[@class=\"_59k _2rgt _1j-f _2rgt\"]"
        likes = [
            single_div.text
            for single_div in driver.find_elements_by_xpath(site_likes_xpath)
            if "like this" in single_div.text
        ]
        # new fb page layout has followers instead of likes
        if not likes:
            followers_xpath = "//*[@id=\"profile_intro_card\"]/div[1]/div/div[1]/div[2]/div/div/div/span"
            follower_elements = driver.find_elements_by_xpath(followers_xpath)
            likes = [element.text for element in follower_elements[:1]]

        if likes:
            n_likes = likes[0]

        if n_likes is None:
            log.warning("Could not find likes or followers for %s",
                        m_fb_general)
        # old page layout
        elif "like this" in n_likes:
            time.sleep(random.uniform(20, 25))
            WebDriverWait(driver, 20).until(
                ec.presence_of_element_located(
                    (By.CSS_SELECTOR, site_transparency_class_selector)))
            site_transparency = driver.find_elements_by_css_selector(
                site_transparency_class_selector)
            # site transparency should always be below about
            time.sleep(random.uniform(5, 9))
            site_transparency[1].click()
            time.sleep(random.uniform(15, 20))
            # simply get the whole text of the transparency box of site
            # the exact info can be extracted ex-post
            WebDriverWait(driver, 20).until(
                ec.presence_of_element_located(
                    (By.XPATH, site_transparency_detail_id_selector)))
            time.sleep(random.uniform(3, 9))
            site_transparency_text = driver.find_elements_by_xpath(
                site_transparency_detail_id_selector)[0].text
            time.sleep(random.uniform(3, 9))
        # new page layout
        elif "Followers" in n_likes:
            # click on about
            time.sleep(random.uniform(20, 25))
            more_about_xpath = "//*[@id=\"profile_intro_card\"]/div[2]/div/div[3]/div/a"
            more_about = driver.find_element_by_xpath(more_about_xpath)
            more_about.click()
            time.sleep(random.uniform(2, 7))
            see_more_transparency_xpath = "//*[@id=\"transparency\"]/header/div/div[2]/div/div/a"
            see_more_transparency = driver.find_element_by_xpath(
                see_more_transparency_xpath)
            see_more_transparency.click()
            time.sleep(random.uniform(2, 7))
            all_divs_xpath = "//*[contains(text(), 'Page history')]/ancestor::div/ancestor::div"
            all_divs = driver.find_elements_by_xpath(all_divs_xpath)
            # account for different way of writing
            if len(all_divs) == 0:
                all_divs_xpath = "//*[contains(text(), 'Page History')]/ancestor::div/ancestor::div"
                all_divs = driver.find_elements_by_xpath(all_divs_xpath)
            if all_divs:
                site_transparency_text = all_divs[0].text
            else:
                log.warning("Could not find transparency box for %s",
                            m_fb_general)
        else:
            log.warning("Unknown page layout for %s, skipping transparency box",
                        m_fb_general)
    finally:
        # always release the browser, also when a wait times out
        driver.quit()

    log.info("Finished scraping transparency box")

    bio_dict = {
        "username": fb_general,
        "n_likes": n_likes,
        "transparency_text": site_transparency_text
    }

    # request about html
    # NOTE(review): the previous code loaded /tmp/cookies.json here ("tries to
    # add cookies, otherwise it won't work anymore") but never passed the
    # cookies to the request; the request is still sent without cookies -
    # confirm whether they should be attached.
    r_about = requests.get(fb_about)

    # ensure no 404's
    if not r_about:
        log.debug("Couldn't access username/about site: %s", fb_about)
        return

    about_soup = BeautifulSoup(r_about.content, "html.parser")
    mission_text = about_soup.find_all('div', {'class': "_4bl9"})

    for divs in mission_text:
        describing_div = divs.find('div', {'class': '_50f4'})
        content_div = divs.find('div', {'class': '_3-8w'})

        if describing_div and content_div:
            bio_dict[describing_div.text] = content_div.text

    # photos
    # Retrieves profile and cover photo of public facebook page
    # bio going to the 'about' page, parsing html and getting
    # the links to photos from script tag, these can then be passed
    # harvest_media
    # this is not affected by the harvest_media options but will always happen
    all_scripts = about_soup.find_all('script')

    for js in all_scripts:
        for content in js.contents:
            if 'cover_photo' in content:
                # isolate relevant links
                links = re.findall(r'https\:\\/\\/scontent[^"]*', content)
                for link in links:
                    # remove escaped front slashes
                    self._harvest_media_url(re.sub(r'\\', "", link))

    # the about page was found (checked above), so the warc is written
    # filename will later be converted to path
    # replicating pattern from https://github.com/internetarchive/warcprox/blob/f19ead00587633fe7e6ba6e3292456669755daaf/warcprox/writer.py#L69
    # create random token for filename
    random_token = ''.join(
        random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
    serial_no = '00000'
    file_name = safe_string(
        self.message["id"]) + "-" + warcprox.timestamp17(
        ) + "-" + serial_no + "-" + random_token

    with open(os.path.join(self.warc_temp_dir, file_name + ".warc.gz"),
              "wb") as result_warc_file:
        log.info("Writing json-timeline result to path %s",
                 self.warc_temp_dir)
        writer = WARCWriter(result_warc_file, gzip=True)

        def json_date_converter(o):
            """ Converts datetime.datetime items in facebook_scraper result
            to formate suitable for json.dumps"""
            if isinstance(o, datetime.datetime):
                return o.__str__()

        json_payload = json.dumps(bio_dict,
                                  default=json_date_converter,
                                  ensure_ascii=False).encode("utf-8")

        record = writer.create_warc_record(
            "https://m.facebook.com/" + username,
            'metadata',
            payload=BytesIO(json_payload),
            warc_content_type="application/json")
        writer.write_record(record)
        log.info("Writing scraped results to %s", self.warc_temp_dir)
def facebook_user_ads(self, username, nsid, iso2c, access_token):
    """Fetch a page's ads from the Graph API ``ads_archive`` endpoint and archive them.

    The JSON reply is written as a single 'metadata' record into a new
    gzip-compressed WARC file inside ``self.warc_temp_dir``.

    Args:
        username: Facebook page user name; may be falsy when ``nsid`` is given.
        nsid: Page id; looked up with ``self.get_fbid(username)`` when missing.
        iso2c: Value sent as the ``ad_reached_countries`` request parameter.
        access_token: Graph API access token.

    Nothing is requested or written (only a debug message is logged) unless
    a page id, an access token and ``iso2c`` are all available.
    """
    assert username or nsid
    limit_per_page = 500

    if username and not nsid:
        log.debug("No FB userid, retrieving it")
        nsid = self.get_fbid(username)

    if not (nsid and access_token and iso2c):
        # The token itself is deliberately not logged: it is a credential.
        log.debug(
            "Something went wrong. Is some information missing? "
            "Access token present: %s, iso2c is: %s",
            bool(access_token), str(iso2c))
        return

    # start scraping
    request_url = "https://graph.facebook.com/v5.0/ads_archive"
    request_params = {
        "access_token": access_token,
        "limit": limit_per_page,
        "search_page_ids": str(nsid),
        "ad_active_status": "ALL",
        "ad_reached_countries": iso2c,  # todo
        "fields": "page_name, page_id, funding_entity, ad_creation_time, ad_delivery_start_time, ad_delivery_stop_time, ad_creative_body, ad_creative_link_caption, ad_creative_link_description, ad_creative_link_title, ad_snapshot_url, demographic_distribution, region_distribution, impressions, spend, currency"
    }
    api_result = requests.get(request_url, params=request_params)
    log.debug("Ads archive request returned HTTP status %s",
              api_result.status_code)

    # File name pattern: <message id>-<timestamp>-<serial>-<random token>
    random_token = ''.join(
        random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
    serial_no = '00000'
    file_name = "-".join([
        safe_string(self.message["id"]),
        warcprox.timestamp17(),
        serial_no,
        random_token,
    ])

    # The reply is already plain JSON data, so no custom ``default=``
    # serializer is needed when re-encoding it.
    json_payload = json.dumps(api_result.json(),
                              ensure_ascii=False).encode("utf-8")

    # ``username`` may be missing when the caller only supplied ``nsid``;
    # fall back to the page id so building the record URI cannot fail.
    record_uri = "https://m.facebook.com/" + str(username or nsid)

    # write to warc
    warc_path = os.path.join(self.warc_temp_dir, file_name + ".warc.gz")
    with open(warc_path, "wb") as result_warc_file:
        log.info("Writing json-timeline result to path %s",
                 self.warc_temp_dir)
        writer = WARCWriter(result_warc_file, gzip=True)
        record = writer.create_warc_record(
            record_uri,
            'metadata',
            payload=BytesIO(json_payload),
            warc_content_type="application/json")
        writer.write_record(record)
        log.info("Writing scraped results to %s", self.warc_temp_dir)

    time.sleep(1.2)  # sleep to avoid getting blocked by api
def facebook_user_timeline(self, seed_id, username, nsid):
    """Scrape a user's timeline and archive the posts as JSON in a WARC file.

    Args:
        seed_id: Seed identifier, used for logging and attached to warnings.
        username: Facebook user name; may be falsy when ``nsid`` is given.
        nsid: User id; looked up with ``self.get_fbid(username)`` when missing.

    Posts come from ``facebook_scraper.get_posts``. With the ``incremental``
    option the scrape stops at the post id stored in the state store, and the
    id of the first scraped post is stored afterwards. With ``harvest_media``
    every image URL containing 'jpg' is passed to ``self._harvest_media_url``.
    If no user id can be determined, a warning is appended to
    ``self.result.warnings`` and nothing is scraped.
    """
    log.debug("Harvesting user %s with seed_id %s.", username, seed_id)

    # make sure either username or nsid is present to start scraping
    assert username or nsid

    # Possibly look up username
    if username and not nsid:
        log.debug("No FB userid, retrieving it")
        nsid = self.get_fbid(username)

    if not nsid:
        msg = "NSID not found for user {}".format(username)
        # No exception is active here, so log.exception would only attach an
        # empty traceback; log a plain error instead.
        log.error(msg)
        self.result.warnings.append(
            Msg(CODE_UID_NOT_FOUND, msg, seed_id=seed_id))
        return

    # report back whether user id was found
    log.info("FB userid %s", nsid)
    # todo - need to add timeout and what to do if blocked

    options = self.message.get("options", {})
    incremental = options.get("incremental", False)
    harvest_media = options.get("harvest_media", False)
    state_key = u"timeline.{}.since_id".format(nsid)

    since_id = None
    if incremental:
        # search for since_id of post
        since_id = self.state_store.get_state(__name__, state_key)

    scrape_result = []
    for post in facebook_scraper.get_posts(nsid,
                                           pages=self.pages,
                                           extra_info=True,
                                           timeout=20):
        scrape_result.append(post)
        self.result.harvest_counter["posts"] += 1
        self.result.increment_stats("posts")

        # the second condition avoids parsing empty lists (i.e. no media)
        if harvest_media and post['images']:
            log.info("Harvesting media from post")
            # Only URLs containing 'jpg' are fetched; anything else
            # (e.g. video) is not harvested.
            for media_url in post['images']:
                if 'jpg' in media_url:
                    self._harvest_media_url(media_url)

        # Require a stored id: without this guard a post with an empty id
        # would compare equal to a missing since_id (None == None) and end
        # the harvest prematurely.
        if incremental and since_id is not None \
                and post["post_id"] == since_id:
            log.info(
                "Stopping, found last post that was previously harvested with id: %s",
                post["post_id"])
            break

    # filename will later be converted to path
    # replicating pattern from https://github.com/internetarchive/warcprox/blob/f19ead00587633fe7e6ba6e3292456669755daaf/warcprox/writer.py#L69
    # create random token for filename
    random_token = ''.join(
        random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
    serial_no = '00000'
    file_name = "-".join([
        safe_string(self.message["id"]),
        warcprox.timestamp17(),
        serial_no,
        random_token,
    ])

    def json_date_converter(o):
        """Convert datetime.datetime items in the scraper result to a
        format suitable for json.dumps."""
        if isinstance(o, datetime.datetime):
            return o.__str__()

    warc_path = os.path.join(self.warc_temp_dir, file_name + ".warc.gz")
    with open(warc_path, "wb") as result_warc_file:
        log.info("Writing json-timeline result to path %s",
                 self.warc_temp_dir)
        writer = WARCWriter(result_warc_file, gzip=True)
        json_payload = json.dumps(scrape_result,
                                  default=json_date_converter,
                                  ensure_ascii=False).encode("utf-8")
        record = writer.create_warc_record(
            username,
            'metadata',
            payload=BytesIO(json_payload),
            warc_content_type="application/json")
        writer.write_record(record)
        log.info("Writing scraped results to %s", self.warc_temp_dir)

    # write to state store
    if not incremental:
        return
    if not scrape_result:
        # Nothing was scraped, so there is no post id to remember.
        log.info("No posts scraped, state_store left unchanged")
        return
    max_post_id = scrape_result[0].get("post_id")
    if not max_post_id:
        # Post ids are sometimes empty; keep the previously stored id
        # rather than failing the whole harvest.
        log.warning("First scraped post has no id, state_store left unchanged")
        return
    self.state_store.set_state(__name__, state_key, max_post_id)
    log.info("Wrote first scraped post to state_store")
class SiteCrawler(object):
    """Crawl the pages of one domain and store accepted documents in a WARC file.

    The crawler keeps a list of pending URLs, a set of visited URLs and a
    per-URL failure counter. ``crawl_one_page`` processes a single URL and then
    either schedules itself again through the owning multi-site crawler or
    reports that it is done.
    """

    def __init__(self,
                 priority,
                 multi_site_crawler,
                 seed_urls,
                 domain,
                 config,
                 scout=None):
        """Initialise crawl state, open the output files and queue the seeds.

        Args:
            priority: Priority of this crawler in the multi-site queue.
            multi_site_crawler: Owner object that schedules crawlers.
            seed_urls: Iterable of URL objects; only valid ones are queued.
            domain: Domain the seed URLs belong to.
            config: Mapping with the crawl settings read below.
            scout: Optional object deciding whether the site is worth crawling.
        """
        # Multi-site crawler object that manages current crawler
        self.multi_site_crawler = multi_site_crawler
        # Lock guarding the URL lists (pending, visited and attempts)
        self.url_list_concurrency_lock = Lock()
        # Lock guarding the status dump and the output WARC file
        self.file_write_concurrency_lock = Lock()
        # If verbose is True, logging level is INFO; otherwise it is ERROR
        logging.basicConfig(
            level=logging.INFO if config["verbose"] else logging.ERROR)
        # Domain corresponding to the seed URLs to be crawled
        self.domain = domain
        # Accepted TLDs in the crawl
        self.tlds = config["accepted_tlds"]
        # Set of URLs that have been already crawled
        self.visited = set()
        # Number of failed attempts per URL
        self.attempts = {}
        # Links that must not be re-crawled until some time has passed
        self.asleep_links = {}
        # Failed attempts tolerated for a URL before it is discarded
        self.max_attempts = config["max_attempts"]
        self.max_folder_tree_depth = config["max_folder_tree_depth"]
        # Regular expression for accepted content types (e.g. text/html)
        self.accepted_content_type = config["accepted_content"]
        # List of regular expressions to discard URLs
        self.url_blacklist_re = config["url_blacklist"]
        # When interrupt is True, crawling stops
        self.interrupt = False
        self.sleep_thread = None
        # Current size of the crawl (see write_document)
        self.crawl_size = 0.0
        # Priority of this crawler in the MultiSiteCrawler queue
        self.priority = priority
        # Path to the file that stores the crawling state dump
        self.dumpfile = config["output_dir"] + "/" + self.domain + ".state"

        # Pending URLs must exist before a previous state is restored;
        # resetting the list afterwards would throw the restored URLs away.
        self.pending_urls = []
        if config["resume_crawling"]:
            # NOTE: pickle must only be used on state files written by this
            # crawler; unpickling untrusted data can execute arbitrary code.
            with open(self.dumpfile, 'rb') as dump:
                self.load_status(pickle.load(dump))

        # Pick output names that do not clash with existing WARC files
        base_name = config["output_dir"] + "/" + self.domain
        output_file_name = base_name + ".warc.gz"
        metadata_output_file_name = base_name + ".metadata.gz"
        name_counter = 1
        while os.path.isfile(output_file_name):
            numbered = base_name + "." + str(name_counter)
            output_file_name = numbered + ".warc.gz"
            metadata_output_file_name = numbered + ".metadata.gz"
            name_counter += 1
        f_out = open(output_file_name, 'wb')
        self.writer = WARCWriter(f_out, gzip=True)
        self.metadata_writer = gzip.open(metadata_output_file_name, "wb")

        # Scout that decides if the website is promising
        self.scout = scout
        # Only documents in these languages are kept
        self.langs_of_interest = config["langs_of_interest"]
        # User agent of the crawl
        self.user_agent = config["user_agent"]
        # Connection timeout
        self.conn_timeout = config["connection_timeout"]
        # Default crawling delay
        self.default_delay = config["crawl_delay"]
        # Robots parser
        self.robots = SiteRobots(self.user_agent, self.default_delay,
                                 self.conn_timeout)

        # Queue every valid seed URL
        with self.url_list_concurrency_lock:
            for url in seed_urls:
                if url.is_valid():
                    self.add_url_to_list(url)

        # Maximum crawling size for this site (None when not configured)
        self.max_size = (config["max_size_per_site"]
                         if "max_size_per_site" in config else None)
        # Maximum crawling time for this site (None when not configured)
        self.max_time = (config["max_time_per_site"]
                         if "max_time_per_site" in config else None)
        # Starting time of the crawl; used to decide when max_time is reached
        self.starts = int(time.time())
        # Time of the last connection; used to honour the crawl delay
        self.last_connection = self.starts - self.default_delay

    def extend_url_list(self, url_list):
        """Queue every URL of ``url_list`` while holding the URL-list lock."""
        with self.url_list_concurrency_lock:
            for u in url_list:
                self.add_url_to_list(u)

    def add_url_to_list(self, url):
        """Queue ``url`` unless it is invalid, already visited or already pending.

        The caller is expected to hold ``url_list_concurrency_lock``.
        """
        norm_url = url.get_norm_url()
        if not url.is_valid():
            logging.info('"%s" is not a valid URL', norm_url)
            # An invalid URL must not be queued.
            return
        if norm_url in self.visited or url in self.pending_urls:
            logging.info(
                '"%s" already used before (it may be pending of crawling)',
                norm_url)
        else:
            logging.info('"%s" added to pending URLs', norm_url)
            self.pending_urls.append(url)

    def get_pending_url(self):
        """Pop the next URL that may be fetched now and mark it as visited.

        URLs whose ``wait_until`` lies in the future are skipped and put back
        on the pending list. Returns ``None`` when no URL is ready.
        """
        url = None
        with self.url_list_concurrency_lock:
            sleeping_urls = []
            while len(self.pending_urls) > 0 and url is None:
                tmp_url = self.pending_urls.pop()
                if (tmp_url.wait_until is not None
                        and tmp_url.wait_until > time.time()):
                    # Keep the sleeping URL itself so it can be re-queued.
                    sleeping_urls.append(tmp_url)
                else:
                    self.visited.add(tmp_url.get_norm_url())
                    url = tmp_url
            self.pending_urls.extend(sleeping_urls)
        return url

    def _process_link(self, link, url):
        """Queue ``link`` locally or hand it to the multi-site crawler.

        Returns the link when it was accepted, ``None`` when it was discarded
        (invalid, blacklisted, already used as seed, or outside the accepted
        TLDs).
        """
        logging.debug("\t\t" + threading.current_thread().name +
                      "--- going to process " + link.get_norm_url())
        if not link.is_valid():
            return None
        # Filter url using URL blacklist_re
        for f in self.url_blacklist_re:
            if re.search(f, link.get_norm_url()):
                return None

        if self.domain == link.get_domain():
            logging.debug("\t\t" + threading.current_thread().name +
                          "--- adding URL to list " + link.get_norm_url())
            with self.url_list_concurrency_lock:
                self.add_url_to_list(link)
            return link

        if link.get_tld() in self.tlds:
            with self.url_list_concurrency_lock:
                already_used = link.get_norm_url() in self.visited
                if already_used:
                    logging.info(
                        '"%s" already used to extend list of seed URLs',
                        link.get_norm_url())
                else:
                    logging.info('"%s" used to extend list of seed URLs',
                                 link.get_norm_url())
                    self.visited.add(link.get_norm_url())
            if already_used:
                # Same as the original: nothing is returned in this case.
                return None
            # Called after the lock has been released, as before.
            self.multi_site_crawler.extend_seed_urls(link)
            return link

        logging.info('"%s" discarded: not in the same TLD',
                     link.get_norm_url())
        return None

    def _calc_depth(self, url):
        """Return the number of path segments of ``url`` below the root URL.

        NOTE(review): ``self.root_url`` is not assigned anywhere in this
        class; confirm it is set elsewhere before relying on this method.
        """
        return len(
            url.replace('https', 'http').replace(
                self.root_url, '').rstrip('/').split('/')) - 1

    def connect_to_server(self, url):
        """Send a GET request for ``url``.

        Returns:
            ``(connection, response)``. On any failure the connection is
            closed and ``(None, None)`` is returned; HTTP and OS-level errors
            additionally register the URL as failed.
        """
        conn = None
        res = None
        try:
            logging.info('Connecting to: %s', url.get_norm_url())
            self.last_connection = time.time()
            parts = url.get_url_parts()
            if parts.scheme == 'http':
                try:
                    conn = http.client.HTTPConnection(
                        parts.netloc, timeout=self.conn_timeout)
                except Exception:
                    conn = http.client.HTTPSConnection(
                        parts.netloc, timeout=self.conn_timeout)
            else:
                conn = http.client.HTTPSConnection(parts.netloc,
                                                   timeout=self.conn_timeout)
            logging.info('Connection obtained: %s', url.get_norm_url())
            conn.request('GET',
                         quote(parts.path, '?=&%/'),
                         headers={'User-Agent': self.user_agent})
            logging.info('Get request set %s', url.get_norm_url())
            res = conn.getresponse()
            logging.info('Response obtained from: %s', url.get_norm_url())
        except (http.client.HTTPException, OSError,
                ssl.CertificateError) as e:
            # Timeouts, connection resets and (on current Python versions)
            # certificate errors are OSError subclasses, so one handler
            # covers every network failure.
            logging.info("%s while connecting to %s",
                         type(e).__name__, url.get_norm_url())
            if conn is not None:
                conn.close()
            conn = None
            res = None
            self.process_failed_url(url)
        except Exception as ex:
            logging.info(str(ex))
            if conn is not None:
                conn.close()
            conn = None
            res = None

        if conn is None:
            logging.info('Connection is closed')
        else:
            logging.info('Connection is correct')
        return conn, res

    def deal_with_response_status(self, url, response):
        """Return True for a 2XX response; otherwise handle the redirect or error.

        Redirects (301-308) feed the ``Location`` target to ``_process_link``.
        408 is retried, 413 and 428 are retried after a waiting time, every
        other status discards the URL.
        """
        status = response.status
        if 200 <= status <= 226:
            return True

        if 301 <= status <= 308:
            location = response.getheader('location')
            if location is None:
                logging.info('%s: redirect without Location header',
                             url.get_norm_url())
            else:
                rlink = self._process_link(Link(location), url)
                if rlink is not None:
                    logging.info('%s Redirect: %s -> %s',
                                 threading.current_thread().name,
                                 url.get_norm_url(), rlink.get_norm_url())
        elif status == 408:
            self.process_failed_url(url, retry=True)
        elif status == 413 or status == 428:
            waiting_time = response.getheader('Retry-After')
            try:
                delay = int(waiting_time)
            except (TypeError, ValueError):
                # Header missing, or given as an HTTP date instead of a
                # number of seconds: fall back to the default wait.
                delay = 500
            url.wait_until = time.time() + delay
            self.process_failed_url(url, retry=True)
        else:
            self.process_failed_url(url, retry=False)
        return False

    def crawl_one_page(self):
        """Fetch and process one pending URL, then reschedule or finish."""
        self.multi_site_crawler.new_running_crawler()
        url = self.get_pending_url()

        if not self.interrupt and url is not None:
            if not self.robots.fetch(url, self.max_attempts, self.domain):
                logging.info("robots.txt forbids crawling URL: %s",
                             url.get_norm_url())
                # NOTE(review): this returns without rescheduling and without
                # calling new_done_crawler(); confirm that is intended.
                return
            thread_name = threading.current_thread().name
            logging.debug("\t" + thread_name + " >>>> Connecting " +
                          url.get_norm_url() + "...")
            connection, server_response = self.connect_to_server(url)
            logging.debug("\t" + thread_name + "<<<< Connected " +
                          url.get_norm_url())

            # If response is 2XX, the web page is processed
            if server_response is not None and self.deal_with_response_status(
                    url, server_response):
                self._handle_response(url, connection, server_response)
            else:
                logging.debug("\t" + thread_name +
                              "<<<< Connection was none")
                if connection is not None:
                    connection.close()

        if self.max_size is not None and self.crawl_size > self.max_size:
            self.interrupt_crawl()
        elif (self.max_time is not None
              and time.time() - self.starts > self.max_time):
            # The start time is stored in self.starts.
            self.interrupt_crawl()
        elif len(self.pending_urls) == 0:
            self.interrupt = True

        # If the crawler may continue, wait until the delay has passed
        if not self.interrupt:
            self.sleep_thread = Thread(target=self._wait_and_queue)
            self.sleep_thread.daemon = False
            self.sleep_thread.name = self.sleep_thread.name + "_sleep"
            self.sleep_thread.start()
        else:
            self.multi_site_crawler.new_done_crawler()

    def _handle_response(self, url, connection, server_response):
        """Extract the document of a 2XX response, follow its links and store it."""
        thread_name = threading.current_thread().name
        # Check content type
        content_type = server_response.getheader('Content-Type')
        logging.debug("\t" + thread_name + "<<<< Content type: " +
                      str(content_type))
        doc = None
        if content_type is not None and not re.search(
                self.accepted_content_type, content_type):
            logging.info("%s discarded: wrong file type", url.get_norm_url())
        else:
            logging.debug("\t" + thread_name + ">>>> Extracting doc from " +
                          url.get_norm_url())
            doc = WebDocument(server_response, url, self.max_attempts)
            logging.debug("\t" + thread_name + "<<<< Document extracted " +
                          url.get_norm_url())
        connection.close()
        logging.debug("\t" + thread_name + "<<<< Connection closed: " +
                      url.get_norm_url())

        if doc is None:
            logging.debug("\t" + thread_name + "<<<< Document was none: " +
                          url.get_norm_url())
            return
        if not doc.utf_text:
            return

        links_set = doc.get_link_set()
        listoflinks = [li.get_norm_url() for li in links_set]
        logging.debug("\t" + thread_name + "<<<< Processing " +
                      str(len(links_set)) + " links... " +
                      url.get_norm_url() + "... " + " ".join(listoflinks))
        for link in links_set:
            self._process_link(link, doc.url)
        logging.debug("\t" + thread_name + "<<<< Links processed " +
                      url.get_norm_url())

        if doc.get_lang() is None or not doc.get_lang().is_reliable:
            logging.info("%s discarded: language detection is not reliable",
                         url.get_norm_url())
        elif doc.get_lang().language not in self.langs_of_interest:
            logging.info(
                "%s discarded: language not among languages of interest (detected=%s)",
                url.get_norm_url(),
                doc.get_lang().language)
        else:
            logging.debug("\t" + thread_name + ">>>> Running scout " +
                          url.get_norm_url())
            self.run_scout(doc)
            logging.debug("\t" + thread_name + "<<<< Scout run " +
                          url.get_norm_url())
            # The document is written to the warc
            logging.debug("\t" + thread_name + ">>>> Write document " +
                          url.get_norm_url())
            self.write_document(doc)
            logging.debug("\t" + thread_name + "<<<< Document saved " +
                          url.get_norm_url())

    def _wait_and_queue(self):
        """Sleep until the crawl delay has elapsed, then hand the crawler back."""
        sleeptime = self.robots.get_delay() - (time.time() -
                                               self.last_connection)
        if sleeptime > 0:
            time.sleep(sleeptime)
        self.multi_site_crawler.crawler_ready(self)
        self.multi_site_crawler.new_done_crawler()

    def run_scout(self, doc):
        """Feed ``doc`` to the scout; drop the scout once it has a recommendation."""
        if self.scout is None:
            return
        self.scout.step(doc)
        if not self.scout.recommendation_ready():
            return
        if not self.scout.recommendation_keep_crawling():
            logging.info(
                "Website discarded after crawling %s due to infringement of scout rule",
                doc.url.get_norm_url())
            self.interrupt = True
        else:
            logging.info(
                "Scout recommends keep crawling website after downloading %s; langs of interest found: %s",
                doc.url.get_norm_url(), str(self.scout.lang_evidence))
        self.scout = None

    def process_failed_url(self, url, retry=True):
        """Record a failed fetch of ``url`` and re-queue it when allowed.

        Args:
            url: URL object whose download failed.
            retry: When False the URL is marked as visited for good. When
                True it is queued again until its failure counter exceeds
                ``self.max_attempts``.
        """
        norm_url = url.get_norm_url()
        if not retry:
            with self.url_list_concurrency_lock:
                self.visited.add(norm_url)
            logging.info('%s: the URL does not exist', norm_url)
            return

        with self.url_list_concurrency_lock:
            failures = self.attempts.get(norm_url)
            if failures is None or failures <= self.max_attempts:
                # The URL has to leave the visited set BEFORE it is queued,
                # otherwise add_url_to_list rejects it as already used.
                self.visited.discard(norm_url)
                self.add_url_to_list(url)
                failures = 1 if failures is None else failures + 1
                self.attempts[norm_url] = failures
                given_up = False
            else:
                del self.attempts[norm_url]
                self.visited.add(norm_url)
                given_up = True

        if given_up:
            logging.info('%s: given up after %s attempts', norm_url,
                         str(self.max_attempts))
        else:
            logging.info('%s: retrying (attempt %s)', norm_url,
                         str(failures))

    def write_document(self, doc):
        """Append ``doc`` to the WARC file and its metadata to the metadata file."""
        with self.file_write_concurrency_lock:
            headers_list = doc.response.getheaders()
            http_headers = StatusAndHeaders('200 OK',
                                            headers_list,
                                            protocol='HTTP/1.0')
            norm_url = doc.url.get_norm_url()
            record = self.writer.create_warc_record(norm_url,
                                                    'response',
                                                    payload=io.BytesIO(
                                                        doc.text),
                                                    http_headers=http_headers)
            self.writer.write_record(record)
            self.crawl_size += sys.getsizeof(doc.text) / 1000000.0
            if self.metadata_writer is not None:
                self.metadata_writer.write(
                    ("%s\t%s\t%s\n" % (doc.url.get_norm_url(), str(
                        doc.encoding), str(doc.get_lang()))).encode())
                self.metadata_writer.flush()

    def get_status_object(self):
        """Return the crawl state (visited, pending and attempts) as a dict."""
        targets = [u.get_norm_url() for u in self.pending_urls]
        return {
            'visited': self.visited,
            'pendingurls': targets,
            'attempts': self.attempts
        }

    def load_status(self, status_obj):
        """Restore the crawl state from a dict built by ``get_status_object``."""
        with self.file_write_concurrency_lock:
            self.visited = status_obj['visited']
            self.pending_urls = [Link(u) for u in status_obj['pendingurls']]
            self.attempts = status_obj['attempts']

    def save_status(self):
        """Pickle the current crawl state into ``self.dumpfile`` (if set)."""
        with self.file_write_concurrency_lock:
            if self.dumpfile is not None:
                with open(self.dumpfile, 'wb') as dump:
                    pickle.dump(self.get_status_object(), dump)

    def interrupt_crawl(self):
        """Stop the crawl, save its state and close the metadata file."""
        with self.url_list_concurrency_lock:
            self.interrupt = True
            self.save_status()
            self.metadata_writer.close()

    def __hash__(self):
        return hash(self.domain)

    def one_thread_less(self):
        # NOTE(review): the name suggests a decrement but the counter is
        # incremented, and ``self.threads`` is not initialised in this
        # class; confirm the intended behaviour with the callers.
        self.threads += 1
class MHTML2WARC:
    """Convert an MHTML snapshot into WARC records.

    Every non-multipart MIME part becomes a 'resource' record; a 'warcinfo'
    record describing the snapshot is written first.
    """

    logger = logging.getLogger(__name__)

    def __init__(self, writer, gzip=True):
        """Wrap ``writer`` in a WARC writer.

        Args:
            writer: An existing ``BaseWARCWriter``, a file name to create, or
                any object with a ``write`` method.
            gzip: Whether a newly created ``WARCWriter`` compresses records.

        Raises:
            Exception: If ``writer`` is none of the supported kinds.
        """
        self.fh = None
        self.writer = None
        self.filename = 'unknown'
        self.is_first = True

        if isinstance(writer, BaseWARCWriter):
            self.writer = writer
        elif isinstance(writer, str):
            self.fh = open(writer, 'wb')
            self.filename = writer
            self.writer = WARCWriter(self.fh, gzip=gzip)
        elif hasattr(writer, 'write'):
            self.writer = WARCWriter(writer, gzip=gzip)
        else:
            raise Exception('writer is in an unknown format')

    def close(self):
        """Close the output file if this object opened it itself."""
        if self.fh is not None:
            self.fh.close()
            self.fh = None

    def parse(self, input_):
        """Read an MHTML document and write all of its parts as WARC records.

        Args:
            input_: Path of the MHTML file, or a binary file-like object.

        Raises:
            Exception: If ``input_`` has an unsupported type or the message
                is not multipart.
        """
        if isinstance(input_, str):
            with open(input_, 'rb') as rfh:
                message = email.message_from_binary_file(
                    rfh, policy=email.policy.strict)
        elif hasattr(input_, 'read'):
            message = email.message_from_binary_file(
                input_, policy=email.policy.strict)
        else:
            raise Exception('input is in an unknown format')

        if not message.is_multipart():
            raise Exception('Invalid MHTML -- not multipart')

        main_url = message.get('Snapshot-Content-Location', '')
        warc_date = self.write_warc_info(message)

        for part in message.walk():
            # Multipart containers carry no payload of their own; their
            # children are visited separately by walk().
            if (part.get_content_type() == 'multipart/related'
                    or part.is_multipart()):
                continue
            self.write_resource(part, main_url, warc_date)

    def write_resource(self, part, main_url, warc_date):
        """Write one MIME part as a 'resource' record.

        Parts with a Content-ID are written under their URL, or under a
        ``cid:`` URL when they have no URL or repeat the main page URL. In
        the former case a redirect record from the ``cid:`` URL is added when
        ``allow_cid_urls`` is set. Parts without a decodable payload are
        skipped.
        """
        content = part.get_payload(decode=True)
        if content is None:
            # get_payload(decode=True) returns None for multipart parts;
            # there is nothing to store and len(None) would fail below.
            return

        content_type = part.get_content_type()
        url = part.get('Content-Location')
        warc_headers = {
            'WARC-Date': warc_date,
            'WARC-Creation-Date': self.writer._make_warc_date(),
        }

        content_id = part.get('Content-ID')
        cid_url = None
        write_redir = False

        if content_id:
            warc_headers['Content-ID'] = content_id
            # strip the surrounding angle brackets of the Content-ID
            cid_url = 'cid:' + content_id[1:-1]

            # only write main page url once under url
            # there may be additional frames for same url
            # only write them under cid
            if url == main_url:
                if self.is_first:
                    self.is_first = False
                else:
                    url = None

            if not url:
                # if cid urls not allowed, skip this resource
                # NOTE(review): allow_cid_urls is not defined in this class;
                # presumably a module-level setting -- confirm.
                if not allow_cid_urls:
                    return
                url = cid_url
            else:
                write_redir = True

        record = self.writer.create_warc_record(
            url,
            'resource',
            payload=BytesIO(content),
            length=len(content),
            warc_content_type=content_type,
            warc_headers_dict=warc_headers)
        self.writer.write_record(record)

        if write_redir and cid_url is not None and allow_cid_urls:
            self.add_cid_redirect(cid_url, url)

    def add_cid_redirect(self, cid_url, url):
        """Write a '302 Redirect' response record from ``cid_url`` to ``url``."""
        msg = b'redirect'
        headers_list = [('Content-Type', 'text/plain'),
                        ('Content-Length', str(len(msg))),
                        ('Location', url)]
        http_headers = StatusAndHeaders('302 Redirect',
                                        headers_list,
                                        protocol='HTTP/1.0')
        record = self.writer.create_warc_record(cid_url,
                                                'response',
                                                length=len(msg),
                                                payload=BytesIO(msg),
                                                http_headers=http_headers)
        self.writer.write_record(record)

    def write_warc_info(self, message):
        """Write the 'warcinfo' record and return the snapshot date.

        Returns:
            The ISO date derived from the message's ``Date`` header, or an
            empty string when that header cannot be parsed.
        """
        creator = message.get('From', '')
        url = message.get('Snapshot-Content-Location', '')
        title = message.get('Subject', url)

        try:
            actual_date = http_date_to_datetime(message['Date'])
            timestamp = datetime_to_timestamp(actual_date)
        except Exception:
            # Missing or unparsable Date header: continue without a date.
            actual_date = ''
            timestamp = ''

        source = 'MHTML Snapshot for: ' + url
        software = 'mhtml2warc ' + str(__version__)

        metadata = {
            'title': source,
            'type': 'recording',
            'pages': [{
                'title': title,
                'url': url,
                'timestamp': timestamp
            }]
        }
        params = OrderedDict([('software', software),
                              ('creator', creator),
                              ('source', source),
                              ('format', 'WARC File Format 1.0'),
                              ('subject', title),
                              ('json-metadata', json.dumps(metadata))])

        record = self.writer.create_warcinfo_record(self.filename, params)

        if actual_date:
            actual_date = datetime_to_iso_date(actual_date)
            # Keep the real creation time, but date the record at snapshot
            # time.
            creation_date = record.rec_headers.get('WARC-Date')
            record.rec_headers.replace_header('WARC-Date', actual_date)
            record.rec_headers.replace_header('WARC-Creation-Date',
                                              creation_date)

        self.writer.write_record(record)
        return actual_date
class CCWARCWriter:
    """Write crawl results (DNS answers, request/response pairs) to WARC files.

    Output files are opened lazily. ``maybe_close`` closes the current file
    once it exceeds ``max_size``, so that the next write starts a new file
    with the next serial number.
    """

    def __init__(self,
                 prefix,
                 max_size,
                 subprefix=None,
                 gzip=True,
                 get_serial=None):
        """Store the settings; no file is opened yet.

        Args:
            prefix: First component of every output file name.
            max_size: File size in bytes above which ``maybe_close`` closes
                the current file.
            subprefix: Optional second file-name component.
            gzip: Whether records are gzip-compressed.
            get_serial: Optional callable ``filename -> serial string`` used
                instead of the internal counter.
        """
        self.writer = None
        self.prefix = prefix
        self.subprefix = subprefix
        self.max_size = max_size
        self.gzip = gzip
        self.hostname = socket.gethostname()
        self.external_get_serial = get_serial
        self.serial = 0

    def __del__(self):
        # getattr keeps this safe for objects whose __init__ never ran.
        if getattr(self, 'writer', None) is not None:
            self.f.close()

    def create_default_info(self,
                            version,
                            warcheader_version,
                            ip,
                            description=None,
                            creator=None,
                            operator=None):
        '''
        Build, remember (as ``self.info``) and return the warcinfo fields.

        creator:  # person, organization, service
        operator:  # person, if creator is an organization
        isPartOf:  # name of the crawl
        '''
        info = OrderedDict()
        info['software'] = ('cocrawler/' + version +
                            ' cocrawler_warcheader_version/' +
                            warcheader_version)
        info['hostname'] = self.hostname
        info['ip'] = ip
        if description:
            info['description'] = description
        if creator:
            info['creator'] = creator
        if operator:
            info['operator'] = operator
        # intentionally does not include subprefix
        info['isPartOf'] = self.prefix
        info['format'] = 'WARC file version 1.0'
        self.info = info
        return info

    def open(self):
        """Open the next output file and write its warcinfo record.

        Uses ``self.info``, which ``create_default_info`` sets.
        """
        filename = self.prefix
        if self.subprefix:
            # don't let yaml leave this as an int
            filename += '-' + str(self.subprefix)
        serial = self.get_serial(filename)
        filename += '-' + serial + '-' + self.hostname + '.warc'
        if self.gzip:
            filename += '.gz'
        self.filename = filename
        self.f = open(filename, 'wb')
        self.writer = WARCWriter(self.f, gzip=self.gzip)
        record = self.writer.create_warcinfo_record(self.filename, self.info)
        self.writer.write_record(record)

    def get_serial(self, filename):
        """Return the serial string for the next file called ``filename``.

        Delegates to the external callable when one was given; otherwise
        returns the zero-padded value of an internal counter.
        """
        if self.external_get_serial is not None:
            return self.external_get_serial(filename)
        current = self.serial
        self.serial += 1
        return '{:06}'.format(current)

    def maybe_close(self):
        '''
        Close the current file once it has grown beyond ``max_size``.

        TODO: always close/reopen if subprefix is not None; to minimize open filehandles?
        '''
        fsize = os.fstat(self.f.fileno()).st_size
        if fsize > self.max_size:
            self.f.close()
            self.writer = None

    def write_dns(self, dns, ttl, url):
        """Write the DNS answers for ``url.hostname`` as a 'resource' record.

        A record is written even if ``dns`` is empty.
        """
        # TODO: we filter the addresses early, should we warc the unfiltered dns repsonse?

        # the response object doesn't contain the query type 'A' or 'AAAA'
        # but it has family=2 AF_INET (ipv4) and flags=4 AI_NUMERICHOST -- that's 'A'
        kind = 'A'  # fixme IPV6

        ttl = int(ttl)
        host = url.hostname

        if self.writer is None:
            self.open()

        payload = timestamp_now() + '\r\n'
        for r in dns:
            try:
                payload += '\t'.join(
                    (host + '.', str(ttl), 'IN', kind, r['host'])) + '\r\n'
            except Exception as e:
                # Best effort: skip the malformed answer. The message needs
                # placeholders, otherwise logging itself reports a
                # formatting error for the extra arguments.
                LOGGER.info(
                    'problem converting dns reply for warcing: host %s, reply %r, error %s',
                    host, r, e)
        payload = payload.encode('utf-8')

        record = self.writer.create_warc_record('dns:' + host,
                                                'resource',
                                                payload=BytesIO(payload),
                                                warc_content_type='text/dns',
                                                length=len(payload))
        self.writer.write_record(record)
        LOGGER.debug('wrote warc dns response record%s for host %s',
                     p(self.prefix), host)
        stats.stats_sum('warc dns' + p(self.prefix), 1)

    def _fake_resp_headers(self, resp_headers, body_len, decompressed=False):
        """Return response headers adjusted to match the stored body.

        A Content-Length header is replaced by ``body_len``. A differing
        original value is kept under an ``X-Crawler-`` prefixed name, as are
        ``Content-Encoding`` (when ``decompressed``) and a chunked
        ``Transfer-Encoding``. Header names and values are compared as bytes.
        """
        prefix = b'X-Crawler-'
        ret = []
        for h, v in resp_headers:
            hl = h.lower()
            renamed = False
            if hl == b'content-length':
                if not (v.isdigit() and int(v) == body_len):
                    ret.append((prefix + h, v))
                # NOTE(review): the value is a str while the other headers
                # are bytes; kept as in the original -- confirm.
                ret.append((b'Content-Length', str(body_len)))
                continue
            if hl == b'content-encoding':
                renamed = decompressed
            elif hl == b'transfer-encoding':
                # aiohttp always undoes chunking
                renamed = v.lower() == b'chunked'
            ret.append((prefix + h, v) if renamed else (h, v))
        return ret

    def write_request_response_pair(self,
                                    url,
                                    ip,
                                    req_headers,
                                    resp_headers,
                                    is_truncated,
                                    payload,
                                    digest=None,
                                    decompressed=False):
        """Write a request record and a response record for ``url``.

        Args:
            url: Target URI of the response record.
            ip: Optional value for the WARC-IP-Address header.
            req_headers: HTTP request headers.
            resp_headers: HTTP response headers as ``(name, value)`` pairs.
            is_truncated: Falsy, or the truncation reason; values not in
                ``valid_truncations`` are stored as 'unspecified'.
            payload: Response body bytes.
            digest: Optional value for the WARC-Payload-Digest header.
            decompressed: Whether ``payload`` was decompressed.
        """
        if self.writer is None:
            self.open()

        req_http_headers = StatusAndHeaders('GET / HTTP/1.1', req_headers)
        request = self.writer.create_warc_record(
            'http://example.com/', 'request', http_headers=req_http_headers)

        fake_resp_headers = self._fake_resp_headers(resp_headers,
                                                    len(payload),
                                                    decompressed=decompressed)
        resp_http_headers = StatusAndHeaders('200 OK',
                                             fake_resp_headers,
                                             protocol='HTTP/1.1')

        warc_headers_dict = OrderedDict()
        if ip is not None:
            # ip should be here unless we crawl through a proxy
            warc_headers_dict['WARC-IP-Address'] = ip
        if digest is not None:
            warc_headers_dict['WARC-Payload-Digest'] = digest
        if is_truncated:
            if is_truncated in valid_truncations:
                warc_headers_dict['WARC-Truncated'] = is_truncated
            else:
                LOGGER.error('Invalid is_truncation of ' + is_truncated)
                warc_headers_dict['WARC-Truncated'] = 'unspecified'

        response = self.writer.create_warc_record(
            url,
            'response',
            payload=BytesIO(payload),
            length=len(payload),
            warc_headers_dict=warc_headers_dict,
            http_headers=resp_http_headers)

        self.writer.write_request_response_pair(request, response)
        self.maybe_close()
        LOGGER.debug('wrote warc request-response pair%s for url %s',
                     p(self.prefix), url)
        stats.stats_sum('warc r/r' + p(self.prefix), 1)
else: record_type = 'response' http_headers = record.http_headers # Transfer-Encoding: chunked header causes error with giawarc http_headers.remove_header("Transfer-Encoding") try: http_headers.to_ascii_bytes() except UnicodeEncodeError: # if header is non ascii, create a new header, with status code only # content length and content type will be filled before writing http_headers = StatusAndHeaders(record.http_headers.get_statuscode(), []) # Extract payloads (XML) from non-HTML document formats if url[-4:] == ".pdf" or ((record.http_headers is not None and record.http_headers.get_header('Content-Type') is not None) and "application/pdf" in record.http_headers.get_header('Content-Type')): if options.pdfpass: new_record = po.create_warc_record(uri=url, record_type=record_type, warc_content_type=record.content_type, payload=BytesIO(payload), http_headers=http_headers) po.write_record(new_record) continue ### do not process further! if options.pdfextract: payloads = pdfextract(payload, extractor) else: payloads = pdf2html(payload) elif url[-4:] == ".odt" or url[-4:] == ".ods" or url[-4:] == ".odp": payloads = openoffice2html(payload) elif url[-5:] == ".docx" or url[-5:] == ".pptx" or url[-5:] == ".xlsx": payloads = office2html(payload) elif url[-5:] == ".epub": payloads = epub2html(payload) else: payloads = [payload]
# Download http://example.com/ without content encoding and store the raw
# HTTP response as one gzip-compressed WARC 'response' record.
with open('example.warc.gz', 'wb') as output:
    resp = requests.get(
        'http://example.com/',
        headers={'Accept-Encoding': 'identity'},
        stream=True,
    )

    # get raw headers from urllib3
    headers_list = resp.raw.headers.items()
    http_headers = StatusAndHeaders(
        '200 OK',
        headers_list,
        protocol='HTTP/1.0',
    )

    print(resp.raw)

    writer = WARCWriter(output, gzip=True)
    record = writer.create_warc_record(
        'http://example.com/',
        'response',
        payload=resp.raw,
        http_headers=http_headers,
    )
    writer.write_record(record)

# Collect the posts of one page of results, printing the first 40
# characters of each post's text along the way.
all_posts = []
for post in facebook_scraper.get_posts(
        442978589179108,
        extra_info=True,
        pages=1,
        timeout=20,
):
    preview = post['text'][:40]
    print(preview)
    all_posts.append(post)
class WarcHandler(EventHandler):
    """
    Event handler that serializes incoming items into WARC records.

    Items passed to push() are dispatched by type (see `route`) to the
    matching _write* method. Log lines are buffered in memory and written
    out as a 'resource' record once the buffer exceeds maxLogSize or when
    the handler is used as a context manager and the context exits.
    """

    __slots__ = ('logger', 'writer', 'documentRecords', 'log',
                 'maxLogSize', 'logEncoding', 'warcinfoRecordId')

    def __init__(self, fd, logger):
        """
        :param fd: Binary file object the gzip-compressed WARC is written to
        :param logger: Logger used for diagnostics; must provide bind() and
            error()
        """
        self.logger = logger
        self.writer = WARCWriter(fd, gzip=True)

        self.logEncoding = 'utf-8'
        self.log = BytesIO()
        # max log buffer size (bytes)
        self.maxLogSize = 500 * 1024

        # maps document urls to WARC record ids, required for DomSnapshotEvent
        # and ScreenshotEvent
        self.documentRecords = {}
        # record id of warcinfo record
        self.warcinfoRecordId = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # make sure buffered log lines end up in the WARC
        self._flushLogEntries()

    def writeRecord(self, url, kind, payload, warc_headers_dict=None,
                    http_headers=None):
        """
        Thin wrapper around writer.create_warc_record and writer.write_record.

        Adds default WARC headers.

        :param url: Target URI of the record, converted with str()
        :param kind: WARC record type, e.g. 'request' or 'response'
        :param payload: File-like object holding the record body or None
        :param warc_headers_dict: Additional WARC headers; they override the
            defaults added here. May be omitted.
        :param http_headers: Optional HTTP status line and headers
        :return: The record that was written
        """
        headers = {}
        if self.warcinfoRecordId:
            headers['WARC-Warcinfo-ID'] = self.warcinfoRecordId
        # The parameter defaults to None and dict.update(None) raises
        # TypeError, so only merge when the caller supplied headers.
        if warc_headers_dict:
            headers.update(warc_headers_dict)

        record = self.writer.create_warc_record(
            str(url), kind,
            payload=payload,
            warc_headers_dict=headers,
            http_headers=http_headers)
        self.writer.write_record(record)

        return record

    def _writeRequest(self, item):
        """
        Write the request of `item` as a 'request' record.

        :return: WARC-Record-ID of the record written
        """
        logger = self.logger.bind(reqId=item.id)

        req = item.request
        url = item.url

        # the request line carries the relative URL without its fragment
        path = url.relative().with_fragment(None)
        httpHeaders = StatusAndHeaders(f'{req.method} {path} HTTP/1.1',
                                       req.headers, protocol='HTTP/1.1',
                                       is_http_request=True)
        warcHeaders = {
            'X-Chrome-Initiator': json.dumps(req.initiator),
            'X-Chrome-Request-ID': item.id,
            'WARC-Date': datetime_to_iso_date(req.timestamp),
        }

        body = item.request.body
        if item.request.hasPostData and body is None:
            # oops, don't know what went wrong here
            logger.error('requestBody missing',
                         uuid='ee9adc58-e723-4595-9feb-312a67ead6a0')
            warcHeaders['WARC-Truncated'] = 'unspecified'
        else:
            warcHeaders['X-Chrome-Base64Body'] = str(type(body) is Base64Body)
            body = BytesIO(body)
        record = self.writeRecord(url, 'request', payload=body,
                                  http_headers=httpHeaders,
                                  warc_headers_dict=warcHeaders)
        return record.rec_headers['WARC-Record-ID']

    def _writeResponse(self, item, concurrentTo):
        """
        Write the response of `item` as a 'response' record.

        :param concurrentTo: WARC-Record-ID of the matching request record
        """
        resp = item.response
        warcHeaders = {
            'WARC-Concurrent-To': concurrentTo,
            'X-Chrome-Request-ID': item.id,
            'WARC-Date': datetime_to_iso_date(resp.timestamp),
        }
        # conditional WARC headers
        if item.remoteIpAddress:
            warcHeaders['WARC-IP-Address'] = item.remoteIpAddress
        if item.protocol:
            warcHeaders['X-Chrome-Protocol'] = item.protocol

        # HTTP headers; fall back to the standard reason phrase if the
        # response did not carry a status text
        statusText = resp.statusText or \
            BaseHTTPRequestHandler.responses.get(
                resp.status, ('No status text available', ))[0]
        httpHeaders = StatusAndHeaders(f'{resp.status} {statusText}',
                                       resp.headers, protocol='HTTP/1.1')

        # Content is saved decompressed and decoded, remove these headers
        blacklistedHeaders = {'transfer-encoding', 'content-encoding'}
        for h in blacklistedHeaders:
            httpHeaders.remove_header(h)

        # chrome sends nothing but utf8 encoded text. Fortunately HTTP
        # headers take precedence over the document's <meta>, thus we can
        # easily override those.
        contentType = resp.mimeType
        if contentType:
            if isinstance(resp.body, UnicodeBody):
                contentType += '; charset=utf-8'
            httpHeaders.replace_header('Content-Type', contentType)

        # response body
        body = resp.body
        if body is None:
            warcHeaders['WARC-Truncated'] = 'unspecified'
        else:
            httpHeaders.replace_header('Content-Length', str(len(body)))
            warcHeaders['X-Chrome-Base64Body'] = str(type(body) is Base64Body)
            body = BytesIO(body)

        record = self.writeRecord(item.url, 'response',
                                  warc_headers_dict=warcHeaders, payload=body,
                                  http_headers=httpHeaders)

        # remember document records, so snapshots and screenshots can
        # refer to them
        if item.resourceType == 'Document':
            self.documentRecords[item.url] = record.rec_headers.get_header(
                'WARC-Record-ID')

    def _writeScript(self, item):
        """ Write the string form of a script item as a 'metadata' record """
        encoding = 'utf-8'
        path = item.path or '-'
        self.writeRecord(packageUrl(f'script/{path}'), 'metadata',
                         payload=BytesIO(str(item).encode(encoding)),
                         warc_headers_dict={
                             'Content-Type':
                                 f'application/javascript; charset={encoding}'
                         })

    def _writeItem(self, item):
        """ Write the request and, if present, the response of `item` """
        assert item.request
        concurrentTo = self._writeRequest(item)
        # items that failed loading don't have a response
        if item.response:
            self._writeResponse(item, concurrentTo)

    def _addRefersTo(self, headers, url):
        """
        Add WARC-Refers-To pointing at the document record stored for `url`.

        Logs an error and leaves `headers` untouched if no record is known.

        :return: The `headers` dict passed in
        """
        refersTo = self.documentRecords.get(url)
        if refersTo:
            headers['WARC-Refers-To'] = refersTo
        else:
            self.logger.error(f'No document record found for {url}')
        return headers

    def _writeDomSnapshot(self, item):
        """ Write a DOM snapshot as a 'conversion' record """
        warcHeaders = {
            'X-DOM-Snapshot': str(True),
            'X-Chrome-Viewport': item.viewport,
            'Content-Type': 'text/html; charset=utf-8',
        }

        self._addRefersTo(warcHeaders, item.url)

        self.writeRecord(item.url, 'conversion',
                         payload=BytesIO(item.document),
                         warc_headers_dict=warcHeaders)

    def _writeScreenshot(self, item):
        """ Write a screenshot as a 'conversion' record """
        warcHeaders = {
            'Content-Type': 'image/png',
            'X-Crocoite-Screenshot-Y-Offset': str(item.yoff)
        }
        self._addRefersTo(warcHeaders, item.url)
        self.writeRecord(item.url, 'conversion',
                         payload=BytesIO(item.data),
                         warc_headers_dict=warcHeaders)

    def _writeControllerStart(self, item):
        """
        Write the 'warcinfo' record and remember its id.

        Records written afterwards get that id as their WARC-Warcinfo-ID
        (see writeRecord).
        """
        payload = BytesIO(
            json.dumps(item.payload, indent=2,
                       cls=StrJsonEncoder).encode('utf-8'))

        warcinfo = self.writeRecord(
            packageUrl('warcinfo'), 'warcinfo',
            warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'},
            payload=payload)
        self.warcinfoRecordId = warcinfo.rec_headers['WARC-Record-ID']

    def _flushLogEntries(self):
        """ Write buffered log lines as a 'resource' record, if any """
        if self.log.tell() > 0:
            # rewind, so the writer reads the buffer from the start
            self.log.seek(0)
            # XXX: we should use the type continuation here
            self.writeRecord(packageUrl('log'), 'resource', payload=self.log,
                             warc_headers_dict={
                                 'Content-Type':
                                     f'text/plain; encoding={self.logEncoding}'
                             })
            self.log = BytesIO()

    def _writeLog(self, item):
        """ Handle log entries, called by .logger.WarcHandlerConsumer only """
        self.log.write(item.encode(self.logEncoding))
        self.log.write(b'\n')
        if self.log.tell() > self.maxLogSize:
            self._flushLogEntries()

    # maps item types to the (unbound) method that writes them
    route = {
        Script: _writeScript,
        RequestResponsePair: _writeItem,
        DomSnapshotEvent: _writeDomSnapshot,
        ScreenshotEvent: _writeScreenshot,
        ControllerStart: _writeControllerStart,
    }

    async def push(self, item):
        """
        Dispatch `item` to the first handler in `route` whose type matches.

        Items of unknown type are ignored.
        """
        for k, v in self.route.items():
            if isinstance(item, k):
                v(self, item)
                break
rb'.*<!-- Mirrored from ', b'', re.sub(rb' by HTTrack Website Copier.*', b'', line)) date = re.sub(rb'.+by HTTrack Website.+\[.+\][^,]*, ', b'', re.sub(rb' -->.*', b'', line)) break if date is None: dvalue = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') else: try: dvalue = parse( date.decode("utf8")).strftime('%Y-%m-%dT%H:%M:%SZ') except ValueError: dvalue = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') if url is None: urlStr = "unknown" else: try: urlStr = url.decode("utf8") # sys.stderr.write("HH1 " + urlStr + "\n") except: urlStr = "unknown-encoding" # sys.stderr.write("HH2 " + urlStr + "\n") with open(filepath, 'rb') as content_file: record = writer.create_warc_record( urlStr, 'resource', warc_content_type="application/http; msgtype=response", payload=content_file) writer.write_record(record)