def process(self, item):
    """Deduplicate response records in an item's gzipped WARC.

    Reads <warc_file_base>.warc.gz and writes
    <warc_file_base>-deduplicated.warc.gz, replacing responses whose
    payload digest was already seen with revisit records.
    """
    digests = {}
    input_filename = '%(item_dir)s/%(warc_file_base)s.warc.gz' % item
    output_filename = '%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz' % item
    with open(input_filename, 'rb') as f_in, \
            open(output_filename, 'wb') as f_out:
        writer = WARCWriter(filebuf=f_out, gzip=True)
        for record in ArchiveIterator(f_in):
            url = record.rec_headers.get_header('WARC-Target-URI')
            if url is not None and url.startswith('<'):
                # Strip RFC-style angle brackets around the URI.
                # Bug fix: guard the match — a URI starting with '<' but
                # lacking a closing '>' made .group() raise AttributeError.
                match = re.search('^<(.+)>$', url)
                if match:
                    url = match.group(1)
                    record.rec_headers.replace_header('WARC-Target-URI', url)
            if record.rec_headers.get_header('WARC-Type') == 'response':
                digest = record.rec_headers.get_header('WARC-Payload-Digest')
                if digest in digests:
                    # Duplicate payload: emit a revisit record instead.
                    writer.write_record(
                        self._record_response_to_revisit(writer, record,
                                                         digests[digest])
                    )
                else:
                    digests[digest] = (
                        record.rec_headers.get_header('WARC-Record-ID'),
                        record.rec_headers.get_header('WARC-Date'),
                        record.rec_headers.get_header('WARC-Target-URI')
                    )
                    writer.write_record(record)
            elif record.rec_headers.get_header('WARC-Type') == 'warcinfo':
                # Point the warcinfo record at the new (deduplicated) file.
                record.rec_headers.replace_header('WARC-Filename', output_filename)
                writer.write_record(record)
            else:
                writer.write_record(record)
def build_writer(self):
    """
    Initialize a new WARC file and write the "warcinfo" header.

    Returns the WARCWriter. In debug mode records go to stdout instead
    of a file. The file object is intentionally left open — it must live
    as long as the returned writer.
    """
    directory = self.settings.get('WARC_FILE_DIRECTORY', '.')
    filename = self.build_filename()
    if self.debug:
        fp = sys.stdout.buffer
    else:
        fp = open(os.path.join(directory, filename), 'wb')
    # Bug fix: the previous message was an f-string with no placeholder
    # ("(unknown)"); include the actual filename.
    logger.debug(f"Generating WARC file {filename}")
    writer = WARCWriter(
        fp,
        gzip=self.settings.getbool('WARC_GZIP', True),
        warc_version=self.settings['WARC_VERSION'],
    )
    headers = {
        'hostname': self.hostname,
        'ip': self.ip_address,
        'http-header-user-agent': self.settings["USER_AGENT"],
        'robots': 'classic' if self.settings["ROBOTSTXT_OBEY"] else 'none',
        'operator': self.settings.get("WARC_OPERATOR"),
        'software': self.settings.get("WARC_SOFTWARE"),
        'isPartOf': self.settings.get("WARC_IS_PART_OF"),
        'description': self.settings.get("WARC_DESCRIPTION"),
        'format': self.settings.get("WARC_FORMAT"),
        'conformsTo': self.settings.get("WARC_CONFORMS_TO"),
    }
    warcinfo_record = writer.create_warcinfo_record(filename, headers)
    writer.write_record(warcinfo_record)
    return writer
def fetch_urls_to_warc(urls, warcfile_path):
    """Fetch urls and write to warc file

    :urls: list of urls to binary files
    :warcfile_path: path to a WARC file.
    """
    with open(warcfile_path, 'wb') as output:
        writer = WARCWriter(output, gzip=True)
        for url in urls:
            print(url)
            resp = requests.get(url,
                                headers={'Accept-Encoding': 'identity'},
                                stream=True)
            headers_list = resp.raw.headers.items()
            # Bug fix: record the real status line instead of a hard-coded
            # '200 OK', which mislabelled redirects and error responses.
            status_line = '{} {}'.format(resp.status_code, resp.reason)
            http_headers = StatusAndHeaders(status_line, headers_list,
                                            protocol='HTTP/1.0')
            record = writer.create_warc_record(url, 'response',
                                               payload=resp.raw,
                                               http_headers=http_headers)
            writer.write_record(record)
def run(url, out_path, time_limit, agent, filetypes, warcfilename, wait):
    """Mirror *url* with wget (optionally under `timeout`), then rewrite
    the raw WARC into a gzipped one, dropping chunked-transfer records
    and repairing non-ASCII HTTP headers.

    :param warcfilename: expected to end in '.warc.gz'; required for the
        rewrite step.
    """
    cmd = ""
    if time_limit:
        cmd += "timeout {} ".format(time_limit)
    waitoption = ""
    if wait is not None:
        waitoption = "--wait " + wait
    agentoption = ""
    if agent is not None:
        agentoption = "--user-agent \"" + agent + "\""
    filetypesoption = ""
    if filetypes is not None:
        filetypesoption = "-A \"" + filetypes + "\""
    warcoption = ""
    warcfilebasename = None
    if warcfilename is not None:
        # Bug fix: the basename was computed before the None check, so a
        # missing warcfilename crashed with AttributeError.
        warcfilebasename = warcfilename[0:warcfilename.find(".warc.gz")]
        warcoption = "--warc-file \"" + warcfilebasename + "\""
        if check_wget_compression("wget --help | grep 'no-warc-compression'"):
            warcoption += " --no-warc-compression"
    cmd += "wget --mirror {WAIT} {FILETYPES} -q -o /dev/null {URL} -P {DOWNLOAD_PATH} {AGENT} {WARC}".format(
        WAIT=waitoption, FILETYPES=filetypesoption, URL=url,
        DOWNLOAD_PATH=out_path, AGENT=agentoption, WARC=warcoption)
    try:
        system_check(cmd)
    except subprocess.CalledProcessError:
        sys.stderr.write(
            "Warning: Some files could not be downloaded with wget\n")
    with open(warcfilebasename + ".warc", 'rb') as f_in:
        with open(warcfilebasename + ".warc.gz", 'wb') as f_out:
            writer = WARCWriter(f_out, gzip=True)
            try:
                for record in ArchiveIterator(f_in):
                    if record.http_headers:
                        if record.http_headers.get_header(
                                'Transfer-Encoding') == "chunked":
                            # Skip records stored with chunked encoding.
                            continue
                        try:
                            record.http_headers.to_ascii_bytes()
                        except UnicodeEncodeError:
                            # if header is non ascii, create a new header, with status code only
                            # content length and content type will be filled before writing
                            record.http_headers = StatusAndHeaders(
                                record.http_headers.get_statuscode(), [])
                            record.length = None
                    writer.write_record(record)
            except Exception:
                # Best effort: a truncated wget WARC must not abort the run
                # (narrowed from a bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit).
                pass
    system_check("rm {WARC}".format(WARC=warcfilebasename + ".warc"))
def download_one(self, path: Path, source: str) -> None:
    """Fetch one source's rows from Common Crawl and write them to *path*."""
    logging.info(f"Fetching {source} for {self.name}")
    rows = self.fetch_source_rows(source)
    with AtomicFileWriter(path) as sink:
        warc_writer = WARCWriter(sink, gzip=True)
        logging.info(f"Downloading {source}")
        records = fetch_all_cc(rows, self.nthread, self.disable_progress)
        for rec in records:
            warc_writer.write_record(rec)
def load_and_write(self, stream, output):
    """Copy every record from *stream* into a new gzipped WARC at *output*."""
    records = ArchiveIterator(stream, no_record_parse=True,
                              arc2warc=True, verify_http=False)
    with open(output, 'wb') as sink:
        warc_out = WARCWriter(filebuf=sink, gzip=True)
        for rec in records:
            warc_out.write_record(rec)
def write_memento(self, murl=None):
    """
    This is function to write memento in WARC format.

    Parameters:
        murl (str): URI-M

    Returns:
        (bool): True on Success and False on Failure
    """
    try:
        if self.lookup_memento(murl):
            return True
        response = Utils.get_murl_info(murl, self.__thandle)
        # Build the nested storage directory in one call instead of five
        # mkdir-if-missing steps (also race-safe via exist_ok).
        mpath = os.path.join(self.__memento_dir,
                             response["handle"].lower(),
                             response["domain"],
                             response["archive"],
                             response["wrep"] + response["lang"])
        os.makedirs(mpath, exist_ok=True)
        try:
            mpath = os.path.join(mpath, str(response["timestamp"]) +
                                 self.__constants.WARC_EXT)
            with open(mpath, "wb") as output:
                writer = WARCWriter(output, gzip=True)
                resp = requests.get(murl,
                                    headers={'Accept-Encoding': 'identity'},
                                    stream=True, timeout=120)
                # get raw headers from urllib3
                headers_list = resp.raw.headers.items()
                # Bug fix: use the real status line; the hard-coded
                # '200 OK' mislabelled redirects and error responses.
                http_headers = StatusAndHeaders(
                    '{} {}'.format(resp.status_code, resp.reason),
                    headers_list, protocol='HTTP/1.1')
                record = writer.create_warc_record(mpath, 'response',
                                                   payload=resp.raw,
                                                   http_headers=http_headers)
                writer.write_record(record)
            return True
        except requests.exceptions.TooManyRedirects:
            sys.stderr.write(murl + "Too Many redirects" + "\n")
        except requests.exceptions.ConnectTimeout:
            sys.stderr.write(murl + "Connection Timeout" + "\n")
        except Exception as e:
            sys.stderr.write("Memento Write Error: " + str(e) + "URL:" + murl + "\n")
    except Exception as e:
        sys.stderr.write("Memento Write Error: " + murl + " " + str(e) + "\n")
    return False
def __init__(self, filename, logger_, program_name='corpusbuilder 1.0', user_agent=None, overwrite_warc=True,
             err_threshold=10, warcinfo_record_data=None, known_bad_urls=None, max_no_of_calls_in_period=2,
             limit_period=1, proxy_url=None, allow_cookies=False):
    """Open (or serial-number) the target WARC file, configure a
    rate-limited download session, and write the initial warcinfo record.

    :param filename: target WARC path (the splitext logic below expects
        a .warc.gz suffix when overwrite_warc is False)
    :param logger_: project logger exposing a .log(level, message) API
    :param warcinfo_record_data: optional (warc_headers, header_dict)
        pair used verbatim for the warcinfo record instead of the defaults
    """
    if known_bad_urls is not None:  # Setup the list of cached bad URLs to prevent trying to download them again
        with open(known_bad_urls, encoding='UTF-8') as fh:
            self.bad_urls = {line.strip() for line in fh}
    else:
        self.bad_urls = set()
    if not overwrite_warc:  # Find out next nonexisting warc filename
        num = 0
        while os.path.exists(filename):
            filename2, ext = os.path.splitext(filename)  # Should be filename.warc.gz
            if ext == '.gz' and filename2.endswith('.warc'):
                filename2, ext2 = os.path.splitext(filename2)  # Should be filename.warc
                ext = ext2 + ext  # Should be .warc.gz
            filename = '{0}-{1:05d}{2}'.format(filename2, num, ext)
            num += 1
    logger_.log('INFO', 'Creating archivefile: {0}'.format(filename))
    # Kept open for the object's lifetime; the WARCWriter below owns it.
    self._output_file = open(filename, 'wb')
    self._logger_ = logger_
    self._req_headers = {'Accept-Encoding': 'identity', 'User-agent': user_agent}
    self._session = Session()  # Setup session for speeding up downloads
    if proxy_url is not None:  # Set socks proxy if provided
        self._session.proxies['http'] = proxy_url
        self._session.proxies['https'] = proxy_url
    self._allow_cookies = allow_cookies
    # Setup rate limiting to prevent hammering the server
    self._requests_get = sleep_and_retry(limits(calls=max_no_of_calls_in_period,
                                                period=limit_period)(self._http_get_w_cookie_handling))
    self._error_count = 0
    # Error threshold which, when reached, aborts downloading to prevent denial of service
    self._error_threshold = err_threshold
    self._writer = WARCWriter(self._output_file, gzip=True)
    if warcinfo_record_data is None:
        # INFO RECORD: some custom information about the warc writer program and its settings
        info_headers = {'software': program_name, 'arguments': ' '.join(sys.argv[1:]),
                        'format': 'WARC File Format 1.0',
                        'conformsTo': 'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'}
        info_record = self._writer.create_warcinfo_record(filename, info_headers)
    else:
        # Must recreate custom headers else they will not be copied
        custom_headers = ''.join('{0}: {1}\r\n'.format(k, v)
                                 for k, v in warcinfo_record_data[1].items()).encode('UTF-8')
        info_record = self._writer.create_warc_record('', 'warcinfo',
                                                      warc_headers=warcinfo_record_data[0],
                                                      payload=BytesIO(custom_headers),
                                                      length=len(custom_headers))
    self._writer.write_record(info_record)
def process(self, item):
    """Deduplicate response records in an item's uncompressed WARC.

    Reads <warc_file_base>.warc and writes a gzipped
    <warc_file_base>-deduplicated.warc.gz, replacing responses whose
    payload digest was already seen with revisit records.
    """
    digests = {}
    input_filename = "%(item_dir)s/%(warc_file_base)s.warc" % item
    output_filename = "%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz" % item
    with open(input_filename, 'rb') as f_in, \
            open(output_filename, 'wb') as f_out:
        writer = WARCWriter(filebuf=f_out, gzip=True)
        for record in ArchiveIterator(f_in):
            url = record.rec_headers.get_header('WARC-Target-URI')
            if url is not None and url.startswith('<'):
                # Strip RFC-style angle brackets around the URI.
                # Bug fix: guard the match — a URI starting with '<' but
                # lacking a closing '>' made .group() raise AttributeError.
                match = re.search('^<(.+)>$', url)
                if match:
                    url = match.group(1)
                    record.rec_headers.replace_header('WARC-Target-URI', url)
            if record.rec_headers.get_header('WARC-Type') == 'response':
                digest = record.rec_headers.get_header('WARC-Payload-Digest')
                if digest in digests:
                    # Duplicate payload: emit a revisit record instead.
                    writer.write_record(
                        self._record_response_to_revisit(writer, record,
                                                         digests[digest])
                    )
                else:
                    digests[digest] = (
                        record.rec_headers.get_header('WARC-Record-ID'),
                        record.rec_headers.get_header('WARC-Date'),
                        record.rec_headers.get_header('WARC-Target-URI')
                    )
                    writer.write_record(record)
            elif record.rec_headers.get_header('WARC-Type') == 'warcinfo':
                # Point the warcinfo record at the new (deduplicated) file.
                record.rec_headers.replace_header('WARC-Filename', output_filename)
                writer.write_record(record)
            else:
                writer.write_record(record)
def main(args):
    """Copy records from args.input to args.output, stripping angle
    brackets from WARC-Target-URI headers."""
    # Bug fix: the output file was opened with a bare open()/close() pair,
    # so it leaked (and its gzip stream stayed unfinalized) on exception.
    with open(args.output, 'wb') as output:
        writer = WARCWriter(output, gzip=True)
        with open(args.input, 'rb') as stream:
            for record in ArchiveIterator(stream):
                if 'WARC-Target-URI' in record.rec_headers:
                    record.rec_headers['WARC-Target-URI'] = record.rec_headers[
                        'WARC-Target-URI'].lstrip('<').rstrip('>')
                writer.write_record(record)
def mergeWarc():
    """ Merge multiple WARC files into a single file, writing revisit
    records for items which occur multiple times """
    parser = argparse.ArgumentParser(
        description='Merge WARCs, reads filenames from stdin.')
    parser.add_argument('--verbose', '-v', action='store_true')
    parser.add_argument('output', type=argparse.FileType('wb'),
                        help='Output WARC')
    args = parser.parse_args()
    loglevel = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=loglevel)
    unique = 0
    revisit = 0
    payloadMap = {}
    writer = WARCWriter(args.output, gzip=True)
    for l in sys.stdin:
        l = l.strip()
        with open(l, 'rb') as fd:
            for record in ArchiveIterator(fd):
                if record.rec_type in {'resource', 'response'}:
                    headers = record.rec_headers
                    rid = headers.get_header('WARC-Record-ID')
                    csum = headers.get_header('WARC-Payload-Digest')
                    uri = headers.get_header('WARC-Target-URI')
                    dup = payloadMap.get(csum, None)
                    if dup is None:
                        payloadMap[csum] = {'uri': uri, 'id': rid,
                                            'date': headers.get_header('WARC-Date')}
                        unique += 1
                    else:
                        logging.debug('Record {} is duplicate of {}'.format(
                            rid, dup['id']))
                        # Bug fix: the revisit record must carry the
                        # *current* record's URI as its own target; the
                        # original record's URI belongs in the
                        # refers_to_uri argument. Previously dup['uri']
                        # was passed for both.
                        record = writer.create_revisit_record(
                            uri, csum, dup['uri'], dup['date'])
                        record.rec_headers.add_header('WARC-Truncated', 'length')
                        record.rec_headers.add_header('WARC-Refers-To', dup['id'])
                        revisit += 1
                else:
                    unique += 1
                writer.write_record(record)
    logging.info('Wrote {} unique records, {} revisits'.format(
        unique, revisit))
def open(self):
    """Open a new serial-numbered WARC file and write its warcinfo record."""
    filename = self.prefix
    if self.subprefix:
        # Bug fix / consistency: cast to str — a YAML-configured subprefix
        # may arrive as an int, and the sibling implementation already
        # applies this cast; without it the concatenation raises TypeError.
        filename += '-' + str(self.subprefix)
    serial = self.get_serial(filename)
    filename += '-' + serial + '-' + self.hostname + '.warc'
    if self.gzip:
        filename += '.gz'
    self.filename = filename
    self.f = open(filename, 'wb')
    self.writer = WARCWriter(self.f, gzip=self.gzip)
    record = self.writer.create_warcinfo_record(self.filename, self.info)
    self.writer.write_record(record)
def process(filename_in, filename_out):
    """Copy records from filename_in to a gzipped filename_out, replacing
    responses already archived by IA with revisit records; prints hit/miss
    counters and elapsed time."""
    started = datetime.now()
    misses = 0
    hits = 0
    with open(filename_in, 'rb') as f_in:
        with open(filename_out, 'wb') as f_out:
            writer = WARCWriter(filebuf=f_out, gzip=True)
            for record in ArchiveIterator(f_in):
                if record.rec_headers.get_header('WARC-Type') != 'response':
                    # Non-response records pass straight through.
                    writer.write_record(record)
                    misses += 1
                    continue
                target = record.rec_headers.get_header('WARC-Target-URI')
                payload_digest = record.rec_headers.get_header('WARC-Payload-Digest')
                ia_record = ia_available(target, payload_digest)
                if ia_record:
                    print('Found duplicate, writing revisit record.')
                    writer.write_record(revisit_record(writer, record, ia_record))
                    hits += 1
                else:
                    writer.write_record(record)
    print(str(hits) + " Hits")
    print(str(misses) + " Misses")
    print("took " + str(datetime.now() - started) + " to execute")
def run(self):
    """Consume response tuples from out_queue and append them to the WARC,
    writing revisit records for payload hashes already seen.

    Queue items are tuples: (headers_list, payload, url, status, reason).
    """
    with open(self.warcfile, 'ab') as output:
        # Fix: create the writer once — the original rebuilt a WARCWriter
        # on every queue item for no benefit.
        writer = WARCWriter(output, gzip=False)
        while True:
            self.lock.acquire()
            data = self.out_queue.get()
            headers_list = data[0]
            http_headers = StatusAndHeaders('{} {}'.format(
                data[3], data[4]), headers_list, protocol='HTTP/1.0')
            record = writer.create_warc_record(data[2], 'response',
                                               payload=data[1],
                                               http_headers=http_headers)
            # Hash the first BLOCK_SIZE bytes of the record for dedup lookup.
            h = hashlib.sha1()
            h.update(record.raw_stream.read(BLOCK_SIZE))
            if self.dedup.lookup(h.hexdigest()):
                # Payload already archived: store a revisit record instead.
                record = writer.create_warc_record(
                    data[2], 'revisit', http_headers=http_headers)
                writer.write_record(record)
                self.out_queue.task_done()
                self.lock.release()
            else:
                self.dedup.save(h.hexdigest(), data[2])
                # Rewind — the dedup hash consumed the stream head.
                record.raw_stream.seek(0)
                writer.write_record(record)
                self.out_queue.task_done()
                self.lock.release()
def __init__(self, fd, logger):
    """Wrap *fd* in a gzipped WARCWriter and set up in-memory log buffering.

    :param fd: writable binary file object the WARC records are written to
    :param logger: logger instance used by this object
    """
    self.logger = logger
    self.writer = WARCWriter(fd, gzip=True)
    # Encoding for the buffered log — presumably applied when the buffer
    # is flushed into the WARC; confirm at the flush site.
    self.logEncoding = 'utf-8'
    self.log = BytesIO()
    # max log buffer size (bytes)
    self.maxLogSize = 500 * 1024
    # maps document urls to WARC record ids, required for DomSnapshotEvent
    # and ScreenshotEvent
    self.documentRecords = {}
    # record id of warcinfo record
    self.warcinfoRecordId = None
def process(self, item):
    """Rewrite the item's WARC, replacing responses already available in
    IA with revisit records."""
    src = '%(item_dir)s/%(warc_file_base)s.warc.gz' % item
    dst = '%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz' % item
    with open(src, 'rb') as file_in:
        with open(dst, 'wb') as file_out:
            writer = WARCWriter(filebuf=file_out, gzip=True)
            for record in ArchiveIterator(file_in):
                if record.rec_headers.get_header('WARC-Type') != 'response':
                    writer.write_record(record)
                    continue
                target = record.rec_headers.get_header('WARC-Target-URI')
                digest = record.rec_headers.get_header('WARC-Payload-Digest')
                ia_record = self.ia_available(target, digest)
                if ia_record:
                    print('Found duplicate, writing revisit record.')
                    writer.write_record(
                        self.revisit_record(writer, record, ia_record))
                else:
                    writer.write_record(record)
def open(self):
    """Open a new serial-numbered WARC file, write its warcinfo record,
    and remember the warcinfo record id."""
    name = self.prefix
    if self.subprefix:
        # don't let yaml leave this as an int
        name += '-' + str(self.subprefix)
    serial = self.get_serial(name)
    name = '{0}-{1}-{2}.warc'.format(name, serial, self.hostname)
    if self.gzip:
        name += '.gz'
    self.filename = name
    self.f = open(name, 'wb')
    self.writer = WARCWriter(self.f, gzip=self.gzip)
    info = self.writer.create_warcinfo_record(self.filename, self.info)
    self.warcinfo_id = info.rec_headers.get_header('WARC-Record-ID')
    self.writer.write_record(info)
def test_init_2(self):
    """Build redir2.warc.gz with a redirect chain, a response, and a
    revisit, then verify init+add produce a CDXJ index."""
    warc_path = os.path.join(self.root_dir, 'redir2.warc.gz')
    with open(warc_path, 'wb') as fh:
        self.writer = WARCWriter(fh, gzip=True)
        self.create_redirect_record('http://www.example.com/path',
                                    'https://www.example.com/path/',
                                    '20191003115920')
        self.create_redirect_record('https://www.example.com/path/',
                                    'https://www2.example.com/path',
                                    '20191003115927', status='302')
        response = self.create_response_record('https://www2.example.com/path',
                                               '20191024125646', 'Some Text')
        self.create_revisit_record('https://www2.example.com/path',
                                   '20191024125648',
                                   'https://www2.example.com/path',
                                   response.rec_headers['WARC-Date'])
    wb_manager(['init', 'redir2'])
    wb_manager(['add', 'redir2', warc_path])
    assert os.path.isfile(os.path.join(self.root_dir, self.COLLS_DIR,
                                       'redir2', 'indexes', 'index.cdxj'))
def run(self):
    """Open the output WARC, write the warcinfo record, then one record
    per input; returns 0 on success, 1 on a setup failure."""
    if self.use_magic:
        if not self.load_magic():
            return 1
    try:
        output = open(self.name, self.mode)
    except FileExistsError as e:
        self.logger.error(e)
        self.logger.error(
            '* Use -a/--append to append to an existing WARC file')
        self.logger.error(
            '* Use -o/--overwrite to overwrite existing WARC file')
        return 1
    with closing(output):
        writer = WARCWriter(output, gzip=self.gzip)
        self.make_warcinfo(writer)
        for info in self.iter_inputs():
            self.make_record(writer, info)
    self.logger.info('Wrote {0} resources to {1}'.format(
        self.count, self.name))
    return 0
def __init__(self, writer, gzip=True):
    """Accept a BaseWARCWriter instance, a filename string, or any
    writable stream; anything else raises."""
    self.fh = None
    self.writer = None
    self.filename = 'unknown'
    self.is_first = True
    if isinstance(writer, BaseWARCWriter):
        self.writer = writer
        return
    if isinstance(writer, str):
        # Filename given: open it ourselves and remember the handle.
        self.fh = open(writer, 'wb')
        self.filename = writer
        self.writer = WARCWriter(self.fh, gzip=gzip)
        return
    if hasattr(writer, 'write'):
        self.writer = WARCWriter(writer, gzip=gzip)
        return
    raise Exception('writer is in an unknown format')
def test_init_1(self):
    """Populate redir.warc.gz with redirect chains, a response, and two
    revisits, then verify init+add build the CDXJ index."""
    warc_path = os.path.join(self.root_dir, 'redir.warc.gz')
    with open(warc_path, 'wb') as fh:
        self.writer = WARCWriter(fh, gzip=True)
        self.create_redirect_record('http://example.com/',
                                    'https://example.com/', '20180626101112')
        self.create_redirect_record('https://example.com/',
                                    'https://www.example.com/', '20180626101112')
        response = self.create_response_record('https://www.example.com/',
                                               '20180626101112', 'Some Text')
        self.create_revisit_record('https://example.com/path', '20190626101112',
                                   'https://example.com/abc',
                                   response.rec_headers['WARC-Date'])
        self.create_revisit_record('https://www.example.com/', '20190626101112',
                                   'https://www.example.com/',
                                   response.rec_headers['WARC-Date'])
    wb_manager(['init', 'redir'])
    wb_manager(['add', 'redir', warc_path])
    assert os.path.isfile(os.path.join(self.root_dir, self.COLLS_DIR,
                                       'redir', 'indexes', 'index.cdxj'))
def run(self):
    """Drive the whole conversion: load optional helpers, open the output
    WARC, write warcinfo, then one record per input plus any derived
    revisit / conversion / transclusion records.

    Returns 0 on success, 1 on a setup failure.
    """
    # Optional subsystems; any loader failure aborts before the output
    # file is touched.
    if self.use_magic == 'magic':
        if not self.load_magic():
            return 1
    if self.use_tika:
        if not self.load_tika():
            return 1
    if self.use_mapfile:
        if not self.load_mapfile():
            return 1
    if self.use_logfile:
        if not self.init_logfile():
            return 1
    try:
        output = warcio.utils.open(self.name, self.mode)
    except OSError as e:
        # ensure only file exists handling; re-raise anything else
        if e.errno != errno.EEXIST:
            raise
        self.logger.error(e)
        self.logger.error(
            '* Use -a/--append to append to an existing WARC file')
        self.logger.error(
            '* Use -o/--overwrite to overwrite existing WARC file')
        return 1
    with closing(output):
        writer = WARCWriter(output, gzip=self.gzip)
        self.make_warcinfo(writer)
        for file_info in self.iter_inputs():
            result = self.make_record(writer, file_info)
            if not result:
                self.logger.debug('Skipping {0}'.format(file_info.url))
                continue
            url, record = result
            # Current file serves as a directory index
            if url.lower().endswith(self.index_files):
                self.make_index_revisit(writer, url, record)
            if self.conversion_serializer:
                self.make_conversions(writer, url, record)
            if self.transclusion_serializer:
                self.make_transclusion_metadata(writer, url, record)
    self.logger.info('Wrote {0} resources to {1}'.format(
        self.count, self.name))
    self.close_logfile()
    return 0
def process(self, item):
    """Run every record of the item's WARC through process_record and
    write the results into the data directory."""
    seen = {}
    total = 0
    src = '%(item_dir)s/%(warc_file_base)s.warc.gz' % item
    dst = '%(data_dir)s/%(warc_file_base)s.warc.gz' % item
    with open(src, 'rb') as file_in:
        with open(dst, 'wb') as file_out:
            writer = WARCWriter(filebuf=file_out, gzip=True)
            for record in ArchiveIterator(file_in):
                total += 1
                writer.write_record(
                    self.process_record(writer, record, dst, seen))
    print('Processed {} payloads, found {} unique payloads.'
          .format(total, len(seen)))
def extract_record(cmd):
    """Write the record types named in cmd.records (optionally filtered by
    cmd.uri) from cmd.inputs to stdout as an uncompressed WARC.

    Unless cmd.print_all is set, stops after the first match per input file.
    """
    wanted_types = cmd.records.split(',')
    writer = WARCWriter(filebuf=sys.stdout.buffer, gzip=False)
    for filename in cmd.inputs:
        with open(filename, 'rb') as fh:
            for record in ArchiveIterator(fh, no_record_parse=True,
                                          arc2warc=True):
                # Bug fix: rec_uri could be referenced while unbound when
                # record.format was neither 'arc' nor 'warc'/'arc2warc'.
                rec_uri = None
                if record.format == 'arc':
                    rec_uri = record.rec_headers.get_header('uri')
                elif record.format in ('warc', 'arc2warc'):
                    rec_uri = record.rec_headers.get_header('WARC-Target-URI')
                # (The redundant `cmd.uri is not None and` clause was
                # dropped; it is implied by the preceding `cmd.uri is None`.)
                if record.rec_type in wanted_types and (
                        cmd.uri is None or rec_uri is None
                        or cmd.uri == rec_uri):
                    writer.write_record(record)
                    if not cmd.print_all:
                        break
def test_redir_init_slash(self):
    """Add responses for a sub-path (with and without a query string)
    to the 'redir' collection."""
    warc_path = os.path.join(self.root_dir, 'redir-slash.warc.gz')
    with open(warc_path, 'wb') as fh:
        self.writer = WARCWriter(fh, gzip=True)
        self.create_response_record('https://www.example.com/sub/path/',
                                    '201806026101112', 'Sub Path Data')
        self.create_response_record('https://www.example.com/sub/path/?foo=bar',
                                    '201806026101112', 'Sub Path Data Q')
    wb_manager(['add', 'redir', warc_path])
def convert_to_warc(website, filename):
    """Fetch *website* and archive the response in <filename>.warc.gz."""
    with open(filename + '.warc.gz', 'wb') as output:
        writer = WARCWriter(output, gzip=True)
        resp = requests.get(website,
                            headers={'Accept-Encoding': 'identity'},
                            stream=True)
        # get raw headers from urllib3
        headers_list = resp.raw.headers.items()
        # Bug fix: record the real status line instead of a hard-coded
        # '200 OK', which mislabelled redirects and error pages.
        http_headers = StatusAndHeaders('{} {}'.format(resp.status_code, resp.reason),
                                        headers_list, protocol='HTTP/1.0')
        record = writer.create_warc_record(website, 'response',
                                           payload=resp.raw,
                                           http_headers=http_headers)
        writer.write_record(record)
def test_input_manifest(self):
    """Mix a raw (stdin) WET stream with a gzipped WET file and check
    the combined phone-number extraction output."""
    wet1 = BytesIO()
    raw_writer = WARCWriter(wet1, gzip=False)
    write_conversion_record(raw_writer, 'https://nophonenumbershere.info',
                            b'THIS-IS-NOT-A-NUMBER')
    write_conversion_record(
        raw_writer, 'https://big.directory/',
        b'The Time: (612) 777-9311\nJenny: (201) 867-5309\n')
    wet2_gz_path = join(self.tmp_dir, 'wet2.warc.wet.gz')
    with open(wet2_gz_path, 'wb') as wet2:
        gz_writer = WARCWriter(wet2, gzip=True)
        write_conversion_record(gz_writer, 'https://jseventplanning.biz/',
                                b'contact us at +1 201 867 5309')
    job_result = run_job(MRPhoneToURL(['-r', self.RUNNER, wet2_gz_path, '-']),
                         raw_input=wet1.getvalue())
    self.assertEqual(job_result, self.EXPECTED_OUTPUT)
def create_temp_warc(cls):
    """Write two dated snapshots of example.com into a temporary .warc.gz
    and return its path (delete=False: caller owns cleanup)."""
    with NamedTemporaryFile(delete=False, suffix='.warc.gz') as fh:
        warc_writer = WARCWriter(fh, gzip=True)
        for timestamp in ('20140101000000', '20170101000000'):
            cls.create_record(warc_writer, 'http://example.com/',
                              'Example Domain', timestamp)
        return fh.name
def __init__(self, reader, writer, gzip=True):
    """Load HAR input from a dict, a filename, or a file-like object,
    and build the WARC writer from a BaseWARCWriter, filename, or
    writable stream."""
    if isinstance(reader, dict):
        # Already-parsed HAR.
        self.har = reader
    elif isinstance(reader, str):
        with codecs.open(reader, encoding='utf-8') as fh:
            self.har = json.loads(fh.read())
    elif hasattr(reader, 'read'):
        self.har = json.loads(reader.read())
    else:
        raise Exception('reader is in an unknown format')
    self.fh = None
    if isinstance(writer, BaseWARCWriter):
        self.writer = writer
    elif isinstance(writer, str):
        # Filename given: open it ourselves and keep the handle.
        self.fh = open(writer, 'wb')
        self.writer = WARCWriter(self.fh, gzip=gzip)
    elif hasattr(writer, 'write'):
        self.writer = WARCWriter(writer, gzip=gzip)
    else:
        raise Exception('writer is in an unknown format')
def run(url, outPath, timeLimit, agent, filetypes, warcfilename, wait):
    """Mirror *url* with wget and repackage the resulting WARC as gzip.

    The rewrite runs whether or not wget exits cleanly, so a partially
    downloaded site still yields a usable .warc.gz.
    """
    def _rewrite_warc(basename):
        # Copy the raw wget WARC into a gzipped WARC record by record.
        # (Extracted: this code was duplicated verbatim in the try and
        # except branches.)
        with open(basename + ".warc", 'rb') as f_in:
            with open(basename + ".warc.gz", 'wb') as f_out:
                writer = WARCWriter(f_out, gzip=True)
                for record in ArchiveIterator(f_in):
                    writer.write_record(record)

    cmd = ""
    if timeLimit:
        cmd += "timeout {} ".format(timeLimit)
    waitoption = ""
    if wait is not None:
        waitoption = "--wait " + wait
    agentoption = ""
    if agent is not None:
        agentoption = "--user-agent \"" + agent + "\""
    filetypesoption = ""
    if filetypes is not None:
        filetypesoption = "-A \"" + filetypes + "\""
    warcoption = ""
    warcfilebasename = None
    if warcfilename is not None:
        # Bug fix: the basename was computed before the None check, so a
        # missing warcfilename crashed with AttributeError.
        warcfilebasename = warcfilename[0:warcfilename.find(".warc.gz")]
        warcoption = "--warc-file \"" + warcfilebasename + "\""
        if check_wget_compression("wget --help | grep 'no-warc-compression'"):
            warcoption += " --no-warc-compression"
    cmd += "wget --mirror {WAIT} {FILETYPES} -q {URL} -P {DOWNLOAD_PATH} {AGENT} {WARC}".format(
        WAIT=waitoption, FILETYPES=filetypesoption, URL=url,
        DOWNLOAD_PATH=outPath, AGENT=agentoption, WARC=warcoption)
    try:
        system_check(cmd)
        _rewrite_warc(warcfilebasename)
    except subprocess.CalledProcessError:
        _rewrite_warc(warcfilebasename)
        sys.stderr.write(
            "Warning: Some files could not be downloaded with wget\n")
    system_check("rm {WARC}".format(WARC=warcfilebasename + ".warc"))
def _fetch_warc(self, action_result, url, out_path):
    """Download *url* and store the response as a gzipped WARC at
    *out_path*; returns out_path."""
    with open(out_path, "wb") as output:
        writer = WARCWriter(output, gzip=True)
        resp = requests.get(url, headers={"Accept-Encoding": "identity"},
                            stream=True)
        # get raw headers from urllib3
        headers_list = resp.raw.headers.items()
        # Bug fix: record the genuine status line; the hard-coded
        # "200 OK" mislabelled non-200 responses.
        http_headers = StatusAndHeaders("{} {}".format(resp.status_code, resp.reason),
                                        headers_list, protocol="HTTP/1.0")
        record = writer.create_warc_record(url, "response",
                                           payload=resp.raw,
                                           http_headers=http_headers)
        writer.write_record(record)
    return out_path
def run(self):
    """Consume response tuples from out_queue and append them to the WARC,
    writing revisit records for payload hashes already seen.

    Queue items are tuples: (headers_list, payload, url, status, reason).
    """
    with open(self.warcfile, 'ab') as output:
        # Fix: create the writer once — the original rebuilt a WARCWriter
        # on every queue item for no benefit.
        writer = WARCWriter(output, gzip=False)
        while True:
            self.lock.acquire()
            data = self.out_queue.get()
            headers_list = data[0]
            http_headers = StatusAndHeaders('{} {}'.format(data[3], data[4]),
                                            headers_list, protocol='HTTP/1.0')
            record = writer.create_warc_record(data[2], 'response',
                                               payload=data[1],
                                               http_headers=http_headers)
            # Hash the first BLOCK_SIZE bytes of the record for dedup lookup.
            h = hashlib.sha1()
            h.update(record.raw_stream.read(BLOCK_SIZE))
            if self.dedup.lookup(h.hexdigest()):
                # Payload already archived: store a revisit record instead.
                record = writer.create_warc_record(data[2], 'revisit',
                                                   http_headers=http_headers)
                writer.write_record(record)
                self.out_queue.task_done()
                self.lock.release()
            else:
                self.dedup.save(h.hexdigest(), data[2])
                # Rewind — the dedup hash consumed the stream head.
                record.raw_stream.seek(0)
                writer.write_record(record)
                self.out_queue.task_done()
                self.lock.release()