def test_customize_index_list():
    tests = [
        # gets the whole list because 201704 is before the first 2017 index
        [{'to': '201804'}, list(reversed(my_cc_endpoints))],
        [{'from_ts': '201801', 'to': '201804'}, my_cc_endpoints[4:0:-1]],
        [{'from_ts': '20180214', 'to': '201804'}, my_cc_endpoints[4:1:-1]],
        [{'from_ts': '20180429', 'to': '20180430'}, my_cc_endpoints[4:5]],
        # perhaps this next one should raise...
        [{'from_ts': '20180430', 'to': '20180429'}, my_cc_endpoints[4:5]],
    ]

    with mock.patch('cdx_toolkit.get_cc_endpoints', return_value=my_cc_endpoints):
        cdx = cdx_toolkit.CDXFetcher(source='cc')
        cdxa = cdx_toolkit.CDXFetcher(source='cc', cc_sort='ascending')

        for params, custom_list in tests:
            cdx_toolkit.apply_cc_defaults(params)
            assert cdx.customize_index_list(params) == custom_list
            assert cdxa.customize_index_list(params) == list(reversed(custom_list))
def test_args():
    with pytest.raises(ValueError):
        cdx = cdx_toolkit.CDXFetcher(wb='foo', warc_download_prefix='foo')
    with pytest.raises(ValueError):
        cdx = cdx_toolkit.CDXFetcher(source='asdf')
    with pytest.raises(ValueError):
        cdx = cdx_toolkit.CDXFetcher(source='cc', wb='foo')
def test_capture_object():
    cdx_cc = cdx_toolkit.CDXFetcher(source='cc')
    cdx_ia = cdx_toolkit.CDXFetcher(source='ia')
    cdx_only = cdx_toolkit.CDXFetcher(source='https://web.archive.org/cdx/search/cdx',
                                      loglevel='DEBUG')
    url = 'example.com'
    kwargs = {'limit': 1}

    got_one = False
    for obj in cdx_only.iter(url, **kwargs):
        got_one = True
        with pytest.raises(ValueError):
            _ = obj.content
    assert got_one, 'found a capture cdx_only'

    for cdx in (cdx_cc, cdx_ia):
        got_one = False
        for obj in cdx.iter(url, **kwargs):
            got_one = True

            content = obj.content
            assert isinstance(content, six.binary_type)
            if len(content) == 0:
                # if the first capture happens to be a revisit, the content length will be zero
                pass
            else:
                assert len(content) > 100, str(obj)
            content2 = obj.content
            assert content == content2

            r = obj.fetch_warc_record()
            r2 = obj.fetch_warc_record()
            assert r == r2

            stream = obj.content_stream
            # we read the stream above, so it's at eof
            more_content = stream.read()
            assert len(more_content) == 0

            text = obj.text
            assert isinstance(text, six.string_types)
            text2 = obj.text
            assert text == text2

            # some duck-type dict tests on obj
            obj['foo'] = 'asdf'
            assert obj['foo'] == 'asdf'
            assert 'foo' in obj
            del obj['foo']
        assert got_one
def setup(cmd):
    kwargs = {}
    kwargs['source'] = cmd.cc or cmd.ia or cmd.source or None
    if kwargs['source'] is None:
        raise ValueError('must specify --cc, --ia, or a --source')
    if cmd.wb:
        kwargs['wb'] = cmd.wb
    if cmd.cc_mirror:
        kwargs['cc_mirror'] = cmd.cc_mirror

    cdx = cdx_toolkit.CDXFetcher(**kwargs)

    kwargs = {}
    if cmd.limit:
        kwargs['limit'] = cmd.limit
    if 'from' in vars(cmd) and vars(cmd)['from']:  # python, uh, from is a reserved word
        kwargs['from_ts'] = vars(cmd)['from']
    if cmd.to:
        kwargs['to'] = cmd.to
    if cmd.closest:
        if not cmd.get:  # pragma: no cover
            LOGGER.info('note: --closest works best with --get')
        kwargs['closest'] = cmd.closest
    if cmd.filter:
        kwargs['filter'] = cmd.filter
    if cmd.cmd == 'warc' and cmd.size:
        kwargs['size'] = cmd.size
    if cmd.cmd == 'size' and cmd.details:
        kwargs['details'] = cmd.details

    return cdx, kwargs
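# A minimal sketch of how setup() might be fed, assuming an argparse-style namespace.
# The flag names and the store_const values ('cc', 'ia') below are inferred from the
# attribute accesses above and are assumptions, not the real CLI's definitions.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--cc', action='store_const', const='cc', default=None)
parser.add_argument('--ia', action='store_const', const='ia', default=None)
parser.add_argument('--source')
parser.add_argument('--wb')
parser.add_argument('--cc-mirror', dest='cc_mirror')
parser.add_argument('--limit', type=int)
parser.add_argument('--from', dest='from')  # read back via vars(cmd)['from'] in setup()
parser.add_argument('--to')
parser.add_argument('--closest')
parser.add_argument('--get', action='store_true')
parser.add_argument('--filter')
parser.add_argument('cmd', choices=['iter', 'warc', 'size'])

# e.g. fetch from Common Crawl, capped at 10 results
cmd = parser.parse_args(['--cc', '--limit', '10', 'iter'])
cdx, iter_kwargs = setup(cmd)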
def test_customize_index_list_closest():
    # when I implement the funky sort order, this will become different
    my_cc_endpoints_rev = list(reversed(my_cc_endpoints))
    tests = [
        [{'closest': '201801', 'from_ts': '20171230', 'to': None}, my_cc_endpoints_rev[0:4]],
        [{'closest': '201803', 'from_ts': '20180214', 'to': None}, my_cc_endpoints_rev[0:3]],
        [{'closest': '201801', 'from_ts': '20171230', 'to': '201802'}, my_cc_endpoints_rev[2:4]],
    ]

    with mock.patch('cdx_toolkit.get_cc_endpoints', return_value=my_cc_endpoints):
        cdx = cdx_toolkit.CDXFetcher(source='cc')
        for params, custom_list in tests:
            cdx_toolkit.apply_cc_defaults(params)
            print(params)
            assert cdx.customize_index_list(params) == custom_list
def crawl(self, domain, limit):
    """
    :param domain: domain to enumerate subdomains for
    :param limit: maximum number of CDX captures to fetch
    """
    self.header = self.get_header()
    self.proxy = self.get_proxy(self.source)
    cdx = cdx_toolkit.CDXFetcher()
    url = f'*.{domain}/*'
    size = cdx.get_size_estimate(url)
    print(url, 'CommonCrawl size estimate', size)
    for resp in tqdm(cdx.iter(url, limit=limit), total=limit):
        if resp.data.get('status') not in ['301', '302']:
            subdomains = self.match_subdomains(domain, resp.text)
            self.subdomains.update(subdomains)
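# crawl() relies on a match_subdomains(domain, text) helper that pulls candidate
# subdomains out of a blob of text. This is a hedged sketch of what such a helper
# could look like, not the project's actual implementation.
import re

def match_subdomains(domain, text):
    # for domain='example.com' this matches 'a.example.com', 'a.b.example.com', ...
    pattern = r'(?:[a-zA-Z0-9](?:[a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.){1,}' + re.escape(domain)
    return set(match.lower() for match in re.findall(pattern, text))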
def crawl(self, domain, limit):
    """
    :param domain: domain to enumerate subdomains for
    :param limit: maximum number of CDX captures to fetch
    """
    self.header = self.get_header()
    self.proxy = self.get_proxy(self.source)
    cdx = cdx_toolkit.CDXFetcher(source='ia')
    url = f'*.{domain}/*'
    size = cdx.get_size_estimate(url)
    logger.log('DEBUG', f'{url} ArchiveCrawl size estimate {size}')
    for resp in cdx.iter(url, limit=limit):
        if resp.data.get('status') not in ['301', '302']:
            url = resp.data.get('url')
            subdomains = self.match_subdomains(domain, url + resp.text)
            self.subdomains.update(subdomains)
import extruct
import requests
import pprint
from w3lib.html import get_base_url
from rdflib.plugin import register, Serializer, Parser

register('json-ld', Serializer, 'rdflib_jsonld.serializer', 'JsonLDSerializer')
register('json-ld', Parser, 'rdflib_jsonld.parser', 'JsonLDParser')

import cdx_toolkit

cdx = cdx_toolkit.CDXFetcher(source='cc')
objs = list(cdx.iter('http://www.randstadusa.com/jobs/*',
                     from_ts='202006',
                     to='202103',
                     filter=['status:200']))

with open('test.html', 'wb') as f:
    f.write(objs[0].content)

pp = pprint.PrettyPrinter(indent=2)

r = requests.get(
    'http://www.randstadusa.com/jobs/search/4/824761/java-app-developer-312021em_malvern/'
)
base_url = get_base_url(r.text, r.url)
data = extruct.extract(r.text, base_url, syntaxes=['microdata', 'opengraph', 'rdfa'])

[
    data for data in extruct.extract(objs[0].content)['json-ld']
    if data['@type'] == 'JobPosting'
]
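# A possible follow-up, sketched under the assumption that the capture object exposes the
# decoded body via .text and the original URL via obj['url'] (as the other snippets here do):
# run the same extruct extraction over the archived copy instead of the live page.
archived_obj = objs[0]
archived_data = extruct.extract(archived_obj.text,
                                get_base_url(archived_obj.text, archived_obj['url']),
                                syntaxes=['microdata', 'opengraph', 'rdfa'])
pp.pprint(archived_data)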
def process_cdx_url(connection, url, batch_size=100, source='cc', **kwargs):
    '''
    NOTE: ideally, this function would be wrapped in a transaction;
    but this causes deadlocks when it is run concurrently with other instances of itself
    '''
    cdx = cdx_toolkit.CDXFetcher(source)

    # create a new entry in the source table for this bulk insertion
    name = ('process_cdx_url(url="' + str(url) + '", source="' + str(source) +
            '", **kwargs=' + str(kwargs) + ')')
    log.info("name=" + str(name.replace('"', r'\"')))
    try:
        sql = sqlalchemy.sql.text('''
            INSERT INTO source (name) VALUES (:name) RETURNING id;
        ''')
        res = connection.execute(sql, {'name': name})
        id_source = res.first()['id']
        log.info('id_source=' + str(id_source))

    # if an entry already exists in source,
    # then this bulk insertion has already happened (although may not be complete),
    # and so we skip this insertion
    except sqlalchemy.exc.IntegrityError:
        logging.warning('skipping name=' + name)
        return

    # ensure that we search all records, and not just records from the last year
    if 'from_ts' not in kwargs:
        kwargs['from_ts'] = '19000101000000'

    # the cc archive supports filtering by status code, but the ia archive does not;
    # since we only care about status=200, add this filter if possible
    if 'filter' not in kwargs and source == 'cc':
        kwargs['filter'] = 'status:200'

    # estimate the total number of matching urls
    estimated_urls = cdx.get_size_estimate(url, kwargs)
    log.info("estimated_urls=" + str(estimated_urls))

    # loop through each matching url and add it to the batch
    batch = []
    for i, result in enumerate(cdx.iter(url, **kwargs)):

        # process only urls with 200 status code (i.e. successful)
        if result['status'] == '200':
            log.info('fetching result; progress=' + str(i) + '/' + str(estimated_urls) +
                     '={:10.4f}'.format(i / estimated_urls) + ' url=' + result['url'])

            # FIXME: extract a warc record from the result variable
            record = result.fetch_warc_record()

            # FIXME: extract the information from the warc record
            url = result['url']
            accessed_at = datetime.strptime(result['timestamp'], '%Y%m%d%H%M%S')
            html = result.content
            log.debug("url=" + url)

            # FIXME: extract the metainfo using the metahtml library
            try:
                meta = metahtml.parse(html, url)
                try:
                    pspacy_title = pspacy.lemmatize(
                        meta['language']['best']['value'],
                        meta['title']['best']['value'])
                    pspacy_content = pspacy.lemmatize(
                        meta['language']['best']['value'],
                        meta['title']['best']['value'])
                except TypeError:
                    pspacy_title = None
                    pspacy_content = None

            # if there was an error in metahtml, log it
            except Exception as e:
                logging.warning('url=' + url + ' exception=' + str(e))
                meta = {
                    'exception': {
                        'str(e)': str(e),
                        'type': type(e).__name__,
                        'location': 'metahtml',
                        'traceback': traceback.format_exc()
                    }
                }
                pspacy_title = None
                pspacy_content = None

            #meta = metahtml.parse(html, url)
            #pspacy_title = None
            #pspacy_content = None

            # append to the batch
            batch.append({
                'accessed_at': accessed_at,
                'id_source': id_source,
                'url': url,
                'jsonb': json.dumps(meta, default=str),
                'pspacy_title': pspacy_title,
                'pspacy_content': pspacy_content
            })
            if len(batch) >= batch_size:
                bulk_insert(connection, batch)
                batch = []

    # finished loading urls,
    # so insert the last batch and update the source table
    if len(batch) > 0:
        bulk_insert(connection, batch)
        batch = []
    sql = sqlalchemy.sql.text('''
        UPDATE source SET finished_at=now() where id=:id;
    ''')
    res = connection.execute(sql, {'id': id_source})
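# Hedged usage sketch, not part of the original module: it assumes a PostgreSQL database
# that already contains the `source` table referenced above, plus the module-level helpers
# this function depends on (log, bulk_insert, metahtml, pspacy). The DSN is a placeholder.
import sqlalchemy

engine = sqlalchemy.create_engine('postgresql:///metahtml')
with engine.connect() as connection:
    process_cdx_url(connection, 'example.com/*', batch_size=100, source='cc', to='20200101')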
#!/usr/bin/env python

import sys

import cdx_toolkit

source = sys.argv[1]
url = sys.argv[2]

cdx = cdx_toolkit.CDXFetcher(source=source)

for obj in cdx.items(url, limit=1):
    print(obj)
    pass

if source == 'cc':
    content_bytes = cdx_toolkit.fetch_warc_content(obj)
else:
    content_bytes = cdx_toolkit.fetch_wb_content(obj)

if len(content_bytes) > 100 and b'html' in content_bytes.lower():
    print('OK')
    exit(0)
else:
    print('FAIL')
    exit(1)
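# Possible invocations (the script name and target URL are placeholders):
#   python <this_script>.py cc 'http://example.com/'
#   python <this_script>.py ia 'http://example.com/'
# The script prints the single capture object, then OK and exit code 0 if the fetched
# bytes look like HTML, or FAIL and exit code 1 otherwise.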