def test_customize_index_list():
    tests = [
        # gets the whole list: with no from_ts, apply_cc_defaults backs up one
        # year from 'to' (i.e. 201704), which is before the first 2017 index
        [{
            'to': '201804'
        }, list(reversed(my_cc_endpoints))],
        [{
            'from_ts': '201801',
            'to': '201804'
        }, my_cc_endpoints[4:0:-1]],
        [{
            'from_ts': '20180214',
            'to': '201804'
        }, my_cc_endpoints[4:1:-1]],
        [{
            'from_ts': '20180429',
            'to': '20180430'
        }, my_cc_endpoints[4:5]],
        # perhaps this next one should raise...
        [{
            'from_ts': '20180430',
            'to': '20180429'
        }, my_cc_endpoints[4:5]],
    ]

    with mock.patch('cdx_toolkit.get_cc_endpoints',
                    return_value=my_cc_endpoints):
        cdx = cdx_toolkit.CDXFetcher(source='cc')
        cdxa = cdx_toolkit.CDXFetcher(source='cc', cc_sort='ascending')

        for params, custom_list in tests:
            cdx_toolkit.apply_cc_defaults(params)
            assert cdx.customize_index_list(params) == custom_list
            assert cdxa.customize_index_list(params) == list(
                reversed(custom_list))
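
This test (and test_customize_index_list_closest further down) slices a module-level my_cc_endpoints fixture that is not included in this excerpt. A minimal sketch of what such a fixture might look like; the crawl labels and list length here are assumptions, not the project's actual test data:

# hypothetical fixture: Common Crawl index endpoints in chronological order
my_cc_endpoints = [
    'http://index.commoncrawl.org/CC-MAIN-2017-43-index',
    'http://index.commoncrawl.org/CC-MAIN-2017-47-index',
    'http://index.commoncrawl.org/CC-MAIN-2017-51-index',
    'http://index.commoncrawl.org/CC-MAIN-2018-05-index',
    'http://index.commoncrawl.org/CC-MAIN-2018-17-index',
]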
Example #2
def test_args():
    with pytest.raises(ValueError):
        cdx = cdx_toolkit.CDXFetcher(wb='foo', warc_download_prefix='foo')
    with pytest.raises(ValueError):
        cdx = cdx_toolkit.CDXFetcher(source='asdf')
    with pytest.raises(ValueError):
        cdx = cdx_toolkit.CDXFetcher(source='cc', wb='foo')
Example #3
def test_capture_object():
    cdx_cc = cdx_toolkit.CDXFetcher(source='cc')
    cdx_ia = cdx_toolkit.CDXFetcher(source='ia')
    cdx_only = cdx_toolkit.CDXFetcher(
        source='https://web.archive.org/cdx/search/cdx', loglevel='DEBUG')

    url = 'example.com'
    kwargs = {'limit': 1}

    got_one = False
    for obj in cdx_only.iter(url, **kwargs):
        got_one = True
        with pytest.raises(ValueError):
            _ = obj.content
    assert got_one, 'expected at least one capture from cdx_only'

    for cdx in (cdx_cc, cdx_ia):
        got_one = False
        for obj in cdx.iter(url, **kwargs):
            got_one = True
            content = obj.content
            assert isinstance(content, six.binary_type)
            if len(content) == 0:
                # if the first capture happens to be a revisit, the content length will be zero
                pass
            else:
                assert len(content) > 100, str(obj)

            content2 = obj.content
            assert content == content2

            r = obj.fetch_warc_record()
            r2 = obj.fetch_warc_record()
            assert r == r2

            stream = obj.content_stream
            # we read the stream above, so it's at eof
            more_content = stream.read()
            assert len(more_content) == 0

            text = obj.text
            assert isinstance(text, six.string_types)
            text2 = obj.text
            assert text == text2

            # some duck-typed dict tests on obj
            obj['foo'] = 'asdf'
            assert obj['foo'] == 'asdf'
            assert 'foo' in obj
            del obj['foo']

        assert got_one
Example #4
def setup(cmd):
    kwargs = {}
    kwargs['source'] = cmd.cc or cmd.ia or cmd.source or None
    if kwargs['source'] is None:
        raise ValueError('must specify --cc, --ia, or a --source')
    if cmd.wb:
        kwargs['wb'] = cmd.wb
    if cmd.cc_mirror:
        kwargs['cc_mirror'] = cmd.cc_mirror

    cdx = cdx_toolkit.CDXFetcher(**kwargs)

    kwargs = {}
    if cmd.limit:
        kwargs['limit'] = cmd.limit
    # 'from' is a reserved word in Python, so go through vars()
    if 'from' in vars(cmd) and vars(cmd)['from']:
        kwargs['from_ts'] = vars(cmd)['from']
    if cmd.to:
        kwargs['to'] = cmd.to
    if cmd.closest:
        if not cmd.get:  # pragma: no cover
            LOGGER.info('note: --closest works best with --get')
        kwargs['closest'] = cmd.closest
    if cmd.filter:
        kwargs['filter'] = cmd.filter

    if cmd.cmd == 'warc' and cmd.size:
        kwargs['size'] = cmd.size

    if cmd.cmd == 'size' and cmd.details:
        kwargs['details'] = cmd.details

    return cdx, kwargs
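
setup() expects an argparse-style namespace built by the cdx_toolkit command-line tool. A rough sketch of driving it directly; the attribute values below are made up and this is not the CLI's actual parser configuration:

import argparse

# hypothetical namespace carrying only the attributes setup() reads
cmd = argparse.Namespace(
    cc=None, ia='ia', source=None, wb=None, cc_mirror=None,
    limit=10, to=None, closest=None, get=False, filter=None,
    cmd='iter', size=None, details=None,
)
setattr(cmd, 'from', '20180101')  # 'from' is a Python keyword, so set it via setattr

cdx, kwargs = setup(cmd)
for obj in cdx.iter('example.com', **kwargs):
    print(obj['url'], obj['timestamp'])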
def test_customize_index_list_closest():
    # when I implement the funky sort order, this will become different
    my_cc_endpoints_rev = list(reversed(my_cc_endpoints))
    tests = [
        [{
            'closest': '201801',
            'from_ts': '20171230',
            'to': None
        }, my_cc_endpoints_rev[0:4]],
        [{
            'closest': '201803',
            'from_ts': '20180214',
            'to': None
        }, my_cc_endpoints_rev[0:3]],
        [{
            'closest': '201801',
            'from_ts': '20171230',
            'to': '201802'
        }, my_cc_endpoints_rev[2:4]],
    ]

    with mock.patch('cdx_toolkit.get_cc_endpoints',
                    return_value=my_cc_endpoints):
        cdx = cdx_toolkit.CDXFetcher(source='cc')

        for params, custom_list in tests:
            cdx_toolkit.apply_cc_defaults(params)
            print(params)
            assert cdx.customize_index_list(params) == custom_list
Example #6
    def crawl(self, domain, limit):
        """

        :param domain:
        :param limit:
        """
        self.header = self.get_header()
        self.proxy = self.get_proxy(self.source)
        cdx = cdx_toolkit.CDXFetcher()
        url = f'*.{domain}/*'
        size = cdx.get_size_estimate(url)
        print(url, 'CommonCrawl size estimate', size)

        for resp in tqdm(cdx.iter(url, limit=limit), total=limit):
            if resp.data.get('status') not in ['301', '302']:
                subdomains = self.match_subdomains(domain, resp.text)
                self.subdomains.update(subdomains)
Example #7
    def crawl(self, domain, limit):
        """

        :param domain:
        :param limit:
        """
        self.header = self.get_header()
        self.proxy = self.get_proxy(self.source)
        cdx = cdx_toolkit.CDXFetcher(source='ia')
        url = f'*.{domain}/*'
        size = cdx.get_size_estimate(url)
        logger.log('DEBUG', f'{url} ArchiveCrawl size estimate {size}')

        for resp in cdx.iter(url, limit=limit):
            if resp.data.get('status') not in ['301', '302']:
                url = resp.data.get('url')
                subdomains = self.match_subdomains(domain, url + resp.text)
                self.subdomains.update(subdomains)
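
Both crawl modules above come from a larger subdomain-enumeration tool and lean on a match_subdomains helper (plus get_header/get_proxy plumbing) that is not shown here. A standalone regex-based sketch of the subdomain-matching step; this is an assumption about its behaviour, not the tool's actual implementation:

import re

def match_subdomains(domain, text):
    # hypothetical helper: find hostnames ending in ".<domain>" within text
    pattern = r'(?:[a-zA-Z0-9](?:[a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.){1,}' + re.escape(domain)
    return set(match.lower() for match in re.findall(pattern, text, flags=re.I))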
import extruct
import requests
import pprint
from w3lib.html import get_base_url
from rdflib.plugin import register, Serializer, Parser
register('json-ld', Serializer, 'rdflib_jsonld.serializer', 'JsonLDSerializer')
register('json-ld', Parser, 'rdflib_jsonld.parser', 'JsonLDParser')
import cdx_toolkit

cdx = cdx_toolkit.CDXFetcher(source='cc')
objs = list(
    cdx.iter('http://www.randstadusa.com/jobs/*',
             from_ts='202006',
             to='202103',
             filter=['status:200']))

with open('test.html', 'wb') as f:
    f.write(objs[0].content)

pp = pprint.PrettyPrinter(indent=2)
r = requests.get(
    'http://www.randstadusa.com/jobs/search/4/824761/java-app-developer-312021em_malvern/'
)
base_url = get_base_url(r.text, r.url)
data = extruct.extract(r.text,
                       base_url,
                       syntaxes=['microdata', 'opengraph', 'rdfa'])
job_postings = [
    item for item in extruct.extract(objs[0].content)['json-ld']
    if item.get('@type') == 'JobPosting'
]
pp.pprint(job_postings)
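
# A hedged follow-up: title, hiringOrganization, and datePosted are standard
# schema.org JobPosting properties, but any of them may be missing from a
# given page, so everything is accessed with .get()
for posting in job_postings:
    org = posting.get('hiringOrganization')
    org_name = org.get('name') if isinstance(org, dict) else org
    print(posting.get('title'), org_name, posting.get('datePosted'))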
def process_cdx_url(connection, url, batch_size=100, source='cc', **kwargs):
    '''
    NOTE:
    ideally, this function would be wrapped in a transaction;
    but this causes deadlocks when it is run concurrently with other instances of itself
    '''
    cdx = cdx_toolkit.CDXFetcher(source=source)

    # create a new entry in the source table for this bulk insertion
    name = 'process_cdx_url(url="' + str(url) + '", source="' + str(
        source) + '", **kwargs=' + str(kwargs) + ')'
    log.info("name=" + str(name.replace('"', r'\"')))
    try:
        sql = sqlalchemy.sql.text('''
        INSERT INTO source (name) VALUES (:name) RETURNING id;
        ''')
        res = connection.execute(sql, {'name': name})
        id_source = res.first()['id']
        log.info('id_source=' + str(id_source))

    # if an entry already exists in source,
    # then this bulk insertion has already happened (although may not be complete),
    # and so we skip this insertion
    except sqlalchemy.exc.IntegrityError:
        logging.warning('skipping name=' + name)
        return

    # ensure that we search all records, and not just records from the last year
    if 'from_ts' not in kwargs:
        kwargs['from_ts'] = '19000101000000'

    # the cc archive supports filtering by status code, but the ia archive does not;
    # since we only care about status=200, add this filter if possible
    if 'filter' not in kwargs and source == 'cc':
        kwargs['filter'] = 'status:200'

    # estimate the total number of matching urls
    estimated_urls = cdx.get_size_estimate(url, **kwargs)
    log.info("estimated_urls=" + str(estimated_urls))

    # loop through each matching url
    # and add it to the batch
    batch = []
    for i, result in enumerate(cdx.iter(url, **kwargs)):

        # process only urls with 200 status code (i.e. successful)
        if result['status'] == '200':
            log.info('fetching result; progress=' + str(i) + '/' +
                     str(estimated_urls) +
                     '={:10.4f}'.format(i / estimated_urls) + ' url=' +
                     result['url'])

            # fetch the warc record for this capture
            record = result.fetch_warc_record()

            # pull the url, timestamp, and html content out of the capture
            url = result['url']
            accessed_at = datetime.strptime(result['timestamp'],
                                            '%Y%m%d%H%M%S')
            html = result.content
            log.debug("url=" + url)

            # extract the metainfo using the metahtml library
            try:
                meta = metahtml.parse(html, url)
                try:
                    pspacy_title = pspacy.lemmatize(
                        meta['language']['best']['value'],
                        meta['title']['best']['value'])
                    pspacy_content = pspacy.lemmatize(
                        meta['language']['best']['value'],
                        meta['content']['best']['value'])
                except TypeError:
                    pspacy_title = None
                    pspacy_content = None

            # if there was an error in metahtml, log it
            except Exception as e:
                logging.warning('url=' + url + ' exception=' + str(e))
                meta = {
                    'exception': {
                        'str(e)': str(e),
                        'type': type(e).__name__,
                        'location': 'metahtml',
                        'traceback': traceback.format_exc()
                    }
                }
                pspacy_title = None
                pspacy_content = None
            #meta = metahtml.parse(html, url)
            #pspacy_title = None
            #pspacy_content = None

            # append to the batch
            batch.append({
                'accessed_at': accessed_at,
                'id_source': id_source,
                'url': url,
                'jsonb': json.dumps(meta, default=str),
                'pspacy_title': pspacy_title,
                'pspacy_content': pspacy_content
            })

        if len(batch) >= batch_size:
            bulk_insert(connection, batch)
            batch = []

    # finished loading urls,
    # so insert the last batch and update the source table
    if len(batch) > 0:
        bulk_insert(connection, batch)
        batch = []
    sql = sqlalchemy.sql.text('''
    UPDATE source SET finished_at=now() where id=:id;
    ''')
    res = connection.execute(sql, {'id': id_source})
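
bulk_insert comes from the surrounding project and is not shown in this snippet. A minimal sketch of what it might look like, assuming a metahtml table whose columns mirror the keys of the batch dicts built above (the table name and layout are assumptions):

import sqlalchemy

def bulk_insert(connection, batch):
    # hypothetical multi-row insert; SQLAlchemy runs executemany when given
    # a list of parameter dicts
    sql = sqlalchemy.sql.text('''
    INSERT INTO metahtml
        (accessed_at, id_source, url, jsonb, pspacy_title, pspacy_content)
    VALUES
        (:accessed_at, :id_source, :url, :jsonb, :pspacy_title, :pspacy_content);
    ''')
    connection.execute(sql, batch)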
Example #10
#!/usr/bin/env python

import sys

import cdx_toolkit

source = sys.argv[1]
url = sys.argv[2]

cdx = cdx_toolkit.CDXFetcher(source=source)

for obj in cdx.items(url, limit=1):
    print(obj)

if source == 'cc':
    content_bytes = cdx_toolkit.fetch_warc_content(obj)
else:
    content_bytes = cdx_toolkit.fetch_wb_content(obj)

if len(content_bytes) > 100 and b'html' in content_bytes.lower():
    print('OK')
    sys.exit(0)
else:
    print('FAIL')
    sys.exit(1)