import json
import logging
import traceback
from datetime import datetime

import cdx_toolkit
import metahtml
import pspacy
import sqlalchemy
from warcio.archiveiterator import ArchiveIterator

# `bulk_insert` is assumed to be defined elsewhere in this module
log = logging.getLogger(__name__)


def process_cdx_url(connection, url, batch_size=100, source='cc', **kwargs):
    '''
    Fetch every capture of `url` from the CDX index `source`, parse each page
    with metahtml, and bulk insert the results into the database.

    NOTE:
    ideally, this function would be wrapped in a transaction,
    but doing so causes deadlocks when it is run concurrently with other instances of itself
    '''
    cdx = cdx_toolkit.CDXFetcher(source)

    # create a new entry in the source table for this bulk insertion
    name = 'process_cdx_url(url="' + str(url) + '", source="' + str(
        source) + '", **kwargs=' + str(kwargs) + ')'
    log.info("name=" + str(name.replace('"', r'\"')))
    try:
        sql = sqlalchemy.sql.text('''
        INSERT INTO source (name) VALUES (:name) RETURNING id;
        ''')
        res = connection.execute(sql, {'name': name})
        id_source = res.first()['id']
        log.info('id_source=' + str(id_source))

    # if an entry already exists in source,
    # then this bulk insertion has already happened (although may not be complete),
    # and so we skip this insertion
    except sqlalchemy.exc.IntegrityError:
        logging.warning('skipping name=' + name)
        return
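
    # the dedup above relies on a uniqueness constraint on source.name;
    # a minimal sketch of the table this function assumes (hypothetical DDL,
    # not part of this file):
    #
    #   CREATE TABLE source (
    #       id          SERIAL PRIMARY KEY,
    #       name        TEXT UNIQUE NOT NULL,
    #       finished_at TIMESTAMPTZ
    #   );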

    # ensure that we search all records, and not just records from the last year
    if 'from_ts' not in kwargs:
        kwargs['from_ts'] = '19000101000000'

    # the cc archive supports filtering by status code, but the ia archive does not;
    # since we only care about status=200, add this filter if possible
    if 'filter' not in kwargs and source == 'cc':
        kwargs['filter'] = 'status:200'

    # estimate the total number of matching urls
    estimated_urls = cdx.get_size_estimate(url, kwargs)
    log.info("estimated_urls=" + str(estimated_urls))

    # loop through each matching url
    # and add it to the batch
    batch = []
    for i, result in enumerate(cdx.iter(url, **kwargs)):

        # process only urls with 200 status code (i.e. successful)
        if result['status'] == '200':
            log.info('fetching result; progress=' + str(i) + '/' +
                     str(estimated_urls) +
                     '={:10.4f}'.format(i / estimated_urls) + ' url=' +
                     result['url'])

            # fetch the full warc record for this capture
            record = result.fetch_warc_record()

            # extract the url, timestamp, and raw html from the record
            url = result['url']
            accessed_at = datetime.strptime(result['timestamp'],
                                            '%Y%m%d%H%M%S')
            html = record.content_stream().read()
            log.debug("url=" + url)

            # extract the metainfo using the metahtml library,
            # and lemmatize the title and content with pspacy
            try:
                meta = metahtml.parse(html, url)
                try:
                    pspacy_title = pspacy.lemmatize(
                        meta['language']['best']['value'],
                        meta['title']['best']['value'])
                    pspacy_content = pspacy.lemmatize(
                        meta['language']['best']['value'],
                        meta['content']['best']['value'])
                except TypeError:
                    pspacy_title = None
                    pspacy_content = None

            # if there was an error in metahtml, log it
            except Exception as e:
                logging.warning('url=' + url + ' exception=' + str(e))
                meta = {
                    'exception': {
                        'str(e)': str(e),
                        'type': type(e).__name__,
                        'location': 'metahtml',
                        'traceback': traceback.format_exc()
                    }
                }
                pspacy_title = None
                pspacy_content = None

            # append to the batch
            batch.append({
                'accessed_at': accessed_at,
                'id_source': id_source,
                'url': url,
                'jsonb': json.dumps(meta, default=str),
                'pspacy_title': pspacy_title,
                'pspacy_content': pspacy_content
            })

        if len(batch) >= batch_size:
            bulk_insert(connection, batch)
            batch = []

    # finished loading urls,
    # so insert the last batch and update the source table
    if len(batch) > 0:
        bulk_insert(connection, batch)
        batch = []
    sql = sqlalchemy.sql.text('''
    UPDATE source SET finished_at=now() where id=:id;
    ''')
    res = connection.execute(sql, {'id': id_source})
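
# A minimal usage sketch for process_cdx_url (the database URL and url pattern
# are placeholders, not taken from this file):
#
#   engine = sqlalchemy.create_engine('postgresql://user:pass@localhost/db')
#   with engine.connect() as connection:
#       process_cdx_url(connection, 'example.com/*', batch_size=100,
#                       source='cc', from_ts='20200101000000')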


def process_warc_from_disk(connection, warc_path, id_source, batch_size=100):
    '''
    Load the WARC file at `warc_path`, parse each HTTP 200 response record with
    metahtml, and bulk insert the results into the database under `id_source`.
    '''
    with open(warc_path, 'rb') as stream:

        # for efficiency, we will not insert items into the db one at a time;
        # instead, we add them to the batch list,
        # and then bulk insert the batch list when it reaches len(batch)==batch_size
        batch = []

        for record in ArchiveIterator(stream):

            # WARC files contain many types of entries;
            # we only care about responses with an HTTP 200 status code
            if (record.rec_type == 'response'
                    and record.http_headers.get_statuscode() == '200'):

                # extract the information from the warc archive
                url = record.rec_headers.get_header('WARC-Target-URI')
                accessed_at = record.rec_headers.get_header('WARC-Date')
                html = record.content_stream().read()
                logging.debug("url=" + url)

                # extract the meta
                try:
                    meta = metahtml.parse(html, url)
                    try:
                        pspacy_title = pspacy.lemmatize(
                            meta['language']['best']['value'],
                            meta['title']['best']['value'])
                        pspacy_content = pspacy.lemmatize(
                            meta['language']['best']['value'],
                            meta['content']['best']['value'])
                    except TypeError:
                        pspacy_title = None
                        pspacy_content = None

                # if there was an error in metahtml, log it
                except Exception as e:
                    logging.warning('url=' + url + ' exception=' + str(e))
                    meta = {
                        'exception': {
                            'str(e)': str(e),
                            'type': type(e).__name__,
                            'location': 'metahtml',
                            'traceback': traceback.format_exc()
                        }
                    }
                    pspacy_title = None
                    pspacy_content = None

                # add the results to the batch
                meta_json = json.dumps(meta, default=str)
                batch.append({
                    'accessed_at': accessed_at,
                    'id_source': id_source,
                    'url': url,
                    'jsonb': meta_json,
                    'pspacy_title': pspacy_title,
                    'pspacy_content': pspacy_content
                })

            # bulk insert the batch
            if len(batch) >= batch_size:
                bulk_insert(connection, batch)
                batch = []

        # we have finished looping over the archive;
        # we should bulk insert everything in the batch list that hasn't been inserted
        if len(batch) > 0:
            bulk_insert(connection, batch)
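

# A minimal usage sketch for process_warc_from_disk (paths and ids are placeholders;
# id_source must reference an existing row in the source table):
#
#   engine = sqlalchemy.create_engine('postgresql://user:pass@localhost/db')
#   with engine.connect() as connection:
#       process_warc_from_disk(connection, 'example.warc.gz', id_source=1)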