def process_cdx_url(connection, url, batch_size=100, source='cc', **kwargs):
    '''
    Crawl every capture of `url` from the given CDX source and bulk insert the
    extracted metainfo into the database.

    NOTE: ideally, this function would be wrapped in a transaction;
    but this causes deadlocks when it is run concurrently with other instances of itself.
    '''
    cdx = cdx_toolkit.CDXFetcher(source)

    # create a new entry in the source table for this bulk insertion
    name = 'process_cdx_url(url="' + str(url) + '", source="' + str(source) + '", **kwargs=' + str(kwargs) + ')'
    log.info("name=" + str(name.replace('"', r'\"')))
    try:
        sql = sqlalchemy.sql.text('''
            INSERT INTO source (name) VALUES (:name) RETURNING id;
            ''')
        res = connection.execute(sql, {'name': name})
        id_source = res.first()['id']
        log.info('id_source=' + str(id_source))

    # if an entry already exists in source,
    # then this bulk insertion has already happened (although it may not be complete),
    # and so we skip this insertion
    except sqlalchemy.exc.IntegrityError:
        logging.warning('skipping name=' + name)
        return

    # ensure that we search all records, and not just records from the last year
    if 'from_ts' not in kwargs:
        kwargs['from_ts'] = '19000101000000'

    # the cc archive supports filtering by status code, but the ia archive does not;
    # since we only care about status=200, add this filter if possible
    if 'filter' not in kwargs and source == 'cc':
        kwargs['filter'] = 'status:200'

    # estimate the total number of matching urls
    estimated_urls = cdx.get_size_estimate(url, kwargs)
    log.info("estimated_urls=" + str(estimated_urls))

    # loop through each matching url and add it to the batch
    batch = []
    for i, result in enumerate(cdx.iter(url, **kwargs)):

        # process only urls with 200 status code (i.e. successful)
        if result['status'] == '200':
            log.info('fetching result; progress=' + str(i) + '/' + str(estimated_urls) + '={:10.4f}'.format(i / estimated_urls) + ' url=' + result['url'])

            # extract a warc record from the result variable
            record = result.fetch_warc_record()

            # extract the information from the warc record
            url = result['url']
            accessed_at = datetime.strptime(result['timestamp'], '%Y%m%d%H%M%S')
            html = result.content
            log.debug("url=" + url)

            # extract the metainfo using the metahtml library
            try:
                meta = metahtml.parse(html, url)
                try:
                    pspacy_title = pspacy.lemmatize(
                        meta['language']['best']['value'],
                        meta['title']['best']['value'])
                    pspacy_content = pspacy.lemmatize(
                        meta['language']['best']['value'],
                        meta['content']['best']['value'])
                except TypeError:
                    pspacy_title = None
                    pspacy_content = None

            # if there was an error in metahtml, log it
            except Exception as e:
                logging.warning('url=' + url + ' exception=' + str(e))
                meta = {
                    'exception': {
                        'str(e)': str(e),
                        'type': type(e).__name__,
                        'location': 'metahtml',
                        'traceback': traceback.format_exc(),
                        }
                    }
                pspacy_title = None
                pspacy_content = None

            # append to the batch
            batch.append({
                'accessed_at': accessed_at,
                'id_source': id_source,
                'url': url,
                'jsonb': json.dumps(meta, default=str),
                'pspacy_title': pspacy_title,
                'pspacy_content': pspacy_content,
                })

            # bulk insert the batch once it is full
            if len(batch) >= batch_size:
                bulk_insert(connection, batch)
                batch = []

    # finished looping over the urls,
    # so insert the last batch and mark the source entry as finished
    if len(batch) > 0:
        bulk_insert(connection, batch)
    sql = sqlalchemy.sql.text('''
        UPDATE source SET finished_at=now() WHERE id=:id;
        ''')
    res = connection.execute(sql, {'id': id_source})
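
# Usage sketch (not part of the original module): one way process_cdx_url might be
# driven from a script. The database DSN and the URL pattern below are hypothetical
# placeholders, and the schema (the source table plus whatever tables bulk_insert
# writes to) is assumed to already exist.
def _example_process_cdx_url(db='postgresql:///crawl', url_pattern='example.com/*'):
    engine = sqlalchemy.create_engine(db)
    with engine.connect() as connection:
        # crawl every capture of the pattern from the Common Crawl index;
        # extra keyword arguments (e.g. from_ts) are forwarded to cdx_toolkit
        process_cdx_url(connection, url_pattern, batch_size=100, source='cc')
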
def process_warc_from_disk(connection, warc_path, id_source, batch_size=100):
    '''
    Load the response records from a WARC file on disk,
    extract their metainfo with metahtml,
    and bulk insert the results into the database under the given id_source.
    '''
    with open(warc_path, 'rb') as stream:

        # for efficiency, we will not insert items into the db one at a time;
        # instead, we add them to the batch list,
        # and then bulk insert the batch list when it reaches len(batch)==batch_size
        batch = []
        for record in ArchiveIterator(stream):

            # WARC files contain many record types;
            # we only care about the HTTP response records
            if record.rec_type == 'response':

                # extract the information from the warc record
                url = record.rec_headers.get_header('WARC-Target-URI')
                accessed_at = record.rec_headers.get_header('WARC-Date')
                html = record.content_stream().read()
                logging.debug("url=" + url)

                # extract the metainfo using the metahtml library
                try:
                    meta = metahtml.parse(html, url)
                    try:
                        pspacy_title = pspacy.lemmatize(
                            meta['language']['best']['value'],
                            meta['title']['best']['value'])
                        pspacy_content = pspacy.lemmatize(
                            meta['language']['best']['value'],
                            meta['content']['best']['value'])
                    except TypeError:
                        pspacy_title = None
                        pspacy_content = None

                # if there was an error in metahtml, log it
                except Exception as e:
                    logging.warning('url=' + url + ' exception=' + str(e))
                    meta = {
                        'exception': {
                            'str(e)': str(e),
                            'type': type(e).__name__,
                            'location': 'metahtml',
                            'traceback': traceback.format_exc(),
                            }
                        }
                    pspacy_title = None
                    pspacy_content = None

                # add the results to the batch
                meta_json = json.dumps(meta, default=str)
                batch.append({
                    'accessed_at': accessed_at,
                    'id_source': id_source,
                    'url': url,
                    'jsonb': meta_json,
                    'pspacy_title': pspacy_title,
                    'pspacy_content': pspacy_content,
                    })

                # bulk insert the batch once it is full
                if len(batch) >= batch_size:
                    bulk_insert(connection, batch)
                    batch = []

        # we have finished looping over the archive,
        # so bulk insert everything in the batch list that hasn't been inserted yet
        if len(batch) > 0:
            bulk_insert(connection, batch)
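
# Usage sketch (not part of the original module): a WARC file on disk has no entry in
# the source table yet, so one is created first and its id is passed through. The DSN
# and the path below are hypothetical placeholders; the source table layout mirrors
# the one used by process_cdx_url above.
def _example_process_warc_from_disk(db='postgresql:///crawl', warc_path='crawl-data/example.warc.gz'):
    engine = sqlalchemy.create_engine(db)
    with engine.connect() as connection:
        # register the WARC file in the source table and get its id
        sql = sqlalchemy.sql.text('''
            INSERT INTO source (name) VALUES (:name) RETURNING id;
            ''')
        res = connection.execute(sql, {'name': 'process_warc_from_disk(warc_path="' + warc_path + '")'})
        id_source = res.first()['id']

        # load the archive's response records under that id
        process_warc_from_disk(connection, warc_path, id_source, batch_size=100)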