def _save_record(url, record, fetch_index):
    """Store one version record for ``url`` unless it is already stored.

    Args:
        url: URL the record belongs to.
        record: Mapping that provides the keys ``timestamp`` (a string in
            ``%Y%m%d%H%M%S`` format), ``length``, ``urlkey``, ``original``,
            ``mimetype``, ``statuscode`` and ``digest``.
        fetch_index: Index of the current fetch run, saved on the new row.

    Returns:
        None. The record is skipped (with a warning) when its timestamp
        cannot be parsed, and silently when a ``WebPageVersion`` with the
        same URL and timestamp already exists.
    """
    # Convert the timestamp string into a proper datetime object.
    # TypeError covers a non-string value (e.g. None), which strptime rejects.
    try:
        timestamp_datetime = datetime.datetime.strptime(
            record["timestamp"], "%Y%m%d%H%M%S"
        )
    except (TypeError, ValueError):
        logger.warning(
            "Invalid timestamp '%s' for URL %s. Skipping record",
            record["timestamp"],
            url,
        )
        return

    # We'll create a new record for the version only if it doesn't yet exist.
    try:
        WebPageVersion.get(url=url, timestamp=timestamp_datetime)
    except WebPageVersion.DoesNotExist:
        pass
    else:
        return

    # In a few exceptional cases the length has the value '-'.
    # We store a null length whenever the value is not an integer.
    try:
        length = int(record["length"])
    except (TypeError, ValueError):
        logger.warning(
            "Length '%s' is not an integer for URL %s", record["length"], url
        )
        length = None

    # Created outside the ``except`` handler so that a failure here is not
    # reported as happening "during handling" of DoesNotExist.
    WebPageVersion.create(
        fetch_index=fetch_index,
        url=url,
        url_key=record["urlkey"],
        timestamp=timestamp_datetime,
        original=record["original"],
        mime_type=record["mimetype"],
        status_code=record["statuscode"],
        digest=record["digest"],
        length=length,
    )
def main(*args, **kwargs):
    """Call ``get_history`` for every distinct search-result URL.

    A new fetch index is computed as one more than the largest
    ``fetch_index`` currently stored in ``WebPageVersion`` (or 1 when the
    query yields nothing truthy). Positional and keyword arguments are
    accepted but not used.
    """
    # Work out the index for this run from the highest one stored so far.
    max_index_query = WebPageVersion.select(fn.Max(WebPageVersion.fetch_index))
    previous_index = max_index_query.scalar()
    if not previous_index:
        previous_index = 0
    fetch_index = previous_index + 1

    # Each distinct URL is handed to get_history together with the new index.
    distinct_urls = SearchResult.select(SearchResult.url).distinct()
    for row in distinct_urls:
        get_history(row.url, fetch_index)