Ejemplo n.º 1
0
def _save_record(url, record, fetch_index):
    """Store one archived-version record for ``url`` unless it already exists.

    Args:
        url: URL the record belongs to; stored on the row and used, together
            with the parsed timestamp, to detect an existing row.
        record: Mapping that must provide the keys ``"timestamp"``,
            ``"length"``, ``"urlkey"``, ``"original"``, ``"mimetype"``,
            ``"statuscode"`` and ``"digest"``.  The timestamp is parsed with
            the format ``%Y%m%d%H%M%S``.
        fetch_index: Value stored in the ``fetch_index`` field of a newly
            created row.

    Returns:
        None.  The record is skipped (with a warning) when its timestamp
        cannot be parsed, and silently skipped when a row with the same URL
        and timestamp is already present.
    """
    # Convert string for the timestamp into a proper datetime object.
    # A non-string value (e.g. None) raises TypeError rather than ValueError,
    # so both are treated as "invalid timestamp".
    try:
        timestamp_datetime = datetime.datetime.strptime(record["timestamp"], "%Y%m%d%H%M%S")
    except (TypeError, ValueError):
        logger.warning("Invalid timestamp '%s' for URL %s.  Skipping record", record["timestamp"], url)
        return

    # We'll create a new record for the version only if it doesn't yet exist.
    try:
        WebPageVersion.get(url=url, timestamp=timestamp_datetime)
    except WebPageVersion.DoesNotExist:
        pass
    else:
        return

    # In a few exceptional cases, I've found that the length has
    # the value '-'.  We store a null length when we encounter '-'
    # (or any other value that cannot be converted, such as None).
    try:
        length = int(record["length"])
    except (TypeError, ValueError):
        logger.warning("Length '%s' is not an integer for URL %s", record["length"], url)
        length = None

    WebPageVersion.create(
        fetch_index=fetch_index,
        url=url,
        url_key=record["urlkey"],
        timestamp=timestamp_datetime,
        original=record["original"],
        mime_type=record["mimetype"],
        status_code=record["statuscode"],
        digest=record["digest"],
        length=length,
    )
Ejemplo n.º 2
0
def main(*args, **kwargs):
    """Fetch history for every distinct search-result URL under a new fetch index.

    The new fetch index is one greater than the largest ``fetch_index``
    currently stored on ``WebPageVersion``.  ``get_history`` is then called
    once per distinct ``SearchResult.url`` with that index.  The positional
    and keyword arguments are accepted but not used.
    """
    # Create a new fetch index.  A falsy maximum (presumably None when no
    # versions have been stored yet) is treated as 0, so numbering starts at 1.
    highest_index = WebPageVersion.select(fn.Max(WebPageVersion.fetch_index)).scalar()
    fetch_index = (highest_index or 0) + 1

    distinct_urls = SearchResult.select(SearchResult.url).distinct()
    for row in distinct_urls:
        get_history(row.url, fetch_index)