Example #1
def get_or_cache_pacer_cookies(user_pk, username, password):
    """Get PACER cookies for a user or create and cache fresh ones

    For the PACER Fetch API, we store users' PACER cookies in Redis with a
    short expiration timeout. This way, we never store their password, and
    we only store their cookies temporarily.

    This function attempts to get cookies for a user from Redis. If it finds
    them, it returns them. If not, it attempts to log the user in and then
    returns the fresh cookies (after caching them).

    :param user_pk: The PK of the user attempting to store their credentials.
    Needed to create the key in Redis.
    :param username: The PACER username of the user
    :param password: The PACER password of the user
    :return: Cookies for the PACER user
    """
    r = make_redis_interface("CACHE")
    cookies = get_pacer_cookie_from_cache(user_pk, r=r)
    if cookies:
        return cookies

    # Unable to find cookies in cache. Log in and cache new values.
    cookies = log_into_pacer(username, password)
    cookie_expiration = 60 * 60
    r.set(session_key % user_pk, pickle.dumps(cookies), ex=cookie_expiration)
    return cookies
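Several of these examples rely on a module-level session_key template that isn't part of this listing. A minimal sketch of the assumed shape (the exact key string is an assumption, not taken from the source); Example #9 below shows the matching read path:

# Assumed key template; the real prefix may differ.
session_key = "session:pacer:cookies:user.%s"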
Example #2
def get_count_for_endpoint(endpoint, start, end):
    """Get the count of hits for an endpoint by name, during a date range

    :param endpoint: The endpoint to get the count for. Typically something
    like 'docket-list' or 'docket-detail'
    :param start: The beginning date (inclusive) you want the results for. A
    string to be interpreted by dateparser
    :param end: The end date (inclusive) you want the results for. A string to
    be interpreted by dateparser.
    :return int: The count for that endpoint
    """
    r = make_redis_interface("STATS")
    pipe = r.pipeline()

    dates = [
        d.date().isoformat() for d in rrule(
            DAILY,
            dtstart=parser.parse(start, fuzzy=False),
            until=parser.parse(end, fuzzy=False),
        )
    ]
    for d in dates:
        pipe.zscore("api:v3.endpoint.d:%s.counts" % d, endpoint)
    results = pipe.execute()
    return sum(r for r in results if r)
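A quick usage sketch, with illustrative values for the endpoint name and date range:

# Hypothetical call: total 'docket-list' hits during January 2021.
hits = get_count_for_endpoint("docket-list", "2021-01-01", "2021-01-31")
print("docket-list was hit %s times" % hits)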
Example #3
    def _log_request(self, request):
        d = date.today().isoformat()
        user = request.user
        endpoint = resolve(request.path_info).url_name
        response_ms = self._get_response_ms()

        r = make_redis_interface("STATS")
        pipe = r.pipeline()

        # Global and daily tallies for all URLs.
        pipe.incr("api:v3.count")
        pipe.incr("api:v3.d:%s.count" % d)
        pipe.incr("api:v3.timing", response_ms)
        pipe.incr("api:v3.d:%s.timing" % d, response_ms)

        # Use a sorted set to store the user stats, with the score representing
        # the number of queries the user made total or on a given day.
        user_pk = user.pk or "AnonymousUser"
        pipe.zincrby("api:v3.user.counts", 1, user_pk)
        pipe.zincrby("api:v3.user.d:%s.counts" % d, 1, user_pk)

        # Use a sorted set to store all the endpoints with score representing
        # the number of queries the endpoint received total or on a given day.
        pipe.zincrby("api:v3.endpoint.counts", 1, endpoint)
        pipe.zincrby("api:v3.endpoint.d:%s.counts" % d, 1, endpoint)

        # We create a per-day key in redis for timings. Inside the key we have
        # members for every endpoint, with score of the total time. So to get
        # the average for an endpoint you need to get the number of requests
        # and the total time for the endpoint and divide.
        timing_key = "api:v3.endpoint.d:%s.timings" % d
        pipe.zincrby(timing_key, response_ms, endpoint)

        results = pipe.execute()
        return results
Example #4
def get_homepage_stats():
    """Get any stats that are displayed on the homepage and return them as a
    dict
    """
    r = make_redis_interface("STATS")
    ten_days_ago = make_aware(datetime.today() - timedelta(days=10), utc)
    last_ten_days = [
        "api:v3.d:%s.count" % (date.today() - timedelta(days=x)).isoformat()
        for x in range(0, 10)
    ]
    homepage_data = {
        "alerts_in_last_ten":
        Stat.objects.filter(name__contains="alerts.sent",
                            date_logged__gte=ten_days_ago).aggregate(
                                Sum("count"))["count__sum"],
        "queries_in_last_ten":
        Stat.objects.filter(name="search.results",
                            date_logged__gte=ten_days_ago).aggregate(
                                Sum("count"))["count__sum"],
        "bulk_in_last_ten":
        Stat.objects.filter(name__contains="bulk_data",
                            date_logged__gte=ten_days_ago).aggregate(
                                Sum("count"))["count__sum"],
        "opinions_in_last_ten":
        Opinion.objects.filter(date_created__gte=ten_days_ago).count(),
        "oral_arguments_in_last_ten":
        Audio.objects.filter(date_created__gte=ten_days_ago).count(),
        "api_in_last_ten":
        sum([
            int(result) for result in r.mget(*last_ten_days)
            if result is not None
        ]),
        "users_in_last_ten":
        User.objects.filter(date_joined__gte=ten_days_ago).count(),
        "days_of_oa":
        naturalduration(
            Audio.objects.aggregate(Sum("duration"))["duration__sum"],
            as_dict=True,
        )["d"],
        "viz_in_last_ten":
        SCOTUSMap.objects.filter(
            date_published__gte=ten_days_ago,
            published=True,
        ).count(),
        "visualizations":
        SCOTUSMap.objects.filter(
            published=True,
            deleted=False,
        ).annotate(Count("clusters"), ).filter(
            # Ensures that we only show good stuff on homepage
            clusters__count__gt=10, ).order_by(
                "-date_published",
                "-date_modified",
                "-date_created",
            )[:1],
        "private":
        False,  # VERY IMPORTANT!
    }
    return homepage_data
Example #5
def get_homepage_stats():
    """Get any stats that are displayed on the homepage and return them as a
    dict
    """
    r = make_redis_interface('STATS')
    ten_days_ago = make_aware(datetime.today() - timedelta(days=10), utc)
    last_ten_days = [
        'api:v3.d:%s.count' % (date.today() - timedelta(days=x)).isoformat()
        for x in range(0, 10)
    ]
    homepage_data = {
        'alerts_in_last_ten':
        Stat.objects.filter(name__contains='alerts.sent',
                            date_logged__gte=ten_days_ago).aggregate(
                                Sum('count'))['count__sum'],
        'queries_in_last_ten':
        Stat.objects.filter(name='search.results',
                            date_logged__gte=ten_days_ago).aggregate(
                                Sum('count'))['count__sum'],
        'bulk_in_last_ten':
        Stat.objects.filter(name__contains='bulk_data',
                            date_logged__gte=ten_days_ago).aggregate(
                                Sum('count'))['count__sum'],
        'opinions_in_last_ten':
        Opinion.objects.filter(date_created__gte=ten_days_ago).count(),
        'oral_arguments_in_last_ten':
        Audio.objects.filter(date_created__gte=ten_days_ago).count(),
        'api_in_last_ten':
        sum([
            int(result) for result in r.mget(*last_ten_days)
            if result is not None
        ]),
        'users_in_last_ten':
        User.objects.filter(date_joined__gte=ten_days_ago).count(),
        'days_of_oa':
        naturalduration(
            Audio.objects.aggregate(Sum('duration'))['duration__sum'],
            as_dict=True,
        )['d'],
        'viz_in_last_ten':
        SCOTUSMap.objects.filter(
            date_published__gte=ten_days_ago,
            published=True,
        ).count(),
        'visualizations':
        SCOTUSMap.objects.filter(
            published=True,
            deleted=False,
        ).annotate(Count('clusters'), ).filter(
            # Ensures that we only show good stuff on homepage
            clusters__count__gt=10, ).order_by(
                '-date_published',
                '-date_modified',
                '-date_created',
            )[:1],
        'private':
        False,  # VERY IMPORTANT!
    }
    return homepage_data
Example #6
def clear_queue(queue_name: str):
    """Empty out a queue, nuking the tasks in it."""
    priority_names = [
        make_queue_name_for_pri(queue_name, pri)
        for pri in DEFAULT_PRIORITY_STEPS
    ]
    r = make_redis_interface("CELERY")
    return sum([r.delete(x) for x in priority_names])
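This helper and get_queue_length below both depend on make_queue_name_for_pri and DEFAULT_PRIORITY_STEPS, which split one logical queue into the per-priority lists used by Celery's Redis transport. A rough sketch of the assumed naming convention (the separator and priority steps mirror the transport's defaults and are not confirmed by this listing):

# Sketch only: priority 0 uses the bare queue name; other priorities get a
# suffixed list name, following Celery's Redis transport convention.
DEFAULT_PRIORITY_STEPS = [0, 3, 6, 9]
PRIORITY_SEP = "\x06\x16"


def make_queue_name_for_pri(queue: str, pri: int) -> str:
    if pri not in DEFAULT_PRIORITY_STEPS:
        raise ValueError("Priority not in priority steps")
    return queue if pri == 0 else f"{queue}{PRIORITY_SEP}{pri}"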
Example #7
def make_lasc_search():
    """Create a logged-in LASCSearch object with cookies pulled from cache

    :return: LASCSearch object
    """
    r = make_redis_interface('CACHE')
    session = LASCSession()
    session.cookies = pickle.loads(r.get(LASC_SESSION_COOKIE_KEY))
    return LASCSearch(session)
Example #8
def add_all_cases_to_cl(
    options: Dict[str, Union[List[str], int, str, float]],
) -> None:
    """Iterate over courts and gather iquery results from them.

    :param options: The options from the handle method
    :return None
    """
    q = options["queue"]
    r = make_redis_interface("CACHE")
    # This is a simple dictionary that's populated with the maximum
    # pacer_case_id in the CL DB as of 2021-01-18. The idea is to use this to
    # prevent the scraper from going forever. You can reset it by querying the
    # latest item in the DB by date_filed, and then using r.hmset to save it.
    max_ids = r.hgetall("iquery_max_ids")

    courts = Court.federal_courts.district_pacer_courts().exclude(
        pk__in=["uscfc", "arb", "cit"])
    if options["courts"] != ["all"]:
        courts = courts.filter(pk__in=options["courts"])
    court_ids = list(courts.values_list("pk", flat=True))

    # Create a queue that's a bit longer than the number of courts we're doing
    throttle = CeleryThrottle(queue_name=q, min_items=len(court_ids) * 2)

    iterations_completed = 0
    db_key_cycle = itertools.cycle(settings.DATABASES.keys())
    while (options["iterations"] == 0
           or iterations_completed < options["iterations"]):
        if len(court_ids) == 0:
            # No more courts. Done!
            break

        for court_id in court_ids:
            throttle.maybe_wait()
            try:
                pacer_case_id = r.hincrby("iquery_status", court_id, 1)
                if pacer_case_id > int(max_ids[court_id]):
                    # Enough scraping. Stop doing this court.
                    court_ids.remove(court_id)
                    # Adjust the throttle queue to be shorter.
                    throttle.set_min(len(court_ids) * 2)
                    continue
                make_docket_by_iquery.apply_async(
                    args=(court_id, pacer_case_id, next(db_key_cycle)),
                    queue=q,
                )
            except Exception as e:
                # Cleanup
                r.hincrby("iquery_status", court_id, -1)
                raise e

        iterations_completed += 1
        remaining_iterations = options["iterations"] - iterations_completed
        if remaining_iterations > 0:
            time.sleep(options["iteration_delay"])
Example #9
def get_pacer_cookie_from_cache(user_pk, r=None):
    """Get the cookie for a user from the cache.

    :param r: A redis interface. If not provided, a fresh one is used. This is
    a performance enhancement.
    :return: None if there are no cached cookies, or the cookies if they're found.
    """
    if not r:
        r = make_redis_interface("CACHE")
    pickled_cookie = r.get(session_key % user_pk)
    if pickled_cookie:
        return pickle.loads(pickled_cookie)
Example #10
def get_queue_length(queue_name="celery"):
    """Get the number of tasks in a celery queue.

    :param queue_name: The name of the queue you want to inspect.
    :return: the number of items in the queue.
    """
    priority_names = [
        make_queue_name_for_pri(queue_name, pri)
        for pri in DEFAULT_PRIORITY_STEPS
    ]
    r = make_redis_interface("CELERY")
    return sum([r.llen(x) for x in priority_names])
Example #11
def send_docket_alert(d_pk, since):
    """Send an alert for a given docket

    :param d_pk: The docket PK that was modified
    :param since: If we run alerts, notify users about items *since* this time.
    :return: None
    """
    email_addresses = (User.objects.filter(
        docket_alerts__docket_id=d_pk).distinct().values_list("email",
                                                              flat=True))
    if email_addresses:
        # We have an alert for this docket. Proceed.
        docket = Docket.objects.get(pk=d_pk)
        new_des = DocketEntry.objects.filter(date_created__gte=since,
                                             docket=docket)

        if new_des.count() > 0:
            # Notify every user that's subscribed to this alert.
            case_name = trunc(best_case_name(docket), 100, ellipsis="...")
            subject_template = loader.get_template("docket_alert_subject.txt")
            subject = subject_template.render({
                "docket": docket,
                "count": new_des.count(),
                "case_name": case_name,
            }).strip()  # Remove newlines that editors can insist on adding.
            email_context = {"new_des": new_des, "docket": docket}
            txt_template = loader.get_template("docket_alert_email.txt")
            html_template = loader.get_template("docket_alert_email.html")
            messages = []
            for email_address in email_addresses:
                msg = EmailMultiAlternatives(
                    subject=subject,
                    body=txt_template.render(email_context),
                    from_email=settings.DEFAULT_ALERTS_EMAIL,
                    to=[email_address],
                    headers={"X-Entity-Ref-ID": "docket.alert:%s" % d_pk},
                )
                html = html_template.render(email_context)
                msg.attach_alternative(html, "text/html")
                messages.append(msg)

            # Add a bcc to the first message in the list so that we get a copy.
            messages[0].bcc = ["*****@*****.**"]
            connection = get_connection()
            connection.send_messages(messages)
            tally_stat("alerts.docket.alerts.sent", inc=len(email_addresses))

        DocketAlert.objects.filter(docket=docket).update(date_last_hit=now())

    # Work completed, clear the semaphore
    r = make_redis_interface("ALERTS")
    r.delete(make_alert_key(d_pk))
Example #12
def add_or_update_case_db(self, case_id):
    """Add a case from the LASC MAP using an authenticated session object

    :param self: The celery object
    :param case_id: The case ID to download, for example, '19STCV25157;SS;CV'
    :return: None
    """
    establish_good_login(self)
    lasc = make_lasc_search()

    clean_data = {}
    try:
        clean_data = lasc.get_json_from_internal_case_id(case_id)
        logger.info("Successful Query")
    except RequestException as e:
        retries_remaining = self.max_retries - self.request.retries
        if retries_remaining == 0:
            logger.error("RequestException, unable to get case at %s", case_id)
            return
        logger.info(
            "Failed to get JSON for '%s', with RequestException: %s. "
            "%s retries remaining.",
            case_id,
            e,
            retries_remaining,
        )
        r = make_redis_interface("CACHE")
        r.delete(LASC_SESSION_COOKIE_KEY, LASC_SESSION_STATUS_KEY)
        self.retry()

    if not clean_data:
        logger.info("No information for case %s. Possibly sealed?", case_id)
        return

    ds = Docket.objects.filter(case_id=case_id)
    ds_count = ds.count()
    if ds_count == 0:
        logger.info("Adding lasc case with ID: %s", case_id)
        add_case(case_id, clean_data, lasc.case_data)
    elif ds_count == 1:
        if latest_sha(case_id=case_id) != sha1_of_json_data(lasc.case_data):
            logger.info("Updating lasc case with ID: %s", case_id)
            update_case(lasc, clean_data)
        else:
            logger.info("LASC case is already up to date: %s", case_id)
    else:
        logger.warning(
            "Issue adding or updating lasc case with ID '%s' - Too "
            "many cases in system with that ID (%s cases)",
            case_id,
            ds_count,
        )
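The self argument here is the bound Celery task instance, which is where self.retry(), self.max_retries, and self.request.retries come from. A task like this is typically registered roughly as follows (the app object and retry settings are assumptions, not values from this listing):

# Sketch of the assumed task registration; body elided.
@app.task(bind=True, max_retries=5, ignore_result=True)
def add_or_update_case_db(self, case_id):
    ...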
Example #13
def get_pacer_cookie_from_cache(user_pk: Union[str, int], r: Redis = None):
    """Get the cookie for a user from the cache.

    :param user_pk: The ID of the user, can be a string or an ID
    :param r: A redis interface. If not provided, a fresh one is used. This is
    a performance enhancement.
    :return: None if there are no cached cookies, or the cookies if they're found.
    """
    if not r:
        r = make_redis_interface("CACHE", decode_responses=False)
    pickled_cookie = r.get(session_key % user_pk)
    if pickled_cookie:
        return pickle.loads(pickled_cookie)
Example #14
def redis_writes(request):
    """Just return 200 OK if we can write to redis. Else return 500 Error."""
    r = make_redis_interface("STATS")

    # Increment a counter. If it's "high" reset it. No need to do fancy try/
    # except work here to log or display the error. If there's an error, it'll
    # send a log we can review.
    key = "monitoring:redis-writes"
    v = r.incr(key)
    if v > 100:
        r.set(key, 0)

    return HttpResponse("Successful Redis write.")
Example #15
def login_to_court():
    """Set the login cookies in redis for an LASC user

    Replace any existing cookies in redis.

    :return: None
    """
    r = make_redis_interface("CACHE")
    # Give yourself a few minutes to log in
    r.set(LASC_SESSION_STATUS_KEY, SESSION_IS.LOGGING_IN, ex=60 * 2)
    lasc_session = LASCSession(username=LASC_USERNAME, password=LASC_PASSWORD)
    lasc_session.login()
    cookie_str = str(pickle.dumps(lasc_session.cookies))
    # Done logging in; save the cookies.
    r.set(LASC_SESSION_COOKIE_KEY, cookie_str, ex=60 * 30)
    r.set(LASC_SESSION_STATUS_KEY, SESSION_IS.OK, ex=60 * 30)
Example #16
def skip_unreadable_post(record):
    if record.exc_info:
        exc_value = record.exc_info[1]
        if isinstance(exc_value, UnreadablePostError):
            cache_key = "settings.unreadable_post_error"
            r = make_redis_interface("CACHE")
            if r.get(cache_key) is not None:
                # We've seen this recently; let it through; hitting it a lot
                # might mean something.
                return True
            else:
                # Haven't seen this recently; cache it with a minute expiry,
                # and don't let it through.
                r.set(cache_key, "True", ex=60)
                return False
    return True
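A filter like this is normally attached to a logging handler through Django's django.utils.log.CallbackFilter. A minimal sketch of that wiring in settings.py (the handler choice and names are assumptions):

LOGGING = {
    "version": 1,
    "filters": {
        "skip_unreadable_posts": {
            "()": "django.utils.log.CallbackFilter",
            "callback": skip_unreadable_post,
        },
    },
    "handlers": {
        "mail_admins": {
            "level": "ERROR",
            "filters": ["skip_unreadable_posts"],
            "class": "django.utils.log.AdminEmailHandler",
        },
    },
}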
Example #17
def establish_good_login(self):
    """Make sure that we have good login credentials for LASC in redis

    Checks the login status for LASC. If no status is found, runs the login
    function to store good keys in redis.

    :param self: A Celery task object
    :return: None
    """
    r = make_redis_interface('CACHE')
    status = r.get(LASC_SESSION_STATUS_KEY)
    if status == SESSION_IS.LOGGING_IN:
        self.retry()
    if status == SESSION_IS.OK:
        return
    login_to_court()
Example #18
def get_avg_ms_for_endpoint(endpoint, d):
    """

    :param endpoint: The endpoint to get the average timing for. Typically
    something like 'docket-list' or 'docket-detail'
    :param d: The date to get the timing for (a date object)
    :return: The average number of ms the endpoint took to serve requests on
    that day.
    """
    d_str = d.isoformat()
    r = make_redis_interface("STATS")
    pipe = r.pipeline()
    pipe.zscore("api:v3.endpoint.d:%s.timings" % d_str, endpoint)
    pipe.zscore("api:v3.endpoint.d:%s.counts" % d_str, endpoint)
    results = pipe.execute()

    return results[0] / results[1]
Example #19
def get_count_for_endpoint(endpoint: str, start: str, end: str) -> int:
    """Get the count of hits for an endpoint by name, during a date range

    :param endpoint: The endpoint to get the count for. Typically something
    like 'docket-list' or 'docket-detail'
    :param start: The beginning date (inclusive) you want the results for.
    :param end: The end date (inclusive) you want the results for.
    :return int: The count for that endpoint
    """
    r = make_redis_interface("STATS")
    pipe = r.pipeline()

    dates = make_date_str_list(start, end)
    for d in dates:
        pipe.zscore(f"api:v3.endpoint.d:{d}.counts", endpoint)
    results = pipe.execute()
    return sum(r for r in results if r)
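make_date_str_list isn't part of this listing, but Example #2 above inlines the same logic with dateutil; a sketch consistent with that version (the real helper may also accept datetime objects, as Example #21 suggests):

from dateutil import parser
from dateutil.rrule import DAILY, rrule


def make_date_str_list(start, end):
    """Return an ISO-format date string for every day from start to end."""
    return [
        d.date().isoformat()
        for d in rrule(
            DAILY,
            dtstart=parser.parse(start, fuzzy=False),
            until=parser.parse(end, fuzzy=False),
        )
    ]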
Example #20
def enqueue_docket_alert(d_pk):
    """Enqueue a docket alert or punt it if there's already a task for it.

    :param d_pk: The ID of the docket we're going to send alerts for.
    :return: True if we enqueued the item, False if not.
    """
    # Create an expiring semaphore in redis or check if there's already one
    # there.
    r = make_redis_interface('ALERTS')
    key = make_alert_key(d_pk)
    # Set to True if not already set. Redis doesn't do bools anymore, so use 1.
    currently_enqueued = bool(r.getset(key, 1))
    if currently_enqueued:
        # We've got a task going for this alert.
        return False

    # We don't have a task for this yet. Set an expiration for the new key,
    # and make a new async task. The expiration gives us a safety so that the
    # semaphore *will* eventually go away even if our task or server crashes.
    safety_expiration_timeout = 10 * 60
    r.expire(key, safety_expiration_timeout)
    return True
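make_alert_key only needs to produce a stable per-docket key that this function and send_docket_alert (Example #11) agree on; its exact format isn't shown in this listing. A placeholder sketch:

def make_alert_key(d_pk) -> str:
    # Assumed prefix; the real key string may differ.
    return f"docket.alert.enqueued:{d_pk}"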
Example #21
def get_user_ids_for_date_range(
    start: Union[str, datetime],
    end: Union[str, datetime],
) -> Set[int]:
    """Get a list of user IDs that used the API during a span of time

    :param start: The beginning of when you want to find users (default: all
    time). A str to be interpreted by dateparser.
    :param end: The end of when you want to find users (default today).  A
    str to be interpreted by dateparser.
    :return Set of user IDs during a time period. Will not contain anonymous
    users.
    """
    r = make_redis_interface("STATS")
    pipe = r.pipeline()

    date_strs = make_date_str_list(start, end)
    for d in date_strs:
        pipe.zrange(f"api:v3.user.d:{d}.counts", 0, -1)

    results: list = pipe.execute()
    result_set: set = set().union(*results)
    return {int(i) for i in result_set if i.isdigit()}
Example #22
def get_task_wait(
    task: Task,
    rate: str = "1/s",
    key: str = None,
) -> float:
    """Keep a global throttle for tasks

    Can be used via the `throttle_task` decorator above.

    This implements the timestamp-based algorithm detailed here:

        https://www.figma.com/blog/an-alternative-approach-to-rate-limiting/

    Basically, you keep track of the number of requests and use the key
    expiration as a reset of the counter.

    So you have a rate of 5/m, and your first task comes in. You create a key:

        celery_throttle:task_name = 1
        celery_throttle:task_name.expires = 60

    Another task comes in a few seconds later:

        celery_throttle:task_name = 2
        Do not update the ttl, it now has 58s remaining

    And so forth, until:

        celery_throttle:task_name = 6
        (10s remaining)

    We're over the threshold. Re-queue the task for later. 10s later:

        Key expires b/c no more ttl.

    Another task comes in:

        celery_throttle:task_name = 1
        celery_throttle:task_name.expires = 60

    And so forth.

    ---

    There is also a scheduler that figures out when to re-queue tasks. The idea
    of the scheduler is simple: If you know the rate the tasks can be
    processed, and if you're getting tasks faster than that rate, you can
    schedule each one to take its turn at a reasonable specified time. This is
    implemented by keeping a timestamp in redis indicating when the throttle
    will no longer be clogged up.

    Say you have a rate of 1/5s, and you get tasks as follows:

         Elapsed Time | Task Number
         -------------+------------
              1s      |     1
              2s      |     2
              3s      |     3

    Task number 1 runs immediately, but sets a throttle for five seconds until
    more work can be done. The second comes in and sees that the throttle has a
    ttl of four remaining seconds, so it waits that long. Next, task number 3
    comes in. It sees that the current window is full, and that the next one is
    too — only one task every five seconds, right? It has to wait eight
    seconds: three seconds (for the current window) *plus* five seconds (for
    the next one, which is occupied by task two).

    And so forth.

    :param task: The task that is being checked
    :param rate: How many times the task can be run during the time period.
    Something like, 1/s, 2/h or similar.
    :param key: If given, add this to the key placed in Redis for the item.
    Typically, this will correspond to the value of an argument passed to the
    throttled task.
    :return: If throttled returns a float of how many seconds the task should
    wait until the next open window for processing. If not throttled, returns
    zero (i.e., don't wait).
    """
    task_sub_key = f"{task.name}{':' + str(key) if key else ''}"
    throttle_key = f"celery_throttle:{task_sub_key}"

    r = make_redis_interface("CACHE")

    allowed_task_count, duration = parse_rate(rate)

    # Check the count in redis
    actual_task_count = r.get(throttle_key)
    if actual_task_count is None:
        # No key. Set the value to 1 and set the ttl of the key.
        r.set(throttle_key, 1, ex=duration)
        return 0

    # Key found. Check if we should throttle.
    if int(actual_task_count) < allowed_task_count:
        # We're OK to run the task. Increment our counter, and say things are
        # OK by returning 0.
        new_count = r.incr(throttle_key, 1)
        if new_count == 1:
            # Safety check. If the count is 1 after incrementing, that means we
            # created the key via the incr command. This can happen when it
            # expires between when we `get` its value up above and when we
            # increment it here. If that happens, it lacks a ttl! Set one.
            #
            # N.B. There's no need to worry about a race condition between our
            # incr above, and the `expire` line here b/c without a ttl on this
            # key, it can't expire between these two commands.
            r.expire(throttle_key, duration)
        return 0

    # Over the threshold. Find the next window and schedule the task.
    schedule_key = f"celery_throttle:schedule:{task_sub_key}"
    n = now()
    delay = r.get(schedule_key)
    if delay is None:
        # No schedule yet. Run the task when the current throttle expires.
        return set_for_next_window(r, throttle_key, schedule_key, n)

    # We have a delay, so use it if it's in the future
    delay = parser.parse(delay)
    if delay < n:
        # Delay is in the past. Run the task when the current throttle expires.
        return set_for_next_window(r, throttle_key, schedule_key, n)

    # Delay is in the future; use it and supplement it
    new_time = delay + timedelta(seconds=duration / allowed_task_count)
    r.set(schedule_key, str(new_time))
    return (new_time - n).total_seconds()
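set_for_next_window isn't included in this listing. Judging from how it's called above, it needs to record when the throttle frees up in schedule_key and return the number of seconds to wait; one plausible sketch:

from datetime import timedelta


def set_for_next_window(r, throttle_key, schedule_key, n):
    """Sketch: schedule the next run for when the current throttle expires."""
    ttl = r.ttl(throttle_key)
    if ttl is None or ttl < 0:
        # Missing key or no TTL left; nothing to wait for.
        ttl = 0
    r.set(schedule_key, str(n + timedelta(seconds=ttl)))
    return float(ttl)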
Example #23
def upload_recap_data(options):
    """Upload RECAP data to Internet Archive."""
    q = options["queue"]
    database = options["database"]
    r = make_redis_interface("CACHE")
    redis_key = "recap-docket-last-id"
    last_pk = r.getset(redis_key, 0)
    ds = (Docket.objects.filter(
        Q(ia_upload_failure_count__lte=3)
        | Q(ia_upload_failure_count=None),
        ia_needs_upload=True,
        source__in=Docket.RECAP_SOURCES,
        pk__gt=last_pk,
    ).order_by("pk").only("pk"))

    chunk_size = 100  # Small to save memory
    i = 0
    previous_i = None
    delay_count = 0
    t1 = now()
    logger.info("Sending recap dockets to Internet Archive")
    throttle = CeleryThrottle(queue_name=q, min_items=5)
    while True:
        # Start of quarter needs to be re-analyzed every time through the loop.
        # This ensures that if the quarter changes while this runs, we get the
        # new value.
        params = {
            "pk__gt": last_pk,
            "ia_date_first_change__lt": get_start_of_quarter(),
        }
        for d in ds.filter(**params)[:chunk_size]:
            throttle.maybe_wait()
            upload_recap_json.apply_async(args=(d.pk, database), queue=q)
            i += 1
            if i % 100 == 0:
                # Print a useful log line with expected finish date.
                t2 = now()
                elapsed_minutes = float((t2 - t1).seconds) / 60
                try:
                    rate = i / float(elapsed_minutes)
                    logger.info("Uploaded %s dockets to IA so far (%.01f/m)",
                                i, rate)
                except ZeroDivisionError:
                    # First lap through can be completed in less than 1s.
                    pass
            last_pk = d.pk
            r.set(redis_key, last_pk)

        # Detect if we've hit the end of the loop and reset it if so. We do
        # this by keeping track of the last_pk that we saw the last time the
        # for loop changed. If that PK doesn't change after the for loop has
        # run again, then we know we've hit the end of the loop and we should
        # reset it.
        empty_loop = i == previous_i
        if empty_loop:
            # i is the same as the last time the
            # for loop finished. Reset things.
            if last_pk == 0:
                # We went through the for loop a second time and still didn't
                # do anything. Stall with capped back off.
                delay_count += 1
                max_delay = 60 * 30  # Thirty minutes
                delay = min(delay_count * 60, max_delay)
                time.sleep(delay)
            else:
                delay_count = 0
                last_pk = 0
                r.set(redis_key, 0)
        else:
            previous_i = i
Example #24
def is_rate_okay(task: Task, rate: str = "1/s", key=None) -> bool:
    """Keep a global throttle for tasks

    Can be used via the `throttle_task` decorator above.

    This implements the timestamp-based algorithm detailed here:

        https://www.figma.com/blog/an-alternative-approach-to-rate-limiting/

    Basically, you keep track of the number of requests and use the key
    expiration as a reset of the counter.

    So you have a rate of 5/m, and your first task comes in. You create a key:

        celery_throttle:task_name = 1
        celery_throttle:task_name.expires = 60

    Another task comes in a few seconds later:

        celery_throttle:task_name = 2
        Do not update the ttl, it now has 58s remaining

    And so forth, until:

        celery_throttle:task_name = 6
        (10s remaining)

    We're over the threshold. Re-queue the task for later. 10s later:

        Key expires b/c no more ttl.

    Another task comes in:

        celery_throttle:task_name = 1
        celery_throttle:task_name.expires = 60

    And so forth.

    :param task: The task that is being checked
    :param rate: How many times the task can be run during the time period.
    Something like, 1/s, 2/h or similar.
    :param key: If given, add this to the key placed in Redis for the item.
    Typically, this will correspond to the value of an argument passed to the
    throttled task.
    :return: True if the task may run; False if it should be throttled.
    """
    key = f"celery_throttle:{task.name}{':' + str(key) if key else ''}"

    r = make_redis_interface("CACHE")

    num_tasks, duration = parse_rate(rate)

    # Check the count in redis
    count = r.get(key)
    if count is None:
        # No key. Set the value to 1 and set the ttl of the key.
        r.set(key, 1)
        r.expire(key, duration)
        return True
    else:
        # Key found. Check it.
        if int(count) <= num_tasks:
            # We're OK, run it.
            r.incr(key, 1)
            return True
        else:
            return False
Example #25
def flush_stats():
    # Flush existing stats (else previous tests cause issues)
    r = make_redis_interface("STATS")
    r.flushdb()
Example #26
def is_rate_okay(task: Task, rate: str = "1/s", key=None) -> bool:
    """Keep a global throttle for tasks

    Can be used via the `throttle_task` decorator above.

    This implements the timestamp-based algorithm detailed here:

        https://www.figma.com/blog/an-alternative-approach-to-rate-limiting/

    Basically, you keep track of the number of requests and use the key
    expiration as a reset of the counter.

    So you have a rate of 5/m, and your first task comes in. You create a key:

        celery_throttle:task_name = 1
        celery_throttle:task_name.expires = 60

    Another task comes in a few seconds later:

        celery_throttle:task_name = 2
        Do not update the ttl, it now has 58s remaining

    And so forth, until:

        celery_throttle:task_name = 6
        (10s remaining)

    We're over the threshold. Re-queue the task for later. 10s later:

        Key expires b/c no more ttl.

    Another task comes in:

        celery_throttle:task_name = 1
        celery_throttle:task_name.expires = 60

    And so forth.

    :param task: The task that is being checked
    :param rate: How many times the task can be run during the time period.
    Something like, 1/s, 2/h or similar.
    :param key: If given, add this to the key placed in Redis for the item.
    Typically, this will correspond to the value of an argument passed to the
    throttled task.
    :return: True if the task may run; False if it should be throttled.
    """
    key = f"celery_throttle:{task.name}{':' + str(key) if key else ''}"

    r = make_redis_interface("CACHE")

    allowed_task_count, duration = parse_rate(rate)

    # Check the count in redis
    actual_task_count = r.get(key)
    if actual_task_count is None:
        # No key. Set the value to 1 and set the ttl of the key.
        r.set(key, 1, ex=duration)
        return True
    else:
        # Key found. Check it.
        if int(actual_task_count) <= allowed_task_count:
            # We're OK to run the task. Increment our counter, and say things
            # are OK by returning True.
            new_count = r.incr(key, 1)
            if new_count == 1:
                # Safety check. If the count is 1 after incrementing, that
                # means we created the key via the incr command. This can
                # happen when it expires between when we `get` its value up
                # above and when we increment it here. If that happens, it
                # lacks a ttl! Set one.
                #
                # N.B. There's no need to worry about a race condition between
                # our incr above, and the `expire` line here b/c without a ttl
                # on this key, it can't expire between these two commands.
                r.expire(key, duration)
            return True
        else:
            # Over the threshold.
            return False
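Both versions of is_rate_okay mention a throttle_task decorator that isn't reproduced here. A sketch of how such a decorator could wrap a bound Celery task around this check (the retry countdown and keyword handling are assumptions):

import functools


def throttle_task(rate: str = "1/s", key=None):
    """Sketch: retry a bound task later if it's over its rate limit."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            if not is_rate_okay(self, rate=rate, key=key):
                # Assumed behavior: push the task back onto the queue and
                # try again after one full rate window.
                raise self.retry(countdown=parse_rate(rate)[1])
            return func(self, *args, **kwargs)
        return wrapper
    return decorator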
Example #27
def import_disclosure(self, data: Dict[str, Union[str, int, list]]) -> None:
    """Import disclosures into Courtlistener

    :param data: The disclosure information to process
    :return: None
    """
    # Check download_filepath to see if it has been processed before.
    if has_been_extracted(data):
        logger.info(f"Document already extracted and saved: {data['id']}.")
        return

    interface = make_redis_interface("CACHE")
    disclosure_key = make_disclosure_key(data["id"])
    newly_enqueued = enqueue_disclosure_process(interface, disclosure_key)

    if not newly_enqueued:
        logger.info(f"Process is already running {data['id']}.")
        return

    # Generate PDF content from our three paths
    year = int(data["year"])
    person_id = data["person_id"]

    logger.info(
        f"Processing row {data['id']} for person {person_id} "
        f"in year {year}"
    )

    # Check if we've already extracted
    disclosure_url = get_aws_url(data)
    was_previously_pdfed = has_been_pdfed(disclosure_url)
    pdf_response = generate_or_download_disclosure_as_pdf(
        data, was_previously_pdfed
    )
    pdf_bytes = pdf_response.content

    if pdf_response.status_code != 200:
        logger.info("PDF generation failed.")
        return

    if was_previously_pdfed:
        disclosure = get_disclosure_from_pdf_path(disclosure_url)
    else:
        logger.info("PDF generated successfully.")

        # Sha1 hash - Check for duplicates
        sha1_hash = sha1(pdf_bytes)
        in_system = check_if_in_system(sha1_hash)
        if in_system:
            logger.info("PDF already in system.")
            interface.delete(disclosure_key)
            return

        # Return page count - 0 indicates a failure of some kind, like the
        # PDF not actually being present on AWS.
        pg_count = get_page_count(pdf_bytes)
        if not pg_count:
            logger.info(f"PDF failed for disclosure {data['id']}.")
            interface.delete(disclosure_key)
            return

        # Save Financial Disclosure here to AWS and move onward
        disclosure = FinancialDisclosure(
            year=year,
            page_count=pg_count,
            person=Person.objects.get(id=person_id),
            sha1=sha1_hash,
            has_been_extracted=False,
            download_filepath=data.get("url")
            if data.get("url")
            else data.get("urls")[0],
        )
        # Save and upload PDF
        disclosure.filepath.save(
            f"{disclosure.person.slug}-disclosure.{year}.pdf",
            ContentFile(pdf_bytes),
        )
        logger.info(
            f"Uploaded to https://{settings.AWS_S3_CUSTOM_DOMAIN}/"
            f"{disclosure.filepath}"
        )
    # Extract content from PDF
    content = extract_content(
        pdf_bytes=pdf_bytes, disclosure_type=data["disclosure_type"]
    )
    if not content:
        logger.info("Failed extraction!")
        interface.delete(disclosure_key)
        return

    # Save PDF content
    save_disclosure(extracted_data=content, disclosure=disclosure)
    # Remove disclosure ID in redis for completed disclosure
    interface.delete(disclosure_key)
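enqueue_disclosure_process and make_disclosure_key follow the same Redis-semaphore pattern as enqueue_docket_alert in Example #20; a sketch along those lines (the key format and timeout are assumptions):

def make_disclosure_key(disclosure_id) -> str:
    # Assumed key format.
    return f"disclosure.enqueued:{disclosure_id}"


def enqueue_disclosure_process(r, key, timeout=60 * 60) -> bool:
    """Sketch: True only if this call was the one that set the semaphore."""
    already_running = bool(r.getset(key, 1))
    if already_running:
        return False
    r.expire(key, timeout)
    return True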
Example #28
def invert_user_logs(start, end, add_usernames=True):
    """Invert the user logs for a period of time

    The user logs have the date in the key and the user as part of the set:

        'api:v3.user.d:2016-10-01.counts': {
           mlissner: 22,
           joe_hazard: 33,
        }

    This inverts these entries to:

        users: {
            mlissner: {
                2016-10-01: 22,
                total: 22,
            },
            joe_hazard: {
                2016-10-01: 33,
                total: 33,
            }
        }
    :param start: The beginning date (inclusive) you want the results for.
    :type start: datetime.datetime
    :param end: The end date (inclusive) you want the results for.
    :type end: datetime.datetime
    :param add_usernames: Stats are stored with the user ID. If this is True,
    add an alias in the returned dictionary that contains the username as well.
    :type add_usernames: bool
    :return The inverted dictionary
    :rtype: dict
    """
    r = make_redis_interface("STATS")
    pipe = r.pipeline()

    dates = [
        d.date().isoformat() for d in rrule(
            DAILY,
            dtstart=start,
            until=end,
        )
    ]
    for d in dates:
        pipe.zrange("api:v3.user.d:%s.counts" % d, 0, -1, withscores=True)
    results = pipe.execute()

    # results is a list of results for each of the zrange queries above. Zip
    # those results with the date that created it, and invert the whole thing.
    out = defaultdict(dict)
    for d, result in zip(dates, results):
        for user_id, count in result:
            if user_id == "None" or user_id == "AnonymousUser":
                user_id = "AnonymousUser"
            else:
                user_id = int(user_id)
            count = int(count)
            if out.get(user_id):
                out[user_id][d] = count
                out[user_id]["total"] += count
            else:
                out[user_id] = {d: count, "total": count}

    # Sort the values
    for k, v in out.items():
        out[k] = OrderedDict(sorted(v.items(), key=lambda t: t[0]))

    # Add usernames as alternate keys for every value possible.
    if add_usernames:
        for k, v in out.items():
            try:
                user = User.objects.get(pk=k)
            except (User.DoesNotExist, ValueError):
                pass
            else:
                out[user.username] = v

    return out
Example #29
def invert_user_logs(start, end):
    """Invert the user logs for a period of time

    The user logs have the date in the key and the user as part of the set:

        'api:v3.user.d:2016-10-01.counts': {
           mlissner: 22,
           joe_hazard: 33,
        }

    This inverts these entries to:

        users: {
            mlissner: {
                2016-10-01: 22,
                total: 22,
            },
            joe_hazard: {
                2016-10-01: 33,
                total: 33,
            }
        }
    """
    r = make_redis_interface('STATS')
    pipe = r.pipeline()

    dates = [
        d.date().isoformat() for d in rrule(
            DAILY,
            dtstart=parser.parse(start, fuzzy=False),
            until=parser.parse(end, fuzzy=False),
        )
    ]
    for d in dates:
        pipe.zrange('api:v3.user.d:%s.counts' % d, 0, -1, withscores=True)
    results = pipe.execute()

    # results is a list of results for each of the zrange queries above. Zip
    # those results with the date that created it, and invert the whole thing.
    out = defaultdict(dict)
    for d, result in zip(dates, results):
        for user_id, count in result:
            if user_id == 'None':
                continue
            user_id = int(user_id)
            count = int(count)
            if out.get(user_id):
                out[user_id][d] = count
                out[user_id]['total'] += count
            else:
                out[user_id] = {d: count, 'total': count}

    # Sort the values
    for k, v in out.items():
        out[k] = OrderedDict(sorted(v.items(), key=lambda t: t[0]))

    # Add usernames as alternate keys for every value possible.
    for k, v in out.items():
        try:
            user = User.objects.get(pk=k)
        except (User.DoesNotExist, ValueError):
            pass
        else:
            out[user.username] = v

    return out