def sweep_missing_downloads():
    """
    Get any documents that are somehow missing.
    
    This function attempts to address issue #671 by checking for any missing
    documents, downloading and parsing them. Hopefully this is a temporary 
    hack that we can soon remove when we deprecate the old RECAP server.
    
    :return: None 
    """
    two_hours_ago = now() - timedelta(hours=2)
    rds = RECAPDocument.objects.filter(
        Q(date_created__gt=two_hours_ago) | Q(date_modified__gt=two_hours_ago),
        is_available=True,
        page_count=None,
    ).order_by()
    for rd in rds:
        # Download the item to the correct location if it doesn't exist
        if not os.path.isfile(rd.filepath_local.path):
            filename = rd.filepath_local.name.rsplit('/', 1)[-1]
            chain(
                download_recap_item.si(rd.filepath_ia, filename),
                set_recap_page_count.si(rd.pk),
                extract_recap_pdf.s(check_if_needed=False).set(priority=5),
                add_or_update_recap_document.s(coalesce_docket=True),
            ).apply_async()
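
A minimal sketch of the signature styles used in the chain above, with toy tasks on an eager (in-process) Celery app: .s() forwards the previous task's result as the first argument, while .si() ignores it. The task names and URL below are illustrative, not the ones from the snippet.

from celery import Celery, chain

app = Celery('sketch')
app.conf.task_always_eager = True  # run tasks in-process for the example

@app.task
def download(url):
    # Pretend we downloaded the file and return its local path.
    return '/tmp/example.pdf'

@app.task
def extract_text(path):
    # Receives download()'s return value because it is chained with .s().
    return 'text extracted from %s' % path

@app.task
def mark_done(pk):
    # Chained with .si(), so it only sees its own argument, not the text above.
    return 'marked %s done' % pk

result = chain(
    download.s('https://example.com/doc.pdf'),  # placeholder URL
    extract_text.s(),
    mark_done.si(42),
).apply_async()
print(result.get())  # 'marked 42 done'
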
def get_dockets(options, items, tags, sample_size=0, doc_num_end=''):
    """Download dockets from PACER.

    :param options: Options provided by argparse
    :param items: Items from our FJC IDB database
    :param tags: A list of tag names to associate with the purchased content.
    :param sample_size: The number of items to get. If 0, get them all. Else,
    get only this many and do it randomly.
    :param doc_num_end: Only get docket numbers up to this value to constrain
    costs. If set to an empty string, no constraints are applied. Note that
    applying this value means no unnumbered entries will be retrieved by PACER.
    """

    if sample_size > 0:
        items = items.order_by('?')[:sample_size]

    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(items):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break

        if i % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()

        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", i, row)

        throttle.maybe_wait()
        params = make_fjc_idb_lookup_params(row)
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=None,
                docket_number=row.docket_number,
                court_id=row.district_id,
                cookies=session.cookies,
                **params
            ).set(queue=q),
            filter_docket_by_tags.s(tags, row.district_id).set(queue=q),
            get_docket_by_pacer_case_id.s(
                court_id=row.district_id,
                cookies=session.cookies,
                tag_names=tags,
                **{
                    'show_parties_and_counsel': True,
                    'show_terminated_parties': True,
                    'show_list_of_member_cases': False,
                    'doc_num_end': doc_num_end,
                }
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
def task(msg):
    logger.info(task.name)
    load_type = msg[mk.LOAD_TYPE]
    assessment_type = None
    if load_type == LoadType.ASSESSMENT:
        assessment_type = msg[mk.ASSESSMENT_TYPE]
    logger.info('DETERMINE END ROUTE: Determining end route by %s' % load_type)
    split_file_tuple_list = msg[mk.SPLIT_FILE_LIST]

    if split_file_tuple_list:
        common_tasks = [W_parallel_csv_load.get_load_from_csv_tasks(msg),
                        W_load_json_to_integration.task.s(),
                        W_file_content_validator.task.s(),
                        W_load_to_integration_table.task.s(),
                        W_load_from_integration_to_star.prepare_target_schema.s()]

        target_tasks = {LoadType.ASSESSMENT: [W_load_from_integration_to_star.get_explode_to_tables_tasks(msg, 'dim'),
                                              W_tasks_utils.handle_group_results.s(),
                                              W_load_from_integration_to_star.handle_record_upsert.s(),
                                              W_load_from_integration_to_star.get_explode_to_tables_tasks(msg, Constants.FACT_TABLE_PREFIX.get(assessment_type)),
                                              W_tasks_utils.handle_group_results.s(),
                                              W_load_from_integration_to_star.handle_deletions.s()],
                        LoadType.STUDENT_REGISTRATION: [W_load_sr_integration_to_target.task.s()]}

        post_etl_tasks = [W_post_etl.task.s(), W_all_done.task.s()]

        chain(common_tasks + target_tasks[load_type] + post_etl_tasks).delay()
    else:
        # We are processing a .err file, so mark the pipeline state as an error.
        msg[mk.PIPELINE_STATE] = 'error'
        W_all_done.task.subtask(args=[msg]).delay()
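
A minimal sketch (toy task, eager app) of the assembly pattern above: the pipeline is built as plain Python lists of signatures, and chain() accepts the concatenated list directly.

from celery import Celery, chain

app = Celery('sketch')
app.conf.task_always_eager = True

@app.task
def step(history, label):
    # Each step appends its label to the history it receives from the chain.
    return (history or []) + [label]

common_tasks = [step.s(None, 'load_csv'), step.s('validate')]
target_tasks = {'assessment': [step.s('explode_dims'), step.s('explode_facts')]}
post_etl_tasks = [step.s('post_etl'), step.s('all_done')]

res = chain(common_tasks + target_tasks['assessment'] + post_etl_tasks).delay()
print(res.get())
# ['load_csv', 'validate', 'explode_dims', 'explode_facts', 'post_etl', 'all_done']
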
def get_pacer_dockets(options, docket_pks, tags):
    """Get the pacer dockets identified by the FJC IDB rows"""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = None
    for i, docket_pk in enumerate(docket_pks):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()
        if i % 1000 == 0 or pacer_session is None:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        d = Docket.objects.get(pk=docket_pk)
        chain(
            get_docket_by_pacer_case_id.s(
                {'pacer_case_id': d.pacer_case_id,
                 'docket_pk': d.pk},
                d.court_id,
                cookies=pacer_session.cookies,
                tag_names=tags,
                **{'show_parties_and_counsel': True,
                   'show_terminated_parties': True,
                   'show_list_of_member_cases': False}
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
Example #5
 def test_splices_chains(self):
     c = chain(
         self.add.s(5, 5),
         chain(self.add.s(6), self.add.s(7), self.add.s(8), app=self.app),
         app=self.app,
     )
     c.freeze()
     tasks, _ = c._frozen
     assert len(tasks) == 4
Example #6
def populate_report_data(start_date, end_date, domain, runner, locations=None, strict=True):
    # first populate all the warehouse tables for all facilities
    # The default start date is hard-coded because it is known to be the first date with data.
    start_date = max(start_date, default_start_date())

    # For QA purposes, generate reporting data for only a small subset of the data.
    if not ILSGatewayConfig.for_domain(domain).all_stock_data:
        if locations is None:
            locations = _get_test_locations(domain)
        facilities = filter(lambda location: location.location_type == 'FACILITY', locations)
        non_facilities_types = ['DISTRICT', 'REGION', 'MSDZONE', 'MOHSW']
        non_facilities = []
        for location_type in non_facilities_types:
            non_facilities.extend(filter(lambda location: location.location_type == location_type, locations))
    else:
        facilities = Location.filter_by_type(domain, 'FACILITY')
        non_facilities = list(Location.filter_by_type(domain, 'DISTRICT'))
        non_facilities += list(Location.filter_by_type(domain, 'REGION'))
        non_facilities += list(Location.filter_by_type(domain, 'MSDZONE'))
        non_facilities += list(Location.filter_by_type(domain, 'MOHSW'))

    if runner.location:
        if runner.location.location_type.name.upper() != 'FACILITY':
            facilities = []
            non_facilities = itertools.dropwhile(
                lambda location: location._id != runner.location.location_id,
                non_facilities
            )
        else:
            facilities = itertools.dropwhile(
                lambda location: location._id != runner.location.location_id,
                facilities
            )

    facilities_chunked_list = chunked(facilities, 5)
    for chunk in facilities_chunked_list:
        res = chain(process_facility_warehouse_data.si(fac, start_date, end_date, runner) for fac in chunk)()
        res.get()

    non_facilities_chunked_list = chunked(non_facilities, 50)

    # then populate everything above a facility off a warehouse table
    for chunk in non_facilities_chunked_list:
        res = chain(
            process_non_facility_warehouse_data.si(org, start_date, end_date, runner, strict)
            for org in chunk
        )()
        res.get()
    runner.location = None
    runner.save()
    # finally go back through the history and initialize empty data for any
    # newly created facilities
    update_historical_data(domain)
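
The loops above lean on chain() accepting a generator of immutable signatures and on calling the chain to run it; a minimal sketch of that pattern with a toy task and an eager app (the chunked() helper here is a stand-in, not the one imported by the snippet).

from celery import Celery, chain

app = Celery('sketch')
app.conf.task_always_eager = True

@app.task
def process_facility(facility_id, start_date, end_date):
    return 'processed %s' % facility_id

def chunked(seq, n):
    # Stand-in for the chunked() helper used above.
    for i in range(0, len(seq), n):
        yield seq[i:i + n]

facilities = ['fac-%d' % i for i in range(12)]
for chunk in chunked(facilities, 5):
    # chain() accepts a generator of signatures; calling the chain runs it,
    # and .get() blocks until the whole chunk has been processed.
    res = chain(process_facility.si(fac, '2024-01-01', '2024-02-01') for fac in chunk)()
    res.get()
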
Example #7
def sync_chain():
    tasks = []
    if settings.GOOGLEADWORDS_SYNC_ACCOUNT:
        tasks.append(sync_accounts.si())
    if settings.GOOGLEADWORDS_SYNC_CAMPAIGN:
        tasks.append(sync_campaigns.si())
    if settings.GOOGLEADWORDS_SYNC_ADGROUP:
        tasks.append(sync_adgroups.si())
    if settings.GOOGLEADWORDS_SYNC_AD:
        tasks.append(sync_ads.si())

    chain(*tasks).apply_async()
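
A minimal sketch of the conditional-assembly pattern above, with plain booleans standing in for the GOOGLEADWORDS_SYNC_* settings; chain(*tasks) unpacks whatever list ends up being built.

from celery import Celery, chain

app = Celery('sketch')
app.conf.task_always_eager = True

@app.task
def sync_accounts():
    return 'accounts synced'

@app.task
def sync_campaigns():
    return 'campaigns synced'

SYNC_ACCOUNT = True   # stand-in for settings.GOOGLEADWORDS_SYNC_ACCOUNT
SYNC_CAMPAIGN = True  # stand-in for settings.GOOGLEADWORDS_SYNC_CAMPAIGN

tasks = []
if SYNC_ACCOUNT:
    tasks.append(sync_accounts.si())
if SYNC_CAMPAIGN:
    tasks.append(sync_campaigns.si())

if tasks:
    chain(*tasks).apply_async()
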
def download_dockets(options):
    """Download dockets listed in the spreadsheet."""
    with open(options['input_file'], 'r') as f:
        dialect = csv.Sniffer().sniff(f.read(1024))
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect)
        q = options['queue']
        task = options['task']
        throttle = CeleryThrottle(queue_name=q,
                                  min_items=options['queue_length'])
        session = PacerSession(username=PACER_USERNAME,
                               password=PACER_PASSWORD)
        session.login()
        for i, row in enumerate(reader):
            if i < options['offset']:
                continue
            if i >= options['limit'] > 0:
                break
            throttle.maybe_wait()

            logger.info("Doing row %s: %s", i, row)

            if row['idb_docket_number']:
                if task == 'download_student_dockets':
                    continue
                # Zero-pad the docket number up to seven digits because Excel
                # ate the leading zeros that these would normally have.
                docket_number = row['idb_docket_number'].rjust(7, '0')
            elif row['student_docket_number']:
                # Use the values collected by student
                # researchers, then cleaned up by mlr.
                docket_number = row['student_docket_number']
            else:
                # No docket number; move on.
                continue
            court = Court.objects.get(fjc_court_id=row['AO ID'].rjust(2, '0'),
                                      jurisdiction=Court.FEDERAL_DISTRICT)
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=docket_number,
                    court_id=court.pk,
                    cookies=session.cookies,
                    case_name=row['Case Name'],
                ).set(queue=q),
                get_docket_by_pacer_case_id.s(
                    court_id=court.pk,
                    cookies=session.cookies,
                    tag_names=[TAG_NAME],
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
def get_attachment_pages(options):
    """Find docket entries that look like invoices and get their attachment
    pages.
    """
    page_size = 100
    main_query = build_main_query_from_query_string(
        Q_DOCS_ONLY,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query)

    q = options['queue']
    recap_user = User.objects.get(username='******')
    throttle = CeleryThrottle(queue_name=q,
                              min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    paginator = Paginator(results, page_size)
    i = 0
    for page_number in range(1, paginator.num_pages + 1):
        paged_results = paginator.page(page_number)
        for result in paged_results.object_list:
            if i < options['offset']:
                i += 1
                continue
            if i >= options['limit'] > 0:
                break

            logger.info("Doing row %s: rd: %s, docket: %s", i, result['id'],
                        result['docket_id'])
            throttle.maybe_wait()
            chain(
                # Query the attachment page and process it
                get_attachment_page_by_rd.s(
                    result['id'], session.cookies).set(queue=q),
                # Take that in a new task and make a PQ object
                make_attachment_pq_object.s(
                    result['id'], recap_user.pk).set(queue=q),
                # And then process that using the normal machinery.
                process_recap_attachment.s(tag_names=[TAG_PHASE_1]).set(queue=q),
            ).apply_async()
            i += 1
        else:
            # Inner loop exited normally (didn't "break")
            continue
        # Inner loop broke. Break outer loop too.
        break
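
The two comments at the bottom of that loop rely on Python's for/else semantics; a small stand-alone illustration (no Celery involved) of how the else clause runs only when the inner loop finishes without break, which is what lets the outer loop be broken as well.

pages = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
limit = 5
count = 0
for page in pages:
    for item in page:
        if count >= limit:
            break          # stop the inner loop early
        print('processing', item)
        count += 1
    else:
        # Inner loop exhausted its page without hitting break; keep going.
        continue
    # Inner loop broke, so break the outer loop too.
    break
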
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q,
                              min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME,
                           password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            i += 1
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result['id'])
        logger.info("Doing item %s w/rd: %s, d: %s", i, rd.pk,
                    result['docket_id'])

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            i += 1
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            i += 1
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG_PHASE_2).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
        i += 1
Example #11
def populate_report_data(start_date, end_date, domain, runner, locations=None, strict=True):
    # first populate all the warehouse tables for all facilities
    # The default start date is hard-coded because it is known to be the first date with data.
    start_date = max(start_date, default_start_date())

    # For QA purposes, generate reporting data for only a small subset of the data.
    if not ILSGatewayConfig.for_domain(domain).all_stock_data:
        if locations is None:
            locations = _get_test_locations(domain)
        facilities = filter(lambda location: location.location_type == "FACILITY", locations)
        non_facilities_types = ["DISTRICT", "REGION", "MSDZONE", "MOHSW"]
        non_facilities = []
        for location_type in non_facilities_types:
            non_facilities.extend(filter(lambda location: location.location_type == location_type, locations))
    else:
        facilities = Location.filter_by_type(domain, "FACILITY")
        non_facilities = list(Location.filter_by_type(domain, "DISTRICT"))
        non_facilities += list(Location.filter_by_type(domain, "REGION"))
        non_facilities += list(Location.filter_by_type(domain, "MSDZONE"))
        non_facilities += list(Location.filter_by_type(domain, "MOHSW"))

    if runner.location:
        if runner.location.location_type.name.upper() != "FACILITY":
            facilities = []
            non_facilities = itertools.dropwhile(
                lambda location: location.location_id != runner.location.location_id, non_facilities
            )
        else:
            facilities = itertools.dropwhile(
                lambda location: location.location_id != runner.location.location_id, facilities
            )

    facilities_chunked_list = chunked(facilities, 5)
    for chunk in facilities_chunked_list:
        res = chain(process_facility_warehouse_data.si(fac, start_date, end_date, runner) for fac in chunk)()
        res.get()

    non_facilities_chunked_list = chunked(non_facilities, 50)

    # then populate everything above a facility off a warehouse table
    for chunk in non_facilities_chunked_list:
        res = chain(
            process_non_facility_warehouse_data.si(org, start_date, end_date, runner, strict) for org in chunk
        )()
        res.get()

    runner.location = None
    runner.save()
Example #12
def _pre_create_undeploy(deployment, search_params, next_task=None):
    """
    Un-deploys during pre-create phase. The versions un-deployed depends upon
    mode of deployment.
    :param deployment: Deployment parameters
    :type deployment: dict
    :return: deployment to continue deploy chain
    :rtype: dict
    """
    deploy_mode = deployment['deployment']['mode']
    if deploy_mode == DEPLOYMENT_MODE_BLUEGREEN:
        # Undeploy only current version in pre-create phase.
        version = deployment['deployment']['version']
    elif deploy_mode == DEPLOYMENT_MODE_REDGREEN:
        # Undeploy all versions in pre-create phase.
        version = None
    else:
        # Do not undeploy anything when mode is custom or A/B
        return next_task.delay() if next_task else None
    name = deployment['deployment']['name']
    undeploy_chain = [
        _fleet_stop.si(name, version=version) |
        _wait_for_stop.si(name, version=version, search_params=search_params) |
        _fleet_undeploy.si(name, version, ignore_error=False),
        _wait_for_undeploy.si(name, version, search_params=search_params)
    ]
    if next_task:
        undeploy_chain.append(next_task)
    return chain(undeploy_chain).delay()
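
The first element of the list above is itself composed with the | operator; a minimal sketch (toy tasks, eager app) showing that piping signatures with | builds the same kind of chain that chain() does, and that such a piped chain can sit inside a list handed to chain().

from celery import Celery, chain

app = Celery('sketch')
app.conf.task_always_eager = True

@app.task
def stop(name):
    return '%s stopped' % name

@app.task
def undeploy(name):
    return '%s undeployed' % name

# The | operator composes signatures into a chain...
piped = stop.si('my-service') | undeploy.si('my-service')
# ...and that chain can be an element of the list passed to chain(), as above.
workflow = chain([piped, undeploy.si('my-service-old')])
print(workflow.delay().get())  # 'my-service-old undeployed'
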
Example #13
    def test_apply(self):
        x = chain(add.s(4, 4), add.s(8), add.s(10))
        res = x.apply()
        self.assertIsInstance(res, EagerResult)
        self.assertEqual(res.get(), 26)

        self.assertEqual(res.parent.get(), 16)
        self.assertEqual(res.parent.parent.get(), 8)
        self.assertIsNone(res.parent.parent.parent)
Example #14
    def test_append_to_empty_chain(self):
        x = chain()
        x |= self.add.s(1, 1)
        x |= self.add.s(1)
        x.freeze()
        tasks, _ = x._frozen
        assert len(tasks) == 2

        assert x.apply().get() == 3
Example #15
    def test_from_dict_full_subtasks(self):
        c = chain(self.add.si(1, 2), self.add.si(3, 4), self.add.si(5, 6))

        serialized = json.loads(json.dumps(c))

        deserialized = chain.from_dict(serialized)

        for task in deserialized.tasks:
            assert isinstance(task, Signature)
Example #16
    def test_apply(self):
        x = chain(self.add.s(4, 4), self.add.s(8), self.add.s(10))
        res = x.apply()
        assert isinstance(res, EagerResult)
        assert res.get() == 26

        assert res.parent.get() == 16
        assert res.parent.parent.get() == 8
        assert res.parent.parent.parent is None
Example #17
 def test_handles_dicts(self):
     c = chain(
         self.add.s(5, 5), dict(self.add.s(8)), app=self.app,
     )
     c.freeze()
     tasks, _ = c._frozen
     for task in tasks:
         assert isinstance(task, Signature)
         assert task.app is self.app
def get_dockets(options):
    """Download the dockets described in the CSV
    """
    f = options['file']
    reader = csv.DictReader(f)
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    pacer_session.login()
    for i, row in enumerate(reader):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break

        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        logger.info("Doing row %s", i)
        throttle.maybe_wait()
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=None,
                docket_number=make_docket_number(row['filecy'], row['docket']),
                court_id='ilnb',
                cookies=pacer_session.cookies,
                office_number=row['office'],
                docket_number_letters='bk',
            ).set(queue=q),
            get_docket_by_pacer_case_id.s(
                court_id='ilnb',
                cookies=pacer_session.cookies,
                tag_names=[TAG],
                **{
                    'show_parties_and_counsel': True,
                    'show_terminated_parties': True,
                    'show_list_of_member_cases': True
                }
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
def do_ocr(options):
    """Do the OCR for any items that need it, then save to the solr index."""
    q = options['queue']
    rds = RECAPDocument.objects.filter(
        ocr_status=RECAPDocument.OCR_NEEDED,
    ).values_list('pk', flat=True).order_by()
    count = rds.count()
    throttle = CeleryThrottle(queue_name=q)
    for i, pk in enumerate(rds):
        throttle.maybe_wait()
        if options['index']:
            extract_recap_pdf.si(pk, skip_ocr=False).set(queue=q).apply_async()
        else:
            chain(
                extract_recap_pdf.si(pk, skip_ocr=False).set(queue=q),
                add_or_update_recap_document.s(coalesce_docket=True).set(queue=q),
            ).apply_async()
        if i % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery so far." % (i + 1, count))
Example #20
def populate_report_data(start_date, end_date, domain, runner, strict=True):
    facilities = SQLLocation.objects.filter(
        location_type__name='FACILITY',
        domain=domain,
        created_at__lt=end_date
    ).order_by('pk')
    non_facilities = _get_locations_by_type(domain, 'DISTRICT')
    non_facilities += _get_locations_by_type(domain, 'REGION')
    non_facilities += _get_locations_by_type(domain, 'MSDZONE')
    non_facilities += _get_locations_by_type(domain, 'MOHSW')

    if runner.location:
        if runner.location.location_type.name.upper() != 'FACILITY':
            facilities = []
            non_facilities = itertools.dropwhile(
                lambda location: location.location_id != runner.location.location_id,
                non_facilities
            )
        else:
            facilities = itertools.dropwhile(
                lambda location: location.location_id != runner.location.location_id,
                facilities
            )

    facilities_chunked_list = chunked(facilities, 5)
    for chunk in facilities_chunked_list:
        res = chain(process_facility_warehouse_data.si(fac, start_date, end_date, runner) for fac in chunk)()
        res.get()

    non_facilities_chunked_list = chunked(non_facilities, 50)

    # then populate everything above a facility off a warehouse table
    for chunk in non_facilities_chunked_list:
        res = chain(
            process_non_facility_warehouse_data.si(org, start_date, end_date, runner, strict)
            for org in chunk
        )()
        res.get()

    runner.location = None
    runner.save()
 def test_multi_tenant_move_to_target_assessment_data(self):
     ASMT_OUTCOME_FILE = self.require_file('INT_SBAC_ASMT_OUTCOME.csv')
     ASMT_FILE = self.require_file('INT_SBAC_ASMT.csv')
     self.verify_target_assessment_schema(self.guid_batch_asmt, True)
     self.load_csv_data_to_integration(ASMT_OUTCOME_FILE, ASMT_FILE, 'int_sbac_asmt_outcome', 'int_sbac_asmt')
     msg = self.create_msg(Constants.LOAD_TYPE_ASSESSMENT, self.guid_batch_asmt)
     dim_tasks = get_explode_to_tables_tasks(msg, 'dim')
     fact_tasks = get_explode_to_tables_tasks(msg, 'fact_asmt')
     tasks = chain(prepare_target_schema.s(msg), dim_tasks, handle_group_results.s(), fact_tasks, handle_group_results.s())
     results = tasks.delay()
     results.get()
     self.verify_target_assessment_schema(self.guid_batch_asmt, False)
    def update_any_missing_pacer_case_ids(options):
        """The network requests were making things far too slow and had to be
        disabled during the first pass. With this method, we update any items
        that are missing their pacer case ID value.
        """
        ds = Docket.objects.filter(
            idb_data__isnull=False,
            pacer_case_id=None,
        )
        q = options['queue']
        throttle = CeleryThrottle(queue_name=q)
        session = PacerSession(username=PACER_USERNAME,
                               password=PACER_PASSWORD)
        session.login()
        for i, d in enumerate(queryset_generator(ds)):
            if i < options['offset']:
                continue
            if i >= options['limit'] > 0:
                break

            if i % 5000 == 0:
                # Re-authenticate just in case the auto-login mechanism isn't
                # working.
                session = PacerSession(username=PACER_USERNAME,
                                       password=PACER_PASSWORD)
                session.login()

            throttle.maybe_wait()
            logger.info("Getting pacer_case_id for item %s", d)
            params = make_fjc_idb_lookup_params(d.idb_data)
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=d.pk,
                    docket_number=d.idb_data.docket_number,
                    court_id=d.idb_data.district_id,
                    cookies=session.cookies,
                    **params
                ).set(queue=q),
                update_docket_from_hidden_api.s().set(queue=q),
            ).apply_async()
Example #23
def _using_lock(self, search_params, name, do_task, cleanup_tasks=None,
                error_tasks=None):
    """
    Applies lock for the deployment

    :return: Lock object (dictionary)
    :rtype: dict
    """
    try:
        lock = LockService().apply_lock(name)
    except ResourceLockedException as lock_error:
        raise self.retry(exc=lock_error)

    _release_lock_s = _release_lock.si(lock)
    cleanup_tasks = cleanup_tasks or []
    if not isinstance(cleanup_tasks, list):
        cleanup_tasks = [cleanup_tasks]

    error_tasks = error_tasks or []
    if not isinstance(error_tasks, list):
        error_tasks = [error_tasks]

    error_tasks.append(_release_lock_s)
    cleanup_tasks.append(_release_lock_s)

    get_store().add_event(
        EVENT_ACQUIRED_LOCK, search_params=search_params, details={
            'name': name
        })

    return (
        do_task |
        async_wait.s(
            default_retry_delay=TASK_SETTINGS['DEPLOYMENT_WAIT_RETRY_DELAY'],
            max_retries=TASK_SETTINGS['DEPLOYMENT_WAIT_RETRIES']
        )
    ).apply_async(
        link=chain(cleanup_tasks),
        link_error=chain(error_tasks)
    )
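
A minimal sketch of the apply_async(link=..., link_error=...) pattern used above, where each callback is itself a chain of follow-up tasks. The task names and broker URL are placeholders; a real broker and worker are assumed, since callbacks are dispatched by the worker that runs the main task.

from celery import Celery, chain

app = Celery('sketch', broker='redis://localhost:6379/0')  # placeholder broker URL

@app.task
def deploy(name):
    return name

@app.task
def notify_success(result):
    print('deployed', result)

@app.task
def notify_failure(request, exc, traceback):
    print('deploy failed:', exc)

@app.task
def release_lock():
    print('lock released')

# link= runs after success, link_error= after failure; either may be a chain,
# which is how the cleanup and error task lists are attached above.
deploy.s('my-service').apply_async(
    link=chain(notify_success.s(), release_lock.si()),
    link_error=chain(notify_failure.s(), release_lock.si()),
)
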
def get_pdfs(options):
    """Get PDFs for the results of the Free Document Report queries.

    At this stage, we have rows in the PACERFreeDocumentRow table, each of 
    which represents a PDF we need to download and merge into our normal 
    tables: Docket, DocketEntry, and RECAPDocument.

    In this function, we iterate over the entire table of results, merge it 
    into our normal tables, and then download and extract the PDF.

    :return: None
    """
    q = options['queue']
    index = options['index']
    cnt = CaseNameTweaker()
    rows = PACERFreeDocumentRow.objects.filter(error_msg="").only('pk')
    count = rows.count()
    task_name = "downloading"
    if index:
        task_name += " and indexing"
    logger.info("%s %s items from PACER." % (task_name, count))
    throttle = CeleryThrottle(queue_name=q)
    completed = 0
    for row in queryset_generator(rows):
        throttle.maybe_wait()
        if completed % 30000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
        chain(
            process_free_opinion_result.si(row.pk, cnt).set(queue=q),
            get_and_process_pdf.s(pacer_session, row.pk, index=index).set(queue=q),
            delete_pacer_row.si(row.pk).set(queue=q),
        ).apply_async()
        completed += 1
        if completed % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery for %s so "
                        "far." % (completed, count, task_name))
def get_and_merge_items(items, log):
    """Get the items returned from the RECAP server and merge them into CL.

    Items is a list of dicts like so, sorted by court, case number, document
    number and attachment number:

    [{'attachment_number': '0',
      'document_number': '1',
      'case_number': '186759',
      'court_id': 'almb',
      'is_available': '0'},
      ...
    ]

    Note that all values are strings. The idea is to iterate over all of these
    dicts, grabbing the docket, and adding any items that have is_available = 1.
    """
    update_log_status(log, RECAPLog.GETTING_AND_MERGING_ITEMS)
    tasks = []
    for prev, item, nxt in previous_and_next(items):
        if prev is None or item['case_number'] != prev['case_number']:
            # New case. Get the next docket before getting any PDFs.
            url = get_docketxml_url(item['court_id'], item['case_number'])
            logger.info("New docket found at: %s" % url)
            filename = get_docket_filename(item['court_id'], item['case_number'])
            tasks.append(download_recap_item.si(url, filename, clobber=True))

        # Get the document
        filename = get_document_filename(item['court_id'], item['case_number'],
                                         item['document_number'],
                                         item['attachment_number'])
        location = os.path.join(settings.MEDIA_ROOT, 'recap', filename)
        if not os.path.isfile(location) and int(item['is_available']):
            # We don't have it yet, and it's available to get. Get it!
            url = get_pdf_url(item['court_id'], item['case_number'], filename)
            tasks.append(download_recap_item.si(url, filename))

        if nxt is None or item['case_number'] != nxt['case_number']:
            # Last item in the case. Send for processing.
            if len(tasks) > 0:
                logger.info("Sending %s tasks for processing." % len(tasks))
                filename = get_docket_filename(item['court_id'],
                                               item['case_number'])
                chord(tasks)(chain(
                    parse_recap_docket.si(filename, debug=False),
                    extract_recap_pdf.s().set(priority=5),
                    add_or_update_recap_document.s(coalesce_docket=True),
                ))
                tasks = []
    logger.info("Finished queueing new cases.")
def get_petitions(options):
    """Just get document number one for every docket that's tagged in this
    collection.
    """
    rds = RECAPDocument.objects.filter(
        tags__name=TAG,
        document_number='1',
        document_type=RECAPDocument.PACER_DOCUMENT,
    ).exclude(
        pacer_doc_id='',
    ).order_by('pk').values_list('pk', flat=True).iterator()
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = PacerSession(username=PACER_USERNAME,
                                 password=PACER_PASSWORD)
    pacer_session.login()
    for i, rd_pk in enumerate(rds):
        if i < options['offset']:
            i += 1
            continue
        if i >= options['limit'] > 0:
            break

        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        logger.info("Doing row %s", i)
        throttle.maybe_wait()

        chain(
            get_pacer_doc_by_rd.s(
                rd_pk, pacer_session.cookies, tag=TAG_PETITIONS).set(queue=q),
            extract_recap_pdf.si(rd_pk).set(queue=q),
            add_items_to_solr.si([rd_pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
Example #27
    def test_group_to_chord__protocol_2(self):
        c = chain(
            group([self.add.s(i, i) for i in range(5)], app=self.app),
            self.add.s(10),
            self.add.s(20),
            self.add.s(30)
        )
        assert isinstance(c, chord)
        assert isinstance(c.body, _chain)
        assert len(c.body.tasks) == 3

        c2 = self.add.s(2, 2) | group(self.add.s(i, i) for i in range(10))
        c2._use_link = False
        tasks2, _ = c2.prepare_steps((), c2.tasks)
        assert isinstance(tasks2[0], group)
Example #28
def start_extract(tenant, request_id, archive_file_names, directories_to_archive, registration_ids, tasks, queue=None):
    '''
    Entry point to start an extract request for one or more extract tasks.
    It groups the CSV generation into a celery task group and then chains it to the next task, which archives the files into one zip.
    '''
    workflow = chain(generate_prepare_path_task(request_id, archive_file_names, directories_to_archive, queue_name=queue),
                     generate_extract_file_tasks(tenant, request_id, tasks, queue_name=queue),
                     extract_group_separator.subtask(immutable=True),  # @UndefinedVariable
                     generate_archive_file_tasks(request_id, archive_file_names, directories_to_archive, queue_name=queue),
                     extract_group_separator.subtask(immutable=True),  # @UndefinedVariable
                     generate_remote_copy_tasks(request_id, archive_file_names, registration_ids, queue_name=queue),
                     extract_group_separator.subtask(immutable=True),  # @UndefinedVariable
                     clean_up.subtask(args=[get_extract_request_base_path(tenant, request_id)], queue_name=queue, immutable=True)  # @UndefinedVariable
                     )
    workflow.apply_async()
Example #29
def _start_bulk(archive_file_path, directory_to_archive, registration_id, gen_tasks, merge_tasks, cover_tasks,
                merge_covers_tasks, pdf_base_dir):
    '''
    Entry point to start a bulk PDF generation request for one or more students.
    It groups the generation of individual PDFs into a celery task group and then chains it to the tasks that merge
    the files into one PDF, archive the PDF into a zip, and upload the zip to HPZ.
    '''
    workflow = chain(group(gen_tasks),
                     group_separator.subtask(immutable=True),
                     group(merge_tasks),
                     group_separator.subtask(immutable=True),
                     group(cover_tasks),
                     group_separator.subtask(immutable=True),
                     group(merge_covers_tasks),
                     archive.subtask(args=(archive_file_path, directory_to_archive), immutable=True),
                     hpz_upload_cleanup.subtask(args=(archive_file_path, registration_id, pdf_base_dir), immutable=True))
    workflow.apply_async()
Example #30
def stock_data_task(api_object):
    # checkpoint logic
    start_date = datetime.today()
    default_api = api_object.apis[0][0]

    checkpoint, _ = StockDataCheckpoint.objects.get_or_create(domain=api_object.domain, defaults={
        'api': default_api,
        'date': None,
        'limit': 1000,
        'offset': 0,
        'location': None,
        'start_date': start_date
    })

    if not checkpoint.start_date:
        checkpoint.start_date = start_date
        checkpoint.save()

    if not api_object.all_stock_data:
        facilities = api_object.test_facilities
    else:
        facilities = api_object.get_ids()

    if checkpoint.location:
        external_id = api_object.get_last_processed_location(checkpoint)
        if external_id:
            facilities = list(itertools.dropwhile(lambda x: int(x) != int(external_id), facilities))
            process_facility_task(api_object, facilities[0], start_from=checkpoint.api)
            facilities = facilities[1:]

    if not checkpoint.date:
        # use subtasks only during initial migration
        facilities_chunked_list = chunked(facilities, 5)

        for chunk in facilities_chunked_list:
            res = chain(process_facility_task.si(api_object, fac) for fac in chunk)()
            res.get()

    else:
        for facility in facilities:
            process_facility_task(api_object, facility)

    checkpoint = StockDataCheckpoint.objects.get(domain=api_object.domain)
    save_stock_data_checkpoint(checkpoint, default_api, 1000, 0, start_date, None, False)
    checkpoint.start_date = None
    checkpoint.save()
Example #31
 def test_kwargs_apply(self):
     x = chain(self.add.s(), self.add.s(8), self.add.s(10))
     res = x.apply(kwargs={'x': 1, 'y': 1}).get()
     assert res == 20
Example #32
 def test_from_dict_no_tasks(self):
     assert chain.from_dict(dict(chain(app=self.app)), app=self.app)
Example #33
 def test_app_falls_back_to_default(self):
     from celery._state import current_app
     assert chain().app is current_app
Example #34
 def test_chord_size_nested_implicit_chain_chain_single(self):
     sig = chord([chain(self.add.s())])
     assert sig.__length_hint__() == 1
Example #35
 def test_from_dict_full_subtasks(self):
     c = chain(self.add.si(1, 2), self.add.si(3, 4), self.add.si(5, 6))
     serialized = json.loads(json.dumps(c))
     deserialized = chain.from_dict(serialized)
     assert all(isinstance(task, Signature) for task in deserialized.tasks)
Example #36
 def test_call_no_tasks(self):
     x = chain()
     assert not x()
Example #37
 def test_chord_size_chain_many(self):
     # Chains get flattened into the encapsulating chord, so even though the
     # chain would only count for 1, the tasks pulled into the chord's header
     # are counted as a bunch of simple signature objects.
     sig = chord(chain([self.add.s()] * 42))
     assert sig.__length_hint__() == 42
Example #38
def get_and_save_free_document_reports(options):
    """Query the Free Doc Reports on PACER and get a list of all the free
    documents. Do not download those items, as that step is done later.
    """
    # Kill any *old* logs that report they're in progress. (They've failed.)
    twelve_hrs_ago = now() - timedelta(hours=12)
    PACERFreeDocumentLog.objects.filter(
        date_started__lt=twelve_hrs_ago,
        status=PACERFreeDocumentLog.SCRAPE_IN_PROGRESS,
    ).update(status=PACERFreeDocumentLog.SCRAPE_FAILED, )

    cl_court_ids = Court.objects.filter(
        jurisdiction__in=[Court.FEDERAL_DISTRICT, Court.FEDERAL_BANKRUPTCY],
        in_use=True,
        end_date=None,
    ).exclude(pk__in=[
        'casb', 'ganb', 'gub', 'innb', 'mieb', 'miwb', 'nmib', 'nvb', 'ohsb',
        'prb', 'tnwb', 'vib'
    ], ).values_list(
        'pk',
        flat=True,
    )
    pacer_court_ids = {
        map_cl_to_pacer_id(v): {
            'until': now(),
            'count': 1,
            'result': None
        }
        for v in cl_court_ids
    }
    pacer_session = PacerSession(username=PACER_USERNAME,
                                 password=PACER_PASSWORD)
    pacer_session.login()

    # Iterate over every court, X days at a time. As courts are completed,
    # remove them from the list of courts to process until none are left
    tomorrow = now() + timedelta(days=1)
    while len(pacer_court_ids) > 0:
        court_ids_copy = pacer_court_ids.copy()  # Make a copy of the list.
        for pacer_court_id, delay in court_ids_copy.items():
            if now() < delay['until']:
                # Do other courts until the delay is up. Do not print/log
                # anything since at the end there will only be one court left.
                continue

            next_start_date, next_end_date = get_next_date_range(
                pacer_court_id)
            if delay['result'] is not None:
                if delay['result'].ready():
                    result = delay['result'].get()
                    if result == PACERFreeDocumentLog.SCRAPE_SUCCESSFUL:
                        if next_start_date >= tomorrow.date():
                            logger.info("Finished '%s'. Marking it complete." %
                                        pacer_court_id)
                            pacer_court_ids.pop(pacer_court_id, None)
                            continue

                    elif result == PACERFreeDocumentLog.SCRAPE_FAILED:
                        logger.error("Encountered critical error on %s "
                                     "(network error?). Marking as failed and "
                                     "pressing on." % pacer_court_id)
                        pacer_court_ids.pop(pacer_court_id, None)
                        continue
                else:
                    next_delay = min(delay['count'] * 5, 30)  # backoff w/cap
                    logger.info(
                        "Court %s still in progress. Delaying at least "
                        "%ss." % (pacer_court_id, next_delay))
                    pacer_court_ids[pacer_court_id]['until'] = (
                        now() + timedelta(seconds=next_delay))
                    pacer_court_ids[pacer_court_id]['count'] += 1
                    continue

            mark_court_in_progress(pacer_court_id, next_end_date)
            pacer_court_ids[pacer_court_id]['count'] = 1  # Reset
            delay['result'] = chain(
                get_and_save_free_document_report.si(pacer_court_id,
                                                     next_start_date,
                                                     next_end_date,
                                                     pacer_session),
                mark_court_done_on_date.s(pacer_court_id, next_end_date),
            ).apply_async()
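
The scheduling loop above keeps the AsyncResult returned by apply_async() and polls it with ready()/get(); a minimal sketch of that polling pattern with toy tasks. The broker and backend URLs are placeholders, and a running worker is assumed, since polling only makes sense outside eager mode.

import time

from celery import Celery, chain

app = Celery('sketch',
             broker='redis://localhost:6379/0',   # placeholder broker URL
             backend='redis://localhost:6379/1')  # placeholder result backend

@app.task
def scrape_report(court_id, start, end):
    return 'scrape_successful'

@app.task
def mark_done(result, court_id, end):
    return result

result = chain(
    scrape_report.si('court-x', '2024-01-01', '2024-01-08'),
    mark_done.s('court-x', '2024-01-08'),
).apply_async()

# Same idea as the loop above: check ready() before get(), and back off
# (here with a plain sleep) while the chain is still running.
while not result.ready():
    time.sleep(5)
print(result.get())
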
Example #39
    def test_chain_tasks(self):
        sigs = [celery_canvas.Signature() for r in range(4)]

        chain = celery_canvas.chain(sigs)
        tasks = inspect.get_chain_tasks(chain)
        assert sigs == tasks
Example #40
def get_dockets(options, items, tags, sample_size=0, doc_num_end=""):
    """Download dockets from PACER.

    :param options: Options provided by argparse
    :param items: Items from our FJC IDB database
    :param tags: A list of tag names to associate with the purchased content.
    :param sample_size: The number of items to get. If 0, get them all. Else,
    get only this many and do it randomly.
    :param doc_num_end: Only get docket numbers up to this value to constrain
    costs. If set to an empty string, no constraints are applied. Note that
    applying this value means no unnumbered entries will be retrieved by PACER.
    """

    if sample_size > 0:
        items = items.order_by("?")[:sample_size]

    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(items):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break

        if i % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            session.login()

        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", i, row)

        throttle.maybe_wait()
        params = make_fjc_idb_lookup_params(row)
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=None,
                docket_number=row.docket_number,
                court_id=row.district_id,
                cookies=session.cookies,
                **params
            ).set(queue=q),
            filter_docket_by_tags.s(tags, row.district_id).set(queue=q),
            get_docket_by_pacer_case_id.s(
                court_id=row.district_id,
                cookies=session.cookies,
                tag_names=tags,
                **{
                    "show_parties_and_counsel": True,
                    "show_terminated_parties": True,
                    "show_list_of_member_cases": False,
                    "doc_num_end": doc_num_end,
                }
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
Example #41
def crawl_xml(merchant_name, location):
    uri = get_rss_feed_uri(merchant_name, location)
    chain(parse_rss.s(uri), extract_data.s(), persist_data.s()).apply_async()
Example #42
def _get_analysis_update_tasks(analysis_id) -> List:
    """ Runs update tasks on nodes that have status=DIRTY """

    tasks = []

    analysis = Analysis.objects.get(pk=analysis_id)
    node_ids = analysis.analysisnode_set.all().values_list("pk", flat=True)
    edges = AnalysisEdge.objects.filter(parent__analysis=analysis).values_list("parent", "child")

    all_nodes_graph = nx.DiGraph()
    all_nodes_graph.add_nodes_from(node_ids)
    all_nodes_graph.add_edges_from(edges)

    logging.info("-" * 60)

    for connected_components in nx.weakly_connected_components(all_nodes_graph):
        sub_graph = all_nodes_graph.subgraph(connected_components)
        sub_graph_node_ids = list(sub_graph)
        # nx.topological_sort returns a flattened list, i.e. it doesn't break nodes into
        # groups that can run in parallel, so use the separate toposort library instead.

        # We need a way to lock/claim the nodes - so someone else calling get_analysis_update_task()
        # doesn't also launch update tasks for them.
        sub_graph_nodes_qs = analysis.analysisnode_set.filter(pk__in=sub_graph_node_ids)
        analysis_update_uuid = uuid.uuid4()
        node_task_records = []
        logging.info("Dirty nodes:")
        for node_id, version in sub_graph_nodes_qs.filter(status=NodeStatus.DIRTY).values_list("pk", "version"):
            node_task = NodeTask(node_id=node_id, version=version, analysis_update_uuid=analysis_update_uuid)
            logging.info(node_task)
            node_task_records.append(node_task)

        if node_task_records:
            NodeTask.objects.bulk_create(node_task_records, ignore_conflicts=True)

        # Return the ones we got the lock for
        node_tasks = NodeTask.objects.filter(analysis_update_uuid=analysis_update_uuid)
        node_versions_to_update = dict(node_tasks.values_list("node_id", "version"))
        logging.info(f"Got lock for: {node_versions_to_update}")

        groups = []
        if node_versions_to_update:
            parent_value_data = defaultdict(set)
            for parent, child_list in nx.to_dict_of_lists(sub_graph).items():
                for child_node_id in child_list:
                    parent_value_data[child_node_id].add(parent)

            nodes_by_id = get_nodes_by_id(sub_graph_nodes_qs.select_subclasses())
            dependencies = _get_node_dependencies(nodes_by_id, parent_value_data)
            topo_sorted = get_toposorted_nodes_from_parent_value_data(nodes_by_id, parent_value_data)

            # Ensure cache loading tasks are only triggered once. Cache can come from different toposort level/groups
            # e.g. MergeNode asks its parent Venn node to cache (which it was already doing)
            all_cache_jobs = set()
            for grp in topo_sorted:
                group_cache_jobs = _add_jobs_for_group(node_versions_to_update, dependencies, grp, groups, all_cache_jobs)
                all_cache_jobs.update(group_cache_jobs)

            # Only update rows whose version matches the one we locked (it may have changed since).
            node_version_q_list = []
            for node_id, version in node_versions_to_update.items():
                node_version_q_list.append(Q(pk=node_id) & Q(version=version))
            q_node_version = reduce(operator.or_, node_version_q_list)
            analysis.analysisnode_set.filter(q_node_version).update(status=NodeStatus.QUEUED)

        if groups:
            t = chain(groups)
            tasks.append(t)

    return tasks
Example #43
def do_pacer_fetch(fq):
    """Process a request made by a user to get an item from PACER.

    :param fq: The PacerFetchQueue item to process
    :return: None
    """
    c = None
    if fq.request_type == REQUEST_TYPE.DOCKET:
        # Request by docket_id
        court_id = fq.court_id or getattr(fq.docket, "court_id", None)
        kwargs = {
            # Universal params
            "court_id": court_id,
            "user_pk": fq.user_id,
            "docket_pk": fq.docket_id,
            # Scraping params
            "doc_num_start": fq.de_number_start,
            "doc_num_end": fq.de_number_end,
            "date_start": fq.de_date_start,
            "date_end": fq.de_date_end,
            "show_parties_and_counsel": fq.show_parties_and_counsel,
            "show_terminated_parties": fq.show_terminated_parties,
            "show_list_of_member_cases": fq.show_list_of_member_cases,
        }
        if (fq.docket_id and not fq.docket.pacer_case_id) or fq.docket_number:
            # We lack the pacer_case_id either on the docket or from the
            # submission. Look it up.
            docket_number = fq.docket_number or getattr(
                fq.docket, "docket_number", None
            )
            c = chain(
                get_pacer_case_id_and_title.si(
                    pass_through=None,
                    docket_number=docket_number,
                    court_id=court_id,
                    user_pk=fq.user_id,
                ),
                get_docket_by_pacer_case_id.s(**kwargs),
            )
        else:
            if fq.docket_id is not None and fq.docket.pacer_case_id:
                # We have the docket and its pacer_case_id
                kwargs.update(
                    {
                        "data": {"pacer_case_id": fq.docket.pacer_case_id},
                        "court_id": fq.docket.court_id,
                    }
                )
            elif fq.pacer_case_id:
                # We lack the docket, but have a pacer_case_id
                kwargs.update(
                    {"data": {"pacer_case_id": fq.pacer_case_id},}
                )
            c = chain(get_docket_by_pacer_case_id.si(**kwargs))
        c |= add_or_update_recap_docket.s()
    elif fq.request_type == REQUEST_TYPE.PDF:
        # Request by recap_document_id
        rd_pk = fq.recap_document_id
        if fq.recap_document_id:
            c = chain(
                fetch_pacer_doc_by_rd.si(rd_pk, fq.pk, fq.user_id),
                extract_recap_pdf.si(rd_pk),
                add_items_to_solr.si([rd_pk], "search.RECAPDocument"),
            )
    if c is not None:
        c |= mark_fq_successful.si(fq.pk)
        c.apply_async()
    else:
        # Somehow failed to make a chain. Mark the request as invalid.
        fq.status = PROCESSING_STATUS.INVALID_CONTENT
        fq.message = "Invalid submission, unable to make chain for processing."
        fq.save()
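
A minimal sketch (toy tasks, eager app) of the c |= signature pattern above, which appends another task to an already-built chain before it is dispatched.

from celery import Celery, chain

app = Celery('sketch')
app.conf.task_always_eager = True

@app.task
def fetch_docket(case_id):
    return {'case_id': case_id}

@app.task
def merge_docket(data):
    return 'merged %s' % data['case_id']

@app.task
def mark_successful(fq_pk):
    return 'fq %s marked successful' % fq_pk

c = chain(fetch_docket.si('12345'))
c |= merge_docket.s()          # extend the pending chain, as above
c |= mark_successful.si(99)    # 99 is a placeholder primary key
print(c.apply_async().get())   # 'fq 99 marked successful'
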
Example #44
def get_dockets(options):
    """Download the dockets described in the CSV according to the `tasks`
    option.
    """
    f = options['file']
    reader = csv.DictReader(f)
    q = options['queue']
    task = options['task']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        if row['Too Old'] == 'Yes':
            continue
        if row['Appellate/District'].lower() != task:
            # Only do appellate when appellate, and district when district.
            continue

        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", i, row)
        throttle.maybe_wait()
        if task == 'appellate':
            chain(
                get_appellate_docket_by_docket_number.s(
                    docket_number=row['Cleaned case_No'],
                    court_id=row['fjc_court_id'],
                    cookies=session.cookies,
                    tag_names=[TAG],
                    **{
                        'show_docket_entries': True,
                        'show_orig_docket': True,
                        'show_prior_cases': True,
                        'show_associated_cases': True,
                        'show_panel_info': True,
                        'show_party_atty_info': True,
                        'show_caption': True,
                    }).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
        elif task == 'district':
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=row['Cleaned case_No'],
                    court_id=row['fjc_court_id'],
                    cookies=session.cookies,
                    case_name=row['Title'],
                ).set(queue=q),
                get_docket_by_pacer_case_id.s(
                    court_id=row['fjc_court_id'],
                    cookies=session.cookies,
                    tag_names=[TAG],
                    **{
                        'show_parties_and_counsel': True,
                        'show_terminated_parties': True,
                        'show_list_of_member_cases': True,
                    }
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
Example #45
 def test_single_expresion(self):
     x = chain(self.add.s(1, 2)).apply()
     assert x.get() == 3
     assert x.parent is None
Example #46
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)

        if options['sweep'] is False:
            # Only allow one script at a time per court combination.
            # Note that multiple scripts on multiple machines could still be
            # run.
            court_str = '-'.join(sorted(options['courts']))
            with open('/tmp/rss-scraper-%s.pid' % court_str, 'w') as fp:
                try:
                    fcntl.lockf(fp, fcntl.LOCK_EX | fcntl.LOCK_NB)
                except IOError:
                    print("Another instance of this program is running with "
                          "for this combination of courts. Only one instance "
                          "can crawl these courts at a time: '%s'" % court_str)
                    sys.exit(1)

        # Loop over the PACER sites that have RSS feeds and see if they're
        # ready to be crawled.
        courts = Court.objects.filter(
            jurisdiction__in=[
                Court.FEDERAL_BANKRUPTCY,
                Court.FEDERAL_DISTRICT,
            ],
            pacer_has_rss_feed=True,
        )
        if options['courts'] != ['all']:
            courts = courts.filter(pk__in=options['courts'])

        iterations_completed = 0
        last_trim_date = None
        while options['iterations'] == 0 or \
                iterations_completed < options['iterations']:
            for court in courts:
                # Check the last time we successfully got the feed
                try:
                    feed_status = RssFeedStatus.objects.filter(
                        court=court,
                        is_sweep=options['sweep'],
                        status__in=[
                            RssFeedStatus.PROCESSING_SUCCESSFUL,
                            RssFeedStatus.UNCHANGED,
                            RssFeedStatus.PROCESSING_IN_PROGRESS,
                        ]
                    ).latest('date_created')
                except RssFeedStatus.DoesNotExist:
                    # First time running it or status items have been nuked by
                    # an admin. Make a dummy object, but no need to actually
                    # save it to the DB. Make it old.
                    lincolns_birthday = make_aware(datetime(1809, 2, 12))
                    feed_status = RssFeedStatus(
                        date_created=lincolns_birthday,
                        date_last_build=lincolns_birthday,
                        is_sweep=options['sweep'],
                    )
                if options['courts'] == ['all'] and options['sweep'] is False:
                    # If it's all courts and it's not a sweep, check if we did
                    # it recently.
                    max_visit_ago = now() - timedelta(
                        seconds=self.RSS_MAX_VISIT_FREQUENCY)
                    if feed_status.date_created > max_visit_ago:
                        # Processed too recently. Try next court.
                        continue

                # Give a court some time to complete during non-sweep crawls
                processing_cutoff = now() - timedelta(
                    seconds=self.RSS_MAX_PROCESSING_DURATION)
                if all([
                    options['sweep'] is False,
                    feed_status.status == RssFeedStatus.PROCESSING_IN_PROGRESS,
                    feed_status.date_created > processing_cutoff
                ]):
                    continue

                # The court is ripe! Crawl it if it has changed.
                # Make a new object to track the attempted crawl.
                new_status = RssFeedStatus.objects.create(
                    court_id=court.pk,
                    status=RssFeedStatus.PROCESSING_IN_PROGRESS,
                    is_sweep=options['sweep'],
                )

                # Check if the item needs crawling, and crawl it if so.
                chain(
                    check_if_feed_changed.s(court.pk, new_status.pk,
                                            feed_status.date_last_build),
                    merge_rss_feed_contents.s(court.pk, new_status.pk),
                    send_docket_alerts.s(),
                    # Update recap *documents*, not *dockets*. Updating dockets
                    # requires much more work, and we don't expect to get much
                    # docket information from the RSS feeds. RSS feeds also
                    # have information about hundreds or thousands of
                    # dockets. Updating them all would be very bad.
                    add_items_to_solr.s('search.RECAPDocument'),
                    mark_status_successful.si(new_status.pk),
                ).apply_async()

            # Trim if not too recently trimmed.
            trim_cutoff_date = now() - timedelta(
                seconds=self.DELAY_BETWEEN_CACHE_TRIMS)
            if last_trim_date is None or trim_cutoff_date > last_trim_date:
                trim_rss_cache.delay()
                last_trim_date = now()

            # Wait, then attempt the courts again if iterations not exceeded.
            iterations_completed += 1
            time.sleep(self.DELAY_BETWEEN_ITERATIONS)
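The single-instance guard at the top of handle() is a standard non-blocking fcntl lock on a file in /tmp: if a second copy of the command starts for the same court combination, lockf() raises and the process exits. A stripped-down sketch of that locking idiom (the path and message here are illustrative):

import fcntl
import sys

def acquire_single_instance_lock(path='/tmp/rss-scraper-example.lock'):
    """Return an open, locked file handle, or exit if another holder exists."""
    fp = open(path, 'w')
    try:
        # LOCK_NB makes this non-blocking: fail immediately rather than wait.
        fcntl.lockf(fp, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
        print("Another instance is already running; exiting.")
        sys.exit(1)
    # Keep fp referenced for the life of the process; closing it releases the lock.
    return fp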
Example #47
 def test_empty_chain_returns_none(self):
     assert chain(app=self.app)() is None
     assert chain(app=self.app).apply_async() is None
Example #48
 def test_empty_chain_returns_none(self):
     self.assertIsNone(chain(app=self.app)())
     self.assertIsNone(chain(app=self.app).apply_async())
Example #49
 def test_chord_size_chain_single(self):
     sig = chord(chain(self.add.s()))
     assert sig.__length_hint__() == 1
Example #50
 def test_accepts_generator_argument(self):
     x = chain(self.add.s(i) for i in range(10))
     self.assertTrue(x.tasks[0].type, self.add)
     self.assertTrue(x.type)
Example #51
 def test_chord_size_nested_chain_chain_many(self):
     # The outer chain will be pulled up into the chord but the lower one
     # remains and will only count as a single final element
     sig = chord(chain(chain([self.add.s()] * 42)))
     assert sig.__length_hint__() == 1
Example #52
 def test_app_falls_back_to_default(self):
     from celery._state import current_app
     self.assertIs(chain().app, current_app)
Example #53
 def test_chord_size_nested_implicit_chain_chain_many(self):
     sig = chord([chain([self.add.s()] * 42)])
     assert sig.__length_hint__() == 1
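The comment in Example #51 explains the counting rule behind these tests: a chain used as a chord element contributes a single slot to the chord's size, no matter how many tasks it contains, because only the chain's final task feeds the chord body. A small consolidated sketch, assuming a Celery version where chord signatures implement __length_hint__ as the tests above do; the app and task are placeholders:

from celery import Celery, chain, chord

app = Celery('sketch')  # no broker needed just to build signatures

@app.task
def add(x, y):
    return x + y

# A 42-task chain nested in the chord header still counts as one element.
header = chain([add.s(1, 1)] * 42)
sig = chord([header])
assert sig.__length_hint__() == 1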
Example #54
def get_attachment_pages(options):
    """Find docket entries that look like invoices and get their attachment
    pages.
    """
    page_size = 100
    main_query = build_main_query_from_query_string(
        Q_DOCS_ONLY,
        {
            "rows": page_size,
            "fl": ["id", "docket_id"]
        },
        {
            "group": False,
            "facet": False,
            "highlight": False
        },
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    results = si.query().add_extra(**main_query)
    si.conn.http_connection.close()

    q = options["queue"]
    recap_user = User.objects.get(username="******")
    throttle = CeleryThrottle(queue_name=q, min_items=options["queue_length"])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    paginator = Paginator(results, page_size)
    i = 0
    for page_number in range(1, paginator.num_pages + 1):
        paged_results = paginator.page(page_number)
        for result in paged_results.object_list:
            if i < options["offset"]:
                i += 1
                continue
            if i >= options["limit"] > 0:
                break

            logger.info(
                "Doing row %s: rd: %s, docket: %s",
                i,
                result["id"],
                result["docket_id"],
            )
            throttle.maybe_wait()
            chain(
                # Query the attachment page and process it
                get_attachment_page_by_rd.s(result["id"],
                                            session.cookies).set(queue=q),
                # Take that in a new task and make a PQ object
                make_attachment_pq_object.s(result["id"],
                                            recap_user.pk).set(queue=q),
                # And then process that using the normal machinery.
                process_recap_attachment.s(
                    tag_names=[TAG_PHASE_1]).set(queue=q),
            ).apply_async()
            i += 1
        else:
            # Inner loop exited normally (didn't "break")
            continue
        # Inner loop broke. Break outer loop too.
        break
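The for/else at the bottom of get_attachment_pages is how the offset/limit window stops both loops: the else clause runs only when the inner loop finishes without a break, so a break in the inner loop falls through to the outer break. A tiny standalone illustration of the idiom, with made-up data:

def first_over_limit(pages, limit):
    """Return the first value greater than limit, scanning page by page."""
    for page in pages:
        for value in page:
            if value > limit:
                found = value
                break  # leave the inner loop
        else:
            continue  # inner loop finished cleanly; try the next page
        break  # inner loop broke; stop the outer loop too
    else:
        found = None  # scanned every page without finding a match
    return found

assert first_over_limit([[1, 2], [3, 9], [12]], 5) == 9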
Example #55
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options["queue"]

    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 20000
    main_query = build_main_query_from_query_string(
        QUERY_STRING,
        {
            "rows": page_size,
            "fl": ["id", "docket_id"]
        },
        {
            "group": False,
            "facet": False,
            "highlight": False
        },
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        logger.info(
            "Doing item %s w/rd: %s, d: %s",
            i,
            result["id"],
            result["docket_id"],
        )

        try:
            rd = RECAPDocument.objects.get(pk=result["id"])
        except RECAPDocument.DoesNotExist:
            logger.warn("Unable to find RECAP Document with id %s",
                        result["id"])
            continue

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG)
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], "search.RECAPDocument").set(queue=q),
        ).apply_async()
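Note the mix of .s() and .si() in the chain above: extract_recap_pdf.si(rd.pk) is an immutable signature, so it ignores whatever get_pacer_doc_by_rd returns and runs only with the arguments it was built with, while a mutable .s() signature receives the parent's result. A small sketch of the difference, with throwaway app and task names:

from celery import Celery, chain

app = Celery('sketch', broker='redis://localhost:6379/0')

@app.task
def produce():
    return 'parent result'

@app.task
def mutable(received=None):
    # With .s(), the parent's return value is prepended to the args,
    # so received == 'parent result'.
    return received

@app.task
def immutable(pk):
    # With .si(), the parent's result is discarded; pk is exactly
    # what was passed when the signature was built.
    return pk

chain(produce.s(), mutable.s()).apply_async()
chain(produce.s(), immutable.si(42)).apply_async()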
Example #56
 def test_chain_of_chain_with_a_single_task(self):
     s = self.add.s(1, 1)
     assert chain([chain(s)]).tasks == list(chain(s).tasks)
Example #57
 def test_call_no_tasks(self):
     x = chain()
     self.assertFalse(x())
Example #58
 def test_clone_preserves_state(self):
     x = chain(self.add.s(i, i) for i in range(10))
     assert x.clone().tasks == x.tasks
     assert x.clone().kwargs == x.kwargs
     assert x.clone().args == x.args
Example #59
def download_documents(options):
    """We've got good values in the new columns, so just need to look those up,
    and get the documents from PACER.
    """
    f = open(options['input_file'], 'r')
    dialect = csv.Sniffer().sniff(f.read(1024))
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect)
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q, min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing row %s: %s", i, row)

        docket_number = row['cl_d_docket_number'] or \
            row['cl_d_docket_number (student)'] or \
            None

        if not docket_number:
            logger.warn("No docket number found for row: %s", i)
            continue
        court = Court.objects.get(fjc_court_id=row['AO ID'].rjust(2, '0'),
                                  jurisdiction=Court.FEDERAL_DISTRICT)

        try:
            d = Docket.objects.get(docket_number=docket_number, court=court)
        except Docket.MultipleObjectsReturned:
            logger.warn("Multiple objects returned for row: %s", i)
            continue
        except Docket.DoesNotExist:
            logger.warn("Could not find docket for row: %s", i)
            continue

        # Got the docket, now get the documents from it, tag & OCR them.
        document_date = datetime.strptime(row['Date'], '%m/%d/%Y').date()
        des = d.docket_entries.filter(date_filed=document_date)
        count = des.count()
        if count == 0:
            logger.warn("No docket entries found for row: %s", i)
            continue
        elif count == 1:
            good_des = [des[0]]
        else:
            # More than one item. Apply filtering rules.
            good_des = filter_des(des)

        # We've got our des, now download them.
        for de in good_des:
            rds = de.recap_documents.filter(
                document_type=RECAPDocument.PACER_DOCUMENT)
            for rd in rds:
                if not rd.pacer_doc_id:
                    logger.warn(
                        "Unable to get pacer_doc_id for item with "
                        "rd_pk: %s. Restricted document?", rd.pk)
                    continue
                if options['task'] == 'add_extra_tags':
                    # Wherein I belatedly realize we need a tag specifically
                    # for this part of the project.
                    add_tags(rd, TAG_NAME_OPINIONS)
                else:
                    # Otherwise, do the normal download thing.
                    chain(
                        get_pacer_doc_by_rd.s(rd.pk,
                                              session.cookies,
                                              tag=TAG_NAME).set(queue=q),
                        extract_recap_pdf.si(rd.pk).set(queue=q),
                        add_or_update_recap_document.si([rd.pk]).set(queue=q),
                    ).apply_async()
    f.close()
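The CSV handling at the top of download_documents (sniff the dialect from the first 1 KB, rewind, then read with DictReader) can also be wrapped in a context manager so the file is closed even if the loop raises. A minimal sketch of that setup, with a hypothetical path argument:

import csv

def read_rows(path):
    """Yield dict rows from a CSV whose dialect is detected at runtime."""
    with open(path, 'r') as f:
        # Sniff the dialect from a small sample, then rewind before reading.
        dialect = csv.Sniffer().sniff(f.read(1024))
        f.seek(0)
        for row in csv.DictReader(f, dialect=dialect):
            yield row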
Example #60
def get_gets_chain(app: Celery, task: models.Task, kwargs: dict, params: dict) -> chain:
    signatures = [
        app.signature(JobNames.get_action, kwargs=kwargs, **params)
        for _ in range(task.gets)
    ]
    return chain(*signatures)
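Unpacking a list of signatures with chain(*signatures) is equivalent to writing the pipeline out by hand, and the returned chain can be dispatched with apply_async() like any other signature. A hedged usage sketch; the app, task name, and kwargs are placeholders rather than the project's real JobNames or models:

from celery import Celery, chain

app = Celery('sketch', broker='redis://localhost:6379/0')

@app.task(name='jobs.get_action')
def get_action(payload=None, target=None):
    return {'target': target}

# Build N identical "get" steps by name, then chain them together.
signatures = [
    app.signature('jobs.get_action', kwargs={'target': 'example.com'})
    for _ in range(3)
]
pipeline = chain(*signatures)
pipeline.apply_async()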