def sweep_missing_downloads():
    """
    Get any documents that somehow are missing.

    This function attempts to address issue #671 by checking for any missing
    documents, downloading and parsing them. Hopefully this is a temporary
    hack that we can soon remove when we deprecate the old RECAP server.

    :return: None
    """
    two_hours_ago = now() - timedelta(hours=2)
    rds = RECAPDocument.objects.filter(
        Q(date_created__gt=two_hours_ago) |
        Q(date_modified__gt=two_hours_ago),
        is_available=True,
        page_count=None,
    ).order_by()
    for rd in rds:
        # Download the item to the correct location if it doesn't exist
        if not os.path.isfile(rd.filepath_local.path):
            filename = rd.filepath_local.name.rsplit('/', 1)[-1]
            chain(
                download_recap_item.si(rd.filepath_ia, filename),
                set_recap_page_count.si(rd.pk),
                extract_recap_pdf.s(check_if_needed=False).set(priority=5),
                add_or_update_recap_document.s(coalesce_docket=True),
            ).apply_async()
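# A minimal, standalone sketch (not from the project above) of the .s()/.si()
# distinction the chain in sweep_missing_downloads relies on: an immutable
# signature (.si()) ignores the previous task's result, while a regular
# signature (.s()) receives it as its first argument. All task names here are
# invented for illustration; .apply() runs the chain eagerly, with no broker.
from celery import Celery, chain

app = Celery('sketch')

@app.task
def download(url):
    return '/tmp/%s' % url.rsplit('/', 1)[-1]

@app.task
def count_pages(path):
    return len(path)  # stand-in for real page counting

@app.task
def log_done(pk):
    return 'done: %s' % pk

# count_pages.s() gets download()'s return value; log_done.si(42) does not.
result = chain(download.s('http://example.com/a.pdf'),
               count_pages.s(),
               log_done.si(42)).apply()
assert result.get() == 'done: 42'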
def get_dockets(options, items, tags, sample_size=0, doc_num_end=''): """Download dockets from PACER. :param options: Options provided by argparse :param items: Items from our FJC IDB database :param tags: A list of tag names to associate with the purchased content. :param sample_size: The number of items to get. If 0, get them all. Else, get only this many and do it randomly. :param doc_num_end: Only get docket numbers up to this value to constrain costs. If set to an empty string, no constraints are applied. Note that applying this value means no unnumbered entries will be retrieved by PACER. """ if sample_size > 0: items = items.order_by('?')[:sample_size] q = options['queue'] throttle = CeleryThrottle(queue_name=q) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() for i, row in enumerate(items): if i < options['offset']: continue if i >= options['limit'] > 0: break if i % 5000 == 0: # Re-authenticate just in case the auto-login mechanism isn't # working. session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() # All tests pass. Get the docket. logger.info("Doing row %s: %s", i, row) throttle.maybe_wait() params = make_fjc_idb_lookup_params(row) chain( get_pacer_case_id_and_title.s( pass_through=None, docket_number=row.docket_number, court_id=row.district_id, cookies=session.cookies, **params ).set(queue=q), filter_docket_by_tags.s(tags, row.district_id).set(queue=q), get_docket_by_pacer_case_id.s( court_id=row.district_id, cookies=session.cookies, tag_names=tags, **{ 'show_parties_and_counsel': True, 'show_terminated_parties': True, 'show_list_of_member_cases': False, 'doc_num_end': doc_num_end, } ).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async()
def task(msg):
    logger.info(task.name)
    load_type = msg[mk.LOAD_TYPE]
    assessment_type = None
    if load_type == LoadType.ASSESSMENT:
        assessment_type = msg[mk.ASSESSMENT_TYPE]
    logger.info('DETERMINE END ROUTE: Determining end route by %s' % load_type)
    split_file_tuple_list = msg[mk.SPLIT_FILE_LIST]
    if split_file_tuple_list:
        common_tasks = [W_parallel_csv_load.get_load_from_csv_tasks(msg),
                        W_load_json_to_integration.task.s(),
                        W_file_content_validator.task.s(),
                        W_load_to_integration_table.task.s(),
                        W_load_from_integration_to_star.prepare_target_schema.s()]
        target_tasks = {
            LoadType.ASSESSMENT: [
                W_load_from_integration_to_star.get_explode_to_tables_tasks(msg, 'dim'),
                W_tasks_utils.handle_group_results.s(),
                W_load_from_integration_to_star.handle_record_upsert.s(),
                W_load_from_integration_to_star.get_explode_to_tables_tasks(
                    msg, Constants.FACT_TABLE_PREFIX.get(assessment_type)),
                W_tasks_utils.handle_group_results.s(),
                W_load_from_integration_to_star.handle_deletions.s()],
            LoadType.STUDENT_REGISTRATION: [W_load_sr_integration_to_target.task.s()]
        }
        post_etl_tasks = [W_post_etl.task.s(), W_all_done.task.s()]
        chain(common_tasks + target_tasks[load_type] + post_etl_tasks).delay()
    else:
        # No split files means we processed a .err file, so mark the pipeline
        # state as an error.
        msg[mk.PIPELINE_STATE] = 'error'
        W_all_done.task.subtask(args=[msg]).delay()
def get_pacer_dockets(options, docket_pks, tags):
    """Get the pacer dockets identified by the FJC IDB rows"""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = None
    for i, docket_pk in enumerate(docket_pks):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()
        if i % 1000 == 0 or pacer_session is None:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        d = Docket.objects.get(pk=docket_pk)
        chain(
            get_docket_by_pacer_case_id.s(
                {'pacer_case_id': d.pacer_case_id,
                 'docket_pk': d.pk},
                d.court_id,
                cookies=pacer_session.cookies,
                tag_names=tags,
                **{'show_parties_and_counsel': True,
                   'show_terminated_parties': True,
                   'show_list_of_member_cases': False}
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
def test_splices_chains(self):
    c = chain(
        self.add.s(5, 5),
        chain(self.add.s(6), self.add.s(7), self.add.s(8), app=self.app),
        app=self.app,
    )
    c.freeze()
    tasks, _ = c._frozen
    assert len(tasks) == 4
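# A small, standalone sketch of the splicing behaviour test_splices_chains
# checks: a chain nested inside another chain is flattened into one sequence
# of tasks when the outer chain is frozen. The add task is invented here, and
# the private _frozen attribute is used only because the tests above do the
# same; .apply() runs everything eagerly without a broker.
from celery import Celery, chain

app = Celery('sketch')

@app.task
def add(x, y):
    return x + y

outer = chain(add.s(5, 5), chain(add.s(6), add.s(7), add.s(8)))
outer.freeze()
tasks, _ = outer._frozen
assert len(tasks) == 4              # the nested chain was spliced in, 1 + 3
assert outer.apply().get() == 31    # 5+5, then +6, +7, +8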
def populate_report_data(start_date, end_date, domain, runner, locations=None, strict=True): # first populate all the warehouse tables for all facilities # hard coded to know this is the first date with data start_date = max(start_date, default_start_date()) # For QA purposes generate reporting data for only some small part of data. if not ILSGatewayConfig.for_domain(domain).all_stock_data: if locations is None: locations = _get_test_locations(domain) facilities = filter(lambda location: location.location_type == 'FACILITY', locations) non_facilities_types = ['DISTRICT', 'REGION', 'MSDZONE', 'MOHSW'] non_facilities = [] for location_type in non_facilities_types: non_facilities.extend(filter(lambda location: location.location_type == location_type, locations)) else: facilities = Location.filter_by_type(domain, 'FACILITY') non_facilities = list(Location.filter_by_type(domain, 'DISTRICT')) non_facilities += list(Location.filter_by_type(domain, 'REGION')) non_facilities += list(Location.filter_by_type(domain, 'MSDZONE')) non_facilities += list(Location.filter_by_type(domain, 'MOHSW')) if runner.location: if runner.location.location_type.name.upper() != 'FACILITY': facilities = [] non_facilities = itertools.dropwhile( lambda location: location._id != runner.location.location_id, non_facilities ) else: facilities = itertools.dropwhile( lambda location: location._id != runner.location.location_id, facilities ) facilities_chunked_list = chunked(facilities, 5) for chunk in facilities_chunked_list: res = chain(process_facility_warehouse_data.si(fac, start_date, end_date, runner) for fac in chunk)() res.get() non_facilities_chunked_list = chunked(non_facilities, 50) # then populate everything above a facility off a warehouse table for chunk in non_facilities_chunked_list: res = chain( process_non_facility_warehouse_data.si(org, start_date, end_date, runner, strict) for org in chunk )() res.get() runner.location = None runner.save() # finally go back through the history and initialize empty data for any # newly created facilities update_historical_data(domain)
def sync_chain():
    tasks = []
    if settings.GOOGLEADWORDS_SYNC_ACCOUNT:
        tasks.append(sync_accounts.si())
    if settings.GOOGLEADWORDS_SYNC_CAMPAIGN:
        tasks.append(sync_campaigns.si())
    if settings.GOOGLEADWORDS_SYNC_ADGROUP:
        tasks.append(sync_adgroups.si())
    if settings.GOOGLEADWORDS_SYNC_AD:
        tasks.append(sync_ads.si())
    chain(*tasks).apply_async()
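# A standalone, illustrative sketch (settings flag and task names are invented)
# of the pattern sync_chain uses: collect immutable signatures conditionally,
# then unpack the list into chain(). With .si() each step ignores the previous
# step's return value, so the steps are independent and merely ordered.
from celery import Celery, chain

app = Celery('sketch')

@app.task
def sync_accounts():
    return 'accounts'

@app.task
def sync_campaigns():
    return 'campaigns'

ENABLE_CAMPAIGNS = True  # stands in for a settings flag

tasks = [sync_accounts.si()]
if ENABLE_CAMPAIGNS:
    tasks.append(sync_campaigns.si())

# chain(*tasks).apply_async() in production; .apply() runs it eagerly here.
assert chain(*tasks).apply().get() == 'campaigns'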
def download_dockets(options): """Download dockets listed in the spreadsheet.""" with open(options['input_file'], 'r') as f: dialect = csv.Sniffer().sniff(f.read(1024)) f.seek(0) reader = csv.DictReader(f, dialect=dialect) q = options['queue'] task = options['task'] throttle = CeleryThrottle(queue_name=q, min_items=options['queue_length']) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() for i, row in enumerate(reader): if i < options['offset']: continue if i >= options['limit'] > 0: break throttle.maybe_wait() logger.info("Doing row %s: %s", i, row) if row['idb_docket_number']: if task == 'download_student_dockets': continue # Zero-pad the docket number up to seven digits because Excel # ate the leading zeros that these would normally have. docket_number = row['idb_docket_number'].rjust(7, '0') elif row['student_docket_number']: # Use the values collected by student # researchers, then cleaned up my mlr. docket_number = row['student_docket_number'] else: # No docket number; move on. continue court = Court.objects.get(fjc_court_id=row['AO ID'].rjust(2, '0'), jurisdiction=Court.FEDERAL_DISTRICT) chain( get_pacer_case_id_and_title.s( pass_through=None, docket_number=docket_number, court_id=court.pk, cookies=session.cookies, case_name=row['Case Name'], ).set(queue=q), get_docket_by_pacer_case_id.s( court_id=court.pk, cookies=session.cookies, tag_names=[TAG_NAME], ).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async()
def get_attachment_pages(options): """Find docket entries that look like invoices and get their attachment pages. """ page_size = 100 main_query = build_main_query_from_query_string( Q_DOCS_ONLY, {'rows': page_size, 'fl': ['id', 'docket_id']}, {'group': False, 'facet': False}, ) si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r') results = si.query().add_extra(**main_query) q = options['queue'] recap_user = User.objects.get(username='******') throttle = CeleryThrottle(queue_name=q, min_items=options['queue_length']) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() paginator = Paginator(results, page_size) i = 0 for page_number in range(1, paginator.num_pages + 1): paged_results = paginator.page(page_number) for result in paged_results.object_list: if i < options['offset']: i += 1 continue if i >= options['limit'] > 0: break logger.info("Doing row %s: rd: %s, docket: %s", i, result['id'], result['docket_id']) throttle.maybe_wait() chain( # Query the attachment page and process it get_attachment_page_by_rd.s( result['id'], session.cookies).set(queue=q), # Take that in a new task and make a PQ object make_attachment_pq_object.s( result['id'], recap_user.pk).set(queue=q), # And then process that using the normal machinery. process_recap_attachment.s(tag_names=[TAG_PHASE_1]).set(queue=q), ).apply_async() i += 1 else: # Inner loop exited normally (didn't "break") continue # Inner loop broke. Break outer loop too. break
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q, min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            i += 1
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result['id'])
        logger.info("Doing item %s w/rd: %s, d: %s",
                    i, rd.pk, result['docket_id'])

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            i += 1
            continue
        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            i += 1
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG_PHASE_2).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
        i += 1
def populate_report_data(start_date, end_date, domain, runner, locations=None, strict=True): # first populate all the warehouse tables for all facilities # hard coded to know this is the first date with data start_date = max(start_date, default_start_date()) # For QA purposes generate reporting data for only some small part of data. if not ILSGatewayConfig.for_domain(domain).all_stock_data: if locations is None: locations = _get_test_locations(domain) facilities = filter(lambda location: location.location_type == "FACILITY", locations) non_facilities_types = ["DISTRICT", "REGION", "MSDZONE", "MOHSW"] non_facilities = [] for location_type in non_facilities_types: non_facilities.extend(filter(lambda location: location.location_type == location_type, locations)) else: facilities = Location.filter_by_type(domain, "FACILITY") non_facilities = list(Location.filter_by_type(domain, "DISTRICT")) non_facilities += list(Location.filter_by_type(domain, "REGION")) non_facilities += list(Location.filter_by_type(domain, "MSDZONE")) non_facilities += list(Location.filter_by_type(domain, "MOHSW")) if runner.location: if runner.location.location_type.name.upper() != "FACILITY": facilities = [] non_facilities = itertools.dropwhile( lambda location: location.location_id != runner.location.location_id, non_facilities ) else: facilities = itertools.dropwhile( lambda location: location.location_id != runner.location.location_id, facilities ) facilities_chunked_list = chunked(facilities, 5) for chunk in facilities_chunked_list: res = chain(process_facility_warehouse_data.si(fac, start_date, end_date, runner) for fac in chunk)() res.get() non_facilities_chunked_list = chunked(non_facilities, 50) # then populate everything above a facility off a warehouse table for chunk in non_facilities_chunked_list: res = chain( process_non_facility_warehouse_data.si(org, start_date, end_date, runner, strict) for org in chunk )() res.get() runner.location = None runner.save()
def _pre_create_undeploy(deployment, search_params, next_task=None):
    """
    Un-deploys during the pre-create phase. The versions un-deployed depend
    upon the mode of deployment.

    :param deployment: Deployment parameters
    :type deployment: dict
    :return: deployment to continue deploy chain
    :rtype: dict
    """
    deploy_mode = deployment['deployment']['mode']
    if deploy_mode == DEPLOYMENT_MODE_BLUEGREEN:
        # Undeploy only the current version in the pre-create phase.
        version = deployment['deployment']['version']
    elif deploy_mode == DEPLOYMENT_MODE_REDGREEN:
        # Undeploy all versions in the pre-create phase.
        version = None
    else:
        # Do not undeploy anything when mode is custom or A/B
        return next_task.delay() if next_task else None
    name = deployment['deployment']['name']
    undeploy_chain = [
        _fleet_stop.si(name, version=version) |
        _wait_for_stop.si(name, version=version, search_params=search_params) |
        _fleet_undeploy.si(name, version, ignore_error=False),
        _wait_for_undeploy.si(name, version, search_params=search_params)
    ]
    if next_task:
        undeploy_chain.append(next_task)
    return chain(undeploy_chain).delay()
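# A standalone sketch (task names invented) of the composition style
# _pre_create_undeploy mixes: the | operator and chain(...) build the same
# kind of pipeline, and chain() also accepts a list of signatures. .apply()
# runs everything eagerly, with no broker.
from celery import Celery, chain

app = Celery('sketch')

@app.task
def step(acc, label):
    return acc + '>' + label

piped = step.si('start', 'stop') | step.s('undeploy')
built = chain(step.si('start', 'stop'), step.s('undeploy'))
assert piped.apply().get() == built.apply().get() == 'start>stop>undeploy'

# Piping a further signature extends the pipeline, as the undeploy_chain list
# above does by appending next_task before calling chain(undeploy_chain).
extended = step.si('start', 'stop') | step.s('undeploy') | step.s('notify')
assert extended.apply().get() == 'start>stop>undeploy>notify'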
def test_apply(self):
    x = chain(add.s(4, 4), add.s(8), add.s(10))
    res = x.apply()
    self.assertIsInstance(res, EagerResult)
    self.assertEqual(res.get(), 26)
    self.assertEqual(res.parent.get(), 16)
    self.assertEqual(res.parent.parent.get(), 8)
    self.assertIsNone(res.parent.parent.parent)
def test_append_to_empty_chain(self):
    x = chain()
    x |= self.add.s(1, 1)
    x |= self.add.s(1)
    x.freeze()
    tasks, _ = x._frozen
    assert len(tasks) == 2
    assert x.apply().get() == 3
def test_from_dict_full_subtasks(self):
    c = chain(self.add.si(1, 2), self.add.si(3, 4), self.add.si(5, 6))
    serialized = json.loads(json.dumps(c))
    deserialized = chain.from_dict(serialized)
    for task in deserialized.tasks:
        assert isinstance(task, Signature)
def test_apply(self):
    x = chain(self.add.s(4, 4), self.add.s(8), self.add.s(10))
    res = x.apply()
    assert isinstance(res, EagerResult)
    assert res.get() == 26
    assert res.parent.get() == 16
    assert res.parent.parent.get() == 8
    assert res.parent.parent.parent is None
def test_handles_dicts(self):
    c = chain(
        self.add.s(5, 5), dict(self.add.s(8)), app=self.app,
    )
    c.freeze()
    tasks, _ = c._frozen
    for task in tasks:
        assert isinstance(task, Signature)
        assert task.app is self.app
def get_dockets(options): """Download the dockets described in the CSV """ f = options['file'] reader = csv.DictReader(f) q = options['queue'] throttle = CeleryThrottle(queue_name=q) pacer_session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) pacer_session.login() for i, row in enumerate(reader): if i < options['offset']: continue if i >= options['limit'] > 0: break if i % 1000 == 0: pacer_session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) pacer_session.login() logger.info("Sent %s tasks to celery so far." % i) logger.info("Doing row %s", i) throttle.maybe_wait() chain( get_pacer_case_id_and_title.s( pass_through=None, docket_number=make_docket_number(row['filecy'], row['docket']), court_id='ilnb', cookies=pacer_session.cookies, office_number=row['office'], docket_number_letters='bk', ).set(queue=q), get_docket_by_pacer_case_id.s( court_id='ilnb', cookies=pacer_session.cookies, tag_names=[TAG], **{ 'show_parties_and_counsel': True, 'show_terminated_parties': True, 'show_list_of_member_cases': True } ).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async()
def do_ocr(options):
    """Do the OCR for any items that need it, then save to the solr index."""
    q = options['queue']
    rds = RECAPDocument.objects.filter(
        ocr_status=RECAPDocument.OCR_NEEDED,
    ).values_list('pk', flat=True).order_by()
    count = rds.count()
    throttle = CeleryThrottle(queue_name=q)
    for i, pk in enumerate(rds):
        throttle.maybe_wait()
        if options['index']:
            extract_recap_pdf.si(pk, skip_ocr=False).set(queue=q).apply_async()
        else:
            chain(
                extract_recap_pdf.si(pk, skip_ocr=False).set(queue=q),
                add_or_update_recap_document.s(coalesce_docket=True).set(queue=q),
            ).apply_async()
        if i % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery so far." % (i + 1, count))
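# A standalone sketch (task names invented) of the routing options do_ocr
# attaches with .set(): options such as queue and priority are stored on the
# signature and only take effect when the canvas is sent with apply_async();
# .apply() here just runs the steps in-process.
from celery import Celery, chain

app = Celery('sketch')

@app.task
def extract(pk, skip_ocr=False):
    return pk

@app.task
def index(pk):
    return pk

sig = extract.si(42, skip_ocr=False).set(queue='ocr', priority=5)
assert sig.options['queue'] == 'ocr'
assert sig.options['priority'] == 5

# The options survive inside a chain; a worker deployment would route each
# step to its own queue.
assert chain(sig, index.s().set(queue='indexing')).apply().get() == 42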
def populate_report_data(start_date, end_date, domain, runner, strict=True): facilities = SQLLocation.objects.filter( location_type__name='FACILITY', domain=domain, created_at__lt=end_date ).order_by('pk') non_facilities = _get_locations_by_type(domain, 'DISTRICT') non_facilities += _get_locations_by_type(domain, 'REGION') non_facilities += _get_locations_by_type(domain, 'MSDZONE') non_facilities += _get_locations_by_type(domain, 'MOHSW') if runner.location: if runner.location.location_type.name.upper() != 'FACILITY': facilities = [] non_facilities = itertools.dropwhile( lambda location: location.location_id != runner.location.location_id, non_facilities ) else: facilities = itertools.dropwhile( lambda location: location.location_id != runner.location.location_id, facilities ) facilities_chunked_list = chunked(facilities, 5) for chunk in facilities_chunked_list: res = chain(process_facility_warehouse_data.si(fac, start_date, end_date, runner) for fac in chunk)() res.get() non_facilities_chunked_list = chunked(non_facilities, 50) # then populate everything above a facility off a warehouse table for chunk in non_facilities_chunked_list: res = chain( process_non_facility_warehouse_data.si(org, start_date, end_date, runner, strict) for org in chunk )() res.get() runner.location = None runner.save()
def test_multi_tenant_move_to_target_assessment_data(self):
    ASMT_OUTCOME_FILE = self.require_file('INT_SBAC_ASMT_OUTCOME.csv')
    ASMT_FILE = self.require_file('INT_SBAC_ASMT.csv')
    self.verify_target_assessment_schema(self.guid_batch_asmt, True)
    self.load_csv_data_to_integration(ASMT_OUTCOME_FILE, ASMT_FILE,
                                      'int_sbac_asmt_outcome', 'int_sbac_asmt')
    msg = self.create_msg(Constants.LOAD_TYPE_ASSESSMENT, self.guid_batch_asmt)
    dim_tasks = get_explode_to_tables_tasks(msg, 'dim')
    fact_tasks = get_explode_to_tables_tasks(msg, 'fact_asmt')
    tasks = chain(prepare_target_schema.s(msg), dim_tasks,
                  handle_group_results.s(), fact_tasks,
                  handle_group_results.s())
    results = tasks.delay()
    results.get()
    self.verify_target_assessment_schema(self.guid_batch_asmt, False)
def update_any_missing_pacer_case_ids(options):
    """The network requests were making things far too slow and had to be
    disabled during the first pass. With this method, we update any items
    that are missing their pacer case ID value.
    """
    ds = Docket.objects.filter(
        idb_data__isnull=False,
        pacer_case_id=None,
    )
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, d in enumerate(queryset_generator(ds)):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break

        if i % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()

        throttle.maybe_wait()
        logger.info("Getting pacer_case_id for item %s", d)
        params = make_fjc_idb_lookup_params(d.idb_data)
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=d.pk,
                docket_number=d.idb_data.docket_number,
                court_id=d.idb_data.district_id,
                cookies=session.cookies,
                **params
            ).set(queue=q),
            update_docket_from_hidden_api.s().set(queue=q),
        ).apply_async()
def _using_lock(self, search_params, name, do_task, cleanup_tasks=None,
                error_tasks=None):
    """
    Applies lock for the deployment

    :return: Lock object (dictionary)
    :rtype: dict
    """
    try:
        lock = LockService().apply_lock(name)
    except ResourceLockedException as lock_error:
        raise self.retry(exc=lock_error)

    _release_lock_s = _release_lock.si(lock)
    cleanup_tasks = cleanup_tasks or []
    if not isinstance(cleanup_tasks, list):
        cleanup_tasks = [cleanup_tasks]
    error_tasks = error_tasks or []
    if not isinstance(error_tasks, list):
        error_tasks = [error_tasks]

    error_tasks.append(_release_lock_s)
    cleanup_tasks.append(_release_lock_s)

    get_store().add_event(
        EVENT_ACQUIRED_LOCK, search_params=search_params, details={
            'name': name
        })

    return (
        do_task | async_wait.s(
            default_retry_delay=TASK_SETTINGS['DEPLOYMENT_WAIT_RETRY_DELAY'],
            max_retries=TASK_SETTINGS['DEPLOYMENT_WAIT_RETRIES']
        )
    ).apply_async(
        link=chain(cleanup_tasks),
        link_error=chain(error_tasks)
    )
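# A standalone sketch (task names invented) of the link/link_error pattern
# _using_lock uses: the cleanup chain runs only if the main work succeeds, the
# error chain only if it fails. Nothing is dispatched here, since apply_async
# with callbacks needs a broker; the point is how the canvas is assembled.
from celery import Celery, chain

app = Celery('sketch')

@app.task
def do_work(name):
    return name

@app.task
def wait_for_it(result):
    return result

@app.task
def release_lock(lock_name):
    return 'released %s' % lock_name

release_s = release_lock.si('deploy-lock')
cleanup_tasks = [release_s]
error_tasks = [release_s]

canvas = do_work.si('my-app') | wait_for_it.s()
assert len(canvas.tasks) == 2

# In production this would be dispatched as:
#   canvas.apply_async(link=chain(cleanup_tasks), link_error=chain(error_tasks))
on_success = chain(cleanup_tasks)
on_failure = chain(error_tasks)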
def get_pdfs(options):
    """Get PDFs for the results of the Free Document Report queries.

    At this stage, we have rows in the PACERFreeDocumentRow table, each of
    which represents a PDF we need to download and merge into our normal
    tables: Docket, DocketEntry, and RECAPDocument.

    In this function, we iterate over the entire table of results, merge it
    into our normal tables, and then download and extract the PDF.

    :return: None
    """
    q = options['queue']
    index = options['index']
    cnt = CaseNameTweaker()
    rows = PACERFreeDocumentRow.objects.filter(error_msg="").only('pk')
    count = rows.count()
    task_name = "downloading"
    if index:
        task_name += " and indexing"
    logger.info("%s %s items from PACER." % (task_name, count))
    throttle = CeleryThrottle(queue_name=q)
    completed = 0
    for row in queryset_generator(rows):
        throttle.maybe_wait()
        if completed % 30000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
        chain(
            process_free_opinion_result.si(row.pk, cnt).set(queue=q),
            get_and_process_pdf.s(pacer_session, row.pk, index=index).set(queue=q),
            delete_pacer_row.si(row.pk).set(queue=q),
        ).apply_async()
        completed += 1
        if completed % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery for %s so "
                        "far." % (completed, count, task_name))
def get_and_merge_items(items, log): """Get the items returned from the RECAP server and merge them into CL. Items is a list of dicts like so, sorted by court, case number, document number and attachment number: [{'attachment_number': '0', 'document_number': '1', 'case_number': '186759', 'court_id': 'almb', 'is_available': '0'}, ... ] Note that all values are strings. The idea is to iterate over all of these dicts, grabbing the docket, and adding any items that have is_available = 1. """ update_log_status(log, RECAPLog.GETTING_AND_MERGING_ITEMS) tasks = [] for prev, item, nxt in previous_and_next(items): if prev is None or item['case_number'] != prev['case_number']: # New case. Get the next docket before getting any PDFs. url = get_docketxml_url(item['court_id'], item['case_number']) logger.info("New docket found at: %s" % url) filename = get_docket_filename(item['court_id'], item['case_number']) tasks.append(download_recap_item.si(url, filename, clobber=True)) # Get the document filename = get_document_filename(item['court_id'], item['case_number'], item['document_number'], item['attachment_number']) location = os.path.join(settings.MEDIA_ROOT, 'recap', filename) if not os.path.isfile(location) and int(item['is_available']): # We don't have it yet, and it's available to get. Get it! url = get_pdf_url(item['court_id'], item['case_number'], filename) tasks.append(download_recap_item.si(url, filename)) if nxt is None or item['case_number'] != nxt['case_number']: # Last item in the case. Send for processing. if len(tasks) > 0: logger.info("Sending %s tasks for processing." % len(tasks)) filename = get_docket_filename(item['court_id'], item['case_number']) chord(tasks)(chain( parse_recap_docket.si(filename, debug=False), extract_recap_pdf.s().set(priority=5), add_or_update_recap_document.s(coalesce_docket=True), )) tasks = [] logger.info("Finished queueing new cases.")
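# A standalone sketch (task names invented) of the chord-with-chain-callback
# shape used in get_and_merge_items: a header of download tasks and a chain as
# the chord body that runs once every download has finished. Only the
# structure is shown; actually executing a chord needs a result backend.
from celery import Celery, chain, chord

app = Celery('sketch')

@app.task
def download(url):
    return url

@app.task
def parse(filename):
    return filename

@app.task
def extract_pdf(result):
    return result

header = [download.si('u1'), download.si('u2')]
body = chain(parse.si('docket.xml'), extract_pdf.s())

# chord(header)(body) dispatches the header and registers the chain as the
# callback; it is equivalent to chord(header, body).apply_async().
workflow = chord(header, body)
print(workflow)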
def get_petitions(options):
    """Just get document number one for every docket that's tagged in this
    collection.
    """
    rds = RECAPDocument.objects.filter(
        tags__name=TAG,
        document_number='1',
        document_type=RECAPDocument.PACER_DOCUMENT,
    ).exclude(
        pacer_doc_id='',
    ).order_by('pk').values_list('pk', flat=True).iterator()
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = PacerSession(username=PACER_USERNAME,
                                 password=PACER_PASSWORD)
    pacer_session.login()
    for i, rd_pk in enumerate(rds):
        if i < options['offset']:
            i += 1
            continue
        if i >= options['limit'] > 0:
            break

        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        logger.info("Doing row %s", i)
        throttle.maybe_wait()
        chain(
            get_pacer_doc_by_rd.s(
                rd_pk, pacer_session.cookies, tag=TAG_PETITIONS).set(queue=q),
            extract_recap_pdf.si(rd_pk).set(queue=q),
            add_items_to_solr.si([rd_pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
def test_group_to_chord__protocol_2(self):
    c = chain(
        group([self.add.s(i, i) for i in range(5)], app=self.app),
        self.add.s(10),
        self.add.s(20),
        self.add.s(30)
    )
    assert isinstance(c, chord)
    assert isinstance(c.body, _chain)
    assert len(c.body.tasks) == 3

    c2 = self.add.s(2, 2) | group(self.add.s(i, i) for i in range(10))
    c2._use_link = False
    tasks2, _ = c2.prepare_steps((), c2.tasks)
    assert isinstance(tasks2[0], group)
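# A standalone sketch of the upgrade test_group_to_chord__protocol_2 relies
# on: with the Celery version these tests target, chaining a group into
# following tasks produces a chord, with the remaining tasks collected into
# the chord body. The add task is invented; nothing is executed.
from celery import Celery, chain, chord, group

app = Celery('sketch')

@app.task
def add(x, y):
    return x + y

c = chain(group(add.s(i, i) for i in range(3)), add.s(10), add.s(20), add.s(30))
assert isinstance(c, chord)        # the group became the chord header
assert len(c.body.tasks) == 3      # add.s(10), add.s(20), add.s(30) form the body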
def start_extract(tenant, request_id, archive_file_names, directories_to_archive,
                  registration_ids, tasks, queue=None):
    '''
    Entry point to start an extract request for one or more extract tasks.
    It groups the generation of CSVs into a celery task group and then chains
    it to the next task to archive the files into one zip.
    '''
    workflow = chain(generate_prepare_path_task(request_id, archive_file_names,
                                                directories_to_archive,
                                                queue_name=queue),
                     generate_extract_file_tasks(tenant, request_id, tasks,
                                                 queue_name=queue),
                     extract_group_separator.subtask(immutable=True),  # @UndefinedVariable
                     generate_archive_file_tasks(request_id, archive_file_names,
                                                 directories_to_archive,
                                                 queue_name=queue),
                     extract_group_separator.subtask(immutable=True),  # @UndefinedVariable
                     generate_remote_copy_tasks(request_id, archive_file_names,
                                                registration_ids,
                                                queue_name=queue),
                     extract_group_separator.subtask(immutable=True),  # @UndefinedVariable
                     clean_up.subtask(args=[get_extract_request_base_path(tenant, request_id)],
                                      queue_name=queue, immutable=True))  # @UndefinedVariable
    workflow.apply_async()
def _start_bulk(archive_file_path, directory_to_archive, registration_id, gen_tasks,
                merge_tasks, cover_tasks, merge_covers_tasks, pdf_base_dir):
    '''
    Entry point to start a bulk PDF generation request for one or more students.
    It groups the generation of individual PDFs into a celery task group and then
    chains it to the next tasks to merge the files into one PDF, archive the PDF
    into a zip, and upload the zip to HPZ.
    '''
    workflow = chain(group(gen_tasks),
                     group_separator.subtask(immutable=True),
                     group(merge_tasks),
                     group_separator.subtask(immutable=True),
                     group(cover_tasks),
                     group_separator.subtask(immutable=True),
                     group(merge_covers_tasks),
                     archive.subtask(args=(archive_file_path, directory_to_archive),
                                     immutable=True),
                     hpz_upload_cleanup.subtask(args=(archive_file_path,
                                                      registration_id,
                                                      pdf_base_dir),
                                                immutable=True))
    workflow.apply_async()
def stock_data_task(api_object):
    # checkpoint logic
    start_date = datetime.today()
    default_api = api_object.apis[0][0]
    checkpoint, _ = StockDataCheckpoint.objects.get_or_create(
        domain=api_object.domain,
        defaults={
            'api': default_api,
            'date': None,
            'limit': 1000,
            'offset': 0,
            'location': None,
            'start_date': start_date
        })

    if not checkpoint.start_date:
        checkpoint.start_date = start_date
        checkpoint.save()

    if not api_object.all_stock_data:
        facilities = api_object.test_facilities
    else:
        facilities = api_object.get_ids()

    if checkpoint.location:
        external_id = api_object.get_last_processed_location(checkpoint)
        if external_id:
            facilities = list(itertools.dropwhile(
                lambda x: int(x) != int(external_id), facilities))
            process_facility_task(api_object, facilities[0],
                                  start_from=checkpoint.api)
            facilities = facilities[1:]

    if not checkpoint.date:
        # use subtasks only during initial migration
        facilities_chunked_list = chunked(facilities, 5)
        for chunk in facilities_chunked_list:
            res = chain(process_facility_task.si(api_object, fac)
                        for fac in chunk)()
            res.get()
    else:
        for facility in facilities:
            process_facility_task(api_object, facility)

    checkpoint = StockDataCheckpoint.objects.get(domain=api_object.domain)
    save_stock_data_checkpoint(checkpoint, default_api, 1000, 0, start_date,
                               None, False)
    checkpoint.start_date = None
    checkpoint.save()
def test_kwargs_apply(self):
    x = chain(self.add.s(), self.add.s(8), self.add.s(10))
    res = x.apply(kwargs={'x': 1, 'y': 1}).get()
    assert res == 20
def test_from_dict_no_tasks(self):
    assert chain.from_dict(dict(chain(app=self.app)), app=self.app)
def test_app_falls_back_to_default(self):
    from celery._state import current_app
    assert chain().app is current_app
def test_chord_size_nested_implicit_chain_chain_single(self):
    sig = chord([chain(self.add.s())])
    assert sig.__length_hint__() == 1
def test_from_dict_full_subtasks(self):
    c = chain(self.add.si(1, 2), self.add.si(3, 4), self.add.si(5, 6))
    serialized = json.loads(json.dumps(c))
    deserialized = chain.from_dict(serialized)
    assert all(isinstance(task, Signature) for task in deserialized.tasks)
def test_call_no_tasks(self):
    x = chain()
    assert not x()
def test_chord_size_chain_many(self):
    # Chains get flattened into the encapsulating chord, so even though the
    # chain would only count for 1, the tasks pulled into the chord's header
    # are counted as a bunch of simple signature objects.
    sig = chord(chain([self.add.s()] * 42))
    assert sig.__length_hint__() == 42
def get_and_save_free_document_reports(options): """Query the Free Doc Reports on PACER and get a list of all the free documents. Do not download those items, as that step is done later. """ # Kill any *old* logs that report they're in progress. (They've failed.) twelve_hrs_ago = now() - timedelta(hours=12) PACERFreeDocumentLog.objects.filter( date_started__lt=twelve_hrs_ago, status=PACERFreeDocumentLog.SCRAPE_IN_PROGRESS, ).update(status=PACERFreeDocumentLog.SCRAPE_FAILED, ) cl_court_ids = Court.objects.filter( jurisdiction__in=[Court.FEDERAL_DISTRICT, Court.FEDERAL_BANKRUPTCY], in_use=True, end_date=None, ).exclude(pk__in=[ 'casb', 'ganb', 'gub', 'innb', 'mieb', 'miwb', 'nmib', 'nvb', 'ohsb', 'prb', 'tnwb', 'vib' ], ).values_list( 'pk', flat=True, ) pacer_court_ids = { map_cl_to_pacer_id(v): { 'until': now(), 'count': 1, 'result': None } for v in cl_court_ids } pacer_session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) pacer_session.login() # Iterate over every court, X days at a time. As courts are completed, # remove them from the list of courts to process until none are left tomorrow = now() + timedelta(days=1) while len(pacer_court_ids) > 0: court_ids_copy = pacer_court_ids.copy() # Make a copy of the list. for pacer_court_id, delay in court_ids_copy.items(): if now() < delay['until']: # Do other courts until the delay is up. Do not print/log # anything since at the end there will only be one court left. continue next_start_date, next_end_date = get_next_date_range( pacer_court_id) if delay['result'] is not None: if delay['result'].ready(): result = delay['result'].get() if result == PACERFreeDocumentLog.SCRAPE_SUCCESSFUL: if next_start_date >= tomorrow.date(): logger.info("Finished '%s'. Marking it complete." % pacer_court_id) pacer_court_ids.pop(pacer_court_id, None) continue elif result == PACERFreeDocumentLog.SCRAPE_FAILED: logger.error("Encountered critical error on %s " "(network error?). Marking as failed and " "pressing on." % pacer_court_id) pacer_court_ids.pop(pacer_court_id, None) continue else: next_delay = min(delay['count'] * 5, 30) # backoff w/cap logger.info( "Court %s still in progress. Delaying at least " "%ss." % (pacer_court_id, next_delay)) pacer_court_ids[pacer_court_id]['until'] = now( ) + timedelta(seconds=next_delay) pacer_court_ids[pacer_court_id]['count'] += 1 continue mark_court_in_progress(pacer_court_id, next_end_date) pacer_court_ids[pacer_court_id]['count'] = 1 # Reset delay['result'] = chain( get_and_save_free_document_report.si(pacer_court_id, next_start_date, next_end_date, pacer_session), mark_court_done_on_date.s(pacer_court_id, next_end_date), ).apply_async()
def test_chain_tasks(self):
    sigs = [celery_canvas.Signature() for r in range(4)]
    chain = celery_canvas.chain(sigs)
    tasks = inspect.get_chain_tasks(chain)
    assert sigs == tasks
def get_dockets(options, items, tags, sample_size=0, doc_num_end=""): """Download dockets from PACER. :param options: Options provided by argparse :param items: Items from our FJC IDB database :param tags: A list of tag names to associate with the purchased content. :param sample_size: The number of items to get. If 0, get them all. Else, get only this many and do it randomly. :param doc_num_end: Only get docket numbers up to this value to constrain costs. If set to an empty string, no constraints are applied. Note that applying this value means no unnumbered entries will be retrieved by PACER. """ if sample_size > 0: items = items.order_by("?")[:sample_size] q = options["queue"] throttle = CeleryThrottle(queue_name=q) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() for i, row in enumerate(items): if i < options["offset"]: continue if i >= options["limit"] > 0: break if i % 5000 == 0: # Re-authenticate just in case the auto-login mechanism isn't # working. session = PacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) session.login() # All tests pass. Get the docket. logger.info("Doing row %s: %s", i, row) throttle.maybe_wait() params = make_fjc_idb_lookup_params(row) chain( get_pacer_case_id_and_title.s( pass_through=None, docket_number=row.docket_number, court_id=row.district_id, cookies=session.cookies, **params ).set(queue=q), filter_docket_by_tags.s(tags, row.district_id).set(queue=q), get_docket_by_pacer_case_id.s( court_id=row.district_id, cookies=session.cookies, tag_names=tags, **{ "show_parties_and_counsel": True, "show_terminated_parties": True, "show_list_of_member_cases": False, "doc_num_end": doc_num_end, } ).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async()
def crawl_xml(merchant_name, location):
    uri = get_rss_feed_uri(merchant_name, location)
    # Build a signature for the first step rather than calling the task
    # directly, and dispatch the chain so each step's result feeds the next.
    chain(parse_rss.s(uri), extract_data.s(), persist_data.s()).apply_async()
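# A standalone sketch (task names invented) of the distinction crawl_xml
# depends on: parse_rss(uri) would run the task immediately and chain its
# *return value*, whereas parse_rss.s(uri) builds a signature the chain runs
# later. .apply() executes the chain eagerly, with no broker.
from celery import Celery, chain

app = Celery('sketch')

@app.task
def parse_rss(uri):
    return [uri]

@app.task
def extract_data(entries):
    return len(entries)

sig = parse_rss.s('http://example.com/feed.xml')   # deferred: nothing runs yet
assert chain(sig, extract_data.s()).apply().get() == 1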
def _get_analysis_update_tasks(analysis_id) -> List: """ Runs update tasks on nodes that have status=DIRTY """ tasks = [] analysis = Analysis.objects.get(pk=analysis_id) node_ids = analysis.analysisnode_set.all().values_list("pk", flat=True) edges = AnalysisEdge.objects.filter(parent__analysis=analysis).values_list("parent", "child") all_nodes_graph = nx.DiGraph() all_nodes_graph.add_nodes_from(node_ids) all_nodes_graph.add_edges_from(edges) logging.info("-" * 60) for connected_components in nx.weakly_connected_components(all_nodes_graph): sub_graph = all_nodes_graph.subgraph(connected_components) sub_graph_node_ids = list(sub_graph) # nx.topological_sort returns a flattened list, ie doesn’t break into groups which can run in parallel # so use other toposort library # We need a way to lock/claim the nodes - so someone else calling get_analysis_update_task() # doesn't also launch update tasks for them. sub_graph_nodes_qs = analysis.analysisnode_set.filter(pk__in=sub_graph_node_ids) analysis_update_uuid = uuid.uuid4() node_task_records = [] logging.info("Dirty nodes:") for node_id, version in sub_graph_nodes_qs.filter(status=NodeStatus.DIRTY).values_list("pk", "version"): node_task = NodeTask(node_id=node_id, version=version, analysis_update_uuid=analysis_update_uuid) logging.info(node_task) node_task_records.append(node_task) if node_task_records: NodeTask.objects.bulk_create(node_task_records, ignore_conflicts=True) # Return the ones we got the lock for node_tasks = NodeTask.objects.filter(analysis_update_uuid=analysis_update_uuid) node_versions_to_update = dict(node_tasks.values_list("node_id", "version")) logging.info(f"Got lock for: {node_versions_to_update}") groups = [] if node_versions_to_update: parent_value_data = defaultdict(set) for parent, child_list in nx.to_dict_of_lists(sub_graph).items(): for child_node_id in child_list: parent_value_data[child_node_id].add(parent) nodes_by_id = get_nodes_by_id(sub_graph_nodes_qs.select_subclasses()) dependencies = _get_node_dependencies(nodes_by_id, parent_value_data) topo_sorted = get_toposorted_nodes_from_parent_value_data(nodes_by_id, parent_value_data) # Ensure cache loading tasks are only triggered once. Cache can come from different toposort level/groups # eg MergeNode asks parent Venn to cache (which is was already doing) all_cache_jobs = set() for grp in topo_sorted: group_cache_jobs = _add_jobs_for_group(node_versions_to_update, dependencies, grp, groups, all_cache_jobs) all_cache_jobs.update(group_cache_jobs) # Need to only set where version matches what we got lock for (as it may have updated) node_version_q_list = [] for node_id, version in node_versions_to_update.items(): node_version_q_list.append(Q(pk=node_id) & Q(version=version)) q_node_version = reduce(operator.or_, node_version_q_list) analysis.analysisnode_set.filter(q_node_version).update(status=NodeStatus.QUEUED) if groups: t = chain(groups) tasks.append(t) return tasks
def do_pacer_fetch(fq): """Process a request made by a user to get an item from PACER. :param fq: The PacerFetchQueue item to process :return: None """ c = None if fq.request_type == REQUEST_TYPE.DOCKET: # Request by docket_id court_id = fq.court_id or getattr(fq.docket, "court_id", None) kwargs = { # Universal params "court_id": court_id, "user_pk": fq.user_id, "docket_pk": fq.docket_id, # Scraping params "doc_num_start": fq.de_number_start, "doc_num_end": fq.de_number_end, "date_start": fq.de_date_start, "date_end": fq.de_date_end, "show_parties_and_counsel": fq.show_parties_and_counsel, "show_terminated_parties": fq.show_terminated_parties, "show_list_of_member_cases": fq.show_list_of_member_cases, } if (fq.docket_id and not fq.docket.pacer_case_id) or fq.docket_number: # We lack the pacer_case_id either on the docket or from the # submission. Look it up. docket_number = fq.docket_number or getattr( fq.docket, "docket_number", None ) c = chain( get_pacer_case_id_and_title.si( pass_through=None, docket_number=docket_number, court_id=court_id, user_pk=fq.user_id, ), get_docket_by_pacer_case_id.s(**kwargs), ) else: if fq.docket_id is not None and fq.docket.pacer_case_id: # We have the docket and its pacer_case_id kwargs.update( { "data": {"pacer_case_id": fq.docket.pacer_case_id}, "court_id": fq.docket.court_id, } ) elif fq.pacer_case_id: # We lack the docket, but have a pacer_case_id kwargs.update( {"data": {"pacer_case_id": fq.pacer_case_id},} ) c = chain(get_docket_by_pacer_case_id.si(**kwargs)) c |= add_or_update_recap_docket.s() elif fq.request_type == REQUEST_TYPE.PDF: # Request by recap_document_id rd_pk = fq.recap_document_id if fq.recap_document_id: c = chain( fetch_pacer_doc_by_rd.si(rd_pk, fq.pk, fq.user_id), extract_recap_pdf.si(rd_pk), add_items_to_solr.si([rd_pk], "search.RECAPDocument"), ) if c is not None: c |= mark_fq_successful.si(fq.pk) c.apply_async() else: # Somehow failed to make a chain. Log an error. fq.status = PROCESSING_STATUS.INVALID_CONTENT fq.message = "Invalid submission, unable to make chain for processing." fq.save()
def get_dockets(options): """Download the dockets described in the CSV according to the `tasks` option. """ f = options['file'] reader = csv.DictReader(f) q = options['queue'] task = options['task'] throttle = CeleryThrottle(queue_name=q) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() for i, row in enumerate(reader): if i < options['offset']: continue if i >= options['limit'] > 0: break if row['Too Old'] == 'Yes': continue if row['Appellate/District'].lower() != task: # Only do appellate when appellate, and district when district. continue # All tests pass. Get the docket. logger.info("Doing row %s: %s", i, row) throttle.maybe_wait() if task == 'appellate': chain( get_appellate_docket_by_docket_number.s( docket_number=row['Cleaned case_No'], court_id=row['fjc_court_id'], cookies=session.cookies, tag_names=[TAG], **{ 'show_docket_entries': True, 'show_orig_docket': True, 'show_prior_cases': True, 'show_associated_cases': True, 'show_panel_info': True, 'show_party_atty_info': True, 'show_caption': True, }).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async() elif task == 'district': chain( get_pacer_case_id_and_title.s( pass_through=None, docket_number=row['Cleaned case_No'], court_id=row['fjc_court_id'], cookies=session.cookies, case_name=row['Title'], ).set(queue=q), get_docket_by_pacer_case_id.s(court_id=row['fjc_court_id'], cookies=session.cookies, tag_names=[TAG], **{ 'show_parties_and_counsel': True, 'show_terminated_parties': True, 'show_list_of_member_cases': True }).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async()
def test_single_expresion(self):
    x = chain(self.add.s(1, 2)).apply()
    assert x.get() == 3
    assert x.parent is None
def handle(self, *args, **options): super(Command, self).handle(*args, **options) if options['sweep'] is False: # Only allow one script at a time per court combination. # Note that multiple scripts on multiple machines could still be # run. court_str = '-'.join(sorted(options['courts'])) with open('/tmp/rss-scraper-%s.pid' % court_str, 'w') as fp: try: fcntl.lockf(fp, fcntl.LOCK_EX | fcntl.LOCK_NB) except IOError: print("Another instance of this program is running with " "for this combination of courts. Only one instance " "can crawl these courts at a time: '%s'" % court_str) sys.exit(1) # Loop over the PACER sites that have RSS feeds and see if they're # ready to do. courts = Court.objects.filter( jurisdiction__in=[ Court.FEDERAL_BANKRUPTCY, Court.FEDERAL_DISTRICT, ], pacer_has_rss_feed=True, ) if options['courts'] != ['all']: courts = courts.filter(pk__in=options['courts']) iterations_completed = 0 last_trim_date = None while options['iterations'] == 0 or \ iterations_completed < options['iterations']: for court in courts: # Check the last time we successfully got the feed try: feed_status = RssFeedStatus.objects.filter( court=court, is_sweep=options['sweep'], status__in=[ RssFeedStatus.PROCESSING_SUCCESSFUL, RssFeedStatus.UNCHANGED, RssFeedStatus.PROCESSING_IN_PROGRESS, ] ).latest('date_created') except RssFeedStatus.DoesNotExist: # First time running it or status items have been nuked by # an admin. Make a dummy object, but no need to actually # save it to the DB. Make it old. lincolns_birthday = make_aware(datetime(1809, 2, 12)) feed_status = RssFeedStatus( date_created=lincolns_birthday, date_last_build=lincolns_birthday, is_sweep=options['sweep'], ) if options['courts'] == ['all'] and options['sweep'] is False: # If it's all courts and it's not a sweep, check if we did # it recently. max_visit_ago = now() - timedelta( seconds=self.RSS_MAX_VISIT_FREQUENCY) if feed_status.date_created > max_visit_ago: # Processed too recently. Try next court. continue # Give a court some time to complete during non-sweep crawls processing_cutoff = now() - timedelta( seconds=self.RSS_MAX_PROCESSING_DURATION) if all([ options['sweep'] is False, feed_status.status == RssFeedStatus.PROCESSING_IN_PROGRESS, feed_status.date_created > processing_cutoff ]): continue # The court is ripe! Crawl it if it has changed. # Make a new object to track the attempted crawl. new_status = RssFeedStatus.objects.create( court_id=court.pk, status=RssFeedStatus.PROCESSING_IN_PROGRESS, is_sweep=options['sweep'], ) # Check if the item needs crawling, and crawl it if so. chain( check_if_feed_changed.s(court.pk, new_status.pk, feed_status.date_last_build), merge_rss_feed_contents.s(court.pk, new_status.pk), send_docket_alerts.s(), # Update recap *documents*, not *dockets*. Updating dockets # requires much more work, and we don't expect to get much # docket information from the RSS feeds. RSS feeds also # have information about hundreds or thousands of # dockets. Updating them all would be very bad. add_items_to_solr.s('search.RECAPDocument'), mark_status_successful.si(new_status.pk), ).apply_async() # Trim if not too recently trimmed. trim_cutoff_date = now() - timedelta( seconds=self.DELAY_BETWEEN_CACHE_TRIMS) if last_trim_date is None or trim_cutoff_date > last_trim_date: trim_rss_cache.delay() last_trim_date = now() # Wait, then attempt the courts again if iterations not exceeded. iterations_completed += 1 time.sleep(self.DELAY_BETWEEN_ITERATIONS)
def test_empty_chain_returns_none(self):
    assert chain(app=self.app)() is None
    assert chain(app=self.app).apply_async() is None
def test_empty_chain_returns_none(self):
    self.assertIsNone(chain(app=self.app)())
    self.assertIsNone(chain(app=self.app).apply_async())
def test_chord_size_chain_single(self):
    sig = chord(chain(self.add.s()))
    assert sig.__length_hint__() == 1
def test_accepts_generator_argument(self):
    x = chain(self.add.s(i) for i in range(10))
    self.assertIs(x.tasks[0].type, self.add)
    self.assertTrue(x.type)
def test_chord_size_nested_chain_chain_many(self):
    # The outer chain will be pulled up into the chord but the lower one
    # remains and will only count as a single final element
    sig = chord(chain(chain([self.add.s()] * 42)))
    assert sig.__length_hint__() == 1
def test_app_falls_back_to_default(self):
    from celery._state import current_app
    self.assertIs(chain().app, current_app)
def test_chord_size_nested_implicit_chain_chain_many(self):
    sig = chord([chain([self.add.s()] * 42)])
    assert sig.__length_hint__() == 1
def get_attachment_pages(options): """Find docket entries that look like invoices and get their attachment pages. """ page_size = 100 main_query = build_main_query_from_query_string( Q_DOCS_ONLY, { "rows": page_size, "fl": ["id", "docket_id"] }, { "group": False, "facet": False, "highlight": False }, ) si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r") results = si.query().add_extra(**main_query) si.conn.http_connection.close() q = options["queue"] recap_user = User.objects.get(username="******") throttle = CeleryThrottle(queue_name=q, min_items=options["queue_length"]) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() paginator = Paginator(results, page_size) i = 0 for page_number in range(1, paginator.num_pages + 1): paged_results = paginator.page(page_number) for result in paged_results.object_list: if i < options["offset"]: i += 1 continue if i >= options["limit"] > 0: break logger.info( "Doing row %s: rd: %s, docket: %s", i, result["id"], result["docket_id"], ) throttle.maybe_wait() chain( # Query the attachment page and process it get_attachment_page_by_rd.s(result["id"], session.cookies).set(queue=q), # Take that in a new task and make a PQ object make_attachment_pq_object.s(result["id"], recap_user.pk).set(queue=q), # And then process that using the normal machinery. process_recap_attachment.s(tag_names=[TAG_PHASE_1]).set(queue=q ), ).apply_async() i += 1 else: # Inner loop exited normally (didn't "break") continue # Inner loop broke. Break outer loop too. break
def get_documents(options): """Download documents from PACER if we don't already have them.""" q = options["queue"] throttle = CeleryThrottle(queue_name=q) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() page_size = 20000 main_query = build_main_query_from_query_string( QUERY_STRING, { "rows": page_size, "fl": ["id", "docket_id"] }, { "group": False, "facet": False, "highlight": False }, ) si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r") results = si.query().add_extra(**main_query).execute() logger.info("Got %s search results.", results.result.numFound) for i, result in enumerate(results): if i < options["offset"]: continue if i >= options["limit"] > 0: break throttle.maybe_wait() logger.info( "Doing item %s w/rd: %s, d: %s", i, result["id"], result["docket_id"], ) try: rd = RECAPDocument.objects.get(pk=result["id"]) except RECAPDocument.DoesNotExist: logger.warn("Unable to find RECAP Document with id %s", result["id"]) continue if rd.is_available: logger.info("Already have pk %s; just tagging it.", rd.pk) add_tags(rd, TAG) continue if not rd.pacer_doc_id: logger.info("Unable to find pacer_doc_id for: %s", rd.pk) continue chain( get_pacer_doc_by_rd.s(rd.pk, session.cookies, tag=TAG).set(queue=q), extract_recap_pdf.si(rd.pk).set(queue=q), add_items_to_solr.si([rd.pk], "search.RECAPDocument").set(queue=q), ).apply_async()
def test_chain_of_chain_with_a_single_task(self):
    s = self.add.s(1, 1)
    assert chain([chain(s)]).tasks == list(chain(s).tasks)
def test_call_no_tasks(self):
    x = chain()
    self.assertFalse(x())
def test_clone_preserves_state(self):
    x = chain(self.add.s(i, i) for i in range(10))
    assert x.clone().tasks == x.tasks
    assert x.clone().kwargs == x.kwargs
    assert x.clone().args == x.args
def download_documents(options): """We've got good values in the new columns, so just need to look those up, and get the documents from PACER. """ f = open(options['input_file'], 'r') dialect = csv.Sniffer().sniff(f.read(1024)) f.seek(0) reader = csv.DictReader(f, dialect=dialect) q = options['queue'] throttle = CeleryThrottle(queue_name=q, min_items=options['queue_length']) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() for i, row in enumerate(reader): if i < options['offset']: continue if i >= options['limit'] > 0: break throttle.maybe_wait() logger.info("Doing row %s: %s", i, row) docket_number = row['cl_d_docket_number'] or \ row['cl_d_docket_number (student)'] or \ None if not docket_number: logger.warn("No docket number found for row: %s", i) continue court = Court.objects.get(fjc_court_id=row['AO ID'].rjust(2, '0'), jurisdiction=Court.FEDERAL_DISTRICT) try: d = Docket.objects.get(docket_number=docket_number, court=court) except Docket.MultipleObjectsReturned: logger.warn("Multiple objects returned for row: %s", i) continue except Docket.DoesNotExist: logger.warn("Could not find docket for row: %s", i) continue # Got the docket, now get the documents from it, tag & OCR them. document_date = datetime.strptime(row['Date'], '%m/%d/%Y').date() des = d.docket_entries.filter(date_filed=document_date) count = des.count() if count == 0: logger.warn("No docket entries found for row: %s", i) continue elif des.count() == 1: good_des = [des[0]] else: # More than one item. Apply filtering rules. good_des = filter_des(des) # We've got our des, now download them. for de in good_des: rds = de.recap_documents.filter( document_type=RECAPDocument.PACER_DOCUMENT) for rd in rds: if not rd.pacer_doc_id: logger.warn( "Unable to get pacer_doc_id for item with " "rd_pk: %s. Restricted document?", rd.pk) continue if options['task'] == 'add_extra_tags': # Wherein I belatedly realize we need a tag specifically # for this part of the project. add_tags(rd, TAG_NAME_OPINIONS) else: # Otherwise, do the normal download thing. chain( get_pacer_doc_by_rd.s(rd.pk, session.cookies, tag=TAG_NAME).set(queue=q), extract_recap_pdf.si(rd.pk).set(queue=q), add_or_update_recap_document.si([rd.pk]).set(queue=q), ).apply_async() f.close()
def get_gets_chain(app: Celery, task: models.Task, kwargs: dict, params: dict) -> chain:
    signatures = [
        app.signature(JobNames.get_action, kwargs=kwargs, **params)
        for _ in range(task.gets)
    ]
    return chain(*signatures)
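# A standalone sketch (task name invented) of the chain(*signatures) pattern
# in get_gets_chain: N copies of a signature are unpacked into one chain, so
# the steps run one after another, each receiving the previous result.
# .apply() runs the chain eagerly, with no broker.
from celery import Celery, chain

app = Celery('sketch')

@app.task
def get_action(acc=0, step=1):
    return acc + step

signatures = [get_action.s(step=1) for _ in range(3)]
result = chain(*signatures).apply()
assert result.get() == 3        # each step adds 1 to the previous result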