def get_final_docs(options):
    """Get any documents that contain "final" in their description."""
    des = (
        DocketEntry.objects.filter(
            tags__name=TAG, description__icontains="final"
        )
        .order_by("pk")
        .iterator()
    )
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = PacerSession(
        username=PACER_USERNAME, password=PACER_PASSWORD
    )
    pacer_session.login()
    for i, de in enumerate(des):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        if i % 1000 == 0:
            # Re-authenticate every 1,000 rows so the PACER session doesn't
            # expire partway through a long run.
            pacer_session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            pacer_session.login()
            logger.info(f"Sent {i} tasks to celery so far.")
        logger.info("Doing row %s", i)
        rd_pks = (
            de.recap_documents.filter(
                document_type=RECAPDocument.PACER_DOCUMENT,
            )
            .exclude(pacer_doc_id="")
            .order_by("pk")
            .values_list("pk", flat=True)
        )
        for rd_pk in rd_pks:
            throttle.maybe_wait()
            chain(
                get_pacer_doc_by_rd.s(
                    rd_pk, pacer_session.cookies, tag=TAG_FINALS
                ).set(queue=q),
                extract_recap_pdf.si(rd_pk).set(queue=q),
                add_items_to_solr.si(
                    [rd_pk], "search.RECAPDocument"
                ).set(queue=q),
            ).apply_async()
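# A minimal sketch of driving get_final_docs, assuming it is wired into a
# Django management command. The function only reads the "queue", "offset",
# and "limit" keys from options; the argument defaults shown here are
# illustrative, not the project's actual ones.
def add_arguments(parser):
    parser.add_argument("--queue", default="batch1",
                        help="Celery queue to send tasks to.")
    parser.add_argument("--offset", type=int, default=0,
                        help="Skip this many rows before enqueueing tasks.")
    parser.add_argument("--limit", type=int, default=0,
                        help="Stop after this many rows; 0 means no limit.")

# e.g.: get_final_docs({"queue": "batch1", "offset": 0, "limit": 0})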
def do_pacer_fetch(fq):
    """Process a request made by a user to get an item from PACER.

    :param fq: The PacerFetchQueue item to process
    :return: The celery result, or None if no task was dispatched
    """
    result = None
    if fq.request_type == REQUEST_TYPE.DOCKET:
        # Request by docket_id
        c = chain(
            fetch_docket.si(fq.pk),
            add_or_update_recap_docket.s(),
            mark_fq_successful.si(fq.pk),
        )
        result = c.apply_async()
    elif fq.request_type == REQUEST_TYPE.PDF:
        # Request by recap_document_id
        rd_pk = fq.recap_document_id
        result = chain(
            fetch_pacer_doc_by_rd.si(rd_pk, fq.pk),
            extract_recap_pdf.si(rd_pk),
            add_items_to_solr.si([rd_pk], "search.RECAPDocument"),
            mark_fq_successful.si(fq.pk),
        ).apply_async()
    elif fq.request_type == REQUEST_TYPE.ATTACHMENT_PAGE:
        result = fetch_attachment_page.apply_async(args=(fq.pk,))
    return result
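# A hedged sketch of how do_pacer_fetch might be driven for a batch of
# pending requests. PROCESSING_STATUS.ENQUEUED is an assumption about the
# status constants; the function itself only needs the PacerFetchQueue rows.
def process_pending_fetches():
    pending = PacerFetchQueue.objects.filter(
        status=PROCESSING_STATUS.ENQUEUED
    )
    for fq in pending:
        result = do_pacer_fetch(fq)  # AsyncResult, or None if nothing matched
        if result is None:
            logger.warning("No tasks dispatched for fq %s", fq.pk)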
def save(self, do_extraction=True, index=True, *args, **kwargs):
    if self.document_type == self.ATTACHMENT:
        if self.attachment_number is None:
            raise ValidationError(
                'attachment_number cannot be null for an attachment.')

    if self.pacer_doc_id == '':
        # Normally a char field would never have a null value, opting
        # instead for a blank value. However, blanks are not considered
        # unique while nulls are, so for this field we reset it to null
        # whenever it would normally be blank.
        # http://stackoverflow.com/a/3124586/64911
        self.pacer_doc_id = None

    super(RECAPDocument, self).save(*args, **kwargs)
    tasks = []
    if do_extraction and self.needs_extraction:
        # Content extraction has not been done yet and is requested.
        from cl.scrapers.tasks import extract_recap_pdf
        tasks.append(extract_recap_pdf.si(self.pk))
    if index:
        from cl.search.tasks import add_or_update_recap_document
        tasks.append(add_or_update_recap_document.si([self.pk], False))
    if len(tasks) > 0:
        chain(*tasks)()
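# A usage sketch for the save() override above: when bulk-importing, it can
# be cheaper to defer per-row extraction and indexing, then index once at the
# end. The rows iterable and field layout are hypothetical.
from cl.search.tasks import add_or_update_recap_document

def bulk_import(rows):
    pks = []
    for fields in rows:
        rd = RECAPDocument(**fields)
        rd.save(do_extraction=False, index=False)  # skip the task chain
        pks.append(rd.pk)
    # One indexing task for the whole batch instead of one chain per row.
    add_or_update_recap_document.si(pks, False).apply_async()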
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME,
                           password=PACER_PASSWORD)
    session.login()

    page_size = 20000
    main_query = build_main_query_from_query_string(
        QUERY_STRING,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False, 'highlight': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing item %s w/rd: %s, d: %s", i, result['id'],
                    result['docket_id'])

        try:
            rd = RECAPDocument.objects.get(pk=result['id'])
        except RECAPDocument.DoesNotExist:
            logger.warning("Unable to find RECAP Document with id %s",
                           result['id'])
            continue

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG)
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk],
                                 'search.RECAPDocument').set(queue=q),
        ).apply_async()
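# The guard `if i >= options['limit'] > 0` above relies on Python's chained
# comparisons: it is equivalent to `i >= limit and limit > 0`, so the
# conventional "no limit" value of 0 never triggers the break. A quick
# self-contained check:
def should_stop(i, limit):
    return i >= limit > 0

assert should_stop(5, 5) is True   # reached the limit
assert should_stop(5, 0) is False  # limit of 0 means "no limit"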
def do_ocr(options):
    """Do the OCR for any items that need it, then save to the solr index."""
    q = options['queue']
    rds = RECAPDocument.objects.filter(
        ocr_status=RECAPDocument.OCR_NEEDED,
    ).values_list('pk', flat=True).order_by()
    count = rds.count()
    throttle = CeleryThrottle(queue_name=q)
    for i, pk in enumerate(rds):
        throttle.maybe_wait()
        if options['index']:
            extract_recap_pdf.si(
                pk, skip_ocr=False,
            ).set(queue=q).apply_async()
        else:
            chain(
                extract_recap_pdf.si(pk, skip_ocr=False).set(queue=q),
                add_or_update_recap_document.s(
                    coalesce_docket=True,
                ).set(queue=q),
            ).apply_async()
        if i % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery so far.", i + 1, count)
def ocr_available(options: OptionsType) -> None:
    """Do the OCR for any items that need it, then save to the solr index."""
    q = cast(str, options["queue"])
    rds = (
        RECAPDocument.objects.filter(ocr_status=RECAPDocument.OCR_NEEDED)
        .values_list("pk", flat=True)
        .order_by()
    )
    count = rds.count()
    throttle = CeleryThrottle(queue_name=q)
    for i, pk in enumerate(rds):
        throttle.maybe_wait()
        if options["index"]:
            extract_recap_pdf.si(
                pk, skip_ocr=False
            ).set(queue=q).apply_async()
        else:
            chain(
                extract_recap_pdf.si(pk, skip_ocr=False).set(queue=q),
                add_docket_to_solr_by_rds.s().set(queue=q),
            ).apply_async()
        if i % 1000 == 0:
            logger.info(f"Sent {i + 1}/{count} tasks to celery so far.")
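# CeleryThrottle.maybe_wait() in these loops blocks until the queue has
# drained enough to accept more work, so long backfills never flood the
# broker. A rough, self-contained sketch of the idea; the real CeleryThrottle
# inspects the broker itself, while this version takes a caller-supplied
# counter callable.
import time

def maybe_wait_sketch(count_queue_items, queue_name, max_len=100, poll=1.0):
    """Block until the named queue drops below max_len items."""
    while count_queue_items(queue_name) > max_len:
        time.sleep(poll)  # let workers drain before enqueueing more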
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q, min_items=options["queue_length"])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {"rows": page_size, "fl": ["id", "docket_id"]},
        {"group": False, "facet": False, "highlight": False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    results = si.query().add_extra(**main_query).execute()
    si.conn.http_connection.close()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result["id"])
        logger.info(
            "Doing item %s w/rd: %s, d: %s", i, rd.pk, result["docket_id"]
        )

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            continue

        chain(
            get_pacer_doc_by_rd.s(
                rd.pk, session.cookies, tag=TAG_PHASE_2
            ).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], "search.RECAPDocument").set(queue=q),
        ).apply_async()
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q, min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result['id'])
        logger.info("Doing item %s w/rd: %s, d: %s", i, rd.pk,
                    result['docket_id'])

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            continue

        chain(
            get_pacer_doc_by_rd.s(
                rd.pk, session.cookies, tag=TAG_PHASE_2).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
def get_petitions(options):
    """Just get document number one for every docket that's tagged in this
    collection.
    """
    rds = (
        RECAPDocument.objects.filter(
            tags__name=TAG,
            document_number="1",
            document_type=RECAPDocument.PACER_DOCUMENT,
        )
        .exclude(pacer_doc_id="")
        .order_by("pk")
        .values_list("pk", flat=True)
        .iterator()
    )
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = PacerSession(
        username=PACER_USERNAME, password=PACER_PASSWORD
    )
    pacer_session.login()
    for i, rd_pk in enumerate(rds):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        if i % 1000 == 0:
            # Re-authenticate every 1,000 rows to keep the session fresh.
            pacer_session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far.", i)
        logger.info("Doing row %s", i)
        throttle.maybe_wait()
        chain(
            get_pacer_doc_by_rd.s(
                rd_pk, pacer_session.cookies, tag=TAG_PETITIONS
            ).set(queue=q),
            extract_recap_pdf.si(rd_pk).set(queue=q),
            add_items_to_solr.si([rd_pk], "search.RECAPDocument").set(queue=q),
        ).apply_async()
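# The `i % 1000 == 0` re-login above guards against PACER session timeouts on
# long runs. The same pattern factored out; the 1,000-row interval mirrors
# these scripts' convention, and the helper name is ours.
def refresh_session_if_needed(i, session, every=1000):
    if i > 0 and i % every == 0:
        session = PacerSession(
            username=PACER_USERNAME, password=PACER_PASSWORD
        )
        session.login()
    return session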
def get_petitions(options):
    """Just get document number one for every docket that's tagged in this
    collection.
    """
    rds = RECAPDocument.objects.filter(
        tags__name=TAG,
        document_number='1',
        document_type=RECAPDocument.PACER_DOCUMENT,
    ).exclude(
        pacer_doc_id='',
    ).order_by('pk').values_list('pk', flat=True).iterator()
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = PacerSession(username=PACER_USERNAME,
                                 password=PACER_PASSWORD)
    pacer_session.login()
    for i, rd_pk in enumerate(rds):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        if i % 1000 == 0:
            # Re-authenticate every 1,000 rows to keep the session fresh.
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far.", i)
        logger.info("Doing row %s", i)
        throttle.maybe_wait()
        chain(
            get_pacer_doc_by_rd.s(
                rd_pk, pacer_session.cookies,
                tag=TAG_PETITIONS).set(queue=q),
            extract_recap_pdf.si(rd_pk).set(queue=q),
            add_items_to_solr.si([rd_pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
def download_documents(options):
    """We've got good values in the new columns, so we just need to look
    those up and get the documents from PACER.
    """
    f = open(options["input_file"], "r")
    dialect = csv.Sniffer().sniff(f.read(1024))
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect)
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q, min_items=options["queue_length"])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing row %s: %s", i, row)
        docket_number = (
            row["cl_d_docket_number"]
            or row["cl_d_docket_number (student)"]
            or None
        )
        if not docket_number:
            logger.warning("No docket number found for row: %s", i)
            continue
        court = Court.objects.get(
            fjc_court_id=row["AO ID"].rjust(2, "0"),
            jurisdiction=Court.FEDERAL_DISTRICT,
        )

        try:
            d = Docket.objects.get(docket_number=docket_number, court=court)
        except Docket.MultipleObjectsReturned:
            logger.warning("Multiple objects returned for row: %s", i)
            continue
        except Docket.DoesNotExist:
            logger.warning("Could not find docket for row: %s", i)
            continue

        # Got the docket, now get the documents from it, tag & OCR them.
        document_date = datetime.strptime(row["Date"], "%m/%d/%Y").date()
        des = d.docket_entries.filter(date_filed=document_date)
        count = des.count()
        if count == 0:
            logger.warning("No docket entries found for row: %s", i)
            continue
        elif count == 1:
            good_des = [des[0]]
        else:
            # More than one item. Apply filtering rules.
            good_des = filter_des(des)

        # We've got our des, now download them.
        for de in good_des:
            rds = de.recap_documents.filter(
                document_type=RECAPDocument.PACER_DOCUMENT
            )
            for rd in rds:
                if not rd.pacer_doc_id:
                    logger.warning(
                        "Unable to get pacer_doc_id for item with "
                        "rd_pk: %s. Restricted document?",
                        rd.pk,
                    )
                    continue
                if options["task"] == "add_extra_tags":
                    # Wherein I belatedly realize we need a tag specifically
                    # for this part of the project.
                    add_tags(rd, TAG_NAME_OPINIONS)
                else:
                    # Otherwise, do the normal download thing.
                    chain(
                        get_pacer_doc_by_rd.s(
                            rd.pk, session.cookies, tag=TAG_NAME
                        ).set(queue=q),
                        extract_recap_pdf.si(rd.pk).set(queue=q),
                        add_items_to_solr.si(
                            [rd.pk], "search.RECAPDocument"
                        ).set(queue=q),
                    ).apply_async()
    f.close()
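# The csv.Sniffer call above infers the dialect (delimiter, quoting) from a
# 1 KB sample before any rows are read. A self-contained demonstration of the
# same pattern on an in-memory file:
import csv
import io

sample_file = io.StringIO("a;b;c\n1;2;3\n")
sniffed = csv.Sniffer().sniff(sample_file.read(1024))
sample_file.seek(0)  # rewind so DictReader sees the header row again
rows = list(csv.DictReader(sample_file, dialect=sniffed))
assert rows[0]["b"] == "2"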
def do_pacer_fetch(fq):
    """Process a request made by a user to get an item from PACER.

    :param fq: The PacerFetchQueue item to process
    :return: None
    """
    c = None
    if fq.request_type == REQUEST_TYPE.DOCKET:
        # Request by docket_id
        court_id = fq.court_id or getattr(fq.docket, "court_id", None)
        kwargs = {
            # Universal params
            "court_id": court_id,
            "user_pk": fq.user_id,
            "docket_pk": fq.docket_id,
            # Scraping params
            "doc_num_start": fq.de_number_start,
            "doc_num_end": fq.de_number_end,
            "date_start": fq.de_date_start,
            "date_end": fq.de_date_end,
            "show_parties_and_counsel": fq.show_parties_and_counsel,
            "show_terminated_parties": fq.show_terminated_parties,
            "show_list_of_member_cases": fq.show_list_of_member_cases,
        }
        if (fq.docket_id and not fq.docket.pacer_case_id) or fq.docket_number:
            # We lack the pacer_case_id either on the docket or from the
            # submission. Look it up.
            docket_number = fq.docket_number or getattr(
                fq.docket, "docket_number", None
            )
            c = chain(
                get_pacer_case_id_and_title.si(
                    pass_through=None,
                    docket_number=docket_number,
                    court_id=court_id,
                    user_pk=fq.user_id,
                ),
                get_docket_by_pacer_case_id.s(**kwargs),
            )
        else:
            if fq.docket_id is not None and fq.docket.pacer_case_id:
                # We have the docket and its pacer_case_id
                kwargs.update(
                    {
                        "data": {"pacer_case_id": fq.docket.pacer_case_id},
                        "court_id": fq.docket.court_id,
                    }
                )
            elif fq.pacer_case_id:
                # We lack the docket, but have a pacer_case_id
                kwargs.update({"data": {"pacer_case_id": fq.pacer_case_id}})
            c = chain(get_docket_by_pacer_case_id.si(**kwargs))
        c |= add_or_update_recap_docket.s()
    elif fq.request_type == REQUEST_TYPE.PDF:
        # Request by recap_document_id
        rd_pk = fq.recap_document_id
        if fq.recap_document_id:
            c = chain(
                fetch_pacer_doc_by_rd.si(rd_pk, fq.pk, fq.user_id),
                extract_recap_pdf.si(rd_pk),
                add_items_to_solr.si([rd_pk], "search.RECAPDocument"),
            )
    if c is not None:
        c |= mark_fq_successful.si(fq.pk)
        c.apply_async()
    else:
        # Somehow failed to make a chain. Log an error.
        fq.status = PROCESSING_STATUS.INVALID_CONTENT
        fq.message = "Invalid submission, unable to make chain for processing."
        fq.save()
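# The `c |= task.s()` lines above append tasks to an existing Celery chain,
# which lets each branch build a partial chain and share the success-marking
# tail. A minimal, self-contained demonstration (the demo app and tasks are
# ours, not the project's):
from celery import Celery, chain

app = Celery("demo", broker="memory://")

@app.task
def add(x, y):
    return x + y

@app.task
def mul(x, y):
    return x * y

c = chain(add.s(2, 2))
c |= mul.s(10)  # same as chain(add.s(2, 2), mul.s(10))
# c.apply_async() would run add(2, 2), then mul(result, 10) on a worker.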
def download_documents(options):
    """We've got good values in the new columns, so we just need to look
    those up and get the documents from PACER.
    """
    f = open(options['input_file'], 'r')
    dialect = csv.Sniffer().sniff(f.read(1024))
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect)
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q, min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing row %s: %s", i, row)
        docket_number = (row['cl_d_docket_number'] or
                         row['cl_d_docket_number (student)'] or
                         None)
        if not docket_number:
            logger.warning("No docket number found for row: %s", i)
            continue
        court = Court.objects.get(fjc_court_id=row['AO ID'].rjust(2, '0'),
                                  jurisdiction=Court.FEDERAL_DISTRICT)

        try:
            d = Docket.objects.get(docket_number=docket_number, court=court)
        except Docket.MultipleObjectsReturned:
            logger.warning("Multiple objects returned for row: %s", i)
            continue
        except Docket.DoesNotExist:
            logger.warning("Could not find docket for row: %s", i)
            continue

        # Got the docket, now get the documents from it, tag & OCR them.
        document_date = datetime.strptime(row['Date'], '%m/%d/%Y').date()
        des = d.docket_entries.filter(date_filed=document_date)
        count = des.count()
        if count == 0:
            logger.warning("No docket entries found for row: %s", i)
            continue
        elif count == 1:
            good_des = [des[0]]
        else:
            # More than one item. Apply filtering rules.
            good_des = filter_des(des)

        # We've got our des, now download them.
        for de in good_des:
            rds = de.recap_documents.filter(
                document_type=RECAPDocument.PACER_DOCUMENT)
            for rd in rds:
                if not rd.pacer_doc_id:
                    logger.warning("Unable to get pacer_doc_id for item with "
                                   "rd_pk: %s. Restricted document?", rd.pk)
                    continue
                if options['task'] == 'add_extra_tags':
                    # Wherein I belatedly realize we need a tag specifically
                    # for this part of the project.
                    add_tags(rd, TAG_NAME_OPINIONS)
                else:
                    # Otherwise, do the normal download thing.
                    chain(
                        get_pacer_doc_by_rd.s(
                            rd.pk, session.cookies, tag=TAG_NAME).set(queue=q),
                        extract_recap_pdf.si(rd.pk).set(queue=q),
                        add_items_to_solr.si(
                            [rd.pk], 'search.RECAPDocument').set(queue=q),
                    ).apply_async()
    f.close()
def do_pacer_fetch(fq):
    """Process a request made by a user to get an item from PACER.

    :param fq: The PacerFetchQueue item to process
    :return: The celery result, or None if no task was dispatched
    """
    result = None
    if fq.request_type == REQUEST_TYPE.DOCKET:
        # Request by docket_id
        court_id = fq.court_id or getattr(fq.docket, "court_id", None)
        kwargs = {
            # Universal params
            "court_id": court_id,
            "user_pk": fq.user_id,
            "docket_pk": fq.docket_id,
            # Scraping params
            "doc_num_start": fq.de_number_start,
            "doc_num_end": fq.de_number_end,
            "date_start": fq.de_date_start,
            "date_end": fq.de_date_end,
            "show_parties_and_counsel": fq.show_parties_and_counsel,
            "show_terminated_parties": fq.show_terminated_parties,
            "show_list_of_member_cases": fq.show_list_of_member_cases,
        }
        if (fq.docket_id and not fq.docket.pacer_case_id) or fq.docket_number:
            # We lack the pacer_case_id either on the docket or from the
            # submission. Look it up.
            docket_number = fq.docket_number or getattr(
                fq.docket, "docket_number", None)
            c = chain(
                get_pacer_case_id_and_title.si(
                    pass_through=None,
                    docket_number=docket_number,
                    court_id=court_id,
                    user_pk=fq.user_id,
                ),
                get_docket_by_pacer_case_id.s(**kwargs),
            )
        else:
            if fq.docket_id is not None and fq.docket.pacer_case_id:
                # We have the docket and its pacer_case_id
                kwargs.update({
                    "data": {"pacer_case_id": fq.docket.pacer_case_id},
                    "court_id": fq.docket.court_id,
                })
            elif fq.pacer_case_id:
                # We lack the docket, but have a pacer_case_id
                kwargs.update({
                    "data": {"pacer_case_id": fq.pacer_case_id},
                })
            c = chain(get_docket_by_pacer_case_id.si(**kwargs))
        c |= add_or_update_recap_docket.s()
        c |= mark_fq_successful.si(fq.pk)
        result = c.apply_async()
    elif fq.request_type == REQUEST_TYPE.PDF:
        # Request by recap_document_id
        rd_pk = fq.recap_document_id
        result = chain(
            fetch_pacer_doc_by_rd.si(rd_pk, fq.pk),
            extract_recap_pdf.si(rd_pk),
            add_items_to_solr.si([rd_pk], "search.RECAPDocument"),
            mark_fq_successful.si(fq.pk),
        ).apply_async()
    elif fq.request_type == REQUEST_TYPE.ATTACHMENT_PAGE:
        result = fetch_attachment_page.apply_async(args=(fq.pk,))
    return result