def get_documents(options):
    """Download documents from PACER if we don't already have them.

    Runs the ``QUERY_STRING`` search against the RECAP Solr index and walks
    the results. Documents that are already available are only tagged with
    ``TAG``; the others are handed to a Celery chain made of
    ``get_pacer_doc_by_rd``, ``extract_recap_pdf`` and ``add_items_to_solr``.

    :param options: Mapping of command options. Keys read here:
        ``queue`` (queue name given to the throttle and to every task),
        ``offset`` (results whose index is below this value are skipped) and
        ``limit`` (iteration stops once the result index reaches this value;
        a value of zero or less disables the limit). Both ``offset`` and
        ``limit`` are compared against the absolute index in the result set.
    :return: None
    """
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    # NOTE(review): only the rows returned by this single query are
    # iterated below -- confirm page_size is large enough for the result set.
    page_size = 20000
    main_query = build_main_query_from_query_string(
        QUERY_STRING,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False, 'highlight': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing item %s w/rd: %s, d: %s", i, result['id'],
                    result['docket_id'])

        try:
            rd = RECAPDocument.objects.get(pk=result['id'])
        except RECAPDocument.DoesNotExist:
            # The search index can reference a row that is no longer in the
            # database; skip it rather than aborting the whole run.
            logger.warning("Unable to find RECAP Document with id %s",
                           result['id'])
            continue

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG)
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk],
                                 'search.RECAPDocument').set(queue=q),
        ).apply_async()
def get_documents(options):
    """Download documents from PACER if we don't already have them.

    Runs the ``Q_INVOICES`` search against the RECAP Solr index and walks the
    results. Documents that are already available are only tagged with
    ``TAG_PHASE_2``; the others are handed to a Celery chain made of
    ``get_pacer_doc_by_rd``, ``extract_recap_pdf`` and ``add_items_to_solr``.
    Search hits whose RECAPDocument row cannot be found are logged and
    skipped.

    :param options: Mapping of command options. Keys read here:
        ``queue`` (queue name given to the throttle and to every task),
        ``queue_length`` (passed to ``CeleryThrottle`` as ``min_items``),
        ``offset`` (results whose index is below this value are skipped) and
        ``limit`` (iteration stops once the result index reaches this value;
        a value of zero or less disables the limit). Both ``offset`` and
        ``limit`` are compared against the absolute index in the result set.
    :return: None
    """
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q, min_items=options["queue_length"])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    # NOTE(review): only the rows returned by this single query are
    # iterated below -- confirm page_size is large enough for the result set.
    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {"rows": page_size, "fl": ["id", "docket_id"]},
        {"group": False, "facet": False, "highlight": False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    results = si.query().add_extra(**main_query).execute()
    # The results are already in hand, so the HTTP connection is released
    # before the (potentially long) loop starts.
    si.conn.http_connection.close()
    logger.info("Got %s search results.", results.result.numFound)

    # enumerate() supplies the index on every iteration, so no manual
    # bookkeeping of ``i`` is needed inside the loop.
    for i, result in enumerate(results):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        try:
            rd = RECAPDocument.objects.get(pk=result["id"])
        except RECAPDocument.DoesNotExist:
            # The search index can reference a row that is no longer in the
            # database; skip it rather than aborting the whole run.
            logger.warning(
                "Unable to find RECAP Document with id %s", result["id"]
            )
            continue

        logger.info(
            "Doing item %s w/rd: %s, d: %s", i, rd.pk, result["docket_id"]
        )

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            continue

        chain(
            get_pacer_doc_by_rd.s(
                rd.pk, session.cookies, tag=TAG_PHASE_2
            ).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], "search.RECAPDocument").set(queue=q),
        ).apply_async()
def get_documents(options):
    """Download documents from PACER if we don't already have them.

    Runs the ``Q_INVOICES`` search against the RECAP Solr index and walks the
    results. Documents that are already available are only tagged with
    ``TAG_PHASE_2``; the others are handed to a Celery chain made of
    ``get_pacer_doc_by_rd``, ``extract_recap_pdf`` and ``add_items_to_solr``.
    Search hits whose RECAPDocument row cannot be found are logged and
    skipped.

    :param options: Mapping of command options. Keys read here:
        ``queue`` (queue name given to the throttle and to every task),
        ``queue_length`` (passed to ``CeleryThrottle`` as ``min_items``),
        ``offset`` (results whose index is below this value are skipped) and
        ``limit`` (iteration stops once the result index reaches this value;
        a value of zero or less disables the limit). Both ``offset`` and
        ``limit`` are compared against the absolute index in the result set.
    :return: None
    """
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q,
                              min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    # NOTE(review): only the rows returned by this single query are
    # iterated below -- confirm page_size is large enough for the result set.
    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    # enumerate() supplies the index on every iteration, so no manual
    # bookkeeping of ``i`` is needed inside the loop.
    for i, result in enumerate(results):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        try:
            rd = RECAPDocument.objects.get(pk=result['id'])
        except RECAPDocument.DoesNotExist:
            # The search index can reference a row that is no longer in the
            # database; skip it rather than aborting the whole run.
            logger.warning("Unable to find RECAP Document with id %s",
                           result['id'])
            continue

        logger.info("Doing item %s w/rd: %s, d: %s", i, rd.pk,
                    result['docket_id'])

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG_PHASE_2).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk],
                                 'search.RECAPDocument').set(queue=q),
        ).apply_async()
def download_documents(options):
    """We've got good values in the new columns, so just need to look those
    up, and get the documents from PACER.

    Reads the CSV named by ``options["input_file"]`` (dialect is sniffed from
    the first 1024 characters) and, for each row, looks up the docket, picks
    the docket entries filed on the row's date, and then either tags their
    PACER documents or queues them for download.

    Columns read from each row: ``cl_d_docket_number``,
    ``cl_d_docket_number (student)`` (fallback when the former is empty),
    ``AO ID`` (left-padded with zeros to two characters) and ``Date``
    (``%m/%d/%Y``).

    :param options: Mapping of command options. Keys read here:
        ``input_file``, ``queue``, ``queue_length`` (passed to
        ``CeleryThrottle`` as ``min_items``), ``offset`` (rows with an index
        below this are skipped), ``limit`` (stop once the row index reaches
        this value; zero or less disables the limit) and ``task`` (when equal
        to ``"add_extra_tags"`` documents are only tagged with
        ``TAG_NAME_OPINIONS`` instead of being downloaded).
    :return: None
    :raises: The ``Court.objects.get`` lookup and the date parsing are not
        guarded here, so their exceptions propagate to the caller.
    """
    # The context manager guarantees the file is closed even when a row
    # raises part-way through the loop.
    with open(options["input_file"], "r") as f:
        dialect = csv.Sniffer().sniff(f.read(1024))
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect)

        q = options["queue"]
        throttle = CeleryThrottle(
            queue_name=q, min_items=options["queue_length"]
        )
        session = PacerSession(
            username=PACER_USERNAME, password=PACER_PASSWORD
        )
        session.login()

        for i, row in enumerate(reader):
            if i < options["offset"]:
                continue
            if i >= options["limit"] > 0:
                break
            throttle.maybe_wait()

            logger.info("Doing row %s: %s", i, row)

            docket_number = (
                row["cl_d_docket_number"]
                or row["cl_d_docket_number (student)"]
                or None
            )
            if not docket_number:
                logger.warning("No docket number found for row: %s", i)
                continue

            court = Court.objects.get(
                fjc_court_id=row["AO ID"].rjust(2, "0"),
                jurisdiction=Court.FEDERAL_DISTRICT,
            )
            try:
                d = Docket.objects.get(
                    docket_number=docket_number, court=court
                )
            except Docket.MultipleObjectsReturned:
                logger.warning("Multiple objects returned for row: %s", i)
                continue
            except Docket.DoesNotExist:
                logger.warning("Could not find docket for row: %s", i)
                continue

            # Got the docket, now get the documents from it, tag & OCR them.
            document_date = datetime.strptime(row["Date"], "%m/%d/%Y").date()
            des = d.docket_entries.filter(date_filed=document_date)
            count = des.count()
            if count == 0:
                logger.warning("No docket entries found for row: %s", i)
                continue
            elif count == 1:
                good_des = [des[0]]
            else:
                # More than one item. Apply filtering rules.
                good_des = filter_des(des)

            # We've got our des, now download them.
            for de in good_des:
                rds = de.recap_documents.filter(
                    document_type=RECAPDocument.PACER_DOCUMENT
                )
                for rd in rds:
                    if not rd.pacer_doc_id:
                        logger.warning(
                            "Unable to get pacer_doc_id for item with "
                            "rd_pk: %s. Restricted document?",
                            rd.pk,
                        )
                        continue
                    if options["task"] == "add_extra_tags":
                        # Wherein I belatedly realize we need a tag
                        # specifically for this part of the project.
                        add_tags(rd, TAG_NAME_OPINIONS)
                    else:
                        # Otherwise, do the normal download thing.
                        chain(
                            get_pacer_doc_by_rd.s(
                                rd.pk, session.cookies, tag=TAG_NAME
                            ).set(queue=q),
                            extract_recap_pdf.si(rd.pk).set(queue=q),
                            add_items_to_solr.si(
                                [rd.pk], "search.RECAPDocument"
                            ).set(queue=q),
                        ).apply_async()
def download_documents(options):
    """We've got good values in the new columns, so just need to look those
    up, and get the documents from PACER.

    Reads the CSV named by ``options['input_file']`` (dialect is sniffed from
    the first 1024 characters) and, for each row, looks up the docket, picks
    the docket entries filed on the row's date, and then either tags their
    PACER documents or queues them for download.

    Columns read from each row: ``cl_d_docket_number``,
    ``cl_d_docket_number (student)`` (fallback when the former is empty),
    ``AO ID`` (left-padded with zeros to two characters) and ``Date``
    (``%m/%d/%Y``).

    :param options: Mapping of command options. Keys read here:
        ``input_file``, ``queue``, ``queue_length`` (passed to
        ``CeleryThrottle`` as ``min_items``), ``offset`` (rows with an index
        below this are skipped), ``limit`` (stop once the row index reaches
        this value; zero or less disables the limit) and ``task`` (when equal
        to ``'add_extra_tags'`` documents are only tagged with
        ``TAG_NAME_OPINIONS`` instead of being downloaded).
    :return: None
    :raises: The ``Court.objects.get`` lookup and the date parsing are not
        guarded here, so their exceptions propagate to the caller.
    """
    # The context manager guarantees the file is closed even when a row
    # raises part-way through the loop.
    with open(options['input_file'], 'r') as f:
        dialect = csv.Sniffer().sniff(f.read(1024))
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect)

        q = options['queue']
        throttle = CeleryThrottle(queue_name=q,
                                  min_items=options['queue_length'])
        session = PacerSession(username=PACER_USERNAME,
                               password=PACER_PASSWORD)
        session.login()

        for i, row in enumerate(reader):
            if i < options['offset']:
                continue
            if i >= options['limit'] > 0:
                break
            throttle.maybe_wait()

            logger.info("Doing row %s: %s", i, row)

            docket_number = row['cl_d_docket_number'] or \
                row['cl_d_docket_number (student)'] or \
                None
            if not docket_number:
                logger.warning("No docket number found for row: %s", i)
                continue

            court = Court.objects.get(
                fjc_court_id=row['AO ID'].rjust(2, '0'),
                jurisdiction=Court.FEDERAL_DISTRICT,
            )
            try:
                d = Docket.objects.get(docket_number=docket_number,
                                       court=court)
            except Docket.MultipleObjectsReturned:
                logger.warning("Multiple objects returned for row: %s", i)
                continue
            except Docket.DoesNotExist:
                logger.warning("Could not find docket for row: %s", i)
                continue

            # Got the docket, now get the documents from it, tag & OCR them.
            document_date = datetime.strptime(row['Date'], '%m/%d/%Y').date()
            des = d.docket_entries.filter(date_filed=document_date)
            count = des.count()
            if count == 0:
                logger.warning("No docket entries found for row: %s", i)
                continue
            elif count == 1:
                good_des = [des[0]]
            else:
                # More than one item. Apply filtering rules.
                good_des = filter_des(des)

            # We've got our des, now download them.
            for de in good_des:
                rds = de.recap_documents.filter(
                    document_type=RECAPDocument.PACER_DOCUMENT)
                for rd in rds:
                    if not rd.pacer_doc_id:
                        logger.warning(
                            "Unable to get pacer_doc_id for item with "
                            "rd_pk: %s. Restricted document?", rd.pk)
                        continue
                    if options['task'] == 'add_extra_tags':
                        # Wherein I belatedly realize we need a tag
                        # specifically for this part of the project.
                        add_tags(rd, TAG_NAME_OPINIONS)
                    else:
                        # Otherwise, do the normal download thing.
                        chain(
                            get_pacer_doc_by_rd.s(
                                rd.pk, session.cookies,
                                tag=TAG_NAME).set(queue=q),
                            extract_recap_pdf.si(rd.pk).set(queue=q),
                            add_items_to_solr.si(
                                [rd.pk],
                                'search.RECAPDocument').set(queue=q),
                        ).apply_async()