コード例 #1
0
def get_final_docs(options):
    """Get any documents that contain "final" in their description."""
    des = (DocketEntry.objects.filter(
        tags__name=TAG,
        description__icontains="final").order_by("pk").iterator())
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = PacerSession(username=PACER_USERNAME,
                                 password=PACER_PASSWORD)
    pacer_session.login()
    for i, de in enumerate(des):
        if i < options["offset"]:
            i += 1
            continue
        if i >= options["limit"] > 0:
            break
        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info(f"Sent {i} tasks to celery so far.")
        logger.info("Doing row %s", i)
        rd_pks = (de.recap_documents.filter(
            document_type=RECAPDocument.PACER_DOCUMENT, ).exclude(
                pacer_doc_id="").order_by("pk").values_list("pk", flat=True))
        for rd_pk in rd_pks:
            throttle.maybe_wait()
            chain(
                get_pacer_doc_by_rd.s(rd_pk,
                                      pacer_session.cookies,
                                      tag=TAG_FINALS).set(queue=q),
                extract_recap_pdf.si(rd_pk).set(queue=q),
                add_items_to_solr.si([rd_pk],
                                     "search.RECAPDocument").set(queue=q),
            ).apply_async()
コード例 #2
0
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']

    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 20000
    main_query = build_main_query_from_query_string(
        QUERY_STRING,
        {
            'rows': page_size,
            'fl': ['id', 'docket_id']
        },
        {
            'group': False,
            'facet': False,
            'highlight': False
        },
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing item %s w/rd: %s, d: %s", i, result['id'],
                    result['docket_id'])

        try:
            rd = RECAPDocument.objects.get(pk=result['id'])
        except RECAPDocument.DoesNotExist:
            logger.warn("Unable to find RECAP Document with id %s",
                        result['id'])
            continue

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG)
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
コード例 #3
0
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q, min_items=options["queue_length"])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {
            "rows": page_size,
            "fl": ["id", "docket_id"]
        },
        {
            "group": False,
            "facet": False,
            "highlight": False
        },
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    results = si.query().add_extra(**main_query).execute()
    si.conn.http_connection.close()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options["offset"]:
            i += 1
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result["id"])
        logger.info("Doing item %s w/rd: %s, d: %s", i, rd.pk,
                    result["docket_id"])

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            i += 1
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            i += 1
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG_PHASE_2).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], "search.RECAPDocument").set(queue=q),
        ).apply_async()
        i += 1
コード例 #4
0
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q,
                              min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME,
                           password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            i += 1
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result['id'])
        logger.info("Doing item %s w/rd: %s, d: %s", i, rd.pk,
                    result['docket_id'])

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            i += 1
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            i += 1
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG_PHASE_2).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
        i += 1
コード例 #5
0
ファイル: kessler_ilnb.py プロジェクト: OCTO8888/LITIGAIT
def get_petitions(options):
    """Just get document number one for every docket that's tagged in this
    collection.
    """
    rds = (
        RECAPDocument.objects.filter(
            tags__name=TAG,
            document_number="1",
            document_type=RECAPDocument.PACER_DOCUMENT,
        )
        .exclude(pacer_doc_id="",)
        .order_by("pk")
        .values_list("pk", flat=True)
        .iterator()
    )
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = PacerSession(
        username=PACER_USERNAME, password=PACER_PASSWORD
    )
    pacer_session.login()
    for i, rd_pk in enumerate(rds):
        if i < options["offset"]:
            i += 1
            continue
        if i >= options["limit"] > 0:
            break

        if i % 1000 == 0:
            pacer_session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        logger.info("Doing row %s", i)
        throttle.maybe_wait()

        chain(
            get_pacer_doc_by_rd.s(
                rd_pk, pacer_session.cookies, tag=TAG_PETITIONS
            ).set(queue=q),
            extract_recap_pdf.si(rd_pk).set(queue=q),
            add_items_to_solr.si([rd_pk], "search.RECAPDocument").set(queue=q),
        ).apply_async()
コード例 #6
0
def get_petitions(options):
    """Just get document number one for every docket that's tagged in this
    collection.
    """
    rds = RECAPDocument.objects.filter(
        tags__name=TAG,
        document_number='1',
        document_type=RECAPDocument.PACER_DOCUMENT,
    ).exclude(
        pacer_doc_id='',
    ).order_by('pk').values_list('pk', flat=True).iterator()
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = PacerSession(username=PACER_USERNAME,
                                 password=PACER_PASSWORD)
    pacer_session.login()
    for i, rd_pk in enumerate(rds):
        if i < options['offset']:
            i += 1
            continue
        if i >= options['limit'] > 0:
            break

        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        logger.info("Doing row %s", i)
        throttle.maybe_wait()

        chain(
            get_pacer_doc_by_rd.s(
                rd_pk, pacer_session.cookies, tag=TAG_PETITIONS).set(queue=q),
            extract_recap_pdf.si(rd_pk).set(queue=q),
            add_items_to_solr.si([rd_pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
コード例 #7
0
def download_documents(options):
    """We've got good values in the new columns, so just need to look those up,
    and get the documents from PACER.
    """
    f = open(options["input_file"], "r")
    dialect = csv.Sniffer().sniff(f.read(1024))
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect)
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q, min_items=options["queue_length"])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing row %s: %s", i, row)

        docket_number = (row["cl_d_docket_number"]
                         or row["cl_d_docket_number (student)"] or None)

        if not docket_number:
            logger.warn("No docket number found for row: %s", i)
            continue
        court = Court.objects.get(
            fjc_court_id=row["AO ID"].rjust(2, "0"),
            jurisdiction=Court.FEDERAL_DISTRICT,
        )

        try:
            d = Docket.objects.get(docket_number=docket_number, court=court)
        except Docket.MultipleObjectsReturned:
            logger.warn("Multiple objects returned for row: %s", i)
            continue
        except Docket.DoesNotExist:
            logger.warn("Could not find docket for row: %s", i)
            continue

        # Got the docket, now get the documents from it, tag & OCR them.
        document_date = datetime.strptime(row["Date"], "%m/%d/%Y").date()
        des = d.docket_entries.filter(date_filed=document_date)
        count = des.count()
        if count == 0:
            logger.warn("No docket entries found for row: %s", i)
            continue
        elif des.count() == 1:
            good_des = [des[0]]
        else:
            # More than one item. Apply filtering rules.
            good_des = filter_des(des)

        # We've got our des, now download them.
        for de in good_des:
            rds = de.recap_documents.filter(
                document_type=RECAPDocument.PACER_DOCUMENT)
            for rd in rds:
                if not rd.pacer_doc_id:
                    logger.warn(
                        "Unable to get pacer_doc_id for item with "
                        "rd_pk: %s. Restricted document?",
                        rd.pk,
                    )
                    continue
                if options["task"] == "add_extra_tags":
                    # Wherein I belatedly realize we need a tag specifically
                    # for this part of the project.
                    add_tags(rd, TAG_NAME_OPINIONS)
                else:
                    # Otherwise, do the normal download thing.
                    chain(
                        get_pacer_doc_by_rd.s(rd.pk,
                                              session.cookies,
                                              tag=TAG_NAME).set(queue=q),
                        extract_recap_pdf.si(rd.pk).set(queue=q),
                        add_items_to_solr.si(
                            [rd.pk], "search.RECAPDocument").set(queue=q),
                    ).apply_async()
    f.close()
コード例 #8
0
def download_documents(options):
    """We've got good values in the new columns, so just need to look those up,
    and get the documents from PACER.
    """
    f = open(options['input_file'], 'r')
    dialect = csv.Sniffer().sniff(f.read(1024))
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect)
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q,
                              min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME,
                           password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing row %s: %s", i, row)

        docket_number = row['cl_d_docket_number'] or \
            row['cl_d_docket_number (student)'] or \
            None

        if not docket_number:
            logger.warn("No docket number found for row: %s", i)
            continue
        court = Court.objects.get(fjc_court_id=row['AO ID'].rjust(2, '0'),
                                  jurisdiction=Court.FEDERAL_DISTRICT)

        try:
            d = Docket.objects.get(docket_number=docket_number, court=court)
        except Docket.MultipleObjectsReturned:
            logger.warn("Multiple objects returned for row: %s", i)
            continue
        except Docket.DoesNotExist:
            logger.warn("Could not find docket for row: %s", i)
            continue

        # Got the docket, now get the documents from it, tag & OCR them.
        document_date = datetime.strptime(row['Date'], '%m/%d/%Y').date()
        des = d.docket_entries.filter(date_filed=document_date)
        count = des.count()
        if count == 0:
            logger.warn("No docket entries found for row: %s", i)
            continue
        elif des.count() == 1:
            good_des = [des[0]]
        else:
            # More than one item. Apply filtering rules.
            good_des = filter_des(des)

        # We've got our des, now download them.
        for de in good_des:
            rds = de.recap_documents.filter(
                document_type=RECAPDocument.PACER_DOCUMENT)
            for rd in rds:
                if not rd.pacer_doc_id:
                    logger.warn("Unable to get pacer_doc_id for item with "
                                "rd_pk: %s. Restricted document?", rd.pk)
                    continue
                if options['task'] == 'add_extra_tags':
                    # Wherein I belatedly realize we need a tag specifically
                    # for this part of the project.
                    add_tags(rd, TAG_NAME_OPINIONS)
                else:
                    # Otherwise, do the normal download thing.
                    chain(
                        get_pacer_doc_by_rd.s(
                            rd.pk, session.cookies, tag=TAG_NAME).set(queue=q),
                        extract_recap_pdf.si(rd.pk).set(queue=q),
                        add_items_to_solr.si(
                            [rd.pk], 'search.RECAPDocument').set(queue=q),
                    ).apply_async()
    f.close()