def get_next_date_range(court_id, span=7):
    """Get the next start and end query dates for a court.

    Check the DB for the last date for a court that was completed. Return the
    day after that date + span days into the future as the range to query for
    the requested court.

    If the court is still in progress, return (None, None).

    :param court_id: A PACER Court ID
    :param span: The number of days to go forward from the last completed date
    """
    court_id = map_pacer_to_cl_id(court_id)
    try:
        last_completion_log = PACERFreeDocumentLog.objects.filter(
            court_id=court_id,
        ).exclude(
            status=PACERFreeDocumentLog.SCRAPE_FAILED,
        ).latest('date_queried')
    except PACERFreeDocumentLog.DoesNotExist:
        logger.warn("FAILED ON: %s" % court_id)
        raise

    if last_completion_log.status == PACERFreeDocumentLog.SCRAPE_IN_PROGRESS:
        return None, None

    # Ensure that we go back five days from the last time we had success if
    # that success was in the last few days.
    last_complete_date = min(now().date() - timedelta(days=5),
                             last_completion_log.date_queried)
    next_end_date = min(now().date(),
                        last_complete_date + timedelta(days=span))
    return last_complete_date, next_end_date
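A minimal standalone sketch of the date-range arithmetic the docstring above describes, using plain datetime values in place of the PACERFreeDocumentLog lookup (the function and argument names here are hypothetical):

from datetime import date, timedelta

def sketch_next_date_range(last_complete_date, span=7, today=None):
    # Day after the last completed date, through `span` days out, capped at today.
    today = today or date.today()
    next_start_date = last_complete_date + timedelta(days=1)
    next_end_date = min(today, last_complete_date + timedelta(days=span))
    return next_start_date, next_end_date

# sketch_next_date_range(date(2020, 1, 1), span=7, today=date(2020, 1, 5))
# returns (date(2020, 1, 2), date(2020, 1, 5))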
Example 2
def get_next_date_range(court_id, span=7):
    """Get the next start and end query dates for a court.

    Check the DB for the last date for a court that was completed. Return the
    day after that date + span days into the future as the range to query for
    the requested court.

    If the court is still in progress, return (None, None).

    :param court_id: A PACER Court ID
    :param span: The number of days to go forward from the last completed date
    """
    court_id = map_pacer_to_cl_id(court_id)
    try:
        last_completion_log = PACERFreeDocumentLog.objects.filter(
            court_id=court_id,
        ).exclude(
            status=PACERFreeDocumentLog.SCRAPE_FAILED,
        ).latest('date_queried')
    except PACERFreeDocumentLog.DoesNotExist:
        logger.warn("FAILED ON: %s" % court_id)
        raise

    if last_completion_log.status == PACERFreeDocumentLog.SCRAPE_IN_PROGRESS:
        return None, None

    last_complete_date = last_completion_log.date_queried
    next_start_date = last_complete_date + timedelta(days=1)
    next_end_date = min(now().date(),
                        last_complete_date + timedelta(days=span))
    return next_start_date, next_end_date
Example 3
def get_cover_sheets_for_docket(options, docket_pks, tag=None):
    """Get civil cover sheets for dockets in our system."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    cover_sheet_re = re.compile(r'cover\s*sheet', re.IGNORECASE)
    for i, docket_pk in enumerate(docket_pks):
        throttle.maybe_wait()
        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        try:
            rd_pk = RECAPDocument.objects.get(
                document_number=1,
                docket_entry__docket_id=docket_pk,
            ).pk
        except (RECAPDocument.MultipleObjectsReturned,
                RECAPDocument.DoesNotExist):
            logger.warn("Unable to get document 1 for docket_pk: %s" %
                        docket_pk)
        else:
            get_pacer_doc_by_rd_and_description.apply_async(
                args=(
                    rd_pk,
                    cover_sheet_re,
                    pacer_session,
                ),
                kwargs={
                    'tag': tag,
                },
                queue=q,
            )
Example 4
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']

    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 20000
    main_query = build_main_query_from_query_string(
        QUERY_STRING,
        {
            'rows': page_size,
            'fl': ['id', 'docket_id']
        },
        {
            'group': False,
            'facet': False,
            'highlight': False
        },
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing item %s w/rd: %s, d: %s", i, result['id'],
                    result['docket_id'])

        try:
            rd = RECAPDocument.objects.get(pk=result['id'])
        except RECAPDocument.DoesNotExist:
            logger.warn("Unable to find RECAP Document with id %s",
                        result['id'])
            continue

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG)
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
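The offset/limit checks in get_documents rely on Python's chained comparisons: `i >= options['limit'] > 0` stops the loop at the limit, but a limit of zero means "no limit". A small self-contained sketch of that pattern, with hypothetical names standing in for the Solr results:

def iterate_with_offset_and_limit(items, offset=0, limit=0):
    processed = []
    for i, item in enumerate(items):
        if i < offset:
            continue
        # Chained comparison: stop once i >= limit, but only when limit > 0.
        if i >= limit > 0:
            break
        processed.append(item)
    return processed

# iterate_with_offset_and_limit(range(10), offset=2, limit=5) -> [2, 3, 4]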
def get_doc_by_re_and_de_nums_for_dockets(options,
                                          docket_pks,
                                          regex,
                                          de_nums,
                                          fallback=False,
                                          tag=None):
    """Get civil cover sheets for dockets in our system.

    :param options: The options sent on the command line as a dict.
    :param docket_pks: A list of docket pks to iterate over.
    :param regex: A regex to match on the document description on the attachment
    page. For example, to get initial complaints, set this to
    r'initial\s*complaints'.
    :param de_nums: The docket entry numbers to use when looking for items, as a
    list.
    :param fallback: After loading the attachment page, if we don't find
    something that matches `regex`, should we just grab the main document?
    :param tag: A tag to add to any modified content.
    """
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    for i, docket_pk in enumerate(docket_pks):
        if i >= options['count'] > 0:
            break
        throttle.maybe_wait()
        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        try:
            rds = RECAPDocument.objects.filter(
                document_number__in=de_nums,
                document_type=RECAPDocument.PACER_DOCUMENT,
                docket_entry__docket_id=docket_pk,
            )
        except (RECAPDocument.MultipleObjectsReturned,
                RECAPDocument.DoesNotExist):
            logger.warn("Unable to get document 1 for docket_pk: %s" %
                        docket_pk)
        else:
            for rd in rds:
                get_pacer_doc_by_rd_and_description.apply_async(
                    args=(
                        rd.pk,
                        regex,
                        pacer_session,
                    ),
                    kwargs={
                        'fallback_to_main_doc': fallback,
                        'tag': tag,
                    },
                    queue=q,
                )
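Both PACER loops above recreate and log in the session every 1,000 items so that long runs do not reuse a stale login. A rough sketch of that cadence with a stand-in session class (FakeSession is hypothetical, not the real PacerSession):

class FakeSession(object):
    def __init__(self):
        self.logged_in = False

    def login(self):
        self.logged_in = True

def dispatch_tasks(items, refresh_every=1000):
    session = None
    for i, item in enumerate(items):
        if i % refresh_every == 0:
            # A fresh session on the first item and every `refresh_every` after.
            session = FakeSession()
            session.login()
        # ... enqueue a task here that carries `session` along with `item` ...
    return session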
Example 6
    def set_if_falsy(obj, attribute, new_value):
        """Check if the value passed in is Falsy. If so, set it to the value of
        new_value.

        return ok: Whether the item was set successfully
        """
        current_value = getattr(obj, attribute)
        if current_value is not None and isinstance(current_value, basestring):
            current_value = current_value.strip()

        does_not_currently_have_a_value = not current_value
        current_value_not_zero = current_value != 0
        new_value_not_blank = new_value.strip() != ""
        ok = True
        if all([
                does_not_currently_have_a_value,
                current_value_not_zero,
                new_value_not_blank,
        ]):
            logger.info("Updating %s with %s." %
                        (attribute, new_value.encode("utf-8")))
            setattr(obj, attribute, new_value)
        else:
            # Report if there's a difference -- that might spell trouble.
            values_differ = False
            if (isinstance(current_value, basestring)
                    and isinstance(new_value, basestring) and "".join(
                        current_value.split()) != "".join(new_value.split())):
                # Handles strings and normalizes them for comparison.
                values_differ = True
            elif isinstance(current_value,
                            int) and current_value != int(new_value):
                # Handles ints, which need no normalization for comparison.
                values_differ = True

            if values_differ:
                logger.warn(
                    "WARNING: Didn't set '{attr}' attribute on obj {obj_id} "
                    "because it already had a value, but the new value "
                    "('{new}') differs from current value ('{current}')".
                    format(
                        attr=attribute,
                        obj_id=obj.pk,
                        new=new_value,
                        current=force_bytes(current_value),
                    ))
                ok = False
            else:
                # The values were the same.
                logger.info("'%s' field unchanged -- old and new values were "
                            "the same: %s" % (attribute, new_value))
        return ok
    def set_if_falsy(obj, attribute, new_value):
        """Check if the value passed in is Falsy. If so, set it to the value of
        new_value.

        return ok: Whether the item was set successfully
        """
        current_value = getattr(obj, attribute)
        if current_value is not None and isinstance(current_value, basestring):
            current_value = current_value.strip()

        does_not_currently_have_a_value = not current_value
        current_value_not_zero = current_value != 0
        new_value_not_blank = new_value.strip() != ''
        ok = True
        if all([does_not_currently_have_a_value, current_value_not_zero,
                new_value_not_blank]):
            logger.info("Updating %s with %s." %
                        (attribute, new_value.encode('utf-8')))
            setattr(obj, attribute, new_value)
        else:
            # Report if there's a difference -- that might spell trouble.
            values_differ = False
            if (isinstance(current_value, basestring) and
                    isinstance(new_value, basestring) and
                    ''.join(current_value.split()) != ''.join(new_value.split())):
                # Handles strings and normalizes them for comparison.
                values_differ = True
            elif (isinstance(current_value, int) and
                  current_value != int(new_value)):
                # Handles ints, which need no normalization for comparison.
                values_differ = True

            if values_differ:
                logger.warn(
                    "WARNING: Didn't set '{attr}' attribute on obj {obj_id} "
                    "because it already had a value, but the new value "
                    "('{new}') differs from current value ('{current}')".format(
                        attr=attribute,
                        obj_id=obj.pk,
                        new=new_value,
                        current=force_bytes(current_value),
                    )
                )
                ok = False
            else:
                # The values were the same.
                logger.info("'%s' field unchanged -- old and new values were "
                            "the same: %s" % (attribute, new_value))
        return ok
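A simplified, standalone illustration of the rule set_if_falsy enforces: fill an attribute only when its current value is falsy and the new value is non-blank, and flag a problem when a real value would be overwritten by something different. This sketch assumes both values are strings and skips the logging; the names are hypothetical:

class Obj(object):
    pass

def sketch_set_if_falsy(obj, attribute, new_value):
    current_value = (getattr(obj, attribute, None) or "").strip()
    if not current_value and new_value.strip():
        setattr(obj, attribute, new_value)
        return True
    # Report trouble only when the values differ (ignoring whitespace).
    return "".join(current_value.split()) == "".join(new_value.split())

o = Obj()
o.name = ""
sketch_set_if_falsy(o, "name", "Jane Roe")   # sets o.name, returns True
sketch_set_if_falsy(o, "name", "Jane  Roe")  # same value, returns True
sketch_set_if_falsy(o, "name", "John Doe")   # conflicting value, returns False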
Example 8
def find_missing_or_incorrect_docket_numbers(options):
    """Iterate over tax cases to verify which docket numbers are correct.

    :param options:
    :return: Nothing
    """

    should_fix = options["fix"]
    ocs = OpinionCluster.objects.filter(docket__court="tax").exclude(
        sub_opinions__plain_text=""
    )

    logger.info("%s clusters found", ocs.count())

    for oc in ocs:
        logger.info("Analyzing cluster %s", oc.id)
        ops = oc.sub_opinions.all()
        assert ops.count() == 1
        for op in ops:
            logger.warn(
                "Reference url: https://www.courtlistener.com/opinion/%s/x",
                oc.id,
            )
            # Only loop over the first opinion because these
            # cases should only have one, since they were
            # extracted from the tax courts.
            dockets_in_db = oc.docket.docket_number.strip()
            found_dockets = get_tax_docket_numbers(op.plain_text)
            if found_dockets == dockets_in_db:
                if (
                    oc.docket.docket_number.strip() == ""
                    and dockets_in_db == ""
                ):
                    logger.info("No docket numbers found in db or text.")
                else:
                    logger.info("Docket numbers appear correct.")
                continue
            else:
                if dockets_in_db == "":
                    logger.warn(
                        "Docket No(s). found for the first time: %s",
                        found_dockets,
                    )
                elif found_dockets == "":
                    logger.warn(
                        "Docket No(s). not found in text but Docket No(s). %s in db",
                        dockets_in_db,
                    )
                else:
                    logger.warn(
                        "Dockets in db (%s) != (%s) docket parsed from text",
                        dockets_in_db,
                        found_dockets,
                    )
                if should_fix:
                    oc.docket.docket_number = found_dockets
                    oc.docket.save()
    def do_citations(cluster, scdb_info):
        """
        Handle the citation fields.

        :param cluster: The Cluster to be changed.
        :param scdb_info: A dict with the SCDB information.
        """
        fields = {
            'usCite': ("U.S.", Citation.FEDERAL),
            'sctCite': ("S. Ct.", Citation.FEDERAL),
            'ledCite': ("L. Ed.", Citation.FEDERAL),
            'lexisCite': ("U.S. LEXIS", Citation.LEXIS),
        }
        for scdb_field, reporter_info in fields.items():
            try:
                citation_obj = get_citations(
                    scdb_info[scdb_field],
                    html=False,
                    do_post_citation=False,
                    do_defendant=False,
                    disambiguate=False,
                )[0]
            except IndexError:
                logger.warn("Unable to parse citation for: %s",
                            scdb_info[scdb_field])
            else:
                cites = cluster.citations.filter(reporter=reporter_info[0])
                if cites:
                    # Update the existing citation.
                    cite = cites[0]
                    cite.volume = citation_obj.volume
                    cite.reporter = citation_obj.reporter
                    cite.page = citation_obj.page
                    cite.save()
                else:
                    try:
                        # Create a new citation
                        Citation.objects.create(
                            cluster=cluster,
                            volume=citation_obj.volume,
                            reporter=citation_obj.reporter,
                            page=citation_obj.page,
                            type=reporter_info[1],
                        )
                    except IntegrityError:
                        # Violated unique_together constraint. Fine.
                        pass
Example 10
    def do_citations(cluster, scdb_info):
        """
        Handle the citation fields.

        :param cluster: The Cluster to be changed.
        :param scdb_info: A dict with the SCDB information.
        """
        fields = {
            "usCite": ("U.S.", Citation.FEDERAL),
            "sctCite": ("S. Ct.", Citation.FEDERAL),
            "ledCite": ("L. Ed.", Citation.FEDERAL),
            "lexisCite": ("U.S. LEXIS", Citation.LEXIS),
        }
        for scdb_field, reporter_info in fields.items():
            try:
                citation_obj = get_citations(
                    scdb_info[scdb_field],
                    html=False,
                    do_post_citation=False,
                    do_defendant=False,
                    disambiguate=False,
                )[0]
            except IndexError:
                logger.warn("Unable to parse citation for: %s",
                            scdb_info[scdb_field])
            else:
                cites = cluster.citations.filter(reporter=reporter_info[0])
                if cites:
                    # Update the existing citation.
                    cite = cites[0]
                    cite.volume = citation_obj.volume
                    cite.reporter = citation_obj.reporter
                    cite.page = citation_obj.page
                    cite.save()
                else:
                    try:
                        # Create a new citation
                        Citation.objects.create(
                            cluster=cluster,
                            volume=citation_obj.volume,
                            reporter=citation_obj.reporter,
                            page=citation_obj.page,
                            type=reporter_info[1],
                        )
                    except IntegrityError:
                        # Violated unique_together constraint. Fine.
                        pass
    def do_first_pass(options):
        idb_rows = FjcIntegratedDatabase.objects.filter(
            dataset_source=CV_2017,
        ).order_by("pk")
        q = options["queue"]
        throttle = CeleryThrottle(queue_name=q)
        for i, idb_row in enumerate(queryset_generator(idb_rows)):
            # Iterate over all items in the IDB and find them in the Docket
            # table. If they're not there, create a new item.
            if i < options["offset"]:
                continue
            if i >= options["limit"] > 0:
                break

            throttle.maybe_wait()
            # TODO: See conversation in #courtlistener channel from 2019-07-11,
            # in which it appears we matched a criminal case with a civil one.
            # The code below doesn't protect against that, but it should (and I
            # think it does in the `do_second_pass` code, below).
            ds = Docket.objects.filter(
                docket_number_core=idb_row.docket_number,
                court=idb_row.district,
            )
            count = ds.count()
            if count == 0:
                logger.info("%s: Creating new docket for IDB row: %s", i,
                            idb_row)
                create_new_docket_from_idb.apply_async(
                    args=(idb_row.pk, ),
                    queue=q,
                )

            elif count == 1:
                d = ds[0]
                logger.info("%s: Merging Docket %s with IDB row: %s", i, d,
                            idb_row)
                merge_docket_with_idb.apply_async(args=(d.pk, idb_row.pk),
                                                  queue=q)
            elif count > 1:
                logger.warn(
                    "%s: Unable to merge. Got %s dockets for row: %s",
                    i,
                    count,
                    idb_row,
                )
Example 12
def process_citations(data, debug):
    """Walk through the citations and add them one at a time.
    """
    updated_ids = set()
    for index, item in data.iterrows():
        logger.info(
            "\nAdding citation from %s to %s" % (item["citing"], item["cited"])
        )
        try:
            cite = OpinionsCited.objects.get(
                citing_opinion_id=item["citing"],
                cited_opinion_id=item["cited"],
            )
            msg = "Citation already exists. Doing nothing:\n"
        except OpinionsCited.DoesNotExist:
            cite = OpinionsCited(
                citing_opinion_id=item["citing"],
                cited_opinion_id=item["cited"],
            )
            msg = "Created new citation:\n"
            if not debug:
                cite.save()
                updated_ids.add(cite.citing_opinion.pk)
        try:
            logger.info(
                "  %s"
                "    %s: %s\n"
                "    From: %s\n"
                "    To:   %s\n"
                % (msg, cite.pk, cite, cite.citing_opinion, cite.cited_opinion)
            )
        except Opinion.DoesNotExist:
            logger.warn(
                "  Unable to create citation. Underlying Opinion doesn't "
                "exist."
            )

    logger.info("\nUpdating Solr...")
    if not debug:
        add_items_to_solr(updated_ids, "search.Opinion")
    logger.info("Done.")
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)

        idb_rows = FjcIntegratedDatabase.objects.filter(
            dataset_source=CV_2017,
        ).order_by('pk')
        q = options['queue']
        throttle = CeleryThrottle(queue_name=q)
        for i, idb_row in enumerate(queryset_generator(idb_rows)):
            # Iterate over all items in the IDB and find them in the Docket
            # table. If they're not there, create a new item.
            if i < options['offset']:
                continue
            if i >= options['limit'] > 0:
                break

            throttle.maybe_wait()
            docket_number_no_0s = remove_leading_zeros(idb_row.docket_number)
            ds = Docket.objects.filter(
                Q(docket_number_core=idb_row.docket_number)
                | Q(docket_number_core=docket_number_no_0s),
                court=idb_row.district,
            )
            count = ds.count()
            if count == 0:
                logger.info("%s: Creating new docket for IDB row: %s", i,
                            idb_row)
                create_new_docket_from_idb.apply_async(
                    args=(idb_row.pk, ),
                    queue=q,
                )

            elif count == 1:
                d = ds[0]
                logger.info("%s: Merging Docket %s with IDB row: %s", i, d,
                            idb_row)
                merge_docket_with_idb.apply_async(args=(d.pk, idb_row.pk),
                                                  queue=q)
            elif count > 1:
                logger.warn("%s: Unable to merge. Got %s dockets for row: %s",
                            i, count, idb_row)
Example 14
def create_or_update_row(values):
    fjc_filters = [
        {
            "district": values["district"],
            "docket_number": values["docket_number"],
            "origin": values["origin"],
            "date_filed": values["date_filed"],
        },
        # Match on defendant (that'll work better on criminal cases). It can
        # change over time, but if we find a match that's a very strong
        # indicator and we should use it.
        {
            "defendant": values["defendant"]
        },
    ]
    existing_rows = FjcIntegratedDatabase.objects.all()
    for fjc_filter in fjc_filters:
        existing_rows = existing_rows.filter(**fjc_filter)
        existing_row_count = existing_rows.count()
        if existing_row_count == 0:
            fjc_row = FjcIntegratedDatabase.objects.create(**values)
            logger.info("Added row: %s", fjc_row)
            break
        elif existing_row_count == 1:
            existing_rows.update(date_modified=now(), **values)
            fjc_row = existing_rows[0]
            logger.info("Updated row: %s" % fjc_row)
            break
    else:
        # Didn't hit a break b/c too many matches.
        logger.warn(
            "Got %s results when looking up row by filters: %s",
            existing_row_count,
            fjc_filter,
        )
        fjc_row = None

    return fjc_row
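The for/else in create_or_update_row only falls through to the warning when no filter narrowed the matches down to zero or one row. A tiny self-contained sketch of that cascade-of-filters idiom, using lists of dicts instead of a queryset (all names here are made up):

def find_unique(rows, filters):
    candidates = rows
    for f in filters:
        candidates = [r for r in candidates
                      if all(r.get(k) == v for k, v in f.items())]
        if len(candidates) <= 1:
            break
    else:
        # Every filter still matched more than one row.
        return None
    return candidates[0] if candidates else None

rows = [{"a": 1, "b": 2}, {"a": 1, "b": 3}]
find_unique(rows, [{"a": 1}, {"b": 3}])  # -> {"a": 1, "b": 3}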
    def do_first_pass(options):
        idb_rows = FjcIntegratedDatabase.objects.filter(
            dataset_source=CV_2017,
        ).order_by('pk')
        q = options['queue']
        throttle = CeleryThrottle(queue_name=q)
        for i, idb_row in enumerate(queryset_generator(idb_rows)):
            # Iterate over all items in the IDB and find them in the Docket
            # table. If they're not there, create a new item.
            if i < options['offset']:
                continue
            if i >= options['limit'] > 0:
                break

            throttle.maybe_wait()
            ds = Docket.objects.filter(
                docket_number_core=idb_row.docket_number,
                court=idb_row.district,
            )
            count = ds.count()
            if count == 0:
                logger.info("%s: Creating new docket for IDB row: %s",
                            i, idb_row)
                create_new_docket_from_idb.apply_async(
                    args=(idb_row.pk,),
                    queue=q,
                )

            elif count == 1:
                d = ds[0]
                logger.info("%s: Merging Docket %s with IDB row: %s",
                            i, d, idb_row)
                merge_docket_with_idb.apply_async(args=(d.pk, idb_row.pk),
                                                  queue=q)
            elif count > 1:
                logger.warn("%s: Unable to merge. Got %s dockets for row: %s",
                            i, count, idb_row)
def process_citations(data, debug):
    """Walk through the citations and add them one at a time.
    """
    updated_ids = set()
    for index, item in data.iterrows():
        logger.info("\nAdding citation from %s to %s" % (item['citing'],
                                                         item['cited']))
        try:
            cite = OpinionsCited.objects.get(
                citing_opinion_id=item['citing'],
                cited_opinion_id=item['cited'],
            )
            msg = "Citation already exists. Doing nothing:\n"
        except OpinionsCited.DoesNotExist:
            cite = OpinionsCited(citing_opinion_id=item['citing'],
                                 cited_opinion_id=item['cited'])
            msg = "Created new citation:\n"
            if not debug:
                cite.save()
                updated_ids.add(cite.citing_opinion.pk)
        try:
            logger.info(
                "  %s"
                "    %s: %s\n"
                "    From: %s\n"
                "    To:   %s\n" % (msg, cite.pk, cite, cite.citing_opinion,
                                    cite.cited_opinion)
            )
        except Opinion.DoesNotExist:
            logger.warn("  Unable to create citation. Underlying Opinion doesn't "
                        "exist.")

    logger.info("\nUpdating Solr...")
    if not debug:
        add_items_to_solr(updated_ids, 'search.Opinion')
    logger.info("Done.")
Example 17
    def map_judges_to_photos(self):
        """Identify which of the judges in the DB have photos.

        We iterate over the entire collection of judges, identifying which have
        photos. We could instead iterate over the photos, but that increases
        the risk of duplicate issues.
        """
        # Create a dict of judge paths, mapping paths to empty lists.
        judge_paths = os.listdir(os.path.join(judge_root, "orig"))
        judge_map = {}
        for path in judge_paths:
            judge_map[path] = []

        # Iterate over the people, attempting to look them up in the list
        people = Person.objects.filter(is_alias_of=None)
        for person in people:
            for name in self.make_slugs(person):
                if name in judge_map:
                    # If there's a hit, add the path to the dict of judge paths.
                    judge_map[name].append(person)
                    break

        # After iterating, set all people to not have photos.
        if not self.debug:
            people.update(has_photo=False)

        found = 0
        missed = 0
        multi = 0
        for path, people in judge_map.items():
            if len(people) == 0:
                logger.warn("Did not find a judge for %s" % path)
                missed += 1
            if len(people) == 1:
                person = people[0]
                found += 1
                if not self.debug:
                    logger.info("Updating judge %s" % person)
                    person.has_photo = True
                    person.save()
            if len(people) > 1:
                logger.warn("Found more than one match for %s:" % path)
                for person in people:
                    logger.warn("Found: %s - %s" % (
                        person,
                        granular_date(
                            person,
                            "date_dob",
                            iso=True,
                        ),
                    ))
                multi += 1

        logger.info("\n\n%s Matches\n%s Missed\n%s Multiple results" %
                    (found, missed, multi))
    def map_judges_to_photos(self):
        """Identify which of the judges in the DB have photos.

        We iterate over the entire collection of judges, identifying which have
        photos. We could instead iterate over the photos, but that increases
        the risk of duplicate issues.
        """
        # Create a dict of judge paths, mapping paths to empty lists.
        judge_paths = os.listdir(os.path.join(judge_root, 'orig'))
        judge_map = {}
        for path in judge_paths:
            judge_map[path] = []

        # Iterate over the people, attempting to look them up in the list
        people = Person.objects.filter(is_alias_of=None)
        for person in people:
            for name in self.make_slugs(person):
                if name in judge_map:
                    # If there's a hit, add the path to the dict of judge paths.
                    judge_map[name].append(person)
                    break

        # After iterating, set all people to not have photos.
        if not self.debug:
            people.update(has_photo=False)

        found = 0
        missed = 0
        multi = 0
        for path, people in judge_map.items():
            if len(people) == 0:
                logger.warn("Did not find a judge for %s" % path)
                missed += 1
            if len(people) == 1:
                person = people[0]
                found += 1
                if not self.debug:
                    logger.info("Updating judge %s" % person)
                    person.has_photo = True
                    person.save()
            if len(people) > 1:
                logger.warn("Found more than one match for %s:" % path)
                for person in people:
                    logger.warn("Found: %s - %s" % (person, granular_date(
                        person,
                        'date_dob',
                        iso=True,
                    )))
                multi += 1

        logger.info("\n\n%s Matches\n%s Missed\n%s Multiple results" %
                    (found, missed, multi))
Example 19
    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split(".")[-1].split("_")[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i, item in enumerate(site):
                msg, r = get_binary_content(
                    item["download_urls"],
                    site.cookies,
                    site._get_adapter_instance(),
                    method=site.method,
                )
                if msg:
                    logger.warn(msg)
                    ErrorLog(log_level="WARNING", court=court,
                             message=msg).save()
                    continue

                content = site.cleanup_content(r.content)

                current_date = item["case_dates"]
                try:
                    next_date = site[i + 1]["case_dates"]
                except IndexError:
                    next_date = None

                # request.content is sometimes a str, sometimes unicode, so
                # force it all to be bytes, pleasing hashlib.
                sha1_hash = sha1(force_bytes(content))
                onwards = dup_checker.press_on(
                    Audio,
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by="sha1",
                )
                if dup_checker.emulate_break:
                    break

                if onwards:
                    # Not a duplicate, carry on
                    logger.info("Adding new document found at: %s" %
                                item["download_urls"].encode("utf-8"))
                    dup_checker.reset()

                    docket, audio_file, error = self.make_objects(
                        item,
                        court,
                        sha1_hash,
                        content,
                    )

                    if error:
                        download_error = True
                        continue

                    self.save_everything(
                        items={
                            "docket": docket,
                            "audio_file": audio_file,
                        },
                        index=False,
                    )
                    process_audio_file.apply_async(
                        (audio_file.pk, ), countdown=random.randint(0, 3600))

                    logger.info(
                        "Successfully added audio file {pk}: {name}".format(
                            pk=audio_file.pk,
                            name=item["case_names"].encode("utf-8"),
                        ))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled oral arguments." %
                        site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)
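The scrapers above deduplicate downloads by hashing the cleaned-up content before looking it up. A minimal sketch of that idea with hashlib, independent of the DupChecker class (the function names are hypothetical):

import hashlib

def is_duplicate(content, seen_hashes):
    if not isinstance(content, bytes):
        content = content.encode("utf-8")  # hashlib wants bytes
    digest = hashlib.sha1(content).hexdigest()
    if digest in seen_hashes:
        return True
    seen_hashes.add(digest)
    return False

seen = set()
is_duplicate("some opinion text", seen)  # False the first time
is_duplicate("some opinion text", seen)  # True on the repeat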
    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i, item in enumerate(site):
                msg, r = get_binary_content(item['download_urls'],
                                            site.cookies,
                                            site._get_adapter_instance(),
                                            method=site.method)
                if msg:
                    logger.warn(msg)
                    ErrorLog(log_level='WARNING', court=court,
                             message=msg).save()
                    continue

                content = site.cleanup_content(r.content)

                current_date = item['case_dates']
                try:
                    next_date = site[i + 1]['case_dates']
                except IndexError:
                    next_date = None

                # request.content is sometimes a str, sometimes unicode, so
                # force it all to be bytes, pleasing hashlib.
                sha1_hash = sha1(force_bytes(content))
                if (court_str == 'nev'
                        and item['precedential_statuses'] == 'Unpublished'):
                    # Nevada's non-precedential cases have different SHA1
                    # sums every time.
                    lookup_params = {
                        'lookup_value': item['download_urls'],
                        'lookup_by': 'download_url'
                    }
                else:
                    lookup_params = {
                        'lookup_value': sha1_hash,
                        'lookup_by': 'sha1'
                    }

                onwards = dup_checker.press_on(Opinion, current_date,
                                               next_date, **lookup_params)
                if dup_checker.emulate_break:
                    break

                if onwards:
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' %
                                item['download_urls'].encode('utf-8'))
                    dup_checker.reset()

                    docket, opinion, cluster, citations, error = self.make_objects(
                        item, court, sha1_hash, content)

                    if error:
                        download_error = True
                        continue

                    self.save_everything(
                        items={
                            'docket': docket,
                            'opinion': opinion,
                            'cluster': cluster,
                            'citations': citations,
                        },
                        index=False,
                    )
                    extract_doc_content.delay(
                        opinion.pk,
                        do_ocr=True,
                        citation_jitter=True,
                    )

                    logger.info("Successfully added doc {pk}: {name}".format(
                        pk=opinion.pk,
                        name=item['case_names'].encode('utf-8'),
                    ))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled opinions." % site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)
Example 21
    def do_federal_citations(self, cluster, scdb_info):
        """
        Handle the federal_cite fields differently, since they may have the
        values in any order.

        :param cluster: The Cluster to be changed.
        :param scdb_info: A dict with the SCDB information.
        :return: save: A boolean indicating whether the item should be saved.
        """
        save = True
        us_done, sct_done, led_done = False, False, False
        available_fields = []
        error = False
        for field in [
                'federal_cite_one', 'federal_cite_two', 'federal_cite_three'
        ]:
            # Update the value in place (i.e., replace the U.S. citation with a
            # U.S. citation). Identify available fields.
            value = getattr(cluster, field).strip()
            if not value:
                available_fields.append(field)
                continue

            if "U.S." in value:
                error = self.set_if_falsy(cluster, field, scdb_info['usCite'])
                us_done = True
            elif "S. Ct." in value:
                error = self.set_if_falsy(cluster, field, scdb_info['sctCite'])
                sct_done = True
            elif "L. Ed." in value:
                error = self.set_if_falsy(cluster, field, scdb_info['ledCite'])
                led_done = True
            else:
                logger.warn("      WARNING: Fell through search for citation.")
                save = False
        if error:
            save = False

        num_undone_fields = len(
            [f for f in [us_done, sct_done, led_done] if f is False])
        if num_undone_fields > len(available_fields):
            logger.warn("WARNING: More values were found than there were "
                        "slots to put them in. Time to create "
                        "federal_cite_four?")
            save = False
        else:
            # Save undone values into available fields. Any value that wasn't
            # updated above gets slotted into the fields that remain.
            for field in available_fields:
                if not us_done:
                    us_done = True
                    if scdb_info['usCite']:
                        self.set_if_falsy(cluster, field, scdb_info['usCite'])
                        # Continue if the value got set. Otherwise, fall
                        # through and let the next value fill the available
                        # field.
                        continue
                if not sct_done:
                    sct_done = True
                    if scdb_info['sctCite']:
                        self.set_if_falsy(cluster, field, scdb_info['sctCite'])
                        continue
                if not led_done:
                    led_done = True
                    if scdb_info['ledCite']:
                        self.set_if_falsy(cluster, field, scdb_info['ledCite'])
                        continue

        return save
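A compact, hypothetical illustration of the slot-filling step above: values that were not matched in place get written, in order, into whichever citation fields were still blank:

def fill_available_slots(available_fields, pending_values):
    assignments = {}
    for field in available_fields:
        while pending_values:
            value = pending_values.pop(0)
            if value:  # skip blanks; let the next value take the slot
                assignments[field] = value
                break
    return assignments

fill_available_slots(
    ['federal_cite_two', 'federal_cite_three'],
    ['', '137 S. Ct. 855', '197 L. Ed. 2d 107'],
)
# -> {'federal_cite_two': '137 S. Ct. 855',
#     'federal_cite_three': '197 L. Ed. 2d 107'}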
Example 22
    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split(".")[-1].split("_")[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        if dup_checker.abort_by_url_hash(site.url, site.hash):
            return

        if site.cookies:
            logger.info("Using cookies: %s" % site.cookies)
        for i, item in enumerate(site):
            msg, r = get_binary_content(
                item["download_urls"],
                site.cookies,
                site._get_adapter_instance(),
                method=site.method,
            )
            if msg:
                logger.warn(msg)
                ErrorLog(log_level="WARNING", court=court, message=msg).save()
                continue

            content = site.cleanup_content(r.content)

            current_date = item["case_dates"]
            try:
                next_date = site[i + 1]["case_dates"]
            except IndexError:
                next_date = None

            # request.content is sometimes a str, sometimes unicode, so
            # force it all to be bytes, pleasing hashlib.
            sha1_hash = sha1(force_bytes(content))
            if (court_str == "nev"
                    and item["precedential_statuses"] == "Unpublished"):
                # Nevada's non-precedential cases have different SHA1 sums
                # every time.
                lookup_params = {
                    "lookup_value": item["download_urls"],
                    "lookup_by": "download_url",
                }
            else:
                lookup_params = {
                    "lookup_value": sha1_hash,
                    "lookup_by": "sha1",
                }

            proceed = dup_checker.press_on(Opinion, current_date, next_date,
                                           **lookup_params)
            if dup_checker.emulate_break:
                break
            if not proceed:
                continue

            # Not a duplicate, carry on
            logger.info("Adding new document found at: %s" %
                        item["download_urls"].encode("utf-8"))
            dup_checker.reset()

            docket, opinion, cluster, citations, error = self.make_objects(
                item, court, sha1_hash, content)

            if error:
                download_error = True
                continue

            self.save_everything(
                items={
                    "docket": docket,
                    "opinion": opinion,
                    "cluster": cluster,
                    "citations": citations,
                },
                index=False,
            )
            extract_doc_content.delay(
                opinion.pk,
                do_ocr=True,
                citation_jitter=True,
            )

            logger.info("Successfully added doc {pk}: {name}".format(
                pk=opinion.pk,
                name=item["case_names"].encode("utf-8"),
            ))

        # Update the hash if everything finishes properly.
        logger.info("%s: Successfully crawled opinions." % site.court_id)
        if not download_error and not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i, item in enumerate(site):
                msg, r = get_binary_content(
                    item['download_urls'],
                    site.cookies,
                    site._get_adapter_instance(),
                    method=site.method
                )
                if msg:
                    logger.warn(msg)
                    ErrorLog(log_level='WARNING',
                             court=court,
                             message=msg).save()
                    continue

                content = site.cleanup_content(r.content)

                current_date = item['case_dates']
                try:
                    next_date = site[i + 1]['case_dates']
                except IndexError:
                    next_date = None

                # request.content is sometimes a str, sometimes unicode, so
                # force it all to be bytes, pleasing hashlib.
                sha1_hash = hashlib.sha1(force_bytes(content)).hexdigest()
                onwards = dup_checker.press_on(
                    Audio,
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by='sha1'
                )
                if dup_checker.emulate_break:
                    break

                if onwards:
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' %
                                item['download_urls'].encode('utf-8'))
                    dup_checker.reset()

                    docket, audio_file, error = self.make_objects(
                        item, court, sha1_hash, content,
                    )

                    if error:
                        download_error = True
                        continue

                    self.save_everything(
                        items={
                            'docket': docket,
                            'audio_file': audio_file,
                        },
                        index=False,
                    )
                    process_audio_file.apply_async(
                        (audio_file.pk,),
                        countdown=random.randint(0, 3600)
                    )

                    logger.info(
                        "Successfully added audio file {pk}: {name}".format(
                            pk=audio_file.pk,
                            name=item['case_names'].encode('utf-8')
                        )
                    )

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled oral arguments." %
                        site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)
def download_documents(options):
    """We've got good values in the new columns, so just need to look those up,
    and get the documents from PACER.
    """
    f = open(options['input_file'], 'r')
    dialect = csv.Sniffer().sniff(f.read(1024))
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect)
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q,
                              min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME,
                           password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing row %s: %s", i, row)

        docket_number = row['cl_d_docket_number'] or \
            row['cl_d_docket_number (student)'] or \
            None

        if not docket_number:
            logger.warn("No docket number found for row: %s", i)
            continue
        court = Court.objects.get(fjc_court_id=row['AO ID'].rjust(2, '0'),
                                  jurisdiction=Court.FEDERAL_DISTRICT)

        try:
            d = Docket.objects.get(docket_number=docket_number, court=court)
        except Docket.MultipleObjectsReturned:
            logger.warn("Multiple objects returned for row: %s", i)
            continue
        except Docket.DoesNotExist:
            logger.warn("Could not find docket for row: %s", i)
            continue

        # Got the docket, now get the documents from it, tag & OCR them.
        document_date = datetime.strptime(row['Date'], '%m/%d/%Y').date()
        des = d.docket_entries.filter(date_filed=document_date)
        count = des.count()
        if count == 0:
            logger.warn("No docket entries found for row: %s", i)
            continue
        elif des.count() == 1:
            good_des = [des[0]]
        else:
            # More than one item. Apply filtering rules.
            good_des = filter_des(des)

        # We've got our des, now download them.
        for de in good_des:
            rds = de.recap_documents.filter(
                document_type=RECAPDocument.PACER_DOCUMENT)
            for rd in rds:
                if not rd.pacer_doc_id:
                    logger.warn("Unable to get pacer_doc_id for item with "
                                "rd_pk: %s. Restricted document?", rd.pk)
                    continue
                if options['task'] == 'add_extra_tags':
                    # Wherein I belatedly realize we need a tag specifically
                    # for this part of the project.
                    add_tags(rd, TAG_NAME_OPINIONS)
                else:
                    # Otherwise, do the normal download thing.
                    chain(
                        get_pacer_doc_by_rd.s(
                            rd.pk, session.cookies, tag=TAG_NAME).set(queue=q),
                        extract_recap_pdf.si(rd.pk).set(queue=q),
                        add_items_to_solr.si(
                            [rd.pk], 'search.RECAPDocument').set(queue=q),
                    ).apply_async()
    f.close()
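download_documents sniffs the CSV dialect before rewinding the file and reading rows. A standalone sketch of that open/sniff/seek/DictReader sequence (the file path and column name are placeholders):

import csv

def read_rows(path):
    with open(path, 'r') as f:
        dialect = csv.Sniffer().sniff(f.read(1024))  # guess delimiter/quoting
        f.seek(0)                                    # rewind after sampling
        for row in csv.DictReader(f, dialect=dialect):
            yield row

# for row in read_rows('input.csv'):
#     print(row['Date'])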
Example 25
def find_missing_or_incorrect_citations(options):
    """Iterate over tax cases to verify which citations are correctly parsed

    This code should pull back all the cases with plaintext tax courts to parse.
    Iterate over those cases extracting the citation if any

    :param options:
    :return:
    """
    should_fix = options["fix"]

    ocs = OpinionCluster.objects.filter(docket__court="tax").exclude(
        sub_opinions__plain_text=""
    )
    logger.info("%s clusters found", ocs.count())

    for oc in ocs:
        logger.warn(
            "Reference url: https://www.courtlistener.com/opinion/%s/x", oc.id,
        )
        cites = oc.citations.all()

        logger.info("Found %s cite(s) for case in db", cites.count())

        if cites.count() > 0:
            if should_fix:
                logger.warn("Deleting cites in cluster %s", oc.id)
                cites.delete()

        ops = oc.sub_opinions.all()
        assert ops.count() == 1
        for op in ops:
            # Only loop over the first opinion because
            # these cases should only have one opinion
            found_cite = find_tax_court_citation(op.plain_text)
            if found_cite is not None:
                found_cite_str = found_cite.base_citation()
                logger.info(
                    "Found citation in plain text as %s", found_cite_str
                )
                if should_fix:
                    logger.warn("Creating citation: %s", found_cite_str)
                    Citation.objects.create(
                        volume=found_cite.volume,
                        reporter=found_cite.reporter,
                        page=found_cite.page,
                        type=found_cite.type,
                        cluster_id=oc.id,
                    )
                else:
                    if cites.count() > 0:
                        for cite in cites:
                            if str(cite) != found_cite_str:
                                logger.warn(
                                    "Have (%s), Expect (%s)",
                                    cite,
                                    found_cite_str,
                                )
                    else:
                        logger.warn("Add %s to db", found_cite_str)

            else:
                if cites.count() > 0:
                    for cite in cites:
                        logger.warn("Have (%s), Expect None", cite)
                        logger.warn("%s should be removed", cite)
                else:
                    logger.info("No citation in db or text: %s", oc.id)
    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i, item in enumerate(site):
                msg, r = get_binary_content(
                    item['download_urls'],
                    site.cookies,
                    site._get_adapter_instance(),
                    method=site.method
                )
                if msg:
                    logger.warn(msg)
                    ErrorLog(log_level='WARNING',
                             court=court,
                             message=msg).save()
                    continue

                content = site.cleanup_content(r.content)

                current_date = item['case_dates']
                try:
                    next_date = site[i + 1]['case_dates']
                except IndexError:
                    next_date = None

                # request.content is sometimes a str, sometimes unicode, so
                # force it all to be bytes, pleasing hashlib.
                sha1_hash = hashlib.sha1(force_bytes(content)).hexdigest()
                if (court_str == 'nev' and
                        item['precedential_statuses'] == 'Unpublished'):
                    # Nevada's non-precedential cases have different SHA1
                    # sums every time.
                    lookup_params = {'lookup_value': item['download_urls'],
                                     'lookup_by': 'download_url'}
                else:
                    lookup_params = {'lookup_value': sha1_hash,
                                     'lookup_by': 'sha1'}

                onwards = dup_checker.press_on(Opinion, current_date, next_date,
                                               **lookup_params)
                if dup_checker.emulate_break:
                    break

                if onwards:
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' %
                                item['download_urls'].encode('utf-8'))
                    dup_checker.reset()

                    docket, opinion, cluster, citations, error = self.make_objects(
                        item, court, sha1_hash, content
                    )

                    if error:
                        download_error = True
                        continue

                    self.save_everything(
                        items={
                            'docket': docket,
                            'opinion': opinion,
                            'cluster': cluster,
                            'citations': citations,
                        },
                        index=False
                    )
                    extract_doc_content.delay(
                        opinion.pk, do_ocr=True,
                        citation_jitter=True,
                    )

                    logger.info("Successfully added doc {pk}: {name}".format(
                        pk=opinion.pk,
                        name=item['case_names'].encode('utf-8'),
                    ))

            logger.info("%s: Successfully crawled opinions." % site.court_id)
            if not download_error and not full_crawl:
                # Only update the site hash when the crawl completed without
                # download errors and this was not a full crawl.
                dup_checker.update_site_hash(site.hash)
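
# A standalone illustration of the duplicate check used in scrape_court above:
# the downloaded bytes are hashed and the digest becomes the lookup key
# (except for Nevada's unpublished cases, where the download URL is used
# instead because the file bytes change on every request). Only the standard
# library is used here; force_bytes() in the real code does the str-to-bytes
# coercion shown below.
import hashlib


def content_fingerprint(content):
    """Return the SHA1 hex digest used for duplicate detection."""
    if isinstance(content, str):
        # hashlib needs bytes, not str.
        content = content.encode("utf-8")
    return hashlib.sha1(content).hexdigest()


# Two byte-identical downloads produce the same fingerprint, so the second
# one is treated as a duplicate by the sha1 lookup.
assert content_fingerprint(b"opinion text") == content_fingerprint("opinion text")
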
def download_documents(options):
    """We've got good values in the new columns, so just need to look those up,
    and get the documents from PACER.
    """
    f = open(options["input_file"], "r")
    dialect = csv.Sniffer().sniff(f.read(1024))
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect)
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q, min_items=options["queue_length"])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing row %s: %s", i, row)

        docket_number = (row["cl_d_docket_number"]
                         or row["cl_d_docket_number (student)"] or None)

        if not docket_number:
            logger.warn("No docket number found for row: %s", i)
            continue
        court = Court.objects.get(
            fjc_court_id=row["AO ID"].rjust(2, "0"),
            jurisdiction=Court.FEDERAL_DISTRICT,
        )

        try:
            d = Docket.objects.get(docket_number=docket_number, court=court)
        except Docket.MultipleObjectsReturned:
            logger.warn("Multiple objects returned for row: %s", i)
            continue
        except Docket.DoesNotExist:
            logger.warn("Could not find docket for row: %s", i)
            continue

        # Got the docket, now get the documents from it, tag & OCR them.
        document_date = datetime.strptime(row["Date"], "%m/%d/%Y").date()
        des = d.docket_entries.filter(date_filed=document_date)
        count = des.count()
        if count == 0:
            logger.warn("No docket entries found for row: %s", i)
            continue
        elif count == 1:
            good_des = [des[0]]
        else:
            # More than one item. Apply filtering rules.
            good_des = filter_des(des)

        # We've got our des, now download them.
        for de in good_des:
            rds = de.recap_documents.filter(
                document_type=RECAPDocument.PACER_DOCUMENT)
            for rd in rds:
                if not rd.pacer_doc_id:
                    logger.warn(
                        "Unable to get pacer_doc_id for item with "
                        "rd_pk: %s. Restricted document?",
                        rd.pk,
                    )
                    continue
                if options["task"] == "add_extra_tags":
                    # Wherein I belatedly realize we need a tag specifically
                    # for this part of the project.
                    add_tags(rd, TAG_NAME_OPINIONS)
                else:
                    # Otherwise, do the normal download thing.
                    chain(
                        get_pacer_doc_by_rd.s(rd.pk,
                                              session.cookies,
                                              tag=TAG_NAME).set(queue=q),
                        extract_recap_pdf.si(rd.pk).set(queue=q),
                        add_items_to_solr.si(
                            [rd.pk], "search.RECAPDocument").set(queue=q),
                    ).apply_async()
    f.close()
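
# filter_des() is called above when a docket has more than one entry on the
# document date, but it is not included in this example. A hedged sketch of
# the kind of filtering it might apply, assuming we only keep entries whose
# descriptions look like opinions or orders (the keyword list and the
# ``description`` attribute are assumptions for illustration, not the
# project's actual rules):
OPINION_KEYWORDS = ("opinion", "order", "memorandum", "judgment")


def filter_des(des):
    """Return the docket entries whose descriptions look like opinions."""
    good_des = []
    for de in des:
        description = (de.description or "").lower()
        if any(keyword in description for keyword in OPINION_KEYWORDS):
            good_des.append(de)
    return good_des
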
def lookup_row(row):
    """Lookup the row provided in the FJC DB.

    :param row: A row dict as pulled from the CSV using the csv DictReader
    :returns int: The PK of the row that matched.
    """
    try:
        plaintiff, defendant = row['Case Name'].lower().split(' v. ', 1)
    except ValueError:
        # With maxsplit=1, the only failure mode is a missing ' v. '
        # separator, which leaves too few values to unpack.
        logger.warn("Unable to find ' v. ' in case name.")
        return
    opinion_date = datetime.strptime(row['Date'], '%m/%d/%Y')
    orig_query = FjcIntegratedDatabase.objects.filter(
        # All of these are civil.
        dataset_source=CV_2017,
        # Ensure the correct court.
        district__fjc_court_id=row['AO ID'],
        # The docket must have been filed *before* the date of the opinion.
        date_filed__lte=opinion_date,
        # But not more than five years prior to the opinion.
        date_filed__gte=opinion_date - timedelta(days=365 * 5),
    ).exclude(
        # FJC Ids are duplicated across bankruptcy and district. Since we only
        # know the FJC court ID, just exclude bankruptcy cases as a rule. That
        # will ensure we limit ourselves to the correct jurisdiction.
        district__jurisdiction=Court.FEDERAL_BANKRUPTCY,
    ).order_by('-date_filed')

    # Start with the strictest, then broaden when you fail. Truncate at 30
    # chars (that's all the field can contain).
    filter_tuples = [(
        # Try an exact match on case name.
        (),
        {
            'plaintiff__iexact': plaintiff[:30],
            'defendant__iexact': defendant[:30],
        }
    ), (
        # Try a starts with match on case name.
        (),
        {
            'plaintiff__istartswith': plaintiff[:30],
            'defendant__istartswith': defendant[:30],
        }
    ), (
        # Try to find a match that contains the first three words from the
        # plaintiff and defendant (in any order). Note Q objects are args, not
        # kwargs, hence different format here.
        (make_party_q(defendant, 'defendant', slice(None, 3)),
         make_party_q(plaintiff, 'plaintiff', slice(None, 3))),
        {},
    ), (
        # Broaden. Try just the first word from plaintiff & defendant matching.
        (make_party_q(defendant, 'defendant', slice(None, 1)),
         make_party_q(plaintiff, 'plaintiff', slice(None, 1))),
        {},
    ), (
        # Explore. Try the second word of the plaintiff instead. It's often a
        # last name and worth a try.
        (make_party_q(plaintiff, 'plaintiff', slice(1, 2)),
         make_party_q(defendant, 'defendant', slice(None, 1))),
        {},
    )]

    for args, kwargs in filter_tuples:
        results = orig_query.filter(*args, **kwargs)
        count = results.count()
        if count == 0:
            logger.warn("Unable to find result (args: %s, kwargs: %s). "
                        "Broadening if possible." % (args, kwargs))
            continue
        if count == 1:
            logger.info("Got one result. Bingo (args: %s, kwargs: %s)." %
                        (args, kwargs))
            return results[0]
        elif 5 > count > 1:
            logger.info("Got %s results. Choosing closest to document date." %
                        count)
            return results[0]
        else:
            logger.warn("Got too many results. Cannot identify correct case "
                        "(args: %s, kwargs: %s)." % (args, kwargs))
            return
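
# make_party_q() is used above to broaden the case-name match but is not shown
# in this example. A minimal sketch of what such a helper might look like,
# assuming it ANDs an icontains lookup for each selected word of the party
# name (the field lookups and slicing behaviour are assumptions, not the
# project's actual implementation):
from django.db.models import Q


def make_party_q(party_name, field, word_slice):
    """Build a Q object matching selected words of a party name.

    :param party_name: The lower-cased party name, e.g. "acme corp".
    :param field: The model field to match ("plaintiff" or "defendant").
    :param word_slice: A slice choosing which words of the name to use.
    """
    q = Q()
    for word in party_name.split()[word_slice]:
        q &= Q(**{"%s__icontains" % field: word})
    return q
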
def lookup_row(row):
    """Lookup the row provided in the FJC DB.

    :param row: A row dict as pulled from the CSV using the csv DictReader
    :returns int: The PK of the row that matched.
    """
    try:
        plaintiff, defendant = row["Case Name"].lower().split(" v. ", 1)
    except ValueError:
        # With maxsplit=1, the only failure mode is a missing " v. "
        # separator, which leaves too few values to unpack.
        logger.warn("Unable to find ' v. ' in case name.")
        return
    opinion_date = datetime.strptime(row["Date"], "%m/%d/%Y")
    orig_query = (
        FjcIntegratedDatabase.objects.filter(
            # All of these are civil.
            dataset_source=CV_2017,
            # Ensure the correct court.
            district__fjc_court_id=row["AO ID"],
            # The docket must have been filed *before* the date of the opinion.
            date_filed__lte=opinion_date,
            # But not more than five years prior to the opinion.
            date_filed__gte=opinion_date - timedelta(days=365 * 5),
        ).exclude(
            # FJC Ids are duplicated across bankruptcy and district. Since we only
            # know the FJC court ID, just exclude bankruptcy cases as a rule. That
            # will ensure we limit ourselves to the correct jurisdiction.
            district__jurisdiction=Court.FEDERAL_BANKRUPTCY, ).order_by(
                "-date_filed"))

    # Start with the strictest, then broaden when you fail. Truncate at 30
    # chars (that's all the field can contain).
    filter_tuples = [
        (
            # Try an exact match on case name.
            (),
            {
                "plaintiff__iexact": plaintiff[:30],
                "defendant__iexact": defendant[:30],
            },
        ),
        (
            # Try a starts with match on case name.
            (),
            {
                "plaintiff__istartswith": plaintiff[:30],
                "defendant__istartswith": defendant[:30],
            },
        ),
        (
            # Try to find a match that contains the first three words from the
            # plaintiff and defendant (in any order). Note Q objects are args, not
            # kwargs, hence different format here.
            (
                make_party_q(defendant, "defendant", slice(None, 3)),
                make_party_q(plaintiff, "plaintiff", slice(None, 3)),
            ),
            {},
        ),
        (
            # Broaden. Try just the first word from plaintiff & defendant matching.
            (
                make_party_q(defendant, "defendant", slice(None, 1)),
                make_party_q(plaintiff, "plaintiff", slice(None, 1)),
            ),
            {},
        ),
        (
            # Explore. Try the second word of the plaintiff instead. It's often a
            # last name and worth a try.
            (
                make_party_q(plaintiff, "plaintiff", slice(1, 2)),
                make_party_q(defendant, "defendant", slice(None, 1)),
            ),
            {},
        ),
    ]

    for args, kwargs in filter_tuples:
        results = orig_query.filter(*args, **kwargs)
        count = results.count()
        if count == 0:
            logger.warn("Unable to find result (args: %s, kwargs: %s). "
                        "Broadening if possible." % (args, kwargs))
            continue
        if count == 1:
            logger.info("Got one result. Bingo (args: %s, kwargs: %s)." %
                        (args, kwargs))
            return results[0]
        elif 5 > count > 1:
            logger.info("Got %s results. Choosing closest to document date." %
                        count)
            return results[0]
        else:
            logger.warn("Got too many results. Cannot identify correct case "
                        "(args: %s, kwargs: %s)." % (args, kwargs))
            return
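
# A minimal usage sketch for lookup_row(): the CSV only needs the three
# columns the function reads. The row values below are made up purely for
# illustration.
example_row = {
    "Case Name": "Acme Corp v. Widget Co",
    "Date": "03/15/2016",
    "AO ID": "02",
}
match = lookup_row(example_row)
if match is None:
    print("No unambiguous FJC match for this row.")
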
Example no. 30
def get_and_save_free_document_reports(options: OptionsType) -> None:
    """Query the Free Doc Reports on PACER and get a list of all the free
    documents. Do not download those items, as that step is done later. For now
    just get the list.

    Note that this uses synchronous celery chains. A previous version was more
    complex and did not use synchronous chains. Unfortunately in Celery 4.2.0,
    or more accurately in redis-py 3.x.x, doing it that way failed nearly every
    time.

    This is a simpler version, though a slower one, but it should get the job
    done.
    """
    # Kill any *old* logs that report they're in progress. (They've failed.)
    three_hrs_ago = now() - timedelta(hours=3)
    PACERFreeDocumentLog.objects.filter(
        date_started__lt=three_hrs_ago,
        status=PACERFreeDocumentLog.SCRAPE_IN_PROGRESS,
    ).update(status=PACERFreeDocumentLog.SCRAPE_FAILED)

    cl_court_ids = (
        Court.federal_courts.district_pacer_courts()
        .filter(in_use=True, end_date=None)
        .exclude(pk__in=["casb", "gub", "ilnb", "innb", "miwb", "ohsb", "prb"])
        .values_list("pk", flat=True)
    )
    pacer_court_ids = [map_cl_to_pacer_id(v) for v in cl_court_ids]
    today = now()
    for pacer_court_id in pacer_court_ids:
        while True:
            next_start_d, next_end_d = get_next_date_range(pacer_court_id)
            if next_end_d is None:
                logger.warn(f"Free opinion scraper for {pacer_court_id} still "
                            f"in progress.")
                break

            logger.info(
                "Attempting to get latest document references for "
                "%s between %s and %s",
                pacer_court_id,
                next_start_d,
                next_end_d,
            )
            mark_court_in_progress(pacer_court_id, next_end_d)
            try:
                status = get_and_save_free_document_report(
                    pacer_court_id, next_start_d, next_end_d)
            except (
                    RequestException,
                    ReadTimeoutError,
                    IndexError,
                    TypeError,
                    PacerLoginException,
            ) as exc:
                if isinstance(exc, (RequestException, ReadTimeoutError)):
                    reason = "network error."
                elif isinstance(exc, IndexError):
                    reason = "PACER 6.3 bug."
                elif isinstance(exc, TypeError):
                    reason = "failing PACER website."
                elif isinstance(exc, PacerLoginException):
                    reason = "PACER login issue."
                else:
                    reason = "unknown reason."
                logger.error(f"Failed to get free document references for "
                             f"{pacer_court_id} between {next_start_d} and "
                             f"{next_end_d} due to {reason}.")
                mark_court_done_on_date(
                    PACERFreeDocumentLog.SCRAPE_FAILED,
                    pacer_court_id,
                    next_end_d,
                )
                break

            mark_court_done_on_date(status, pacer_court_id, next_end_d)

            if status == PACERFreeDocumentLog.SCRAPE_SUCCESSFUL:
                if next_end_d >= today.date():
                    logger.info("Got all document references for '%s'.",
                                pacer_court_id)
                    # Break from while loop, onwards to next court
                    break
                else:
                    # More dates to do; let it continue
                    continue

            elif status == PACERFreeDocumentLog.SCRAPE_FAILED:
                logger.error("Encountered critical error on %s "
                             "(network error?). Marking as failed and "
                             "pressing on." % pacer_court_id)
                # Break from while loop, onwards to next court
                break
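
# The while loop above walks each court forward in fixed-size windows until it
# reaches today. A minimal, dependency-free sketch of that windowing behaviour,
# assuming a seven-day span and a plain date for the last completed day (the
# helper below is purely illustrative, not the project's get_next_date_range):
from datetime import date, timedelta


def date_windows(last_complete, today, span=7):
    """Yield (start, end) query windows from the day after last_complete."""
    while last_complete < today:
        start = last_complete + timedelta(days=1)
        end = min(today, last_complete + timedelta(days=span))
        yield start, end
        last_complete = end


# Resuming on 2023-03-15 after finishing 2023-03-01 yields two windows:
# (2023-03-02, 2023-03-08) and (2023-03-09, 2023-03-15).
print(list(date_windows(date(2023, 3, 1), date(2023, 3, 15))))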