def upgrade_json(self, start, end, json_objects):
        """Upgrade the objects if possible.

        Raise an exception if the requested upgrade path isn't supported.
        """
        for obj in json_objects:
            logger.info("Reworking %s" % obj)
            if start == 1.0 and end == 1.1:
                j = json.loads(obj.json_data)
                j = self._upgrade_version_number(j, end)
                opinion_clusters = []
                for cluster in j['opinion_clusters']:
                    # Look up the ID, get the scdb_id value, and add it to the
                    # dict.
                    cluster_obj = OpinionCluster.objects.get(pk=cluster['id'])
                    cluster['scdb_id'] = getattr(cluster_obj, 'scdb_id', None)
                    opinion_clusters.append(cluster)
                j['opinion_clusters'] = opinion_clusters
                obj.json_data = json.dumps(j, indent=2)
                obj.save()

            else:
                raise NotImplementedError("Cannot upgrade from %s to %s" % (
                    start, end,
                ))
 def handle(self, *args, **options):
     super(Command, self).handle(*args, **options)
     logger.info("Using PACER username: %s" % PACER_USERNAME)
     if options['task'] == 'attachment_pages':
         get_attachment_pages(options)
     elif options['task'] == 'documents':
         get_documents(options)
def update_csv_with_idb_lookups(options):
    """Take in the CSV from the command line and update it with fields from
    our local IDB database, if we can find the value in there.
    """
    with open(options['input_file'], 'r') as f, \
            open('/tmp/final-pull-annotated.csv', 'wb') as o:
        dialect = csv.Sniffer().sniff(f.read(1024))
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect)
        out_fields = reader.fieldnames + ['fjc_id', 'docket_number',
                                          'case_name']
        writer = csv.DictWriter(o, fieldnames=out_fields)
        writer.writeheader()
        for i, row in enumerate(reader):
            if i < options['offset']:
                continue
            if i >= options['limit'] > 0:
                break
            logger.info("Doing row with contents: '%s'" % row)
            result = lookup_row(row)
            logger.info(result)
            if result is not None:
                row.update({
                    'fjc_id': result.pk,
                    'docket_number': result.docket_number,
                    'case_name': '%s v. %s' % (result.plaintiff,
                                               result.defendant)
                })
            if not options['log_only']:
                writer.writerow(row)
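# A minimal sketch of the Sniffer pattern used above (and in the other CSV
# readers below): detect the dialect from a sample of the file, rewind, then
# parse with the detected dialect. The path and sample size are illustrative.
import csv

def read_rows(path, sample_bytes=1024):
    with open(path, 'r') as f:
        dialect = csv.Sniffer().sniff(f.read(sample_bytes))
        f.seek(0)
        return list(csv.DictReader(f, dialect=dialect))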
def get_dockets(options, items, tags, sample_size=0, doc_num_end=''):
    """Download dockets from PACER.

    :param options: Options provided by argparse
    :param items: Items from our FJC IDB database
    :param tags: A list of tag names to associate with the purchased content.
    :param sample_size: The number of items to get. If 0, get them all. Else,
    get only this many and do it randomly.
    :param doc_num_end: Only get docket numbers up to this value to constrain
    costs. If set to an empty string, no constraints are applied. Note that
    applying this value means no unnumbered entries will be retrieved by PACER.
    """

    if sample_size > 0:
        items = items.order_by('?')[:sample_size]

    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(items):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break

        if i % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()

        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", i, row)

        throttle.maybe_wait()
        params = make_fjc_idb_lookup_params(row)
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=None,
                docket_number=row.docket_number,
                court_id=row.district_id,
                cookies=session.cookies,
                **params
            ).set(queue=q),
            filter_docket_by_tags.s(tags, row.district_id).set(queue=q),
            get_docket_by_pacer_case_id.s(
                court_id=row.district_id,
                cookies=session.cookies,
                tag_names=tags,
                **{
                    'show_parties_and_counsel': True,
                    'show_terminated_parties': True,
                    'show_list_of_member_cases': False,
                    'doc_num_end': doc_num_end,
                }
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
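# A minimal sketch of the chain() composition used above, with hypothetical
# stand-in tasks (the real tasks, queues, and parameters come from the
# project, and a configured Celery app/broker is assumed). Each signature's
# return value is prepended to the next task's arguments.
from celery import chain, shared_task

@shared_task
def fetch_case_id(docket_number):
    # Stand-in for get_pacer_case_id_and_title.
    return {'pacer_case_id': '12345', 'docket_number': docket_number}

@shared_task
def fetch_docket(data, court_id):
    # Stand-in for get_docket_by_pacer_case_id; receives the prior task's
    # return value as its first argument, then the args bound in .s() below.
    return 'docket %s in %s' % (data['pacer_case_id'], court_id)

def enqueue(docket_number, court_id, q='pacer'):
    chain(
        fetch_case_id.s(docket_number).set(queue=q),
        fetch_docket.s(court_id).set(queue=q),
    ).apply_async()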
def get_pacer_dockets(options, docket_pks, tags):
    """Get the pacer dockets identified by the FJC IDB rows"""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = None
    for i, docket_pk in enumerate(docket_pks):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()
        if i % 1000 == 0 or pacer_session is None:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        d = Docket.objects.get(pk=docket_pk)
        chain(
            get_docket_by_pacer_case_id.s(
                {'pacer_case_id': d.pacer_case_id,
                 'docket_pk': d.pk},
                d.court_id,
                cookies=pacer_session.cookies,
                tag_names=tags,
                **{'show_parties_and_counsel': True,
                   'show_terminated_parties': True,
                   'show_list_of_member_cases': False}
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
def upload_pdfs_to_internet_archive(options, do_non_free=False):
    """Upload items to the Internet Archive."""
    q = options['queue']
    rds = RECAPDocument.objects.filter(
        Q(ia_upload_failure_count__lt=3) | Q(ia_upload_failure_count=None),
        is_available=True,
        filepath_ia='',
    ).exclude(
        filepath_local='',
    ).values_list(
        'pk',
        flat=True,
    ).order_by()
    if do_non_free:
        rds = rds.filter(Q(is_free_on_pacer=False) | Q(is_free_on_pacer=None))
    else:
        rds = rds.filter(is_free_on_pacer=True)

    count = rds.count()
    logger.info("Sending %s items to Internet Archive.", count)
    throttle = CeleryThrottle(queue_name=q)
    for i, rd in enumerate(rds):
        throttle.maybe_wait()
        if i > 0 and i % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery so far.", i, count)
        upload_pdf_to_ia.si(rd).set(queue=q).apply_async()
 def handle(self, *args, **options):
     super(Command, self).handle(*args, **options)
     logger.info("Using PACER username: %s" % PACER_USERNAME)
     if options['task'] == 'all_dockets':
         get_dockets(options)
     elif options['task'] == 'all_petitions':
         get_petitions(options)
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)

        # Get attorneys that have roles on more than one docket
        roles = Role.objects.values('attorney_id').annotate(
            Count('id')).order_by().filter(id__count__gt=1)
        logger.info("Got %s attorneys that are on more than one docket." %
                    roles.count())

        # That returns a list of dictionaries like:
        # {'attorney_id': 1, 'id__count': 2}
        for i, role in enumerate(roles):
            if i >= options['count'] > 0:
                break
            orig_atty_id = role['attorney_id']
            atty = Attorney.objects.get(pk=orig_atty_id)
            dockets = Docket.objects.filter(
                role__attorney=atty,
            ).order_by(
                'date_created',
            ).distinct()
            logger.info("Got %s dockets for attorney %s. Cloning them all." %
                        (dockets.count(), atty))

            for docket in dockets[1:]:
                clone_attorney(orig_atty_id, atty, docket)
    def generate_data(self):
        """Make a CSV of the data extracted from the database.

        CSV will have the following format:
            Court, Name, Title, Count, 2000, 2011...

        {
            'ca2': {
                "harold baller": {
                    "Mag judge": {
                        "years": {
                            "1999': 22,
                            "2000': 14,
                        },
                        'total count': 36,
                    },
                }
            }
        }
        """
        courts = Court.objects.filter(
            jurisdiction__in=Court.FEDERAL_JURISDICTIONS,
        )
        out = {}
        for court in courts:
            out[court.pk] = {}
            dockets = (court.dockets
                       .exclude(Q(assigned_to_str='') & Q(referred_to_str=''))
                       .filter(source__in=Docket.RECAP_SOURCES)
                       .only('assigned_to_str', 'referred_to_str',
                             'date_filed'))
            logger.info("Processing %s dockets in %s" % (dockets.count(),
                                                         court.pk))
            for docket in dockets:
                for judge_type in ['assigned', 'referred']:
                    judge = getattr(docket, '%s_to_str' % judge_type)
                    if not judge:
                        continue

                    name, title = normalize_judge_string(unidecode(judge))
                    if not name:
                        continue
                    if name not in out[court.pk]:
                        # No entry for this person.
                        out[court.pk][name] = {
                            title: Counter([docket.date_filed.year]),
                        }
                    else:
                        # Person already exists.
                        if title not in out[court.pk][name]:
                            # Title not yet found.
                            out[court.pk][name][title] = Counter(
                                [docket.date_filed.year])
                        else:
                            # Title already exists.
                            out[court.pk][name][title][
                                docket.date_filed.year] += 1

        self.export_files(out)
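# export_files() is not shown here. A minimal sketch, assuming the structure
# built above ({court: {name: {title: Counter({year: count})}}}) and an
# illustrative year range, of flattening it into the documented
# "Court, Name, Title, Count, <year>..." layout:
import csv

def flatten_to_csv(out, path, years=tuple(range(2000, 2021))):
    with open(path, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['Court', 'Name', 'Title', 'Count'] + list(years))
        for court_id, names in out.items():
            for name, titles in names.items():
                for title, counter in titles.items():
                    writer.writerow(
                        [court_id, name, title, sum(counter.values())] +
                        [counter.get(year, 0) for year in years])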
def clear_old_values(do_it, debug):
    """Clear out the old values in the ftm fields. If debug or do_it is False,
    don't clear the values.
    """
    if not do_it or debug:
        return
    logger.info("Clearing out all old values in FTM fields.")
    Person.objects.all().update(ftm_eid='', ftm_total_received=None)
 def handle(self, *args, **options):
     logger.info("Using PACER username: %s"% PACER_USERNAME)
     if options['task'] == 'first_pass':
         self.do_first_pass(options)
     elif options['task'] == 'second_pass':
         self.do_second_pass(options)
     elif options['task'] == 'update_case_ids':
         self.update_any_missing_pacer_case_ids(options)
 def handle(self, *args, **options):
     super(Command, self).handle(*args, **options)
     logger.info("Using PACER username: %s" % PACER_USERNAME)
     if options['task'] in ['district', 'appellate']:
         if not options['file']:
             raise argparse.ArgumentError(
                 None, "The 'file' argument is required for that action.")
         get_dockets(options)
     elif options['task'] == 'district_attachments':
         get_district_attachment_pages(options)
 def handle(self, *args, **options):
     super(Command, self).handle(*args, **options)
     logger.info("Using PACER username: %s" % PACER_USERNAME)
     main_query = build_main_query_from_query_string(
         QUERY_STRING,
         {'rows': 10000, 'fl': ['id', 'docket_id']},
         {'group': False, 'facet': False},
     )
     docket_ids = get_docket_ids(main_query)
     get_pacer_dockets(options, docket_ids, [BAL_TAG, BAL_TAG_2019])
 def handle(self, *args, **options):
     super(Command, self).handle(*args, **options)
     self.options = options
     self.json_objects = JSONVersion.objects.all()
     self.num_objects = self.json_objects.count()
     logger.info("Acting on %s objects" % self.num_objects)
     self.upgrade_json(
         start=options['start'],
         end=options['end'],
         json_objects=self.json_objects,
     )
def download_dockets(options):
    """Download dockets listed in the spreadsheet."""
    with open(options['input_file'], 'r') as f:
        dialect = csv.Sniffer().sniff(f.read(1024))
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect)
        q = options['queue']
        task = options['task']
        throttle = CeleryThrottle(queue_name=q,
                                  min_items=options['queue_length'])
        session = PacerSession(username=PACER_USERNAME,
                               password=PACER_PASSWORD)
        session.login()
        for i, row in enumerate(reader):
            if i < options['offset']:
                continue
            if i >= options['limit'] > 0:
                break
            throttle.maybe_wait()

            logger.info("Doing row %s: %s", i, row)

            if row['idb_docket_number']:
                if task == 'download_student_dockets':
                    continue
                # Zero-pad the docket number up to seven digits because Excel
                # ate the leading zeros that these would normally have.
                docket_number = row['idb_docket_number'].rjust(7, '0')
            elif row['student_docket_number']:
                # Use the values collected by student
                # researchers, then cleaned up by mlr.
                docket_number = row['student_docket_number']
            else:
                # No docket number; move on.
                continue
            court = Court.objects.get(fjc_court_id=row['AO ID'].rjust(2, '0'),
                                      jurisdiction=Court.FEDERAL_DISTRICT)
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=docket_number,
                    court_id=court.pk,
                    cookies=session.cookies,
                    case_name=row['Case Name'],
                ).set(queue=q),
                get_docket_by_pacer_case_id.s(
                    court_id=court.pk,
                    cookies=session.cookies,
                    tag_names=[TAG_NAME],
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
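# For illustration, str.rjust pads on the left, restoring the leading zeros
# that Excel stripped from the docket numbers and AO court IDs above:
assert '12345'.rjust(7, '0') == '0012345'
assert '3'.rjust(2, '0') == '03'
assert '0012345'.rjust(7, '0') == '0012345'  # already seven digits: unchanged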
    def map_judges_to_photos(self):
        """Identify which of the judges in the DB have photos.

        We iterate over the entire collection of judges, identifying which have
        photos. We could instead iterate over the photos, but that increases
        the risk of duplicate issues.
        """
        # Create a dict of judge paths, mapping paths to empty lists.
        judge_paths = os.listdir(os.path.join(judge_root, 'orig'))
        judge_map = {}
        for path in judge_paths:
            judge_map[path] = []

        # Iterate over the people, attempting to look them up in the list
        people = Person.objects.filter(is_alias_of=None)
        for person in people:
            for name in self.make_slugs(person):
                if name in judge_map:
                    # If there's a hit, add the path to the dict of judge paths.
                    judge_map[name].append(person)
                    break

        # After iterating, set all people to not have photos.
        if not self.debug:
            people.update(has_photo=False)

        found = 0
        missed = 0
        multi = 0
        for path, people in judge_map.items():
            if len(people) == 0:
                logger.warn("Did not find a judge for %s" % path)
                missed += 1
            if len(people) == 1:
                person = people[0]
                found += 1
                if not self.debug:
                    logger.info("Updating judge %s" % person)
                    person.has_photo = True
                    person.save()
            if len(people) > 1:
                logger.warn("Found more than one match for %s:" % path)
                for person in people:
                    logger.warn("Found: %s - %s" % (person, granular_date(
                        person,
                        'date_dob',
                        iso=True,
                    )))
                multi += 1

        logger.info("\n\n%s Matches\n%s Missed\n%s Multiple results" %
                    (found, missed, multi))
def get_docket_ids(main_query):
    """Get the docket IDs for a query dict.

    :returns: a set() of docket IDs
    """
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    docket_ids = set()

    for result in results:
        docket_ids.add(result['docket_id'])

    logger.info("Got %s docket IDs back from Solr." % len(docket_ids))
    return docket_ids
 def handle(self, *args, **options):
     logger.info("Using PACER username: %s" % PACER_USERNAME)
     if options['task'] == 'dockets_nos_700_sample':
         get_nos_700_docket_sample(options)
     elif options['task'] == 'dockets_nos_700_all':
         get_nos_700_full(options)
     elif options['task'] == 'attachments_nos_700':
         get_attachment_pages(options, TAG_NOS_700)
     elif options['task'] == 'dockets_cand_sample':
         get_cand_docket_sample(options)
     elif options['task'] == 'dockets_cand_all':
         get_cand_full(options)
     elif options['task'] == 'attachments_cand':
         get_attachment_pages(options, TAG_CAND)
def get_attachment_pages(options):
    """Find docket entries that look like invoices and get their attachment
    pages.
    """
    page_size = 100
    main_query = build_main_query_from_query_string(
        Q_DOCS_ONLY,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query)

    q = options['queue']
    recap_user = User.objects.get(username='******')
    throttle = CeleryThrottle(queue_name=q,
                              min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    paginator = Paginator(results, page_size)
    i = 0
    for page_number in range(1, paginator.num_pages + 1):
        paged_results = paginator.page(page_number)
        for result in paged_results.object_list:
            if i < options['offset']:
                i += 1
                continue
            if i >= options['limit'] > 0:
                break

            logger.info("Doing row %s: rd: %s, docket: %s", i, result['id'],
                        result['docket_id'])
            throttle.maybe_wait()
            chain(
                # Query the attachment page and process it
                get_attachment_page_by_rd.s(
                    result['id'], session.cookies).set(queue=q),
                # Take that in a new task and make a PQ object
                make_attachment_pq_object.s(
                    result['id'], recap_user.pk).set(queue=q),
                # And then process that using the normal machinery.
                process_recap_attachment.s(tag_names=[TAG_PHASE_1]).set(queue=q),
            ).apply_async()
            i += 1
        else:
            # Inner loop exited normally (didn't "break")
            continue
        # Inner loop broke. Break outer loop too.
        break
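# A minimal sketch of the for/else idiom used above: the else clause runs only
# when the inner loop finishes without hitting break, which lets an inner
# break (the limit check) also stop the outer pagination loop.
def first_over(matrix, threshold):
    for row in matrix:
        for value in row:
            if value > threshold:
                break           # stop scanning this row
        else:
            continue            # row exhausted normally; try the next row
        return value            # inner break happened; stop everything
    return None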
    def set_if_falsy(obj, attribute, new_value):
        """Check if the value passed in is Falsy. If so, set it to the value of
        new_value.

        return ok: Whether the item was set successfully
        """
        current_value = getattr(obj, attribute)
        if current_value is not None and isinstance(current_value, basestring):
            current_value = current_value.strip()

        does_not_currently_have_a_value = not current_value
        current_value_not_zero = current_value != 0
        new_value_not_blank = new_value.strip() != ''
        ok = True
        if all([does_not_currently_have_a_value, current_value_not_zero,
                new_value_not_blank]):
            logger.info("Updating %s with %s." %
                        (attribute, new_value.encode('utf-8')))
            setattr(obj, attribute, new_value)
        else:
            # Report if there's a difference -- that might spell trouble.
            values_differ = False
            if (isinstance(current_value, basestring) and
                    isinstance(new_value, basestring) and
                    ''.join(current_value.split()) != ''.join(new_value.split())):
                # Handles strings and normalizes them for comparison.
                values_differ = True
            elif (isinstance(current_value, int) and
                  current_value != int(new_value)):
                # Handles ints, which need no normalization for comparison.
                values_differ = True

            if values_differ:
                logger.warn(
                    "WARNING: Didn't set '{attr}' attribute on obj {obj_id} "
                    "because it already had a value, but the new value "
                    "('{new}') differs from current value ('{current}')".format(
                        attr=attribute,
                        obj_id=obj.pk,
                        new=new_value,
                        current=force_bytes(current_value),
                    )
                )
                ok = False
            else:
                # The values were the same.
                logger.info("'%s' field unchanged -- old and new values were "
                            "the same: %s" % (attribute, new_value))
        return ok
 def do_solr(options):
     """Update Solr if requested, or report if not."""
     if options['update_solr']:
         call_command(
             'cl_update_index',
             '--type', 'search.Opinion',
             '--solr-url', settings.SOLR_OPINION_URL,
             '--noinput',
             '--update',
             '--everything',
             '--do-commit',
         )
     else:
         logger.info("\nSolr index not updated. You may want to do so "
                     "manually.\n")
 def handle(self, *args, **options):
     super(Command, self).handle(*args, **options)
     dockets = Docket.objects.filter(
         docket_entries__recap_documents__ocr_status=RECAPDocument.OCR_FAILED
     ).distinct()
     for docket in dockets:
         docket_path = docket.filepath_local.path
         try:
             logger.info("Doing docket at: %s" % docket_path)
             pacer_doc = CleanupPacerXMLParser(docket_path)
         except IOError:
             logger.info("Couldn't find docket at: %s" % docket_path)
         else:
             _ = pacer_doc.make_documents(
                 docket,
                 debug=options['debug'],
             )
 def handle(self, *args, **options):
     logger.info("Using PACER username: %s" % PACER_USERNAME)
     if options['task'] == 'everything':
         get_everything_full(options)
     elif options['task'] == 'everything_sample_50':
         get_everything_sample(options, 50)
     elif options['task'] == 'everything_sample_10000':
         # See email dated 2019-01-06
         get_everything_sample(options, 10000)
     elif options['task'] == 'price_sample_30':
         price_sample(options, '30')
     elif options['task'] == 'price_sample_40':
         price_sample(options, '40')
     elif options['task'] == 'price_sample_50':
         price_sample(options, '50')
     elif options['task'] == '2018_only':
         get_content_by_year(options, 2018)
def upload_oral_arguments_to_internet_archive(options):
    """Upload oral arguments to the Internet Archive"""
    q = options['queue']
    af_pks = Audio.objects.filter(
        Q(ia_upload_failure_count__lt=3) | Q(ia_upload_failure_count=None),
        filepath_ia='',
    ).exclude(
        local_path_mp3='',
    ).values_list(
        'pk',
        flat=True,
    ).order_by()
    count = len(af_pks)
    logger.info("Sending %s oral argument files to Internet Archive", count)
    throttle = CeleryThrottle(queue_name=q)
    for i, af_pk in enumerate(af_pks):
        throttle.maybe_wait()
        if i > 0 and i % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery so far.", i, count)
        upload_audio_to_ia.si(af_pk).set(queue=q).apply_async()
 def import_all(self):
     datadir = self.options['input_file']
     logger.info('importing presidents...')
     self.import_presidents(infile=datadir+'/presidents.xlsx')
     logger.info('importing FJC judges...')
     self.import_fjc_judges(infile=datadir+'/fjc-data.xlsx')
     logger.info('importing state supreme court judges...')
     self.import_state_judges(infile=datadir+'/state-supreme-court-bios-2016-04-06.xlsx')
     logger.info('importing state IAC judges...')
     self.import_state_judges(infile=datadir+'/state-iac-bios-2016-04-06.xlsx')
def get_dockets(options):
    """Download the dockets described in the CSV
    """
    f = options['file']
    reader = csv.DictReader(f)
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    pacer_session.login()
    for i, row in enumerate(reader):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break

        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        logger.info("Doing row %s", i)
        throttle.maybe_wait()
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=None,
                docket_number=make_docket_number(row['filecy'], row['docket']),
                court_id='ilnb',
                cookies=pacer_session.cookies,
                office_number=row['office'],
                docket_number_letters='bk',
            ).set(queue=q),
            get_docket_by_pacer_case_id.s(
                court_id='ilnb',
                cookies=pacer_session.cookies,
                tag_names=[TAG],
                **{
                    'show_parties_and_counsel': True,
                    'show_terminated_parties': True,
                    'show_list_of_member_cases': True
                }
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
def do_ocr(options):
    """Do the OCR for any items that need it, then save to the solr index."""
    q = options['queue']
    rds = RECAPDocument.objects.filter(
        ocr_status=RECAPDocument.OCR_NEEDED,
    ).values_list('pk', flat=True).order_by()
    count = rds.count()
    throttle = CeleryThrottle(queue_name=q)
    for i, pk in enumerate(rds):
        throttle.maybe_wait()
        if options['index']:
            extract_recap_pdf.si(pk, skip_ocr=False).set(queue=q).apply_async()
        else:
            chain(
                extract_recap_pdf.si(pk, skip_ocr=False).set(queue=q),
                add_docket_to_solr_by_rds.s().set(queue=q),
            ).apply_async()
        if i % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery so far." % (i + 1, count))
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)
        self.debug = options['debug']
        self.file = options['file']
        self.skip_human_review = options['skip_human_review']
        if self.skip_human_review and not self.debug:
            raise CommandError('Cannot skip review without --debug flag.')
        if self.skip_human_review:
            self.skipped_count = 0

        self.iterate_scdb_and_take_actions(
            action_zero=lambda *args, **kwargs: None,
            action_one=self.enhance_item_with_scdb,
            action_many=self.get_human_review,
            start_row=options['start_at'],
        )

        if self.skip_human_review:
            logger.info("\nSkipped %s items in SCDB which came up for human "
                        "review." % self.skipped_count)
def get_pdfs(options):
    """Get PDFs for the results of the Free Document Report queries.

    At this stage, we have rows in the PACERFreeDocumentRow table, each of
    which represents a PDF we need to download and merge into our normal
    tables: Docket, DocketEntry, and RECAPDocument.

    In this function, we iterate over the entire table of results, merge it
    into our normal tables, and then download and extract the PDF.

    :return: None
    """
    q = options['queue']
    index = options['index']
    cnt = CaseNameTweaker()
    rows = PACERFreeDocumentRow.objects.filter(error_msg="").only('pk')
    count = rows.count()
    task_name = "downloading"
    if index:
        task_name += " and indexing"
    logger.info("%s %s items from PACER." % (task_name, count))
    throttle = CeleryThrottle(queue_name=q)
    completed = 0
    for row in queryset_generator(rows):
        throttle.maybe_wait()
        if completed % 30000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
        c = chain(
            process_free_opinion_result.si(row.pk, cnt).set(queue=q),
            get_and_process_pdf.s(pacer_session.cookies, row.pk).set(queue=q),
            delete_pacer_row.s(row.pk).set(queue=q),
        )
        if index:
            c |= add_items_to_solr.s('search.RECAPDocument').set(queue=q)
        c.apply_async()
        completed += 1
        if completed % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery for %s so "
                        "far." % (completed, count, task_name))
    def send_emails(self, recipients):
        """Send the emails using the templates and contexts requested."""
        txt_template = loader.get_template('emails/welcome_email.txt')
        messages = []
        for recipient in recipients:
            email_txt = txt_template.render({'name': recipient.first_name})
            messages.append(EmailMultiAlternatives(
                subject='Welcome to CourtListener and Free Law Project',
                body=email_txt,
                from_email='Mike Lissner <*****@*****.**>',
                to=[recipient.email],
                headers={'X-Entity-Ref-ID': 'welcome.email:%s' % recipient.pk}
            ))

        if not self.options['simulate']:
            connection = get_connection()
            connection.send_messages(messages)
            logger.info("Sent %s daily welcome emails." % len(messages))
        else:
            sys.stdout.write('Simulation mode. Imagine that we just sent %s '
                             'welcome emails!\n' % len(messages))
    def iterate_scdb_and_take_actions(self,
                                      action_zero,
                                      action_one,
                                      action_many,
                                      start_row=0):
        """Iterates over the SCDB, looking for a single match for every item. If
        a single match is identified it takes the action in the action_one
        function using the Cluster identified and the dict of the SCDB
        information.

        If zero or many results are found it runs the action_zero or action_many
        functions. The action_many function takes the QuerySet of Clusters and
        the dict of SCDB info as parameters and returns the single item in
        the QuerySet that should have action_one performed on it.

        The action_zero function takes only the dict of SCDB information, and
        uses that to construct or identify a Cluster object that should have
        action_one performed on it.

        If action_zero or action_many return None, no action is taken.
        """
        f = open(self.file)
        dialect = csv.Sniffer().sniff(f.read(1024))
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect)
        for i, d in enumerate(reader):
            # Iterate over every item, looking for matches in various ways.
            if i < start_row:
                continue
            logger.info("\nRow is: %s. ID is: %s (%s)", i, d["caseId"],
                        d["caseName"])

            clusters = OpinionCluster.objects.none()
            cluster_count = clusters.count()
            if cluster_count == 0:
                logger.info("Checking scdb_id for SCDB field 'caseID'...")
                clusters = OpinionCluster.objects.filter(scdb_id=d["caseId"])
                cluster_count = clusters.count()
                logger.info("%s matches found.", cluster_count)
            if d["usCite"].strip() and cluster_count == 0:
                # None found by scdb_id. Try by citation number. Only do these
                # lookups if there is a usCite value, as newer cases don't yet
                # have citations.
                logger.info("Checking by federal citation")
                clusters = OpinionCluster.objects.filter(citation=d["usCite"],
                                                         scdb_id="")
                cluster_count = clusters.count()
                logger.info("%s matches found.", cluster_count)

            # At this point, we need to start getting more experimental b/c
            # the easy ways to find items did not work. Items matched here
            # are ones that lack citations.
            if cluster_count == 0:
                # try by date and then winnow by docket number
                logger.info("Checking by date...")
                clusters = OpinionCluster.objects.filter(
                    date_filed=datetime.strptime(d["dateDecision"],
                                                 "%m/%d/%Y"),
                    docket__court_id="scotus",
                    scdb_id="",
                )
                cluster_count = clusters.count()
                if cluster_count == 1:
                    # Winnow these by name too. Date isn't enough.
                    clusters = self.winnow_by_case_name(clusters, d)
                    cluster_count = clusters.count()
                logger.info("%s matches found.", cluster_count)

            if cluster_count > 1:
                if d["docket"]:
                    logger.info("Winnowing by docket number...")
                    clusters = self.winnow_by_docket_number(clusters, d)
                    cluster_count = clusters.count()
                    logger.info("%s matches found.", cluster_count)
                else:
                    logger.info(
                        "Cannot winnow by docket number -- there isn't one.")

            if cluster_count > 1:
                logger.info("Winnowing by case name...")
                clusters = self.winnow_by_case_name(clusters, d)
                cluster_count = clusters.count()
                logger.info("%s matches found.", cluster_count)

            # Searching complete, run actions.
            if cluster_count == 0:
                logger.info("No items found.")
                cluster = action_zero(d)
            elif cluster_count == 1:
                logger.info("Exactly one match found.")
                cluster = clusters[0]
            else:
                logger.info("%s items found:", cluster_count)
                cluster = action_many(clusters, d)

            if cluster is not None:
                action_one(cluster, d)
            else:
                logger.info("OK. No changes will be made.")

        f.close()
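# A minimal sketch of the callback protocol described in the docstring above,
# with hypothetical no-op actions (compare the handle() snippet above, which
# wires the real ones up on the Command class):
def action_zero(scdb_row):
    # Zero matches: return a cluster to act on, or None to skip the row.
    return None

def action_many(clusters, scdb_row):
    # Many matches: pick the single cluster to act on, or None to skip.
    return clusters[0] if clusters else None

def action_one(cluster, scdb_row):
    # Exactly one match (or the one chosen above): apply the SCDB data to it.
    print("Would enhance %s with SCDB row %s" % (cluster, scdb_row["caseId"]))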
    def run_query(self, alert, rate):
        results = []
        cd = {}
        logger.info("Now running the query: %s\n" % alert.query)

        # Make a dict from the query string.
        qd = QueryDict(alert.query.encode(), mutable=True)
        try:
            del qd["filed_before"]
        except KeyError:
            pass
        qd["order_by"] = "score desc"
        cut_off_date = get_cut_off_date(rate)
        # Default to 'o' (opinions) if no type is given, matching the front end.
        query_type = qd.get("type", SEARCH_TYPES.OPINION)
        if query_type in [SEARCH_TYPES.OPINION, SEARCH_TYPES.RECAP]:
            qd["filed_after"] = cut_off_date
        elif query_type == SEARCH_TYPES.ORAL_ARGUMENT:
            qd["argued_after"] = cut_off_date
        logger.info("Data sent to SearchForm is: %s\n" % qd)
        search_form = SearchForm(qd)
        if search_form.is_valid():
            cd = search_form.cleaned_data

            if (
                rate == Alert.REAL_TIME
                and len(self.valid_ids[query_type]) == 0
            ):
                # Bail out. No results will be found if no valid_ids.
                return query_type, results

            main_params = search_utils.build_main_query(cd, facet=False)
            main_params.update(
                {
                    "rows": "20",
                    "start": "0",
                    "hl.tag.pre": "<em><strong>",
                    "hl.tag.post": "</strong></em>",
                    "caller": "cl_send_alerts:%s" % query_type,
                }
            )

            if rate == Alert.REAL_TIME:
                main_params["fq"].append(
                    "id:(%s)"
                    % " OR ".join([str(i) for i in self.valid_ids[query_type]])
                )

            # Ignore warnings from this bit of code. Otherwise, it complains
            # about the query URL being too long and having to POST it instead
            # of being able to GET it.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                results = (
                    self.sis[query_type]
                    .query()
                    .add_extra(**main_params)
                    .execute()
                )
            regroup_snippets(results)

        logger.info("There were %s results." % len(results))
        return qd, results
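# A minimal sketch of the QueryDict handling above, assuming a configured
# Django environment: parse the saved query string as mutable, drop the stale
# date filter, and override ordering before handing it to the search form.
from django.http import QueryDict

def prepare_alert_query(query_string, cut_off_date):
    qd = QueryDict(query_string, mutable=True)
    qd.pop("filed_before", None)  # same effect as the try/del above
    qd["order_by"] = "score desc"
    qd["filed_after"] = cut_off_date
    return qd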
    def do_second_pass(options):
        """In the first pass, we ignored the duplicates that we got, preferring
        to let them stack up for later analysis. In this pass, we attempt to
        merge those failed items into the DB by more aggressive filtering and
        algorithmic selection.
        """
        idb_rows = FjcIntegratedDatabase.objects.filter(
            dataset_source=CV_2017,
            docket__isnull=True,
        ).order_by("pk")
        for i, idb_row in enumerate(queryset_generator(idb_rows)):
            # Iterate over all items in the IDB and find them in the Docket
            # table. If they're not there, create a new item.
            if i < options["offset"]:
                continue
            if i >= options["limit"] > 0:
                break

            ds = (Docket.objects.filter(
                docket_number_core=idb_row.docket_number,
                court=idb_row.district,
                docket_number__startswith="%s:" % idb_row.office,
            ).exclude(docket_number__icontains="cr").exclude(
                case_name__icontains="sealed").exclude(
                    case_name__icontains="suppressed").exclude(
                        case_name__icontains="search warrant"))
            count = ds.count()

            if count == 0:
                logger.info("%s: Creating new docket for IDB row: %s", i,
                            idb_row)
                create_new_docket_from_idb(idb_row.pk)
                continue
            elif count == 1:
                d = ds[0]
                logger.info("%s: Merging Docket %s with IDB row: %s", i, d,
                            idb_row)
                merge_docket_with_idb(d.pk, idb_row.pk)
                continue

            logger.info(
                "%s: Still have %s results after office and civil "
                "docket number filtering. Filtering further.",
                i,
                count,
            )

            case_names = []
            for d in ds:
                case_name = harmonize(d.case_name)
                parts = case_name.lower().split(" v. ")
                if len(parts) == 1:
                    case_names.append(case_name)
                elif len(parts) == 2:
                    plaintiff, defendant = parts[0], parts[1]
                    case_names.append("%s v. %s" %
                                      (plaintiff[0:30], defendant[0:30]))
                elif len(parts) > 2:
                    case_names.append(case_name)
            idb_case_name = harmonize("%s v. %s" %
                                      (idb_row.plaintiff, idb_row.defendant))
            results = find_best_match(case_names,
                                      idb_case_name,
                                      case_sensitive=False)

            if results["ratio"] > 0.65:
                logger.info(
                    "%s Found good match by case name for %s: %s",
                    i,
                    idb_case_name,
                    results["match_str"],
                )
                d = ds[results["match_index"]]
                merge_docket_with_idb(d.pk, idb_row.pk)
            else:
                logger.info(
                    "%s No good match after office and case name "
                    "filtering. Creating new item: %s",
                    i,
                    idb_row,
                )
                create_new_docket_from_idb(idb_row.pk)
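# find_best_match() is project code; a sketch of the same idea using difflib,
# returning the closest candidate with a 0-1 similarity ratio and the same
# "ratio"/"match_str"/"match_index" keys used above:
from difflib import SequenceMatcher

def best_match(candidates, target):
    scored = [
        (SequenceMatcher(None, c.lower(), target.lower()).ratio(), i, c)
        for i, c in enumerate(candidates)
    ]
    ratio, index, text = max(scored)
    return {"ratio": ratio, "match_index": index, "match_str": text}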
def get_from_ia(reporter, volume):
    """
    Download cases from the Internet Archive's Caselaw Access Project
    collections and write them to disk.

    :param reporter: (str) A reporter abbreviation used to identify the cases
    to download, as used by IA. (Ex. T.C. => tc)
    :param volume: Specific volume number of the reporter. If None, the
    function will cycle through all volumes of the reporter on IA.
    :return: None
    """

    logger.info("Creating IA session...")
    access_key = settings.IA_ACCESS_KEY
    secret_key = settings.IA_SECRET_KEY
    ia_session = ia.get_session(
        {"s3": {
            "access": access_key,
            "secret": secret_key
        }})

    reporter_key = ".".join(["law.free.cap", reporter])

    # Checks that the returned reporter is the requested one.
    # Ex. searching for Mich will return both Mich-app. and Mich.
    for ia_identifier in ia_session.search_items(reporter_key):
        logger.info("Got ia identifier: %s" % ia_identifier)
        ia_key = ia_identifier["identifier"]
        if ia_key.split(".")[3] != reporter:
            continue

        # Checks if we requested a specific volume of the
        # reporter and if so skips all other volumes of that reporter
        ia_volume = ia_key.split(".")[-1]
        if volume is not None:
            if volume != ia_volume:
                continue

        ia_item = ia_session.get_item(ia_key)
        for item in ia_item.get_files():
            logger.info("Got item with name: %s" % item.name)
            if "json.json" in item.name:
                continue

            if "json" not in item.name:
                continue

            url = "https://archive.org/download/%s/%s" % (ia_key, item.name)
            file_path = os.path.join(
                settings.MEDIA_ROOT,
                "harvard_corpus",
                "%s" % ia_key,
                "%s" % item.name,
            )
            directory = file_path.rsplit("/", 1)[0]
            if os.path.exists(file_path):
                logger.info("Already captured: %s", url)
                continue

            logger.info("Capturing: %s", url)
            mkdir_p(directory)
            data = requests.get(url, timeout=10).json()
            with open(file_path, "w") as outfile:
                json.dump(data, outfile, indent=2)
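# For illustration, the identifier layout assumed above appears to be
# "law.free.cap.<reporter>.<volume>", hence the index-3 reporter check and the
# string comparison on the last component:
ia_key = "law.free.cap.tc.27"
assert ia_key.split(".")[3] == "tc"   # reporter abbreviation
assert ia_key.split(".")[-1] == "27"  # volume, as a string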
def do_many(dir_path, limit, random_order, status_interval, newcases,
            skipdupes, skip_newcases, avoid_nocites, courtdates, startfolder,
            startfile, debug):
    """Runs through a directory of the form /data/[state]/[sub]/.../[folders]/[.xml documents].
    Parses each .xml document, instantiates the associated model object, and
    saves the object. Prints/logs status updates and tracebacks instead of
    raising exceptions.

    :param dir_path: The directory.
    :param limit: A limit on how many files to run through. If None, will run
    through all (or if random order, forever).
    :param random_order: If true, will run through the directories and files in
    random order.
    :param status_interval: How often a status update will be given.
    :param newcases: If true, skip court-years that already have data.
    :param skipdupes: If true, skip duplicates.
    :param skip_newcases: If true, skip cases imported under newcases.
    :param avoid_nocites: If true, skip cases from dates after any case with
    no cite.
    :param courtdates: If true, skip cases with dates before the court was
    established.
    :param startfolder: If not None, start on this folder (for resuming).
    :param startfile: If not None, start on this file (for resuming).
    :param debug: Passed through to make_and_save.
    """
    if limit:
        total = limit
    elif not random_order:
        logger.info("Getting an initial file count...")
        total = 0
        for _, _, file_names in os.walk(dir_path):
            total += len(fnmatch.filter(file_names, '*.xml'))
    else:
        total = None
    # go through the files, yielding parsed files and printing status updates as
    # we go
    folders = glob(dir_path + '/*')
    folders.sort()
    count = 0

    # get earliest dates for each court
    if newcases:
        logger.info('Only new cases: getting earliest dates by court.')
        min_dates = get_min_dates()
    else:
        min_dates = None

    if avoid_nocites:
        if newcases:
            raise Exception(
                "Cannot use both avoid_nocites and newcases options.")
        logger.info('Avoiding no cites: getting earliest dates by court with '
                    'no citation.')
        min_dates = get_min_nocite()

    if courtdates:
        start_dates = get_courtdates()
    else:
        start_dates = None

    # Check whether we're skipping the Columbia cases that were already imported.

    if skip_newcases:
        skiplist = get_path_list()
    else:
        skiplist = set()

    # start/resume functionality
    if startfolder is not None:
        skipfolder = True
    else:
        skipfolder = False
    if startfile is not None:
        skipfile = True
    else:
        skipfile = False

    for folder in folders:
        if skipfolder:
            if startfolder is not None:
                checkfolder = folder.split('/')[-1]
                if checkfolder == startfolder:
                    skipfolder = False
                else:
                    continue
        logger.debug(folder)

        for path in file_generator(folder, random_order, limit):

            if skipfile:
                if startfile is not None:
                    checkfile = path.split('/')[-1]
                    if checkfile == startfile:
                        skipfile = False
                    else:
                        continue

            if path in skiplist:
                continue

            # Skip cases in 'misc*' folders -- they are relatively different
            # from the other cases, so we'll deal with them later.
            if 'miscellaneous_court_opinions' in path:
                continue

            logger.debug(path)

            # try to parse/save the case and show any exceptions with full
            # tracebacks
            try:
                parsed = parse_file(path)
                make_and_save(parsed, skipdupes, min_dates, start_dates, debug)
            except Exception as e:
                logger.info(path)
                # show simple exception summaries for known problems
                known = [
                    'mismatched tag', 'Failed to get a citation',
                    'Failed to find a court ID',
                    'null value in column "date_filed"', 'duplicate(s)'
                ]
                if any(k in str(e) for k in known):
                    logger.info("Known exception in file '%s':" % path)
                    logger.info(str(e))
                else:
                    logger.info("Unknown exception in file '%s':" % path)
                    logger.info(traceback.format_exc())
        # status update
        count += 1
        if count % status_interval == 0:
            if total:
                logger.info("Finished %s out of %s files." % (count, total))
            else:
                logger.info("Finished %s files." % count)
def download_dockets(options):
    """Download dockets listed in the spreadsheet."""
    f = open(options["input_file"], "r")
    dialect = csv.Sniffer().sniff(f.read(2048))
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect)
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break

        throttle.maybe_wait()
        logger.info("Doing row %s: %s", i, row)

        row_tag = "%s-%s" % (PROJECT_TAG_NAME, row["id"])
        if not row["district_ct"]:
            chain(
                get_appellate_docket_by_docket_number.s(
                    docket_number=row["docket_no1"],
                    court_id=row["cl_court"],
                    cookies=session.cookies,
                    tag_names=[PROJECT_TAG_NAME, row_tag],
                    # Do not get the docket entries for now. We're only
                    # interested in the date terminated. If it's an open case,
                    # we'll handle that later.
                    **{
                        "show_docket_entries": False,
                        "show_orig_docket": False,
                        "show_prior_cases": False,
                        "show_associated_cases": False,
                        "show_panel_info": True,
                        "show_party_atty_info": True,
                        "show_caption": True,
                    }).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
        else:
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=row["docket_no1"],
                    court_id=row["cl_court"],
                    cookies=session.cookies,
                    case_name=row["name"],
                ).set(queue=q),
                do_case_query_by_pacer_case_id.s(
                    court_id=row["cl_court"],
                    cookies=session.cookies,
                    tag_names=[PROJECT_TAG_NAME, row_tag],
                ).set(queue=q),
                get_docket_by_pacer_case_id.s(
                    court_id=row["cl_court"],
                    cookies=session.cookies,
                    tag_names=[PROJECT_TAG_NAME, row_tag],
                    **{
                        # No docket entries
                        "doc_num_start": 10000,
                        "doc_num_end": 10000,
                        "show_parties_and_counsel": True,
                        "show_terminated_parties": True,
                        "show_list_of_member_cases": True,
                    }).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()

    f.close()
def get_attachment_pages(options):
    """Find docket entries that look like invoices and get their attachment
    pages.
    """
    page_size = 100
    main_query = build_main_query_from_query_string(
        Q_DOCS_ONLY,
        {
            "rows": page_size,
            "fl": ["id", "docket_id"]
        },
        {
            "group": False,
            "facet": False,
            "highlight": False
        },
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    results = si.query().add_extra(**main_query)
    si.conn.http_connection.close()

    q = options["queue"]
    recap_user = User.objects.get(username="******")
    throttle = CeleryThrottle(queue_name=q, min_items=options["queue_length"])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    paginator = Paginator(results, page_size)
    i = 0
    for page_number in range(1, paginator.num_pages + 1):
        paged_results = paginator.page(page_number)
        for result in paged_results.object_list:
            if i < options["offset"]:
                i += 1
                continue
            if i >= options["limit"] > 0:
                break

            logger.info(
                "Doing row %s: rd: %s, docket: %s",
                i,
                result["id"],
                result["docket_id"],
            )
            throttle.maybe_wait()
            chain(
                # Query the attachment page and process it
                get_attachment_page_by_rd.s(result["id"],
                                            session.cookies).set(queue=q),
                # Take that in a new task and make a PQ object
                make_attachment_pq_object.s(result["id"],
                                            recap_user.pk).set(queue=q),
                # And then process that using the normal machinery.
                process_recap_attachment.s(
                    tag_names=[TAG_PHASE_1]).set(queue=q),
            ).apply_async()
            i += 1
        else:
            # Inner loop exited normally (didn't "break")
            continue
        # Inner loop broke. Break outer loop too.
        break
def get_and_save_free_document_reports(options: OptionsType) -> None:
    """Query the Free Doc Reports on PACER and get a list of all the free
    documents. Do not download those items, as that step is done later. For now
    just get the list.

    Note that this uses synchronous celery chains. A previous version was more
    complex and did not use synchronous chains. Unfortunately in Celery 4.2.0,
    or more accurately in redis-py 3.x.x, doing it that way failed nearly every
    time.

    This is a simpler version, though a slower one, but it should get the job
    done.
    """
    # Kill any *old* logs that report they're in progress. (They've failed.)
    three_hrs_ago = now() - timedelta(hours=3)
    PACERFreeDocumentLog.objects.filter(
        date_started__lt=three_hrs_ago,
        status=PACERFreeDocumentLog.SCRAPE_IN_PROGRESS,
    ).update(status=PACERFreeDocumentLog.SCRAPE_FAILED)

    cl_court_ids = (
        Court.federal_courts.district_pacer_courts()
        .filter(
            in_use=True,
            end_date=None,
        )
        .exclude(pk__in=["casb", "gub", "ilnb", "innb", "miwb", "ohsb", "prb"])
        .values_list("pk", flat=True)
    )
    pacer_court_ids = [map_cl_to_pacer_id(v) for v in cl_court_ids]
    today = now()
    for pacer_court_id in pacer_court_ids:
        while True:
            next_start_d, next_end_d = get_next_date_range(pacer_court_id)
            if next_end_d is None:
                logger.warn(
                    f"Free opinion scraper for {pacer_court_id} still "
                    f"in progress."
                )
                break

            logger.info(
                "Attempting to get latest document references for "
                "%s between %s and %s",
                pacer_court_id,
                next_start_d,
                next_end_d,
            )
            mark_court_in_progress(pacer_court_id, next_end_d)
            try:
                status = get_and_save_free_document_report(
                    pacer_court_id, next_start_d, next_end_d
                )
            except (
                RequestException,
                ReadTimeoutError,
                IndexError,
                TypeError,
                PacerLoginException,
            ) as exc:
                if isinstance(exc, (RequestException, ReadTimeoutError)):
                    reason = "network error."
                elif isinstance(exc, IndexError):
                    reason = "PACER 6.3 bug."
                elif isinstance(exc, TypeError):
                    reason = "failing PACER website."
                elif isinstance(exc, PacerLoginException):
                    reason = "PACER login issue."
                else:
                    reason = "unknown reason."
                logger.error(
                    f"Failed to get free document references for "
                    f"{pacer_court_id} between {next_start_d} and "
                    f"{next_end_d} due to {reason}."
                )
                mark_court_done_on_date(
                    PACERFreeDocumentLog.SCRAPE_FAILED,
                    pacer_court_id,
                    next_end_d,
                )
                break

            mark_court_done_on_date(status, pacer_court_id, next_end_d)

            if status == PACERFreeDocumentLog.SCRAPE_SUCCESSFUL:
                if next_end_d >= today.date():
                    logger.info(
                        "Got all document references for '%s'.", pacer_court_id
                    )
                    # Break from while loop, onwards to next court
                    break
                else:
                    # More dates to do; let it continue
                    continue

            elif status == PACERFreeDocumentLog.SCRAPE_FAILED:
                logger.error(
                    "Encountered critical error on %s "
                    "(network error?). Marking as failed and "
                    "pressing on." % pacer_court_id
                )
                # Break from while loop, onwards to next court
                break
    def scrape_court(self, site, full_crawl=False, ocr_available=True):
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split(".")[-1].split("_")[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        if dup_checker.abort_by_url_hash(site.url, site.hash):
            return

        if site.cookies:
            logger.info(f"Using cookies: {site.cookies}")
        for i, item in enumerate(site):
            msg, r = get_binary_content(
                item["download_urls"],
                site.cookies,
                method=site.method,
            )
            if msg:
                logger.warning(msg)
                ErrorLog(log_level="WARNING", court=court, message=msg).save()
                continue

            content = site.cleanup_content(r.content)

            current_date = item["case_dates"]
            try:
                next_date = site[i + 1]["case_dates"]
            except IndexError:
                next_date = None

            # request.content is sometimes a str, sometimes unicode, so
            # force it all to be bytes, pleasing hashlib.
            sha1_hash = sha1(force_bytes(content))
            if (court_str == "nev"
                    and item["precedential_statuses"] == "Unpublished"):
                # Nevada's non-precedential cases have different SHA1 sums
                # every time.
                lookup_params = {
                    "lookup_value": item["download_urls"],
                    "lookup_by": "download_url",
                }
            else:
                lookup_params = {
                    "lookup_value": sha1_hash,
                    "lookup_by": "sha1",
                }

            proceed = dup_checker.press_on(Opinion, current_date, next_date,
                                           **lookup_params)
            if dup_checker.emulate_break:
                break
            if not proceed:
                continue

            # Not a duplicate, carry on
            logger.info(
                f"Adding new document found at: {item['download_urls']}"
            )
            dup_checker.reset()

            docket, opinion, cluster, citations = make_objects(
                item, court, sha1_hash, content)

            save_everything(
                items={
                    "docket": docket,
                    "opinion": opinion,
                    "cluster": cluster,
                    "citations": citations,
                },
                index=False,
            )
            extract_doc_content.delay(opinion.pk,
                                      ocr_available=ocr_available,
                                      citation_jitter=True)

            logger.info(f"Successfully added opinion {opinion.pk}: "
                        f"{item['case_names'].encode()}")

        # Update the hash if everything finishes properly.
        logger.info(f"{site.court_id}: Successfully crawled opinions.")
        if not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
    def handle(self, *args, **options):
        """Identify parallel citations and save them as requested.

        This process proceeds in two phases. The first phase is to work through
        the entire corpus, identifying citations that occur very near to each
        other. These are considered parallel citations, and they are built into
        a graph data structure where citations are nodes and each parallel
        citation is an edge. The weight of each edge is determined by the
        number of times a parallel citation has been identified between two
        citations. This should solve problems like typos or other issues with
        our heuristic approach.

        The second phase of this process is to update the database with the
        high quality citations. This can only be done by matching the citations
        with actual items in the database and then updating them with parallel
        citations that are sufficiently likely to be good.
        """
        super(Command, self).handle(*args, **options)
        no_option = (not any([options.get('doc_id'), options.get('all')]))
        if no_option:
            raise CommandError("Please specify if you want all items or a "
                               "specific item.")
        if not options['update_database']:
            logger.info(
                "--update_database is not set. No changes will be made to the "
                "database.")

        # Update Citation object to consider similar objects equal.
        self.monkey_patch_citation()

        logger.info("## Entering phase one: Building a network object of "
                    "all citations.\n")
        q = Opinion.objects.all()
        if options.get('doc_id'):
            q = q.filter(pk__in=options['doc_id'])
        count = q.count()
        opinions = queryset_generator(q, chunksize=10000)

        node_count = edge_count = completed = 0
        subtasks = []
        for o in opinions:
            subtasks.append(
                # This will call the second function with the results from the
                # first.
                get_document_citations.s(o) | identify_parallel_citations.s())
            last_item = (count == completed + 1)
            if (completed % 50 == 0) or last_item:
                job = group(subtasks)
                result = job.apply_async().join()
                for citation_groups in result:
                    self.add_groups_to_network(citation_groups)
                subtasks = []

            completed += 1
            if completed % 250 == 0 or last_item:
                # Only do this once in a while.
                node_count = len(self.g.nodes())
                edge_count = len(self.g.edges())
            sys.stdout.write("\r  Completed %s of %s. (%s nodes, %s edges)" % (
                completed,
                count,
                node_count,
                edge_count,
            ))
            sys.stdout.flush()

        logger.info("\n\n## Entering phase two: Saving the best edges to "
                    "the database.\n\n")
        for sub_graph in nx.connected_component_subgraphs(self.g):
            self.handle_subgraph(sub_graph, options)

        logger.info("\n\n## Done. Added %s new citations." % self.update_count)

        self.do_solr(options)
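# Illustrative sketch (not from the source): the docstring above treats each
# citation as a graph node and each observed parallel-citation pair as a
# weighted edge, with the weight counting how often the pair was seen. A
# minimal version of that bookkeeping with networkx, using plain strings as
# stand-ins for the real Citation objects, might look like this:
import itertools

import networkx as nx


def add_citation_group_to_graph(g, citation_group):
    """Bump the edge weight for every pair of citations seen together."""
    for a, b in itertools.combinations(citation_group, 2):
        if g.has_edge(a, b):
            g[a][b]["weight"] += 1
        else:
            g.add_edge(a, b, weight=1)


# Repeated sightings strengthen the edge between two citations.
demo_graph = nx.Graph()
add_citation_group_to_graph(demo_graph, ["22 U.S. 33", "2013 WL 3808347"])
add_citation_group_to_graph(demo_graph, ["22 U.S. 33", "2013 WL 3808347"])
assert demo_graph["22 U.S. 33"]["2013 WL 3808347"]["weight"] == 2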
    def handle_subgraph(self, sub_graph, options):
        """Add edges to the database if significant.

        An earlier version of the code simply looked at each edge, but this
        looks at sub_graphs within the main graph. This is different (and
        better) because the main graph might have multiple nodes like so:

            A <-- (22 US 33): This node is in the DB already
            |
            B <-- (2013 LEXIS 223948): This node is not yet in the DB
            |
            C <-- (2013 WL 3808347): This node is not yet in the DB
            |
            D <-- This node can be disregarded because it has low edge weight.

        If we handled this edge by edge, we might process B --> C before doing
        A --> B. If we did that, we'd get zero results for B and C, and we'd
        add nothing. That'd be bad, since there's a strong edge between A, B,
        and C.

        Instead, we process this as a graph, looking at all the nodes at once.
        """
        # Remove nodes that are only connected weakly.
        # Copy the node list so removal during iteration is safe.
        for node in list(sub_graph.nodes()):
            has_good_edge = False
            for a, b, data in sub_graph.edges([node], data=True):
                if data['weight'] > EDGE_RELEVANCE_THRESHOLD:
                    has_good_edge = True
                    break
            if not has_good_edge:
                sub_graph.remove_node(node)

        if len(sub_graph.nodes()) == 0:
            logger.info("  No strong edges found. Pass.\n")
            return

        # Look up all remaining nodes in Solr, and make a (node, results) pair.
        result_sets = []
        for node in sub_graph.nodes():
            result_sets.append((node, self.match_on_citation(node)))

        if sum(len(results) for node, results in result_sets) == 0:
            logger.info("  Got no results for any citation. Pass.\n")
            return

        if all(len(results) > 0 for node, results in result_sets):
            logger.info("  Got results for all citations. Pass.\n")
            return

        # Remove any node-results pairs with more than one result, keeping a
        # list because result_sets is iterated several times below.
        result_sets = [
            (node, results)
            for node, results in result_sets
            if len(results) <= 1
        ]

        # For result_sets with more than 0 results, do all the citations have
        # the same ID?
        if len(
                set([
                    results[0]['cluster_id']
                    for node, results in result_sets if len(results) > 0
                ])) > 1:
            logger.info("  Got multiple IDs for the citations. Pass.\n")
            return

        # Are the number of unique reporters equal to the number of results?
        if len(set([node.reporter
                    for node, results in result_sets])) != len(result_sets):
            logger.info("  Got duplicated reporter in citations. Pass.\n")
            return

        # Get the cluster. By now we know all results have either 0 or 1 item.
        oc = None
        for node, results in result_sets:
            if len(results) > 0:
                oc = OpinionCluster.objects.get(pk=results[0]['cluster_id'])
                break

        if oc is not None:
            # Update the cluster with all the nodes that had no results.
            for node, results in result_sets:
                if len(results) == 0:
                    # add the citation to the cluster.
                    self._update_cluster_with_citation(oc, node)
            if options['update_database']:
                oc.save()
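# Illustrative sketch (not from the source): rebuild the A-B-C-D example from
# the docstring above and prune weakly connected nodes the same way
# handle_subgraph does. The threshold value of 5 is made up for the demo; the
# real module's EDGE_RELEVANCE_THRESHOLD may differ.
import networkx as nx

demo_threshold = 5
demo_graph = nx.Graph()
demo_graph.add_edge("A", "B", weight=22)
demo_graph.add_edge("B", "C", weight=17)
demo_graph.add_edge("C", "D", weight=1)  # weak edge; D gets discarded

for node in list(demo_graph.nodes()):
    has_good_edge = any(
        data["weight"] > demo_threshold
        for _, _, data in demo_graph.edges([node], data=True)
    )
    if not has_good_edge:
        demo_graph.remove_node(node)

assert sorted(demo_graph.nodes()) == ["A", "B", "C"]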
def print_stats(match_stats, candidate_eid_lists):
    """Print the stats."""
    logger.info("#########")
    logger.info("# Stats #")
    logger.info("#########")
    logger.info("Finished matching judges:")
    for k, v in match_stats.items():
        logger.info(" - %s had %s matches" % (v, k))
    ftm_judge_count = 0
    for v in candidate_eid_lists.values():
        ftm_judge_count += len(v)
        logger.info("There were %s judges in FTM that we matched "
                    "against." % ftm_judge_count)
    def assign_judges_to_opinions(self):
        logger.info("Assigning authors...")
        assign_authors_to_opinions(
            jurisdictions=self.options["jurisdictions"],
            testing=self.debug,
        )
    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i, item in enumerate(site):
                msg, r = get_binary_content(item['download_urls'],
                                            site.cookies,
                                            site._get_adapter_instance(),
                                            method=site.method)
                if msg:
                    logger.warning(msg)
                    ErrorLog(log_level='WARNING', court=court,
                             message=msg).save()
                    continue

                content = site.cleanup_content(r.content)

                current_date = item['case_dates']
                try:
                    next_date = site[i + 1]['case_dates']
                except IndexError:
                    next_date = None

                # request.content is sometimes a str, sometimes unicode, so
                # force it all to be bytes, pleasing hashlib.
                sha1_hash = hashlib.sha1(force_bytes(content)).hexdigest()
                if (court_str == 'nev'
                        and item['precedential_statuses'] == 'Unpublished'):
                    # Nevada's non-precedential cases have different SHA1
                    # sums every time.
                    lookup_params = {
                        'lookup_value': item['download_urls'],
                        'lookup_by': 'download_url'
                    }
                else:
                    lookup_params = {
                        'lookup_value': sha1_hash,
                        'lookup_by': 'sha1'
                    }

                onwards = dup_checker.press_on(Opinion, current_date,
                                               next_date, **lookup_params)
                if dup_checker.emulate_break:
                    break

                if onwards:
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' %
                                item['download_urls'].encode('utf-8'))
                    dup_checker.reset()

                    docket, opinion, cluster, citations, error = self.make_objects(
                        item, court, sha1_hash, content)

                    if error:
                        download_error = True
                        continue

                    self.save_everything(
                        items={
                            'docket': docket,
                            'opinion': opinion,
                            'cluster': cluster,
                            'citations': citations,
                        },
                        index=False,
                    )
                    extract_doc_content.delay(
                        opinion.pk,
                        callback=subtask(extract_by_ocr),
                        citation_countdown=random.randint(0, 3600))

                    logger.info("Successfully added doc {pk}: {name}".format(
                        pk=opinion.pk,
                        name=item['case_names'].encode('utf-8'),
                    ))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled opinions." % site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)
        cache.clear()
        logger.info('Cleared cache')
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)
        global die_now

        # this line is used for handling SIGTERM (CTRL+4), so things can die
        # safely
        signal.signal(signal.SIGTERM, signal_handler)

        module_strings = build_module_list(options['court_id'])
        if not len(module_strings):
            raise CommandError('Unable to import module or package. Aborting.')

        logger.info("Starting up the scraper.")
        num_courts = len(module_strings)
        wait = (options['rate'] * 60) / num_courts
        i = 0
        while i < num_courts:
            # this catches SIGTERM, so the code can be killed safely.
            if die_now:
                logger.info("The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)

            mod = __import__("%s.%s" % (package, module), globals(), locals(),
                             [module])
            # noinspection PyBroadException
            try:
                self.parse_and_scrape_site(mod, options['full_crawl'])
            except Exception as e:
                # noinspection PyBroadException
                try:
                    msg = ('********!! CRAWLER DOWN !!***********\n'
                           '*****scrape_court method failed!*****\n'
                           '********!! ACTION NEEDED !!**********\n%s' %
                           traceback.format_exc())
                    logger.critical(msg)

                    # opinions.united_states.federal.ca9_u --> ca9
                    court_str = mod.Site.__module__.split('.')[-1].split(
                        '_')[0]
                    court = Court.objects.get(pk=court_str)
                    ErrorLog(log_level='CRITICAL', court=court,
                             message=msg).save()
                except Exception as e:
                    # This is very important. Without this, an exception
                    # above will crash the caller.
                    pass
            finally:
                time.sleep(wait)
                last_court_in_list = (i == (num_courts - 1))
                if last_court_in_list and options['daemon']:
                    # Start over...
                    logger.info(
                        "All jurisdictions done. Looping back to "
                        "the beginning because daemon mode is enabled.")
                    i = 0
                else:
                    i += 1

        logger.info("The scraper has stopped.")
        sys.exit(0)
    def run_query(self, alert, rate):
        results = []
        error = False
        cd = {}
        try:
            logger.info("Now running the query: %s\n" % alert.query)

            # Set up the data
            data = search_utils.get_string_to_dict(alert.query)
            try:
                del data['filed_before']
            except KeyError:
                pass
            data['order_by'] = 'score desc'
            logger.info("  Data sent to SearchForm is: %s\n" % data)
            search_form = SearchForm(data)
            if search_form.is_valid():
                cd = search_form.cleaned_data

                if rate == 'rt' and len(self.valid_ids[cd['type']]) == 0:
                    # Bail out. No results will be found if no valid_ids.
                    return error, cd['type'], results

                cut_off_date = get_cut_off_date(rate)
                if cd['type'] == 'o':
                    cd['filed_after'] = cut_off_date
                elif cd['type'] == 'oa':
                    cd['argued_after'] = cut_off_date
                main_params = search_utils.build_main_query(cd, facet=False)
                main_params.update({
                    'rows': '20',
                    'start': '0',
                    'hl.tag.pre': '<em><strong>',
                    'hl.tag.post': '</strong></em>',
                    'caller': 'cl_send_alerts',
                })

                if rate == 'rt':
                    main_params['fq'].append('id:(%s)' % ' OR '.join(
                        [str(i) for i in self.valid_ids[cd['type']]]
                    ))
                results = self.connections[
                    cd['type']
                ].query().add_extra(
                    **main_params
                ).execute()
                regroup_snippets(results)

            else:
                logger.info("  Query for alert %s was invalid\n"
                            "  Errors from the SearchForm: %s\n" %
                            (alert.query, search_form.errors))
                error = True
        except Exception:
            traceback.print_exc()
            logger.info("  Search for this alert failed: %s\n" %
                        alert.query)
            error = True

        logger.info("  There were %s results\n" % len(results))

        return error, cd.get('type'), results
def upload_recap_data(options):
    """Upload RECAP data to Internet Archive."""
    q = options['queue']
    r = redis.StrictRedis(host=settings.REDIS_HOST,
                          port=settings.REDIS_PORT,
                          db=settings.REDIS_DATABASES['CACHE'])
    redis_key = 'recap-docket-last-id'
    last_pk = r.getset(redis_key, 0)
    ds = Docket.objects.filter(
        Q(ia_upload_failure_count__lte=3) | Q(ia_upload_failure_count=None),
        ia_needs_upload=True,
        source__in=Docket.RECAP_SOURCES,
        pk__gt=last_pk,
    ).order_by('pk').only('pk')

    chunk_size = 100  # Small to save memory
    i = 0
    previous_i = None
    delay_count = 0
    t1 = now()
    logger.info("Sending recap dockets to Internet Archive")
    throttle = CeleryThrottle(queue_name=q, min_items=5)
    while True:
        # Start of quarter needs to be re-analyzed every time through the loop.
        # This ensures that if the quarter changes while this runs, we get the
        # new value.
        params = {
            'pk__gt': last_pk,
            'ia_date_first_change__lt': get_start_of_quarter(),
        }
        for d in ds.filter(**params)[:chunk_size]:
            throttle.maybe_wait()
            upload_recap_json.apply_async(args=(d.pk, ), queue=q)
            i += 1
            if i % 100 == 0:
                # Print a useful log line with expected finish date.
                t2 = now()
                elapsed_minutes = float((t2 - t1).seconds) / 60
                try:
                    rate = i / float(elapsed_minutes)
                    logger.info("Uploaded %s dockets to IA so far (%.01f/m)",
                                i, rate)
                except ZeroDivisionError:
                    # First lap through can be completed in less than 1s.
                    pass
            last_pk = d.pk
            r.set(redis_key, last_pk)

        # Detect if we've hit the end of the loop and reset it if so. We do
        # this by keeping track of the last_pk that we saw the last time the
        # for loop changed. If that PK doesn't change after the for loop has
        # run again, then we know we've hit the end of the loop and we should
        # reset it.
        empty_loop = i == previous_i
        if empty_loop:
            # i is the same as the last time the
            # for loop finished. Reset things.
            if last_pk == 0:
                # We went through the for loop a second time and still didn't
                # do anything. Stall with capped back off.
                delay_count += 1
                max_delay = 60 * 30  # Thirty minutes
                delay = min(delay_count * 60, max_delay)
                time.sleep(delay)
            else:
                delay_count = 0
                last_pk = 0
                r.set(redis_key, 0)
        else:
            previous_i = i
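# Illustrative note (not from the source): the stall logic above backs off
# linearly by one minute per empty pass and caps at thirty minutes, e.g.:
for demo_delay_count in (1, 5, 40):
    print(demo_delay_count, min(demo_delay_count * 60, 60 * 30))  # 60, 300, 1800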
def update_judges_by_solr(candidate_id_map, debug):
    """Update judges by looking up each entity from FTM in Solr."""
    conn = SolrInterface(settings.SOLR_PEOPLE_URL, mode="r")
    match_stats = defaultdict(int)
    # These IDs are ones that cannot be updated due to being identified as
    # problematic in FTM's data.
    blacklisted_ids = defaultdict(set)
    for court_id, candidate_list in candidate_id_map.items():
        for candidate in candidate_list:
            # Look up the candidate in Solr.
            logger.info("Doing: %s" % candidate["name"])
            name = (" AND ".join([
                word for word in candidate["name"].split() if len(word) > 1
            ])).replace(",", "")
            results = conn.raw_query(**{
                "caller": "ftm_update_judges_by_solr",
                "fq": [
                    "name:(%s)" % name,
                    "court_exact:%s" % court_id,
                    # This filters out Sr/Jr problems by insisting on recent
                    # positions. 1980 is arbitrary, based on testing.
                    "date_start:[1980-12-31T23:59:59Z TO *]",
                ],
                "q": "*",
            }).execute()

            if len(results) == 0:
                match_stats[len(results)] += 1
                logger.info("Found no matches.")

            elif len(results) == 1:
                match_stats[len(results)] += 1
                logger.info("Found one match: %s" % results[0]["name"])

                # Get the person from the DB and update them.
                pk = results[0]["id"]
                if pk in blacklisted_ids:
                    continue
                p = Person.objects.get(pk=pk)
                if p.ftm_eid:
                    if p.ftm_eid != candidate["eid"]:
                        logger.info("  Found values in ftm database fields. "
                                    "This indicates a duplicate in FTM.")

                        blacklisted_ids[p.pk].add(candidate["eid"])
                        blacklisted_ids[p.pk].add(p.ftm_eid)
                        p.ftm_eid = ""
                        p.ftm_total_received = None
                    else:
                        logger.info("Found values with matching EID. Adding "
                                    "amounts, since this indicates multiple "
                                    "jurisdictions that the judge was in.")
                        p.ftm_total_received += candidate["total"]
                    if not debug:
                        p.save()
                else:
                    # No major problems. Proceed.
                    p.ftm_eid = candidate["eid"]
                    p.ftm_total_received = candidate["total"]
                    if not debug:
                        p.save()

            elif len(results) > 1:
                match_stats[len(results)] += 1
                logger.info("  Found more than one match: %s" % results)

    print_stats(match_stats, candidate_id_map)
    logger.info("Blacklisted IDs: %s" % blacklisted_ids)
    def assign_bankruptcy_fjc(self):
        """Update FJC judges with bankruptcy positions."""
        logger.info("Assigning bankruptcy courtids...")
        update_bankruptcy_and_magistrate(testing=self.debug)
def update_tax_opinions():
    """
    This code identifies tax opinions without
    docket numbers or citations and attempts to parse them out
    and add the citation and docket numbers to the case.

    http://www.ustaxcourt.gov/UstcInOp/asp/Todays.asp is an identifier for
    bad scrapes in tax court.
    :return: None
    """
    logger.info("Start updating Tax Opinions")
    ocs = OpinionCluster.objects.filter(docket__court="tax").filter(
        docket__docket_number=None)

    # We had a number of failed scrapes and the bad_url helps identify them
    bad_url = "http://www.ustaxcourt.gov/UstcInOp/asp/Todays.asp"
    for oc in ocs:
        op_objs = oc.sub_opinions.all()
        for opinion in op_objs:
            if opinion.plain_text == "":
                logger.info("No plain text to parse.")
                continue
            if opinion.download_url == bad_url:
                logger.info("Failed scrape, nothing to parse.")
                continue

            docket_numbers = get_tax_docket_numbers(opinion.plain_text)
            if docket_numbers:
                logger.info("Adding Docket Numbers: %s to %s" %
                            (docket_numbers, oc.docket.case_name))
                oc.docket.docket_number = docket_numbers
                oc.docket.save()

            cite = generate_citation(opinion.plain_text, oc.id)

            if cite is None:
                logger.info("No cite to add for opinion %s on cluster %s" %
                            (opinion.id, oc.id))
                continue

            logger.info("Saving citation %s %s %s" %
                        (cite.volume, cite.reporter, cite.page))

            Citation.objects.get_or_create(
                volume=cite.volume,
                reporter=cite.reporter,
                page=cite.page,
                type=cite.type,
                cluster_id=oc.id,
            )
    def make_recap_document(
        self,
        doc_node,
        docket_entry,
        entry_number,
        attachment_number,
        document_type,
        debug,
    ):
        """Do nothing for items that don't start with zero. For ones that do,
        find the stripped version, fix it, download the correct item, extract
        it and finally save it to Solr.
        """

        if not entry_number.startswith("0"):
            # Only touch things where the new value leads with a zero.
            return None
        else:
            logger.info("  Doing docket_entry: %s, document_number, "
                        "%s and attachment number: %s" %
                        (docket_entry, entry_number, attachment_number))
        old_entry_number = int(entry_number)

        try:
            rd = RECAPDocument.objects.get(
                docket_entry=docket_entry,
                document_number=old_entry_number,
                attachment_number=attachment_number or None,
            )
            logger.info("    Found item.")
        except RECAPDocument.DoesNotExist:
            logger.info("    Failed to find item.")
            return None

        rd.document_number = entry_number
        if rd.is_available:
            new_ia = get_ia_document_url_from_path(self.path, entry_number,
                                                   attachment_number)
            logger.info("    Updating IA URL from %s to %s" %
                        (rd.filepath_ia, new_ia))
            rd.filepath_ia = new_ia

            if not os.path.isfile(rd.filepath_local.path):
                # Set the value correctly and get the file from IA if we don't
                # already have it.
                new_local_path = os.path.join(
                    "recap",
                    get_local_document_url_from_path(self.path, entry_number,
                                                     attachment_number),
                )
                logger.info("    Updating local path from %s to %s" %
                            (rd.filepath_local, new_local_path))
                rd.filepath_local = new_local_path
                filename = rd.filepath_ia.rsplit("/", 1)[-1]
                logger.info("    Downloading item with filename %s" % filename)
                if not debug:
                    download_recap_item(rd.filepath_ia, filename)
            else:
                logger.info("    File already on disk. Punting.")

            if rd.page_count is None:
                logger.info("    Getting page count.")
                extension = rd.filepath_local.path.split(".")[-1]
                rd.page_count = get_page_count(rd.filepath_local.path,
                                               extension)
        else:
            logger.info("    Item not available in RECAP. Punting.")
            return None

        if not debug:
            try:
                extract_recap_pdf(rd.pk, check_if_needed=False)
                rd.save(do_extraction=False, index=True)
                logger.info(
                    "    Item saved at https://www.courtlistener.com%s" %
                    rd.get_absolute_url())
            except IntegrityError:
                logger.info("    Integrity error while saving.")
                return None
        else:
            logger.info("    No save requested in debug mode.")

        return rd
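# Illustrative note (not from the source): the method above exists because
# document numbers such as "065" were stored with the leading zero stripped,
# so the database held the integer form while the source data did not.
demo_entry_number = "065"
demo_old_entry_number = int(demo_entry_number)  # 65 -- the stripped value
assert demo_entry_number.startswith("0") and demo_old_entry_number == 65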
def parse_harvard_opinions(reporter, volume, make_searchable):
    """
    Parse downloaded CaseLaw Corpus from internet archive and add them to our
    database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA.  (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    :param volume: The volume (int) of the reporters (optional) (ex 10)
    :param reporter: Reporter string as slugify'd (optional) (tc) for T.C.
    :param make_searchable: Boolean to indicate saving to solr
    :return: None
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download",
             file_path.split("/", 9)[-1]])

        if OpinionCluster.objects.filter(
                filepath_json_harvard=file_path).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        cites = get_citations(data["citations"][0]["cite"])
        if not cites:
            logger.info("No citation found for %s." %
                        data["citations"][0]["cite"])
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name, file_path):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML
        # Flag them for a later crawl by using the placeholder '[[Image]]'
        judge_list = [
            find_judge_names(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            find_judge_names(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        judges = ", ".join(
            sorted(
                list(
                    set(itertools.chain.from_iterable(judge_list +
                                                      author_list)))))
        judges = titlecase(judges)
        docket_string = (data["docket_number"].replace(
            "Docket No.", "").replace("Docket Nos.", "").strip())

        short_fields = ["attorneys", "disposition", "otherdate", "seealso"]

        long_fields = [
            "syllabus",
            "summary",
            "history",
            "headnotes",
            "correction",
        ]

        short_data = parse_extra_fields(soup, short_fields, False)
        long_data = parse_extra_fields(soup, long_fields, True)

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            try:
                with transaction.atomic():
                    docket.save()
            except OperationalError as e:
                if "exceeds maximum" in str(e):
                    docket.docket_number = (
                        "%s, See Corrections for full Docket Number" %
                        trunc(docket_string, length=5000, ellipsis="..."))
                    docket.save()
                    long_data["correction"] = "%s <br> %s" % (
                        data["docket_number"],
                        long_data["correction"],
                    )
            # Handle partial dates by adding -01 to YYYY-MM dates (see the
            # sketch after this function).
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=short_data["attorneys"],
                disposition=short_data["disposition"],
                syllabus=long_data["syllabus"],
                summary=long_data["summary"],
                history=long_data["history"],
                other_dates=short_data["otherdate"],
                cross_reference=short_data["seealso"],
                headnotes=long_data["headnotes"],
                correction=long_data["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )
            cluster.save(index=False)

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.canonical_reporter][0]["cite_type"]),
                cluster_id=cluster.id,
            )
            new_op_pks = []
            for op in soup.find_all("opinion"):
                # This code cleans author tags for processing. It is
                # particularly useful for identifying Per Curiam.
                author_elem = op.find("author")
                if author_elem is not None:
                    for x in author_elem.find_all("page-number"):
                        x.extract()

                auth = op.find("author")
                if auth is not None:
                    author_tag_str = titlecase(auth.text.strip(":"))
                    author_str = titlecase("".join(
                        find_judge_names(author_tag_str)))
                else:
                    author_str = ""
                    author_tag_str = ""

                per_curiam = author_tag_str == "Per Curiam"
                # If Per Curiam is True set author string to Per Curiam
                if per_curiam:
                    author_str = "Per Curiam"

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                op = Opinion(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    per_curiam=per_curiam,
                    extracted_by_ocr=True,
                )
                # Don't index now; do so later if desired
                op.save(index=False)
                new_op_pks.append(op.pk)

        if make_searchable:
            add_items_to_solr.delay(new_op_pks, "search.Opinion")

        logger.info("Finished: %s", citation.base_citation())
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options["queue"]

    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 20000
    main_query = build_main_query_from_query_string(
        QUERY_STRING,
        {
            "rows": page_size,
            "fl": ["id", "docket_id"]
        },
        {
            "group": False,
            "facet": False,
            "highlight": False
        },
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        logger.info(
            "Doing item %s w/rd: %s, d: %s",
            i,
            result["id"],
            result["docket_id"],
        )

        try:
            rd = RECAPDocument.objects.get(pk=result["id"])
        except RECAPDocument.DoesNotExist:
            logger.warning("Unable to find RECAP Document with id %s",
                           result["id"])
            continue

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG)
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], "search.RECAPDocument").set(queue=q),
        ).apply_async()
    def get_human_review(self, clusters, d):
        for i, cluster in enumerate(clusters):
            logger.info("%s: Cluster %s (%0.3f sim):" % (
                i,
                cluster.pk,
                gen_diff_ratio(cluster.case_name.lower(),
                               d["caseName"].lower()),
            ))
            logger.info("https://www.courtlistener.com%s" %
                        cluster.get_absolute_url())
            logger.info("%s" % cluster.case_name.encode())
            if cluster.docket.docket_number:
                logger.info(cluster.docket.docket_number.encode())
            logger.info(cluster.date_filed)
        logger.info("SCDB info:")
        logger.info(d["caseName"])
        if d["docket"]:
            logger.info(d["docket"])
        logger.info(d["dateDecision"])

        if self.skip_human_review:
            logger.info(
                "Skipping human review and just returning the first item.")
            self.skipped_count += 1
            return clusters[0]
        else:
            choice = input("Which item should we update? [0-%s] " %
                           (len(clusters) - 1))

            try:
                choice = int(choice)
                cluster = clusters[choice]
            except ValueError:
                cluster = None
            return cluster
    def assign_judges_to_oral_arguments(self):
        logger.info("Assigning panel members to oral arguments...")
        assign_authors_to_oral_arguments(testing=self.debug)
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)
        logger.info("Using PACER username: %s" % PACER_USERNAME)
        get_documents(options)
def make_dict_of_ftm_eids(use_pickle=True):
    """Build up a dictionary mapping jurisdiction IDs to candidates in those
    locations
    """
    pickle_location = "/tmp/eid_lists.pkl"
    if use_pickle:
        if os.path.isfile(pickle_location):
            with open(pickle_location, "r") as f:
                logger.info("Loading pickled candidate list. Read the command "
                            "documentation if this is not desired.")
                return pickle.load(f)
        else:
            logger.info("Unable to find pickle file.")

    candidate_eid_lists = defaultdict(list)

    for courtid, (state, level) in courtid2statelevel.items():
        if level != "H":
            # We only want high courts.
            continue
        for year in range(1989, 2017):
            url = url_template.format(
                key=settings.FTM_KEY,
                state=state,
                level=leveldict[level],
                year=year,
            )
            logger.info("Getting url at: %s" % url)
            data = requests.get(url).json()

            if data["records"] == ["No Records"]:
                logger.info("  No records found in court %s and year %s." %
                            (courtid, year))
                continue
            logger.info("  Found %s records in court %s and year %s" %
                        (len(data["records"]), courtid, year))

            for item in data["records"]:
                # Add a dict with eid, name, total, and year to this court's
                # list.
                candidate_eid_lists[courtid].append({
                    "eid": item["Candidate_Entity"]["id"],
                    "name": item["Candidate_Entity"]["Candidate_Entity"],
                    "total": float(item["Total_$"]["Total_$"]),
                    "year": year,
                })

    if use_pickle:
        with open(pickle_location, "w") as f:
            logger.info("Creating pickle file at: %s" % pickle_location)
            pickle.dump(candidate_eid_lists, f)
    return candidate_eid_lists
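# Illustrative sketch (hypothetical values): the mapping built above keys each
# court ID to a list of candidate dicts, which callers can walk like this:
demo_candidate_eid_lists = {
    "cal": [
        {"eid": 12345, "name": "Some Candidate", "total": 1500.0, "year": 2012},
    ],
}
for demo_court_id, demo_candidates in demo_candidate_eid_lists.items():
    for demo_candidate in demo_candidates:
        print(demo_court_id, demo_candidate["eid"], demo_candidate["total"])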
def get_dockets(options):
    """Download the dockets described in the CSV according to the `tasks`
    option.
    """
    f = options["file"]
    reader = csv.DictReader(f)
    q = options["queue"]
    task = options["task"]
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        if row["Too Old"] == "Yes":
            continue
        if row["Appellate/District"].lower() != task:
            # Only do appellate when appellate, and district when district.
            continue

        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", i, row)
        throttle.maybe_wait()
        if task == "appellate":
            chain(
                get_appellate_docket_by_docket_number.s(
                    docket_number=row["Cleaned case_No"],
                    court_id=row["fjc_court_id"],
                    cookies=session.cookies,
                    tag_names=[TAG],
                    **{
                        "show_docket_entries": True,
                        "show_orig_docket": True,
                        "show_prior_cases": True,
                        "show_associated_cases": True,
                        "show_panel_info": True,
                        "show_party_atty_info": True,
                        "show_caption": True,
                    }
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
        elif task == "district":
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=row["Cleaned case_No"],
                    court_id=row["fjc_court_id"],
                    cookies=session.cookies,
                    case_name=row["Title"],
                ).set(queue=q),
                get_docket_by_pacer_case_id.s(
                    court_id=row["fjc_court_id"],
                    cookies=session.cookies,
                    tag_names=[TAG],
                    **{
                        "show_parties_and_counsel": True,
                        "show_terminated_parties": True,
                        "show_list_of_member_cases": True,
                    }
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
    def fix_fjc_positions(self, infile=None):
        """
        Addresses issue #624.

        We had some errant regexes in the district court assignments. This code
        reassigns the court fields for these judges where the new regexes
        differs from the old ones.

        :param infile: The import file with fjc-data.xslx
        :return: None
        """

        if infile is None:
            self.ensure_input_file()
            infile = self.options["input_file"]
        textfields = [
            "firstname",
            "midname",
            "lastname",
            "gender",
            "Place of Birth (City)",
            "Place of Birth (State)",
            "Place of Death (City)",
            "Place of Death (State)",
        ]
        df = pd.read_excel(infile, 0)
        for x in textfields:
            df[x] = df[x].replace(np.nan, "", regex=True)
        df["Employment text field"].replace(to_replace=r";\sno",
                                            value=r", no",
                                            inplace=True,
                                            regex=True)
        for i, item in df.iterrows():
            fjc_id = item["Judge Identification Number"]
            p = Person.objects.get(fjc_id=fjc_id)
            logger.info("Doing person with FJC ID: %s, "
                        "https://courtlistener.com%s" %
                        (fjc_id, p.get_absolute_url()))

            exclusions = []
            for posnum in range(1, 7):
                if posnum > 1:
                    pos_str = " (%s)" % posnum
                else:
                    pos_str = ""

                if pd.isnull(item["Court Name" + pos_str]):
                    continue
                courtid = match_court_string(item["Court Name" + pos_str],
                                             federal_district=True)
                if courtid is None:
                    raise Exception("Unable to match court string: %s" %
                                    item["Court Name" + pos_str])
                date_termination = process_date_string(
                    item["Date of Termination" + pos_str])
                date_start = process_date_string(item["Commission Date" +
                                                      pos_str])
                date_recess_appointment = process_date_string(
                    item["Recess Appointment date" + pos_str])
                if pd.isnull(
                        date_start) and not pd.isnull(date_recess_appointment):
                    date_start = date_recess_appointment
                if pd.isnull(date_start):
                    # If there's still no start date, fall back to a null value.
                    date_start = None
                positions = Position.objects.filter(
                    person=p,
                    date_start=date_start,
                    date_termination=date_termination,
                    position_type="jud",
                ).exclude(pk__in=exclusions)
                position_count = positions.count()
                if position_count < 1:
                    logger.info("Couldn't find position to match '%s' on '%s' "
                                "with exclusions: %s" %
                                (p, date_start, exclusions))
                    add_positions_from_row(item,
                                           p,
                                           self.debug,
                                           fix_nums=[posnum])
                    if not self.debug:
                        add_items_to_solr.delay([p.pk], "people_db.Person")
                    continue
                elif position_count == 1:
                    # Good case. Press on!
                    position = positions[0]
                    exclusions.append(position.pk)
                elif position_count > 1:
                    logger.info(
                        "Got too many results for '%s' on '%s'. Got %s" %
                        (p, date_start, position_count))
                    continue

                if position.court.pk == courtid:
                    logger.info("Court IDs are both '%s'. No changes made." %
                                courtid)
                else:
                    logger.info("Court IDs are different! Old: %s, New: %s" %
                                (position.court.pk, courtid))
                    court = Court.objects.get(pk=courtid)
                    position.court = court

                    if not self.debug:
                        position.save()
                        add_items_to_solr.delay([p.pk], "people_db.Person")