Example #1
0
    def setUp(self):
        docket = Docket(
            case_name=u'foo',
            court=Court.objects.get(pk='test'),
            source=Docket.DEFAULT
        )
        docket.save()
        # Must be more than a year old for all tests to be runnable.
        over_a_year_ago = now().date() - timedelta(days=400)
        self.doc_cluster = OpinionCluster(
            case_name=u"foo",
            docket=docket,
            date_filed=over_a_year_ago
        )
        self.doc_cluster.save(index=False)
        opinion = Opinion(cluster=self.doc_cluster, type='Lead Opinion')
        opinion.save(index=False)

        opinion2 = Opinion(cluster=self.doc_cluster, type='Concurrence')
        opinion2.save(index=False)

        OpinionsCited.objects.create(
            citing_opinion=opinion2,
            cited_opinion=opinion
        )

        # Scrape the audio "site" and add its contents
        site = test_oral_arg_scraper.Site().parse()
        OralArgumentCommand().scrape_court(site, full_crawl=True)
Example #2
0
def lookup_and_save(new, debug=False):
    """Merge new docket info into the database.

    Start by attempting to look up an existing Docket. If that's not found,
    create a new one. Either way, merge all the attributes of `new` into the
    Docket found, and then save the Docket.

    Return None if an error occurs; otherwise, return the new or updated
    Docket.
    """
    try:
        d = Docket.objects.get(pacer_case_id=new.pacer_case_id,
                               court=new.court)
    except (Docket.DoesNotExist, Docket.MultipleObjectsReturned):
        d = None

    if d is None:
        ds = Docket.objects.filter(docket_number=new.docket_number,
                                   court=new.court).order_by('-date_filed')
        count = ds.count()
        if count < 1:
            # Can't find it by pacer_case_id or docket_number. Make a new item.
            d = Docket(source=Docket.RECAP)
        elif count == 1:
            # Nailed it!
            d = ds[0]
        elif count > 1:
            # Too many dockets returned. Disambiguate.
            logger.error("Got multiple results while attempting save.")

            def is_different(x):
                return x.pacer_case_id and x.pacer_case_id != new.pacer_case_id
            # Use a name other than `d` so the comprehension can't clobber
            # the docket we're building up.
            if all(is_different(docket) for docket in ds):
                # All the dockets found match on docket number, but have
                # different pacer_case_ids. This means that the docket has
                # multiple pacer_case_ids in PACER, and we should mirror that
                # in CL by creating a new docket for the new item.
                d = Docket(source=Docket.RECAP)
            else:
                # Just use the most recent docket. Looking at the data, this is
                # OK. Nearly all of these are dockets associated with clusters
                # that can be merged (however, that's a project for clusters).
                d = ds[0]

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER

    for attr, v in new.__dict__.items():
        setattr(d, attr, v)

    if not debug:
        d.save()
        logger.info("Saved as Docket %s: https://www.courtlistener.com%s" %
                    (d.pk, d.get_absolute_url()))
    return d
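The cascade above (exact key, then weaker key, then disambiguation) is worth seeing outside Django. A minimal standalone sketch of the same lookup logic, using a plain list of dicts as a hypothetical stand-in for the Docket table:

def find_record(records, pacer_case_id, docket_number, court):
    """Mirror lookup_and_save's cascade: most specific key first, then
    fall back to docket_number, newest first. Returns None when the
    caller should create a fresh record."""
    exact = [r for r in records
             if r['pacer_case_id'] == pacer_case_id and r['court'] == court]
    if len(exact) == 1:
        return exact[0]

    candidates = sorted(
        (r for r in records
         if r['docket_number'] == docket_number and r['court'] == court),
        key=lambda r: r['date_filed'],
        reverse=True,
    )
    if not candidates:
        return None
    if all(r['pacer_case_id'] and r['pacer_case_id'] != pacer_case_id
           for r in candidates):
        # Same docket number but different PACER ids: treat as a new record.
        return None
    return candidates[0]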
Example #3
0
def import_resource_org_item(case_location):
    """Using the path to a case, import it, gathering all needed meta data.

    Path is any valid URI that the requests library can handle.
    """
    def get_file(location):
        if location.startswith('/'):
            # Local path: fake a response-like object by giving a bare
            # Session instance a .content attribute, so both branches
            # return the same shape.
            with open(location) as f:
                r = requests.Session()
                r.content = f.read()
        else:
            r = requests.get(location)
        return fromstring(r.content), get_clean_body_content(r.content)

    # Get trees and text for the opinion itself and for the index page
    # that links to it. Each has useful data.
    case_tree, case_text = get_file(case_location)
    vol_location = case_location.rsplit('/', 1)[0] + '/index.html'
    vol_tree, vol_text = get_file(vol_location)

    html, blocked = anonymize(get_case_body(case_tree))

    case_location_relative = case_location.rsplit('/', 1)[1]
    case_name, status = get_case_name_and_status(
        vol_tree, case_location_relative)
    docket = Docket(
        docket_number=get_docket_number(case_location),
        court=Court.objects.get(pk=get_court_id(case_tree)),
        case_name=case_name,
    )
    doc = Document(
        case_name=case_name,
        federal_cite_one=get_west_cite(vol_tree, case_location_relative),
        date_filed=get_date_filed(vol_tree, case_location_relative),
        source='R',
        sha1=hashlib.sha1(case_text).hexdigest(),
        docket=docket,
        download_url=case_location,
        html=html,
        precedential_status=status,
    )
    if blocked:
        doc.blocked = True
        docket.blocked = True
        doc.date_blocked = datetime.date.today()
        docket.date_blocked = datetime.date.today()

    docket.save()
    doc.docket = docket
    doc.save()

    # Update the citation graph
    from cl.citations.tasks import update_document_by_id
    update_document_by_id(doc.pk)

    return doc
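A note on the get_file helper above: it works by bolting a .content attribute onto a requests.Session instance so local files and HTTP responses look alike. A cleaner sketch of the same idea, assuming only that callers read .content, uses an explicit named tuple (Fetched is hypothetical):

import collections

import requests

Fetched = collections.namedtuple('Fetched', 'content')

def get_file(location):
    """Read a local path or fetch a URL; both branches return an object
    exposing .content."""
    if location.startswith('/'):
        with open(location) as f:
            return Fetched(content=f.read())
    return requests.get(location)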
Example #4
0
    def test_save_old_opinion(self):
        """Can we save opinions older than 1900?"""
        docket = Docket(case_name=u"Blah", court_id='test',
                        source=Docket.DEFAULT)
        docket.save()
        oc = OpinionCluster(
            case_name=u"Blah",
            docket=docket,
            date_filed=datetime.date(1899, 1, 1),
        )
        oc.save()
        o = Opinion(cluster=oc, type='Lead Opinion')

        try:
            cf = ContentFile(StringIO.StringIO('blah').read())
            o.file_with_date = datetime.date(1899, 1, 1)
            o.local_path.save('file_name.pdf', cf, save=False)
            o.save(index=False)
        except ValueError:
            raise ValueError("Unable to save a case older than 1900. Did you "
                             "try to use `strftime`...again?")
Example #5
0
    def save(self, debug):
        """Save the item to the database, updating any existing items.

        Returns None if an error occurs.
        """
        required_fields = ['case_name', 'date_filed']
        for field in required_fields:
            if not getattr(self, field):
                print "  Missing required field: %s" % field
                return None

        try:
            d = Docket.objects.get(
                Q(pacer_case_id=self.pacer_case_id) |
                Q(docket_number=self.docket_number),
                court=self.court,
            )
            if d.source == Docket.SCRAPER:
                d.source = Docket.RECAP_AND_SCRAPER
        except Docket.DoesNotExist:
            d = Docket(
                source=Docket.RECAP,
            )
        except Docket.MultipleObjectsReturned:
            print "  Got multiple results while attempting save."
            return None

        for attr, v in self.__dict__.items():
            setattr(d, attr, v)

        if not debug:
            d.save()
            print "  Saved as Docket %s: https://www.courtlistener.com%s" % (
                d.pk,
                d.get_absolute_url()
            )
        return d
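Both this save() and lookup_and_save merge attributes with a blanket __dict__ copy. That copies everything, including private state (on Django models, attributes like _state), so an explicit field list is often the safer pattern. A minimal sketch of the difference with plain objects:

class Source(object):
    def __init__(self):
        self.case_name = 'foo'
        self.date_filed = '2017-01-01'
        self._internal = object()  # private state we don't want copied

class Target(object):
    pass

FIELDS = ['case_name', 'date_filed']

src = Source()

# Blanket copy: drags _internal along with the real fields.
dst = Target()
for attr, v in src.__dict__.items():
    setattr(dst, attr, v)
assert hasattr(dst, '_internal')

# Explicit copy: only the fields we mean to merge.
dst2 = Target()
for field in FIELDS:
    setattr(dst2, field, getattr(src, field))
assert not hasattr(dst2, '_internal')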
Example #6
0
def lookup_and_save(new, debug=False):
    """Merge new docket info into the database.

    Start by attempting to look up an existing Docket. If that's not found,
    create a new one. Either way, merge all the attributes of `new` into the
    Docket found, and then save the Docket.

    Return None if an error occurs; otherwise, return the new or updated
    Docket.
    """
    try:
        d = Docket.objects.get(
            pacer_case_id=new.pacer_case_id, court=new.court
        )
    except (Docket.DoesNotExist, Docket.MultipleObjectsReturned):
        d = None

    if d is None:
        ds = Docket.objects.filter(
            docket_number=new.docket_number, court=new.court
        ).order_by("-date_filed")
        count = ds.count()
        if count < 1:
            # Can't find it by pacer_case_id or docket_number. Make a new item.
            d = Docket(source=Docket.RECAP)
        elif count == 1:
            # Nailed it!
            d = ds[0]
        elif count > 1:
            # Too many dockets returned. Disambiguate.
            logger.error("Got multiple results while attempting save.")

            def is_different(x):
                return x.pacer_case_id and x.pacer_case_id != new.pacer_case_id

            # Use a name other than `d` so the comprehension can't clobber
            # the docket we're building up.
            if all(is_different(docket) for docket in ds):
                # All the dockets found match on docket number, but have
                # different pacer_case_ids. This means that the docket has
                # multiple pacer_case_ids in PACER, and we should mirror that
                # in CL by creating a new docket for the new item.
                d = Docket(source=Docket.RECAP)
            else:
                # Just use the most recent docket. Looking at the data, this is
                # OK. Nearly all of these are dockets associated with clusters
                # that can be merged (however, that's a project for clusters).
                d = ds[0]

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER

    for attr, v in new.__dict__.items():
        setattr(d, attr, v)

    if not debug:
        d.save()
        logger.info(
            "Saved as Docket %s: https://www.courtlistener.com%s"
            % (d.pk, d.get_absolute_url())
        )
    return d
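The if/elif ladder that folds RECAP into d.source recurs in several of these examples. If the source constants were bit flags, the whole merge would collapse to a single OR. A sketch under that assumption -- the flag values below are illustrative, not CourtListener's actual constants:

import enum

class Source(enum.IntFlag):
    DEFAULT = 0
    SCRAPER = 1
    RECAP = 2
    COLUMBIA = 4

source = Source.COLUMBIA | Source.SCRAPER
source |= Source.RECAP  # one operation replaces the ladder
assert source == Source.COLUMBIA | Source.RECAP | Source.SCRAPER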
Example #7
0
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or recap
            // document was created (implying Solr needs updating).
            'needs_solr_update': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')

    if 'History/Documents' in text:
        # Prior to 1.1.8, we did not separate docket history reports into their
        # own upload_type. Alas, we still have some old clients around, so we
        # need to handle those clients here.
        pq.upload_type = pq.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.callbacks = None
        return None

    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if docket_data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, pq.INVALID_CONTENT)
        self.request.callbacks = None
        return None

    # Merge the contents of the docket into CL. Attempt several lookups of
    # decreasing specificity. Note that pacer_case_id is required for Docket
    # uploads.
    d = None
    for kwargs in [{'pacer_case_id': pq.pacer_case_id,
                    'docket_number': docket_data['docket_number']},
                   {'pacer_case_id': pq.pacer_case_id},
                   {'docket_number': docket_data['docket_number'],
                    'pacer_case_id': None}]:
        try:
            d = Docket.objects.get(court_id=pq.court_id, **kwargs)
            break
        except Docket.DoesNotExist:
            continue
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to look up '%s'" % pq
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            self.request.callbacks = None
            return None

    if d is None:
        # Couldn't find it. Make a new one.
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pq.pacer_case_id,
            court_id=pq.court_id
        )

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER

    update_docket_metadata(d, docket_data)

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        self.request.callbacks = None
        return {'docket_pk': d.pk, 'needs_solr_update': False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries & documents
    rds_created = []
    needs_solr_update = False
    for docket_entry in docket_data['docket_entries']:
        try:
            de, de_created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue
        if de_created:
            needs_solr_update = True

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        params = {
            'docket_entry': de,
            # No attachments when uploading dockets.
            'document_type': RECAPDocument.PACER_DOCUMENT,
            'document_number': docket_entry['document_number'],
        }
        try:
            rd = RECAPDocument.objects.get(**params)
        except RECAPDocument.DoesNotExist:
            rd = RECAPDocument.objects.create(
                pacer_doc_id=docket_entry['pacer_doc_id'],
                is_available=False,
                **params
            )
            rds_created.append(rd)
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue
        else:
            rd.pacer_doc_id = rd.pacer_doc_id or pq.pacer_doc_id

    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        'docket_pk': d.pk,
        'needs_solr_update': bool(rds_created or needs_solr_update),
    }
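Returning a dict rather than a bare pk is what lets this task sit in a Celery chain: the next task receives the previous return value as its first argument and reads named keys instead of guessing positions. A minimal sketch of that hand-off with plain functions (add_to_solr is hypothetical):

def add_to_solr(result):
    """Hypothetical downstream task; gets the upstream return value."""
    if result and result['needs_solr_update']:
        print('Reindexing docket %s' % result['docket_pk'])

# With Celery this would be chain(process_recap_docket.s(pk), add_to_solr.s());
# the plain-Python equivalent of the hand-off is simply:
add_to_solr({'docket_pk': 22, 'needs_solr_update': True})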
Example #8
0
def download_and_save():
    """This function is run in many threads simultaneously. Each thread
    runs so long as there are items in the queue. Once an item is found, it's
    downloaded and saved.

    The number of items that can be concurrently saved is determined by the
    number of threads that are running this function.
    """
    while True:
        item = queue.get()
        logger.info("%s: Attempting to add item at: %s" %
                    (threading.current_thread().name, item['url']))
        try:
            msg, r = get_binary_content(
                item['url'],
                {},
            )
        except Exception:
            logger.info("%s: Unable to get item at: %s" %
                        (threading.current_thread().name, item['url']))
            queue.task_done()
            continue

        if msg:
            logger.warning(msg)
            queue.task_done()
            continue

        sha1_hash = hashlib.sha1(r.content).hexdigest()
        if Audio.objects.filter(sha1=sha1_hash).exists():
            # Simpsons did it! Try the next one.
            logger.info("%s: Item already exists, moving to next item." %
                        threading.current_thread().name)
            queue.task_done()
            continue
        else:
            # New item, onwards!
            logger.info('%s: Adding new document found at: %s' %
                        (threading.current_thread().name, item['url']))
            audio_file = Audio(
                source='H',
                sha1=sha1_hash,
                case_name=item['case_name'],
                download_url=item['url'],
                processing_complete=False,
            )
            if item['judges']:
                audio_file.judges = item['judges']

            court = Court.objects.get(pk=item['court_code'])

            docket = Docket(
                case_name=item['case_name'],
                court=court,
                date_argued=item['date_argued'],
            )
            # The audio file isn't linked to a docket yet, so the docket
            # number belongs on the docket object itself.
            if item['docket_number']:
                docket.docket_number = item['docket_number']
            # Make and associate the file object
            try:
                cf = ContentFile(r.content)
                extension = get_extension(r.content)
                if extension not in ['.mp3', '.wma']:
                    extension = '.' + item['url'].rsplit('.', 1)[1]
                # See bitbucket issue #215 for why this must be
                # lower-cased.
                file_name = trunc(item['case_name'].lower(), 75) + extension
                audio_file.local_path_original_file.save(file_name, cf,
                                                         save=False)
            except Exception:
                msg = 'Unable to save binary. Deleted document: %s.\n%s' % \
                      (item['case_name'], traceback.format_exc())
                logger.critical(msg)
                queue.task_done()
                continue

            docket.save()
            audio_file.docket = docket
            audio_file.save(index=False)

            random_delay = random.randint(0, 3600)
            process_audio_file.apply_async(
                (audio_file.pk,),
                countdown=random_delay
            )

            logger.info("%s: Successfully added audio file %s: %s" %
                        (threading.current_thread().name,
                         audio_file.pk,
                         audio_file.case_name))
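The fixes above pair every queue.task_done() in a failure branch with a continue, so a failed item can never fall through into the save path. Stripped of the Django and scraping details, the worker pattern looks like this (process and save are hypothetical stand-ins):

import queue
import threading

q = queue.Queue()

def process(item):  # hypothetical download step
    return item

def save(result):  # hypothetical save step
    pass

def worker():
    while True:
        item = q.get()
        try:
            result = process(item)
        except Exception:
            q.task_done()
            continue  # never fall through to the save path
        save(result)
        q.task_done()

for i in range(5):
    q.put(i)
for _ in range(2):
    threading.Thread(target=worker, daemon=True).start()
q.join()  # returns once every put() has a matching task_done()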
Example #9
0
    def migrate_opinions_oral_args_and_dockets(self):
        """Migrate the core objects across, diffing as you go.

        Items changed after self.start will be processed.

        :return: None
        """
        self.stdout.write("Migrating dockets, audio files, and opinions...")
        # Find dockets modified after date or with sub-items modified after
        # date.
        q = Q(date_modified__gte=self.start)
        q |= Q(documents__date_modified__gte=self.start)
        q |= Q(audio_files__date_modified__gte=self.start)
        old_dockets = DocketOld.objects.using('old').filter(q)

        for old_docket in old_dockets:
            try:
                old_audio = old_docket.audio_files.all()[0]
            except IndexError:
                old_audio = None
            try:
                old_document = old_docket.documents.all()[0]
            except IndexError:
                old_document = None
            if old_document is None and old_audio is None:
                continue

            if old_document is not None:
                old_citation = old_document.citation
                old_docket.case_name, old_docket.case_name_full, old_docket.case_name_short = self._get_case_names(
                    old_citation.case_name)
            else:
                # Fall back on the docket if needed. Assumes the docket and
                # document case_names are always the same.
                old_citation = None
                old_docket.case_name, old_docket.case_name_full, old_docket.case_name_short = self._get_case_names(
                    old_docket.case_name)
            if old_audio is not None:
                old_audio.case_name, old_audio.case_name_full, old_audio.case_name_short = self._get_case_names(
                    old_audio.case_name)

            # Courts are in place thanks to initial data. Get the court.
            court = CourtNew.objects.get(pk=old_docket.court_id)

            # Do Dockets
            try:
                existing_docket = (DocketNew.objects.using('default').get(
                    pk=old_docket.pk))
            except DocketNew.DoesNotExist:
                existing_docket = None
            if existing_docket is not None:
                # Intersection. No need for complicated merge as all differences
                # have been resolved by hand.
                new_docket = existing_docket
            else:
                # New docket in old system. Create it in the new system.
                new_docket = DocketNew(
                    pk=old_docket.pk,
                    date_modified=old_docket.date_modified,
                    date_created=old_docket.date_modified,
                    court=court,
                    case_name=old_docket.case_name,
                    case_name_full=old_docket.case_name_full,
                    case_name_short=old_docket.case_name_short,
                    slug=self._none_to_blank(old_docket.slug),
                    docket_number=self._none_to_blank(
                        old_citation.docket_number if old_citation else None),
                    date_blocked=old_docket.date_blocked,
                    blocked=old_docket.blocked,
                )
                if old_audio is not None:
                    new_docket.date_argued = old_audio.date_argued
                new_docket.save(using='default')

            # Do Documents/Clusters
            if old_document is not None:
                try:
                    existing_oc = (
                        OpinionClusterNew.objects.using('default').get(
                            pk=old_document.pk))
                except OpinionClusterNew.DoesNotExist:
                    existing_oc = None
                try:
                    existing_o = (OpinionNew.objects.using('default').get(
                        pk=old_document.pk))
                except OpinionNew.DoesNotExist:
                    existing_o = None
                if existing_oc is not None or existing_o is not None:
                    # Run the conflict algo.
                    if self.find_conflicts(old_document, old_citation,
                                           old_docket, existing_oc,
                                           existing_o):
                        self.stdout.write("Found conflict. Resolve that.")
                    else:
                        # No conflicts. Update the existing item.
                        self.add_oc_and_o(old_document, old_citation,
                                          old_docket, new_docket)
                else:
                    # New item. Just add it.
                    self.add_oc_and_o(old_document, old_citation, old_docket,
                                      new_docket)

            # Finally we do Audio. No checks needed because we haven't changed
            # anything on the new server.
            if old_audio is not None:
                new_audio_file = AudioNew(
                    pk=old_audio.pk,
                    docket=new_docket,
                    source=old_audio.source,
                    case_name=old_audio.case_name,
                    case_name_short=old_audio.case_name_short,
                    case_name_full=old_audio.case_name_full,
                    judges=self._none_to_blank(old_audio.judges),
                    date_created=old_audio.time_retrieved,
                    date_modified=old_audio.date_modified,
                    sha1=old_audio.sha1,
                    download_url=old_audio.download_url,
                    local_path_mp3=old_audio.local_path_mp3,
                    local_path_original_file=old_audio.local_path_original_file,
                    duration=old_audio.duration,
                    processing_complete=old_audio.processing_complete,
                    date_blocked=old_audio.date_blocked,
                    blocked=old_audio.blocked,
                )
                new_audio_file.save(
                    using='default',
                    index=False,
                )
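The try/except IndexError blocks at the top of the loop implement "first item or None". In plain Python that is a one-liner, and Django querysets expose .first() for the same purpose; a tiny sketch:

def first_or_none(seq):
    """Return the first element of seq, or None if it's empty."""
    return next(iter(seq), None)

assert first_or_none([1, 2, 3]) == 1
assert first_or_none([]) is None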
Example #10
0
def download_and_save():
    """This function is run in many threads simultaneously. Each thread
    runs so long as there are items in the queue. Once an item is found, it's
    downloaded and saved.

    The number of items that can be concurrently saved is determined by the
    number of threads that are running this function.
    """
    while True:
        item = queue.get()
        logger.info("%s: Attempting to add item at: %s" %
                    (threading.current_thread().name, item['url']))
        try:
            msg, r = get_binary_content(
                item['url'],
                {},
            )
        except Exception:
            logger.info("%s: Unable to get item at: %s" %
                        (threading.current_thread().name, item['url']))
            queue.task_done()
            continue

        if msg:
            logger.warning(msg)
            queue.task_done()
            continue

        sha1_hash = hashlib.sha1(r.content).hexdigest()
        if Audio.objects.filter(sha1=sha1_hash).exists():
            # Simpsons did it! Try the next one.
            logger.info("%s: Item already exists, moving to next item." %
                        threading.current_thread().name)
            queue.task_done()
            continue
        else:
            # New item, onwards!
            logger.info('%s: Adding new document found at: %s' %
                        (threading.current_thread().name, item['url']))
            audio_file = Audio(
                source='H',
                sha1=sha1_hash,
                case_name=item['case_name'],
                download_url=item['url'],
                processing_complete=False,
            )
            if item['judges']:
                audio_file.judges = item['judges']

            court = Court.objects.get(pk=item['court_code'])

            docket = Docket(
                case_name=item['case_name'],
                court=court,
                date_argued=item['date_argued'],
            )
            # The audio file isn't linked to a docket yet, so the docket
            # number belongs on the docket object itself.
            if item['docket_number']:
                docket.docket_number = item['docket_number']
            # Make and associate the file object
            try:
                cf = ContentFile(r.content)
                extension = get_extension(r.content)
                if extension not in ['.mp3', '.wma']:
                    extension = '.' + item['url'].rsplit('.', 1)[1]
                # See bitbucket issue #215 for why this must be
                # lower-cased.
                file_name = trunc(item['case_name'].lower(), 75) + extension
                audio_file.local_path_original_file.save(file_name,
                                                         cf,
                                                         save=False)
            except Exception:
                msg = 'Unable to save binary. Deleted document: %s.\n%s' % \
                      (item['case_name'], traceback.format_exc())
                logger.critical(msg)
                queue.task_done()
                continue

            docket.save()
            audio_file.docket = docket
            audio_file.save(index=False)

            random_delay = random.randint(0, 3600)
            process_audio_file.apply_async((audio_file.pk, ),
                                           countdown=random_delay)

            logger.info("%s: Successfully added audio file %s: %s" %
                        (threading.current_thread().name, audio_file.pk,
                         audio_file.case_name))
Example #11
0
def process_recap_docket(pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work on.
    :return: The docket that's created or updated.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if docket_data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, pq.INVALID_CONTENT)
        return None

    # Merge the contents of the docket into CL. Attempt several lookups of
    # decreasing specificity.
    d = None
    for kwargs in [{'pacer_case_id': pq.pacer_case_id,
                    'docket_number': docket_data['docket_number']},
                   {'pacer_case_id': pq.pacer_case_id},
                   {'docket_number': docket_data['docket_number']}]:
        try:
            d = Docket.objects.get(court_id=pq.court_id, **kwargs)
            break
        except Docket.DoesNotExist:
            continue
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to look up '%s'" % pq
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            return None

    if d is None:
        # Couldn't find it. Make a new one.
        d = Docket(source=Docket.RECAP,
                   pacer_case_id=pq.pacer_case_id,
                   court_id=pq.court_id)

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER

    update_docket_metadata(d, docket_data)

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        return d

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                })
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'" %
                (docket_entry['document_number'], pq))
            continue

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            try:
                RECAPDocument.objects.create(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                    pacer_doc_id=docket_entry['pacer_doc_id'],
                    is_available=False,
                )
            except IntegrityError:
                logger.warning(
                    "Creating new document with pacer_doc_id of '%s' violates "
                    "unique constraint on pacer_doc_id field." %
                    docket_entry['pacer_doc_id'])
                continue
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'" %
                (docket_entry['document_number'], pq))
            continue
        else:
            rd.pacer_doc_id = rd.pacer_doc_id or pq.pacer_doc_id

    add_parties_and_attorneys(d, docket_data['parties'])
    mark_pq_successful(pq, d_id=d.pk)
    return d
Example #12
0
def make_and_save(item, skipdupes=False, min_dates=None, testing=True):
    """Associates case data from `parse_opinions` with objects. Saves these
    objects.

    :param skipdupes: if True, skip duplicates instead of raising an exception
    :param min_dates: if not None, a dict of court_id -> date; cases dated on
    or after that date are skipped
    :param testing: if True, parse everything but don't save
    """
    date_filed = date_argued = date_reargued = None
    date_reargument_denied = date_cert_granted = date_cert_denied = None
    unknown_date = None
    for date_cluster in item['dates']:
        for date_info in date_cluster:
            # check for any dates that clearly aren't dates
            if date_info[1].year < 1600 or date_info[1].year > 2020:
                continue
            # check for untagged dates that will be assigned to date_filed
            if date_info[0] is None:
                date_filed = date_info[1]
                continue
            # try to figure out what type of date it is based on its tag string
            if date_info[0] in FILED_TAGS:
                date_filed = date_info[1]
            elif date_info[0] in DECIDED_TAGS:
                if not date_filed:
                    date_filed = date_info[1]
            elif date_info[0] in ARGUED_TAGS:
                date_argued = date_info[1]
            elif date_info[0] in REARGUE_TAGS:
                date_reargued = date_info[1]
            elif date_info[0] in REARGUE_DENIED_TAGS:
                date_reargument_denied = date_info[1]
            elif date_info[0] in CERT_GRANTED_TAGS:
                date_cert_granted = date_info[1]
            elif date_info[0] in CERT_DENIED_TAGS:
                date_cert_denied = date_info[1]
            else:
                unknown_date = date_info[1]
                if date_info[0] not in UNKNOWN_TAGS:
                    print("\nFound unknown date tag '%s' with date '%s'.\n" %
                          date_info)

    # the main date (used for date_filed in OpinionCluster) and panel dates
    # (used for finding judges) are ordered in terms of which type of dates
    # best reflect them
    main_date = (date_filed or date_argued or date_reargued or
                 date_reargument_denied or unknown_date)
    panel_date = (date_argued or date_reargued or date_reargument_denied or
                  date_filed or unknown_date)

    if main_date is None:
        raise Exception("Failed to get a date for " + item['file'])

    if min_dates is not None:
        if min_dates.get(item['court_id']) is not None:
            if main_date >= min_dates[item['court_id']]:
                print(main_date, 'after', min_dates[item['court_id']],
                      ' -- skipping.')
                return

    docket = Docket(
        source=Docket.COLUMBIA,
        date_argued=date_argued,
        date_reargued=date_reargued,
        date_cert_granted=date_cert_granted,
        date_cert_denied=date_cert_denied,
        date_reargument_denied=date_reargument_denied,
        court_id=item['court_id'],
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        docket_number=item['docket'] or ''
    )

    # get citations in the form of, e.g. {'federal_cite_one': '1 U.S. 1', ...}
    found_citations = []
    for c in item['citations']:
        found = get_citations(c)
        if not found:
            # if the docket number *is* the citation string, we're likely
            # dealing with a somewhat common triplet of (docket number, date,
            # jurisdiction), which isn't a citation at all (so there's no
            # problem)
            if item['docket']:
                docket_no = item['docket'].lower()
                if 'claim no.' in docket_no:
                    docket_no = docket_no.split('claim no.')[0]
                for junk in DOCKET_JUNK:
                    docket_no = docket_no.replace(junk, '')
                docket_no = docket_no.strip('.').strip()
                if docket_no and docket_no in c.lower():
                    continue

            # if there are only a trivial number of letters (except for months
            # and a few trivial words) in the citation, then it's not a
            # citation at all
            non_trivial = c.lower()
            for trivial in TRIVIAL_CITE_WORDS:
                non_trivial = non_trivial.replace(trivial, '')
            num_letters = sum(non_trivial.count(letter)
                              for letter in string.lowercase)
            if num_letters < 3:
                continue

            # if there is a string that's known to indicate a bad citation, then
            # it's not a citation
            if any(bad in c for bad in BAD_CITES):
                continue
            # otherwise, this is a problem
            raise Exception("Failed to get a citation from the string '%s' in "
                            "court '%s' with docket '%s'." % (
                                c, item['court_id'], item['docket']
                            ))
        else:
            found_citations.extend(found)
    citations_map = map_citations_to_models(found_citations)

    cluster = OpinionCluster(
        judges=item.get('judges', '') or "",
        precedential_status=('Unpublished' if item['unpublished'] else 'Published'),
        date_filed=main_date,
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        source='Z',
        attorneys=item['attorneys'] or '',
        posture=item['posture'] or '',
        **citations_map
    )
    panel = [find_person(n, item['court_id'], case_date=panel_date) for n in
             item['panel']]
    panel = [x for x in panel if x is not None]

    opinions = []
    for i, opinion_info in enumerate(item['opinions']):
        if opinion_info['author'] is None:
            author = None
        else:
            author = find_person(opinion_info['author'], item['court_id'],
                                 case_date=panel_date)
        converted_text = convert_columbia_html(opinion_info['opinion'])
        opinion_type = OPINION_TYPE_MAPPING[opinion_info['type']]
        if opinion_type == '020lead' and i > 0:
            opinion_type = '050addendum'

        opinion = Opinion(
            author=author,
            per_curiam=opinion_info['per_curiam'],
            type=opinion_type,
            html_columbia=converted_text,
            sha1=opinion_info['sha1'],
            local_path=opinion_info['local_path'],
        )
        joined_by = [find_person(n, item['court_id'], case_date=panel_date)
                     for n in opinion_info['joining']]
        joined_by = [x for x in joined_by if x is not None]
        opinions.append((opinion, joined_by))

    if min_dates is None:
        # check to see if this is a duplicate
        dups = find_dups(docket, cluster, panel, opinions)
        if dups:
            if skipdupes:
                print('Duplicate. skipping.')
            else:
                raise Exception("Found %s duplicate(s)." % len(dups))

    # save all the objects
    if not testing:
        try:
            docket.save()
            cluster.docket = docket
            cluster.save(index=False)
            for member in panel:
                cluster.panel.add(member)
            for opinion, joined_by in opinions:
                opinion.cluster = cluster
                opinion.save(index=False)
                for joiner in joined_by:
                    opinion.joined_by.add(joiner)
            if settings.DEBUG:
                domain = "http://127.0.0.1:8000"
            else:
                domain = "https://www.courtlistener.com"
            print("Created item at: %s%s" % (domain, cluster.get_absolute_url()))
        except Exception:
            # if anything goes wrong, try to delete everything
            try:
                docket.delete()
            except Exception:
                pass
            raise
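The "trivial number of letters" heuristic in make_and_save is self-contained enough to test in isolation. A standalone sketch, with illustrative stand-ins for TRIVIAL_CITE_WORDS (the real list lives elsewhere in the module):

import string

TRIVIAL_CITE_WORDS = ['january', 'no.', 'vol.']  # illustrative stand-ins

def has_nontrivial_letters(c):
    """True when at least three letters survive after stripping trivial
    words -- the same cutoff make_and_save uses to decide a failed
    citation lookup is a real problem."""
    non_trivial = c.lower()
    for trivial in TRIVIAL_CITE_WORDS:
        non_trivial = non_trivial.replace(trivial, '')
    return sum(non_trivial.count(l) for l in string.ascii_lowercase) >= 3

assert not has_nontrivial_letters('1 u. s. 1')        # safely skipped
assert has_nontrivial_letters('some stray sentence')  # raises upstream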
Example #13
0
class StaticFilesTest(TestCase):
    good_mp3_path = 'mp3/2014/06/09/ander_v._leo.mp3'
    good_txt_path = 'txt/2015/12/28/opinion_text.txt'
    good_pdf_path = 'pdf/2013/06/12/' + \
                    'in_re_motion_for_consent_to_disclosure_of_court_records.pdf'

    def setUp(self):
        self.court = Court.objects.get(pk='test')
        self.docket = Docket(case_name=u'Docket', court=self.court, source=Docket.DEFAULT)
        self.docket.save()

        self.audio = Audio(
            local_path_original_file=self.good_mp3_path,
            local_path_mp3=self.good_mp3_path,
            docket=self.docket,
            blocked=False,
            case_name_full='Ander v. Leo',
            date_created=datetime.date(2014, 6, 9)
        )
        self.audio.save(index=False)

        self.opinioncluster = OpinionCluster(
            case_name=u'Hotline Bling',
            docket=self.docket,
            date_filed=datetime.date(2015, 12, 14),
        )
        self.opinioncluster.save(index=False)

        self.txtopinion = Opinion(
            cluster=self.opinioncluster,
            type='Lead Opinion',
            local_path=self.good_txt_path
        )
        self.txtopinion.save(index=False)

        self.pdfopinion = Opinion(
            cluster=self.opinioncluster,
            type='Lead Opinion',
            local_path=self.good_pdf_path
        )
        self.pdfopinion.save(index=False)

    def test_serve_static_file_serves_mp3(self):
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_mp3_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'audio/mpeg')
        self.assertIn('inline;', response['Content-Disposition'])

    def test_serve_static_file_serves_txt(self):
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_txt_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'text/plain')
        self.assertIn('inline;', response['Content-Disposition'])
        self.assertIn(
            'FOR THE DISTRICT OF COLUMBIA CIRCUIT',
            response.content
        )

    def test_serve_static_file_serves_pdf(self):
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_pdf_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'application/pdf')
        self.assertIn('inline;', response['Content-Disposition'])
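The three tests pin one Content-Type per extension. Presumably serve_static_file derives the type from the file path; the stdlib mimetypes module does exactly that, as a quick check shows:

import mimetypes

for path in ('ander_v._leo.mp3', 'opinion_text.txt', 'records.pdf'):
    print(path, mimetypes.guess_type(path)[0])
# ander_v._leo.mp3 audio/mpeg
# opinion_text.txt text/plain
# records.pdf application/pdf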
Example #14
0
def process_docket_data(
    d: Docket,
    report_type: int,
    filepath: Optional[str] = None,
) -> Optional[int]:
    """Process docket data file.

    :param d: A docket object to work on.
    :param report_type: Whether it's a docket or a docket history report.
    :param filepath: A local path where the item can be found. If not provided,
    the filepath_local field of the docket object will be attempted.
    :return: The docket PK if the report parsed and merged, else None.
    """
    from cl.recap.mergers import (
        add_bankruptcy_data_to_docket,
        add_claims_to_docket,
        add_docket_entries,
        add_parties_and_attorneys,
        update_docket_appellate_metadata,
        update_docket_metadata,
    )

    court_id = map_cl_to_pacer_id(d.court_id)
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(court_id)
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(court_id)
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(court_id)
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        report = InternetArchive(court_id)
    elif report_type == UPLOAD_TYPE.CASE_REPORT_PAGE:
        report = CaseQuery(court_id)
    elif report_type == UPLOAD_TYPE.CLAIMS_REGISTER:
        report = ClaimsRegister(court_id)
    else:
        raise NotImplementedError(
            "The report type with id '%s' is not yet "
            "supported. Perhaps you need to add it?" % report_type
        )

    if filepath:
        with open(filepath, "r") as f:
            text = f.read()
    else:
        # This is an S3 path, so get it remotely.
        text = d.filepath_local.read().decode()

    report._parse_text(text)
    data = report.data
    if data == {}:
        return None

    if report_type == UPLOAD_TYPE.CLAIMS_REGISTER:
        add_bankruptcy_data_to_docket(d, data)
        add_claims_to_docket(d, data["claims"])
    else:
        update_docket_metadata(d, data)
        d, og_info = update_docket_appellate_metadata(d, data)
        if og_info is not None:
            og_info.save()
            d.originating_court_information = og_info
        d.save()
        if data.get("docket_entries"):
            add_docket_entries(d, data["docket_entries"])
    if report_type in (
        UPLOAD_TYPE.DOCKET,
        UPLOAD_TYPE.APPELLATE_DOCKET,
        UPLOAD_TYPE.IA_XML_FILE,
    ):
        add_parties_and_attorneys(d, data["parties"])
    return d.pk
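The report_type ladder in process_docket_data is a natural dict dispatch, since every report class takes the same single court_id argument. A sketch reusing the names already imported above:

REPORT_CLASSES = {
    UPLOAD_TYPE.DOCKET: DocketReport,
    UPLOAD_TYPE.DOCKET_HISTORY_REPORT: DocketHistoryReport,
    UPLOAD_TYPE.APPELLATE_DOCKET: AppellateDocketReport,
    UPLOAD_TYPE.IA_XML_FILE: InternetArchive,
    UPLOAD_TYPE.CASE_REPORT_PAGE: CaseQuery,
    UPLOAD_TYPE.CLAIMS_REGISTER: ClaimsRegister,
}

try:
    report_class = REPORT_CLASSES[report_type]
except KeyError:
    raise NotImplementedError(
        "The report type with id '%s' is not yet supported." % report_type
    )
report = report_class(court_id)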
Example #15
0
def process_recap_docket(pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item to work on.
    :return: The docket that's created or updated, or None on failure.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item: %s" % pq)

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    # Merge the contents of the docket into CL
    try:
        d = Docket.objects.get(
            Q(pacer_case_id=pq.pacer_case_id) |
            Q(docket_number=docket_data['docket_number']),
            court_id=pq.court_id,
        )
        # Add RECAP as a source if it's not already.
        if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
            d.source = Docket.RECAP_AND_SCRAPER
        elif d.source == Docket.COLUMBIA:
            d.source = Docket.COLUMBIA_AND_RECAP
        elif d.source == Docket.COLUMBIA_AND_SCRAPER:
            d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
    except Docket.DoesNotExist:
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pq.pacer_case_id,
            court_id=pq.court_id
        )
    except Docket.MultipleObjectsReturned:
        msg = "Too many dockets found when trying to look up '%s'" % pq
        logger.error(msg)
        pq.error_message = msg
        pq.status = pq.PROCESSING_FAILED
        pq.save()
        return None

    update_docket_metadata(d, docket_data)
    d.save()

    # Docket entries
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            RECAPDocument.objects.create(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
                pacer_doc_id=docket_entry['pacer_doc_id'],
                is_available=False,
            )
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue
        else:
            rd.pacer_doc_id = rd.pacer_doc_id or docket_entry['pacer_doc_id']

    # Parties
    for party in docket_data['parties']:
        try:
            p = Party.objects.get(name=party['name'])
        except Party.DoesNotExist:
            p = Party.objects.create(
                name=party['name'],
                extra_info=party['extra_info'],
            )
        except Party.MultipleObjectsReturned:
            continue
        else:
            if party['extra_info']:
                p.extra_info = party['extra_info']
                p.save()

        # If the party type doesn't exist, make a new one.
        if not p.party_types.filter(docket=d, name=party['type']).exists():
            PartyType.objects.create(docket=d, party=p, name=party['type'])

        # Attorneys
        for atty in party.get('attorneys', []):
            add_attorney(atty, p, d)

    pq.error_message = ''  # Clear out errors b/c successful
    pq.status = pq.PROCESSING_SUCCESSFUL
    pq.save()

    return d
Example #16
0
class StaticFilesTest(TestCase):
    good_mp3_path = 'mp3/2014/06/09/ander_v._leo.mp3'
    good_txt_path = 'txt/2015/12/28/opinion_text.txt'
    good_pdf_path = 'pdf/2013/06/12/' + \
                    'in_re_motion_for_consent_to_disclosure_of_court_records.pdf'

    def setUp(self):
        self.court = Court.objects.get(pk='test')
        self.docket = Docket(case_name=u'Docket', court=self.court, source=Docket.DEFAULT)
        self.docket.save()

        self.audio = Audio(
            local_path_original_file=self.good_mp3_path,
            local_path_mp3=self.good_mp3_path,
            docket=self.docket,
            blocked=False,
            case_name_full='Ander v. Leo',
            date_created=datetime.date(2014, 6, 9)
        )
        self.audio.save(index=False)

        self.opinioncluster = OpinionCluster(
            case_name=u'Hotline Bling',
            docket=self.docket,
            date_filed=datetime.date(2015, 12, 14),
        )
        self.opinioncluster.save(index=False)

        self.txtopinion = Opinion(
            cluster=self.opinioncluster,
            type='Lead Opinion',
            local_path=self.good_txt_path
        )
        self.txtopinion.save(index=False)

        self.pdfopinion = Opinion(
            cluster=self.opinioncluster,
            type='Lead Opinion',
            local_path=self.good_pdf_path
        )
        self.pdfopinion.save(index=False)

    def test_serve_static_file_serves_mp3(self):
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_mp3_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'audio/mpeg')
        self.assertIn('attachment;', response['Content-Disposition'])

    def test_serve_static_file_serves_txt(self):
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_txt_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'text/plain')
        self.assertIn('attachment;', response['Content-Disposition'])
        self.assertIn(
            'FOR THE DISTRICT OF COLUMBIA CIRCUIT',
            response.content
        )

    def test_serve_static_file_serves_pdf(self):
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_pdf_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'application/pdf')
        self.assertIn('attachment;', response['Content-Disposition'])
Example #17
0
    def migrate_opinions_oral_args_and_dockets(self):
        """Migrate the core objects across, diffing as you go.

        Items changed after self.start will be processed.

        :return: None
        """
        self.stdout.write("Migrating dockets, audio files, and opinions...")
        # Find dockets modified after date or with sub-items modified after
        # date.
        q = Q(date_modified__gte=self.start)
        q |= Q(documents__date_modified__gte=self.start)
        q |= Q(audio_files__date_modified__gte=self.start)
        old_dockets = DocketOld.objects.using('old').filter(q)

        for old_docket in old_dockets:
            try:
                old_audio = old_docket.audio_files.all()[0]
            except IndexError:
                old_audio = None
            try:
                old_document = old_docket.documents.all()[0]
            except IndexError:
                old_document = None
            if old_document is None and old_audio is None:
                continue

            if old_document is not None:
                old_citation = old_document.citation
                old_docket.case_name, old_docket.case_name_full, old_docket.case_name_short = self._get_case_names(
                    old_citation.case_name)
            else:
                # Fall back on the docket if needed. Assumes the docket and
                # document case_names are always the same.
                old_citation = None
                old_docket.case_name, old_docket.case_name_full, old_docket.case_name_short = self._get_case_names(
                    old_docket.case_name)
            if old_audio is not None:
                old_audio.case_name, old_audio.case_name_full, old_audio.case_name_short = self._get_case_names(
                    old_audio.case_name)

            # Courts are in place thanks to initial data. Get the court.
            court = CourtNew.objects.get(pk=old_docket.court_id)

            # Do Dockets
            try:
                existing_docket = (DocketNew.objects
                                   .using('default')
                                   .get(pk=old_docket.pk))
            except DocketNew.DoesNotExist:
                existing_docket = None
            if existing_docket is not None:
                # Intersection. No need for complicated merge as all differences
                # have been resolved by hand.
                new_docket = existing_docket
            else:
                # New docket in old system. Create it in the new system.
                new_docket = DocketNew(
                    pk=old_docket.pk,
                    date_modified=old_docket.date_modified,
                    date_created=old_docket.date_modified,
                    court=court,
                    case_name=old_docket.case_name,
                    case_name_full=old_docket.case_name_full,
                    case_name_short=old_docket.case_name_short,
                    slug=self._none_to_blank(old_docket.slug),
                    docket_number=self._none_to_blank(
                        old_citation.docket_number if old_citation else None),
                    date_blocked=old_docket.date_blocked,
                    blocked=old_docket.blocked,
                )
                if old_audio is not None:
                    new_docket.date_argued = old_audio.date_argued
                new_docket.save(using='default')

            # Do Documents/Clusters
            if old_document is not None:
                try:
                    existing_oc = (OpinionClusterNew.objects
                                   .using('default')
                                   .get(pk=old_document.pk))
                except OpinionClusterNew.DoesNotExist:
                    existing_oc = None
                try:
                    existing_o = (OpinionNew.objects
                                  .using('default')
                                  .get(pk=old_document.pk))
                except OpinionNew.DoesNotExist:
                    existing_o = None
                if existing_oc is not None or existing_o is not None:
                    # Run the conflict algo.
                    if self.find_conflicts(old_document, old_citation,
                                           old_docket, existing_oc,
                                           existing_o):
                        self.stdout.write("Found conflict. Resolve that.")
                    else:
                        # No conflicts. Update the existing item.
                        self.add_oc_and_o(old_document, old_citation,
                                          old_docket, new_docket)
                else:
                    # New item. Just add it.
                    self.add_oc_and_o(old_document, old_citation, old_docket,
                                      new_docket)

            # Finally we do Audio. No checks needed because we haven't changed
            # anything on the new server.
            if old_audio is not None:
                new_audio_file = AudioNew(
                    pk=old_audio.pk,
                    docket=new_docket,
                    source=old_audio.source,
                    case_name=old_audio.case_name,
                    case_name_short=old_audio.case_name_short,
                    case_name_full=old_audio.case_name_full,
                    judges=self._none_to_blank(old_audio.judges),
                    date_created=old_audio.time_retrieved,
                    date_modified=old_audio.date_modified,
                    sha1=old_audio.sha1,
                    download_url=old_audio.download_url,
                    local_path_mp3=old_audio.local_path_mp3,
                    local_path_original_file=old_audio.local_path_original_file,
                    duration=old_audio.duration,
                    processing_complete=old_audio.processing_complete,
                    date_blocked=old_audio.date_blocked,
                    blocked=old_audio.blocked,
                )
                new_audio_file.save(
                    using='default',
                    index=False,
                )
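One way to drive a migration method like this is from the management command's handle(); the sketch below is illustrative only, and both the command name cl_migrate_data and the start_date option are invented assumptions, not the project's actual CLI.

# Hypothetical driver for the migration above. The command name and the
# start-date option are illustrative assumptions, not the real CLI.
from django.core.management import call_command

call_command(
    'cl_migrate_data',        # assumed command name
    start_date='2016-01-01',  # sets self.start; items modified after this
)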
Example #18
class StaticFilesTest(TestCase):
    good_mp3_path = "mp3/2014/06/09/ander_v._leo.mp3"
    good_txt_path = "txt/2015/12/28/opinion_text.txt"
    good_pdf_path = (
        "pdf/2013/06/12/" +
        "in_re_motion_for_consent_to_disclosure_of_court_records.pdf")

    def setUp(self):
        self.court = Court.objects.get(pk="test")
        self.docket = Docket(case_name=u"Docket",
                             court=self.court,
                             source=Docket.DEFAULT)
        self.docket.save()

        self.audio = Audio(
            local_path_original_file=self.good_mp3_path,
            local_path_mp3=self.good_mp3_path,
            docket=self.docket,
            blocked=False,
            case_name_full="Ander v. Leo",
            date_created=datetime.date(2014, 6, 9),
        )
        self.audio.save(index=False)

        self.opinioncluster = OpinionCluster(
            case_name=u"Hotline Bling",
            docket=self.docket,
            date_filed=datetime.date(2015, 12, 14),
        )
        self.opinioncluster.save(index=False)

        self.txtopinion = Opinion(
            cluster=self.opinioncluster,
            type="Lead Opinion",
            local_path=self.good_txt_path,
        )
        self.txtopinion.save(index=False)

        self.pdfopinion = Opinion(
            cluster=self.opinioncluster,
            type="Lead Opinion",
            local_path=self.good_pdf_path,
        )
        self.pdfopinion.save(index=False)

    def test_serve_static_file_serves_mp3(self):
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_mp3_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response["Content-Type"], "audio/mpeg")
        self.assertIn("inline;", response["Content-Disposition"])

    def test_serve_static_file_serves_txt(self):
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_txt_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response["Content-Type"], "text/plain")
        self.assertIn("inline;", response["Content-Disposition"])
        self.assertIn("FOR THE DISTRICT OF COLUMBIA CIRCUIT", response.content)

    def test_serve_static_file_serves_pdf(self):
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_pdf_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response["Content-Type"], "application/pdf")
        self.assertIn("inline;", response["Content-Disposition"])
def make_and_save(item):
    """Associates case data from `parse_opinions` with objects. Saves these objects."""
    date_filed = date_argued = date_reargued = date_reargument_denied = date_cert_granted = date_cert_denied = None
    for date_cluster in item['dates']:
        for date_info in date_cluster:
            # check for any dates that clearly aren't dates
            if date_info[1].year < 1600 or date_info[1].year > 2020:
                continue
            # check for untagged dates that will be assigned to date_filed
            if date_info[0] is None:
                date_filed = date_info[1]
                continue
            # try to figure out what type of date it is based on its tag string
            if date_info[0] in FILED_TAGS:
                date_filed = date_info[1]
            elif date_info[0] in DECIDED_TAGS:
                if not date_filed:
                    date_filed = date_info[1]
            elif date_info[0] in ARGUED_TAGS:
                date_argued = date_info[1]
            elif date_info[0] in REARGUE_TAGS:
                date_reargued = date_info[1]
            elif date_info[0] in REARGUE_DENIED_TAGS:
                date_reargument_denied = date_info[1]
            elif date_info[0] in CERT_GRANTED_TAGS:
                date_cert_granted = date_info[1]
            elif date_info[0] in CERT_DENIED_TAGS:
                date_cert_denied = date_info[1]
            else:
                print("Found unknown date tag '%s' with date '%s'." % date_info)

    docket = Docket(
        date_argued=date_argued,
        date_reargued=date_reargued,
        date_cert_granted=date_cert_granted,
        date_cert_denied=date_cert_denied,
        date_reargument_denied=date_reargument_denied,
        court_id=item['court_id'],
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        docket_number=item['docket'] or '',
    )
    docket.save()

    # get citations in the form of, e.g. {'federal_cite_one': '1 U.S. 1', ...}
    found_citations = []
    for c in item['citations']:
        found = get_citations(c)
        if not found:
            raise Exception("Failed to get a citation from the string '%s'." % c)
        elif len(found) > 1:
            raise Exception("Got multiple citations from string '%s' when there should have been one." % c)
        found_citations.append(found[0])
    citations_map = map_citations_to_models(found_citations)

    cluster = OpinionCluster(
        docket=docket,
        precedential_status=('Unpublished' if item['unpublished']
                             else 'Published'),
        date_filed=date_filed,
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        source='Z',
        attorneys=item['attorneys'] or '',
        posture=item['posture'] or '',
        **citations_map
    )
    cluster.save()

    if date_argued is not None:
        paneldate = date_argued
    else:
        paneldate = date_filed
    panel = [find_person(n, item['court_id'], paneldate) for n in item['panel']]
    panel = [x for x in panel if x is not None]
    for member in panel:
        cluster.panel.add(member)

    for opinion_info in item['opinions']:
        if opinion_info['author'] is None:
            author = None
        else:
            author = find_person(opinion_info['author'], item['court_id'], date_filed or date_argued)
        opinion = Opinion(
            cluster=cluster,
            author=author,
            type=OPINION_TYPE_MAPPING[opinion_info['type']],
            html_columbia=opinion_info['opinion'],
        )
        opinion.save()
        joined_by = [find_person(n, item['court_id'], paneldate) for n in opinion_info['joining']]
        joined_by = [x for x in joined_by if x is not None]
        for joiner in joined_by:
            opinion.joined_by.add(joiner)
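A minimal usage sketch for make_and_save follows; the item dict shows only the keys the function reads, and every value is invented for illustration (in particular, the 'Filed' tag and 'opinion' type key assume matching entries in FILED_TAGS and OPINION_TYPE_MAPPING).

from datetime import date

# Illustrative input only; keys mirror what make_and_save reads above.
item = {
    'dates': [[('Filed', date(2015, 12, 14))]],  # (tag, date) pairs
    'citations': ['1 U.S. 1'],
    'court_id': 'scotus',
    'case_name_short': 'Foo',
    'case_name': 'Foo v. Bar',
    'case_name_full': 'Foo v. Bar, Warden',
    'docket': '14-123',
    'unpublished': False,
    'attorneys': '',
    'posture': '',
    'panel': [],
    'opinions': [{'author': None, 'joining': [],
                  'type': 'opinion', 'opinion': '<p>...</p>'}],
}
make_and_save(item)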
def parse_harvard_opinions(reporter, volume, make_searchable):
    """
    Parse downloaded CaseLaw Corpus from internet archive and add them to our
    database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA.  (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    :param volume: The volume (int) of the reporters (optional) (ex 10)
    :param reporter: Reporter string as slugify'd (optional) (tc) for T.C.
    :param make_searchable: Boolean to indicate saving to solr
    :return: None
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        cites = get_citations(data["citations"][0]["cite"])
        if not cites:
            logger.info(
                "No citation found for %s." % data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name, file_path):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML
        # Flag them for a later crawl by using the placeholder '[[Image]]'
        judge_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        judges = ", ".join(
            sorted(
                list(
                    set(
                        itertools.chain.from_iterable(judge_list + author_list)
                    )
                )
            )
        )
        judges = titlecase(judges)
        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        short_fields = ["attorneys", "disposition", "otherdate", "seealso"]

        long_fields = [
            "syllabus",
            "summary",
            "history",
            "headnotes",
            "correction",
        ]

        short_data = parse_extra_fields(soup, short_fields, False)
        long_data = parse_extra_fields(soup, long_fields, True)

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            try:
                with transaction.atomic():
                    docket.save()
            except OperationalError as e:
                if "exceeds maximum" in str(e):
                    docket.docket_number = (
                        "%s, See Corrections for full Docket Number"
                        % trunc(docket_string, length=5000, ellipsis="...")
                    )
                    docket.save()
                    long_data["correction"] = "%s <br> %s" % (
                        data["docket_number"],
                        long_data["correction"],
                    )
            # Handle partial dates by adding -01 to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=short_data["attorneys"],
                disposition=short_data["disposition"],
                syllabus=long_data["syllabus"],
                summary=long_data["summary"],
                history=long_data["history"],
                other_dates=short_data["otherdate"],
                cross_reference=short_data["seealso"],
                headnotes=long_data["headnotes"],
                correction=long_data["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )
            cluster.save(index=False)

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.canonical_reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )
            new_op_pks = []
            for op in soup.find_all("opinion"):
                # Clean the author tag before processing. This is
                # particularly useful for identifying Per Curiam opinions.
                author_elem = op.find("author")
                if author_elem is not None:
                    for page_number in author_elem.find_all("page-number"):
                        page_number.extract()

                auth = op.find("author")
                if auth is not None:
                    author_tag_str = titlecase(auth.text.strip(":"))
                    author_str = titlecase(
                        "".join(extract_judge_last_name(author_tag_str))
                    )
                else:
                    author_str = ""
                    author_tag_str = ""

                per_curiam = author_tag_str == "Per Curiam"
                # If Per Curiam is True set author string to Per Curiam
                if per_curiam:
                    author_str = "Per Curiam"

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                op = Opinion(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    per_curiam=per_curiam,
                    extracted_by_ocr=True,
                )
                # Don't index now; do so later if desired
                op.save(index=False)
                new_op_pks.append(op.pk)

        if make_searchable:
            add_items_to_solr.delay(new_op_pks, "search.Opinion")

        logger.info("Finished: %s", citation.base_citation())
def make_and_save(item,
                  skipdupes=False,
                  min_dates=None,
                  start_dates=None,
                  testing=True):
    """Associates case data from `parse_opinions` with objects. Saves these
    objects.

    min_date: if not none, will skip cases after min_date
    """
    date_filed = date_argued = date_reargued = date_reargument_denied = \
        date_cert_granted = date_cert_denied = None
    unknown_date = None
    for date_cluster in item["dates"]:
        for date_info in date_cluster:
            # check for any dates that clearly aren't dates
            if date_info[1].year < 1600 or date_info[1].year > 2020:
                continue
            # check for untagged dates that will be assigned to date_filed
            if date_info[0] is None:
                date_filed = date_info[1]
                continue
            # try to figure out what type of date it is based on its tag string
            if date_info[0] in FILED_TAGS:
                date_filed = date_info[1]
            elif date_info[0] in DECIDED_TAGS:
                if not date_filed:
                    date_filed = date_info[1]
            elif date_info[0] in ARGUED_TAGS:
                date_argued = date_info[1]
            elif date_info[0] in REARGUE_TAGS:
                date_reargued = date_info[1]
            elif date_info[0] in REARGUE_DENIED_TAGS:
                date_reargument_denied = date_info[1]
            elif date_info[0] in CERT_GRANTED_TAGS:
                date_cert_granted = date_info[1]
            elif date_info[0] in CERT_DENIED_TAGS:
                date_cert_denied = date_info[1]
            else:
                unknown_date = date_info[1]
                if date_info[0] not in UNKNOWN_TAGS:
                    print("\nFound unknown date tag '%s' with date '%s'.\n" %
                          date_info)

    # the main date (used for date_filed in OpinionCluster) and panel dates
    # (used for finding judges) are ordered in terms of which type of dates
    # best reflect them
    main_date = (date_filed or date_argued or date_reargued
                 or date_reargument_denied or unknown_date)
    panel_date = (date_argued or date_reargued or date_reargument_denied
                  or date_filed or unknown_date)

    if main_date is None:
        raise Exception(f"Failed to get a date for {item['file']}")

    # special rule for Kentucky
    if item["court_id"] == "kycourtapp" and main_date <= date(1975, 12, 31):
        item["court_id"] = "kycourtapphigh"

    if min_dates is not None:
        if min_dates.get(item["court_id"]) is not None:
            if main_date >= min_dates[item["court_id"]]:
                print(
                    main_date,
                    "after",
                    min_dates[item["court_id"]],
                    " -- skipping.",
                )
                return
    if start_dates is not None:
        if start_dates.get(item["court_id"]) is not None:
            if main_date <= start_dates[item["court_id"]]:
                print(
                    main_date,
                    "before court founding:",
                    start_dates[item["court_id"]],
                    " -- skipping.",
                )
                return

    docket = Docket(
        source=Docket.COLUMBIA,
        date_argued=date_argued,
        date_reargued=date_reargued,
        date_cert_granted=date_cert_granted,
        date_cert_denied=date_cert_denied,
        date_reargument_denied=date_reargument_denied,
        court_id=item["court_id"],
        case_name_short=item["case_name_short"] or "",
        case_name=item["case_name"] or "",
        case_name_full=item["case_name_full"] or "",
        docket_number=item["docket"] or "",
    )

    # get citation objects in a list for addition to the cluster
    found_citations = []
    for c in item["citations"]:
        found = get_citations(clean_text(c, ["html", "inline_whitespace"]))
        if not found:
            # if the docket number --is-- the citation string, we're likely
            # dealing with a somewhat common triplet of (docket number, date,
            # jurisdiction), which isn't a citation at all (so there's no
            # problem)
            if item["docket"]:
                docket_no = item["docket"].lower()
                if "claim no." in docket_no:
                    docket_no = docket_no.split("claim no.")[0]
                for junk in DOCKET_JUNK:
                    docket_no = docket_no.replace(junk, "")
                docket_no = docket_no.strip(".").strip()
                if docket_no and docket_no in c.lower():
                    continue

            # if there are only a trivial number of letters (excepting months
            # and a few trivial words) in the citation, then it's not a
            # citation at all
            non_trivial = c.lower()
            for trivial in TRIVIAL_CITE_WORDS:
                non_trivial = non_trivial.replace(trivial, "")
            num_letters = sum(
                non_trivial.count(letter) for letter in string.ascii_lowercase)
            if num_letters < 3:
                continue

            # if there is a string that's known to indicate
            # a bad citation, then it's not a citation
            if any(bad in c for bad in BAD_CITES):
                continue
            # otherwise, this is a problem
            raise Exception("Failed to get a citation from the string '%s' in "
                            "court '%s' with docket '%s'." %
                            (c, item["court_id"], item["docket"]))
        else:
            found_citations.extend(found.to_model())

    cluster = OpinionCluster(
        judges=item.get("judges", "") or "",
        precedential_status=("Unpublished"
                             if item["unpublished"] else "Published"),
        date_filed=main_date,
        case_name_short=item["case_name_short"] or "",
        case_name=item["case_name"] or "",
        case_name_full=item["case_name_full"] or "",
        source="Z",
        attorneys=item["attorneys"] or "",
        posture=item["posture"] or "",
    )
    panel = lookup_judges_by_last_name_list(item["panel"], item["court_id"],
                                            panel_date)

    opinions = []
    for i, opinion_info in enumerate(item["opinions"]):
        if opinion_info["author"] is None:
            author = None
        else:
            author = lookup_judge_by_last_name(opinion_info["author"],
                                               item["court_id"], panel_date)

        converted_text = convert_columbia_html(opinion_info["opinion"])
        opinion_type = OPINION_TYPE_MAPPING[opinion_info["type"]]
        if opinion_type == Opinion.LEAD and i > 0:
            opinion_type = Opinion.ADDENDUM

        opinion = Opinion(
            author=author,
            per_curiam=opinion_info["per_curiam"],
            type=opinion_type,
            # type=OPINION_TYPE_MAPPING[opinion_info['type']],
            html_columbia=converted_text,
            sha1=opinion_info["sha1"],
            # This is surely not updated for the new S3 world. If you're
            # reading this, you'll need to update this code.
            local_path=opinion_info["local_path"],
        )
        joined_by = lookup_judges_by_last_name_list(item["joining"],
                                                    item["court_id"],
                                                    panel_date)
        opinions.append((opinion, joined_by))

    if min_dates is None:
        # check to see if this is a duplicate
        dups = find_dups(docket, cluster)
        if dups:
            if skipdupes:
                print("Duplicate. skipping.")
            else:
                raise Exception(f"Found {len(dups)} duplicate(s).")

    # save all the objects
    if not testing:
        try:
            docket.save()
            cluster.docket = docket
            cluster.save(index=False)
            for citation in found_citations:
                citation.cluster = cluster
                citation.save()
            for member in panel:
                cluster.panel.add(member)
            for opinion, joined_by in opinions:
                opinion.cluster = cluster
                opinion.save(index=False)
                for joiner in joined_by:
                    opinion.joined_by.add(joiner)
            if settings.DEBUG:
                domain = "http://127.0.0.1:8000"
            else:
                domain = "https://www.courtlistener.com"
            print(f"Created item at: {domain}{cluster.get_absolute_url()}")
        except Exception:
            # if anything goes wrong, try to delete everything
            try:
                docket.delete()
            except Exception:
                pass
            raise
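A hedged usage sketch for this variant; the shape of min_dates (a court_id-to-date mapping) is inferred from the lookups above, ``item`` is as in the earlier sketch, and testing=True means nothing is saved.

from datetime import date

# Dry run: skip known duplicates and anything on or after each court's
# cutoff date, saving nothing.
make_and_save(
    item,
    skipdupes=True,
    min_dates={'kycourtapp': date(1976, 1, 1)},
    testing=True,
)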
Example #22
def process_recap_docket(pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work on.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item: %s" % pq)

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    # Merge the contents of the docket into CL
    try:
        d = Docket.objects.get(
            Q(pacer_case_id=pq.pacer_case_id) |
            Q(docket_number=docket_data['docket_number']),
            court_id=pq.court_id,
        )
        # Add RECAP as a source if it's not already.
        if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
            d.source = Docket.RECAP_AND_SCRAPER
        elif d.source == Docket.COLUMBIA:
            d.source = Docket.COLUMBIA_AND_RECAP
        elif d.source == Docket.COLUMBIA_AND_SCRAPER:
            d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
    except Docket.DoesNotExist:
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pq.pacer_case_id,
            court_id=pq.court_id
        )
    except Docket.MultipleObjectsReturned:
        msg = "Too many dockets found when trying to look up '%s'" % pq
        logger.error(msg)
        pq.error_message = msg
        pq.status = pq.PROCESSING_FAILED
        pq.save()
        return None

    update_docket_metadata(d, docket_data)
    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            RECAPDocument.objects.create(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
                pacer_doc_id=docket_entry['pacer_doc_id'],
                is_available=False,
            )
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue
        else:
            rd.pacer_doc_id = rd.pacer_doc_id or pq.pacer_doc_id
            rd.save()  # Persist the doc ID if it was blank.

    add_parties_and_attorneys(d, docket_data['parties'])

    # Ditch the original file (it's associated with the Docket now)
    pq.filepath_local.delete(save=False)
    pq.error_message = ''  # Clear out errors b/c successful
    pq.status = pq.PROCESSING_SUCCESSFUL
    pq.docket = d
    pq.save()

    return d
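Since the function takes only a ProcessingQueue pk, a caller sketch might look like this; the court id, case id, and file contents are placeholders invented for illustration.

from django.core.files.base import ContentFile

# Illustrative only: register an uploaded docket page, then process it.
raw_docket_html = open('/tmp/docket.html', 'rb').read()  # placeholder path
pq = ProcessingQueue.objects.create(court_id='dcd',  # assumed CL court id
                                    pacer_case_id='12345')
pq.filepath_local.save('docket.html', ContentFile(raw_docket_html))
d = process_recap_docket(pq.pk)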
Example #23
def get_docket_by_pacer_case_id(self, pacer_case_id, court_id, session,
                                tag=None, **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of kwargs
    that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    :param pacer_case_id: The internal case ID of the item in PACER.
    :param court_id: A courtlistener court ID.
    :param session: A valid PacerSession object.
    :param tag: The tag name that should be stored with the item in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    """
    report = DocketReport(map_cl_to_pacer_id(court_id), session)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))
    try:
        d = Docket.objects.get(
            pacer_case_id=pacer_case_id,
            court_id=court_id,
        )
    except Docket.DoesNotExist:
        d = None
    except Docket.MultipleObjectsReturned:
        d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)
        if first_missing_id > 1:
            # We don't have to get the whole thing!
            kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" % (court_id,
                                                             pacer_case_id))

    # Merge the contents into CL.
    try:
        if d is None:
            d = Docket.objects.get(
                Q(pacer_case_id=pacer_case_id) |
                Q(docket_number=docket_data['docket_number']),
                court_id=court_id,
            )
        # Add RECAP as a source if it's not already.
        if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
            d.source = Docket.RECAP_AND_SCRAPER
        elif d.source == Docket.COLUMBIA:
            d.source = Docket.COLUMBIA_AND_RECAP
        elif d.source == Docket.COLUMBIA_AND_SCRAPER:
            d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
    except Docket.DoesNotExist:
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pacer_case_id,
            court_id=court_id
        )
    except Docket.MultipleObjectsReturned:
        logger.error("Too many dockets returned when trying to look up '%s.%s'" %
                     (court_id, pacer_case_id))
        return None

    update_docket_metadata(d, docket_data)
    d.save()
    if tag is not None:
        tag, _ = Tag.objects.get_or_create(name=tag)
        d.tags.add(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s.%s'" % (docket_entry['document_number'],
                                              court_id, pacer_case_id)
            )
            continue
        else:
            if tag is not None:
                de.tags.add(tag)

        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            try:
                rd = RECAPDocument.objects.create(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                    pacer_doc_id=docket_entry['pacer_doc_id'],
                    is_available=False,
                )
            except IntegrityError:
                # Race condition. The item was created after our get failed.
                rd = RECAPDocument.objects.get(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                )
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry "
                "number: '%s', docket: %s" % (docket_entry['document_number'], d)
            )
            continue

        rd.pacer_doc_id = rd.pacer_doc_id or docket_entry['pacer_doc_id']
        rd.save()  # Persist the doc ID if it was blank.
        if tag is not None:
            rd.tags.add(tag)

    add_parties_and_attorneys(d, docket_data['parties'])
    logger.info("Created/updated docket: %s" % d)

    return d
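A hedged call sketch follows; the self parameter above suggests this is a bound Celery task, so in production it would likely be queued rather than called directly. PacerSession is juriscraper's session class, and the credentials, ids, tag, and the show_parties_and_counsel kwarg (forwarded to DocketReport.query()) are illustrative.

from juriscraper.pacer import PacerSession

session = PacerSession(username='user@example.com', password='secret')
session.login()

# Queue the task (it appears to be bound, given the ``self`` parameter).
get_docket_by_pacer_case_id.si(
    pacer_case_id='56120',
    court_id='akd',
    session=session,
    tag='import-2017',              # optional tag to store with the docket
    show_parties_and_counsel=True,  # forwarded to DocketReport.query()
).apply_async()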
Example #24
def make_and_save(item,
                  skipdupes=False,
                  min_dates=None,
                  start_dates=None,
                  testing=True):
    """Associates case data from `parse_opinions` with objects. Saves these
    objects.

    min_date: if not none, will skip cases after min_date
    """
    date_filed = date_argued = date_reargued = date_reargument_denied = date_cert_granted = date_cert_denied = None
    unknown_date = None
    for date_cluster in item['dates']:
        for date_info in date_cluster:
            # check for any dates that clearly aren't dates
            if date_info[1].year < 1600 or date_info[1].year > 2020:
                continue
            # check for untagged dates that will be assigned to date_filed
            if date_info[0] is None:
                date_filed = date_info[1]
                continue
            # try to figure out what type of date it is based on its tag string
            if date_info[0] in FILED_TAGS:
                date_filed = date_info[1]
            elif date_info[0] in DECIDED_TAGS:
                if not date_filed:
                    date_filed = date_info[1]
            elif date_info[0] in ARGUED_TAGS:
                date_argued = date_info[1]
            elif date_info[0] in REARGUE_TAGS:
                date_reargued = date_info[1]
            elif date_info[0] in REARGUE_DENIED_TAGS:
                date_reargument_denied = date_info[1]
            elif date_info[0] in CERT_GRANTED_TAGS:
                date_cert_granted = date_info[1]
            elif date_info[0] in CERT_DENIED_TAGS:
                date_cert_denied = date_info[1]
            else:
                unknown_date = date_info[1]
                if date_info[0] not in UNKNOWN_TAGS:
                    print("\nFound unknown date tag '%s' with date '%s'.\n" %
                          date_info)

    # the main date (used for date_filed in OpinionCluster) and panel dates
    # (used for finding judges) are ordered in terms of which type of dates
    # best reflect them
    main_date = (date_filed or date_argued or date_reargued
                 or date_reargument_denied or unknown_date)
    panel_date = (date_argued or date_reargued or date_reargument_denied
                  or date_filed or unknown_date)

    if main_date is None:
        raise Exception("Failed to get a date for " + item['file'])

    # special rule for Kentucky
    if item['court_id'] == 'kycourtapp' and main_date <= date(1975, 12, 31):
        item['court_id'] = 'kycourtapphigh'

    if min_dates is not None:
        if min_dates.get(item['court_id']) is not None:
            if main_date >= min_dates[item['court_id']]:
                print(main_date, 'after', min_dates[item['court_id']],
                      ' -- skipping.')
                return
    if start_dates is not None:
        if start_dates.get(item['court_id']) is not None:
            if main_date <= start_dates[item['court_id']]:
                print(main_date, 'before court founding:',
                      start_dates[item['court_id']], ' -- skipping.')
                return

    docket = Docket(source=Docket.COLUMBIA,
                    date_argued=date_argued,
                    date_reargued=date_reargued,
                    date_cert_granted=date_cert_granted,
                    date_cert_denied=date_cert_denied,
                    date_reargument_denied=date_reargument_denied,
                    court_id=item['court_id'],
                    case_name_short=item['case_name_short'] or '',
                    case_name=item['case_name'] or '',
                    case_name_full=item['case_name_full'] or '',
                    docket_number=item['docket'] or '')

    # get citations in the form of, e.g. {'federal_cite_one': '1 U.S. 1', ...}
    found_citations = []
    for c in item['citations']:
        found = get_citations(c)
        if not found:
            # if the docket number --is-- the citation string, we're likely
            # dealing with a somewhat common triplet of (docket number, date,
            # jurisdiction), which isn't a citation at all (so there's no
            # problem)
            if item['docket']:
                docket_no = item['docket'].lower()
                if 'claim no.' in docket_no:
                    docket_no = docket_no.split('claim no.')[0]
                for junk in DOCKET_JUNK:
                    docket_no = docket_no.replace(junk, '')
                docket_no = docket_no.strip('.').strip()
                if docket_no and docket_no in c.lower():
                    continue

            # if there are only a trivial number of letters (excepting months
            # and a few trivial words) in the citation, then it's not a
            # citation at all
            non_trivial = c.lower()
            for trivial in TRIVIAL_CITE_WORDS:
                non_trivial = non_trivial.replace(trivial, '')
            num_letters = sum(
                non_trivial.count(letter) for letter in string.ascii_lowercase)
            if num_letters < 3:
                continue

            # if there is a string that's known to indicate a bad citation, then
            # it's not a citation
            if any(bad in c for bad in BAD_CITES):
                continue
            # otherwise, this is a problem
            raise Exception("Failed to get a citation from the string '%s' in "
                            "court '%s' with docket '%s'." %
                            (c, item['court_id'], item['docket']))
        else:
            found_citations.extend(found)
    citations_map = map_citations_to_models(found_citations)

    cluster = OpinionCluster(
        judges=item.get('judges', '') or "",
        precedential_status=('Unpublished'
                             if item['unpublished'] else 'Published'),
        date_filed=main_date,
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        source='Z',
        attorneys=item['attorneys'] or '',
        posture=item['posture'] or '',
        **citations_map)
    panel = [
        find_person(n, item['court_id'], case_date=panel_date)
        for n in item['panel']
    ]
    panel = [x for x in panel if x is not None]

    opinions = []
    for i, opinion_info in enumerate(item['opinions']):
        if opinion_info['author'] is None:
            author = None
        else:
            author = find_person(opinion_info['author'],
                                 item['court_id'],
                                 case_date=panel_date)
        converted_text = convert_columbia_html(opinion_info['opinion'])
        opinion_type = OPINION_TYPE_MAPPING[opinion_info['type']]
        if opinion_type == '020lead' and i > 0:
            opinion_type = '050addendum'

        opinion = Opinion(
            author=author,
            per_curiam=opinion_info['per_curiam'],
            type=opinion_type,
            # type=OPINION_TYPE_MAPPING[opinion_info['type']],
            html_columbia=converted_text,
            sha1=opinion_info['sha1'],
            local_path=opinion_info['local_path'],
        )
        joined_by = [
            find_person(n, item['court_id'], case_date=panel_date)
            for n in opinion_info['joining']
        ]
        joined_by = [x for x in joined_by if x is not None]
        opinions.append((opinion, joined_by))

    if min_dates is None:
        # check to see if this is a duplicate
        dups = find_dups(docket, cluster)
        if dups:
            if skipdupes:
                print('Duplicate. Skipping.')
            else:
                raise Exception("Found %s duplicate(s)." % len(dups))

    # save all the objects
    if not testing:
        try:
            docket.save()
            cluster.docket = docket
            cluster.save(index=False)
            for member in panel:
                cluster.panel.add(member)
            for opinion, joined_by in opinions:
                opinion.cluster = cluster
                opinion.save(index=False)
                for joiner in joined_by:
                    opinion.joined_by.add(joiner)
            if settings.DEBUG:
                domain = "http://127.0.0.1:8000"
            else:
                domain = "https://www.courtlistener.com"
            print("Created item at: %s%s" %
                  (domain, cluster.get_absolute_url()))
        except Exception:
            # if anything goes wrong, try to delete everything
            try:
                docket.delete()
            except Exception:
                pass
            raise
Example #25
    def migrate_opinions_oral_args_and_dockets(self):
        self.stdout.write("Migrating dockets, audio files, and opinions to new "
                          "database...")
        q = DocketOld.objects.using('old').all()
        old_dockets = queryset_generator(q)
        num_dockets = q.count()

        progress = 0
        self._print_progress(progress, num_dockets)
        for old_docket in old_dockets:
            # First do the docket, then create the cluster and opinion objects.
            try:
                old_audio = old_docket.audio_files.all()[0]
            except IndexError:
                old_audio = None
            try:
                old_document = old_docket.documents.all()[0]
            except IndexError:
                old_document = None
            if old_document is None and old_audio is None:
                # Nothing to migrate for this docket.
                progress += 1
                self._print_progress(progress, num_dockets)
                continue

            if old_document is not None:
                old_citation = old_document.citation
                (old_doc_case_name, old_doc_case_name_full,
                 old_doc_case_name_short) = self._get_case_names(
                     old_citation.case_name)
            else:
                # Fall back on the docket's case name for audio-only dockets.
                old_citation = None
                (old_doc_case_name, old_doc_case_name_full,
                 old_doc_case_name_short) = self._get_case_names(
                     old_docket.case_name)
            if old_audio is not None:
                (old_audio_case_name, old_audio_case_name_full,
                 old_audio_case_name_short) = self._get_case_names(
                     old_audio.case_name)

            court = CourtNew.objects.get(pk=old_docket.court_id)  # Courts are in place thanks to initial data.

            new_docket = DocketNew(
                pk=old_docket.pk,
                date_modified=old_docket.date_modified,
                date_created=old_docket.date_modified,
                court=court,
                case_name=old_doc_case_name,
                case_name_full=old_doc_case_name_full,
                case_name_short=old_doc_case_name_short,
                slug=self._none_to_blank(old_docket.slug),
                docket_number=self._none_to_blank(
                    old_citation.docket_number if old_citation else None),
                date_blocked=old_docket.date_blocked,
                blocked=old_docket.blocked,
            )
            if old_audio is not None:
                new_docket.date_argued = old_audio.date_argued
            new_docket.save(using='default')

            if old_document is not None:
                new_opinion_cluster = OpinionClusterNew(
                    pk=old_document.pk,
                    docket=new_docket,
                    judges=self._none_to_blank(old_document.judges),
                    date_modified=old_document.date_modified,
                    date_created=old_document.date_modified,
                    date_filed=old_document.date_filed,
                    slug=self._none_to_blank(old_citation.slug),
                    citation_id=old_document.citation_id,
                    case_name_short=old_doc_case_name_short,
                    case_name=old_doc_case_name,
                    case_name_full=old_doc_case_name_full,
                    federal_cite_one=self._none_to_blank(
                        old_citation.federal_cite_one),
                    federal_cite_two=self._none_to_blank(
                        old_citation.federal_cite_two),
                    federal_cite_three=self._none_to_blank(
                        old_citation.federal_cite_three),
                    state_cite_one=self._none_to_blank(
                        old_citation.state_cite_one),
                    state_cite_two=self._none_to_blank(
                        old_citation.state_cite_two),
                    state_cite_three=self._none_to_blank(
                        old_citation.state_cite_three),
                    state_cite_regional=self._none_to_blank(
                        old_citation.state_cite_regional),
                    specialty_cite_one=self._none_to_blank(
                        old_citation.specialty_cite_one),
                    scotus_early_cite=self._none_to_blank(
                        old_citation.scotus_early_cite),
                    lexis_cite=self._none_to_blank(old_citation.lexis_cite),
                    westlaw_cite=self._none_to_blank(old_citation.westlaw_cite),
                    neutral_cite=self._none_to_blank(old_citation.neutral_cite),
                    scdb_id=self._none_to_blank(
                        old_document.supreme_court_db_id),
                    source=old_document.source,
                    nature_of_suit=old_document.nature_of_suit,
                    citation_count=old_document.citation_count,
                    precedential_status=old_document.precedential_status,
                    date_blocked=old_document.date_blocked,
                    blocked=old_document.blocked,
                )
                new_opinion_cluster.save(
                    using='default',
                    index=False,
                )

                new_opinion = OpinionNew(
                    pk=old_document.pk,
                    cluster=new_opinion_cluster,
                    date_modified=old_document.date_modified,
                    date_created=old_document.time_retrieved,
                    type='010combined',
                    sha1=old_document.sha1,
                    download_url=old_document.download_url,
                    local_path=old_document.local_path,
                    plain_text=old_document.plain_text,
                    html=self._none_to_blank(old_document.html),
                    html_lawbox=self._none_to_blank(old_document.html_lawbox),
                    html_with_citations=old_document.html_with_citations,
                    extracted_by_ocr=old_document.extracted_by_ocr,
                )
                new_opinion.save(
                    using='default',
                    index=False,
                )

            if old_audio is not None:
                new_audio_file = AudioNew(
                    pk=old_audio.pk,
                    docket=new_docket,
                    source=old_audio.source,
                    case_name=old_audio_case_name,
                    case_name_short=old_audio_case_name_short,
                    case_name_full=old_audio_case_name_full,
                    judges=self._none_to_blank(old_audio.judges),
                    date_created=old_audio.time_retrieved,
                    date_modified=old_audio.date_modified,
                    sha1=old_audio.sha1,
                    download_url=old_audio.download_url,
                    local_path_mp3=old_audio.local_path_mp3,
                    local_path_original_file=old_audio.local_path_original_file,
                    duration=old_audio.duration,
                    processing_complete=old_audio.processing_complete,
                    date_blocked=old_audio.date_blocked,
                    blocked=old_audio.blocked,
                )
                new_audio_file.save(
                    using='default',
                    index=False,
                )

            progress += 1
            self._print_progress(progress, num_dockets)
        self.stdout.write(u'')  # Newline
Example #26
def process_recap_docket(pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item to work on.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item: %s" % pq)

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    report.parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    # Merge the contents of the docket into CL
    try:
        d = Docket.objects.get(
            Q(pacer_case_id=pq.pacer_case_id)
            | Q(docket_number=docket_data['docket_number']),
            court_id=pq.court_id,
        )
        # Add RECAP as a source if it's not already.
        if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
            d.source = Docket.RECAP_AND_SCRAPER
        elif d.source == Docket.COLUMBIA:
            d.source = Docket.COLUMBIA_AND_RECAP
        elif d.source == Docket.COLUMBIA_AND_SCRAPER:
            d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
    except Docket.DoesNotExist:
        d = Docket(source=Docket.RECAP,
                   pacer_case_id=pq.pacer_case_id,
                   court_id=pq.court_id)
    except Docket.MultipleObjectsReturned:
        msg = "Too many dockets found when trying to look up '%s'" % pq
        logger.error(msg)
        pq.error_message = msg
        pq.status = pq.PROCESSING_FAILED
        pq.save()
        return None

    update_docket_metadata(d, docket_data)
    d.save()

    # Docket entries
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                })
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'" %
                (docket_entry['document_number'], pq))
            continue

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            RECAPDocument.objects.create(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
                pacer_doc_id=docket_entry['pacer_doc_id'],
                is_available=False,
            )
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'" %
                (docket_entry['document_number'], pq))
            continue
        else:
            rd.pacer_doc_id = rd.pacer_doc_id or docket_entry['pacer_doc_id']
            rd.save()  # Persist the doc ID if it was blank.

    # Parties
    for party in docket_data['parties']:
        try:
            p = Party.objects.get(name=party['name'])
        except Party.DoesNotExist:
            p = Party.objects.create(
                name=party['name'],
                extra_info=party['extra_info'],
            )
        except Party.MultipleObjectsReturned:
            continue
        else:
            if party['extra_info']:
                p.extra_info = party['extra_info']
                p.save()

        # If the party type doesn't exist, make a new one.
        if not p.party_types.filter(docket=d, name=party['type']).exists():
            PartyType.objects.create(docket=d, party=p, name=party['type'])

        # Attorneys
        for atty in party.get('attorneys', []):
            add_attorney(atty, p, d)

    pq.error_message = ''  # Clear out errors b/c successful
    pq.status = pq.PROCESSING_SUCCESSFUL
    pq.save()

    return d