コード例 #1
0
ファイル: pacer.py プロジェクト: saizai/courtlistener
def process_docket_data(d, filepath, report_type):
    """Process docket data file.

    :param d: A docket object to work on.
    :param filepath: The path to a saved HTML file containing docket or docket
    history report data.
    :param report_type: Whether it's a docket or a docket history report.
    """
    from cl.recap.tasks import update_docket_metadata, add_docket_entries, \
        add_parties_and_attorneys, update_docket_appellate_metadata
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        report = InternetArchive()
    with open(filepath, 'r') as f:
        text = f.read().decode('utf-8')
    report._parse_text(text)
    data = report.data
    if data == {}:
        return None
    update_docket_metadata(d, data)
    d, og_info = update_docket_appellate_metadata(d, data)
    if og_info is not None:
        og_info.save()
        d.originating_court_information = og_info
    d.save()
    add_docket_entries(d, data['docket_entries'])
    if report_type in (UPLOAD_TYPE.DOCKET, UPLOAD_TYPE.APPELLATE_DOCKET,
                       UPLOAD_TYPE.IA_XML_FILE):
        add_parties_and_attorneys(d, data['parties'])
    return d.pk
コード例 #2
0
def reprocess_docket_data(d, filepath, report_type):
    """Reprocess docket data that we already have.

    :param d: A docket object to work on.
    :param filepath: The path to a saved HTML file containing docket or docket
    history report data.
    :param report_type: Whether it's a docket or a docket history report.
    """
    from cl.recap.tasks import update_docket_metadata, add_docket_entries, \
        add_parties_and_attorneys
    if report_type == DOCKET:
        report = DocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(map_cl_to_pacer_id(d.court_id))
    with open(filepath, 'r') as f:
        text = f.read().decode('utf-8')
    report._parse_text(text)
    data = report.data
    if data == {}:
        return None
    update_docket_metadata(d, data)
    d.save()
    add_docket_entries(d, data['docket_entries'])
    if report_type == DOCKET:
        add_parties_and_attorneys(d, data['parties'])
    return d.pk
コード例 #3
0
 def test_dhr_merges_separate_docket_entries(self):
     """Does the docket history report merge separate minute entries if
     one entry has a short description, and the other has a long
     description?
     """
     # Create two unnumbered docket entries, one with a short description
     # and one with a long description. Then see what happens when you try
     # to add a DHR result (it should merge them).
     short_desc = 'Entry one short description'
     long_desc = 'Entry one long desc'
     date_filed = date(2014, 11, 16)
     d = Docket.objects.create(source=0, court_id='scotus')
     de1 = DocketEntry.objects.create(docket=d,
                                      entry_number=None,
                                      description=long_desc,
                                      date_filed=date_filed)
     RECAPDocument.objects.create(
         docket_entry=de1,
         document_number='',
         description='',
         document_type=RECAPDocument.PACER_DOCUMENT,
     )
     de2 = DocketEntry.objects.create(docket=d,
                                      entry_number=None,
                                      description='',
                                      date_filed=date_filed)
     RECAPDocument.objects.create(
         docket_entry=de2,
         document_number='',
         description=short_desc,
         document_type=RECAPDocument.PACER_DOCUMENT,
     )
     # Add a docket entry that spans the two above. Same date, same short
     # and long description. This should trigger a merge.
     add_docket_entries(d, [{
         "date_filed": date_filed,
         "description": long_desc,
         "document_number": None,
         "pacer_doc_id": None,
         "pacer_seq_no": None,
         "short_description": short_desc,
     }])
     expected_item_count = 1
     self.assertEqual(d.docket_entries.count(), expected_item_count)
コード例 #4
0
    def test_rss_feed_ingestion(self):
        """Can we ingest RSS feeds without creating duplicates?"""
        court_id = 'scotus'
        rss_feed = PacerRssFeed(court_id)
        rss_feed.is_bankruptcy = True  # Needed because we say SCOTUS above.
        with open(self.make_path('rss_sample_unnumbered_mdb.xml')) as f:
            text = f.read().decode('utf-8')
        rss_feed._parse_text(text)
        docket = rss_feed.data[0]
        d, docket_count = find_docket_object(court_id, docket['pacer_case_id'],
                                             docket['docket_number'])
        update_docket_metadata(d, docket)
        d.save()
        self.assertTrue(docket_count == 0)

        expected_count = 1
        add_docket_entries(d, docket['docket_entries'])
        self.assertEqual(d.docket_entries.count(), expected_count)
        add_docket_entries(d, docket['docket_entries'])
        self.assertEqual(d.docket_entries.count(), expected_count)
コード例 #5
0
ファイル: tasks.py プロジェクト: glennneiger/courtlistener
def merge_rss_feed_contents(rss_feed, court_pk, feed_status_pk):
    """Merge the rss feed contents into CourtListener

    :param rss_feed: A PacerRssFeed object that has already queried the feed.
    :param court_pk: The CourtListener court ID.
    :param feed_status_pk: The CL ID for the RSS status object.
    :returns all_rds_created: A list of all the RDs created during the
    processing.
    """
    start_time = now()
    feed_status = RssFeedStatus.objects.get(pk=feed_status_pk)
    rss_feed.parse()
    logger.info("%s: Got %s results to merge." % (feed_status.court_id,
                                                  len(rss_feed.data)))
    # RSS feeds are a list of normal Juriscraper docket objects.
    all_rds_created = []
    d_pks_to_alert = []
    for docket in rss_feed.data:
        item_hash = hash_item(docket)
        if is_cached(item_hash):
            continue

        with transaction.atomic():
            cached_ok = cache_hash(item_hash)
            if not cached_ok:
                # The item is already in the cache, ergo it's getting processed
                # in another thread/process and we had a race condition.
                continue
            d, docket_count = find_docket_object(
                court_pk, docket['pacer_case_id'], docket['docket_number'])
            if docket_count > 1:
                logger.info("Found %s dockets during lookup. Choosing "
                            "oldest." % docket_count)
                d = d.earliest('date_created')

            add_recap_source(d)
            update_docket_metadata(d, docket)
            if not d.pacer_case_id:
                d.pacer_case_id = docket['pacer_case_id']
            d.save()
            rds_created, content_updated = add_docket_entries(
                d, docket['docket_entries'])

        if content_updated and docket_count > 0:
            newly_enqueued = enqueue_docket_alert(d.pk)
            if newly_enqueued:
                d_pks_to_alert.append((d.pk, start_time))

        all_rds_created.extend([rd.pk for rd in rds_created])

    logger.info("%s: Sending %s new RECAP documents to Solr for indexing and "
                "sending %s dockets for alerts.", feed_status.court_id,
                len(all_rds_created), len(d_pks_to_alert))
    return {'d_pks_to_alert': d_pks_to_alert, 'rds_for_solr': all_rds_created}
コード例 #6
0
ファイル: tasks.py プロジェクト: freelawproject/courtlistener
def merge_rss_feed_contents(feed_data, court_pk, feed_status_pk):
    """Merge the rss feed contents into CourtListener

    :param feed_data: The data parameter of a PacerRssFeed object that has
    already queried the feed and been parsed.
    :param court_pk: The CourtListener court ID.
    :param feed_status_pk: The CL ID for the RSS status object.
    :returns all_rds_created: A list of all the RDs created during the
    processing.
    """
    start_time = now()
    feed_status = RssFeedStatus.objects.get(pk=feed_status_pk)

    # RSS feeds are a list of normal Juriscraper docket objects.
    all_rds_created = []
    d_pks_to_alert = []
    for docket in feed_data:
        item_hash = hash_item(docket)
        if is_cached(item_hash):
            continue

        with transaction.atomic():
            cached_ok = cache_hash(item_hash)
            if not cached_ok:
                # The item is already in the cache, ergo it's getting processed
                # in another thread/process and we had a race condition.
                continue
            d, docket_count = find_docket_object(
                court_pk, docket['pacer_case_id'], docket['docket_number'])
            if docket_count > 1:
                logger.info("Found %s dockets during lookup. Choosing "
                            "oldest." % docket_count)
                d = d.earliest('date_created')

            d.add_recap_source()
            update_docket_metadata(d, docket)
            if not d.pacer_case_id:
                d.pacer_case_id = docket['pacer_case_id']
            d.save()
            rds_created, content_updated = add_docket_entries(
                d, docket['docket_entries'])

        if content_updated and docket_count > 0:
            newly_enqueued = enqueue_docket_alert(d.pk)
            if newly_enqueued:
                d_pks_to_alert.append((d.pk, start_time))

        all_rds_created.extend([rd.pk for rd in rds_created])

    logger.info("%s: Sending %s new RECAP documents to Solr for indexing and "
                "sending %s dockets for alerts.", feed_status.court_id,
                len(all_rds_created), len(d_pks_to_alert))
    return {'d_pks_to_alert': d_pks_to_alert, 'rds_for_solr': all_rds_created}
コード例 #7
0
ファイル: pacer.py プロジェクト: freelawproject/courtlistener
def process_docket_data(d, filepath, report_type):
    """Process docket data file.

    :param d: A docket object to work on.
    :param filepath: The path to a saved HTML file containing docket or docket
    history report data.
    :param report_type: Whether it's a docket or a docket history report.
    """
    from cl.recap.tasks import update_docket_metadata, add_docket_entries, \
        add_parties_and_attorneys, update_docket_appellate_metadata
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        report = InternetArchive()
    elif report_type == UPLOAD_TYPE.CASE_REPORT_PAGE:
        report = CaseQuery(map_cl_to_pacer_id(d.court_id))
    with open(filepath, 'r') as f:
        text = f.read().decode('utf-8')
    report._parse_text(text)
    data = report.data
    if data == {}:
        return None
    update_docket_metadata(d, data)
    d, og_info = update_docket_appellate_metadata(d, data)
    if og_info is not None:
        og_info.save()
        d.originating_court_information = og_info
    d.save()
    if data.get('docket_entries'):
        add_docket_entries(d, data['docket_entries'])
    if report_type in (UPLOAD_TYPE.DOCKET, UPLOAD_TYPE.APPELLATE_DOCKET,
                       UPLOAD_TYPE.IA_XML_FILE):
        add_parties_and_attorneys(d, data['parties'])
    return d.pk
コード例 #8
0
def get_docket_by_pacer_case_id(self,
                                pacer_case_id,
                                court_id,
                                session,
                                tag=None,
                                **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of kwargs
    that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    :param pacer_case_id: The internal case ID of the item in PACER.
    :param court_id: A courtlistener court ID.
    :param session: A valid PacerSession object.
    :param tag: The tag name that should be stored with the item in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    """
    report = DocketReport(map_cl_to_pacer_id(court_id), session)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))
    try:
        d = Docket.objects.get(
            pacer_case_id=pacer_case_id,
            court_id=court_id,
        )
    except Docket.DoesNotExist:
        d = None
    except Docket.MultipleObjectsReturned:
        d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)
        if first_missing_id > 1:
            # We don't have to get the whole thing!
            kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" %
                (court_id, pacer_case_id))

    # Merge the contents into CL.
    if d is None:
        d, count = find_docket_object(court_id, pacer_case_id,
                                      docket_data['docket_number'])
        if count > 1:
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d.save()
    if tag is not None:
        tag, _ = Tag.objects.get_or_create(name=tag)
        d.tags.add(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d, upload_type=DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    rds_created, needs_solr_update = add_docket_entries(
        d, docket_data['docket_entries'], tag=tag)
    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s" % d)
    return {
        'docket_pk': d.pk,
        'needs_solr_update': bool(rds_created or needs_solr_update),
    }
コード例 #9
0
ファイル: tasks.py プロジェクト: saizai/courtlistener
def get_appellate_docket_by_docket_number(self,
                                          docket_number,
                                          court_id,
                                          cookies,
                                          tag_names=None,
                                          **kwargs):
    """Get a docket by docket number, CL court ID, and a collection of kwargs
    that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    :param docket_number: The docket number of the case.
    :param court_id: A courtlistener/PACER appellate court ID.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a
    logged-in PACER user.
    :param tag_names: The tag name that should be stored with the item in the
    DB, if desired.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    """
    s = PacerSession(cookies=cookies)
    report = AppellateDocketReport(court_id, s)
    logging_id = "%s - %s" % (court_id, docket_number)
    logger.info("Querying docket report %s", logging_id)

    try:
        report.query(docket_number, **kwargs)
    except requests.RequestException as e:
        logger.warning("Problem getting docket %s", logging_id)
        if self.request.retries == self.max_retries:
            self.request.callbacks = None
            return None
        raise self.retry(exc=e)

    docket_data = report.data
    logger.info('Querying and parsing complete for %s', logging_id)

    if docket_data == {}:
        logger.info("Unable to find docket: %s", logging_id)
        self.request.callbacks = None
        return None

    try:
        d = Docket.objects.get(
            docket_number=docket_number,
            court_id=court_id,
        )
    except Docket.DoesNotExist:
        d = None
    except Docket.MultipleObjectsReturned:
        d = None

    if d is None:
        d, count = find_docket_object(court_id, docket_number, docket_number)
        if count > 1:
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d, og_info = update_docket_appellate_metadata(d, docket_data)
    if not d.pacer_case_id:
        d.pacer_case_id = docket_number

    if og_info is not None:
        og_info.save()
        d.originating_court_information = og_info
    d.save()
    tags = []
    if tag_names is not None:
        for tag_name in tag_names:
            tag, _ = Tag.objects.get_or_create(name=tag_name)
            tag.tag_object(d)
            tags.append(tag)

    # Save the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d,
                                upload_type=UPLOAD_TYPE.APPELLATE_DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    rds_created, content_updated = add_docket_entries(
        d, docket_data['docket_entries'], tags=tags)
    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s" % d)
    return {
        'docket_pk': d.pk,
        'content_updated': bool(rds_created or content_updated),
    }
コード例 #10
0
ファイル: tasks.py プロジェクト: saizai/courtlistener
def get_docket_by_pacer_case_id(self,
                                data,
                                court_id,
                                cookies,
                                tag_names=None,
                                **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of kwargs
    that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    :param data: A dict containing:
        Required: 'pacer_case_id': The internal case ID of the item in PACER.
        Optional: 'docket_pk': The ID of the docket to work on to avoid lookups
                  if it's known in advance.
    :param court_id: A courtlistener court ID.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a
    logged-in PACER user.
    :param tag_names: A list of tag names that should be stored with the item
    in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    :return: A dict indicating if we need to update Solr.
    """
    s = PacerSession(cookies=cookies)
    if data is None:
        logger.info("Empty data argument. Terminating " "chains and exiting.")
        self.request.callbacks = None
        return

    pacer_case_id = data.get('pacer_case_id')
    report = DocketReport(map_cl_to_pacer_id(court_id), s)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))
    if data.get('docket_pk') is not None:
        d = Docket.objects.get(pk=data['docket_pk'])
    else:
        try:
            d = Docket.objects.get(
                pacer_case_id=pacer_case_id,
                court_id=court_id,
            )
        except Docket.DoesNotExist:
            d = None
        except Docket.MultipleObjectsReturned:
            d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)
        if first_missing_id > 1:
            # We don't have to get the whole thing!
            kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" %
                (court_id, pacer_case_id))

    if not docket_data:
        logger.info("No valid docket data for %s.%s", court_id, pacer_case_id)
        self.request.callbacks = None
        return

    # Merge the contents into CL.
    if d is None:
        d, count = find_docket_object(court_id, pacer_case_id,
                                      docket_data['docket_number'])
        if count > 1:
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d.save()
    tags = []
    if tag_names is not None:
        for tag_name in tag_names:
            tag, _ = Tag.objects.get_or_create(name=tag_name)
            tag.tag_object(d)
            tags.append(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d,
                                upload_type=UPLOAD_TYPE.DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    rds_created, content_updated = add_docket_entries(
        d, docket_data['docket_entries'], tags=tags)
    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s" % d)
    return {
        'docket_pk': d.pk,
        'content_updated': bool(rds_created or content_updated),
    }