Example 1
    def handle(self, *args, **options):
        both_list_and_endpoints = (options.get('doc_id') is not None and
                                   (options.get('start_id') is not None or
                                    options.get('end_id') is not None or
                                    options.get('filed_after') is not None))
        no_option = (not any([options.get('doc_id') is None,
                              options.get('start_id') is None,
                              options.get('end_id') is None,
                              options.get('filed_after') is None,
                              options.get('all') is False]))
        if both_list_and_endpoints or no_option:
            raise CommandError('Please specify either a list of documents, a '
                               'range of ids, a range of dates, or '
                               'everything.')

        self.index = options['index']
        self.si = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='rw')

        # Use query chaining to build the query
        query = Opinion.objects.all()
        if options.get('doc_id'):
            query = query.filter(pk__in=options.get('doc_id'))
        if options.get('end_id'):
            query = query.filter(pk__lte=options.get('end_id'))
        if options.get('start_id'):
            query = query.filter(pk__gte=options.get('start_id'))
        if options.get('filed_after'):
            query = query.filter(cluster__date_filed__gte=options['filed_after'])
        if options.get('all'):
            query = Opinion.objects.all()
        count = query.count()
        docs = queryset_generator(query, chunksize=10000)
        self.update_documents(docs, count)
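
Every example on this page drives queryset_generator(), whose implementation is not shown here. As orientation only, the following is a minimal sketch, assuming a pk-keyed chunking strategy consistent with how the helper is called in these examples (an optional chunksize, iteration over very large querysets without loading them all at once); it is not the project's actual helper.

def queryset_generator_sketch(queryset, chunksize=1000):
    """Yield rows from `queryset` in ascending-pk chunks of `chunksize`.

    Hypothetical stand-in for queryset_generator, shown only to clarify the
    usage pattern in the surrounding examples.
    """
    last_pk = None
    ordered = queryset.order_by('pk')
    while True:
        # Re-query by primary key instead of using an offset.
        chunk = ordered if last_pk is None else ordered.filter(pk__gt=last_pk)
        rows = list(chunk[:chunksize])
        if not rows:
            return
        for row in rows:
            yield row
        last_pk = rows[-1].pk

Because each chunk is re-queried by primary key rather than by offset, rows added or removed mid-iteration are less likely to cause duplicates or omissions, which is the behavior the chunking tests further down this page check for.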
Example 2
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)

        ds = Docket.objects.filter(source__in=Docket.RECAP_SOURCES).only(
            'pk',
            'case_name',
        )
        count = ds.count()
        xml_error_ids = []
        for i, d in enumerate(queryset_generator(ds, chunksize=50000)):
            sys.stdout.write('\rDoing docket: %s of %s, with pk: %s' %
                             (i, count, d.pk))
            sys.stdout.flush()
            if d.pk < options['start_pk'] > 0:
                continue

            try:
                d.reprocess_recap_content(do_original_xml=True)
            except IntegrityError:
                # Happens when there's wonkiness in the source data. Move on.
                continue
            except (XMLSyntaxError, IOError):
                # Happens when the local IA XML file is empty. Not sure why
                # these happen.
                xml_error_ids.append(d.pk)
                continue

        print("Encountered XMLSyntaxErrors/IOErrors for: %s" % xml_error_ids)
Example 3
 def test_queryset_generator(self):
     """Does the generator work properly with a variety of queries?"""
     tests = [
         {
             "query": UrlHash.objects.filter(pk__in=["BAD ID"]),
             "count": 0
         },
         {
             "query": UrlHash.objects.filter(pk__in=["0"]),
             "count": 1
         },
         {
             "query": UrlHash.objects.filter(pk__in=["0", "1"]),
             "count": 2
         },
     ]
     for test in tests:
         print(
             "Testing queryset_generator with %s expected results..." %
             test["count"],
             end="",
         )
         count = 0
         for _ in queryset_generator(test["query"]):
             count += 1
         self.assertEqual(count, test["count"])
         print("✓")
Example 4
 def test_queryset_generator(self):
     """Does the generator work properly with a variety of queries?"""
     tests = [
         {
             'query': UrlHash.objects.filter(pk__in=['BAD ID']),
             'count': 0
         },
         {
             'query': UrlHash.objects.filter(pk__in=['0']),
             'count': 1
         },
         {
             'query': UrlHash.objects.filter(pk__in=['0', '1']),
             'count': 2
         },
     ]
     for test in tests:
         print("Testing queryset_generator with %s expected results..." %
               test['count'],
               end='')
         count = 0
         for _ in queryset_generator(test['query']):
             count += 1
         self.assertEqual(count, test['count'])
         print('✓')
Example 5
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)

        ds = (
            Docket.objects.filter(
                # Only do ones that have HTML files *or* that have an IA XML file.
                # The latter is defined by ones that *don't* have blank
                # filepath_local fields.
                Q(html_documents__isnull=False) | ~Q(filepath_local=""),
                source__in=Docket.RECAP_SOURCES,
            ).distinct().only("pk", "case_name"))
        if options["start_pk"]:
            ds = ds.filter(pk__gte=options["start_pk"])
        count = ds.count()
        xml_error_ids = []
        for i, d in enumerate(queryset_generator(ds, chunksize=50000)):
            sys.stdout.write("\rDoing docket: %s of %s, with pk: %s" %
                             (i, count, d.pk))
            sys.stdout.flush()

            try:
                d.reprocess_recap_content(do_original_xml=True)
            except IntegrityError:
                # Happens when there's wonkiness in the source data. Move on.
                continue
            except (XMLSyntaxError, IOError):
                # Happens when the local IA XML file is empty. Not sure why
                # these happen.
                xml_error_ids.append(d.pk)
                continue

        print("Encountered XMLSyntaxErrors/IOErrors for: %s" % xml_error_ids)
Example 6
    def test_queryset_generator_values_query(self):
        """Do values queries work?"""
        print("Testing raising an error when we can't get a PK in a values "
              "query...", end='')
        self.assertRaises(
            Exception,
            queryset_generator(UrlHash.objects.values('sha1')),
            msg="Values query did not fail when pk was not provided."
        )
        print('✓')

        print("Testing a good values query...", end='')
        self.assertEqual(
            sum(1 for _ in queryset_generator(UrlHash.objects.values())),
            2,
        )
        print('✓')
Example 7
    def test_queryset_generator_values_query(self):
        """Do values queries work?"""
        print("Testing raising an error when we can't get a PK in a values "
              "query...", end='')
        self.assertRaises(
            Exception,
            queryset_generator(UrlHash.objects.values('sha1')),
            msg="Values query did not fail when pk was not provided."
        )
        print('✓')

        print("Testing a good values query...", end='')
        self.assertEqual(
            sum(1 for _ in queryset_generator(UrlHash.objects.values())),
            2,
        )
        print('✓')
Example 8
 def add_or_update_by_datetime(self, dt):
     """
     Given a datetime, adds or updates all items newer than that time.
     """
     self.stdout.write("Adding or updating items(s) newer than %s\n" % dt)
     qs = self.type.objects.filter(date_created__gte=dt)
     items = queryset_generator(qs, chunksize=5000)
     count = qs.count()
     self._chunk_queryset_into_tasks(items, count)
Example 9
 def add_or_update_by_datetime(self, dt):
     """
     Given a datetime, adds or updates all items newer than that time.
     """
     self.stdout.write("Adding or updating items(s) newer than %s\n" % dt)
     qs = self.type.objects.filter(date_created__gte=dt)
     items = queryset_generator(qs, chunksize=5000)
     count = qs.count()
     self.process_queryset(items, count)
Example 10
def populate_docket_number_core_field(apps, schema_editor):
    Docket = apps.get_model('search', 'Docket')
    ds = Docket.objects.filter(
        court__jurisdiction='FD',
        docket_number_core='',
    ).only('docket_number')

    for d in queryset_generator(ds):
        d.docket_number_core = make_docket_number_core(d.docket_number)
        d.save()
Example 11
 def test_queryset_generator_chunking(self):
     """Does chunking work properly without duplicates or omissions?"""
     print("Testing if queryset_iterator chunking returns the right "
           "number of results...", end='')
     expected_count = 2
     results = queryset_generator(UrlHash.objects.all(), chunksize=1)
     self.assertEqual(
         expected_count,
         sum(1 for _ in results),
     )
     print('✓')
Example 12
 def test_queryset_generator_chunking(self):
     """Does chunking work properly without duplicates or omissions?"""
     print("Testing if queryset_iterator chunking returns the right "
           "number of results...", end='')
     expected_count = 2
     results = queryset_generator(UrlHash.objects.all(), chunksize=1)
     self.assertEqual(
         expected_count,
         sum(1 for _ in results),
     )
     print('✓')
Example 13
    def add_or_update_all(self):
        """
        Iterates over the entire corpus, adding it to the index. Can be run on
        an empty index or an existing one.

        If run on an existing index, existing items will be updated.
        """
        self.stdout.write("Adding or updating all items...\n")
        if self.type == Person:
            q = self.type.objects.filter(
                is_alias_of=None
            ).prefetch_related(
                'positions',
                'positions__predecessor',
                'positions__supervisor',
                'positions__appointer',
                'positions__court',
                'political_affiliations',
                'aba_ratings',
                'educations__school',
                'aliases',
                'race',
            )
            # Filter out non-judges -- they don't get searched.
            q = [item for item in q if item.is_judge]
            count = len(q)
        elif self.type == Docket:
            q = self.type.objects.filter(source=Docket.RECAP)
            count = q.count()
            q = queryset_generator(
                q,
                chunksize=5000,
            )
        else:
            q = self.type.objects.all()
            count = q.count()
            q = queryset_generator(
                q,
                chunksize=5000,
            )
        self._chunk_queryset_into_tasks(q, count)
Example 14
    def add_or_update_all(self):
        """
        Iterates over the entire corpus, adding it to the index. Can be run on
        an empty index or an existing one.

        If run on an existing index, existing items will be updated.
        """
        self.stdout.write("Adding or updating all items...\n")
        q = self.type.objects.all()
        items = queryset_generator(q, chunksize=5000)
        count = q.count()
        self._chunk_queryset_into_tasks(items, count)
Example 15
    def add_or_update_all(self):
        """
        Iterates over the entire corpus, adding it to the index. Can be run on
        an empty index or an existing one.

        If run on an existing index, existing items will be updated.
        """
        self.stdout.write("Adding or updating all items...\n")
        q = self.type.objects.all()
        items = queryset_generator(q, chunksize=5000)
        count = q.count()
        self._chunk_queryset_into_tasks(items, count)
Example 16
    def add_or_update_all(self):
        """
        Iterates over the entire corpus, adding it to the index. Can be run on
        an empty index or an existing one.

        If run on an existing index, existing items will be updated.
        """
        self.stdout.write("Adding or updating all items...\n")
        if self.type == Person:
            q = self.type.objects.filter(is_alias_of=None).prefetch_related(
                'positions',
                'positions__predecessor',
                'positions__supervisor',
                'positions__appointer',
                'positions__court',
                'political_affiliations',
                'aba_ratings',
                'educations__school',
                'aliases',
                'race',
            )
            # Filter out non-judges -- they don't get searched.
            q = [item for item in q if item.is_judge]
            count = len(q)
        elif self.type == Docket:
            q = self.type.objects.filter(source=Docket.RECAP)
            count = q.count()
            q = queryset_generator(
                q,
                chunksize=5000,
            )
        else:
            q = self.type.objects.all()
            count = q.count()
            q = queryset_generator(
                q,
                chunksize=5000,
            )
        self._chunk_queryset_into_tasks(q, count)
Example 17
def process_pdf_queue(options):
    """Download all PDFs in queue

    Work through the queue of PDFs that need to be added to the database,
    download them and add them one by one.

    :return: None
    """
    pdf_pks = queryset_generator(
        QueuedPDF.objects.all().order_by("-document_id").values_list(
            "pk", flat=True))
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    for pdf_pk in pdf_pks:
        throttle.maybe_wait()
        tasks.download_pdf.apply_async(kwargs={"pdf_pk": pdf_pk}, queue=q)
Example 18
    def do_first_pass(options):
        idb_rows = FjcIntegratedDatabase.objects.filter(
            dataset_source=CV_2017,
        ).order_by("pk")
        q = options["queue"]
        throttle = CeleryThrottle(queue_name=q)
        for i, idb_row in enumerate(queryset_generator(idb_rows)):
            # Iterate over all items in the IDB and find them in the Docket
            # table. If they're not there, create a new item.
            if i < options["offset"]:
                continue
            if i >= options["limit"] > 0:
                break

            throttle.maybe_wait()
            # TODO: See conversation in #courtlistener channel from 2019-07-11,
            # In which it appears we matched a criminal case with a civil one.
            # The code below doesn't protect against that, but it should (and I
            # think it does in the `do_second_pass` code, below.)
            ds = Docket.objects.filter(
                docket_number_core=idb_row.docket_number,
                court=idb_row.district,
            )
            count = ds.count()
            if count == 0:
                logger.info(
                    "%s: Creating new docket for IDB row: %s", i, idb_row
                )
                create_new_docket_from_idb.apply_async(
                    args=(idb_row.pk,), queue=q,
                )

            elif count == 1:
                d = ds[0]
                logger.info(
                    "%s: Merging Docket %s with IDB row: %s", i, d, idb_row
                )
                merge_docket_with_idb.apply_async(
                    args=(d.pk, idb_row.pk), queue=q
                )
            elif count > 1:
                logger.warning(
                    "%s: Unable to merge. Got %s dockets for row: %s",
                    i,
                    count,
                    idb_row,
                )
Example 19
 def test_queryset_generator(self):
     """Does the generator work properly with a variety of queries?"""
     tests = [
         {'query': UrlHash.objects.filter(pk__in=['BAD ID']),
          'count': 0},
         {'query': UrlHash.objects.filter(pk__in=['0']),
          'count': 1},
         {'query': UrlHash.objects.filter(pk__in=['0', '1']),
          'count': 2},
     ]
     for test in tests:
         print("Testing queryset_generator with %s expected results..." %
               test['count'], end='')
         count = 0
         for _ in queryset_generator(test['query']):
             count += 1
         self.assertEqual(count, test['count'])
         print('✓')
Example 20
def get_pdfs(options):
    """Get PDFs for the results of the Free Document Report queries.

    At this stage, we have rows in the PACERFreeDocumentRow table, each of
    which represents a PDF we need to download and merge into our normal
    tables: Docket, DocketEntry, and RECAPDocument.

    In this function, we iterate over the entire table of results, merge it
    into our normal tables, and then download and extract the PDF.

    :return: None
    """
    q = options["queue"]
    index = options["index"]
    cnt = CaseNameTweaker()
    rows = PACERFreeDocumentRow.objects.filter(error_msg="").only("pk")
    count = rows.count()
    task_name = "downloading"
    if index:
        task_name += " and indexing"
    logger.info("%s %s items from PACER." % (task_name, count))
    throttle = CeleryThrottle(queue_name=q)
    completed = 0
    for row in queryset_generator(rows):
        throttle.maybe_wait()
        if completed % 30000 == 0:
            pacer_session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            pacer_session.login()
        c = chain(
            process_free_opinion_result.si(row.pk, cnt).set(queue=q),
            get_and_process_pdf.s(pacer_session.cookies, row.pk).set(queue=q),
            delete_pacer_row.s(row.pk).set(queue=q),
        )
        if index:
            c |= add_items_to_solr.s("search.RECAPDocument").set(queue=q)
        c.apply_async()
        completed += 1
        if completed % 1000 == 0:
            logger.info(
                "Sent %s/%s tasks to celery for %s so "
                "far." % (completed, count, task_name)
            )
Example 21
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)

        ds = Docket.objects.filter(source__in=Docket.RECAP_SOURCES).only(
            'pk',
            'case_name',
        )
        count = ds.count()
        for i, d in enumerate(queryset_generator(ds, chunksize=50000)):
            sys.stdout.write('\rDoing docket: %s of %s, with pk: %s' %
                             (i, count, d.pk))
            sys.stdout.flush()
            if d.pk < options['start_pk'] > 0:
                continue

            try:
                d.reprocess_recap_content(do_original_xml=True)
            except IntegrityError:
                # Happens when there's wonkiness in the source data. Move on.
                continue
Example 22
def get_pdfs(options):
    """Get PDFs for the results of the Free Document Report queries.

    At this stage, we have rows in the PACERFreeDocumentRow table, each of
    which represents a PDF we need to download and merge into our normal
    tables: Docket, DocketEntry, and RECAPDocument.

    In this function, we iterate over the entire table of results, merge it
    into our normal tables, and then download and extract the PDF.

    :return: None
    """
    q = options['queue']
    index = options['index']
    cnt = CaseNameTweaker()
    rows = PACERFreeDocumentRow.objects.filter(error_msg="").only('pk')
    count = rows.count()
    task_name = "downloading"
    if index:
        task_name += " and indexing"
    logger.info("%s %s items from PACER." % (task_name, count))
    throttle = CeleryThrottle(queue_name=q)
    completed = 0
    for row in queryset_generator(rows):
        throttle.maybe_wait()
        if completed % 30000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
        c = chain(
            process_free_opinion_result.si(row.pk, cnt).set(queue=q),
            get_and_process_pdf.s(pacer_session.cookies, row.pk).set(queue=q),
            delete_pacer_row.s(row.pk).set(queue=q),
        )
        if index:
            c |= add_items_to_solr.s('search.RECAPDocument').set(queue=q)
        c.apply_async()
        completed += 1
        if completed % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery for %s so "
                        "far." % (completed, count, task_name))
Example 23
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)
        qs = OpinionCluster.objects.all()
        start_at = options['start_at']
        if start_at:
            qs = qs.filter(pk__gte=start_at)
        for i, cluster in enumerate(queryset_generator(qs)):
            for field in cluster.citation_fields:
                citation_str = getattr(cluster, field)
                if citation_str:
                    # Split the citation and add it to the DB.
                    try:
                        citation_obj = get_citations(
                            citation_str,
                            html=False,
                            do_post_citation=False,
                            do_defendant=False,
                            disambiguate=False,
                        )[0]
                    except IndexError:
                        msg = "Errored out on: %s in %s" % (citation_str,
                                                            cluster.pk)
                        print(msg)
                        logger.info(msg)
                        continue
                    try:
                        Citation.objects.create(
                            cluster=cluster,
                            volume=citation_obj.volume,
                            reporter=citation_obj.reporter,
                            page=citation_obj.page,
                            type=map_model_field_to_citation_type(field)
                        )
                    except IntegrityError:
                        # Violated unique_together constraint. Fine.
                        pass

            if i % 1000 == 0:
                msg = "Completed %s items (last: %s)"
                print(msg % (i, cluster.pk))
                logger.info(msg, i, cluster.pk)
Example 24
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)

        idb_rows = FjcIntegratedDatabase.objects.filter(
            dataset_source=CV_2017, ).order_by('pk')
        q = options['queue']
        throttle = CeleryThrottle(queue_name=q)
        for i, idb_row in enumerate(queryset_generator(idb_rows)):
            # Iterate over all items in the IDB and find them in the Docket
            # table. If they're not there, create a new item.
            if i < options['offset']:
                continue
            if i >= options['limit'] > 0:
                break

            throttle.maybe_wait()
            docket_number_no_0s = remove_leading_zeros(idb_row.docket_number)
            ds = Docket.objects.filter(
                Q(docket_number_core=idb_row.docket_number)
                | Q(docket_number_core=docket_number_no_0s),
                court=idb_row.district,
            )
            count = ds.count()
            if count == 0:
                logger.info("%s: Creating new docket for IDB row: %s", i,
                            idb_row)
                create_new_docket_from_idb.apply_async(
                    args=(idb_row.pk, ),
                    queue=q,
                )

            elif count == 1:
                d = ds[0]
                logger.info("%s: Merging Docket %s with IDB row: %s", i, d,
                            idb_row)
                merge_docket_with_idb.apply_async(args=(d.pk, idb_row.pk),
                                                  queue=q)
            elif count > 1:
                logger.warn("%s: Unable to merge. Got %s dockets for row: %s",
                            i, count, idb_row)
Example 25
    def update_any_missing_pacer_case_ids(options):
        """The network requests were making things far too slow and had to be
        disabled during the first pass. With this method, we update any items
        that are missing their pacer case ID value.
        """
        ds = Docket.objects.filter(
            idb_data__isnull=False,
            pacer_case_id=None,
        )
        q = options['queue']
        throttle = CeleryThrottle(queue_name=q)
        session = PacerSession(username=PACER_USERNAME,
                               password=PACER_PASSWORD)
        session.login()
        for i, d in enumerate(queryset_generator(ds)):
            if i < options['offset']:
                continue
            if i >= options['limit'] > 0:
                break

            if i % 5000 == 0:
                # Re-authenticate just in case the auto-login mechanism isn't
                # working.
                session = PacerSession(username=PACER_USERNAME,
                                       password=PACER_PASSWORD)
                session.login()

            throttle.maybe_wait()
            logger.info("Getting pacer_case_id for item %s", d)
            params = make_fjc_idb_lookup_params(d.idb_data)
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=d.pk,
                    docket_number=d.idb_data.docket_number,
                    court_id=d.idb_data.district_id,
                    cookies=session.cookies,
                    **params
                ).set(queue=q),
                update_docket_from_hidden_api.s().set(queue=q),
            ).apply_async()
Example 26
    def handle(self, *args, **options):
        """
        For any item that has a citation count > 0, update the citation
        count based on the DB.
        """
        index_during_processing = False
        if options['index'] == 'concurrently':
            index_during_processing = True

        q = OpinionCluster.objects.filter(citation_count__gt=0)
        if options.get('doc_id'):
            q = q.filter(pk__in=options['doc_id'])
        items = queryset_generator(q, chunksize=10000)
        for item in items:
            count = 0
            for sub_opinion in item.sub_opinions.all():
                count += sub_opinion.citing_opinions.all().count()

            item.citation_count = count
            item.save(index=index_during_processing)

        self.do_solr(options)
Example 27
    def handle(self, *args, **options):
        """
        For any item that has a citation count > 0, update the citation
        count based on the DB.
        """
        index_during_processing = False
        if options['index'] == 'concurrently':
            index_during_processing = True

        q = OpinionCluster.objects.filter(citation_count__gt=0)
        if options.get('doc_id'):
            q = q.filter(pk__in=options['doc_id'])
        items = queryset_generator(q, chunksize=10000)
        for item in items:
            count = 0
            for sub_opinion in item.sub_opinions.all():
                count += sub_opinion.citing_opinions.all().count()

            item.citation_count = count
            item.save(index=index_during_processing)

        self.do_solr(options)
Example 28
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)
        both_list_and_endpoints = (options.get('doc_id') is not None and
                                   (options.get('start_id') is not None
                                    or options.get('end_id') is not None
                                    or options.get('filed_after') is not None))
        no_option = (not any([
            options.get('doc_id') is None,
            options.get('start_id') is None,
            options.get('end_id') is None,
            options.get('filed_after') is None,
            options.get('all') is False
        ]))
        if both_list_and_endpoints or no_option:
            raise CommandError('Please specify either a list of documents, a '
                               'range of ids, a range of dates, or '
                               'everything.')

        self.index = options['index']
        self.si = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='rw')

        # Use query chaining to build the query
        query = Opinion.objects.all()
        if options.get('doc_id'):
            query = query.filter(pk__in=options.get('doc_id'))
        if options.get('end_id'):
            query = query.filter(pk__lte=options.get('end_id'))
        if options.get('start_id'):
            query = query.filter(pk__gte=options.get('start_id'))
        if options.get('filed_after'):
            query = query.filter(
                cluster__date_filed__gte=options['filed_after'])
        if options.get('all'):
            query = Opinion.objects.all()
        self.count = query.count()
        self.average_per_s = 0
        self.timings = []
        docs = queryset_generator(query, chunksize=10000)
        self.update_documents(docs)
Example 29
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)

        ops = queryset_generator(
            Opinion.objects.exclude(
                Q(html="") | Q(html=None),
                Q(html_lawbox="") | Q(html_lawbox=None),
                Q(html_columbia="") | Q(html_columbia=None),
            ))

        for op in ops:
            content = render_to_string("simple_opinion.html", {"o": op})
            output_dir = os.path.join(
                options["output_directory"],
                str(op.cluster.date_filed.year),
                str(op.cluster.date_filed.month),
                str(op.cluster.date_filed.day),
            )
            mkdir_p(output_dir)
            output_path = os.path.join(output_dir, f"{op.pk}.html")
            with open(output_path, "w") as f:
                # render_to_string() returns text, so write it as-is.
                f.write(content)
Example 30
    def handle(self, *args, **options):
        both_list_and_endpoints = options.get("doc_id") is not None and (
            options.get("start_id") is not None
            or options.get("end_id") is not None
            or options.get("filed_after") is not None
        )
        no_option = not any(
            [
                options.get("doc_id") is None,
                options.get("start_id") is None,
                options.get("end_id") is None,
                options.get("filed_after") is None,
                options.get("all") is False,
            ]
        )
        if both_list_and_endpoints or no_option:
            raise CommandError(
                "Please specify either a list of documents, a " "range of ids, a range of dates, or " "everything."
            )

        self.index = options["index"]
        self.si = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode="rw")

        # Use query chaining to build the query
        query = Opinion.objects.all()
        if options.get("doc_id"):
            query = query.filter(pk__in=options.get("doc_id"))
        if options.get("end_id"):
            query = query.filter(pk__lte=options.get("end_id"))
        if options.get("start_id"):
            query = query.filter(pk__gte=options.get("start_id"))
        if options.get("filed_after"):
            query = query.filter(cluster__date_filed__gte=options["filed_after"])
        if options.get("all"):
            query = Opinion.objects.all()
        count = query.count()
        docs = queryset_generator(query, chunksize=10000)
        self.update_documents(docs, count)
Example 31
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)

        ops = queryset_generator(Opinion.objects.exclude(
            Q(html='') | Q(html=None),
            Q(html_lawbox='') | Q(html_lawbox=None),
            Q(html_columbia='') | Q(html_columbia=None),
        ))

        for op in ops:
            content = render_to_string('simple_opinion.html', {
                'o': op,
            })
            output_dir = os.path.join(
                options['output_directory'],
                str(op.cluster.date_filed.year),
                str(op.cluster.date_filed.month),
                str(op.cluster.date_filed.day),
            )
            mkdir_p(output_dir)
            output_path = os.path.join(output_dir, '%s.html' % op.pk)
            with open(output_path, 'w') as f:
                f.write(content.encode('utf-8'))
Example 32
    def do_first_pass(options):
        idb_rows = FjcIntegratedDatabase.objects.filter(
            dataset_source=CV_2017,
        ).order_by('pk')
        q = options['queue']
        throttle = CeleryThrottle(queue_name=q)
        for i, idb_row in enumerate(queryset_generator(idb_rows)):
            # Iterate over all items in the IDB and find them in the Docket
            # table. If they're not there, create a new item.
            if i < options['offset']:
                continue
            if i >= options['limit'] > 0:
                break

            throttle.maybe_wait()
            ds = Docket.objects.filter(
                docket_number_core=idb_row.docket_number,
                court=idb_row.district,
            )
            count = ds.count()
            if count == 0:
                logger.info("%s: Creating new docket for IDB row: %s",
                            i, idb_row)
                create_new_docket_from_idb.apply_async(
                    args=(idb_row.pk,),
                    queue=q,
                )

            elif count == 1:
                d = ds[0]
                logger.info("%s: Merging Docket %s with IDB row: %s",
                            i, d, idb_row)
                merge_docket_with_idb.apply_async(args=(d.pk, idb_row.pk),
                                                  queue=q)
            elif count > 1:
                logger.warn("%s: Unable to merge. Got %s dockets for row: %s",
                            i, count, idb_row)
Example 33
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)

        ops = queryset_generator(
            Opinion.objects.exclude(
                Q(html='') | Q(html=None),
                Q(html_lawbox='') | Q(html_lawbox=None),
                Q(html_columbia='') | Q(html_columbia=None),
            ))

        for op in ops:
            content = render_to_string('simple_opinion.html', {
                'o': op,
            })
            output_dir = os.path.join(
                options['output_directory'],
                str(op.cluster.date_filed.year),
                str(op.cluster.date_filed.month),
                str(op.cluster.date_filed.day),
            )
            mkdir_p(output_dir)
            output_path = os.path.join(output_dir, '%s.html' % op.pk)
            with open(output_path, 'w') as f:
                f.write(content.encode('utf-8'))
Example 34
 def handle(self, *args, **options):
     super(Command, self).handle(*args, **options)
     qs = RECAPDocument.objects.filter(is_available=True, file_size=None)
     for i, rd in enumerate(queryset_generator(qs)):
         try:
             rd.file_size = rd.filepath_local.size
         except OSError as e:
             if e.errno != 2:
                 # Problem other than No such file or directory.
                 raise
             continue
         except ValueError:
             #  The 'filepath_local' attribute has no file
             # associated with it.
             continue
         try:
             rd.save()
         except ValidationError:
             # [u'Duplicate values violate save constraint. An object with
             # this document_number and docket_entry already exists:
             # (8, 16188376)']
             continue
         if i % 1000 == 0:
             logger.info("Completed %s items", i)
Example 35
    def handle(self, *args, **options):
        """Identify parallel citations and save them as requested.

        This process proceeds in two phases. The first phase is to work through
        the entire corpus, identifying citations that occur very near to each
        other. These are considered parallel citations, and they are built into
        a graph data structure where citations are nodes and each parallel
        citation is an edge. The weight of each edge is determined by the
        number of times a parallel citation has been identified between two
        citations. This should solve problems like typos or other issues with
        our heuristic approach.

        The second phase of this process is to update the database with the
        high quality citations. This can only be done by matching the citations
        with actual items in the database and then updating them with parallel
        citations that are sufficiently likely to be good.
        """
        super(Command, self).handle(*args, **options)
        no_option = (not any([options.get('doc_id'), options.get('all')]))
        if no_option:
            raise CommandError("Please specify if you want all items or a "
                               "specific item.")
        if not options['update_database']:
            logger.info(
                "--update_database is not set. No changes will be made to the "
                "database."
            )

        # Update Citation object to consider similar objects equal.
        self.monkey_patch_citation()

        logger.info("## Entering phase one: Building a network object of "
                    "all citations.\n")
        q = Opinion.objects.all()
        if options.get('doc_id'):
            q = q.filter(pk__in=options['doc_id'])
        count = q.count()
        opinions = queryset_generator(q, chunksize=10000)

        node_count = edge_count = completed = 0
        subtasks = []
        for o in opinions:
            subtasks.append(
                # This will call the second function with the results from the
                # first.
                get_document_citations.s(o) | identify_parallel_citations.s()
            )
            last_item = (count == completed + 1)
            if (completed % 50 == 0) or last_item:
                job = group(subtasks)
                result = job.apply_async().join()
                [self.add_groups_to_network(citation_groups) for
                 citation_groups in result]
                subtasks = []

            completed += 1
            if completed % 250 == 0 or last_item:
                # Only do this once in a while.
                node_count = len(self.g.nodes())
                edge_count = len(self.g.edges())
            sys.stdout.write("\r  Completed %s of %s. (%s nodes, %s edges)" % (
                completed,
                count,
                node_count,
                edge_count,
            ))
            sys.stdout.flush()

        logger.info("\n\n## Entering phase two: Saving the best edges to "
                    "the database.\n\n")
        for sub_graph in nx.connected_component_subgraphs(self.g):
            self.handle_subgraph(sub_graph, options)

        logger.info("\n\n## Done. Added %s new citations." % self.update_count)

        self.do_solr(options)
Example 36
    def add_or_update_all(self):
        """
        Iterates over the entire corpus, adding it to the index. Can be run on
        an empty index or an existing one.

        If run on an existing index, existing items will be updated.
        """
        self.stdout.write("Adding or updating all items...\n")
        if self.type == Person:
            q = self.type.objects.filter(is_alias_of=None).prefetch_related(
                'positions',
                'positions__predecessor',
                'positions__supervisor',
                'positions__appointer',
                'positions__court',
                'political_affiliations',
                'aba_ratings',
                'educations__school',
                'aliases',
                'race',
            )
            # Filter out non-judges -- they don't get searched.
            q = [item for item in q if item.is_judge]
            count = len(q)
        elif self.type == RECAPDocument:
            q = self.type.objects.all().prefetch_related(
                # IDs
                'docket_entry__pk',
                'docket_entry__docket__pk',
                'docket_entry__docket__court__pk',
                'docket_entry__docket__assigned_to__pk',
                'docket_entry__docket__referred_to__pk',

                # Docket Entry
                'docket_entry__description',
                'docket_entry__entry_number',
                'docket_entry__date_filed',

                # Docket
                'docket_entry__docket__date_argued',
                'docket_entry__docket__date_filed',
                'docket_entry__docket__date_terminated',
                'docket_entry__docket__docket_number',
                'docket_entry__docket__case_name_short',
                'docket_entry__docket__case_name',
                'docket_entry__docket__case_name_full',
                'docket_entry__docket__nature_of_suit',
                'docket_entry__docket__cause',
                'docket_entry__docket__jury_demand',
                'docket_entry__docket__jurisdiction_type',
                'docket_entry__docket__slug',

                # Judges
                'docket_entry__docket__assigned_to__name_first',
                'docket_entry__docket__assigned_to__name_middle',
                'docket_entry__docket__assigned_to__name_last',
                'docket_entry__docket__assigned_to__name_suffix',
                'docket_entry__docket__assigned_to_str',
                'docket_entry__docket__referred_to__name_first',
                'docket_entry__docket__referred_to__name_middle',
                'docket_entry__docket__referred_to__name_last',
                'docket_entry__docket__referred_to__name_suffix',
                'docket_entry__docket__referred_to_str',

                # Court
                'docket_entry__docket__court__full_name',
                'docket_entry__docket__court__citation_string',
            )
            count = q.count()
            q = queryset_generator(
                q,
                chunksize=5000,
            )
        else:
            q = self.type.objects.all()
            count = q.count()
            q = queryset_generator(
                q,
                chunksize=5000,
            )
        self._chunk_queryset_into_tasks(q, count)
Example 37
    def do_second_pass(options):
        """In the first pass, we ignored the duplicates that we got, preferring
        to let them stack up for later analysis. In this pass, we attempt to
        merge those failed items into the DB by more aggressive filtering and
        algorithmic selection.
        """
        idb_rows = FjcIntegratedDatabase.objects.filter(
            dataset_source=CV_2017,
            docket__isnull=True,
        ).order_by('pk')
        for i, idb_row in enumerate(queryset_generator(idb_rows)):
            # Iterate over all items in the IDB and find them in the Docket
            # table. If they're not there, create a new item.
            if i < options['offset']:
                continue
            if i >= options['limit'] > 0:
                break

            ds = Docket.objects.filter(
                docket_number_core=idb_row.docket_number,
                court=idb_row.district,
                docket_number__startswith='%s:' % idb_row.office
            ).exclude(
                docket_number__icontains='cr'
            ).exclude(
                case_name__icontains="sealed"
            ).exclude(
                case_name__icontains='suppressed'
            ).exclude(
                case_name__icontains='search warrant'
            )
            count = ds.count()

            if count == 0:
                logger.info("%s: Creating new docket for IDB row: %s",
                            i, idb_row)
                create_new_docket_from_idb(idb_row.pk)
                continue
            elif count == 1:
                d = ds[0]
                logger.info("%s: Merging Docket %s with IDB row: %s",
                            i, d, idb_row)
                merge_docket_with_idb(d.pk, idb_row.pk)
                continue

            logger.info("%s: Still have %s results after office and civil "
                        "docket number filtering. Filtering further.",
                        i, count)

            case_names = []
            for d in ds:
                case_name = harmonize(d.case_name)
                parts = case_name.lower().split(' v. ')
                if len(parts) == 1:
                    case_names.append(case_name)
                elif len(parts) == 2:
                    plaintiff, defendant = parts[0], parts[1]
                    case_names.append(
                        '%s v. %s' % (plaintiff[0:30], defendant[0:30])
                    )
                elif len(parts) > 2:
                    case_names.append(case_name)
            idb_case_name = harmonize('%s v. %s' % (idb_row.plaintiff,
                                                    idb_row.defendant))
            results = find_best_match(case_names, idb_case_name,
                                      case_sensitive=False)

            if results['ratio'] > 0.65:
                logger.info("%s Found good match by case name for %s: %s",
                            i, idb_case_name, results['match_str'])
                d = ds[results['match_index']]
                merge_docket_with_idb(d.pk, idb_row.pk)
            else:
                logger.info("%s No good match after office and case name "
                            "filtering. Creating new item: %s", i, idb_row)
                create_new_docket_from_idb(idb_row.pk)
Example 38
    def handle(self, *args, **options):
        """Identify parallel citations and save them as requested.

        This process proceeds in two phases. The first phase is to work through
        the entire corpus, identifying citations that occur very near to each
        other. These are considered parallel citations, and they are built into
        a graph data structure where citations are nodes and each parallel
        citation is an edge. The weight of each edge is determined by the
        number of times a parallel citation has been identified between two
        citations. This should solve problems like typos or other issues with
        our heuristic approach.

        The second phase of this process is to update the database with the
        high quality citations. This can only be done by matching the citations
        with actual items in the database and then updating them with parallel
        citations that are sufficiently likely to be good.
        """
        super(Command, self).handle(*args, **options)
        no_option = not any([options.get("doc_id"), options.get("all")])
        if no_option:
            raise CommandError(
                "Please specify if you want all items or a specific item.")
        if not options["update_database"]:
            logger.info(
                "--update_database is not set. No changes will be made to the "
                "database.")

        logger.info("## Entering phase one: Building a network object of "
                    "all citations.\n")
        q = Opinion.objects.all()
        if options.get("doc_id"):
            q = q.filter(pk__in=options["doc_id"])
        count = q.count()
        opinions = queryset_generator(q, chunksize=10000)

        node_count = edge_count = completed = 0
        subtasks = []
        for o in opinions:
            subtasks.append(
                # This will call the second function with the results from the
                # first.
                get_document_citations.s(o)
                | identify_parallel_citations.s())
            last_item = count == completed + 1
            if (completed % 50 == 0) or last_item:
                job = group(subtasks)
                result = job.apply_async().join()
                [
                    self.add_groups_to_network(citation_groups)
                    for citation_groups in result
                ]
                subtasks = []

            completed += 1
            if completed % 250 == 0 or last_item:
                # Only do this once in a while.
                node_count = len(self.g.nodes())
                edge_count = len(self.g.edges())
            sys.stdout.write("\r  Completed %s of %s. (%s nodes, %s edges)" %
                             (completed, count, node_count, edge_count))
            sys.stdout.flush()

        logger.info("\n\n## Entering phase two: Saving the best edges to "
                    "the database.\n\n")
        for sub_graph in nx.connected_component_subgraphs(self.g):
            self.handle_subgraph(sub_graph, options)

        logger.info("\n\n## Done. Added %s new citations." % self.update_count)

        self.do_solr(options)
Example 39
def write_json_to_disk(courts, obj_type_str, obj_class, court_attr,
                       serializer, bulk_dir):
    """Write all items to disk as json files inside directories named by
    jurisdiction.

    The main trick is that we identify if we are creating a bulk archive
    from scratch. If so, we iterate over everything. If not, we only
    iterate over items that have been modified since the last good date.

    We deal with two kinds of bulk data. The first is jurisdiction-centric, in
    which we want to make bulk data for that particular jurisdiction, such as
    opinions or PACER data, or whatever. The second is non-jurisdiction-
    specific, like people or schools. For jurisdiction-specific data, we make
    jurisdiction directories to put the data into. Otherwise, we do not.

    :param courts: Court objects that you expect to make data for.
    :param obj_type_str: A string to use for the directory name of a type of
    data. For example, for clusters, it's 'clusters'.
    :param obj_class: The actual class to make a bulk data for.
    :param court_attr: A string that can be used to find the court attribute
    on an object. For example, on clusters, this is currently docket.court_id.
    :param serializer: A DRF serializer to use to generate the data.
    :param bulk_dir: A directory to place the serialized JSON data into.

    :returns int: The number of items generated
    """
    # Are there already bulk files?
    history = BulkJsonHistory(obj_type_str, bulk_dir)
    last_good_date = history.get_last_good_date()
    history.add_current_attempt_and_save()

    if court_attr is not None:
        # Create a directory for every jurisdiction, if they don't already
        # exist. This does not clobber.
        for court in courts:
            mkdir_p(join(
                bulk_dir,
                obj_type_str,
                court.pk,
            ))
    else:
        # Make a directory for the object type.
        mkdir_p(join(bulk_dir, obj_type_str))

    if last_good_date is not None:
        print("   - Incremental data found. Assuming it's good and using it...")
        qs = obj_class.objects.filter(date_modified__gte=last_good_date)
    else:
        print("   - Incremental data not found. Working from scratch...")
        qs = obj_class.objects.all()

    if qs.count() == 0:
        print("   - No %s-type items in the DB or none that have changed. All "
              "done here." % obj_type_str)
        history.mark_success_and_save()
        return 0
    else:
        if type(qs[0].pk) == int:
            item_list = queryset_generator(qs)
        else:
            # Necessary for Court objects, which don't have ints for ids.
            item_list = qs

        i = 0
        renderer = JSONRenderer()
        r = RequestFactory().request()
        r.META['SERVER_NAME'] = 'www.courtlistener.com'  # Else, it's testserver
        r.META['SERVER_PORT'] = '443'  # Else, it's 80
        r.META['wsgi.url_scheme'] = 'https'  # Else, it's http.
        r.version = 'v3'
        r.versioning_scheme = URLPathVersioning()
        context = dict(request=r)
        for item in item_list:
            if i % 1000 == 0:
                print("Completed %s items so far." % i)
            json_str = renderer.render(
                serializer(item, context=context).data,
                accepted_media_type='application/json; indent=2',
            )

            if court_attr is not None:
                loc = join(bulk_dir, obj_type_str, deepgetattr(item, court_attr),
                           '%s.json' % item.pk)
            else:
                # A non-jurisdiction-centric object.
                loc = join(bulk_dir, obj_type_str, '%s.json' % item.pk)

            with open(loc, 'wb') as f:
                f.write(json_str)
            i += 1

        print('   - %s %s json files created.' % (i, obj_type_str))

        history.mark_success_and_save()
        return i
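
As a usage illustration of the signature documented above, a hypothetical call for cluster data might look like the sketch below. The model and serializer import paths and names are assumptions made for illustration only; they are not taken from this page.

# Hypothetical invocation of write_json_to_disk() above. Import locations and
# the serializer name are assumed for illustration only.
from cl.search.models import Court, OpinionCluster              # assumed paths
from cl.search.api_serializers import OpinionClusterSerializer  # assumed name

write_json_to_disk(
    courts=Court.objects.all(),
    obj_type_str='clusters',           # directory name used for this data type
    obj_class=OpinionCluster,          # model to serialize
    court_attr='docket.court_id',      # per the docstring above
    serializer=OpinionClusterSerializer,
    bulk_dir='/tmp/bulk-data',
)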
Example 40
def write_json_to_disk(courts, obj_type_str, obj_class, court_attr, serializer,
                       bulk_dir):
    """Write all items to disk as json files inside directories named by
    jurisdiction.

    The main trick is that we identify if we are creating a bulk archive
    from scratch. If so, we iterate over everything. If not, we only
    iterate over items that have been modified since the last good date.

    We deal with two kinds of bulk data. The first is jurisdiction-centric, in
    which we want to make bulk data for that particular jurisdiction, such as
    opinions or PACER data, or whatever. The second is non-jurisdiction-
    specific, like people or schools. For jurisdiction-specific data, we make
    jurisdiction directories to put the data into. Otherwise, we do not.

    :param courts: Court objects that you expect to make data for.
    :param obj_type_str: A string to use for the directory name of a type of
    data. For example, for clusters, it's 'clusters'.
    :param obj_class: The actual class to make a bulk data for.
    :param court_attr: A string that can be used to find the court attribute
    on an object. For example, on clusters, this is currently docket.court_id.
    :param serializer: A DRF serializer to use to generate the data.
    :param bulk_dir: A directory to place the serialized JSON data into.

    :returns int: The number of items generated
    """
    # Are there already bulk files?
    history = BulkJsonHistory(obj_type_str, bulk_dir)
    last_good_date = history.get_last_good_date()
    history.add_current_attempt_and_save()

    if court_attr is not None:
        # Create a directory for every jurisdiction, if they don't already
        # exist. This does not clobber.
        for court in courts:
            mkdir_p(join(
                bulk_dir,
                obj_type_str,
                court.pk,
            ))
    else:
        # Make a directory for the object type.
        mkdir_p(join(bulk_dir, obj_type_str))

    if last_good_date is not None:
        print(
            "   - Incremental data found. Assuming it's good and using it...")
        qs = obj_class.objects.filter(date_modified__gte=last_good_date)
    else:
        print("   - Incremental data not found. Working from scratch...")
        qs = obj_class.objects.all()

    if qs.count() == 0:
        print("   - No %s-type items in the DB or none that have changed. All "
              "done here." % obj_type_str)
        history.mark_success_and_save()
        return 0
    else:
        if type(qs[0].pk) == int:
            item_list = queryset_generator(qs)
        else:
            # Necessary for Court objects, which don't have ints for ids.
            item_list = qs

        i = 0
        renderer = JSONRenderer()
        r = RequestFactory().request()
        r.META[
            "SERVER_NAME"] = "www.courtlistener.com"  # Else, it's testserver
        r.META["SERVER_PORT"] = "443"  # Else, it's 80
        r.META["wsgi.url_scheme"] = "https"  # Else, it's http.
        r.version = "v3"
        r.versioning_scheme = URLPathVersioning()
        context = dict(request=r)
        for item in item_list:
            if i % 1000 == 0:
                print("Completed %s items so far." % i)
            json_str = renderer.render(
                serializer(item, context=context).data,
                accepted_media_type="application/json; indent=2",
            )

            if court_attr is not None:
                loc = join(
                    bulk_dir,
                    obj_type_str,
                    deepgetattr(item, court_attr),
                    "%s.json" % item.pk,
                )
            else:
                # A non-jurisdiction-centric object.
                loc = join(bulk_dir, obj_type_str, "%s.json" % item.pk)

            with open(loc, "wb") as f:
                f.write(json_str)
            i += 1

        print("   - %s %s json files created." % (i, obj_type_str))

        history.mark_success_and_save()
        return i
Example 41
def write_json_to_disk(courts, obj_type_str, obj_type, court_attr,
                       serializer):
    """Write all items to disk as json files inside directories named by
    jurisdiction.

    The main trick is that we identify if we are creating a bulk archive
    from scratch. If so, we iterate over everything. If not, we only
    iterate over items that have been modified in the last 32 days because
    it's assumed that the bulk files are generated once per month.
    """
    # Are there already bulk files?
    history = BulkJsonHistory(obj_type_str)
    last_good_date = history.get_last_good_date()
    history.add_current_attempt_and_save()

    if court_attr is not None:
        # Create a directory for every jurisdiction, if they don't already
        # exist. This does not clobber.
        for court in courts:
            mkdir_p(join(
                settings.BULK_DATA_DIR,
                'tmp',
                obj_type_str,
                court.pk,
            ))

    if last_good_date is not None:
        print "   - Incremental data found. Assuming it's good and using it..."
        qs = obj_type.objects.filter(date_modified__gte=last_good_date)
    else:
        print "   - Incremental data not found. Working from scratch..."
        qs = obj_type.objects.all()

    if qs.count() == 0:
        print "   - No %s-type items in the DB or none that have changed. All done here." % obj_type_str
        history.mark_success_and_save()
        return 0
    else:
        if type(qs[0].pk) == int:
            item_list = queryset_generator(qs)
        else:
            # Necessary for Court objects, which don't have ints for ids.
            item_list = qs

        i = 0
        renderer = JSONRenderer()
        r = RequestFactory().request()
        r.META['SERVER_NAME'] = 'www.courtlistener.com'  # Else, it's testserver
        r.version = 'v3'
        r.versioning_scheme = URLPathVersioning()
        context = dict(request=r)
        for item in item_list:
            json_str = renderer.render(
                serializer(item, context=context).data,
                accepted_media_type='application/json; indent=2',
            )

            if court_attr is not None:
                loc = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str,
                           deepgetattr(item, court_attr), '%s.json' % item.pk)
            else:
                # A non-jurisdiction-centric object.
                loc = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str,
                           '%s.json' % item.pk)

            with open(loc, 'wb') as f:
                f.write(json_str)
            i += 1

        print '   - %s %s json files created.' % (i, obj_type_str)

        history.mark_success_and_save()
        return i
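
A minimal sketch of how the function above might be invoked. The model and serializer names and their import paths are assumptions for illustration only; 'clusters' and 'docket.court_id' follow the parameter descriptions given earlier in this listing.

# Illustrative call; the Court/OpinionCluster/OpinionClusterSerializer
# import paths below are assumed, not necessarily the project's real ones.
from cl.search.models import Court, OpinionCluster
from cl.search.api_serializers import OpinionClusterSerializer

num_written = write_json_to_disk(
    courts=Court.objects.all(),
    obj_type_str='clusters',
    obj_type=OpinionCluster,
    court_attr='docket.court_id',
    serializer=OpinionClusterSerializer,
)
print('   - wrote %s cluster files in total.' % num_written)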
Example n. 42
0
def write_json_to_disk(courts, obj_type_str, obj_type, court_attr, serializer):
    """Write all items to disk as json files inside directories named by
    jurisdiction.

    The main trick is that we identify if we are creating a bulk archive
    from scratch. If so, we iterate over everything. If not, we only
    iterate over items that have been modified since the last good date.
    """
    # Are there already bulk files?
    history = BulkJsonHistory(obj_type_str)
    last_good_date = history.get_last_good_date()
    history.add_current_attempt_and_save()

    if court_attr is not None:
        # Create a directory for every jurisdiction, if they don't already
        # exist. This does not clobber.
        for court in courts:
            mkdir_p(
                join(
                    settings.BULK_DATA_DIR,
                    'tmp',
                    obj_type_str,
                    court.pk,
                ))

    if last_good_date is not None:
        print "   - Incremental data found. Assuming it's good and using it..."
        qs = obj_type.objects.filter(date_modified__gte=last_good_date)
    else:
        print "   - Incremental data not found. Working from scratch..."
        qs = obj_type.objects.all()

    if qs.count() == 0:
        print "   - No %s-type items in the DB or none that have changed. All done here." % obj_type_str
        history.mark_success_and_save()
        return 0
    else:
        if type(qs[0].pk) == int:
            item_list = queryset_generator(qs)
        else:
            # Necessary for Court objects, which don't have ints for ids.
            item_list = qs

        i = 0
        renderer = JSONRenderer()
        r = RequestFactory().request()
        r.META[
            'SERVER_NAME'] = 'www.courtlistener.com'  # Else, it's testserver
        r.version = 'v3'
        r.versioning_scheme = URLPathVersioning()
        context = dict(request=r)
        for item in item_list:
            json_str = renderer.render(
                serializer(item, context=context).data,
                accepted_media_type='application/json; indent=2',
            )

            if court_attr is not None:
                loc = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str,
                           deepgetattr(item, court_attr), '%s.json' % item.pk)
            else:
                # A non-jurisdiction-centric object.
                loc = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str,
                           '%s.json' % item.pk)

            with open(loc, 'wb') as f:
                f.write(json_str)
            i += 1

        print '   - %s %s json files created.' % (i, obj_type_str)

        history.mark_success_and_save()
        return i
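
The type(qs[0].pk) == int check in both versions above exists because queryset_generator walks the queryset in primary-key order and fetches it in chunks, which only works with integer pks; Court objects use string pks, so they are iterated directly instead. The sketch below is a rough illustration of that kind of chunked iteration, not the project's actual implementation.

def chunked_by_pk(queryset, chunksize=1000):
    """Yield rows in pk order, chunksize at a time, to keep memory flat.

    Rough stand-in for queryset_generator's approach; requires integer pks.
    """
    last_pk = 0
    queryset = queryset.order_by('pk')
    while True:
        chunk = list(queryset.filter(pk__gt=last_pk)[:chunksize])
        if not chunk:
            break
        for obj in chunk:
            yield obj
        last_pk = chunk[-1].pk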
Example n. 43
0
    def migrate_opinions_oral_args_and_dockets(self):
        self.stdout.write("Migrating dockets, audio files, and opinions to new "
                          "database...")
        q = DocketOld.objects.using('old').all()
        old_dockets = queryset_generator(q)
        num_dockets = q.count()

        progress = 0
        self._print_progress(progress, num_dockets)
        for old_docket in old_dockets:
            # First do the docket, then create the cluster and opinion objects.
            try:
                old_audio = old_docket.audio_files.all()[0]
            except IndexError:
                old_audio = None
            try:
                old_document = old_docket.documents.all()[0]
            except IndexError:
                old_document = None
            if old_document is not None:
                old_citation = old_document.citation
                (old_doc_case_name, old_doc_case_name_full,
                 old_doc_case_name_short) = self._get_case_names(
                     old_citation.case_name)
            if old_audio is not None:
                (old_audio_case_name, old_audio_case_name_full,
                 old_audio_case_name_short) = self._get_case_names(
                     old_audio.case_name)

            court = CourtNew.objects.get(pk=old_docket.court_id)  # Courts are in place thanks to initial data.

            new_docket = DocketNew(
                pk=old_docket.pk,
                date_modified=old_docket.date_modified,
                date_created=old_docket.date_modified,
                court=court,
                case_name=old_doc_case_name,
                case_name_full=old_doc_case_name_full,
                case_name_short=old_doc_case_name_short,
                slug=self._none_to_blank(old_docket.slug),
                docket_number=self._none_to_blank(old_citation.docket_number),
                date_blocked=old_docket.date_blocked,
                blocked=old_docket.blocked,
            )
            if old_audio is not None:
                new_docket.date_argued = old_audio.date_argued
            new_docket.save(using='default')

            if old_document is not None:
                new_opinion_cluster = OpinionClusterNew(
                    pk=old_document.pk,
                    docket=new_docket,
                    judges=self._none_to_blank(old_document.judges),
                    date_modified=old_document.date_modified,
                    date_created=old_document.date_modified,
                    date_filed=old_document.date_filed,
                    slug=self._none_to_blank(old_citation.slug),
                    citation_id=old_document.citation_id,
                    case_name_short=old_doc_case_name_short,
                    case_name=old_doc_case_name,
                    case_name_full=old_doc_case_name_full,
                    federal_cite_one=self._none_to_blank(
                        old_citation.federal_cite_one),
                    federal_cite_two=self._none_to_blank(
                        old_citation.federal_cite_two),
                    federal_cite_three=self._none_to_blank(
                        old_citation.federal_cite_three),
                    state_cite_one=self._none_to_blank(
                        old_citation.state_cite_one),
                    state_cite_two=self._none_to_blank(
                        old_citation.state_cite_two),
                    state_cite_three=self._none_to_blank(
                        old_citation.state_cite_three),
                    state_cite_regional=self._none_to_blank(
                        old_citation.state_cite_regional),
                    specialty_cite_one=self._none_to_blank(
                        old_citation.specialty_cite_one),
                    scotus_early_cite=self._none_to_blank(
                        old_citation.scotus_early_cite),
                    lexis_cite=self._none_to_blank(old_citation.lexis_cite),
                    westlaw_cite=self._none_to_blank(old_citation.westlaw_cite),
                    neutral_cite=self._none_to_blank(old_citation.neutral_cite),
                    scdb_id=self._none_to_blank(
                        old_document.supreme_court_db_id),
                    source=old_document.source,
                    nature_of_suit=old_document.nature_of_suit,
                    citation_count=old_document.citation_count,
                    precedential_status=old_document.precedential_status,
                    date_blocked=old_document.date_blocked,
                    blocked=old_document.blocked,
                )
                new_opinion_cluster.save(
                    using='default',
                    index=False,
                )

                new_opinion = OpinionNew(
                    pk=old_document.pk,
                    cluster=new_opinion_cluster,
                    date_modified=old_document.date_modified,
                    date_created=old_document.time_retrieved,
                    type='010combined',
                    sha1=old_document.sha1,
                    download_url=old_document.download_url,
                    local_path=old_document.local_path,
                    plain_text=old_document.plain_text,
                    html=self._none_to_blank(old_document.html),
                    html_lawbox=self._none_to_blank(old_document.html_lawbox),
                    html_with_citations=old_document.html_with_citations,
                    extracted_by_ocr=old_document.extracted_by_ocr,
                )
                new_opinion.save(
                    using='default',
                    index=False,
                )

            if old_audio is not None:
                new_audio_file = AudioNew(
                    pk=old_audio.pk,
                    docket=new_docket,
                    source=old_audio.source,
                    case_name=old_audio_case_name,
                    case_name_short=old_audio_case_name_short,
                    case_name_full=old_audio_case_name_full,
                    judges=self._none_to_blank(old_audio.judges),
                    date_created=old_audio.time_retrieved,
                    date_modified=old_audio.date_modified,
                    sha1=old_audio.sha1,
                    download_url=old_audio.download_url,
                    local_path_mp3=old_audio.local_path_mp3,
                    local_path_original_file=old_audio.local_path_original_file,
                    duration=old_audio.duration,
                    processing_complete=old_audio.processing_complete,
                    date_blocked=old_audio.date_blocked,
                    blocked=old_audio.blocked,
                )
                new_audio_file.save(
                    using='default',
                    index=False,
                )

            progress += 1
            self._print_progress(progress, num_dockets)
        self.stdout.write(u'')  # Newline
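
_none_to_blank and _get_case_names are helpers defined elsewhere in this command. Judging from how _none_to_blank is used above, it presumably converts None to an empty string so non-nullable text fields accept the value; a minimal sketch under that assumption follows.

    def _none_to_blank(self, value):
        # Assumed behavior inferred from usage above; the real helper may differ.
        return '' if value is None else value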
Example n. 44
0
    def do_second_pass(options):
        """In the first pass, we ignored the duplicates that we got, preferring
        to let them stack up for later analysis. In this pass, we attempt to
        merge those failed items into the DB by more aggressive filtering and
        algorithmic selection.
        """
        idb_rows = FjcIntegratedDatabase.objects.filter(
            dataset_source=CV_2017,
            docket__isnull=True,
        ).order_by('pk')
        for i, idb_row in enumerate(queryset_generator(idb_rows)):
            # Iterate over all items in the IDB and find them in the Docket
            # table. If they're not there, create a new item.
            if i < options['offset']:
                continue
            if i >= options['limit'] > 0:
                break

            ds = (Docket.objects
                  .filter(
                      docket_number_core=idb_row.docket_number,
                      court=idb_row.district,
                      docket_number__startswith='%s:' % idb_row.office)
                  .exclude(docket_number__icontains='cr')
                  .exclude(case_name__icontains='sealed')
                  .exclude(case_name__icontains='suppressed')
                  .exclude(case_name__icontains='search warrant'))
            count = ds.count()

            if count == 0:
                logger.info("%s: Creating new docket for IDB row: %s", i,
                            idb_row)
                create_new_docket_from_idb(idb_row.pk)
                continue
            elif count == 1:
                d = ds[0]
                logger.info("%s: Merging Docket %s with IDB row: %s", i, d,
                            idb_row)
                merge_docket_with_idb(d.pk, idb_row.pk)
                continue

            logger.info(
                "%s: Still have %s results after office and civil "
                "docket number filtering. Filtering further.", i, count)

            case_names = []
            for d in ds:
                case_name = harmonize(d.case_name)
                parts = case_name.lower().split(' v. ')
                if len(parts) == 1:
                    case_names.append(case_name)
                elif len(parts) == 2:
                    plaintiff, defendant = parts[0], parts[1]
                    case_names.append('%s v. %s' %
                                      (plaintiff[0:30], defendant[0:30]))
                elif len(parts) > 2:
                    case_names.append(case_name)
            idb_case_name = harmonize('%s v. %s' %
                                      (idb_row.plaintiff, idb_row.defendant))
            results = find_best_match(case_names,
                                      idb_case_name,
                                      case_sensitive=False)

            if results['ratio'] > 0.65:
                logger.info("%s Found good match by case name for %s: %s", i,
                            idb_case_name, results['match_str'])
                d = ds[results['match_index']]
                merge_docket_with_idb(d.pk, idb_row.pk)
            else:
                logger.info(
                    "%s No good match after office and case name "
                    "filtering. Creating new item: %s", i, idb_row)
                create_new_docket_from_idb(idb_row.pk)
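
find_best_match is imported from elsewhere in the project. The stand-in below, built on difflib, shows the kind of ratio/match_str/match_index result that the 0.65 threshold above relies on; it is illustrative only and not the project's actual implementation.

from difflib import SequenceMatcher


def find_best_match(case_names, target, case_sensitive=True):
    # Illustrative stand-in only; the real helper may differ.
    if not case_sensitive:
        target = target.lower()
        case_names = [name.lower() for name in case_names]
    ratios = [SequenceMatcher(None, target, name).ratio()
              for name in case_names]
    best = max(range(len(ratios)), key=lambda idx: ratios[idx])
    return {
        'match_str': case_names[best],
        'match_index': best,
        'ratio': ratios[best],
    }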