Example #1
    def as_search_dict(self) -> Dict[str, Union[int, List[int], str]]:
        """Create a dict that can be ingested by Solr"""
        # IDs
        out = {
            "id": self.pk,
            "docket_id": self.docket_id,
            "court_id": self.docket.court_id,
        }

        # Docket
        docket = {"docketNumber": self.docket.docket_number}
        if self.docket.date_argued is not None:
            docket["dateArgued"] = midnight_pst(self.docket.date_argued)
        if self.docket.date_reargued is not None:
            docket["dateReargued"] = midnight_pst(self.docket.date_reargued)
        if self.docket.date_reargument_denied is not None:
            docket["dateReargumentDenied"] = midnight_pst(
                self.docket.date_reargument_denied)
        out.update(docket)

        # Court
        out.update({
            "court": self.docket.court.full_name,
            "court_citation_string": self.docket.court.citation_string,
            "court_exact": self.docket.court_id,  # For faceting
        })

        # Audio File
        out.update({
            "caseName": best_case_name(self),
            "panel_ids": [judge.pk for judge in self.panel.all()],
            "judge": self.judges,
            "file_size_mp3": deepgetattr(self, "local_path_mp3.size", None),
            "duration": self.duration,
            "source": self.source,
            "download_url": self.download_url,
            "local_path": deepgetattr(self, "local_path_mp3.name", None),
        })
        try:
            out["absolute_url"] = self.get_absolute_url()
        except NoReverseMatch:
            raise InvalidDocumentError(
                f"Unable to save to index due to missing absolute_url: {self.pk}"
            )

        text_template = loader.get_template("indexes/audio_text.txt")
        out["text"] = text_template.render({"item": self}).translate(null_map)

        return normalize_search_dicts(out)
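A minimal usage sketch for the dict built above, assuming a pysolr-style client and a placeholder core URL (neither appears in these examples); the real indexing pipeline may differ.

# Illustrative only: assumes pysolr is installed; the URL is a placeholder.
import pysolr

def index_audio_item(audio, solr_url='http://localhost:8983/solr/audio'):
    """Serialize one Audio object with as_search_dict() and add it to Solr."""
    conn = pysolr.Solr(solr_url)
    conn.add([audio.as_search_dict()], commit=True)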
Example #2
    def as_search_dict(self):
        """Create a dict that can be ingested by Solr"""
        # IDs
        out = {
            'id': self.pk,
            'docket_id': self.docket_id,
            'court_id': self.docket.court_id,
        }

        # Docket
        docket = {'docketNumber': self.docket.docket_number}
        if self.docket.date_argued is not None:
            docket['dateArgued'] = datetime.combine(
                self.docket.date_argued,
                time()
            )
        if self.docket.date_reargued is not None:
            docket['dateReargued'] = datetime.combine(
                self.docket.date_reargued,
                time()
            )
        if self.docket.date_reargument_denied is not None:
            docket['dateReargumentDenied'] = datetime.combine(
                self.docket.date_reargument_denied,
                time()
            )
        out.update(docket)

        # Court
        out.update({
            'court': self.docket.court.full_name,
            'court_citation_string': self.docket.court.citation_string,
            'court_exact': self.docket.court_id,  # For faceting
        })

        # Audio File
        out.update({
            'caseName': best_case_name(self),
            'panel_ids': [judge.pk for judge in self.panel.all()],
            'judge': self.judges,
            'file_size_mp3': deepgetattr(self, 'local_path_mp3.size', None),
            'duration': self.duration,
            'source': self.source,
            'download_url': self.download_url,
            'local_path': unicode(getattr(self, 'local_path_mp3', None))
        })
        try:
            out['absolute_url'] = self.get_absolute_url()
        except NoReverseMatch:
            raise InvalidDocumentError(
                "Unable to save to index due to missing absolute_url: %s"
                % self.pk
            )

        text_template = loader.get_template('indexes/audio_text.txt')
        out['text'] = text_template.render({'item': self}).translate(null_map)

        return nuke_nones(out)
Example #3
    def as_search_dict(self):
        """Create a dict that can be ingested by Solr"""
        # IDs
        out = {
            'id': self.pk,
            'docket_id': self.docket_id,
            'court_id': self.docket.court_id,
        }

        # Docket
        docket = {'docketNumber': self.docket.docket_number}
        if self.docket.date_argued is not None:
            docket['dateArgued'] = datetime.combine(self.docket.date_argued,
                                                    time())
        if self.docket.date_reargued is not None:
            docket['dateReargued'] = datetime.combine(
                self.docket.date_reargued, time())
        if self.docket.date_reargument_denied is not None:
            docket['dateReargumentDenied'] = datetime.combine(
                self.docket.date_reargument_denied, time())
        out.update(docket)

        # Court
        out.update({
            'court': self.docket.court.full_name,
            'court_citation_string': self.docket.court.citation_string,
            'court_exact': self.docket.court_id,  # For faceting
        })

        # Audio File
        out.update({
            'caseName': best_case_name(self),
            'panel_ids': [judge.pk for judge in self.panel.all()],
            'judge': self.judges,
            'file_size_mp3': deepgetattr(self, 'local_path_mp3.size', None),
            'duration': self.duration,
            'source': self.source,
            'download_url': self.download_url,
            'local_path': unicode(getattr(self, 'local_path_mp3', None)),
        })
        try:
            out['absolute_url'] = self.get_absolute_url()
        except NoReverseMatch:
            raise InvalidDocumentError(
                "Unable to save to index due to missing absolute_url: %s" %
                self.pk)

        text_template = loader.get_template('indexes/audio_text.txt')
        out['text'] = text_template.render({'item': self}).translate(null_map)

        return normalize_search_dicts(out)
Example #4
    def __init__(self, item):
        self.id = item.pk
        self.docket_id = item.docket_id

        # Docket
        if item.docket.date_argued is not None:
            self.dateArgued = datetime.combine(
                item.docket.date_argued,
                time()
            )
        if item.docket.date_reargued is not None:
            self.dateReargued = datetime.combine(
                item.docket.date_reargued,
                time()
            )
        if item.docket.date_reargument_denied is not None:
            self.dateReargumentDenied = datetime.combine(
                item.docket.date_reargument_denied,
                time()
            )
        self.docketNumber = item.docket.docket_number

        # Court
        self.court = item.docket.court.full_name
        self.court_id = item.docket.court_id
        self.court_citation_string = item.docket.court.citation_string

        # Audio file
        self.caseName = best_case_name(item)
        self.panel_ids = [judge.pk for judge in item.panel.all()]
        self.judge = item.judges
        self.file_size_mp3 = deepgetattr(item, 'local_path_mp3.size', None)
        self.duration = item.duration
        self.source = item.source
        self.download_url = item.download_url
        self.local_path = unicode(getattr(item, 'local_path_mp3', None))

        try:
            self.absolute_url = item.get_absolute_url()
        except NoReverseMatch:
            raise InvalidDocumentError(
                "Unable to save to index due to missing absolute_url: %s"
                % item.pk)

        text_template = loader.get_template('indexes/audio_text.txt')
        context = {'item': item}
        self.text = text_template.render(context).translate(null_map)

        # For faceting
        self.court_exact = item.docket.court_id
Example #5
    def __init__(self, item):
        self.id = item.pk
        self.docket_id = item.docket_id

        # Docket
        if item.docket.date_argued is not None:
            self.dateArgued = datetime.combine(item.docket.date_argued, time())
        if item.docket.date_reargued is not None:
            self.dateReargued = datetime.combine(item.docket.date_reargued,
                                                 time())
        if item.docket.date_reargument_denied is not None:
            self.dateReargumentDenied = datetime.combine(
                item.docket.date_reargument_denied, time())
        self.docketNumber = item.docket.docket_number

        # Court
        self.court = item.docket.court.full_name
        self.court_id = item.docket.court_id
        self.court_citation_string = item.docket.court.citation_string

        # Audio file
        self.caseName = best_case_name(item)
        self.panel_ids = [judge.pk for judge in item.panel.all()]
        self.judge = item.judges
        self.file_size_mp3 = deepgetattr(item, 'local_path_mp3.size', None)
        self.duration = item.duration
        self.source = item.source
        self.download_url = item.download_url
        self.local_path = unicode(getattr(item, 'local_path_mp3', None))

        try:
            self.absolute_url = item.get_absolute_url()
        except NoReverseMatch:
            raise InvalidDocumentError(
                "Unable to save to index due to missing absolute_url: %s" %
                item.pk)

        text_template = loader.get_template('indexes/audio_text.txt')
        context = {'item': item}
        self.text = text_template.render(context).translate(null_map)

        # For faceting
        self.court_exact = item.docket.court_id
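Examples #4 and #5 build the search document on an instance instead of returning a dict. A rough sketch of how such an object might be consumed, assuming the surrounding class is called SearchAudioFile (the class name is not shown in these snippets) and that its attributes are what get handed to the index:

# Hypothetical helper; SearchAudioFile stands in for whatever class owns
# the __init__ shown in Examples #4 and #5.
def audio_to_search_doc(item):
    doc = SearchAudioFile(item)  # runs the __init__ above
    return doc.__dict__          # plain mapping of field name -> value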
Example #6
def write_json_to_disk(courts, obj_type_str, obj_type, court_attr, serializer):
    """Write all items to disk as json files inside directories named by
    jurisdiction.

    The main trick is that we identify if we are creating a bulk archive
    from scratch. If so, we iterate over everything. If not, we only
    iterate over items that have been modified since the last good date.
    """
    # Are there already bulk files?
    history = BulkJsonHistory(obj_type_str)
    last_good_date = history.get_last_good_date()
    history.add_current_attempt_and_save()

    if court_attr is not None:
        # Create a directory for every jurisdiction, if they don't already
        # exist. This does not clobber.
        for court in courts:
            mkdir_p(
                join(
                    settings.BULK_DATA_DIR,
                    'tmp',
                    obj_type_str,
                    court.pk,
                ))

    if last_good_date is not None:
        print "   - Incremental data found. Assuming it's good and using it..."
        qs = obj_type.objects.filter(date_modified__gte=last_good_date)
    else:
        print "   - Incremental data not found. Working from scratch..."
        qs = obj_type.objects.all()

    if qs.count() == 0:
        print "   - No %s-type items in the DB or none that have changed. All done here." % obj_type_str
        history.mark_success_and_save()
        return 0
    else:
        if type(qs[0].pk) == int:
            item_list = queryset_generator(qs)
        else:
            # Necessary for Court objects, which don't have ints for ids.
            item_list = qs

        i = 0
        renderer = JSONRenderer()
        r = RequestFactory().request()
        r.META['SERVER_NAME'] = 'www.courtlistener.com'  # Else, it's testserver
        r.version = 'v3'
        r.versioning_scheme = URLPathVersioning()
        context = dict(request=r)
        for item in item_list:
            json_str = renderer.render(
                serializer(item, context=context).data,
                accepted_media_type='application/json; indent=2',
            )

            if court_attr is not None:
                loc = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str,
                           deepgetattr(item, court_attr), '%s.json' % item.pk)
            else:
                # A non-jurisdiction-centric object.
                loc = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str,
                           '%s.json' % item.pk)

            with open(loc, 'wb') as f:
                f.write(json_str)
            i += 1

        print '   - %s %s json files created.' % (i, obj_type_str)

        history.mark_success_and_save()
        return i
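write_json_to_disk relies on a BulkJsonHistory object to choose between a full and an incremental pass, but that class is not part of these examples. A minimal sketch of the interface it would need, assuming the state lives in a small JSON file and using the two-argument form seen in Examples #8 and #9:

# Assumed interface only; the real BulkJsonHistory may store more state.
import json
from datetime import datetime
from os.path import join


class MinimalBulkJsonHistory(object):
    """Track the last successful export date in <bulk_dir>/<type>/info.json."""

    def __init__(self, obj_type_str, bulk_dir):
        self.path = join(bulk_dir, obj_type_str, 'info.json')
        try:
            with open(self.path) as f:
                self.data = json.load(f)
        except (IOError, ValueError):
            self.data = {}

    def get_last_good_date(self):
        date_str = self.data.get('last_good_date')
        return datetime.strptime(date_str, '%Y-%m-%d') if date_str else None

    def add_current_attempt_and_save(self):
        self.data['last_attempt'] = datetime.now().strftime('%Y-%m-%d')
        self._save()

    def mark_success_and_save(self):
        self.data['last_good_date'] = datetime.now().strftime('%Y-%m-%d')
        self._save()

    def _save(self):
        with open(self.path, 'w') as f:
            json.dump(self.data, f)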
Example #7
def write_json_to_disk(courts, obj_type_str, obj_type, court_attr,
                       serializer):
    """Write all items to disk as json files inside directories named by
    jurisdiction.

    The main trick is that we identify if we are creating a bulk archive
    from scratch. If so, we iterate over everything. If not, we only
    iterate over items that have been modified in the last 32 days because
    it's assumed that the bulk files are generated once per month.
    """
    # Are there already bulk files?
    history = BulkJsonHistory(obj_type_str)
    last_good_date = history.get_last_good_date()
    history.add_current_attempt_and_save()

    if court_attr is not None:
        # Create a directory for every jurisdiction, if they don't already
        # exist. This does not clobber.
        for court in courts:
            mkdir_p(join(
                settings.BULK_DATA_DIR,
                'tmp',
                obj_type_str,
                court.pk,
            ))

    if last_good_date is not None:
        print "   - Incremental data found. Assuming it's good and using it..."
        qs = obj_type.objects.filter(date_modified__gte=last_good_date)
    else:
        print "   - Incremental data not found. Working from scratch..."
        qs = obj_type.objects.all()

    if qs.count() == 0:
        print "   - No %s-type items in the DB or none that have changed. All done here." % obj_type_str
        history.mark_success_and_save()
        return 0
    else:
        if type(qs[0].pk) == int:
            item_list = queryset_generator(qs)
        else:
            # Necessary for Court objects, which don't have ints for ids.
            item_list = qs

        i = 0
        renderer = JSONRenderer()
        r = RequestFactory().request()
        r.META['SERVER_NAME'] = 'www.courtlistener.com'  # Else, it's testserver
        r.version = 'v3'
        r.versioning_scheme = URLPathVersioning()
        context = dict(request=r)
        for item in item_list:
            json_str = renderer.render(
                serializer(item, context=context).data,
                accepted_media_type='application/json; indent=2',
            )

            if court_attr is not None:
                loc = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str,
                           deepgetattr(item, court_attr), '%s.json' % item.pk)
            else:
                # A non-jurisdiction-centric object.
                loc = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str,
                           '%s.json' % item.pk)

            with open(loc, 'wb') as f:
                f.write(json_str)
            i += 1

        print '   - %s %s json files created.' % (i, obj_type_str)

        history.mark_success_and_save()
        return i
Example #8
def write_json_to_disk(courts, obj_type_str, obj_class, court_attr, serializer,
                       bulk_dir):
    """Write all items to disk as json files inside directories named by
    jurisdiction.

    The main trick is that we identify if we are creating a bulk archive
    from scratch. If so, we iterate over everything. If not, we only
    iterate over items that have been modified since the last good date.

    We deal with two kinds of bulk data. The first is jurisdiction-centric, in
    which we want to make bulk data for that particular jurisdiction, such as
    opinions or PACER data, or whatever. The second is non-jurisdiction-
    specific, like people or schools. For jurisdiction-specific data, we make
    jurisdiction directories to put the data into. Otherwise, we do not.

    :param courts: Court objects that you expect to make data for.
    :param obj_type_str: A string to use for the directory name of a type of
    data. For example, for clusters, it's 'clusters'.
    :param obj_class: The actual class to make a bulk data for.
    :param court_attr: A string that can be used to find the court attribute
    on an object. For example, on clusters, this is currently docket.court_id.
    :param serializer: A DRF serializer to use to generate the data.
    :param bulk_dir: A directory to place the serialized JSON data into.

    :returns int: The number of items generated
    """
    # Are there already bulk files?
    history = BulkJsonHistory(obj_type_str, bulk_dir)
    last_good_date = history.get_last_good_date()
    history.add_current_attempt_and_save()

    if court_attr is not None:
        # Create a directory for every jurisdiction, if they don't already
        # exist. This does not clobber.
        for court in courts:
            mkdir_p(join(
                bulk_dir,
                obj_type_str,
                court.pk,
            ))
    else:
        # Make a directory for the object type.
        mkdir_p(join(bulk_dir, obj_type_str))

    if last_good_date is not None:
        print(
            "   - Incremental data found. Assuming it's good and using it...")
        qs = obj_class.objects.filter(date_modified__gte=last_good_date)
    else:
        print("   - Incremental data not found. Working from scratch...")
        qs = obj_class.objects.all()

    if qs.count() == 0:
        print("   - No %s-type items in the DB or none that have changed. All "
              "done here." % obj_type_str)
        history.mark_success_and_save()
        return 0
    else:
        if type(qs[0].pk) == int:
            item_list = queryset_generator(qs)
        else:
            # Necessary for Court objects, which don't have ints for ids.
            item_list = qs

        i = 0
        renderer = JSONRenderer()
        r = RequestFactory().request()
        r.META["SERVER_NAME"] = "www.courtlistener.com"  # Else, it's testserver
        r.META["SERVER_PORT"] = "443"  # Else, it's 80
        r.META["wsgi.url_scheme"] = "https"  # Else, it's http.
        r.version = "v3"
        r.versioning_scheme = URLPathVersioning()
        context = dict(request=r)
        for item in item_list:
            if i % 1000 == 0:
                print("Completed %s items so far." % i)
            json_str = renderer.render(
                serializer(item, context=context).data,
                accepted_media_type="application/json; indent=2",
            )

            if court_attr is not None:
                loc = join(
                    bulk_dir,
                    obj_type_str,
                    deepgetattr(item, court_attr),
                    "%s.json" % item.pk,
                )
            else:
                # A non-jurisdiction-centric object.
                loc = join(bulk_dir, obj_type_str, "%s.json" % item.pk)

            with open(loc, "wb") as f:
                f.write(json_str)
            i += 1

        print("   - %s %s json files created." % (i, obj_type_str))

        history.mark_success_and_save()
        return i
Example #9
def write_json_to_disk(courts, obj_type_str, obj_class, court_attr,
                       serializer, bulk_dir):
    """Write all items to disk as json files inside directories named by
    jurisdiction.

    The main trick is that we identify if we are creating a bulk archive
    from scratch. If so, we iterate over everything. If not, we only
    iterate over items that have been modified since the last good date.

    We deal with two kinds of bulk data. The first is jurisdiction-centric, in
    which we want to make bulk data for that particular jurisdiction, such as
    opinions or PACER data, or whatever. The second is non-jurisdiction-
    specific, like people or schools. For jurisdiction-specific data, we make
    jurisdiction directories to put the data into. Otherwise, we do not.

    :param courts: Court objects that you expect to make data for.
    :param obj_type_str: A string to use for the directory name of a type of
    data. For example, for clusters, it's 'clusters'.
    :param obj_class: The actual class to make a bulk data for.
    :param court_attr: A string that can be used to find the court attribute
    on an object. For example, on clusters, this is currently docket.court_id.
    :param serializer: A DRF serializer to use to generate the data.
    :param bulk_dir: A directory to place the serialized JSON data into.

    :returns int: The number of items generated
    """
    # Are there already bulk files?
    history = BulkJsonHistory(obj_type_str, bulk_dir)
    last_good_date = history.get_last_good_date()
    history.add_current_attempt_and_save()

    if court_attr is not None:
        # Create a directory for every jurisdiction, if they don't already
        # exist. This does not clobber.
        for court in courts:
            mkdir_p(join(
                bulk_dir,
                obj_type_str,
                court.pk,
            ))
    else:
        # Make a directory for the object type.
        mkdir_p(join(bulk_dir, obj_type_str))

    if last_good_date is not None:
        print("   - Incremental data found. Assuming it's good and using it...")
        qs = obj_class.objects.filter(date_modified__gte=last_good_date)
    else:
        print("   - Incremental data not found. Working from scratch...")
        qs = obj_class.objects.all()

    if qs.count() == 0:
        print("   - No %s-type items in the DB or none that have changed. All "
              "done here." % obj_type_str)
        history.mark_success_and_save()
        return 0
    else:
        if type(qs[0].pk) == int:
            item_list = queryset_generator(qs)
        else:
            # Necessary for Court objects, which don't have ints for ids.
            item_list = qs

        i = 0
        renderer = JSONRenderer()
        r = RequestFactory().request()
        r.META['SERVER_NAME'] = 'www.courtlistener.com'  # Else, it's testserver
        r.META['SERVER_PORT'] = '443'  # Else, it's 80
        r.META['wsgi.url_scheme'] = 'https'  # Else, it's http.
        r.version = 'v3'
        r.versioning_scheme = URLPathVersioning()
        context = dict(request=r)
        for item in item_list:
            if i % 1000 == 0:
                print("Completed %s items so far." % i)
            json_str = renderer.render(
                serializer(item, context=context).data,
                accepted_media_type='application/json; indent=2',
            )

            if court_attr is not None:
                loc = join(bulk_dir, obj_type_str, deepgetattr(item, court_attr),
                           '%s.json' % item.pk)
            else:
                # A non-jurisdiction-centric object.
                loc = join(bulk_dir, obj_type_str, '%s.json' % item.pk)

            with open(loc, 'wb') as f:
                f.write(json_str)
            i += 1

        print('   - %s %s json files created.' % (i, obj_type_str))

        history.mark_success_and_save()
        return i
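A hedged example of a call site for write_json_to_disk, matching the later signature that takes bulk_dir; the model, serializer, and import paths are assumptions standing in for whatever object type is being exported:

# Illustrative call; Docket, DocketSerializer, and the import paths are
# placeholders, not confirmed by the examples above.
from cl.search.models import Court, Docket
from cl.api.serializers import DocketSerializer

n = write_json_to_disk(
    courts=Court.objects.all(),
    obj_type_str='dockets',
    obj_class=Docket,
    court_attr='court_id',        # attribute path resolved by deepgetattr
    serializer=DocketSerializer,
    bulk_dir='/tmp/bulk-data',
)
print('%s docket JSON files written.' % n)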