Ejemplo n.º 1
0
 def setUp(self):
     """Attach a BulkJsonHistory built with the 'test' argument to self."""
     history = BulkJsonHistory('test')
     self.history = history
Ejemplo n.º 2
0
def write_json_to_disk(courts, obj_type_str, obj_type, court_attr,
                       serializer):
    """Write all items to disk as json files inside directories named by
    jurisdiction.

    The main trick is that we identify if we are creating a bulk archive
    from scratch. If so, we iterate over everything. If not, we only
    iterate over items that have been modified since the last good date
    recorded by the BulkJsonHistory for this object type.

    Files are written beneath settings.BULK_DATA_DIR/tmp/<obj_type_str>/.
    When court_attr is given, each item is placed in a sub-directory named
    by the value of that attribute on the item; otherwise items are written
    directly into the object-type directory.

    :param courts: Court objects whose pk values name the per-jurisdiction
    directories. Only used when court_attr is not None.
    :param obj_type_str: A string used for the directory name of this type
    of data and as the name given to BulkJsonHistory.
    :param obj_type: The model class whose objects are serialized.
    :param court_attr: A string passed to deepgetattr to find an item's
    court, or None for data that is not organized by jurisdiction.
    :param serializer: A callable invoked as serializer(item, context=...)
    whose result exposes a .data attribute to render.

    :returns int: The number of json files written.
    """
    # Are there already bulk files?
    history = BulkJsonHistory(obj_type_str)
    last_good_date = history.get_last_good_date()
    history.add_current_attempt_and_save()

    if court_attr is not None:
        # Create a directory for every jurisdiction, if they don't already
        # exist. This does not clobber.
        for court in courts:
            mkdir_p(join(
                settings.BULK_DATA_DIR,
                'tmp',
                obj_type_str,
                court.pk,
            ))
    else:
        # Make a directory for the object type. Without it, the open() call
        # below fails for non-jurisdiction data when the directory does not
        # exist yet.
        mkdir_p(join(settings.BULK_DATA_DIR, 'tmp', obj_type_str))

    if last_good_date is not None:
        print("   - Incremental data found. Assuming it's good and using it...")
        qs = obj_type.objects.filter(date_modified__gte=last_good_date)
    else:
        print("   - Incremental data not found. Working from scratch...")
        qs = obj_type.objects.all()

    if qs.count() == 0:
        print("   - No %s-type items in the DB or none that have changed. All done here." % obj_type_str)
        history.mark_success_and_save()
        return 0

    if isinstance(qs[0].pk, int):
        item_list = queryset_generator(qs)
    else:
        # Necessary for Court objects, which don't have ints for ids.
        item_list = qs

    # Build a fake request so the serializer can produce absolute URLs.
    renderer = JSONRenderer()
    r = RequestFactory().request()
    r.META['SERVER_NAME'] = 'www.courtlistener.com'  # Else, it's testserver
    r.version = 'v3'
    r.versioning_scheme = URLPathVersioning()
    context = dict(request=r)

    i = 0
    for item in item_list:
        json_str = renderer.render(
            serializer(item, context=context).data,
            accepted_media_type='application/json; indent=2',
        )

        if court_attr is not None:
            loc = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str,
                       deepgetattr(item, court_attr), '%s.json' % item.pk)
        else:
            # A non-jurisdiction-centric object.
            loc = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str,
                       '%s.json' % item.pk)

        with open(loc, 'wb') as f:
            f.write(json_str)
        i += 1

    print('   - %s %s json files created.' % (i, obj_type_str))

    history.mark_success_and_save()
    return i
Ejemplo n.º 3
0
class BulkJsonHistoryTest(TestCase):
    """Tests for the BulkJsonHistory object stored on self.history."""

    def setUp(self):
        history = BulkJsonHistory('test')
        self.history = history

    def tearDown(self):
        # Ask the history object to remove what it put on disk.
        self.history.delete_from_disk()

    def test_load_the_file(self):
        # No save has happened in this test, so an empty dict is expected.
        loaded = self.history.load_json_file()
        self.assertEqual({}, loaded)

    def test_load_date_when_none(self):
        # Without a recorded success there is no last good date.
        self.assertIsNone(self.history.get_last_good_date())

    def test_set_date_then_load_it(self):
        history = self.history
        history.add_current_attempt_and_save()
        history.mark_success_and_save()
        last_good = history.get_last_good_date()
        # The date serialized is within ten seconds of now.
        tolerance = timedelta(seconds=10)
        self.assertAlmostEqual(last_good, now(), delta=tolerance)

    def test_add_current_attempt(self):
        history = self.history
        history.add_current_attempt_and_save()
        last_attempt = history.get_last_attempt()
        # The recorded attempt is within ten seconds of now.
        tolerance = timedelta(seconds=10)
        self.assertAlmostEqual(last_attempt, now(), delta=tolerance)
Ejemplo n.º 4
0
def write_json_to_disk(courts, obj_type_str, obj_class, court_attr,
                       serializer, bulk_dir):
    """Serialize objects of one class to individual json files on disk.

    A BulkJsonHistory for obj_type_str decides how much work is done: when it
    reports a last good date, only objects whose date_modified is on or after
    that date are written; when it reports none, every object is written.

    Two layouts are produced. Jurisdiction-centric data (court_attr is not
    None) gets one directory per court under bulk_dir/obj_type_str, and each
    item is written into the directory named by its court attribute. Data
    that is not jurisdiction-specific (court_attr is None) is written directly
    into bulk_dir/obj_type_str.

    :param courts: Court objects that you expect to make data for.
    :param obj_type_str: A string to use for the directory name of a type of
    data. For example, for clusters, it's 'clusters'.
    :param obj_class: The actual class to make a bulk data for.
    :param court_attr: A string that can be used to find the court attribute
    on an object, or None for non-jurisdiction data.
    :param serializer: A DRF serializer to use to generate the data.
    :param bulk_dir: A directory to place the serialized JSON data into.

    :returns int: The number of items generated
    """
    # Record this attempt before doing any work, remembering the previous
    # successful run (if there was one).
    tracker = BulkJsonHistory(obj_type_str, bulk_dir)
    last_good_date = tracker.get_last_good_date()
    tracker.add_current_attempt_and_save()

    by_court = court_attr is not None
    if by_court:
        # One directory per jurisdiction; existing ones are left alone.
        for court in courts:
            mkdir_p(join(bulk_dir, obj_type_str, court.pk))
    else:
        # A single directory for the object type.
        mkdir_p(join(bulk_dir, obj_type_str))

    if last_good_date is None:
        print("   - Incremental data not found. Working from scratch...")
        queryset = obj_class.objects.all()
    else:
        print("   - Incremental data found. Assuming it's good and using it...")
        queryset = obj_class.objects.filter(date_modified__gte=last_good_date)

    # Guard clause: nothing to write.
    if queryset.count() == 0:
        print("   - No %s-type items in the DB or none that have changed. All "
              "done here." % obj_type_str)
        tracker.mark_success_and_save()
        return 0

    # Court objects don't have ints for ids, so they are iterated directly
    # instead of going through queryset_generator.
    items = queryset_generator(queryset) if type(queryset[0].pk) is int \
        else queryset

    renderer = JSONRenderer()
    request = RequestFactory().request()
    request.META.update({
        'SERVER_NAME': 'www.courtlistener.com',  # Else, it's testserver
        'SERVER_PORT': '443',  # Else, it's 80
        'wsgi.url_scheme': 'https',  # Else, it's http.
    })
    request.version = 'v3'
    request.versioning_scheme = URLPathVersioning()
    context = {'request': request}

    total = 0
    for done, item in enumerate(items):
        if done % 1000 == 0:
            print("Completed %s items so far." % done)

        payload = renderer.render(
            serializer(item, context=context).data,
            accepted_media_type='application/json; indent=2',
        )

        filename = '%s.json' % item.pk
        if by_court:
            path = join(bulk_dir, obj_type_str, deepgetattr(item, court_attr),
                        filename)
        else:
            # A non-jurisdiction-centric object.
            path = join(bulk_dir, obj_type_str, filename)

        with open(path, 'wb') as out:
            out.write(payload)
        total = done + 1

    print('   - %s %s json files created.' % (total, obj_type_str))

    tracker.mark_success_and_save()
    return total
Ejemplo n.º 5
0
 def setUp(self):
     """Attach a BulkJsonHistory for 'test' in settings.BULK_DATA_DIR."""
     bulk_dir = settings.BULK_DATA_DIR
     self.history = BulkJsonHistory('test', bulk_dir)
Ejemplo n.º 6
0
 def setUp(self):
     """Build the BulkJsonHistory('test') object that the tests use."""
     test_history = BulkJsonHistory('test')
     self.history = test_history
Ejemplo n.º 7
0
class BulkJsonHistoryTest(TestCase):
    """Checks loading, success dates and attempt dates on BulkJsonHistory."""

    def setUp(self):
        self.history = BulkJsonHistory('test')

    def tearDown(self):
        # Have the history object delete what it wrote during the test.
        history = self.history
        history.delete_from_disk()

    def test_load_the_file(self):
        # Loading without any prior save is expected to give an empty dict.
        self.assertEqual({}, self.history.load_json_file())

    def test_load_date_when_none(self):
        # No success has been marked, so no date should come back.
        last_good = self.history.get_last_good_date()
        self.assertIsNone(last_good)

    def test_set_date_then_load_it(self):
        self.history.add_current_attempt_and_save()
        self.history.mark_success_and_save()
        # The date serialized is within ten seconds of now.
        self.assertAlmostEqual(
            self.history.get_last_good_date(),
            now(),
            delta=timedelta(seconds=10),
        )

    def test_add_current_attempt(self):
        self.history.add_current_attempt_and_save()
        # The attempt just recorded is within ten seconds of now.
        self.assertAlmostEqual(
            self.history.get_last_attempt(),
            now(),
            delta=timedelta(seconds=10),
        )
Ejemplo n.º 8
0
 def setUp(self) -> None:
     """Attach a BulkJsonHistory for "test" in settings.BULK_DATA_DIR."""
     history = BulkJsonHistory("test", settings.BULK_DATA_DIR)
     self.history = history
Ejemplo n.º 9
0
def write_json_to_disk(courts,
                       obj_type_str,
                       obj_class,
                       court_attr,
                       serializer,
                       bulk_dir=join(settings.BULK_DATA_DIR, 'tmp')):
    """Write all items to disk as json files inside directories named by
    jurisdiction.

    The main trick is that we identify if we are creating a bulk archive
    from scratch. If so, we iterate over everything. If not, we only
    iterate over items that have been modified since the last good date.

    We deal with two kinds of bulk data. The first is jurisdiction-centric, in
    which we want to make bulk data for that particular jurisdiction, such as
    opinions or PACER data, or whatever. The second is non-jurisdiction-
    specific, like people or schools. For jurisdiction-specific data, we make
    jurisdiction directories to put the data into. Otherwise, we do not.

    :param courts: Court objects that you expect to make data for.
    :param obj_type_str: A string to use for the directory name of a type of
    data. For example, for clusters, it's 'clusters'.
    :param obj_class: The actual class to make a bulk data for.
    :param court_attr: A string that can be used to find the court attribute
    on an object. For example, on clusters, this is currently docket.court_id.
    Pass None for data that is not organized by jurisdiction.
    :param serializer: A DRF serializer to use to generate the data.
    :param bulk_dir: A directory to place the serialized JSON data into. The
    default is computed once, when this function is defined.

    :returns int: The number of items generated
    """
    # Are there already bulk files?
    history = BulkJsonHistory(obj_type_str, bulk_dir)
    last_good_date = history.get_last_good_date()
    history.add_current_attempt_and_save()

    if court_attr is not None:
        # Create a directory for every jurisdiction, if they don't already
        # exist. This does not clobber.
        for court in courts:
            mkdir_p(join(
                bulk_dir,
                obj_type_str,
                court.pk,
            ))
    else:
        # Make a directory for the object type.
        mkdir_p(join(bulk_dir, obj_type_str))

    if last_good_date is not None:
        print(
            "   - Incremental data found. Assuming it's good and using it...")
        qs = obj_class.objects.filter(date_modified__gte=last_good_date)
    else:
        print("   - Incremental data not found. Working from scratch...")
        qs = obj_class.objects.all()

    if qs.count() == 0:
        print("   - No %s-type items in the DB or none that have changed. All "
              "done here." % obj_type_str)
        history.mark_success_and_save()
        return 0

    if isinstance(qs[0].pk, int):
        item_list = queryset_generator(qs)
    else:
        # Necessary for Court objects, which don't have ints for ids.
        item_list = qs

    # Build a fake request so the serializer can produce absolute URLs.
    renderer = JSONRenderer()
    r = RequestFactory().request()
    r.META['SERVER_NAME'] = 'www.courtlistener.com'  # Else, it's testserver
    # Else, it's 80. With the https scheme set below, a port of 80 is not the
    # scheme's default, so the host would be rebuilt with an explicit ":80".
    r.META['SERVER_PORT'] = '443'
    r.META['wsgi.url_scheme'] = 'https'  # Else, it's http.
    r.version = 'v3'
    r.versioning_scheme = URLPathVersioning()
    context = dict(request=r)

    i = 0
    for item in item_list:
        if i % 1000 == 0:
            print("Completed %s items so far." % i)
        json_str = renderer.render(
            serializer(item, context=context).data,
            accepted_media_type='application/json; indent=2',
        )

        if court_attr is not None:
            loc = join(bulk_dir, obj_type_str,
                       deepgetattr(item, court_attr), '%s.json' % item.pk)
        else:
            # A non-jurisdiction-centric object.
            loc = join(bulk_dir, obj_type_str, '%s.json' % item.pk)

        with open(loc, 'wb') as f:
            f.write(json_str)
        i += 1

    print('   - %s %s json files created.' % (i, obj_type_str))

    history.mark_success_and_save()
    return i
Ejemplo n.º 10
0
 def setUp(self):
     """Create the history object under test in settings.BULK_DATA_DIR."""
     target_dir = settings.BULK_DATA_DIR
     history = BulkJsonHistory('test', target_dir)
     self.history = history