def make_citation_data(tmp_destination):
    """Dump the citation table to ``all.csv.gz`` inside ``tmp_destination``.

    Because citations are paginated and because as of this moment there are
    11M citations in the database, we cannot provide users with a bulk data
    file containing the complete objects for every citation.

    Instead of doing that, we dump our citation table with psql's COPY
    command, which provides people with compact and reasonable data they can
    import.

    :param tmp_destination: Directory that receives ``all.csv.gz``. It is
    created if it does not exist yet.
    :return: None. A failing psql or gzip process is logged as an error
    instead of being reported as a success.
    """
    import subprocess

    mkdir_p(tmp_destination)

    logger.info(' - Copying the citations table to disk...')
    default_db = settings.DATABASES['default']

    # The password travels in psql's environment rather than in a shell
    # string, so the configured value is really used and never needs quoting.
    psql_env = dict(os.environ, PGPASSWORD=default_db['PASSWORD'] or '')

    # Ask psql's COPY command to stream the citation table to stdout as CSV.
    psql_command = [
        'psql',
        '-c',
        'COPY "search_opinionscited" (citing_opinion_id, cited_opinion_id) '
        'to stdout DELIMITER \',\' CSV HEADER',
        '--dbname', default_db['NAME'],
        '--username', default_db['USER'],
    ]
    if default_db['HOST']:
        # An empty HOST means "use the default connection"; passing an empty
        # --host value would only confuse the argument parsing.
        psql_command.extend(['--host', default_db['HOST']])

    with open(join(tmp_destination, 'all.csv.gz'), 'wb') as outfile:
        psql = subprocess.Popen(
            psql_command,
            stdout=subprocess.PIPE,
            env=psql_env,
        )
        gzip_proc = subprocess.Popen(
            ['gzip'],
            stdin=psql.stdout,
            stdout=outfile,
        )
        # Close our copy of the pipe so psql notices if gzip exits early.
        psql.stdout.close()
        gzip_exit = gzip_proc.wait()
        psql_exit = psql.wait()

    if psql_exit or gzip_exit:
        logger.error(
            ' - Citation dump failed (psql exit: %s, gzip exit: %s).',
            psql_exit,
            gzip_exit,
        )
        return
    logger.info(' - Table created successfully.')
def cp_pr_file_to_bulk_dir(result_file_path, chown):
    """Copy the pagerank file to the bulk data directory for public analysis.

    :param result_file_path: Path of the pagerank result file to publish.
    :param chown: When truthy, give ownership of the published copy to the
    ``www-data`` user and group.
    :return: None
    """
    mkdir_p(settings.BULK_DATA_DIR)  # The dir doesn't always already exist.
    shutil.copy(result_file_path, settings.BULK_DATA_DIR)
    if not chown:
        return

    # shutil.copy() into a directory keeps the source's base name, so build
    # the published path from it. os.path.join also copes with a
    # BULK_DATA_DIR that has no trailing slash, which plain string
    # concatenation did not.
    published_path = os.path.join(
        settings.BULK_DATA_DIR,
        os.path.basename(result_file_path),
    )
    user_info = pwd.getpwnam('www-data')
    os.chown(published_path, user_info.pw_uid, user_info.pw_gid)
def get_from_ia(reporter, volume):
    """Download cases from internet archive via case law and write them to
    disk.

    :param reporter: (str) Requires a reporter abbreviation to identify
    cases to download as used by IA.  (Ex. T.C. => tc)
    :param volume: (int or str) Specific volume number of the reporter.  If
    None, the function will cycle through all volumes of the reporter on IA.
    :return: None
    """
    reporter_key = ".".join(['law.free.cap', reporter])

    # Identifier segments are strings, so normalise the requested volume
    # once; comparing an int against them would never match.
    wanted_volume = None if volume is None else str(volume)

    for ia_identifier in search_items(reporter_key):
        ia_key = ia_identifier['identifier']
        key_parts = ia_key.split(".")

        # Checks that the returned reporter is the requested one.
        # Ex. searching for Mich will return both Mich-app. and Mich.
        # Identifiers too short to carry a reporter segment are skipped
        # rather than raising IndexError.
        if len(key_parts) < 4 or key_parts[3] != reporter:
            continue

        # Checks if we requested a specific volume of the reporter and if so
        # skips all other volumes of that reporter.
        if wanted_volume is not None and key_parts[-1] != wanted_volume:
            continue

        for item in get_files(ia_key):
            # Only the JSON files are wanted, and "json.json" files are
            # deliberately left out.
            if "json.json" in item.name or "json" not in item.name:
                continue

            url = "https://archive.org/download/%s/%s" % (ia_key, item.name)
            file_path = os.path.join(
                settings.MEDIA_ROOT,
                'harvard_corpus',
                '%s' % ia_key,
                '%s' % item.name,
            )
            if os.path.exists(file_path):
                logger.info("Already captured: %s", url)
                continue

            logger.info("Capturing: %s", url)
            mkdir_p(os.path.dirname(file_path))
            data = requests.get(url, timeout=10).json()
            with open(file_path, 'w') as outfile:
                json.dump(data, outfile, indent=2)
def swap_archives(obj_type_str, bulk_dir, tmp_bulk_dir):
    """Swap out new archives, clobbering the old, if present.

    :param obj_type_str: Name of the per-type sub-directory to swap.
    :param bulk_dir: Directory the finished archives are moved into.
    :param tmp_bulk_dir: Directory the new archives are taken from.
    :return: None
    """
    source_dir = join(tmp_bulk_dir, obj_type_str)
    target_dir = join(bulk_dir, obj_type_str)
    mkdir_p(target_dir)

    archive_pattern = join(source_dir, '*.tar*')
    for archive_path in glob.glob(archive_pattern):
        archive_name = os.path.basename(archive_path)
        shutil.move(archive_path, join(target_dir, archive_name))

    # The info file goes along too (as a copy); it is fine if there is none.
    info_source = join(source_dir, 'info.json')
    info_target = join(target_dir, 'info.json')
    try:
        shutil.copy2(info_source, info_target)
    except IOError as err:
        if err.errno != 2:  # Anything but "No such file/directory"
            raise
def swap_archives(obj_type_str, bulk_dir, tmp_bulk_dir):
    """Replace the published archives of one object type with new ones.

    Every ``*.tar*`` file under ``tmp_bulk_dir/obj_type_str`` is moved to
    ``bulk_dir/obj_type_str`` and ``info.json`` is copied next to them.

    :param obj_type_str: Name of the per-type sub-directory.
    :param bulk_dir: Root directory of the published bulk data.
    :param tmp_bulk_dir: Root directory holding the newly built data.
    :return: None
    """
    staging = join(tmp_bulk_dir, obj_type_str)
    published = join(bulk_dir, obj_type_str)
    mkdir_p(published)

    for tarball in glob.glob(join(staging, "*.tar*")):
        destination = join(published, os.path.basename(tarball))
        shutil.move(tarball, destination)

    # Bring the info file over as well.
    try:
        shutil.copy2(join(staging, "info.json"), join(published, "info.json"))
    except IOError as exc:
        missing_file = exc.errno == 2  # No such file/directory
        if not missing_file:
            raise
def swap_archives(obj_type_str):
    """Swap out new archives, clobbering the old, if present.

    Archives are taken from ``BULK_DATA_DIR/tmp/<obj_type_str>`` and placed
    in ``BULK_DATA_DIR/<obj_type_str>``.

    :param obj_type_str: Name of the per-type sub-directory to swap.
    :return: None
    """
    final_dir = join(settings.BULK_DATA_DIR, obj_type_str)
    tmp_dir = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str)
    mkdir_p(final_dir)

    for gz_path in glob.glob(join(tmp_dir, '*.tar*')):
        shutil.move(gz_path, join(final_dir, os.path.basename(gz_path)))

    # The info file is copied over too; a missing one is not an error.
    try:
        shutil.copy2(join(tmp_dir, 'info.json'), join(final_dir, 'info.json'))
    except IOError as err:
        if err.errno != 2:  # 2 == No such file/directory
            raise
def handle(self, *args, **options):
    """Render opinions to HTML files grouped by filing date.

    Opinions matching all three "empty or NULL" conditions on ``html``,
    ``html_lawbox`` and ``html_columbia`` are excluded. Every other opinion
    is rendered with the ``simple_opinion.html`` template and written to
    ``<output_directory>/<year>/<month>/<day>/<pk>.html``.

    :param options: Parsed command options; ``output_directory`` is the
    root directory of the export.
    :return: None
    """
    super(Command, self).handle(*args, **options)
    ops = queryset_generator(
        Opinion.objects.exclude(
            Q(html="") | Q(html=None),
            Q(html_lawbox="") | Q(html_lawbox=None),
            Q(html_columbia="") | Q(html_columbia=None),
        )
    )
    for op in ops:
        content = render_to_string("simple_opinion.html", {"o": op})
        date_filed = op.cluster.date_filed
        output_dir = os.path.join(
            options["output_directory"],
            str(date_filed.year),
            str(date_filed.month),
            str(date_filed.day),
        )
        mkdir_p(output_dir)
        output_path = os.path.join(output_dir, f"{op.pk}.html")
        # The content is encoded to bytes, so the file must be opened in
        # binary mode; a text-mode file rejects bytes with a TypeError.
        with open(output_path, "wb") as f:
            f.write(content.encode())
def make_citation_data(tmp_destination, obj_type_str):
    """Dump the citation table to ``all.csv.gz`` inside ``tmp_destination``.

    Because citations are paginated and because as of this moment there are
    11M citations in the database, we cannot provide users with a bulk data
    file containing the complete objects for every citation.

    Instead of doing that, we dump our citation table with psql's COPY
    command, which provides people with compact and reasonable data they can
    import.

    :param tmp_destination: Directory that receives ``all.csv.gz``. It is
    created if it does not exist yet.
    :param obj_type_str: Unused; kept so existing callers keep working.
    :return: None. A failing psql or gzip process is reported on stdout
    instead of being announced as a success.
    """
    import subprocess

    mkdir_p(tmp_destination)

    print('   - Copying the citations table to disk...')
    default_db = settings.DATABASES['default']

    # The password travels in psql's environment rather than in a shell
    # string, so the configured value is really used and never needs quoting.
    psql_env = dict(os.environ, PGPASSWORD=default_db['PASSWORD'] or '')

    # Ask psql's COPY command to stream the citation table to stdout as CSV.
    psql_command = [
        'psql',
        '-c',
        'COPY "search_opinionscited" (citing_opinion_id, cited_opinion_id) '
        'to stdout DELIMITER \',\' CSV HEADER',
        '-d', default_db['NAME'],
        '--username', default_db['USER'],
    ]

    with open(join(tmp_destination, 'all.csv.gz'), 'wb') as outfile:
        psql = subprocess.Popen(
            psql_command,
            stdout=subprocess.PIPE,
            env=psql_env,
        )
        gzip_proc = subprocess.Popen(
            ['gzip'],
            stdin=psql.stdout,
            stdout=outfile,
        )
        # Close our copy of the pipe so psql notices if gzip exits early.
        psql.stdout.close()
        gzip_exit = gzip_proc.wait()
        psql_exit = psql.wait()

    if psql_exit or gzip_exit:
        print('   - Citation dump failed (psql exit: %s, gzip exit: %s).' % (
            psql_exit, gzip_exit))
        return
    print('   - Table created successfully.')
def handle(self, *args, **options):
    """Render opinions to HTML files grouped by filing date.

    Each opinion that survives the ``exclude`` filter is rendered with the
    ``simple_opinion.html`` template and written to
    ``<output_directory>/<year>/<month>/<day>/<pk>.html``.

    :param options: Parsed command options; ``output_directory`` is the
    root directory of the export.
    :return: None
    """
    super(Command, self).handle(*args, **options)

    no_html = Q(html='') | Q(html=None)
    no_lawbox = Q(html_lawbox='') | Q(html_lawbox=None)
    no_columbia = Q(html_columbia='') | Q(html_columbia=None)
    queryset = Opinion.objects.exclude(no_html, no_lawbox, no_columbia)

    for op in queryset_generator(queryset):
        content = render_to_string('simple_opinion.html', {'o': op})
        date_parts = [
            str(op.cluster.date_filed.year),
            str(op.cluster.date_filed.month),
            str(op.cluster.date_filed.day),
        ]
        output_dir = os.path.join(options['output_directory'], *date_parts)
        mkdir_p(output_dir)

        file_name = '%s.html' % op.pk
        with open(os.path.join(output_dir, file_name), 'w') as f:
            f.write(content.encode('utf-8'))
def swap_archives(obj_type_str):
    """Replace the published archives of one object type with new ones.

    Every ``*.tar*`` file in ``BULK_DATA_DIR/tmp/<obj_type_str>`` is moved
    to ``BULK_DATA_DIR/<obj_type_str>`` and ``info.json`` is copied along.

    :param obj_type_str: Name of the per-type sub-directory.
    :return: None
    """
    root = settings.BULK_DATA_DIR
    published = join(root, obj_type_str)
    mkdir_p(published)

    for tarball in glob.glob(join(root, 'tmp', obj_type_str, '*.tar*')):
        tarball_name = os.path.basename(tarball)
        shutil.move(tarball, join(root, obj_type_str, tarball_name))

    # Bring the info file over as well.
    info_source = join(root, 'tmp', obj_type_str, 'info.json')
    info_target = join(root, obj_type_str, 'info.json')
    try:
        shutil.copy2(info_source, info_target)
    except IOError as exc:
        missing_file = exc.errno == 2  # No such file/directory
        if not missing_file:
            raise
def handle(self, *args, **options):
    """Export opinions as standalone HTML files.

    The files land in a ``year/month/day`` tree, taken from the cluster's
    filing date, below the ``output_directory`` option. Each file is named
    after the opinion's primary key.

    :param options: Parsed command options, including ``output_directory``.
    :return: None
    """
    super(Command, self).handle(*args, **options)

    opinions = Opinion.objects.exclude(
        Q(html='') | Q(html=None),
        Q(html_lawbox='') | Q(html_lawbox=None),
        Q(html_columbia='') | Q(html_columbia=None),
    )
    for op in queryset_generator(opinions):
        context = {'o': op}
        content = render_to_string('simple_opinion.html', context)

        year = str(op.cluster.date_filed.year)
        month = str(op.cluster.date_filed.month)
        day = str(op.cluster.date_filed.day)
        output_dir = os.path.join(options['output_directory'], year, month, day)
        mkdir_p(output_dir)

        output_path = os.path.join(output_dir, '%s.html' % op.pk)
        encoded = content.encode('utf-8')
        with open(output_path, 'w') as f:
            f.write(encoded)
def get_from_ia(reporter, volume):
    """Download cases from internet archive via case law and write them to
    disk.

    :param reporter: (str) Requires a reporter abbreviation to identify
    cases to download as used by IA.  (Ex. T.C. => tc)
    :param volume: (int or str) Specific volume number of the reporter.  If
    None, the function will cycle through all volumes of the reporter on IA.
    :return: None
    """
    logger.info("Creating IA session...")
    ia_session = ia.get_session(
        {
            "s3": {
                "access": settings.IA_ACCESS_KEY,
                "secret": settings.IA_SECRET_KEY,
            }
        }
    )

    reporter_key = ".".join(["law.free.cap", reporter])

    # Identifier segments are strings, so normalise the requested volume
    # once; comparing an int against them would never match.
    wanted_volume = None if volume is None else str(volume)

    for ia_identifier in ia_session.search_items(reporter_key):
        logger.info("Got ia identifier: %s", ia_identifier)
        ia_key = ia_identifier["identifier"]
        key_parts = ia_key.split(".")

        # Checks that the returned reporter is the requested one.
        # Ex. searching for Mich will return both Mich-app. and Mich.
        # Identifiers too short to carry a reporter segment are skipped
        # rather than raising IndexError.
        if len(key_parts) < 4 or key_parts[3] != reporter:
            continue

        # Checks if we requested a specific volume of the reporter and if so
        # skips all other volumes of that reporter.
        if wanted_volume is not None and key_parts[-1] != wanted_volume:
            continue

        ia_item = ia_session.get_item(ia_key)
        for item in ia_item.get_files():
            logger.info("Got item with name: %s", item.name)
            # Only the JSON files are wanted, and "json.json" files are
            # deliberately left out.
            if "json.json" in item.name or "json" not in item.name:
                continue

            url = "https://archive.org/download/%s/%s" % (ia_key, item.name)
            file_path = os.path.join(
                settings.MEDIA_ROOT,
                "harvard_corpus",
                "%s" % ia_key,
                "%s" % item.name,
            )
            if os.path.exists(file_path):
                logger.info("Already captured: %s", url)
                continue

            logger.info("Capturing: %s", url)
            mkdir_p(os.path.dirname(file_path))
            data = requests.get(url, timeout=10).json()
            with open(file_path, "w") as outfile:
                json.dump(data, outfile, indent=2)
def handle(self, *args, **options):
    """Queue the bulk-data jobs and build the citation CSV.

    One ``make_bulk_data_and_swap_it_in`` task is queued per object type
    with ``.delay``. The citation CSV is then generated in the ``tmp`` area
    of ``BULK_DATA_DIR`` and moved into its final directory.

    :return: None
    """
    courts = Court.objects.all()

    # Make the main bulk files. Columns: directory name, model, dotted path
    # to the court id (None for non-jurisdiction data), serializer.
    bulk_specs = (
        ('clusters', OpinionCluster, 'docket.court_id',
         OpinionClusterSerializer),
        ('opinions', Opinion, 'cluster.docket.court_id', OpinionSerializer),
        ('dockets', Docket, 'court_id', DocketSerializer),
        ('courts', Court, None, CourtSerializer),
        ('audio', Audio, 'docket.court_id', AudioSerializer),
        ('people', Person, None, PersonSerializer),
        ('schools', School, None, SchoolSerializer),
        ('positions', Position, None, PositionSerializer),
        ('retention-events', RetentionEvent, None, RetentionEventSerializer),
        ('educations', Education, None, EducationSerializer),
        ('politicial-affiliations', PoliticalAffiliation, None,
         PoliticalAffiliationSerializer),
    )
    spec_keys = ('obj_type_str', 'obj_type', 'court_attr', 'serializer')
    kwargs_list = [dict(zip(spec_keys, spec)) for spec in bulk_specs]

    task_count = len(kwargs_list)
    print('Starting bulk file creation with %s celery tasks...' % task_count)
    for kwargs in kwargs_list:
        make_bulk_data_and_swap_it_in.delay(courts, kwargs)

    # Make the citation bulk data
    obj_type_str = 'citations'
    print(' - Creating bulk data CSV for citations...')
    tmp_destination = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str)
    final_destination = join(settings.BULK_DATA_DIR, obj_type_str)
    self.make_citation_data(tmp_destination, obj_type_str)

    print("   - Swapping in the new citation archives...")
    mkdir_p(final_destination)
    csv_name = 'all.csv.gz'
    shutil.move(
        join(tmp_destination, csv_name),
        join(final_destination, csv_name),
    )
    print('Done.\n')
def write_json_to_disk(courts, obj_type_str, obj_type, court_attr, serializer):
    """Write all items to disk as json files inside directories named by
    jurisdiction.

    The main trick is that we identify if we are creating a bulk archive
    from scratch. If so, we iterate over everything. If not, we only
    iterate over items that have been modified since the last good date.

    :param courts: Court objects; one directory is created per court when
    the data is jurisdiction-centric.
    :param obj_type_str: Directory name for this type of data, e.g.
    'clusters'.
    :param obj_type: The model class to serialize.
    :param court_attr: Dotted attribute path to the court id on an item, or
    None for data that is not tied to a jurisdiction.
    :param serializer: The serializer class used to render each item.
    :returns int: The number of json files written.
    """
    # Are there already bulk files?
    history = BulkJsonHistory(obj_type_str)
    last_good_date = history.get_last_good_date()
    history.add_current_attempt_and_save()

    type_dir = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str)
    if court_attr is not None:
        # Create a directory for every jurisdiction, if they don't already
        # exist. This does not clobber.
        for court in courts:
            mkdir_p(join(type_dir, court.pk))
    else:
        # Non-jurisdiction data is written straight into the type
        # directory, so that directory has to exist before the first open().
        mkdir_p(type_dir)

    if last_good_date is not None:
        print("   - Incremental data found. Assuming it's good and using it...")
        qs = obj_type.objects.filter(date_modified__gte=last_good_date)
    else:
        print("   - Incremental data not found. Working from scratch...")
        qs = obj_type.objects.all()

    if qs.count() == 0:
        print("   - No %s-type items in the DB or none that have changed. All done here." % obj_type_str)
        history.mark_success_and_save()
        return 0

    if isinstance(qs[0].pk, int):
        item_list = queryset_generator(qs)
    else:
        # Necessary for Court objects, which don't have ints for ids.
        item_list = qs

    # The serializers need a request in their context; fake one.
    renderer = JSONRenderer()
    r = RequestFactory().request()
    r.META['SERVER_NAME'] = 'www.courtlistener.com'  # Else, it's testserver
    r.version = 'v3'
    r.versioning_scheme = URLPathVersioning()
    context = dict(request=r)

    i = 0
    for item in item_list:
        json_str = renderer.render(
            serializer(item, context=context).data,
            accepted_media_type='application/json; indent=2',
        )

        if court_attr is not None:
            loc = join(type_dir, deepgetattr(item, court_attr),
                       '%s.json' % item.pk)
        else:
            # A non-jurisdiction-centric object.
            loc = join(type_dir, '%s.json' % item.pk)

        with open(loc, 'wb') as f:
            f.write(json_str)
        i += 1

    print('   - %s %s json files created.' % (i, obj_type_str))

    history.mark_success_and_save()
    return i
def handle(self, *args, **options):
    """Queue the bulk-data jobs and build the citation CSV.

    One ``make_bulk_data_and_swap_it_in`` task is queued per enabled object
    type with ``.delay``. The citation CSV is then generated in the ``tmp``
    area of ``BULK_DATA_DIR`` and moved into its final directory.

    :return: None
    """
    courts = Court.objects.all()

    # Make the main bulk files. Columns: directory name, model, dotted path
    # to the court id (None for non-jurisdiction data), serializer.
    bulk_specs = (
        ('clusters', OpinionCluster, 'docket.court_id',
         OpinionClusterSerializer),
        ('opinions', Opinion, 'cluster.docket.court_id', OpinionSerializer),
        ('dockets', Docket, 'court_id', DocketSerializer),
        ('courts', Court, None, CourtSerializer),
        ('audio', Audio, 'docket.court_id', AudioSerializer),
        # has_beta_api_access
        # ('judges', Judge, None, JudgeSerializer),
        # ('positions', Position, None, PositionSerializer),
        # ('politicians', Politician, None, PoliticianSerializer),
        # ('retention-events', RetentionEvent, None,
        #  RetentionEventSerializer),
        # ('educations', Education, None, EducationSerializer),
        # ('schools', School, None, SchoolSerializer),
        # ('careers', Career, None, CareerSerializer),
        # ('titles', Title, None, TitleSerializer),
        # ('politicial-affiliations', PoliticalAffiliation, None,
        #  PoliticalAffiliationSerializer),
    )
    spec_keys = ('obj_type_str', 'obj_type', 'court_attr', 'serializer')
    kwargs_list = [dict(zip(spec_keys, spec)) for spec in bulk_specs]

    task_count = len(kwargs_list)
    print('Starting bulk file creation with %s celery tasks...' % task_count)
    for kwargs in kwargs_list:
        make_bulk_data_and_swap_it_in.delay(courts, kwargs)

    # Make the citation bulk data
    obj_type_str = 'citations'
    print(' - Creating bulk data CSV for citations...')
    tmp_destination = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str)
    final_destination = join(settings.BULK_DATA_DIR, obj_type_str)
    self.make_citation_data(tmp_destination, obj_type_str)

    print("   - Swapping in the new citation archives...")
    mkdir_p(final_destination)
    csv_name = 'all.csv.gz'
    shutil.move(
        join(tmp_destination, csv_name),
        join(final_destination, csv_name),
    )
    print('Done.\n')
def save_to_disk(self):
    """Write ``self.json`` to ``self.path`` as JSON indented by two spaces.

    The parent directory of ``self.path`` is created first when the path
    has one.

    :return: None
    """
    import os

    # os.path.dirname() yields '' for a bare file name and '/' for a file in
    # the root directory. Splitting on '/' by hand returned the file name
    # itself in the first case, which created a directory with the file's
    # name, and an empty string in the second.
    parent_dir = os.path.dirname(self.path)
    if parent_dir:
        mkdir_p(parent_dir)
    with open(self.path, 'w') as f:
        json.dump(self.json, f, indent=2)
def write_json_to_disk(courts, obj_type_str, obj_class, court_attr,
                       serializer, bulk_dir):
    """Serialize items to individual json files below ``bulk_dir``.

    When a previous successful run is on record, only items modified since
    that date are written; otherwise every item is written.

    There are two layouts. Jurisdiction-centric data (opinions, dockets,
    and so on) is placed in one directory per court. Data that is not tied
    to a jurisdiction (people, schools, and so on) is placed directly in
    the directory of its type.

    :param courts: Court objects that you expect to make data for.
    :param obj_type_str: A string to use for the directory name of a type
    of data. For example, for clusters, it's 'clusters'.
    :param obj_class: The actual class to make a bulk data for.
    :param court_attr: A string that can be used to find the court
    attribute on an object. For example, on clusters, this is currently
    docket.court_id.
    :param serializer: A DRF serializer to use to generate the data.
    :param bulk_dir: A directory to place the serialized JSON data into.
    :returns int: The number of items generated
    """
    # Look up the previous run and record that a new attempt has started.
    history = BulkJsonHistory(obj_type_str, bulk_dir)
    last_good_date = history.get_last_good_date()
    history.add_current_attempt_and_save()

    by_court = court_attr is not None
    if by_court:
        # One directory per jurisdiction; existing ones are left alone.
        for court in courts:
            mkdir_p(join(bulk_dir, obj_type_str, court.pk))
    else:
        # A single directory for the whole object type.
        mkdir_p(join(bulk_dir, obj_type_str))

    if last_good_date is None:
        print("   - Incremental data not found. Working from scratch...")
        qs = obj_class.objects.all()
    else:
        print("   - Incremental data found. Assuming it's good and using it...")
        qs = obj_class.objects.filter(date_modified__gte=last_good_date)

    if qs.count() == 0:
        print("   - No %s-type items in the DB or none that have changed. All "
              "done here." % obj_type_str)
        history.mark_success_and_save()
        return 0

    # Court objects don't have ints for ids, so they are iterated directly.
    if type(qs[0].pk) == int:
        item_list = queryset_generator(qs)
    else:
        item_list = qs

    # The serializers need a request in their context; build a fake one.
    r = RequestFactory().request()
    r.META['SERVER_NAME'] = 'www.courtlistener.com'  # Else, it's testserver
    r.META['SERVER_PORT'] = '443'  # Else, it's 80
    r.META['wsgi.url_scheme'] = 'https'  # Else, it's http.
    r.version = 'v3'
    r.versioning_scheme = URLPathVersioning()
    context = dict(request=r)
    renderer = JSONRenderer()

    written = 0
    for item in item_list:
        if written % 1000 == 0:
            print("Completed %s items so far." % written)

        json_str = renderer.render(
            serializer(item, context=context).data,
            accepted_media_type='application/json; indent=2',
        )

        file_name = '%s.json' % item.pk
        if by_court:
            court_dir = deepgetattr(item, court_attr)
            loc = join(bulk_dir, obj_type_str, court_dir, file_name)
        else:
            # A non-jurisdiction-centric object.
            loc = join(bulk_dir, obj_type_str, file_name)

        with open(loc, 'wb') as f:
            f.write(json_str)
        written += 1

    print('   - %s %s json files created.' % (written, obj_type_str))

    history.mark_success_and_save()
    return written
def handle(self, *args, **options):
    """Build the bulk data for every object type, then the citation CSV.

    ``make_bulk_data_and_swap_it_in`` is called directly for each object
    type. The citation CSV is then generated in the ``tmp`` area of
    ``BULK_DATA_DIR`` and moved into its final directory.

    :return: None
    """
    super(Command, self).handle(*args, **options)
    courts = Court.objects.all()

    # Columns: directory name, model, dotted path to the court id (None for
    # non-jurisdiction data), serializer.
    bulk_specs = (
        ("clusters", OpinionCluster, "docket.court_id",
         OpinionClusterSerializer),
        ("opinions", Opinion, "cluster.docket.court_id", OpinionSerializer),
        ("dockets", Docket, "court_id", DocketSerializer),
        ("courts", Court, None, CourtSerializer),
        ("audio", Audio, "docket.court_id", AudioSerializer),
        ("people", Person, None, PersonSerializer),
        ("schools", School, None, SchoolSerializer),
        ("positions", Position, None, PositionSerializer),
        ("retention-events", RetentionEvent, None, RetentionEventSerializer),
        ("educations", Education, None, EducationSerializer),
        ("politicial-affiliations", PoliticalAffiliation, None,
         PoliticalAffiliationSerializer),
    )
    spec_keys = ("obj_type_str", "obj_class", "court_attr", "serializer")
    kwargs_list = [dict(zip(spec_keys, spec)) for spec in bulk_specs]

    logger.info(
        "Starting bulk file creation with %s celery tasks..."
        % len(kwargs_list)
    )
    bulk_root = settings.BULK_DATA_DIR
    for kwargs in kwargs_list:
        make_bulk_data_and_swap_it_in(courts, bulk_root, kwargs)

    # Make the citation bulk data
    obj_type_str = "citations"
    logger.info(" - Creating bulk data CSV for citations...")
    tmp_destination = join(settings.BULK_DATA_DIR, "tmp", obj_type_str)
    final_destination = join(settings.BULK_DATA_DIR, obj_type_str)
    self.make_citation_data(tmp_destination)

    logger.info("   - Swapping in the new citation archives...")
    mkdir_p(final_destination)
    csv_name = "all.csv.gz"
    shutil.move(
        join(tmp_destination, csv_name),
        join(final_destination, csv_name),
    )
    logger.info("Done.\n")
def write_json_to_disk(courts, obj_type_str, obj_class, court_attr,
                       serializer, bulk_dir):
    """Write one json file per item under ``bulk_dir/obj_type_str``.

    If a last good date is on record, only items modified on or after it
    are exported. Without one, everything is exported.

    Jurisdiction-centric data (``court_attr`` is set) goes into a
    sub-directory per court. Non-jurisdiction data (``court_attr`` is None)
    goes straight into the object type's directory.

    :param courts: Court objects that you expect to make data for.
    :param obj_type_str: A string to use for the directory name of a type
    of data. For example, for clusters, it's 'clusters'.
    :param obj_class: The actual class to make a bulk data for.
    :param court_attr: A string that can be used to find the court
    attribute on an object. For example, on clusters, this is currently
    docket.court_id.
    :param serializer: A DRF serializer to use to generate the data.
    :param bulk_dir: A directory to place the serialized JSON data into.
    :returns int: The number of items generated
    """
    # Are there already bulk files?
    history = BulkJsonHistory(obj_type_str, bulk_dir)
    last_good_date = history.get_last_good_date()
    history.add_current_attempt_and_save()

    type_dir = join(bulk_dir, obj_type_str)
    if court_attr is None:
        # Make a directory for the object type.
        mkdir_p(type_dir)
    else:
        # Create a directory for every jurisdiction, if they don't already
        # exist. This does not clobber.
        for court in courts:
            mkdir_p(join(bulk_dir, obj_type_str, court.pk))

    if last_good_date is not None:
        print(
            "   - Incremental data found. Assuming it's good and using it...")
        qs = obj_class.objects.filter(date_modified__gte=last_good_date)
    else:
        print("   - Incremental data not found. Working from scratch...")
        qs = obj_class.objects.all()

    if qs.count() == 0:
        print("   - No %s-type items in the DB or none that have changed. All "
              "done here." % obj_type_str)
        history.mark_success_and_save()
        return 0

    if type(qs[0].pk) == int:
        item_list = queryset_generator(qs)
    else:
        # Necessary for Court objects, which don't have ints for ids.
        item_list = qs

    # Fake the request that the serializers expect in their context.
    fake_request = RequestFactory().request()
    fake_request.META["SERVER_NAME"] = "www.courtlistener.com"  # Else, it's testserver
    fake_request.META["SERVER_PORT"] = "443"  # Else, it's 80
    fake_request.META["wsgi.url_scheme"] = "https"  # Else, it's http.
    fake_request.version = "v3"
    fake_request.versioning_scheme = URLPathVersioning()
    context = dict(request=fake_request)
    renderer = JSONRenderer()

    count = 0
    for item in item_list:
        if count % 1000 == 0:
            print("Completed %s items so far." % count)

        serialized = serializer(item, context=context).data
        json_str = renderer.render(
            serialized,
            accepted_media_type="application/json; indent=2",
        )

        if court_attr is None:
            # A non-jurisdiction-centric object.
            loc = join(bulk_dir, obj_type_str, "%s.json" % item.pk)
        else:
            loc = join(
                bulk_dir,
                obj_type_str,
                deepgetattr(item, court_attr),
                "%s.json" % item.pk,
            )

        with open(loc, "wb") as f:
            f.write(json_str)
        count += 1

    print("   - %s %s json files created." % (count, obj_type_str))

    history.mark_success_and_save()
    return count
def handle(self, *args, **options):
    """Build the bulk data for every object type, then the citation CSV.

    ``make_bulk_data_and_swap_it_in`` is called directly for each object
    type. The citation CSV is then generated in the ``tmp`` area of
    ``BULK_DATA_DIR`` and moved into its final directory.

    :return: None
    """
    super(Command, self).handle(*args, **options)
    courts = Court.objects.all()

    # Columns: directory name, model, dotted path to the court id (None for
    # non-jurisdiction data), serializer.
    bulk_specs = (
        ('clusters', OpinionCluster, 'docket.court_id',
         OpinionClusterSerializer),
        ('opinions', Opinion, 'cluster.docket.court_id', OpinionSerializer),
        ('dockets', Docket, 'court_id', DocketSerializer),
        ('courts', Court, None, CourtSerializer),
        ('audio', Audio, 'docket.court_id', AudioSerializer),
        ('people', Person, None, PersonSerializer),
        ('schools', School, None, SchoolSerializer),
        ('positions', Position, None, PositionSerializer),
        ('retention-events', RetentionEvent, None, RetentionEventSerializer),
        ('educations', Education, None, EducationSerializer),
        ('politicial-affiliations', PoliticalAffiliation, None,
         PoliticalAffiliationSerializer),
    )
    spec_keys = ('obj_type_str', 'obj_class', 'court_attr', 'serializer')
    kwargs_list = [dict(zip(spec_keys, spec)) for spec in bulk_specs]

    logger.info('Starting bulk file creation with %s celery tasks...' %
                len(kwargs_list))
    bulk_root = settings.BULK_DATA_DIR
    for kwargs in kwargs_list:
        make_bulk_data_and_swap_it_in(courts, bulk_root, kwargs)

    # Make the citation bulk data
    obj_type_str = 'citations'
    logger.info(' - Creating bulk data CSV for citations...')
    tmp_destination = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str)
    final_destination = join(settings.BULK_DATA_DIR, obj_type_str)
    self.make_citation_data(tmp_destination)

    logger.info("   - Swapping in the new citation archives...")
    mkdir_p(final_destination)
    csv_name = 'all.csv.gz'
    shutil.move(
        join(tmp_destination, csv_name),
        join(final_destination, csv_name),
    )
    logger.info('Done.\n')
def write_json_to_disk(courts, obj_type_str, obj_type, court_attr,
                       serializer):
    """Write all items to disk as json files inside directories named by
    jurisdiction.

    The main trick is that we identify if we are creating a bulk archive
    from scratch. If so, we iterate over everything. If not, we only
    iterate over items that have been modified since the last good date
    recorded in the bulk history.

    :param courts: Court objects; one directory is created per court when
    the data is jurisdiction-centric.
    :param obj_type_str: Directory name for this type of data, e.g.
    'clusters'.
    :param obj_type: The model class to serialize.
    :param court_attr: Dotted attribute path to the court id on an item, or
    None for data that is not tied to a jurisdiction.
    :param serializer: The serializer class used to render each item.
    :returns int: The number of json files written.
    """
    # Are there already bulk files?
    history = BulkJsonHistory(obj_type_str)
    last_good_date = history.get_last_good_date()
    history.add_current_attempt_and_save()

    type_dir = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str)
    if court_attr is not None:
        # Create a directory for every jurisdiction, if they don't already
        # exist. This does not clobber.
        for court in courts:
            mkdir_p(join(type_dir, court.pk))
    else:
        # Non-jurisdiction data is written straight into the type
        # directory, so that directory has to exist before the first open().
        mkdir_p(type_dir)

    if last_good_date is not None:
        print("   - Incremental data found. Assuming it's good and using it...")
        qs = obj_type.objects.filter(date_modified__gte=last_good_date)
    else:
        print("   - Incremental data not found. Working from scratch...")
        qs = obj_type.objects.all()

    if qs.count() == 0:
        print("   - No %s-type items in the DB or none that have changed. All done here." % obj_type_str)
        history.mark_success_and_save()
        return 0

    if isinstance(qs[0].pk, int):
        item_list = queryset_generator(qs)
    else:
        # Necessary for Court objects, which don't have ints for ids.
        item_list = qs

    # The serializers need a request in their context; fake one.
    renderer = JSONRenderer()
    r = RequestFactory().request()
    r.META['SERVER_NAME'] = 'www.courtlistener.com'  # Else, it's testserver
    r.version = 'v3'
    r.versioning_scheme = URLPathVersioning()
    context = dict(request=r)

    i = 0
    for item in item_list:
        json_str = renderer.render(
            serializer(item, context=context).data,
            accepted_media_type='application/json; indent=2',
        )

        if court_attr is not None:
            loc = join(type_dir, deepgetattr(item, court_attr),
                       '%s.json' % item.pk)
        else:
            # A non-jurisdiction-centric object.
            loc = join(type_dir, '%s.json' % item.pk)

        with open(loc, 'wb') as f:
            f.write(json_str)
        i += 1

    print('   - %s %s json files created.' % (i, obj_type_str))

    history.mark_success_and_save()
    return i