def handle(self, *args, **options):
    """Build a Document queryset from the CLI options and send the
    matching documents to ``update_documents`` for indexing.

    Raises CommandError when the options are contradictory (a specific
    doc_id combined with a range) or when no selection option was
    provided at all.
    """
    both_list_and_endpoints = (
        options.get('doc_id') is not None and
        (options.get('start_id') is not None or
         options.get('end_id') is not None or
         options.get('filed_after') is not None))
    # Bug fix: the original tested ``is None``/``is False`` inside any(),
    # which is true precisely when an option is *absent*, so the "no
    # options given" error could never trigger. Test for provided values.
    no_option = not any([
        options.get('doc_id') is not None,
        options.get('start_id') is not None,
        options.get('end_id') is not None,
        options.get('filed_after') is not None,
        options.get('all') is True,
    ])
    if both_list_and_endpoints or no_option:
        raise CommandError(
            'Please specify either a list of documents, a range of ids, a range of dates, or '
            'everything.')

    if options.get('filed_after'):
        start_date = make_aware(
            datetime.strptime(options['filed_after'], '%Y-%m-%d'), utc)

    index = options['index'].lower()

    # Use query chaining to build the query
    query = Document.objects.all()
    if options.get('doc_id'):
        query = query.filter(pk=options.get('doc_id'))
    if options.get('end_id'):
        query = query.filter(pk__lte=options.get('end_id'))
    if options.get('start_id'):
        query = query.filter(pk__gte=options.get('start_id'))
    if options.get('filed_after'):
        query = query.filter(date_filed__gte=start_date)
    if options.get('all'):
        # Bug fix: was ``Document.object.all()`` — a typo that raised
        # AttributeError whenever --all was used.
        query = Document.objects.all()
    count = query.count()
    docs = queryset_generator(query, chunksize=10000)
    self.update_documents(docs, count, index)
def cleaner(simulate=False, verbose=False):
    """Re-run the anonymize function across the whole corpus.

    The anonymize function was previously missing any documents that
    contained punctuation before or after an ID. This script re-runs the
    function, fixing the error.

    simulate -- when True, never saves anything to the database.
    verbose  -- accepted for interface parity with the other cleaners,
                but not referenced in this function.
    """
    docs = queryset_generator(Document.objects.all())
    for doc in docs:
        text = doc.plain_text
        clean_lines = []
        any_mods = []  # one truthy entry per line the user agreed to fix
        for line in text.split('\n'):
            clean_line, modified = anonymize(line)
            if modified:
                # Ask the operator to confirm each proposed fix; an empty
                # answer defaults to "yes".
                print "Fixing text in document: %s" % doc.pk
                print "Line reads: %s" % line
                fix = raw_input("Fix the line? [Y/n]: ") or 'y'
                if fix.lower() == 'y':
                    clean_lines.append(clean_line)
                    any_mods.append(modified)
                else:
                    # Operator declined: keep the original line.
                    clean_lines.append(line)
            else:
                clean_lines.append(line)
        if not simulate and any(any_mods):
            # Persist the accepted fixes and block the document.
            # NOTE(review): blocking presumably keeps anonymized content
            # out of crawlers — confirm against the Document model docs.
            doc.plain_text = '\n'.join(clean_lines)
            doc.blocked = True
            doc.date_blocked = now()
            doc.save()
def add_or_update_by_datetime(self, dt):
    """Add or update every document retrieved after ``dt``."""
    self.stdout.write("Adding or updating document(s) newer than %s\n" % dt)
    newer_docs = Document.objects.filter(time_retrieved__gt=dt)
    doc_stream = queryset_generator(newer_docs)
    self._chunk_queryset_into_tasks(doc_stream, newer_docs.count())
def write_json_to_disk(obj_type_str, obj_type, court_attr, api_resource_obj, courts): """Write all items to disk as json files inside directories named by jurisdiction. The main trick is that we identify if we are creating a bulk archive from scratch. If so, we iterate over everything. If not, we only iterate over items that have been modified in the last 32 days because it's assumed that the bulk files are generated once per month. """ # Are there already bulk files? incremental = test_if_old_bulk_files_exist(obj_type_str) # Create a directory for every jurisdiction, if they don't already # exist. This does not clobber. for court in courts: mkdir_p(os.path.join( settings.BULK_DATA_DIR, 'tmp', obj_type_str, court.pk, )) if incremental: # Make the archives using updated data from the last 32 days. print " - Incremental data! We assume it's good, and use it..." thirty_two_days_ago = now() - datetime.timedelta(days=32) qs = obj_type.objects.filter(date_modified__gt=thirty_two_days_ago) else: print " - Incremental data not found. Working from scratch..." qs = obj_type.objects.all() item_resource = api_resource_obj() if type(qs[0].pk) == int: item_list = queryset_generator(qs) else: # Necessary for jurisdictions, which don't have ints for ids. item_list = qs i = 0 for item in item_list: json_str = item_resource.serialize( None, item_resource.full_dehydrate( item_resource.build_bundle(obj=item)), 'application/json', ).encode('utf-8') with open(os.path.join( settings.BULK_DATA_DIR, 'tmp', obj_type_str, deepgetattr(item, court_attr), '%s.json' % item.pk), 'wb') as f: f.write(json_str) i += 1 print ' - all %s %s json files created.' % (i, obj_type_str)
def add_or_update_all(self):
    """Index the whole Document corpus.

    Can be run against an empty index or an existing one; documents
    already indexed are updated in place.
    """
    self.stdout.write("Adding or updating all documents...\n")
    doc_stream = queryset_generator(Document.objects.all(), chunksize=5000)
    total_docs = Document.objects.all().count()
    self._chunk_queryset_into_tasks(doc_stream, total_docs)
def add_or_update_by_datetime(self, dt):
    """Add or update every item of ``self.type`` retrieved after ``dt``."""
    self.stdout.write(
        "Adding or updating items(s) newer than %s\n" % dt)
    recent = self.type.objects.filter(time_retrieved__gt=dt)
    item_stream = queryset_generator(recent)
    self._chunk_queryset_into_tasks(item_stream, recent.count())
def write_json_to_disk(obj_type_str, obj_type, court_attr, api_resource_obj, courts): """Write all items to disk as json files inside directories named by jurisdiction. The main trick is that we identify if we are creating a bulk archive from scratch. If so, we iterate over everything. If not, we only iterate over items that have been modified in the last 32 days because it's assumed that the bulk files are generated once per month. """ # Are there already bulk files? incremental = test_if_old_bulk_files_exist(obj_type_str) # Create a directory for every jurisdiction, if they don't already # exist. This does not clobber. for court in courts: mkdir_p( os.path.join( settings.BULK_DATA_DIR, 'tmp', obj_type_str, court.pk, )) if incremental: # Make the archives using updated data from the last 32 days. print " - Incremental data! We assume it's good, and use it..." thirty_two_days_ago = now() - datetime.timedelta(days=32) qs = obj_type.objects.filter(date_modified__gt=thirty_two_days_ago) else: print " - Incremental data not found. Working from scratch..." qs = obj_type.objects.all() item_resource = api_resource_obj() if type(qs[0].pk) == int: item_list = queryset_generator(qs) else: # Necessary for jurisdictions, which don't have ints for ids. item_list = qs i = 0 for item in item_list: json_str = item_resource.serialize( None, item_resource.full_dehydrate(item_resource.build_bundle(obj=item)), 'application/json', ).encode('utf-8') with open( os.path.join(settings.BULK_DATA_DIR, 'tmp', obj_type_str, deepgetattr(item, court_attr), '%s.json' % item.pk), 'wb') as f: f.write(json_str) i += 1 print ' - all %s %s json files created.' % (i, obj_type_str)
def cleaner(simulate=False, verbose=False): docs = queryset_generator(Document.objects.filter(source="R", time_retrieved__gt="2011-06-01")) for doc in docs: original_link = doc.download_url fixed = link_fixer(original_link) doc.download_url = fixed if verbose: print "Changing: " + original_link print " to: " + fixed if not simulate: doc.save()
def add_or_update_all(self):
    """Index the entire corpus for ``self.type``.

    Safe to run against an empty or an existing index; items already
    present are updated.
    """
    self.stdout.write("Adding or updating all items...\n")
    full_qs = self.type.objects.all()
    item_stream = queryset_generator(full_qs, chunksize=5000)
    total = full_qs.count()
    self._chunk_queryset_into_tasks(item_stream, total)
def cleaner(simulate=False, verbose=False): docs = queryset_generator( Document.objects.filter(source='R', time_retrieved__gt='2011-06-01')) for doc in docs: original_link = doc.download_url fixed = link_fixer(original_link) doc.download_url = fixed if verbose: print "Changing: " + original_link print " to: " + fixed if not simulate: doc.save()
def delete_by_datetime(self, dt):
    """Remove every document retrieved after ``dt`` from the index."""
    recent = Document.objects.filter(time_retrieved__gt=dt)
    num_docs = recent.count()
    if self._proceed_with_deletion(num_docs):
        self.stdout.write("Deleting all document(s) newer than %s\n" % dt)
        for document in queryset_generator(recent):
            self.si.delete(document)
        self.si.commit()
def delete_by_datetime(self, dt):
    """Delete every indexed item of ``self.type`` retrieved after ``dt``.

    Relies on the items still being present in the database.
    """
    recent = self.type.objects.filter(time_retrieved__gt=dt)
    num_items = recent.count()
    if proceed_with_deletion(self.stdout, num_items):
        self.stdout.write("Deleting all item(s) newer than %s\n" % dt)
        for obj in queryset_generator(recent):
            self.si.delete(obj)
        self.si.commit()
def delete_data_by_time_and_court(courtID, SIMULATE, delTime=None, VERBOSITY=0): """ Deletes data for a court. If a time is given, uses that time as a constraint. """ if delTime is not None: if VERBOSITY >= 1: print "Deleting data newer than %s for court %s" % (delTime, courtID) count = Document.objects.filter(time_retrieved__gt=delTime, court=courtID).count() if count != 0: docs = queryset_generator(Document.objects.filter(time_retrieved__gt=delTime, court=courtID)) else: if VERBOSITY >= 1: print "Deleting all data for court %s" % courtID count = Document.objects.filter(court=courtID).count() if count != 0: docs = queryset_generator(Document.objects.filter(court=courtID)) if VERBOSITY >= 1: print "Deleting %s documents from the database." % count if (not SIMULATE) and (count != 0): for doc in docs: doc.delete()
def cleaner(simulate=False, verbose=False): docs = queryset_generator(Document.objects.filter(source = 'R')) for doc in docs: caseNameShortOrig = doc.citation.caseNameShort caseNameFullOrig = doc.citation.caseNameFull caseNameShort = titlecase(harmonize(clean_string(caseNameShortOrig))) caseNameFull = titlecase(harmonize(clean_string(caseNameFullOrig))) doc.citation.caseNameShort = caseNameShort doc.citation.caseNameFull = caseNameFull if verbose: if (caseNameShortOrig != caseNameShort) or (caseNameFullOrig != caseNameFull): print "Document: %s" % doc.pk if caseNameShortOrig != caseNameShort: print "Short name, replacing: '%s'" % caseNameShortOrig print " with: '%s'" % caseNameShort if caseNameFullOrig != caseNameFull: print " Full name, replacing: '%s'" % caseNameFullOrig print " with: '%s'\n" % caseNameFull if not simulate: doc.citation.save()
def cleaner(simulate=False, verbose=False): docs = queryset_generator(Document.objects.filter(source='R')) for doc in docs: caseNameShortOrig = doc.citation.caseNameShort caseNameFullOrig = doc.citation.caseNameFull caseNameShort = titlecase(harmonize(clean_string(caseNameShortOrig))) caseNameFull = titlecase(harmonize(clean_string(caseNameFullOrig))) doc.citation.caseNameShort = caseNameShort doc.citation.caseNameFull = caseNameFull if verbose: if (caseNameShortOrig != caseNameShort) or (caseNameFullOrig != caseNameFull): print "Document: %s" % doc.pk if caseNameShortOrig != caseNameShort: print "Short name, replacing: '%s'" % caseNameShortOrig print " with: '%s'" % caseNameShort if caseNameFullOrig != caseNameFull: print " Full name, replacing: '%s'" % caseNameFullOrig print " with: '%s'\n" % caseNameFull if not simulate: doc.citation.save()
def handle(self, *args, **options):
    """Validate the CLI options, assemble the matching Document queryset,
    and pass the documents to ``update_documents`` for the chosen index.

    Raises CommandError for contradictory options (a doc_id plus a range)
    or when no selection option is given at all.
    """
    both_list_and_endpoints = (options.get('doc_id') is not None and
                               (options.get('start_id') is not None or
                                options.get('end_id') is not None or
                                options.get('filed_after') is not None))
    # Bug fix: the old check put ``is None``/``is False`` inside any(),
    # which is satisfied exactly when options are *missing*, so the
    # "nothing specified" error never fired. Check for supplied values.
    no_option = (not any([
        options.get('doc_id') is not None,
        options.get('start_id') is not None,
        options.get('end_id') is not None,
        options.get('filed_after') is not None,
        options.get('all') is True,
    ]))
    if both_list_and_endpoints or no_option:
        raise CommandError(
            'Please specify either a list of documents, a range of ids, a range of dates, or '
            'everything.')

    if options.get('filed_after'):
        start_date = make_aware(
            datetime.strptime(options['filed_after'], '%Y-%m-%d'), utc)

    index = options['index'].lower()

    # Use query chaining to build the query
    query = Document.objects.all()
    if options.get('doc_id'):
        query = query.filter(pk=options.get('doc_id'))
    if options.get('end_id'):
        query = query.filter(pk__lte=options.get('end_id'))
    if options.get('start_id'):
        query = query.filter(pk__gte=options.get('start_id'))
    if options.get('filed_after'):
        query = query.filter(date_filed__gte=start_date)
    if options.get('all'):
        # Bug fix: was ``Document.object.all()``, which raised
        # AttributeError the moment --all was used.
        query = Document.objects.all()
    count = query.count()
    docs = queryset_generator(query, chunksize=10000)
    self.update_documents(docs, count, index)
def fixer(simulate=False, verbose=False):
    """Fix a few issues discovered."""
    # Earlier selection queries, kept for reference:
    # docs = queryset_generator(Document.objects.filter(source='C', plain_text=''))
    # docs = Document.objects.raw('''select "pk" from "Document" where "source" = 'C' and "plain_text" ~ '^[[:space:]]*$' ''')
    # docs = Document.objects.raw('''select "pk" from "Document" where "source" = 'C' and "plain_text" = 'Unable to extract document content.' ''')

    def fix_plaintiffs(docs, left, simulate, verbose):
        # Expand the abbreviated "P." party to "People", on the left of
        # "v." when ``left`` is True, otherwise on the right.
        for doc in docs:
            if verbose:
                print "Fixing document number %s: %s" % (doc.pk, doc)
            old_case_name = doc.case_name
            if left:
                new_case_name = old_case_name.replace("P. v.", "People v.")
            else:
                new_case_name = old_case_name.replace("v. P.", "v. People")
            print "  Replacing %s" % old_case_name
            print "       with %s" % new_case_name
            if not simulate:
                if left:
                    doc.case_name = doc.case_name.replace("P. v.", "People v.")
                else:
                    doc.case_name = doc.case_name.replace("v. P.", "v. People")
                # NOTE(review): this sets doc.case_name but saves
                # doc.citation — assumes case_name proxies the citation
                # record (compare the variant of this fixer that writes
                # doc.citation.case_name directly); confirm, otherwise
                # the change is never persisted.
                doc.citation.save()

    def fix_michigan(docs, left, simulate, verbose):
        # Expand the truncated "People of Mi" to "People of Michigan".
        for doc in docs:
            if verbose:
                print "Fixing document number %s: %s" % (doc.pk, doc)
            old_case_name = doc.case_name
            if left:
                new_case_name = old_case_name.replace(
                    "People of Mi", "People of Michigan")
                print "  Replacing %s" % old_case_name
                print "       with %s" % new_case_name
            if not simulate:
                if left:
                    doc.case_name = doc.case_name.replace(
                        "People of Mi", "People of Michigan")
                # NOTE(review): same case_name/citation.save() pairing as
                # fix_plaintiffs above — confirm the proxy assumption.
                doc.citation.save()

    def fix_wva(docs, simulate, verbose):
        # Mark the selected West Virginia documents as published.
        for doc in docs:
            if verbose:
                print "Fixing document number %s: %s" % (doc.pk, doc)
            if not simulate:
                doc.precedential_status = "Published"
                doc.save()

    # Round one! Fix plaintiffs.
    print "!!! ROUND ONE !!!"
    court = Court.objects.get(pk="cal")
    docs = queryset_generator(
        Document.objects.filter(source="C", court=court,
                                citation__case_name__contains="P. v."))
    fix_plaintiffs(docs, True, simulate, verbose)

    # Round three! Fix the Mi cases.
    print "!!! ROUND THREE !!!"
    court = Court.objects.get(pk="mich")
    docs = queryset_generator(
        Document.objects.filter(
            source="C",
            court=court,
            citation__case_name__startswith="People of Mi ",
        ))
    fix_michigan(docs, True, simulate, verbose)

    # Round four! Fix the statuses.
    print "!!! ROUND FOUR !!!"
    court = Court.objects.get(pk="wva")
    docs = queryset_generator(
        Document.objects.filter(
            precedential_status__in=[
                "Memorandum Decision",
                "Per Curiam Opinion",
                "Signed Opinion",
            ],
            court=court,
        ))
    fix_wva(docs, simulate, verbose)
def fixer(simulate=False, verbose=False):
    """Fix a few issues discovered."""
    # Earlier selection queries, kept for reference:
    #docs = queryset_generator(Document.objects.filter(source='C', plain_text=''))
    #docs = Document.objects.raw('''select "pk" from "Document" where "source" = 'C' and "plain_text" ~ '^[[:space:]]*$' ''')
    #docs = Document.objects.raw('''select "pk" from "Document" where "source" = 'C' and "plain_text" = 'Unable to extract document content.' ''')

    def fix_plaintiffs(docs, left, simulate, verbose):
        # Expand the abbreviated "P." party to "People", on the left of
        # "v." when ``left`` is True, otherwise on the right.
        for doc in docs:
            if verbose:
                print "Fixing document number %s: %s" % (doc.pk, doc)
            old_case_name = doc.citation.case_name
            if left:
                new_case_name = old_case_name.replace('P. v.', 'People v.')
            else:
                new_case_name = old_case_name.replace('v. P.', 'v. People')
            print "  Replacing %s" % old_case_name
            print "       with %s" % new_case_name
            if not simulate:
                if left:
                    doc.citation.case_name = doc.citation.case_name.replace('P. v.', 'People v.')
                else:
                    doc.citation.case_name = doc.citation.case_name.replace('v. P.', 'v. People')
                doc.citation.save()

    def fix_michigan(docs, left, simulate, verbose):
        # Expand the truncated "People of Mi" to "People of Michigan".
        for doc in docs:
            if verbose:
                print "Fixing document number %s: %s" % (doc.pk, doc)
            old_case_name = doc.citation.case_name
            if left:
                new_case_name = old_case_name.replace('People of Mi', 'People of Michigan')
                print "  Replacing %s" % old_case_name
                print "       with %s" % new_case_name
            if not simulate:
                if left:
                    doc.citation.case_name = doc.citation.case_name.replace('People of Mi', 'People of Michigan')
                doc.citation.save()

    def fix_wva(docs, simulate, verbose):
        # Mark the selected West Virginia documents as published.
        for doc in docs:
            if verbose:
                print "Fixing document number %s: %s" % (doc.pk, doc)
            if not simulate:
                doc.precedential_status = "Published"
                doc.save()

    # Round one! Fix plaintiffs.
    print "!!! ROUND ONE !!!"
    court = Court.objects.get(pk='cal')
    docs = queryset_generator(Document.objects.filter(source="C", court=court, citation__case_name__contains='P. v.'))
    fix_plaintiffs(docs, True, simulate, verbose)

    # Round three! Fix the Mi cases.
    print "!!! ROUND THREE !!!"
    court = Court.objects.get(pk='mich')
    docs = queryset_generator(Document.objects.filter(source="C", court=court, citation__case_name__startswith='People of Mi '))
    fix_michigan(docs, True, simulate, verbose)

    # Round four! Fix the statuses.
    print "!!! ROUND FOUR !!!"
    court = Court.objects.get(pk='wva')
    docs = queryset_generator(Document.objects.filter(precedential_status__in=['Memorandum Decision', 'Per Curiam Opinion', 'Signed Opinion'], court=court))
    fix_wva(docs, simulate, verbose)
def make_archive(self, obj_type_str, obj_type, court_attr, api_resource_obj):
    """Generate compressed archives containing the contents of an object
    database.

    There are a few tricks to this, but the main one is that each item in
    the database goes into two files, all.tar.gz and {court}.tar.gz. This
    means that if we want to avoid iterating the database once per file,
    we need to generate all 350+ jurisdiction files simultaneously.

    We do this by making a dict of open file handles and adding each item
    to the correct two files: The all.tar.gz file and the {court}.tar.gz
    file.

    This function takes longer to run than almost any in the codebase and
    has been the subject of some profiling. The top results are as follows:

        ncalls  tottime  percall  cumtime  percall  filename:lineno(function)
        138072  5.007    0.000    6.138    0.000    {method 'sub' of '_sre.SRE_Pattern' objects}
        6001    4.452    0.001    4.608    0.001    {method 'execute' of 'psycopg2._psycopg.cursor' objects}
        24900   3.623    0.000    3.623    0.000    {built-in method compress}
        2807031/69163  2.923  0.000  8.216  0.000   copy.py:145(deepcopy)
        2427852 0.952    0.000    1.130    0.000    encoder.py:37(replace)

    Conclusions:
     1. sub is from string_utils.py, where we nuke bad chars. Could remove
        this code by sanitizing all future input to system and fixing any
        current issues. Other than that, it's already optimized.
     1. Next up is DB waiting. Queries could be optimized to make this
        better.
     1. Next is compression, which we've turned down as much as possible
        already (compresslevel=1 for most bulk files =3 for all.tar.gz).
     1. Encoding and copying bring up the rear. Not much to do there, and
        gains are limited. Could install a faster json decoder, but
        Python 2.7's json implementation is already written in C. Not
        sure how to remove the gazillion copy's that are happening.
    """
    courts = Court.objects.all()
    self.stdout.write(' - Creating %s bulk %s files '
                      'simultaneously...\n' % (len(courts), obj_type_str))

    mkdir_p('/tmp/bulk/%s' % obj_type_str)

    # Open a gzip'ed tar file for every court
    tar_files = {}
    for court in courts:
        tar_files[court.pk] = tarfile.open(
            '/tmp/bulk/%s/%s.tar.gz' % (obj_type_str, court.pk),
            mode='w:gz',
            compresslevel=1,
        )
    tar_files['all'] = tarfile.open(
        '/tmp/bulk/%s/all.tar.gz' % obj_type_str,
        mode='w:gz',
        compresslevel=3,
    )

    # Make the archives
    qs = obj_type.objects.all()
    item_resource = api_resource_obj()
    # Bug fix: ``qs[0]`` raised IndexError when the table was empty;
    # check exists() first, and use isinstance rather than comparing
    # ``type(...) == int``.
    if qs.exists() and isinstance(qs[0].pk, int):
        item_list = queryset_generator(qs)
    else:
        # Necessary for jurisdictions, which don't have ints for ids.
        item_list = qs
    for item in item_list:
        json_str = item_resource.serialize(
            None,
            item_resource.full_dehydrate(
                item_resource.build_bundle(obj=item)),
            'application/json',
        ).encode('utf-8')

        # Add the json str to the two tarballs
        tarinfo = tarfile.TarInfo("%s.json" % item.pk)
        tarinfo.size = len(json_str)
        tarinfo.mtime = time.mktime(item.date_modified.timetuple())
        tarinfo.type = tarfile.REGTYPE

        tar_files[deepgetattr(item, court_attr)].addfile(
            tarinfo, StringIO.StringIO(json_str))
        tar_files['all'].addfile(tarinfo, StringIO.StringIO(json_str))

    # Close off all the gzip'ed tar files
    for court in courts:
        tar_files[court.pk].close()
    tar_files['all'].close()
    self.stdout.write(' - all %s bulk files created.\n' % obj_type_str)
def do_pagerank(self, verbosity=1, chown=True):
    """Compute PageRank over the document citation graph and write the
    scores to the external file read by Solr, then run maintenance.

    verbosity -- >= 1 prints progress to stdout; 0 is quieter.
    chown     -- when True, hand the bulk-data copy over to www-data.
    """
    #####################
    # Stage I           #
    # Import Data to NX #
    #####################
    sys.stdout.write('Initializing...\n')
    graph_size = Document.objects.all().count()
    citing_graph = nx.DiGraph()
    qs = Document.objects.only(
        'pk',
        'cases_cited',
    )
    case_list = queryset_generator(qs, chunksize=10000)
    case_count = 0
    timings = []
    average_per_s = 0

    # Build up the network graph and a list of all valid ids
    id_list = []
    for source_case in case_list:
        case_count += 1
        if case_count % 100 == 1:
            t1 = time.time()
        if case_count % 100 == 0:
            t2 = time.time()
            # Each sample spans a block of ~100 items, so throughput is
            # 100 over the mean sample duration.
            timings.append(t2 - t1)
            average_per_s = 100 / (sum(timings) / float(len(timings)))
        sys.stdout.write(
            "\rGenerating networkx graph...{:.0%} ({}/{}, {:.1f}/s)".
            format(
                case_count * 1.0 / graph_size,
                case_count,
                graph_size,
                average_per_s,
            ))
        sys.stdout.flush()
        for target_case in source_case.cases_cited.values_list(
                'parent_documents__id'):
            citing_graph.add_edge(str(source_case.pk), str(target_case[0]))
        # Save all the keys since they get dropped by networkx in Stage II
        id_list.append(str(source_case.pk))

    ######################
    # Stage II           #
    # Calculate Pagerank #
    ######################
    if verbosity >= 1:
        sys.stdout.write('\n')
        sys.stdout.write('NetworkX PageRank calculating...')
        sys.stdout.flush()
    pr_result = nx.pagerank(citing_graph)
    if verbosity >= 1:
        sys.stdout.write('Complete!\n')

    ###################
    # Stage III       #
    # Update Pagerank #
    ###################
    progress = 0
    # Floor value assigned to nodes networkx dropped as isolated.
    min_value = min(pr_result.values())
    for id in id_list:
        progress += 1
        try:
            new_pr = pr_result[id]
        except KeyError:
            # NetworkX removes the isolated nodes from the network, but
            # they still need to go into the PR file.
            new_pr = min_value
        # NOTE(review): assumes self.result_file was opened on
        # RESULT_FILE_PATH + TEMP_EXTENSION by the caller — confirm.
        self.result_file.write('{}={}\n'.format(id, new_pr))
        if verbosity >= 1:
            sys.stdout.write(
                '\rUpdating Pagerank in external file...{:.0%}'.format(
                    progress * 1.0 / graph_size))
            sys.stdout.flush()
    self.result_file.close()
    if verbosity >= 1:
        sys.stdout.write('\nPageRank calculation finished!')
        sys.stdout.write('See the django log for more details.\n')

    ########################
    # Stage IV             #
    # Maintenance Routines #
    ########################
    if verbosity >= 1:
        sys.stdout.write(
            'Sorting the temp pagerank file for improved Solr performance...\n'
        )
    # Sort the temp file, creating a new file without the TEMP_EXTENSION
    # value, then delete the temp file.
    # NOTE(review): the paths are interpolated into a shell command
    # unquoted — assumes they contain no spaces or shell metacharacters.
    os.system('sort -n %s%s > %s' %
              (self.RESULT_FILE_PATH, self.TEMP_EXTENSION,
               self.RESULT_FILE_PATH))
    os.remove(self.RESULT_FILE_PATH + self.TEMP_EXTENSION)
    if verbosity >= 1:
        sys.stdout.write('Reloading the external file cache in Solr...\n')
    reload_pagerank_external_file_cache()
    if verbosity >= 1:
        sys.stdout.write(
            'Copying pagerank file to %s, for bulk downloading...\n' %
            settings.BULK_DATA_DIR)
    shutil.copy(self.RESULT_FILE_PATH, settings.BULK_DATA_DIR)
    if chown:
        # NOTE(review): assumes BULK_DATA_DIR ends with a path separator
        # — confirm in settings.
        user_info = pwd.getpwnam('www-data')
        os.chown(settings.BULK_DATA_DIR + 'external_pagerank',
                 user_info.pw_uid, user_info.pw_gid)
def do_pagerank(self, verbosity=1, chown=True):
    """Calculate PageRank for the citation graph, write the values into
    the external file consumed by Solr, then perform maintenance steps.

    verbosity -- progress output is printed when >= 1.
    chown     -- when True, the bulk-data copy is chowned to www-data.
    """
    #####################
    # Stage I           #
    # Import Data to NX #
    #####################
    sys.stdout.write('Initializing...\n')
    graph_size = Document.objects.all().count()
    citing_graph = nx.DiGraph()
    qs = Document.objects.only(
        'pk',
        'cases_cited',
    )
    case_list = queryset_generator(qs, chunksize=10000)
    case_count = 0
    timings = []
    average_per_s = 0

    # Build up the network graph and a list of all valid ids
    id_list = []
    for source_case in case_list:
        case_count += 1
        if case_count % 100 == 1:
            t1 = time.time()
        if case_count % 100 == 0:
            t2 = time.time()
            # Throughput is averaged over ~100-item timing samples.
            timings.append(t2 - t1)
            average_per_s = 100 / (sum(timings) / float(len(timings)))
        sys.stdout.write("\rGenerating networkx graph...{:.0%} ({}/{}, {:.1f}/s)".format(
            case_count * 1.0 / graph_size,
            case_count,
            graph_size,
            average_per_s,
        ))
        sys.stdout.flush()
        for target_case in source_case.cases_cited.values_list('parent_documents__id'):
            citing_graph.add_edge(str(source_case.pk), str(target_case[0]))
        # Save all the keys since they get dropped by networkx in Stage II
        id_list.append(str(source_case.pk))

    ######################
    # Stage II           #
    # Calculate Pagerank #
    ######################
    if verbosity >= 1:
        sys.stdout.write('\n')
        sys.stdout.write('NetworkX PageRank calculating...')
        sys.stdout.flush()
    pr_result = nx.pagerank(citing_graph)
    if verbosity >= 1:
        sys.stdout.write('Complete!\n')

    ###################
    # Stage III       #
    # Update Pagerank #
    ###################
    progress = 0
    # Isolated nodes are dropped by networkx; give them the floor score.
    min_value = min(pr_result.values())
    for id in id_list:
        progress += 1
        try:
            new_pr = pr_result[id]
        except KeyError:
            # NetworkX removes the isolated nodes from the network, but
            # they still need to go into the PR file.
            new_pr = min_value
        # NOTE(review): assumes self.result_file points at the
        # RESULT_FILE_PATH + TEMP_EXTENSION temp file — confirm in caller.
        self.result_file.write('{}={}\n'.format(id, new_pr))
        if verbosity >= 1:
            sys.stdout.write('\rUpdating Pagerank in external file...{:.0%}'.format(
                progress * 1.0 / graph_size
            ))
            sys.stdout.flush()
    self.result_file.close()
    if verbosity >= 1:
        sys.stdout.write('\nPageRank calculation finished!')
        sys.stdout.write('See the django log for more details.\n')

    ########################
    # Stage IV             #
    # Maintenance Routines #
    ########################
    if verbosity >= 1:
        sys.stdout.write('Sorting the temp pagerank file for improved Solr performance...\n')
    # Sort the temp file, creating a new file without the TEMP_EXTENSION
    # value, then delete the temp file.
    # NOTE(review): paths go into the shell command unquoted — assumes no
    # spaces or shell metacharacters in them.
    os.system('sort -n %s%s > %s' % (self.RESULT_FILE_PATH, self.TEMP_EXTENSION, self.RESULT_FILE_PATH))
    os.remove(self.RESULT_FILE_PATH + self.TEMP_EXTENSION)
    if verbosity >= 1:
        sys.stdout.write('Reloading the external file cache in Solr...\n')
    reload_pagerank_external_file_cache()
    if verbosity >= 1:
        sys.stdout.write('Copying pagerank file to %s, for bulk downloading...\n' % settings.BULK_DATA_DIR)
    shutil.copy(self.RESULT_FILE_PATH, settings.BULK_DATA_DIR)
    if chown:
        # NOTE(review): assumes BULK_DATA_DIR ends with a path separator.
        user_info = pwd.getpwnam('www-data')
        os.chown(settings.BULK_DATA_DIR + 'external_pagerank',
                 user_info.pw_uid, user_info.pw_gid)
def make_archive(self, obj_type_str, obj_type, court_attr, api_resource_obj):
    """Generate compressed archives containing the contents of an object
    database.

    There are a few tricks to this, but the main one is that each item in
    the database goes into two files, all.tar.gz and {court}.tar.gz. This
    means that if we want to avoid iterating the database once per file,
    we need to generate all 350+ jurisdiction files simultaneously.

    We do this by making a dict of open file handles and adding each item
    to the correct two files: The all.tar.gz file and the {court}.tar.gz
    file.

    This function takes longer to run than almost any in the codebase and
    has been the subject of some profiling. The top results are as follows:

        ncalls  tottime  percall  cumtime  percall  filename:lineno(function)
        138072  5.007    0.000    6.138    0.000    {method 'sub' of '_sre.SRE_Pattern' objects}
        6001    4.452    0.001    4.608    0.001    {method 'execute' of 'psycopg2._psycopg.cursor' objects}
        24900   3.623    0.000    3.623    0.000    {built-in method compress}
        2807031/69163  2.923  0.000  8.216  0.000   copy.py:145(deepcopy)
        2427852 0.952    0.000    1.130    0.000    encoder.py:37(replace)

    Conclusions:
     1. sub is from string_utils.py, where we nuke bad chars. Could remove
        this code by sanitizing all future input to system and fixing any
        current issues. Other than that, it's already optimized.
     1. Next up is DB waiting. Queries could be optimized to make this
        better.
     1. Next is compression, which we've turned down as much as possible
        already (compresslevel=1 for most bulk files =3 for all.tar.gz).
     1. Encoding and copying bring up the rear. Not much to do there, and
        gains are limited. Could install a faster json decoder, but
        Python 2.7's json implementation is already written in C. Not
        sure how to remove the gazillion copy's that are happening.
    """
    courts = Court.objects.all()
    self.stdout.write(' - Creating %s bulk %s files '
                      'simultaneously...\n' % (len(courts), obj_type_str))

    mkdir_p('/tmp/bulk/%s' % obj_type_str)

    # Open a gzip'ed tar file for every court
    tar_files = {}
    for court in courts:
        tar_files[court.pk] = tarfile.open(
            '/tmp/bulk/%s/%s.tar.gz' % (obj_type_str, court.pk),
            mode='w:gz',
            compresslevel=1,
        )
    tar_files['all'] = tarfile.open(
        '/tmp/bulk/%s/all.tar.gz' % obj_type_str,
        mode='w:gz',
        compresslevel=3,
    )

    # Make the archives
    qs = obj_type.objects.all()
    item_resource = api_resource_obj()
    # Bug fix: ``qs[0]`` crashed with IndexError on an empty table; guard
    # with exists(), and use isinstance in place of ``type(...) == int``.
    if qs.exists() and isinstance(qs[0].pk, int):
        item_list = queryset_generator(qs)
    else:
        # Necessary for jurisdictions, which don't have ints for ids.
        item_list = qs
    for item in item_list:
        json_str = item_resource.serialize(
            None,
            item_resource.full_dehydrate(
                item_resource.build_bundle(obj=item)),
            'application/json',
        ).encode('utf-8')

        # Add the json str to the two tarballs
        tarinfo = tarfile.TarInfo("%s.json" % item.pk)
        tarinfo.size = len(json_str)
        tarinfo.mtime = time.mktime(item.date_modified.timetuple())
        tarinfo.type = tarfile.REGTYPE

        tar_files[deepgetattr(item, court_attr)].addfile(
            tarinfo, StringIO.StringIO(json_str))
        tar_files['all'].addfile(
            tarinfo, StringIO.StringIO(json_str))

    # Close off all the gzip'ed tar files
    for court in courts:
        tar_files[court.pk].close()
    tar_files['all'].close()
    self.stdout.write(' - all %s bulk files created.\n' % obj_type_str)