def cleaner(simulate=False, verbose=False):
    """Fix the titles of cases whose name is "unpublished disposition".

    Basically, the algorithm here is to find all cases with the error, then
    open each in Firefox one by one. After each case is opened, a prompt will
    allow the case name to be typed in, and it will be corrected on the site.

    These corrections will go live immediately, but will require a reindex to
    be live in the search system.

    :param simulate: When True, compute the corrections but do not save them.
    :param verbose: When True, print the number of matching results.
    """
    # Sphinx full-text query for the bad case names.
    queryset = Document.search.query('@casename "unpublished disposition"')
    docs = queryset.set_options(
        mode="SPH_MATCH_EXTENDED2").order_by("-date_filed")
    if verbose:
        print "%s results found." % (docs.count())

    # Must slice here, or else only get top 20 results
    for doc in docs[0:docs.count()]:
        if doc.citation.caseNameFull.lower() == "unpublished disposition":
            # Only do each case once, since the index isn't updated until
            # later, and I may run this script many times.
            print doc.download_url
            casename = raw_input("Case name: ")
            doc.citation.caseNameFull = casename
            doc.citation.caseNameShort = trunc(casename, 100)
            doc.citation.slug = trunc(slugify(casename), 50)
            doc.precedential_status = "Unpublished"
            if not simulate:
                doc.citation.save()
                doc.save()
            # Blank line between cases for readability at the console.
            print ""
def view_audio_file(request, pk, _):
    """Render the oral argument page for the Audio item with the given pk.

    Also checks whether the item is a favorite of the requesting user so the
    template can render a bound or unbound favorite form.
    """
    audio = get_object_or_404(Audio, pk=pk)
    page_title = "Oral Argument for " + trunc(audio.case_name, 100)
    get_string = search_utils.make_get_string(request)

    # EAFP: anonymous users raise TypeError on the user lookup; a missing
    # favorite raises ObjectDoesNotExist. Either way, fall back to an
    # unbound form pre-filled with this item's data.
    try:
        fave = Favorite.objects.get(audio_id=audio.pk,
                                    users__user=request.user)
        favorite_form = FavoriteForm(instance=fave)
    except (ObjectDoesNotExist, TypeError):
        favorite_form = FavoriteForm(initial={
            'audio_id': audio.pk,
            'name': trunc(audio.docket.case_name, 100, ellipsis='...'),
        })

    context = {
        'title': page_title,
        'af': audio,
        'favorite_form': favorite_form,
        'get_string': get_string,
        'private': audio.blocked,
    }
    return render_to_response('audio/oral_argument.html', context,
                              RequestContext(request))
def view_audio_file(request, pk, _):
    """Using the ID, return the oral argument page.

    We also test if the item is a favorite and send data as such.
    """
    af = get_object_or_404(Audio, pk=pk)
    page_title = "Oral Argument for " + trunc(af.case_name, 100)
    query_string = search_utils.make_get_string(request)

    try:
        existing_fave = Favorite.objects.get(audio_id=af.pk,
                                             users__user=request.user)
    except (ObjectDoesNotExist, TypeError):
        # Not favorited, or the user is anonymous: unbound form.
        favorite_form = FavoriteForm(initial={
            'audio_id': af.pk,
            'name': trunc(af.docket.case_name, 100, ellipsis='...'),
        })
    else:
        favorite_form = FavoriteForm(instance=existing_fave)

    return render_to_response(
        'audio/oral_argument.html',
        {
            'title': page_title,
            'af': af,
            'favorite_form': favorite_form,
            'get_string': query_string,
            'private': af.blocked,
        },
        RequestContext(request),
    )
def cleaner(simulate=False, verbose=False):
    """Fix the titles of cases whose name is "unpublished disposition".

    Basically, the algorithm here is to find all cases with the error, then
    open each in Firefox one by one. After each case is opened, a prompt will
    allow the case name to be typed in, and it will be corrected on the site.

    These corrections will go live immediately, but will require a reindex to
    be live in the search system.

    :param simulate: When True, compute the corrections but do not save them.
    :param verbose: When True, print the number of matching results.
    """
    # Sphinx full-text query for the bad case names.
    queryset = Document.search.query('@casename "unpublished disposition"')
    docs = queryset.set_options(mode="SPH_MATCH_EXTENDED2").order_by('-date_filed')
    if verbose:
        print "%s results found." % (docs.count())

    # Must slice here, or else only get top 20 results
    for doc in docs[0:docs.count()]:
        if doc.citation.caseNameFull.lower() == "unpublished disposition":
            # Only do each case once, since the index isn't updated until
            # later, and I may run this script many times.
            print doc.download_url
            casename = raw_input("Case name: ")
            doc.citation.caseNameFull = casename
            doc.citation.caseNameShort = trunc(casename, 100)
            doc.citation.slug = trunc(slugify(casename), 50)
            doc.precedential_status = "Unpublished"
            if not simulate:
                doc.citation.save()
                doc.save()
            # Blank line between cases for readability at the console.
            print ""
def view_opinion_citations(request, pk, _):
    """Render a paginated list of opinions that cite this document."""
    doc = get_object_or_404(Document, pk=pk)
    title = '%s, %s' % (trunc(doc.citation.case_name, 100),
                        make_citation_string(doc))

    # Opinions citing this one, most-cited first, then most recent.
    citing_opinions = (doc.citation.citing_opinions
                       .select_related('citation', 'docket__court')
                       .order_by('-citation_count', '-date_filed'))

    paginator = Paginator(citing_opinions, 20, orphans=2)
    page = request.GET.get('page')
    try:
        citing_opinions = paginator.page(page)
    except (TypeError, PageNotAnInteger):
        # TypeError can be removed in Django 1.4, where it properly will be
        # caught upstream.
        citing_opinions = paginator.page(1)
    except EmptyPage:
        citing_opinions = paginator.page(paginator.num_pages)

    # Private if this doc is blocked, or any doc on the current page is.
    if doc.blocked:
        private = True
    else:
        private = any(case.blocked for case in citing_opinions.object_list)

    return render_to_response(
        'casepage/view_opinion_citations.html',
        {'title': title,
         'doc': doc,
         'private': private,
         'citing_opinions': citing_opinions},
        RequestContext(request))
def view_authorities(request, pk, case_name):
    """Render the list of authorities (cases cited) for a document."""
    doc = get_object_or_404(Document, pk=ascii_to_num(pk))
    title = '%s, %s' % (trunc(doc.citation.case_name, 100),
                        make_citation_string(doc))

    # Ordering is by case name is the norm.
    authorities = (doc.cases_cited.all()
                   .select_related('document')
                   .order_by('case_name'))

    # Private when the doc itself, or any authority's parent doc, is blocked.
    if doc.blocked:
        private = True
    else:
        private = False
        for cited in authorities:
            if cited.parent_documents.all()[0].blocked:
                private = True
                break

    return render_to_response(
        'view_case_authorities.html',
        {'title': title,
         'doc': doc,
         'private': private,
         'authorities': authorities},
        RequestContext(request))
def test_solr_ingestion_and_deletion(self):
    """Do items get added to the Solr index when they are ingested?"""
    site = test_opinion_scraper.Site().parse()

    # First scraped item is a simple PDF.
    pdf_path = os.path.join(settings.INSTALL_ROOT, 'alert',
                            site.download_urls[0])
    with open(pdf_path) as f:
        content = f.read()
    cf = ContentFile(content)
    extension = get_extension(content)

    # Build the object graph: Citation -> Docket -> Document.
    cite = Citation()
    cite.save(index=False)
    docket = Docket(court=self.court, case_name=site.case_names[0])
    docket.save()
    doc = Document(date_filed=site.case_dates[0], docket=docket,
                   citation=cite)
    doc.local_path.save(trunc(site.case_names[0].lower(), 75) + extension,
                        cf, save=False)
    doc.save(index=False)
    extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))

    response = self.si.raw_query(**{
        'q': 'supreme',
        'caller': 'scraper_test',
    }).execute()
    hit_count = response.result.numFound
    self.assertEqual(
        hit_count, 1,
        "There were %s items found when there should have been 1" % hit_count)
def test_trunc(self):
    """Does trunc give us the results we expect?"""
    s = "Henry wants apple."
    cases = (
        # Simple case
        {"length": 13, "result": "Henry wants"},
        # Off by one cases
        {"length": 4, "result": "Henr"},
        {"length": 5, "result": "Henry"},
        {"length": 6, "result": "Henry"},
        # Do we include the length of the ellipsis when measuring?
        {"length": 12, "ellipsis": "...", "result": "Henry..."},
        # What happens when an alternate ellipsis is used instead?
        {"length": 15, "ellipsis": "....", "result": "Henry wants...."},
        # Do we cut properly when no spaces are found?
        {"length": 2, "result": "He"},
        # Do we cut properly when ellipsizing if no spaces found?
        {"length": 6, "ellipsis": "...", "result": "Hen..."},
        # Do we return the whole s when length >= s?
        {"length": 50, "result": s},
    )
    for case in cases:
        got = trunc(s=s, length=case["length"],
                    ellipsis=case.get("ellipsis", None))
        self.assertEqual(
            got,
            case["result"],
            msg="Failed with dict: %s.\n"
                "%s != %s" % (case, got, case["result"]),
        )
        self.assertTrue(
            len(got) <= case["length"],
            msg="Failed with dict: %s.\n"
                "%s is longer than %s" % (case, got, case["length"]),
        )
def test_content_extraction(self):
    """Do all of the supported mimetypes get extracted to text
    successfully, including OCR?"""
    site = test_scraper.Site().parse()
    expected = ['supreme', 'intelligence', 'indiana', 'reagan',
                'indiana', 'fidelity']
    for i in range(len(site.case_names)):
        path = os.path.join(settings.INSTALL_ROOT, 'alert',
                            site.download_urls[i])
        with open(path) as f:
            raw = f.read()
        cf = ContentFile(raw)
        extension = get_extension(raw)

        cite = Citation(case_name=site.case_names[i])
        cite.save(index=False)
        doc = Document(date_filed=site.case_dates[i], court=self.court,
                       citation=cite)
        doc.local_path.save(
            trunc(site.case_names[i].lower(), 75) + extension, cf,
            save=False)
        doc.save(index=False)

        doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
        # HTML-ish formats keep their markup; everything else is plain text.
        if extension in ['.html', '.wpd']:
            self.assertIn(expected[i], doc.html.lower())
        else:
            self.assertIn(expected[i], doc.plain_text.lower())
        doc.delete()
def test_solr_ingestion_and_deletion(self):
    """Do items get added to the Solr index when they are ingested?"""
    site = test_opinion_scraper.Site().parse()
    # a simple PDF
    path = os.path.join(settings.INSTALL_ROOT, 'alert',
                        site.download_urls[0])
    with open(path) as f:
        file_contents = f.read()
    content_file = ContentFile(file_contents)
    ext = get_extension(file_contents)

    citation = Citation()
    citation.save(index=False)

    docket = Docket(court=self.court, case_name=site.case_names[0])
    docket.save()

    document = Document(date_filed=site.case_dates[0], docket=docket,
                        citation=citation)
    document.local_path.save(
        trunc(site.case_names[0].lower(), 75) + ext,
        content_file,
        save=False,
    )
    document.save(index=False)
    extract_doc_content(document.pk, callback=subtask(extract_by_ocr))

    response = self.si.raw_query(
        **{'q': 'supreme', 'caller': 'scraper_test'}).execute()
    count = response.result.numFound
    self.assertEqual(
        count, 1,
        "There were %s items found when there should have been 1" % count)
def view_opinion(request, pk, _):
    """Using the ID, return the document.

    We also test if the document ID is a favorite for the user, and send
    data as such. If it's a favorite, we send the bound form for the
    favorite so it can populate the form on the page. If it is not a
    favorite, we send the unbound form.
    """
    # Look up the court, document, title and favorite information.
    doc = get_object_or_404(Document, pk=pk)
    citation_string = make_citation_string(doc)
    title = '%s, %s' % (trunc(doc.citation.case_name, 100), citation_string)
    get_string = search_utils.make_get_string(request)

    try:
        fave = Favorite.objects.get(doc_id=doc.pk, users__user=request.user)
        favorite_form = FavoriteForm(instance=fave)
    except (ObjectDoesNotExist, TypeError):
        # Not favorited or anonymous user: unbound form.
        favorite_form = FavoriteForm(initial={
            'doc_id': doc.pk,
            'name': trunc(doc.citation.case_name, 100, ellipsis='...'),
        })

    # Most influential opinions that cite this opinion.
    cited_by_trunc = (doc.citation.citing_opinions
                      .select_related('citation')
                      .order_by('-citation_count', '-date_filed')[:5])
    authorities_trunc = (doc.cases_cited.all()
                         .select_related('document')
                         .order_by('case_name')[:5])
    authorities_count = doc.cases_cited.all().count()

    context = {
        'title': title,
        'citation_string': citation_string,
        'doc': doc,
        'favorite_form': favorite_form,
        'get_string': get_string,
        'private': doc.blocked,
        'cited_by_trunc': cited_by_trunc,
        'authorities_trunc': authorities_trunc,
        'authorities_count': authorities_count,
    }
    return render_to_response('casepage/view_opinion.html', context,
                              RequestContext(request))
def view_opinion(request, pk, _):
    """Using the ID, return the document.

    We also test if the document ID is a favorite for the user, and send
    data as such. If it's a favorite, we send the bound form for the
    favorite so it can populate the form on the page. If it is not a
    favorite, we send the unbound form.
    """
    # Look up the court, document, title and favorite information.
    doc = get_object_or_404(Document, pk=pk)
    citation_string = make_citation_string(doc)
    short_name = trunc(doc.citation.case_name, 100)
    title = "%s, %s" % (short_name, citation_string)
    get_string = search_utils.make_get_string(request)

    try:
        fave = Favorite.objects.get(doc_id=doc.pk, users__user=request.user)
    except (ObjectDoesNotExist, TypeError):
        # Not favorited or anonymous user: unbound form.
        favorite_form = FavoriteForm(initial={
            "doc_id": doc.pk,
            "name": trunc(doc.citation.case_name, 100, ellipsis="..."),
        })
    else:
        favorite_form = FavoriteForm(instance=fave)

    # get most influential opinions that cite this opinion
    citing = doc.citation.citing_opinions.select_related("citation")
    cited_by_trunc = citing.order_by("-citation_count", "-date_filed")[:5]

    all_cited = doc.cases_cited.all()
    authorities_trunc = all_cited.select_related("document").order_by(
        "case_name")[:5]
    authorities_count = all_cited.count()

    return render_to_response(
        "casepage/view_opinion.html",
        {
            "title": title,
            "citation_string": citation_string,
            "doc": doc,
            "favorite_form": favorite_form,
            "get_string": get_string,
            "private": doc.blocked,
            "cited_by_trunc": cited_by_trunc,
            "authorities_trunc": authorities_trunc,
            "authorities_count": authorities_count,
        },
        RequestContext(request),
    )
def merge_cases_simple(new, target_id):
    """Add `new` to the database, merging with target_id.

    Merging is done by picking the best fields from each item.

    :param new: An unsaved Document whose fields are merged into the target.
    :param target_id: Primary key of the existing Document to merge into.
    """
    target = Document.objects.get(pk=target_id)
    print "Merging %s with" % new.citation.case_name
    print " %s" % target.citation.case_name

    cached_source = target.source  # Original value is needed below.
    # Prefix the source with 'L' — presumably marks that Lawbox data was
    # merged in; TODO confirm against the source-code legend.
    if target.source == 'C':
        target.source = 'LC'
    elif target.source == 'R':
        target.source = 'LR'
    elif target.source == 'CR':
        target.source = 'LCR'

    # Add the URL if it's not a court one, replacing resource.org's info in
    # some cases.
    if cached_source == 'R':
        target.download_url = new.download_url

    # Recreate the slug from the new case name (this changes the URL, but the
    # old will continue working)
    target.citation.slug = trunc(slugify(new.citation.case_name), 50)

    # Take the case name from the new item; they tend to be pretty good
    target.citation.case_name = new.citation.case_name

    # Add the docket number if the old doesn't exist, but keep the old if one
    # does.
    if not target.citation.docket_number:
        target.citation.docket_number = new.citation.docket_number

    # Get the citations from the new item (ditch the old).
    target.citation.federal_cite_one = new.citation.federal_cite_one
    target.citation.federal_cite_two = new.citation.federal_cite_two
    target.citation.federal_cite_three = new.citation.federal_cite_three
    target.citation.state_cite_one = new.citation.state_cite_one
    target.citation.state_cite_two = new.citation.state_cite_two
    target.citation.state_cite_three = new.citation.state_cite_three
    target.citation.state_cite_regional = new.citation.state_cite_regional
    target.citation.specialty_cite_one = new.citation.specialty_cite_one
    target.citation.scotus_early_cite = new.citation.scotus_early_cite
    target.citation.lexis_cite = new.citation.lexis_cite
    target.citation.westlaw_cite = new.citation.westlaw_cite
    target.citation.neutral_cite = new.citation.neutral_cite

    # Add judge information if lacking. New is dirty, but better than none.
    if not target.judges:
        target.judges = new.judges

    # Add the text.
    target.html_lawbox, blocked = anonymize(new.html)
    if blocked:
        target.blocked = True
        target.date_blocked = now()

    target.extracted_by_ocr = False  # No longer true for any LB case.
    save_doc_and_cite(target, index=False)
def fixer(simulate=False, verbose=False):
    """If a Citation lacks a slug, we make one for it.

    :param simulate: When True, compute the slug but do not save it.
    :param verbose: When True, print each citation as it is fixed.
    """
    citations = Citation.objects.filter(slug=None)
    for citation in citations:
        if verbose:
            print "Fixing %s" % citation
        citation.slug = trunc(slugify(citation.case_name), 50)
        if not simulate:
            citation.save()
def save(self, index=True, *args, **kwargs):
    """Save the citation, refreshing its slug from the case name.

    Note that there is a pre_save receiver below.
    """
    is_new = self.pk is None
    self.slug = trunc(slugify(self.case_name), 50)
    super(Citation, self).save(*args, **kwargs)

    # Only re-index on update, never on creation.
    if index and not is_new:
        # Deferred import to dodge a circular-import problem.
        from search.tasks import update_cite
        update_cite.delay(self.pk)
def save(self, index=True, force_commit=False, *args, **kwargs):
    """Save the citation, refreshing its slug from the case name.

    Note that there is a pre_save receiver below.
    """
    creating = self.pk is None
    self.slug = trunc(slugify(self.case_name), 50)
    super(Citation, self).save(*args, **kwargs)

    # Only re-index on update, never on creation.
    if index and not creating:
        # Deferred import to dodge a circular-import problem.
        from search.tasks import update_cite
        update_cite.delay(self.pk, force_commit)
def save(self, index=True, *args, **kwargs):
    """Save the citation, building the slug only on first save.

    The slug (and thus the URL) is generated from the case name the first
    time the object is saved, and kept stable thereafter.
    """
    first_save = self.pk is None
    if first_save:
        # First save: generate the slug stuff from the case name.
        self.slug = trunc(slugify(self.case_name), 50)
    super(Citation, self).save(*args, **kwargs)

    # Only re-index on update, never on creation.
    if index and not first_save:
        # Deferred import to dodge a circular-import problem.
        from search.tasks import update_cite
        update_cite.delay(self.pk)
def view_authorities(request, pk, _):
    """Render the list of authorities (cases cited) for a document.

    Marks the page private when the document itself is blocked, or when any
    listed authority's parent document is blocked.
    """
    doc = get_object_or_404(Document, pk=pk)
    title = '%s, %s' % (trunc(doc.citation.case_name, 100),
                        make_citation_string(doc))

    # Ordering by case name is the norm.
    authorities = doc.cases_cited.all().select_related(
        'document').order_by('case_name')

    private = False
    if doc.blocked:
        private = True
    else:
        for case in authorities:
            if case.parent_documents.all()[0].blocked:
                private = True
                break

    # Bug fix: the original computed everything above but never returned an
    # HttpResponse, so Django would raise "The view ... didn't return an
    # HttpResponse object". Render the same context the sibling authorities
    # view uses.
    return render_to_response(
        'casepage/view_opinion_authorities.html',
        {'title': title,
         'doc': doc,
         'private': private,
         'authorities': authorities},
        RequestContext(request))
def view_authorities(request, pk, _):
    """Render the list of authorities (cases cited) for a document."""
    doc = get_object_or_404(Document, pk=pk)
    title = "%s, %s" % (trunc(doc.citation.case_name, 100),
                        make_citation_string(doc))

    # Ordering is by case name is the norm.
    authorities = (doc.cases_cited.all()
                   .select_related("document")
                   .order_by("case_name"))

    # Private when the doc itself, or any authority's parent doc, is blocked.
    if doc.blocked:
        private = True
    else:
        private = False
        for authority in authorities:
            if authority.parent_documents.all()[0].blocked:
                private = True
                break

    context = {
        "title": title,
        "doc": doc,
        "private": private,
        "authorities": authorities,
    }
    return render_to_response("casepage/view_opinion_authorities.html",
                              context, RequestContext(request))
def process_audio_file(pk):
    """Given the key to an audio file, extract its content and add the
    related meta data to the database.

    Converts the original file to a mono, 22050Hz, 48kbps MP3, sets its
    metadata and duration, then stores the MP3 on the Audio instance.
    """
    audio_file = Audio.objects.get(pk=pk)
    path_to_original = audio_file.local_path_original_file.path
    path_to_tmp_location = os.path.join('/tmp', str(time.time()) + '.mp3')

    # Convert original file to:
    # - mono (-ac 1)
    # - sample rate (audio samples / s) of 22050Hz (-ar 22050)
    # - constant bit rate (sample resolution) of 48kbps (-ab 48k)
    avconv_command = [
        'avconv', '-i', path_to_original,
        '-ac', '1',
        '-ar', '22050',
        '-ab', '48k',
        path_to_tmp_location,
    ]
    _ = subprocess.check_output(avconv_command, stderr=subprocess.STDOUT)

    # Have to do this last because otherwise the mp3 hasn't yet been
    # generated.
    file_name = trunc(audio_file.case_name.lower(), 72) + '_cl.mp3'
    set_mp3_meta_data(audio_file, path_to_tmp_location)
    audio_file.duration = eyed3.load(path_to_tmp_location).info.time_secs

    # Bug fix: read the MP3 in binary mode ('rb'). Text mode ('r') can
    # corrupt binary data on some platforms and breaks under Python 3.
    with open(path_to_tmp_location, 'rb') as mp3:
        try:
            cf = ContentFile(mp3.read())
            audio_file.local_path_mp3.save(file_name, cf, save=False)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate; any other failure is logged as before.
            msg = "Unable to save mp3 to audio_file in scraper.tasks.process_" \
                  "audio_file for item: %s\nTraceback:\n%s" % \
                  (audio_file.pk, traceback.format_exc())
            logger.critical(msg)
            ErrorLog(log_level='CRITICAL', court=audio_file.docket.court,
                     message=msg).save()

    audio_file.processing_complete = True
    audio_file.save()
def process_audio_file(pk):
    """Given the key to an audio file, extract its content and add the
    related meta data to the database.
    """
    audio_file = Audio.objects.get(pk=pk)
    source_path = audio_file.local_path_original_file.path
    tmp_path = os.path.join('/tmp', str(time.time()) + '.mp3')

    # Convert original file to:
    # - mono (-ac 1)
    # - sample rate (audio samples / s) of 22050Hz (-ar 22050)
    # - constant bit rate (sample resolution) of 48kbps (-ab 48k)
    _ = subprocess.check_output(
        ['avconv', '-i', source_path,
         '-ac', '1', '-ar', '22050', '-ab', '48k',
         tmp_path],
        stderr=subprocess.STDOUT)

    # Have to do this last because otherwise the mp3 hasn't yet been
    # generated.
    file_name = trunc(audio_file.case_name.lower(), 72) + '_cl.mp3'
    set_mp3_meta_data(audio_file, tmp_path)
    audio_file.duration = eyed3.load(tmp_path).info.time_secs

    with open(tmp_path, 'r') as mp3:
        try:
            audio_file.local_path_mp3.save(file_name,
                                           ContentFile(mp3.read()),
                                           save=False)
        except:
            msg = "Unable to save mp3 to audio_file in scraper.tasks.process_" \
                  "audio_file for item: %s\nTraceback:\n%s" % \
                  (audio_file.pk, traceback.format_exc())
            logger.critical(msg)
            ErrorLog(log_level='CRITICAL',
                     court=audio_file.docket.court,
                     message=msg).save()

    audio_file.processing_complete = True
    audio_file.save()
def test_trunc(self):
    """Does trunc give us the results we expect?"""
    s = 'Henry wants apple.'
    test_dicts = [
        # Simple case
        {'length': 13, 'result': 'Henry wants'},
        # Off by one cases
        {'length': 4, 'result': 'Henr'},
        {'length': 5, 'result': 'Henry'},
        {'length': 6, 'result': 'Henry'},
        # Do we include the length of the ellipsis when measuring?
        {'length': 12, 'ellipsis': '...', 'result': 'Henry...'},
        # What happens when an alternate ellipsis is used instead?
        {'length': 15, 'ellipsis': '....', 'result': 'Henry wants....'},
        # Do we cut properly when no spaces are found?
        {'length': 2, 'result': 'He'},
        # Do we cut properly when ellipsizing if no spaces found?
        {'length': 6, 'ellipsis': '...', 'result': 'Hen...'},
        # Do we return the whole s when length >= s?
        {'length': 50, 'result': s},
    ]
    for test_dict in test_dicts:
        result = trunc(s=s,
                       length=test_dict['length'],
                       ellipsis=test_dict.get('ellipsis', None))
        self.assertEqual(
            result,
            test_dict['result'],
            msg='Failed with dict: %s.\n'
                '%s != %s' % (test_dict, result, test_dict['result']),
        )
        self.assertTrue(
            len(result) <= test_dict['length'],
            msg="Failed with dict: %s.\n"
                "%s is longer than %s" % (test_dict, result,
                                          test_dict['length']),
        )
def test_content_extraction(self):
    """Do all of the supported mimetypes get extracted to text
    successfully, including OCR?"""
    site = test_opinion_scraper.Site().parse()
    test_strings = ['supreme', 'intelligence', 'indiana', 'reagan',
                    'indiana', 'fidelity']
    for i in range(len(site.case_names)):
        path = os.path.join(settings.INSTALL_ROOT, 'alert',
                            site.download_urls[i])
        with open(path) as f:
            raw = f.read()
        content_file = ContentFile(raw)
        extension = get_extension(raw)

        # Build the object graph: Citation -> Docket -> Document.
        cite = Citation()
        cite.save(index=False)
        docket = Docket(case_name=site.case_names[i], court=self.court)
        docket.save()
        doc = Document(date_filed=site.case_dates[i], citation=cite,
                       docket=docket)
        doc.local_path.save(
            trunc(site.case_names[i].lower(), 75) + extension,
            content_file, save=False)
        doc.save(index=False)

        doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
        # HTML-ish formats keep their markup; everything else is plain text.
        if extension in ['.html', '.wpd']:
            self.assertIn(test_strings[i], doc.html.lower())
        else:
            self.assertIn(test_strings[i], doc.plain_text.lower())
        doc.delete()
def update_dockets_if_citation_case_name_changed(sender, instance, **kwargs):
    """Updates the docket.case_name field for all associated Dockets when the
    Citation.case_name field changes.

    - From http://stackoverflow.com/a/7934958/64911.

    There are a few alternative ways to implement this that don't hit the
    database an extra time (as this one does). However, those solutions are
    longer and more controversial, so I chose this one based on the fact that
    we rarely change objects once they are saved and the performance penalty
    is probably acceptable.
    """
    try:
        stored = Citation.objects.get(pk=instance.pk)
    except Citation.DoesNotExist:
        # Object is new; nothing to propagate.
        return

    if stored.case_name != instance.case_name:
        # The name changed: push it (and a fresh slug) to every docket.
        for d in stored.parent_documents.all():
            d.docket.case_name = instance.case_name
            d.docket.slug = trunc(slugify(instance.case_name), 50)
            d.docket.save()
def scrape_court(self, site, full_crawl=False):
    """Crawl one court site, ingesting any new opinions found.

    Downloads each item, de-duplicates by SHA1 (or by URL for Nevada's
    unpublished cases, whose content hashes differ on every fetch), saves
    the new documents, and schedules asynchronous content extraction.

    :param site: A parsed juriscraper-style site object.
    :param full_crawl: When True, ignore the duplicate thresholds and the
        stored site hash, crawling everything.
    """
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split('.')[-1].split('_')[0]
    court = Court.objects.get(pk=court_str)
    dup_checker = DupChecker(court, full_crawl=full_crawl)
    abort = dup_checker.abort_by_url_hash(site.url, site.hash)
    if not abort:
        if site.cookies:
            logger.info("Using cookies: %s" % site.cookies)
        for i in range(0, len(site.case_names)):
            msg, r = get_binary_content(site.download_urls[i], site.cookies,
                                        method=site.method)
            if msg:
                # Download failed; log it and move to the next item.
                logger.warn(msg)
                ErrorLog(log_level='WARNING', court=court,
                         message=msg).save()
                continue
            current_date = site.case_dates[i]
            try:
                next_date = site.case_dates[i + 1]
            except IndexError:
                # Last item in the list.
                next_date = None

            # Make a hash of the data
            sha1_hash = hashlib.sha1(r.content).hexdigest()
            if court_str == 'nev' and site.precedential_statuses[i] == 'Unpublished':
                # Nevada's non-precedential cases have different SHA1 sums
                # every time.
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    Document,
                    current_date,
                    next_date,
                    lookup_value=site.download_urls[i],
                    lookup_by='download_url')
            else:
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    Document,
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by='sha1')

            if onwards == 'CONTINUE':
                # It's a duplicate, but we haven't hit any thresholds yet.
                continue
            elif onwards == 'BREAK':
                # It's a duplicate, and we hit a date or dup_count threshold.
                dup_checker.update_site_hash(sha1_hash)
                break
            elif onwards == 'CARRY_ON':
                # Not a duplicate, carry on
                logger.info('Adding new document found at: %s' %
                            site.download_urls[i])
                dup_checker.reset()
                cite, docket, doc = self.associate_meta_data_to_objects(
                    site, i, court, sha1_hash)

                # Make and associate the file object
                try:
                    cf = ContentFile(r.content)
                    extension = get_extension(r.content)
                    # See bitbucket issue #215 for why this must be
                    # lower-cased.
                    file_name = trunc(site.case_names[i].lower(), 75) + extension
                    doc.local_path.save(file_name, cf, save=False)
                except:
                    msg = 'Unable to save binary to disk. Deleted document: % s.\n % s' % \
                        (site.case_names[i], traceback.format_exc())
                    logger.critical(msg)
                    ErrorLog(log_level='CRITICAL', court=court,
                             message=msg).save()
                    download_error = True
                    continue

                # Save everything, but don't update Solr index yet
                self.save_everything(cite, docket, doc, index=False)
                # Spread citation counting over the next hour to smooth load.
                random_delay = random.randint(0, 3600)
                extract_doc_content.delay(
                    doc.pk,
                    callback=subtask(extract_by_ocr),
                    citation_countdown=random_delay)

                logger.info("Successfully added doc {pk}: {name}".format(
                    pk=doc.pk, name=site.case_names[i]))

        # Update the hash if everything finishes properly.
        logger.info("%s: Successfully crawled opinions." % site.court_id)
        if not download_error and not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
def title(self, obj):
    """Feed title: cases citing the given object's citation."""
    cite_text = trunc(str(obj.citation), 50)
    return "Cases citing %s, ordered by filing date" % cite_text
def title(self, obj):
    """Feed title: cases citing the given object's citation."""
    case_name = trunc(obj.citation.case_name, 50)
    return "Cases Citing %s, Ordered by Filing Date" % case_name
def download_and_save():
    """This function is run in many threads simultaneously. Each thread
    runs so long as there are items in the queue. Once an item is found,
    it's downloaded and saved.

    The number of items that can be concurrently saved is determined by the
    number of threads that are running this function.

    Every code path through the loop must call queue.task_done() exactly
    once so that queue.join() can complete.
    """
    while True:
        item = queue.get()
        logger.info("Attempting to add item at: %s" % item['url'])
        try:
            msg, r = get_binary_content(item['url'], {})
        except:
            logger.info("Unable to get item at: %s" % item['url'])
            queue.task_done()
            # Bug fix: previously fell through and hit a NameError on
            # msg/r below.
            continue
        if msg:
            logger.warn(msg)
            queue.task_done()
            # Bug fix: previously fell through and processed the failed
            # download anyway.
            continue

        sha1_hash = hashlib.sha1(r.content).hexdigest()
        if Audio.objects.filter(sha1=sha1_hash).exists():
            # Simpsons did it! Try the next one.
            logger.info("Item already exists, moving to next item.")
            queue.task_done()
            continue

        # New item, onwards!
        logger.info('Adding new document found at: %s' % item['url'])
        audio_file = Audio(
            source='H',
            sha1=sha1_hash,
            case_name=item['case_name'],
            date_argued=item['date_argued'],
            download_url=item['url'],
            processing_complete=False,
        )
        if item['judges']:
            audio_file.judges = item['judges']
        if item['docket_number']:
            audio_file.docket_number = item['docket_number']
        court = Court.objects.get(pk=item['court_code'])
        docket = Docket(case_name=item['case_name'], court=court)

        # Make and associate the file object
        try:
            cf = ContentFile(r.content)
            extension = get_extension(r.content)
            if extension not in ['.mp3', '.wma']:
                # Fall back to the extension in the URL.
                extension = '.' + item['url'].rsplit('.', 1)[1]
            # See bitbucket issue #215 for why this must be lower-cased.
            file_name = trunc(item['case_name'].lower(), 75) + extension
            audio_file.local_path_original_file.save(file_name, cf,
                                                     save=False)
        except:
            msg = 'Unable to save binary to disk. Deleted document: % s.\n % s' % \
                (item['case_name'], traceback.format_exc())
            logger.critical(msg)
            queue.task_done()
            # Bug fix: previously went on to save the broken item anyway.
            continue

        docket.save()
        audio_file.docket = docket
        audio_file.save(index=False)
        # Spread MP3 processing over the next hour to smooth load.
        random_delay = random.randint(0, 3600)
        process_audio_file.apply_async((audio_file.pk,),
                                       countdown=random_delay)
        logger.info("Successfully added audio file %s: %s" %
                    (audio_file.pk, audio_file.case_name))
        # Bug fix: the success path previously never marked the task done,
        # which would hang any queue.join() caller.
        queue.task_done()
def scrape_court(self, site, full_crawl=False):
    """Crawl one court site, ingesting any new opinions found.

    Downloads each item, de-duplicates by SHA1 of the cleaned content (or by
    URL for Nevada's unpublished cases, whose hashes differ on every fetch),
    saves the new documents, and schedules asynchronous content extraction.

    :param site: A parsed juriscraper-style site object.
    :param full_crawl: When True, ignore the duplicate thresholds and the
        stored site hash, crawling everything.
    """
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split(".")[-1].split("_")[0]
    court = Court.objects.get(pk=court_str)
    dup_checker = DupChecker(court, full_crawl=full_crawl)
    abort = dup_checker.abort_by_url_hash(site.url, site.hash)
    if not abort:
        if site.cookies:
            logger.info("Using cookies: %s" % site.cookies)
        for i in range(0, len(site.case_names)):
            msg, r = get_binary_content(
                site.download_urls[i],
                site.cookies,
                site._get_adapter_instance(),
                method=site.method
            )
            if msg:
                # Download failed; log it and move to the next item.
                logger.warn(msg)
                ErrorLog(log_level="WARNING", court=court,
                         message=msg).save()
                continue
            content = site.cleanup_content(r.content)
            current_date = site.case_dates[i]
            try:
                next_date = site.case_dates[i + 1]
            except IndexError:
                # Last item in the list.
                next_date = None

            # Make a hash of the data. Unicode must be encoded before
            # hashing (Python 2).
            if isinstance(content, unicode):
                sha1_hash = hashlib.sha1(content.encode("utf-8")).hexdigest()
            else:
                sha1_hash = hashlib.sha1(content).hexdigest()
            if court_str == "nev" and site.precedential_statuses[i] == "Unpublished":
                # Nevada's non-precedential cases have different SHA1
                # sums every time.
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    Document,
                    current_date,
                    next_date,
                    lookup_value=site.download_urls[i],
                    lookup_by="download_url"
                )
            else:
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    Document,
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by="sha1"
                )

            if onwards == "CONTINUE":
                # It's a duplicate, but we haven't hit any thresholds yet.
                continue
            elif onwards == "BREAK":
                # It's a duplicate, and we hit a date or dup_count
                # threshold.
                dup_checker.update_site_hash(sha1_hash)
                break
            elif onwards == "CARRY_ON":
                # Not a duplicate, carry on
                logger.info("Adding new document found at: %s" %
                            site.download_urls[i].encode("utf-8"))
                dup_checker.reset()
                cite, docket, doc = self.associate_meta_data_to_objects(
                    site, i, court, sha1_hash)

                # Make and associate the file object
                try:
                    cf = ContentFile(content)
                    extension = get_extension(content)
                    # See bitbucket issue #215 for why this must be
                    # lower-cased.
                    file_name = trunc(site.case_names[i].lower(), 75) + extension
                    doc.local_path.save(file_name, cf, save=False)
                except:
                    # NOTE(review): the implicit string concatenation below
                    # ("...Deleted " "document...") looks like a
                    # line-wrapping artifact, but it is preserved verbatim.
                    msg = "Unable to save binary to disk. Deleted " "document: % s.\n % s" % (
                        site.case_names[i],
                        traceback.format_exc(),
                    )
                    logger.critical(msg.encode("utf-8"))
                    ErrorLog(log_level="CRITICAL", court=court,
                             message=msg).save()
                    download_error = True
                    continue

                # Save everything, but don't update Solr index yet
                self.save_everything(cite, docket, doc, index=False)
                # Spread citation counting over the next hour to smooth load.
                random_delay = random.randint(0, 3600)
                extract_doc_content.delay(
                    doc.pk,
                    callback=subtask(extract_by_ocr),
                    citation_countdown=random_delay)

                logger.info(
                    "Successfully added doc {pk}: {name}".format(
                        pk=doc.pk,
                        name=site.case_names[i].encode("utf-8"))
                )

        # Update the hash if everything finishes properly.
        logger.info("%s: Successfully crawled opinions." % site.court_id)
        if not download_error and not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
def scrape_court(site, full_crawl=False):
    """Crawl one juriscraper site object and ingest any new opinions.

    Older, module-level variant: builds the Citation and Document inline
    (rather than via a helper), saves them without indexing, then runs
    content extraction.

    :param site: parsed juriscraper site instance exposing parallel lists
        (case_names, case_dates, download_urls, precedential_statuses).
    :param full_crawl: when True, walk the whole site (DupChecker relaxes
        its thresholds) and skip the final site-hash update.
    """
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split('.')[-1].split('_')[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(site.court_id, full_crawl=full_crawl)
    # Bail out entirely if the page hash matches the last crawl.
    abort = dup_checker.abort_by_hash(site.hash)
    if not abort:
        for i in range(0, len(site.case_names)):
            msg, r = get_binary_content(site.download_urls[i],
                                        site._get_cookies())
            # NOTE(review): content is cleaned *before* the error check;
            # if msg is set, r may be an error response — confirm
            # get_binary_content's contract.
            clean_content = site._cleanup_content(r.content)
            if msg:
                # Download failed: log, record, and move to the next item.
                logger.warn(msg)
                ErrorLog(log_level='WARNING', court=court,
                         message=msg).save()
                continue

            current_date = site.case_dates[i]
            try:
                next_date = site.case_dates[i + 1]
            except IndexError:
                # Last item has no successor.
                next_date = None

            # Make a hash of the data. Need to convert unicode to binary
            # before hashing.
            # NOTE(review): isinstance(clean_content, unicode) would be
            # the idiomatic check here.
            if type(clean_content) == unicode:
                hash_content = clean_content.encode('utf-8')
            else:
                hash_content = clean_content
            sha1_hash = hashlib.sha1(hash_content).hexdigest()
            if court_str == 'nev' and \
                    site.precedential_statuses[i] == 'Unpublished':
                # Nevada's non-precedential cases have different SHA1
                # sums every time.
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    current_date,
                    next_date,
                    lookup_value=site.download_urls[i],
                    lookup_by='download_url'
                )
            else:
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by='sha1'
                )

            if onwards == 'CONTINUE':
                # It's a duplicate, but we haven't hit any thresholds yet.
                continue
            elif onwards == 'BREAK':
                # It's a duplicate, and we hit a date or dup_count
                # threshold.
                dup_checker.update_site_hash(sha1_hash)
                break
            elif onwards == 'CARRY_ON':
                # Not a duplicate, carry on
                logger.info('Adding new document found at: %s' %
                            site.download_urls[i])
                dup_checker.reset()

                # Make a citation
                cite = Citation(case_name=site.case_names[i])
                if site.docket_numbers:
                    cite.docket_number = site.docket_numbers[i]
                if site.neutral_citations:
                    cite.neutral_cite = site.neutral_citations[i]
                if site.west_citations:
                    cite.federal_cite_one = site.west_citations[i]
                if site.west_state_citations:
                    cite.west_state_cite = site.west_state_citations[i]

                # Make the document object
                doc = Document(source='C',
                               sha1=sha1_hash,
                               date_filed=site.case_dates[i],
                               court=court,
                               download_url=site.download_urls[i],
                               precedential_status=site.precedential_statuses[i])

                # Make and associate the file object
                try:
                    cf = ContentFile(clean_content)
                    extension = get_extension(r.content)
                    # See issue #215 for why this must be lower-cased.
                    file_name = trunc(site.case_names[i].lower(), 75) + \
                        extension
                    doc.local_path.save(file_name, cf, save=False)
                except:
                    # NOTE(review): bare except — failure is logged, item
                    # skipped, and the site-hash update suppressed.
                    msg = 'Unable to save binary to disk. Deleted document: % s.\n % s' % \
                          (cite.case_name, traceback.format_exc())
                    logger.critical(msg)
                    ErrorLog(log_level='CRITICAL', court=court,
                             message=msg).save()
                    download_error = True
                    continue

                if site.judges:
                    doc.judges = site.judges[i]
                if site.nature_of_suit:
                    doc.nature_of_suit = site.nature_of_suit[i]

                # Save everything, but don't update Solr index yet
                cite.save(index=False)
                doc.citation = cite
                doc.save(index=False)

                # Extract the contents.
                # NOTE(review): this is a direct (synchronous) call — no
                # .delay/.apply_async — despite the surrounding code's
                # async conventions; confirm whether that is intended.
                extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))

                logger.info("Successfully added doc %s: %s" %
                            (doc.pk, site.case_names[i]))

    # Update the hash if everything finishes properly.
    logger.info("%s: Successfully crawled." % site.court_id)
    if not download_error and not full_crawl:
        # Only update the hash if no errors occurred.
        dup_checker.update_site_hash(site.hash)
def test_trunc(self):
    """Does trunc give us the results we expect?"""
    s = 'Henry wants apple.'
    tests = (
        # Simple case
        {'length': 13, 'result': 'Henry wants'},
        # Off by one cases
        {'length': 4, 'result': 'Henr'},
        {'length': 5, 'result': 'Henry'},
        {'length': 6, 'result': 'Henry'},
        # Do we include the length of the ellipsis when measuring?
        {'length': 12, 'ellipsis': '...', 'result': 'Henry...'},
        # What happens when an alternate ellipsis is used instead?
        {'length': 15, 'ellipsis': '....', 'result': 'Henry wants....'},
        # Do we cut properly when no spaces are found?
        {'length': 2, 'result': 'He'},
        # Do we cut properly when ellipsizing if no spaces found?
        {'length': 6, 'ellipsis': '...', 'result': 'Hen...'},
        # Do we return the whole s when length >= s?
        {'length': 50, 'result': s},
    )
    for params in tests:
        expected = params['result']
        got = trunc(
            s=s,
            length=params['length'],
            ellipsis=params.get('ellipsis', None),
        )
        # Check the exact value...
        self.assertEqual(
            got,
            expected,
            msg='Failed with dict: %s.\n%s != %s' % (params, got, expected),
        )
        # ...and that the result never exceeds the requested length.
        self.assertTrue(
            len(got) <= params['length'],
            msg="Failed with dict: %s.\n%s is longer than %s" % (
                params, got, params['length']),
        )
def download_and_save():
    """Worker loop: pull items off the queue, download and save them.

    This function is run in many threads simultaneously. Each thread
    runs so long as there are items in the queue. Once an item is found,
    it's downloaded and saved. The number of items that can be
    concurrently saved is determined by the number of threads that are
    running this function.

    Every item pulled with queue.get() is matched by exactly one
    queue.task_done() call, on every exit path, so queue.join() can
    complete.
    """
    while True:
        item = queue.get()
        logger.info("Attempting to add item at: %s" % item['url'])
        try:
            msg, r = get_binary_content(item['url'], {})
        except:
            logger.info("Unable to get item at: %s" % item['url'])
            queue.task_done()
            # Bug fix: without this continue, execution fell through to
            # the code below with msg/r unbound (or stale from a prior
            # iteration) after the task was already marked done.
            continue
        if msg:
            logger.warn(msg)
            queue.task_done()
            # Bug fix: a failed download was marked done but then
            # processed anyway; skip it instead.
            continue

        sha1_hash = hashlib.sha1(r.content).hexdigest()
        if Audio.objects.filter(sha1=sha1_hash).exists():
            # Simpsons did it! Try the next one.
            logger.info("Item already exists, moving to next item.")
            queue.task_done()
        else:
            # New item, onwards!
            logger.info('Adding new document found at: %s' % item['url'])
            audio_file = Audio(
                source='H',
                sha1=sha1_hash,
                case_name=item['case_name'],
                date_argued=item['date_argued'],
                download_url=item['url'],
                processing_complete=False,
            )
            if item['judges']:
                audio_file.judges = item['judges']
            if item['docket_number']:
                audio_file.docket_number = item['docket_number']
            court = Court.objects.get(pk=item['court_code'])
            docket = Docket(
                case_name=item['case_name'],
                court=court,
            )

            # Make and associate the file object
            try:
                cf = ContentFile(r.content)
                extension = get_extension(r.content)
                if extension not in ['.mp3', '.wma']:
                    # Fall back to the extension in the URL.
                    extension = '.' + item['url'].rsplit('.', 1)[1]
                # See bitbucket issue #215 for why this must be
                # lower-cased.
                file_name = trunc(item['case_name'].lower(), 75) + extension
                audio_file.local_path_original_file.save(file_name, cf,
                                                         save=False)
            except:
                # De-mangled the broken multi-line string literal here.
                msg = 'Unable to save binary to disk. Deleted document: ' \
                      '%s.\n%s' % (item['case_name'],
                                   traceback.format_exc())
                logger.critical(msg)
                queue.task_done()
                # Bug fix: previously fell through and saved the docket
                # and audio file even though the binary failed to save.
                continue

            docket.save()
            audio_file.docket = docket
            audio_file.save(index=False)

            # Random 0-3600s countdown spreads post-processing load
            # over the following hour.
            random_delay = random.randint(0, 3600)
            process_audio_file.apply_async(
                (audio_file.pk,),
                countdown=random_delay
            )
            logger.info("Successfully added audio file %s: %s" % (
                audio_file.pk, audio_file.case_name))
            # Bug fix: successfully processed items must be marked done
            # too, or queue.join() never returns.
            queue.task_done()
# NOTE(review): this chunk begins mid-function — the enclosing `def`
# (an audio post-processing task operating on `af`) and the opening of
# the avconv_command list are not visible here.
    '48k', path_to_tmp_location
]
try:
    # Run the transcode, folding stderr into the captured output so a
    # failure can be reported in full below.
    _ = subprocess.check_output(avconv_command,
                                stderr=subprocess.STDOUT)
except subprocess.CalledProcessError, e:
    print 'avconv failed command: %s\nerror code: %s\noutput: %s\n' % \
        (avconv_command, e.returncode, e.output)
    print traceback.format_exc()
    raise

# Have to do this last because otherwise the mp3 hasn't yet been
# generated.
set_mp3_meta_data(af, path_to_tmp_location)
# Duration in seconds, read back from the freshly written mp3.
af.duration = eyed3.load(path_to_tmp_location).info.time_secs

with open(path_to_tmp_location, 'r') as mp3:
    try:
        cf = ContentFile(mp3.read())
        file_name = trunc(af.case_name.lower(), 72) + '_cl.mp3'
        af.local_path_mp3.save(file_name, cf, save=False)
    except:
        # NOTE(review): bare except — a failed save is logged but the
        # item is still marked processing_complete below.
        msg = "Unable to save mp3 to audio_file in scraper.tasks.process_" \
              "audio_file for item: %s\nTraceback:\n%s" % \
              (af.pk, traceback.format_exc())
        ErrorLog(log_level='CRITICAL', court=af.docket.court,
                 message=msg).save()

af.processing_complete = True
af.save()
# Remove the temp file now that its contents are stored on the model.
os.remove(path_to_tmp_location)
# NOTE(review): this chunk begins mid-function — the first line below is
# presumably the body of an `if item['judges']:` that is out of view.
    af.judges = item['judges']
if item['docket_number']:
    af.docket_number = item['docket_number']
court = Court.objects.get(pk=item['court_code'])
docket.court = court

# Fix the files. First save the location of the old files.
original_local_path = af.local_path_original_file.path
original_mp3_path = af.local_path_mp3.path

# Create a new file with the contents of the old and a corrected
# name. This is only in memory for the moment.
cf = ContentFile(af.local_path_original_file.read())
# Keep whatever extension the original file already had.
extension = '.' + af.local_path_original_file.path.rsplit('.', 1)[1]
file_name = trunc(item['case_name'].lower(), 75) + extension
af.local_path_original_file.save(file_name, cf, save=False)

# Create a new mp3 file with the new contents
cf = ContentFile(af.local_path_mp3.read())
file_name = trunc(af.case_name.lower(), 72) + '_cl.mp3'
af.local_path_mp3.save(file_name, cf, save=False)

# Save things so they can be referenced in a sec.
docket.save()
af.save(index=False)

# Update the ID3 information and duration data.
new_mp3_path = af.local_path_mp3.path
# NOTE(review): "mpr" below looks like a typo for "mp3" — runtime log
# string left untouched here; confirm and fix separately.
logger.info("Updating mpr at: %s" % new_mp3_path)
set_mp3_meta_data(af, new_mp3_path)
def scrape_court(self, site, full_crawl=False):
    """Crawl one juriscraper site object and ingest new oral arguments.

    Audio variant: hashes the cleaned content, lets the DupChecker skip
    known items, then builds docket/audio objects via
    associate_meta_data_to_objects, attaches the downloaded file, saves
    everything, and queues audio post-processing with a random delay.

    :param site: parsed juriscraper site instance exposing parallel lists
        (case_names, case_dates, download_urls).
    :param full_crawl: when True, walk the whole site (DupChecker relaxes
        its thresholds) and skip the final site-hash update.
    """
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split('.')[-1].split('_')[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(court, full_crawl=full_crawl)
    # Bail out entirely if the page hash matches the last crawl.
    abort = dup_checker.abort_by_url_hash(site.url, site.hash)
    if not abort:
        if site.cookies:
            logger.info("Using cookies: %s" % site.cookies)
        for i in range(0, len(site.case_names)):
            msg, r = get_binary_content(site.download_urls[i],
                                        site.cookies,
                                        site._get_adapter_instance(),
                                        method=site.method)
            if msg:
                # Download failed: log, record, and move to the next item.
                logger.warn(msg)
                ErrorLog(log_level='WARNING', court=court,
                         message=msg).save()
                continue
            content = site.cleanup_content(r.content)

            current_date = site.case_dates[i]
            try:
                next_date = site.case_dates[i + 1]
            except IndexError:
                # Last item has no successor.
                next_date = None

            sha1_hash = hashlib.sha1(content).hexdigest()
            onwards = dup_checker.should_we_continue_break_or_carry_on(
                Audio,
                current_date,
                next_date,
                lookup_value=sha1_hash,
                lookup_by='sha1')

            if onwards == 'CONTINUE':
                # It's a duplicate, but we haven't hit any thresholds yet.
                continue
            elif onwards == 'BREAK':
                # It's a duplicate, and we hit a date or dup_count
                # threshold.
                dup_checker.update_site_hash(sha1_hash)
                break
            elif onwards == 'CARRY_ON':
                # Not a duplicate, carry on
                logger.info('Adding new document found at: %s' %
                            site.download_urls[i])
                dup_checker.reset()

                docket, audio_file = self.associate_meta_data_to_objects(
                    site, i, court, sha1_hash)

                # Make and associate the file object
                try:
                    cf = ContentFile(content)
                    extension = get_extension(content)
                    if extension not in ['.mp3', '.wma']:
                        # Fall back to the extension in the URL.
                        extension = '.' + site.download_urls[i].rsplit(
                            '.', 1)[1]
                    # See bitbucket issue #215 for why this must be
                    # lower-cased.
                    file_name = trunc(site.case_names[i].lower(), 75) + \
                        extension
                    audio_file.local_path_original_file.save(file_name, cf,
                                                             save=False)
                except:
                    # NOTE(review): bare except — failure is logged, item
                    # skipped, and the site-hash update suppressed.
                    msg = 'Unable to save binary to disk. ' \
                          'Deleted document: % s.\n % s' % \
                          (site.case_names[i], traceback.format_exc())
                    logger.critical(msg)
                    ErrorLog(log_level='CRITICAL', court=court,
                             message=msg).save()
                    download_error = True
                    continue

                self.save_everything(docket, audio_file)
                # Random 0-3600s countdown — presumably spreads the audio
                # post-processing load over the following hour.
                random_delay = random.randint(0, 3600)
                process_audio_file.apply_async((audio_file.pk, ),
                                               countdown=random_delay)

                logger.info("Successfully added audio file %s: %s" %
                            (audio_file.pk, site.case_names[i]))

    # Update the hash if everything finishes properly.
    logger.info("%s: Successfully crawled oral arguments." % site.court_id)
    if not download_error and not full_crawl:
        # Only update the hash if no errors occurred.
        dup_checker.update_site_hash(site.hash)
def save(self, *args, **kwargs):
    """Refresh the slug from the case name, then save normally.

    The slug is the slugified case name, capped at 50 characters.
    """
    raw_slug = slugify(self.case_name)
    self.slug = trunc(raw_slug, 50)
    super(Docket, self).save(*args, **kwargs)
# NOTE(review): this chunk begins inside a `try:` whose opening line —
# and the enclosing function header — are not visible here.
    _ = subprocess.check_output(
        avconv_command,
        stderr=subprocess.STDOUT
    )
except subprocess.CalledProcessError, e:
    print 'avconv failed command: %s\nerror code: %s\noutput: %s\n' % \
        (avconv_command, e.returncode, e.output)
    print traceback.format_exc()
    raise

# Have to do this last because otherwise the mp3 hasn't yet been
# generated.
set_mp3_meta_data(af, path_to_tmp_location)
# Duration in seconds, read back from the freshly written mp3.
af.duration = eyed3.load(path_to_tmp_location).info.time_secs

with open(path_to_tmp_location, 'r') as mp3:
    try:
        cf = ContentFile(mp3.read())
        file_name = trunc(af.case_name.lower(), 72) + '_cl.mp3'
        af.local_path_mp3.save(file_name, cf, save=False)
    except:
        # NOTE(review): bare except — a failed save is logged but the
        # item is still marked processing_complete below.
        msg = "Unable to save mp3 to audio_file in scraper.tasks.process_" \
              "audio_file for item: %s\nTraceback:\n%s" % \
              (af.pk, traceback.format_exc())
        ErrorLog(log_level='CRITICAL', court=af.docket.court,
                 message=msg).save()

af.processing_complete = True
af.save()
# Remove the temp file now that its contents are stored on the model.
os.remove(path_to_tmp_location)
def title(self, obj):
    """Build the feed title for documents citing the given object."""
    cited_name = trunc(str(obj.citation.case_name), 50)
    return "Cases Citing %s, Ordered by Filing Date" % cited_name
def scrape_court(self, site, full_crawl=False):
    """Crawl one juriscraper site object and ingest new oral arguments.

    Audio variant without the adapter/content-cleanup steps: hashes the
    raw response body, lets the DupChecker skip known items, builds the
    docket/audio objects via associate_meta_data_to_objects, attaches the
    downloaded file, saves everything, and queues audio post-processing
    with a random delay.

    :param site: parsed juriscraper site instance exposing parallel lists
        (case_names, case_dates, download_urls).
    :param full_crawl: when True, walk the whole site (DupChecker relaxes
        its thresholds) and skip the final site-hash update.
    """
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split('.')[-1].split('_')[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(court, full_crawl=full_crawl)
    # Bail out entirely if the page hash matches the last crawl.
    abort = dup_checker.abort_by_url_hash(site.url, site.hash)
    if not abort:
        if site.cookies:
            logger.info("Using cookies: %s" % site.cookies)
        for i in range(0, len(site.case_names)):
            msg, r = get_binary_content(
                site.download_urls[i],
                site.cookies,
                method=site.method
            )
            if msg:
                # Download failed: log, record, and move to the next item.
                logger.warn(msg)
                ErrorLog(log_level='WARNING', court=court,
                         message=msg).save()
                continue

            current_date = site.case_dates[i]
            try:
                next_date = site.case_dates[i + 1]
            except IndexError:
                # Last item has no successor.
                next_date = None

            sha1_hash = hashlib.sha1(r.content).hexdigest()
            onwards = dup_checker.should_we_continue_break_or_carry_on(
                Audio,
                current_date,
                next_date,
                lookup_value=sha1_hash,
                lookup_by='sha1'
            )

            if onwards == 'CONTINUE':
                # It's a duplicate, but we haven't hit any thresholds yet.
                continue
            elif onwards == 'BREAK':
                # It's a duplicate, and we hit a date or dup_count
                # threshold.
                dup_checker.update_site_hash(sha1_hash)
                break
            elif onwards == 'CARRY_ON':
                # Not a duplicate, carry on
                logger.info('Adding new document found at: %s' %
                            site.download_urls[i])
                dup_checker.reset()

                docket, audio_file = self.associate_meta_data_to_objects(
                    site, i, court, sha1_hash)
                audio_file.docket = docket

                # Make and associate the file object
                try:
                    cf = ContentFile(r.content)
                    extension = get_extension(r.content)
                    if extension not in ['.mp3', '.wma']:
                        # Fall back to the extension in the URL.
                        extension = '.' + site.download_urls[i].rsplit(
                            '.', 1)[1]
                    # See bitbucket issue #215 for why this must be
                    # lower-cased.
                    file_name = trunc(site.case_names[i].lower(), 75) + \
                        extension
                    audio_file.local_path_original_file.save(file_name, cf,
                                                             save=False)
                except:
                    # NOTE(review): bare except — failure is logged, item
                    # skipped, and the site-hash update suppressed.
                    msg = 'Unable to save binary to disk. ' \
                          'Deleted document: % s.\n % s' % \
                          (site.case_names[i], traceback.format_exc())
                    logger.critical(msg)
                    ErrorLog(log_level='CRITICAL', court=court,
                             message=msg).save()
                    download_error = True
                    continue

                self.save_everything(docket, audio_file)
                # Random 0-3600s countdown — presumably spreads the audio
                # post-processing load over the following hour.
                random_delay = random.randint(0, 3600)
                process_audio_file.apply_async(
                    (audio_file.pk,),
                    countdown=random_delay
                )

                logger.info("Successfully added audio file %s: %s" %
                            (audio_file.pk, site.case_names[i]))

    # Update the hash if everything finishes properly.
    logger.info("%s: Successfully crawled oral arguments." % site.court_id)
    if not download_error and not full_crawl:
        # Only update the hash if no errors occurred.
        dup_checker.update_site_hash(site.hash)
# - constant bit rate (sample resolution) of 48kbps (-ab 48k) avconv_command = ['avconv', '-i', path_to_original, '-ac', '1', '-ar', '22050', '-ab', '48k', path_to_tmp_location] try: output = subprocess.check_output(avconv_command, stderr=subprocess.STDOUT) except subprocess.CalledProcessError, e: print 'avconv failed command: %s\nerror code: %s\noutput: %s\n' % \ (avconv_command, e.returncode, e.output) print traceback.format_exc() raise # Have to do this last because otherwise the mp3 hasn't yet been generated. file_name = trunc(audio_file.case_name.lower(), 72) + '_cl.mp3' set_mp3_meta_data(audio_file, path_to_tmp_location) audio_file.duration = eyed3.load(path_to_tmp_location).info.time_secs with open(path_to_tmp_location, 'r') as mp3: try: cf = ContentFile(mp3.read()) audio_file.local_path_mp3.save(file_name, cf, save=False) except: msg = "Unable to save mp3 to audio_file in scraper.tasks.process_" \ "audio_file for item: %s\nTraceback:\n%s" % \ (audio_file.pk, traceback.format_exc()) ErrorLog(log_level='CRITICAL', court=audio_file.docket.court, message=msg).save()