def test_should_we_continue_break_or_carry_on_with_a_dup_found(self):
    # Set the dup_threshold to zero for this test
    self.dup_checkers = [
        DupChecker(self.court, full_crawl=True, dup_threshold=0),
        DupChecker(self.court, full_crawl=False, dup_threshold=0)
    ]
    content = "this is dummy content that we hash"
    content_hash = hashlib.sha1(content).hexdigest()
    for dup_checker in self.dup_checkers:
        # Create a document, then use the dup_checker to see if it exists.
        docket = Docket(court=self.court)
        docket.save()
        doc = Document(sha1=content_hash, docket=docket)
        doc.save(index=False)
        onwards = dup_checker.should_we_continue_break_or_carry_on(
            Document,
            now(),
            now(),
            lookup_value=content_hash,
            lookup_by='sha1')
        if dup_checker.full_crawl:
            self.assertEqual(
                onwards,
                'CONTINUE',
                'DupChecker says to %s during a full crawl.' % onwards)
        else:
            self.assertEqual(
                onwards,
                'BREAK',
                "DupChecker says to %s but there should be a duplicate in "
                "the database. dup_count is %s, and dup_threshold is %s" %
                (onwards, dup_checker.dup_count, dup_checker.dup_threshold))
        doc.delete()
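# A minimal sketch (not CourtListener's actual DupChecker) of the decision
# protocol the test above exercises. It ignores the date arguments, which
# the real class also uses, and the default dup_threshold of 5 is
# illustrative only. 'CARRY_ON' means the item is new, 'CONTINUE' means a
# duplicate was seen but no threshold was hit (always the case during a
# full crawl), and 'BREAK' means a duplicate pushed us over the threshold
# and crawling should stop.
class DupCheckerSketch(object):
    def __init__(self, court, full_crawl=False, dup_threshold=5):
        self.court = court
        self.full_crawl = full_crawl
        self.dup_threshold = dup_threshold
        self.dup_count = 0

    def reset(self):
        # A fresh, non-duplicate hit clears the running dup counter.
        self.dup_count = 0

    def should_we_continue_break_or_carry_on(self, model, current_date,
                                             next_date, lookup_value,
                                             lookup_by):
        # Look the value up on any Django model, e.g. lookup_by='sha1'.
        exists = model.objects.filter(**{lookup_by: lookup_value}).exists()
        if not exists:
            return 'CARRY_ON'
        self.dup_count += 1
        if self.full_crawl or self.dup_count <= self.dup_threshold:
            return 'CONTINUE'
        return 'BREAK'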
def scrape_court(self, site, full_crawl=False):
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split('.')[-1].split('_')[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(court, full_crawl=full_crawl)
    abort = dup_checker.abort_by_url_hash(site.url, site.hash)
    if not abort:
        if site.cookies:
            logger.info("Using cookies: %s" % site.cookies)
        for i in range(0, len(site.case_names)):
            msg, r = get_binary_content(
                site.download_urls[i],
                site.cookies,
                method=site.method
            )
            if msg:
                logger.warn(msg)
                ErrorLog(log_level='WARNING',
                         court=court,
                         message=msg).save()
                continue

            current_date = site.case_dates[i]
            try:
                next_date = site.case_dates[i + 1]
            except IndexError:
                next_date = None

            sha1_hash = hashlib.sha1(r.content).hexdigest()
            onwards = dup_checker.should_we_continue_break_or_carry_on(
                Audio,
                current_date,
                next_date,
                lookup_value=sha1_hash,
                lookup_by='sha1'
            )

            if onwards == 'CONTINUE':
                # It's a duplicate, but we haven't hit any thresholds yet.
                continue
            elif onwards == 'BREAK':
                # It's a duplicate, and we hit a date or dup_count threshold.
                dup_checker.update_site_hash(sha1_hash)
                break
            elif onwards == 'CARRY_ON':
                # Not a duplicate, carry on
                logger.info('Adding new document found at: %s' %
                            site.download_urls[i])
                dup_checker.reset()

                docket, audio_file = self.associate_meta_data_to_objects(
                    site, i, court, sha1_hash)
                audio_file.docket = docket

                # Make and associate the file object
                try:
                    cf = ContentFile(r.content)
                    extension = get_extension(r.content)
                    if extension not in ['.mp3', '.wma']:
                        extension = '.' + site.download_urls[i].rsplit('.', 1)[1]
                    # See bitbucket issue #215 for why this must be
                    # lower-cased.
                    file_name = trunc(site.case_names[i].lower(), 75) + extension
                    audio_file.local_path_original_file.save(file_name, cf,
                                                             save=False)
                except:
                    msg = 'Unable to save binary to disk. Deleted document: %s.\n%s' % \
                          (site.case_names[i], traceback.format_exc())
                    logger.critical(msg)
                    ErrorLog(log_level='CRITICAL',
                             court=court,
                             message=msg).save()
                    download_error = True
                    continue

                self.save_everything(docket, audio_file)
                random_delay = random.randint(0, 3600)
                process_audio_file.apply_async(
                    (audio_file.pk,),
                    countdown=random_delay
                )

                logger.info("Successfully added audio file %s: %s" %
                            (audio_file.pk, site.case_names[i]))

        # Update the hash if everything finishes properly.
        logger.info("%s: Successfully crawled oral arguments." % site.court_id)
        if not download_error and not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
def scrape_court(self, site, full_crawl=False):
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split('.')[-1].split('_')[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(court, full_crawl=full_crawl)
    abort = dup_checker.abort_by_url_hash(site.url, site.hash)
    if not abort:
        if site.cookies:
            logger.info("Using cookies: %s" % site.cookies)
        for i in range(0, len(site.case_names)):
            msg, r = get_binary_content(site.download_urls[i],
                                        site.cookies,
                                        method=site.method)
            if msg:
                logger.warn(msg)
                ErrorLog(log_level='WARNING',
                         court=court,
                         message=msg).save()
                continue

            current_date = site.case_dates[i]
            try:
                next_date = site.case_dates[i + 1]
            except IndexError:
                next_date = None

            # Make a hash of the data
            sha1_hash = hashlib.sha1(r.content).hexdigest()
            if court_str == 'nev' and \
                    site.precedential_statuses[i] == 'Unpublished':
                # Nevada's non-precedential cases have different SHA1
                # sums every time.
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    Document,
                    current_date,
                    next_date,
                    lookup_value=site.download_urls[i],
                    lookup_by='download_url')
            else:
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    Document,
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by='sha1')

            if onwards == 'CONTINUE':
                # It's a duplicate, but we haven't hit any thresholds yet.
                continue
            elif onwards == 'BREAK':
                # It's a duplicate, and we hit a date or dup_count threshold.
                dup_checker.update_site_hash(sha1_hash)
                break
            elif onwards == 'CARRY_ON':
                # Not a duplicate, carry on
                logger.info('Adding new document found at: %s' %
                            site.download_urls[i])
                dup_checker.reset()

                cite, docket, doc = self.associate_meta_data_to_objects(
                    site, i, court, sha1_hash)

                # Make and associate the file object
                try:
                    cf = ContentFile(r.content)
                    extension = get_extension(r.content)
                    # See bitbucket issue #215 for why this must be
                    # lower-cased.
                    file_name = trunc(site.case_names[i].lower(), 75) + extension
                    doc.local_path.save(file_name, cf, save=False)
                except:
                    msg = 'Unable to save binary to disk. Deleted document: %s.\n%s' % \
                          (site.case_names[i], traceback.format_exc())
                    logger.critical(msg)
                    ErrorLog(log_level='CRITICAL',
                             court=court,
                             message=msg).save()
                    download_error = True
                    continue

                # Save everything, but don't update Solr index yet
                self.save_everything(cite, docket, doc, index=False)
                random_delay = random.randint(0, 3600)
                extract_doc_content.delay(
                    doc.pk,
                    callback=subtask(extract_by_ocr),
                    citation_countdown=random_delay)

                logger.info("Successfully added doc {pk}: {name}".format(
                    pk=doc.pk,
                    name=site.case_names[i]))

        # Update the hash if everything finishes properly.
        logger.info("%s: Successfully crawled opinions." % site.court_id)
        if not download_error and not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
def scrape_court(self, site, full_crawl=False):
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split(".")[-1].split("_")[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(court, full_crawl=full_crawl)
    abort = dup_checker.abort_by_url_hash(site.url, site.hash)
    if not abort:
        if site.cookies:
            logger.info("Using cookies: %s" % site.cookies)
        for i in range(0, len(site.case_names)):
            msg, r = get_binary_content(
                site.download_urls[i],
                site.cookies,
                site._get_adapter_instance(),
                method=site.method
            )
            if msg:
                logger.warn(msg)
                ErrorLog(log_level="WARNING", court=court, message=msg).save()
                continue
            content = site.cleanup_content(r.content)

            current_date = site.case_dates[i]
            try:
                next_date = site.case_dates[i + 1]
            except IndexError:
                next_date = None

            # Make a hash of the data
            if isinstance(content, unicode):
                sha1_hash = hashlib.sha1(content.encode("utf-8")).hexdigest()
            else:
                sha1_hash = hashlib.sha1(content).hexdigest()
            if court_str == "nev" and site.precedential_statuses[i] == "Unpublished":
                # Nevada's non-precedential cases have different SHA1
                # sums every time.
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    Document,
                    current_date,
                    next_date,
                    lookup_value=site.download_urls[i],
                    lookup_by="download_url"
                )
            else:
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    Document,
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by="sha1"
                )

            if onwards == "CONTINUE":
                # It's a duplicate, but we haven't hit any thresholds yet.
                continue
            elif onwards == "BREAK":
                # It's a duplicate, and we hit a date or dup_count
                # threshold.
                dup_checker.update_site_hash(sha1_hash)
                break
            elif onwards == "CARRY_ON":
                # Not a duplicate, carry on
                logger.info("Adding new document found at: %s" %
                            site.download_urls[i].encode("utf-8"))
                dup_checker.reset()

                cite, docket, doc = self.associate_meta_data_to_objects(
                    site, i, court, sha1_hash)

                # Make and associate the file object
                try:
                    cf = ContentFile(content)
                    extension = get_extension(content)
                    # See bitbucket issue #215 for why this must be
                    # lower-cased.
                    file_name = trunc(site.case_names[i].lower(), 75) + extension
                    doc.local_path.save(file_name, cf, save=False)
                except:
                    msg = ("Unable to save binary to disk. Deleted "
                           "document: %s.\n%s" %
                           (site.case_names[i], traceback.format_exc()))
                    logger.critical(msg.encode("utf-8"))
                    ErrorLog(log_level="CRITICAL", court=court, message=msg).save()
                    download_error = True
                    continue

                # Save everything, but don't update Solr index yet
                self.save_everything(cite, docket, doc, index=False)
                random_delay = random.randint(0, 3600)
                extract_doc_content.delay(
                    doc.pk,
                    callback=subtask(extract_by_ocr),
                    citation_countdown=random_delay
                )

                logger.info(
                    "Successfully added doc {pk}: {name}".format(
                        pk=doc.pk,
                        name=site.case_names[i].encode("utf-8"))
                )

        # Update the hash if everything finishes properly.
        logger.info("%s: Successfully crawled opinions." % site.court_id)
        if not download_error and not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
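# A short Python 2 illustration of why the isinstance(content, unicode)
# branch above is needed: hashlib.sha1() wants bytes, and a unicode string
# containing non-ASCII characters raises UnicodeEncodeError when it is
# implicitly coerced to ASCII. Encoding to UTF-8 first avoids the crash
# and keeps the hash deterministic.
import hashlib

content = u'caf\xe9'  # unicode content with a non-ASCII character
try:
    hashlib.sha1(content).hexdigest()
except UnicodeEncodeError:
    pass  # the implicit ASCII coercion fails
print(hashlib.sha1(content.encode('utf-8')).hexdigest())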
def scrape_court(self, site, full_crawl=False):
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split('.')[-1].split('_')[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(court, full_crawl=full_crawl)
    abort = dup_checker.abort_by_url_hash(site.url, site.hash)
    if not abort:
        if site.cookies:
            logger.info("Using cookies: %s" % site.cookies)
        for i in range(0, len(site.case_names)):
            msg, r = get_binary_content(site.download_urls[i],
                                        site.cookies,
                                        site._get_adapter_instance(),
                                        method=site.method)
            if msg:
                logger.warn(msg)
                ErrorLog(log_level='WARNING',
                         court=court,
                         message=msg).save()
                continue
            content = site.cleanup_content(r.content)

            current_date = site.case_dates[i]
            try:
                next_date = site.case_dates[i + 1]
            except IndexError:
                next_date = None

            sha1_hash = hashlib.sha1(content).hexdigest()
            onwards = dup_checker.should_we_continue_break_or_carry_on(
                Audio,
                current_date,
                next_date,
                lookup_value=sha1_hash,
                lookup_by='sha1')

            if onwards == 'CONTINUE':
                # It's a duplicate, but we haven't hit any thresholds yet.
                continue
            elif onwards == 'BREAK':
                # It's a duplicate, and we hit a date or dup_count threshold.
                dup_checker.update_site_hash(sha1_hash)
                break
            elif onwards == 'CARRY_ON':
                # Not a duplicate, carry on
                logger.info('Adding new document found at: %s' %
                            site.download_urls[i])
                dup_checker.reset()

                docket, audio_file = self.associate_meta_data_to_objects(
                    site, i, court, sha1_hash)

                # Make and associate the file object
                try:
                    cf = ContentFile(content)
                    extension = get_extension(content)
                    if extension not in ['.mp3', '.wma']:
                        extension = '.' + site.download_urls[i].rsplit(
                            '.', 1)[1]
                    # See bitbucket issue #215 for why this must be
                    # lower-cased.
                    file_name = trunc(site.case_names[i].lower(), 75) + extension
                    audio_file.local_path_original_file.save(file_name, cf,
                                                             save=False)
                except:
                    msg = 'Unable to save binary to disk. Deleted document: %s.\n%s' % \
                          (site.case_names[i], traceback.format_exc())
                    logger.critical(msg)
                    ErrorLog(log_level='CRITICAL',
                             court=court,
                             message=msg).save()
                    download_error = True
                    continue

                self.save_everything(docket, audio_file)
                random_delay = random.randint(0, 3600)
                process_audio_file.apply_async((audio_file.pk,),
                                               countdown=random_delay)

                logger.info("Successfully added audio file %s: %s" %
                            (audio_file.pk, site.case_names[i]))

        # Update the hash if everything finishes properly.
        logger.info("%s: Successfully crawled oral arguments." % site.court_id)
        if not download_error and not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
def setUp(self):
    self.court = Court.objects.get(pk='test')
    self.dup_checkers = [
        DupChecker(self.court, full_crawl=True),
        DupChecker(self.court, full_crawl=False)
    ]
def scrape_court(site, full_crawl=False):
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split('.')[-1].split('_')[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(site.court_id, full_crawl=full_crawl)
    abort = dup_checker.abort_by_hash(site.hash)
    if not abort:
        for i in range(0, len(site.case_names)):
            msg, r = get_binary_content(site.download_urls[i],
                                        site._get_cookies())
            if msg:
                logger.warn(msg)
                ErrorLog(log_level='WARNING',
                         court=court,
                         message=msg).save()
                continue
            clean_content = site._cleanup_content(r.content)

            current_date = site.case_dates[i]
            try:
                next_date = site.case_dates[i + 1]
            except IndexError:
                next_date = None

            # Make a hash of the data. Need to convert unicode to binary
            # before hashing.
            if type(clean_content) == unicode:
                hash_content = clean_content.encode('utf-8')
            else:
                hash_content = clean_content
            sha1_hash = hashlib.sha1(hash_content).hexdigest()
            if court_str == 'nev' and \
                    site.precedential_statuses[i] == 'Unpublished':
                # Nevada's non-precedential cases have different SHA1
                # sums every time.
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    current_date,
                    next_date,
                    lookup_value=site.download_urls[i],
                    lookup_by='download_url'
                )
            else:
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by='sha1'
                )

            if onwards == 'CONTINUE':
                # It's a duplicate, but we haven't hit any thresholds yet.
                continue
            elif onwards == 'BREAK':
                # It's a duplicate, and we hit a date or dup_count threshold.
                dup_checker.update_site_hash(sha1_hash)
                break
            elif onwards == 'CARRY_ON':
                # Not a duplicate, carry on
                logger.info('Adding new document found at: %s' %
                            site.download_urls[i])
                dup_checker.reset()

                # Make a citation
                cite = Citation(case_name=site.case_names[i])
                if site.docket_numbers:
                    cite.docket_number = site.docket_numbers[i]
                if site.neutral_citations:
                    cite.neutral_cite = site.neutral_citations[i]
                if site.west_citations:
                    cite.federal_cite_one = site.west_citations[i]
                if site.west_state_citations:
                    cite.west_state_cite = site.west_state_citations[i]

                # Make the document object
                doc = Document(source='C',
                               sha1=sha1_hash,
                               date_filed=site.case_dates[i],
                               court=court,
                               download_url=site.download_urls[i],
                               precedential_status=site.precedential_statuses[i])

                # Make and associate the file object
                try:
                    cf = ContentFile(clean_content)
                    extension = get_extension(r.content)
                    # See issue #215 for why this must be lower-cased.
                    file_name = trunc(site.case_names[i].lower(), 75) + extension
                    doc.local_path.save(file_name, cf, save=False)
                except:
                    msg = 'Unable to save binary to disk. Deleted document: %s.\n%s' % \
                          (cite.case_name, traceback.format_exc())
                    logger.critical(msg)
                    ErrorLog(log_level='CRITICAL',
                             court=court,
                             message=msg).save()
                    download_error = True
                    continue

                if site.judges:
                    doc.judges = site.judges[i]
                if site.nature_of_suit:
                    doc.nature_of_suit = site.nature_of_suit[i]

                # Save everything, but don't update Solr index yet
                cite.save(index=False)
                doc.citation = cite
                doc.save(index=False)

                # Extract the contents asynchronously.
                extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))

                logger.info("Successfully added doc %s: %s" %
                            (doc.pk, site.case_names[i]))

        # Update the hash if everything finishes properly.
        logger.info("%s: Successfully crawled." % site.court_id)
        if not download_error and not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
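# A hedged sketch of how a caller might drive scrape_court: juriscraper
# scraper modules expose a Site class whose parse() method populates the
# case_names, case_dates, download_urls, etc. lists the function above
# iterates over. The module chosen here is just an example; the real
# project loops over many court modules rather than hard-coding one.
from juriscraper.opinions.united_states.federal_appellate import ca9

site = ca9.Site()
site.parse()
scrape_court(site, full_crawl=False)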