def scrape_court(self, site, full_crawl=False):
    """Download the items listed on ``site`` and save the new ones as audio.

    The court is looked up from the last dotted component of
    ``site.court_id``, keeping only the part before the first underscore.
    Nothing is done when the duplicate checker decides, from the site URL
    and hash, that the crawl should be aborted. Otherwise every item is
    downloaded, hashed with SHA1 and run through the duplicate checker; new
    items are saved and queued for ``process_audio_file`` with a random
    ``countdown`` between 0 and 3600.

    :param site: Scraper object providing ``court_id``, ``url``, ``hash``,
        ``cookies``, ``method`` and the index-aligned sequences
        ``case_names``, ``download_urls`` and ``case_dates``.
    :param full_crawl: Forwarded to the duplicate checker. When true, the
        stored site hash is not updated at the end of the crawl.
    """
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split('.')[-1].split('_')[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(court, full_crawl=full_crawl)
    if dup_checker.abort_by_url_hash(site.url, site.hash):
        return

    if site.cookies:
        logger.info("Using cookies: %s" % site.cookies)

    for i in range(len(site.case_names)):
        msg, r = get_binary_content(
            site.download_urls[i],
            site.cookies,
            method=site.method
        )
        if msg:
            # A truthy message is treated as a failed download: record it
            # and move on to the next item.
            logger.warning(msg)
            ErrorLog(log_level='WARNING', court=court, message=msg).save()
            continue

        current_date = site.case_dates[i]
        try:
            next_date = site.case_dates[i + 1]
        except IndexError:
            # Last item in the list: there is no following date.
            next_date = None

        sha1_hash = hashlib.sha1(r.content).hexdigest()
        onwards = dup_checker.should_we_continue_break_or_carry_on(
            Audio,
            current_date,
            next_date,
            lookup_value=sha1_hash,
            lookup_by='sha1'
        )

        if onwards == 'BREAK':
            # It's a duplicate, and we hit a date or dup_count threshold.
            dup_checker.update_site_hash(sha1_hash)
            break
        if onwards != 'CARRY_ON':
            # 'CONTINUE': it's a duplicate, but we haven't hit any
            # thresholds yet. Any other answer is skipped as well.
            continue

        # Not a duplicate, carry on
        logger.info('Adding new document found at: %s' %
                    site.download_urls[i])
        dup_checker.reset()

        docket, audio_file = self.associate_meta_data_to_objects(
            site, i, court, sha1_hash)
        audio_file.docket = docket

        # Make and associate the file object
        try:
            cf = ContentFile(r.content)
            extension = get_extension(r.content)
            if extension not in ['.mp3', '.wma']:
                # Fall back to whatever follows the last dot in the URL.
                extension = '.' + site.download_urls[i].rsplit('.', 1)[1]
            # See bitbucket issue #215 for why this must be
            # lower-cased.
            file_name = trunc(site.case_names[i].lower(), 75) + extension
            audio_file.local_path_original_file.save(file_name, cf,
                                                     save=False)
        except Exception:
            # Best effort: log the failure and keep crawling, but let
            # KeyboardInterrupt/SystemExit propagate.
            msg = 'Unable to save binary to disk. Deleted document: % s.\n % s' % \
                  (site.case_names[i], traceback.format_exc())
            logger.critical(msg)
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            download_error = True
            continue

        self.save_everything(docket, audio_file)
        random_delay = random.randint(0, 3600)
        process_audio_file.apply_async(
            (audio_file.pk,),
            countdown=random_delay
        )

        logger.info("Successfully added audio file %s: %s" %
                    (audio_file.pk, site.case_names[i]))

    # Update the hash if everything finishes properly.
    logger.info("%s: Successfully crawled oral arguments." % site.court_id)
    if not download_error and not full_crawl:
        # Only update the hash if no errors occurred.
        dup_checker.update_site_hash(site.hash)
def scrape_court(self, site, full_crawl=False):
    """Download the items listed on ``site`` and save the new ones as audio.

    The court is looked up from the last dotted component of
    ``site.court_id``, keeping only the part before the first underscore.
    Nothing is done when the duplicate checker decides, from the site URL
    and hash, that the crawl should be aborted. Otherwise every item is
    downloaded through the site's adapter instance, passed through
    ``site.cleanup_content``, hashed with SHA1 and run through the duplicate
    checker; new items are saved and queued for ``process_audio_file`` with
    a random ``countdown`` between 0 and 3600.

    :param site: Scraper object providing ``court_id``, ``url``, ``hash``,
        ``cookies``, ``method``, ``cleanup_content``,
        ``_get_adapter_instance`` and the index-aligned sequences
        ``case_names``, ``download_urls`` and ``case_dates``.
    :param full_crawl: Forwarded to the duplicate checker. When true, the
        stored site hash is not updated at the end of the crawl.
    """
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split('.')[-1].split('_')[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(court, full_crawl=full_crawl)
    if dup_checker.abort_by_url_hash(site.url, site.hash):
        return

    if site.cookies:
        logger.info("Using cookies: %s" % site.cookies)

    for i in range(len(site.case_names)):
        msg, r = get_binary_content(
            site.download_urls[i],
            site.cookies,
            site._get_adapter_instance(),
            method=site.method
        )
        if msg:
            # A truthy message is treated as a failed download: record it
            # and move on to the next item.
            logger.warning(msg)
            ErrorLog(log_level='WARNING', court=court, message=msg).save()
            continue

        # Everything below (hashing, type sniffing, saving) works on the
        # cleaned-up content rather than the raw response body.
        content = site.cleanup_content(r.content)

        current_date = site.case_dates[i]
        try:
            next_date = site.case_dates[i + 1]
        except IndexError:
            # Last item in the list: there is no following date.
            next_date = None

        sha1_hash = hashlib.sha1(content).hexdigest()
        onwards = dup_checker.should_we_continue_break_or_carry_on(
            Audio,
            current_date,
            next_date,
            lookup_value=sha1_hash,
            lookup_by='sha1'
        )

        if onwards == 'BREAK':
            # It's a duplicate, and we hit a date or dup_count threshold.
            dup_checker.update_site_hash(sha1_hash)
            break
        if onwards != 'CARRY_ON':
            # 'CONTINUE': it's a duplicate, but we haven't hit any
            # thresholds yet. Any other answer is skipped as well.
            continue

        # Not a duplicate, carry on
        logger.info('Adding new document found at: %s' %
                    site.download_urls[i])
        dup_checker.reset()

        docket, audio_file = self.associate_meta_data_to_objects(
            site, i, court, sha1_hash)

        # Make and associate the file object
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            if extension not in ['.mp3', '.wma']:
                # Fall back to whatever follows the last dot in the URL.
                extension = '.' + site.download_urls[i].rsplit('.', 1)[1]
            # See bitbucket issue #215 for why this must be
            # lower-cased.
            file_name = trunc(site.case_names[i].lower(), 75) + extension
            audio_file.local_path_original_file.save(file_name, cf,
                                                     save=False)
        except Exception:
            # Best effort: log the failure and keep crawling, but let
            # KeyboardInterrupt/SystemExit propagate.
            msg = 'Unable to save binary to disk. Deleted document: % s.\n % s' % \
                  (site.case_names[i], traceback.format_exc())
            logger.critical(msg)
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            download_error = True
            continue

        self.save_everything(docket, audio_file)
        random_delay = random.randint(0, 3600)
        process_audio_file.apply_async((audio_file.pk, ),
                                       countdown=random_delay)

        logger.info("Successfully added audio file %s: %s" %
                    (audio_file.pk, site.case_names[i]))

    # Update the hash if everything finishes properly.
    logger.info("%s: Successfully crawled oral arguments." % site.court_id)
    if not download_error and not full_crawl:
        # Only update the hash if no errors occurred.
        dup_checker.update_site_hash(site.hash)
def download_and_save():
    """This function is run in many threads simultaneously. Each thread runs
    so long as there are items in the queue. Once an item is found, it's
    downloaded and saved.

    The number of items that can be concurrently saved is determined by the
    number of threads that are running this function.

    Every item taken from the queue is marked done exactly once, whether it
    was saved, skipped as a duplicate, or failed, so the number of
    ``task_done()`` calls always matches the number of items fetched. An
    item that cannot be downloaded or written to disk is logged and skipped;
    no ``Docket`` or ``Audio`` row is saved for it.
    """
    while True:
        item = queue.get()
        try:
            logger.info("Attempting to add item at: %s" % item['url'])
            try:
                msg, r = get_binary_content(
                    item['url'],
                    {},
                )
            except Exception:
                logger.info("Unable to get item at: %s" % item['url'])
                continue
            if msg:
                # A truthy message is treated as a failed download, so
                # the response must not be used.
                logger.warning(msg)
                continue

            sha1_hash = hashlib.sha1(r.content).hexdigest()
            if Audio.objects.filter(sha1=sha1_hash).exists():
                # Simpsons did it! Try the next one.
                logger.info("Item already exists, moving to next item.")
                continue

            # New item, onwards!
            logger.info('Adding new document found at: %s' % item['url'])
            audio_file = Audio(
                source='H',
                sha1=sha1_hash,
                case_name=item['case_name'],
                date_argued=item['date_argued'],
                download_url=item['url'],
                processing_complete=False,
            )
            if item['judges']:
                audio_file.judges = item['judges']
            if item['docket_number']:
                audio_file.docket_number = item['docket_number']

            court = Court.objects.get(pk=item['court_code'])
            docket = Docket(
                case_name=item['case_name'],
                court=court,
            )

            # Make and associate the file object
            try:
                cf = ContentFile(r.content)
                extension = get_extension(r.content)
                if extension not in ['.mp3', '.wma']:
                    # Fall back to whatever follows the last dot in the URL.
                    extension = '.' + item['url'].rsplit('.', 1)[1]
                # See bitbucket issue #215 for why this must be
                # lower-cased.
                file_name = trunc(item['case_name'].lower(), 75) + extension
                audio_file.local_path_original_file.save(file_name, cf,
                                                         save=False)
            except Exception:
                msg = 'Unable to save binary to disk. Deleted document: % s.\n % s' % \
                      (item['case_name'], traceback.format_exc())
                logger.critical(msg)
                # Without the file on disk there is nothing to process, so
                # do not save the docket or the audio row.
                continue

            docket.save()
            audio_file.docket = docket
            audio_file.save(index=False)

            random_delay = random.randint(0, 3600)
            process_audio_file.apply_async((audio_file.pk, ),
                                           countdown=random_delay)

            logger.info("Successfully added audio file %s: %s" %
                        (audio_file.pk, audio_file.case_name))
        finally:
            # Runs on every path out of the iteration (success, skip via
            # ``continue``, or an unexpected exception).
            queue.task_done()
def download_and_save():
    """This function is run in many threads simultaneously. Each thread runs
    so long as there are items in the queue. Once an item is found, it's
    downloaded and saved.

    The number of items that can be concurrently saved is determined by the
    number of threads that are running this function.

    Every item taken from the queue is marked done exactly once, whether it
    was saved, skipped as a duplicate, or failed, so the number of
    ``task_done()`` calls always matches the number of items fetched. An
    item that cannot be downloaded or written to disk is logged and skipped;
    no ``Docket`` or ``Audio`` row is saved for it.
    """
    while True:
        item = queue.get()
        try:
            logger.info("Attempting to add item at: %s" % item['url'])
            try:
                msg, r = get_binary_content(
                    item['url'],
                    {},
                )
            except Exception:
                logger.info("Unable to get item at: %s" % item['url'])
                continue
            if msg:
                # A truthy message is treated as a failed download, so
                # the response must not be used.
                logger.warning(msg)
                continue

            sha1_hash = hashlib.sha1(r.content).hexdigest()
            if Audio.objects.filter(sha1=sha1_hash).exists():
                # Simpsons did it! Try the next one.
                logger.info("Item already exists, moving to next item.")
                continue

            # New item, onwards!
            logger.info('Adding new document found at: %s' % item['url'])
            audio_file = Audio(
                source='H',
                sha1=sha1_hash,
                case_name=item['case_name'],
                date_argued=item['date_argued'],
                download_url=item['url'],
                processing_complete=False,
            )
            if item['judges']:
                audio_file.judges = item['judges']
            if item['docket_number']:
                audio_file.docket_number = item['docket_number']

            court = Court.objects.get(pk=item['court_code'])
            docket = Docket(
                case_name=item['case_name'],
                court=court,
            )

            # Make and associate the file object
            try:
                cf = ContentFile(r.content)
                extension = get_extension(r.content)
                if extension not in ['.mp3', '.wma']:
                    # Fall back to whatever follows the last dot in the URL.
                    extension = '.' + item['url'].rsplit('.', 1)[1]
                # See bitbucket issue #215 for why this must be
                # lower-cased.
                file_name = trunc(item['case_name'].lower(), 75) + extension
                audio_file.local_path_original_file.save(file_name, cf,
                                                         save=False)
            except Exception:
                msg = 'Unable to save binary to disk. Deleted document: % s.\n % s' % \
                      (item['case_name'], traceback.format_exc())
                logger.critical(msg)
                # Without the file on disk there is nothing to process, so
                # do not save the docket or the audio row.
                continue

            docket.save()
            audio_file.docket = docket
            audio_file.save(index=False)

            random_delay = random.randint(0, 3600)
            process_audio_file.apply_async(
                (audio_file.pk,),
                countdown=random_delay
            )

            logger.info("Successfully added audio file %s: %s" % (
                audio_file.pk, audio_file.case_name))
        finally:
            # Runs on every path out of the iteration (success, skip via
            # ``continue``, or an unexpected exception).
            queue.task_done()