def handle(self, *args, **options):
    """
    Make it happen.
    """
    super(Command, self).handle(*args, **options)

    self.clean_zip_path = os.path.join(
        get_download_directory(),
        'calaccess_cleaned.zip'
    )
    self.data_dir = get_download_directory()

    if os.path.exists(self.data_dir):
        shutil.rmtree(self.data_dir)
    os.makedirs(self.data_dir)

    versions = RawDataVersion.objects.filter(clean_zip_archive='')

    if versions:
        for version in versions:
            logger.debug(
                'Creating zip file for {:%Y-%m-%d_%H-%M-%S} version'.format(
                    version.release_datetime
                )
            )
            self.download_clean_files(version)
            self.create_zip_file(version)
            self.archive_zip_file(version)
def handle(self, *args, **options):
    # Parse command-line options
    self.verbosity = int(options['verbosity'])
    self.max_lines_per_load = int(options['max_lines_per_load'])

    if options['agencies'] is None:
        self.agencies = []
    else:
        self.agencies = options['agencies'].split(',')

    if options['years'] is None:
        self.years = []
    else:
        self.years = options['years'].split(',')

    self.force = options['force']

    # Compute properties
    self.data_dir = os.path.join(get_download_directory(), 'csv')
    self.combined_csv_path = os.path.join(
        self.data_dir, 'netfile_cal201_transaction.csv')
    self.connect2 = Connect2API()

    # Run the thing!
    if not options['skip_download']:
        self.download()
    if not options['skip_combine']:
        self.combine()
    if not options['skip_load']:
        self.cursor = connection.cursor()
        self.load()
def set_options(self, *args, **kwargs):
    self.url = 'http://campaignfinance.cdn.sos.ca.gov/dbwebexport.zip'
    if kwargs['test_data']:
        self.data_dir = get_test_download_directory()
        settings.CALACCESS_DOWNLOAD_DIR = self.data_dir
    else:
        self.data_dir = get_download_directory()
    os.path.exists(self.data_dir) or os.mkdir(self.data_dir)
    self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")
    self.csv_dir = os.path.join(self.data_dir, "csv/")
    os.path.exists(self.csv_dir) or os.mkdir(self.csv_dir)
    if kwargs['download']:
        self.download_metadata = self.get_download_metadata()
        self.local_metadata = self.get_local_metadata()
        prompt_context = dict(
            last_updated=self.download_metadata['last-modified'],
            time_ago=naturaltime(self.download_metadata['last-modified']),
            size=size(self.download_metadata['content-length']),
            last_download=self.local_metadata['last-download'],
            download_dir=self.data_dir,
        )
        self.prompt = render_to_string(
            'calaccess_raw/downloadcalaccessrawdata.txt',
            prompt_context,
        )
    self.verbosity = int(kwargs['verbosity'])
def set_options(self, *args, **kwargs):
    self.url = 'http://campaignfinance.cdn.sos.ca.gov/dbwebexport.zip'
    self.verbosity = int(kwargs['verbosity'])
    if kwargs['test_data']:
        self.data_dir = get_test_download_directory()
        settings.CALACCESS_DOWNLOAD_DIR = self.data_dir
        if self.verbosity:
            self.log("Using test data")
    else:
        self.data_dir = get_download_directory()
    os.path.exists(self.data_dir) or os.makedirs(self.data_dir)
    self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")
    self.csv_dir = os.path.join(self.data_dir, "csv/")
    os.path.exists(self.csv_dir) or os.makedirs(self.csv_dir)
    if kwargs['download']:
        self.download_metadata = self.get_download_metadata()
        self.local_metadata = self.get_local_metadata()
        prompt_context = dict(
            last_updated=self.download_metadata['last-modified'],
            time_ago=naturaltime(self.download_metadata['last-modified']),
            size=size(self.download_metadata['content-length']),
            last_download=self.local_metadata['last-download'],
            download_dir=self.data_dir,
        )
        self.prompt = render_to_string(
            'calaccess_raw/downloadcalaccessrawdata.txt',
            prompt_context,
        )
def handle(self, *args, **options):
    """
    Sets options common to all commands.

    Any command subclassing this object should implement its own handle
    method, as is standard in Django, and run this method via a super call
    to inherit its functionality.
    """
    # Set global options
    self.verbosity = options.get("verbosity")
    self.no_color = options.get("no_color")

    # Start the clock
    self.start_datetime = timezone.now()

    # set up processed data directory
    self.data_dir = get_download_directory()
    self.processed_data_dir = os.path.join(
        self.data_dir,
        'processed',
    )
    if not os.path.exists(self.processed_data_dir):
        # make the processed data directory
        os.makedirs(self.processed_data_dir)
        # set permissions to allow other users to write and execute
        os.chmod(self.processed_data_dir, 0o703)
def set_options(self, *args, **kwargs):
    self.url = 'http://campaignfinance.cdn.sos.ca.gov/dbwebexport.zip'
    self.data_dir = get_download_directory()
    os.path.exists(self.data_dir) or os.mkdir(self.data_dir)
    self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")
    self.csv_dir = os.path.join(self.data_dir, "csv/")
    os.path.exists(self.csv_dir) or os.mkdir(self.csv_dir)
    if kwargs['download']:
        self.download_metadata = self.get_download_metadata()
        self.local_metadata = self.get_local_metadata()
        prompt_context = dict(
            last_updated=self.download_metadata['last-modified'],
            time_ago=naturaltime(self.download_metadata['last-modified']),
            size=size(self.download_metadata['content-length']),
            last_download=self.local_metadata['last-download'],
            download_dir=self.data_dir,
        )
        self.prompt = render_to_string(
            'calaccess_raw/downloadcalaccessrawdata.txt',
            prompt_context,
        )
        self.pbar = progressbar.ProgressBar(
            widgets=[
                progressbar.Percentage(),
                progressbar.Bar(),
                ' ',
                progressbar.ETA(),
                ' ',
                progressbar.FileTransferSpeed()
            ],
            maxval=self.download_metadata['content-length']
        )
    self.verbosity = int(kwargs['verbosity'])
def set_config(self, *args, **options):
    self.data_dir = get_download_directory()
    self.test_data_dir = get_test_download_directory()
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")
    self.sample_dir = os.path.join(self.test_data_dir, "tsv/")
    self.sample_rows = int(options['samplerows'])
    self.tsv_list = os.listdir(self.tsv_dir)
    self.verbosity = int(options['verbosity'])
def handle_label(self, label, **options):
    # Set options
    self.verbosity = options.get("verbosity")
    self.data_dir = get_download_directory()
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")
    self.csv_dir = os.path.join(self.data_dir, "csv/")
    # Do it
    self.clean(label)
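A hedged usage sketch: the handle_label() above appears to belong to the cleancalaccessrawfile command referenced in other snippets on this page, so one way to drive it for a single raw file is Django's call_command. The TSV label used here is only an illustrative example.

from django.core.management import call_command

# Clean one raw CAL-ACCESS file by label; the label is hypothetical.
call_command('cleancalaccessrawfile', 'CVR_SO_CD.TSV', verbosity=2)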
def get_download_directory():
    """
    Returns the download directory where we will store downloaded data.
    """
    if hasattr(settings, 'NETFILE_DOWNLOAD_DIR'):
        return getattr(settings, 'NETFILE_DOWNLOAD_DIR')
    else:
        return calaccess_raw.get_download_directory()
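The helper above prefers a project-level setting and only falls back to the calaccess_raw default. A minimal settings sketch showing the override it looks for (the paths are assumptions, not part of the original project):

# settings.py (hypothetical project)
import os

BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# When this is present, get_download_directory() returns it; when absent,
# it falls back to calaccess_raw.get_download_directory().
NETFILE_DOWNLOAD_DIR = os.path.join(BASE_DIR, 'data', 'netfile')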
def handle(self, *args, **options):
    self.header("Loading summary totals")
    self.data_dir = get_download_directory()
    self.source_csv = os.path.join(self.data_dir, 'csv', 'smry_cd.csv')
    self.target_csv = os.path.join(self.data_dir, 'csv', 'smry_cd_transformed.csv')
    self.transform_csv()
    self.load_csv()
def handle_label(self, label, **options):
    # Set options
    self.verbosity = int(options.get("verbosity"))
    self.data_dir = get_download_directory()
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")
    self.csv_dir = os.path.join(self.data_dir, "csv/")
    self.log_dir = os.path.join(self.data_dir, "log/")
    # Do it
    self.clean(label)
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    # Set options
    self.data_dir = get_download_directory()
    self.test_data_dir = get_test_download_directory()
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")
    self.sample_dir = os.path.join(self.test_data_dir, "tsv/")
    self.sample_rows = int(options['samplerows'])
    self.tsv_list = os.listdir(self.tsv_dir)
    self.verbosity = int(options['verbosity'])

    self.header("Sampling %i rows from %s source files" % (
        self.sample_rows,
        len(self.tsv_list),
    ))

    # Make sure sample dir exists and is empty
    os.path.exists(self.test_data_dir) or os.makedirs(self.test_data_dir)
    os.path.exists(self.sample_dir) and shutil.rmtree(self.sample_dir)
    os.makedirs(self.sample_dir)

    # Loop through all the files in the source directory
    for name in progress.bar(self.tsv_list):
        # Find the input
        file = os.path.join(self.tsv_dir, name)
        out_file = os.path.join(self.sample_dir, name)
        if self.verbosity > 2:
            self.log(" Sampling %s" % file)

        # Open the file
        fi = FileInput(file, True)

        # Generate our sample
        sample = two_pass_sample(fi, sample_size=self.sample_rows)

        # Open our output file
        with open(out_file, 'wb') as out:
            # Write it out
            for line in chain(fi.header, sample):
                out.write(line)

    self.header("Compressing zip file...")
    self.save_zip()

    # Stash the release_datetime and size of the last completed download
    version = self.command_logs.filter(
        command='downloadcalaccessrawdata',
        finish_datetime__isnull=False
    ).order_by('-start_datetime')[0].version

    with open(self.test_data_dir + '/sampled_version.txt', 'w') as f:
        f.write(str(version.release_datetime) + '\n')
        f.write(str(version.size))
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    # set / compute any attributes that multiple class methods need
    self.app_name = options["app_name"]
    self.database = options["database"]
    self.keep_files = options["keep_files"]

    if options['test_data']:
        # if using test data, we don't need to download
        options["download"] = False
        # and always keep files when running test data
        self.keep_files = True

    if options['test_data']:
        self.data_dir = get_test_download_directory()
        # need to set this app-wide because cleancalaccessrawfile
        # also calls get_download_directory
        settings.CALACCESS_DOWNLOAD_DIR = self.data_dir
    else:
        self.data_dir = get_download_directory()

    os.path.exists(self.data_dir) or os.makedirs(self.data_dir)
    self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
    self.zip_metadata_path = os.path.join(self.data_dir, '.lastdownload')
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")

    # Immediately check that the tsv directory exists when using test data,
    # so we can stop immediately.
    if options['test_data']:
        if not os.path.exists(self.tsv_dir):
            raise CommandError("Data tsv directory does not exist "
                               "at %s" % self.tsv_dir)
        elif self.verbosity:
            self.log("Using test data")

    self.csv_dir = os.path.join(self.data_dir, "csv/")
    os.path.exists(self.csv_dir) or os.makedirs(self.csv_dir)

    if options['download']:
        call_command(
            "downloadcalaccessrawdata",
            keep_files=self.keep_files,
            verbosity=self.verbosity,
            resume=options['resume'],
            noinput=options['noinput']
        )

    # execute the other steps that haven't been skipped
    if options['clean']:
        self.clean()
    if options['load']:
        self.load()
    if self.verbosity:
        self.success("Done!")
def set_options(self, *args, **kwargs):
    self.data_dir = os.path.join(get_download_directory(), 'csv')
    # Make sure directory exists
    os.path.exists(self.data_dir) or os.mkdir(self.data_dir)
    self.cursor = connection.cursor()
    # Quarterlies stuff
    self.quarterly_tmp_csv = tempfile.NamedTemporaryFile().name
    self.quarterly_target_csv = os.path.join(self.data_dir, 'expn_cd_transformed.csv')
def handle(self, *args, **options):
    self.header("Loading summary totals")
    self.data_dir = get_download_directory()
    self.source_csv = os.path.join(self.data_dir, 'csv', 'smry_cd.csv')
    self.target_csv = os.path.join(
        self.data_dir,
        'csv',
        'smry_cd_transformed.csv'
    )
    self.transform_csv()
    self.load_csv()
def set_options(self, *args, **kwargs):
    self.data_dir = os.path.join(get_download_directory(), 'csv')
    self.cursor = connection.cursor()
    # Quarterlies stuff
    self.quarterly_tmp_csv = tempfile.NamedTemporaryFile().name
    self.quarterly_target_csv = os.path.join(self.data_dir, 'rcpt_cd_transformed.csv')
    # Late filings stuff
    self.late_tmp_csv = tempfile.NamedTemporaryFile().name
    self.late_target_csv = os.path.join(self.data_dir, 's497_cd_transformed.csv')
    self.late_tmp_table = "TMP_%s" % S497Cd._meta.db_table
def handle(self, *args, **options):
    self.verbosity = int(options['verbosity'])
    self.max_lines_per_load = int(options.get('max_lines_per_load', 1000))
    self.data_dir = os.path.join(get_download_directory(), 'csv')
    self.zip_path = os.path.join(self.data_dir, 'zipcode_metro.zip')
    if not options['skip_download']:
        self.download()
    if not options['skip_load']:
        self.cursor = connection.cursor()
        self.load()
def handle(self, *args, **options):
    self.csv = None
    self.database = options['database']
    self.verbosity = int(options['verbosity'])
    self.data_dir = os.path.join(get_download_directory(), 'csv')
    self.zip_path = os.path.join(self.data_dir, 'zipcode_metro.zip')
    if not options['skip_download']:
        self.download()
    if not options['skip_load']:
        self.load()
def set_options(self, *args, **kwargs):
    self.data_dir = os.path.join(get_download_directory(), 'csv')
    # Make sure directory exists
    os.path.exists(self.data_dir) or os.mkdir(self.data_dir)
    self.cursor = connection.cursor()
    # Quarterlies stuff
    self.quarterly_tmp_csv = tempfile.NamedTemporaryFile().name
    self.quarterly_target_csv = os.path.join(
        self.data_dir,
        'expn_cd_transformed.csv'
    )
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    # Set options
    self.file_name = options['file_name']
    self.data_dir = get_download_directory()
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")
    self.csv_dir = os.path.join(self.data_dir, "csv/")
    self.log_dir = os.path.join(self.data_dir, "log/")

    if self.verbosity > 2:
        self.log(" Cleaning %s" % self.file_name)

    caller = self.get_caller()

    if caller:
        # if called by another command, use its version record
        self.version = caller.version
        self.log_record = self.command_logs.create(
            version=self.version,
            command=self,
            called_by=caller,
            file_name=self.file_name.upper().replace('.TSV', '')
        )
    else:
        # try getting the most recent version
        try:
            self.version = self.raw_data_versions.latest('release_datetime')
        except RawDataVersion.DoesNotExist:
            # if there's no version, assume this is a test and do not log
            # TODO: Figure out a more direct way to handle this
            self.version = None
        else:
            # no caller, so log against the most recent version
            self.log_record = self.command_logs.create(
                version=self.version,
                command=self,
                file_name=self.file_name.upper().replace('.TSV', '')
            )

    self.clean(options['file_name'])

    # unless keeping files, remove tsv files
    if not options['keep_files']:
        os.remove(os.path.join(self.tsv_dir, options['file_name']))

    if self.version:
        # save the log record
        self.log_record.finish_datetime = datetime.now()
        self.log_record.save()
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    # Set options
    self.file_name = options['file_name']
    self.data_dir = get_download_directory()
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")
    self.csv_dir = os.path.join(self.data_dir, "csv/")
    self.log_dir = os.path.join(self.data_dir, "log/")

    if self.verbosity > 2:
        self.log(" Cleaning %s" % self.file_name)

    caller = self.get_caller_log()

    if caller:
        # if called by another command, use its version record
        self.version = caller.version
        self.log_record = self.command_logs.create(
            version=self.version,
            command=self,
            called_by=caller,
            file_name=self.file_name.upper().replace('.TSV', '')
        )
    else:
        # try getting the most recent version
        try:
            self.version = self.raw_data_versions.latest('release_datetime')
        except RawDataVersion.DoesNotExist:
            # if there's no version, assume this is a test and do not log
            # TODO: Figure out a more direct way to handle this
            self.version = None
        else:
            # no caller, so log against the most recent version
            self.log_record = self.command_logs.create(
                version=self.version,
                command=self,
                file_name=self.file_name.upper().replace('.TSV', '')
            )

    self.clean(options['file_name'])

    # unless keeping files, remove tsv files
    if not options['keep_files']:
        os.remove(os.path.join(self.tsv_dir, options['file_name']))

    if self.version:
        # save the log record
        self.log_record.finish_datetime = datetime.now()
        self.log_record.save()
def set_options(self, *args, **kwargs):
    self.data_dir = os.path.join(get_download_directory(), 'csv')
    self.cursor = connection.cursor()
    # Quarterlies stuff
    self.quarterly_tmp_csv = tempfile.NamedTemporaryFile().name
    self.quarterly_target_csv = os.path.join(
        self.data_dir,
        'rcpt_cd_transformed.csv'
    )
    # Late filings stuff
    self.late_tmp_csv = tempfile.NamedTemporaryFile().name
    self.late_target_csv = os.path.join(
        self.data_dir,
        's497_cd_transformed.csv'
    )
    self.late_tmp_table = "TMP_%s" % S497Cd._meta.db_table
def handle(self, *args, **options):
    self.verbosity = int(options['verbosity'])
    self.max_lines_per_load = int(options.get('max_lines_per_load', 1000))
    self.data_dir = os.path.join(get_download_directory(), 'csv')
    self.combined_csv_path = os.path.join(
        self.data_dir, 'netfile_cal201_transaction.csv')
    self.connect2 = Connect2API()
    if not options['skip_download']:
        self.download()
    if not options['skip_combine']:
        self.combine()
    if not options['skip_load']:
        self.cursor = connection.cursor()
        self.load()
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    # get the dir where data goes from app settings
    self.data_dir = get_download_directory()
    # if data_dir doesn't exist, create it
    os.path.exists(self.data_dir) or os.makedirs(self.data_dir)
    # downloaded zipfile will go in data_dir
    self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
    # so will the file where we track the last download
    self.zip_metadata_path = os.path.join(
        self.data_dir,
        '.lastdownload'
    )
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")
    self.csv_dir = os.path.join(self.data_dir, "csv/")
    os.path.exists(self.csv_dir) or os.makedirs(self.csv_dir)

    self.download_metadata = self.get_download_metadata()
    self.local_metadata = self.get_local_metadata()

    total_size = self.download_metadata['content-length']
    last_modified = self.download_metadata['last-modified']
    last_download = self.local_metadata['last-download']
    cur_size = 0

    # if the user tries to resume, also have to make sure there is a zip file
    self.resume_download = (options['resume'] and os.path.exists(self.zip_path))

    if self.resume_download:
        # Make sure the downloaded chunk is newer than the
        # last update to the remote data.
        timestamp = os.path.getmtime(self.zip_path)
        chunk_datetime = datetime.fromtimestamp(timestamp, utc)
        self.resume_download = chunk_datetime > last_modified

        # reset these vars if still resuming
        if self.resume_download:
            last_download = chunk_datetime
            cur_size = os.path.getsize(self.zip_path)

    # setting up the prompt
    prompt_context = dict(
        resuming=self.resume_download,
        already_downloaded=last_modified == last_download,
        last_modified=last_modified,
        last_download=last_download,
        time_ago=naturaltime(last_download),
        total_size=size(total_size),
        cur_size=size(cur_size),
        download_dir=self.data_dir,
    )
    self.prompt = render_to_string(
        'calaccess_raw/downloadcalaccessrawdata.txt',
        prompt_context,
    )

    # If we're taking user input, make sure the user says exactly 'yes'
    if not options['noinput'] and self.confirm_download() != 'yes':
        self.failure("Download cancelled")
        return

    self.download()
    self.unzip()
    if not options['keep_files']:
        os.remove(self.zip_path)
    self.prep()
    if not options['keep_files']:
        shutil.rmtree(os.path.join(self.data_dir, 'CalAccess'))
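The download commands above support resuming a partially fetched calaccess.zip. As a standalone illustration of that idea (not the project's own implementation), a partial file's size can be turned into an HTTP Range request; the local path and chunk size are arbitrary choices, and the sketch assumes the server honors Range headers.

import os
import requests

url = 'http://campaignfinance.cdn.sos.ca.gov/dbwebexport.zip'
zip_path = '/tmp/calaccess.zip'  # hypothetical local path

headers = {}
if os.path.exists(zip_path):
    # Ask only for the bytes we don't have yet.
    headers['Range'] = 'bytes=%d-' % os.path.getsize(zip_path)

with requests.get(url, headers=headers, stream=True) as resp:
    resp.raise_for_status()
    mode = 'ab' if 'Range' in headers else 'wb'
    with open(zip_path, mode) as fp:
        for chunk in resp.iter_content(chunk_size=1024 * 1024):
            fp.write(chunk)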
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    # set / compute any attributes that multiple class methods need
    self.app_name = options["app_name"]
    self.keep_files = options["keep_files"]

    if options['test_data']:
        # if using test data, we don't need to download
        options['download'] = False
        # and always keep files when running test data
        self.keep_files = True

    if options['test_data']:
        self.data_dir = get_test_download_directory()
        # need to set this app-wide because cleancalaccessrawfile
        # also calls get_download_directory
        settings.CALACCESS_DOWNLOAD_DIR = self.data_dir
    else:
        self.data_dir = get_download_directory()

    os.path.exists(self.data_dir) or os.makedirs(self.data_dir)
    self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")

    # Immediately check that the tsv directory exists when using test data,
    # so we can stop immediately.
    if options['test_data']:
        if not os.path.exists(self.tsv_dir):
            raise CommandError("Data tsv directory does not exist "
                               "at %s" % self.tsv_dir)
        elif self.verbosity:
            self.log("Using test data")

    self.csv_dir = os.path.join(self.data_dir, "csv/")
    os.path.exists(self.csv_dir) or os.makedirs(self.csv_dir)

    download_metadata = self.get_download_metadata()
    self.current_release_datetime = download_metadata['last-modified']
    self.last_update = self.get_last_log()
    self.resume_download = self.check_can_resume_download()
    self.log_record = None

    # if this isn't a test
    if not options['test_data']:
        # and there's a previous update
        if self.last_update:
            # which did not finish
            if not self.last_update.finish_datetime:
                # and either can resume download or skipping it altogether
                if self.resume_download or not options['download']:
                    # can resume
                    self.log_record = self.last_update

        # if not testing, but can't resume
        if not self.log_record:
            # get or create a version
            # .get_or_create() throws IntegrityError
            try:
                version = self.raw_data_versions.get(
                    release_datetime=self.current_release_datetime
                )
            except RawDataVersion.DoesNotExist:
                version = self.raw_data_versions.create(
                    release_datetime=self.current_release_datetime,
                    size=download_metadata['content-length']
                )
            # create a new log record
            self.log_record = self.command_logs.create(
                version=version,
                command=self,
                called_by=self.get_caller()
            )

    if options['download']:
        call_command(
            "downloadcalaccessrawdata",
            keep_files=self.keep_files,
            verbosity=self.verbosity,
            resume=self.resume_download,
            noinput=options['noinput'],
        )
        if self.verbosity:
            self.duration()

    # execute the other steps that haven't been skipped
    if options['clean']:
        self.clean()
        if self.verbosity:
            self.duration()

    if options['load']:
        self.load()
        if self.verbosity:
            self.duration()

    if self.verbosity:
        self.success("Done!")

    if not options['test_data']:
        self.log_record.finish_datetime = datetime.now()
        self.log_record.save()
def get_tsv_path(self):
    return os.path.join(
        get_download_directory(),
        'tsv',
        self.get_tsv_name()
    )
def get_tsv_path(self):
    return os.path.join(get_download_directory(), 'tsv', self.get_tsv_name())
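A small illustrative sketch (not from the library) of how a helper like get_tsv_path() can be used once a download has run: open the model's raw TSV file and count its data rows. The function name is hypothetical; it relies only on the method shown above and the standard library.

import csv

def count_tsv_rows(model_instance):
    # get_tsv_path() joins the download dir, 'tsv', and the name returned
    # by get_tsv_name(), as in the two methods above.
    with open(model_instance.get_tsv_path(), 'r') as fp:
        reader = csv.reader(fp, delimiter='\t')
        next(reader, None)  # skip the header row
        return sum(1 for _ in reader)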
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    # set / compute any attributes that multiple class methods need
    self.app_name = options["app_name"]
    self.keep_files = options["keep_files"]
    self.test_mode = options['test_data']
    self.downloading = options['download']
    self.cleaning = options['clean']
    self.loading = options['load']

    if self.test_mode:
        # if using test data, we don't need to download
        self.downloading = False
        # and always keep files when running test data
        self.keep_files = True
        self.data_dir = get_test_download_directory()
        # need to set this app-wide because cleancalaccessrawfile
        # also calls get_download_directory
        settings.CALACCESS_DOWNLOAD_DIR = self.data_dir
    else:
        self.data_dir = get_download_directory()

    os.path.exists(self.data_dir) or os.makedirs(self.data_dir)
    self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")

    # Immediately check that the tsv directory exists when using test data,
    # so we can stop immediately.
    if self.test_mode:
        if not os.path.exists(self.tsv_dir):
            raise CommandError("Data tsv directory does not exist "
                               "at %s" % self.tsv_dir)
        elif self.verbosity:
            self.log("Using test data")

    self.csv_dir = os.path.join(self.data_dir, "csv/")
    os.path.exists(self.csv_dir) or os.makedirs(self.csv_dir)

    download_metadata = self.get_download_metadata()
    current_release_datetime = download_metadata['last-modified']

    last_started_update = self.get_last_log()

    try:
        last_download = self.command_logs.filter(
            command='downloadcalaccessrawdata').order_by(
            '-start_datetime')[0]
    except IndexError:
        last_download = None

    up_to_date = False
    can_resume = False

    # if there's a previously started update
    if last_started_update:
        # if current release datetime matches version of last started update
        if current_release_datetime == last_started_update.version.release_datetime:
            # if the last update finished
            if last_started_update.finish_datetime:
                up_to_date = True
            else:
                # if the last update didn't finish
                # (but is still for the current version)
                can_resume = True
        # if the last started update didn't finish
        elif not last_started_update.finish_datetime:
            # can resume update of old version as long as skipping download
            if not self.downloading:
                can_resume = True
            # or if there is a last download
            elif last_download:
                # and last download's version matches the outstanding update version
                if last_download.version == last_started_update.version:
                    # and last download completed
                    if last_download.finish_datetime:
                        can_resume = True

    if options['noinput']:
        # if not taking input and can resume, automatically go into resume mode
        self.resume_mode = can_resume
    else:
        prompt_context = dict(
            current_release_datetime=current_release_datetime,
            expected_size=size(download_metadata['content-length']),
            up_to_date=up_to_date,
            can_resume=can_resume,
        )

        last_finished_update = self.get_last_log(finished=True)

        if last_finished_update:
            loaded_v = last_finished_update.version
            prompt_context['since_loaded_version'] = naturaltime(
                loaded_v.release_datetime)
        else:
            prompt_context['since_loaded_version'] = None

        prompt = render_to_string(
            'calaccess_raw/updatecalaccessrawdata.txt',
            prompt_context,
        )

        if can_resume:
            if self.confirm_proceed(prompt):
                self.resume_mode = True
            else:
                self.resume_mode = False
                if not self.confirm_proceed(
                        'Do you want to re-start your update?\n'):
                    raise CommandError("Update cancelled")
        else:
            self.resume_mode = False
            if not self.confirm_proceed(prompt):
                raise CommandError("Update cancelled")

    if not self.test_mode:
        if self.resume_mode:
            self.log_record = last_started_update
        else:
            # get or create a version
            # .get_or_create() throws IntegrityError
            try:
                version = self.raw_data_versions.get(
                    release_datetime=current_release_datetime)
            except RawDataVersion.DoesNotExist:
                version = self.raw_data_versions.create(
                    release_datetime=current_release_datetime,
                    size=download_metadata['content-length'])
            # create a new log record
            self.log_record = self.command_logs.create(
                version=version,
                command=self,
                called_by=self.get_caller_log())

    # if the user could have resumed but didn't
    force_restart_download = can_resume and not self.resume_mode

    # if not skipping download, and there's a previous download
    if self.downloading and last_download:
        # if not forcing a restart
        if not force_restart_download:
            # check if version we are updating is last one being downloaded
            if self.log_record.version == last_download.version:
                # if it finished
                if last_download.finish_datetime:
                    self.log('Already downloaded.')
                    self.downloading = False

    if self.downloading:
        call_command(
            "downloadcalaccessrawdata",
            keep_files=self.keep_files,
            verbosity=self.verbosity,
            noinput=True,
            restart=force_restart_download,
        )
        if self.verbosity:
            self.duration()

    # execute the other steps that haven't been skipped
    if options['clean']:
        self.clean()
        if self.verbosity:
            self.duration()

    if options['load']:
        self.load()
        if self.verbosity:
            self.duration()

    if self.verbosity:
        self.success("Done!")

    if not self.test_mode:
        self.log_record.finish_datetime = datetime.now()
        self.log_record.save()
def set_options(self, *args, **kwargs):
    self.url = 'http://campaignfinance.cdn.sos.ca.gov/dbwebexport.zip'
    self.verbosity = int(kwargs['verbosity'])
    self.database = kwargs['database']

    if kwargs['test_data']:
        self.data_dir = get_test_download_directory()
        settings.CALACCESS_DOWNLOAD_DIR = self.data_dir
    else:
        self.data_dir = get_download_directory()

    os.path.exists(self.data_dir) or os.makedirs(self.data_dir)
    self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
    self.zip_metadata_path = os.path.join(self.data_dir, '.lastdownload')
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")

    # Immediately check that the tsv directory exists when using test data,
    # so we can stop immediately.
    if kwargs['test_data']:
        if not os.path.exists(self.tsv_dir):
            raise CommandError("Data tsv directory does not exist "
                               "at %s" % self.tsv_dir)
        elif self.verbosity:
            self.log("Using test data")

    self.csv_dir = os.path.join(self.data_dir, "csv/")
    os.path.exists(self.csv_dir) or os.makedirs(self.csv_dir)

    if kwargs['download']:
        self.download_metadata = self.get_download_metadata()
        self.local_metadata = self.get_local_metadata()

        total_size = self.download_metadata['content-length']
        last_modified = self.download_metadata['last-modified']
        last_download = self.local_metadata['last-download']
        cur_size = 0

        self.resume_download = (kwargs['resume-download'] and
                                os.path.exists(self.zip_path))

        if self.resume_download:
            # Make sure the downloaded chunk is newer than the
            # last update to the remote data.
            timestamp = os.path.getmtime(self.zip_path)
            chunk_datetime = datetime.fromtimestamp(timestamp, utc)
            self.resume_download = chunk_datetime > last_modified

            if self.resume_download:
                last_download = chunk_datetime
                cur_size = os.path.getsize(self.zip_path)

        prompt_context = dict(
            resuming=self.resume_download,
            already_downloaded=last_modified == last_download,
            last_modified=last_modified,
            last_download=last_download,
            time_ago=naturaltime(last_download),
            total_size=size(total_size),
            cur_size=size(cur_size),
            download_dir=self.data_dir,
        )
        self.prompt = render_to_string(
            'calaccess_raw/downloadcalaccessrawdata.txt',
            prompt_context,
        )
def handle(self, *args, **options):
    """
    Make it happen.
    """
    super(Command, self).handle(*args, **options)

    # get the most recently loaded raw data version
    try:
        self.raw_version = RawDataVersion.objects.complete()[0]
    except IndexError:
        raise CommandError(
            'No raw CAL-ACCESS data loaded (run `python manage.py '
            'updatecalaccessrawdata`).'
        )

    # set up processed data directory
    self.processed_data_dir = os.path.join(
        get_download_directory(),
        'processed',
    )
    if not os.path.exists(self.processed_data_dir):
        os.makedirs(self.processed_data_dir)

    # get or create the ProcessedDataVersion instance
    self.processed_version, created = ProcessedDataVersion.objects.get_or_create(
        raw_version=self.raw_version,
    )

    # log if starting or resuming
    if created:
        self.header(
            'Processing {:%m-%d-%Y %H:%M:%S} snapshot'.format(
                self.raw_version.release_datetime
            )
        )
    else:
        self.header(
            'Resuming processing of {:%m-%d-%Y %H:%M:%S} snapshot'.format(
                self.raw_version.release_datetime
            )
        )

    # if there isn't already a process start datetime, set it
    if not self.processed_version.process_start_datetime:
        self.processed_version.process_start_datetime = now()
        self.processed_version.save()

    # get all of the models
    self.processed_models = get_models_to_process()

    # iterate over all of the processed models
    for m in self.processed_models:
        # set up the ProcessedDataFile instance
        processed_file, created = ProcessedDataFile.objects.get_or_create(
            version=self.processed_version,
            file_name=m._meta.model_name,
        )
        processed_file.process_start_datetime = now()
        processed_file.save()

        # flush the processed model
        if self.verbosity > 2:
            self.log(" Truncating %s" % m._meta.db_table)
        with connection.cursor() as c:
            c.execute('TRUNCATE TABLE "%s" CASCADE' % (m._meta.db_table))

        # load the processed model
        if self.verbosity > 2:
            self.log(" Loading raw data into %s" % m._meta.db_table)
        m.objects.load_raw_data()

        processed_file.records_count = m.objects.count()
        processed_file.process_finish_datetime = now()
        processed_file.save()

    self.processed_version.process_finish_datetime = now()
    self.processed_version.save()

    self.success("Done!")
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    # Set options
    self.file_name = options['file_name']
    self.data_dir = get_download_directory()
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")
    self.csv_dir = os.path.join(self.data_dir, "csv/")
    self.log_dir = os.path.join(self.data_dir, "log/")

    if self.verbosity > 2:
        self.log(" Cleaning %s" % self.file_name)

    # Up the CSV data limit
    csv.field_size_limit(1000000000)

    # Input and output paths
    tsv_path = os.path.join(self.tsv_dir, self.file_name)
    csv_path = os.path.join(
        self.csv_dir,
        self.file_name.lower().replace("tsv", "csv")
    )

    # Reader
    tsv_file = open(tsv_path, 'rb')

    # Writer
    csv_file = open(csv_path, 'w')
    csv_writer = CSVKitWriter(csv_file, quoting=csv.QUOTE_ALL)

    # Pull and clean the headers
    try:
        headers = tsv_file.readline()
    except StopIteration:
        return
    headers = headers.decode("ascii", "replace")
    headers_csv = CSVKitReader(StringIO(headers), delimiter=str('\t'))
    try:
        headers_list = next(headers_csv)
    except StopIteration:
        return
    headers_count = len(headers_list)
    csv_writer.writerow(headers_list)

    log_rows = []

    # Loop through the rest of the data
    line_number = 1
    for tsv_line in tsv_file:

        # Goofing around with the encoding while we're in there.
        tsv_line = tsv_line.decode("ascii", "replace")
        if six.PY2:
            tsv_line = tsv_line.replace('\ufffd', '?')

        # Nuke any null bytes
        null_bytes = tsv_line.count('\x00')
        if null_bytes:
            tsv_line = tsv_line.replace('\x00', ' ')

        # Nuke ASCII 26 char, the "substitute character"
        # or chr(26) in python
        sub_char = tsv_line.count('\x1a')
        if sub_char:
            tsv_line = tsv_line.replace('\x1a', '')

        # Split on tabs so we can later spit it back out as CSV
        # and remove extra newlines while we are there.
        csv_field_list = tsv_line.replace("\r\n", "").split("\t")

        # Check if our values line up with our headers
        # and if not, see if CSVkit can sort out the problems
        if not len(csv_field_list) == headers_count:
            csv_field_list = next(CSVKitReader(
                StringIO(tsv_line),
                delimiter=str('\t')
            ))
            if not len(csv_field_list) == headers_count:
                if self.verbosity > 2:
                    msg = ' Bad parse of line %s (%s headers, %s values)'
                    self.failure(msg % (
                        line_number,
                        len(headers_list),
                        len(csv_field_list)
                    ))
                log_rows.append([
                    line_number,
                    len(headers_list),
                    len(csv_field_list),
                    ','.join(csv_field_list)
                ])
                continue

        # Write out the row
        csv_writer.writerow(csv_field_list)

        line_number += 1

    # Log errors if there are any
    if log_rows:
        if self.verbosity > 1:
            msg = ' %s errors'
            self.failure(msg % (len(log_rows) - 1))
        self.log_errors(log_rows)

    # Shut it down
    tsv_file.close()
    csv_file.close()

    # unless keeping files, remove tsv files
    if not options['keep_files']:
        os.remove(os.path.join(self.tsv_dir, options['file_name']))
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    # set / compute any attributes that multiple class methods need
    self.app_name = options["app_name"]
    self.keep_files = options["keep_files"]
    self.test_mode = options['test_data']
    self.downloading = options['download']
    self.cleaning = options['clean']
    self.loading = options['load']
    self.noinput = options['noinput']

    if self.test_mode:
        # and always keep files when running test data
        self.keep_files = True
        self.data_dir = get_test_download_directory()
        # need to set this app-wide because cleancalaccessrawfile
        # also calls get_download_directory
        settings.CALACCESS_DOWNLOAD_DIR = self.data_dir
        self.noinput = True
    else:
        self.data_dir = get_download_directory()

    os.path.exists(self.data_dir) or os.makedirs(self.data_dir)
    self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")

    # Immediately check that the tsv directory exists when using test data,
    # so we can stop immediately.
    if self.test_mode:
        if not os.path.exists(self.tsv_dir):
            raise CommandError("Data tsv directory does not exist "
                               "at %s" % self.tsv_dir)
        elif self.verbosity:
            self.log("Using test data")

    self.csv_dir = os.path.join(self.data_dir, "csv/")
    os.path.exists(self.csv_dir) or os.makedirs(self.csv_dir)

    if self.test_mode:
        with open(self.data_dir + "/sampled_version.txt", "r") as f:
            current_release_datetime = f.readline()
            expected_size = f.readline()
    else:
        download_metadata = self.get_download_metadata()
        current_release_datetime = download_metadata['last-modified']
        expected_size = download_metadata['content-length']

    last_started_update = self.get_last_log()

    if self.test_mode:
        last_download = None
    else:
        try:
            last_download = self.command_logs.filter(
                command='downloadcalaccessrawdata'
            ).order_by('-start_datetime')[0]
        except IndexError:
            last_download = None

    up_to_date = False
    can_resume = False

    # if there's a previously started update
    if last_started_update:
        # if current release datetime matches version of last started update
        if current_release_datetime == last_started_update.version.release_datetime:
            # if the last update finished
            if last_started_update.finish_datetime:
                up_to_date = True
            else:
                # if the last update didn't finish
                # (but is still for the current version)
                can_resume = True
        # if the last started update didn't finish
        elif not last_started_update.finish_datetime:
            # can resume update of old version as long as skipping download
            if not self.downloading:
                can_resume = True
            # or if there is a last download
            elif last_download:
                # and last download's version matches the outstanding update version
                if last_download.version == last_started_update.version:
                    # and last download completed
                    if last_download.finish_datetime:
                        can_resume = True

    if self.noinput:
        # if not taking input and can resume, automatically go into resume mode
        self.resume_mode = can_resume
    else:
        prompt_context = dict(
            current_release_datetime=current_release_datetime,
            expected_size=size(expected_size),
            up_to_date=up_to_date,
            can_resume=can_resume,
        )

        last_finished_update = self.get_last_log(finished=True)

        if last_finished_update:
            loaded_v = last_finished_update.version
            prompt_context['since_loaded_version'] = naturaltime(loaded_v.release_datetime)
        else:
            prompt_context['since_loaded_version'] = None

        prompt = render_to_string(
            'calaccess_raw/updatecalaccessrawdata.txt',
            prompt_context,
        )

        if can_resume:
            if self.confirm_proceed(prompt):
                self.resume_mode = True
            else:
                self.resume_mode = False
                if not self.confirm_proceed('Do you want to re-start your update?\n'):
                    raise CommandError("Update cancelled")
        else:
            self.resume_mode = False
            if not self.confirm_proceed(prompt):
                raise CommandError("Update cancelled")

    if self.resume_mode:
        self.log_record = last_started_update
    else:
        # get or create a version
        # .get_or_create() throws IntegrityError
        try:
            version = self.raw_data_versions.get(
                release_datetime=current_release_datetime
            )
        except RawDataVersion.DoesNotExist:
            version = self.raw_data_versions.create(
                release_datetime=current_release_datetime,
                size=expected_size
            )
        # create a new log record
        self.log_record = self.command_logs.create(
            version=version,
            command=self,
            called_by=self.get_caller_log()
        )

    # if the user could have resumed but didn't
    force_restart_download = can_resume and not self.resume_mode

    # if not skipping download, and there's a previous download
    if self.downloading and last_download:
        # if not forcing a restart
        if not force_restart_download:
            # check if version we are updating is last one being downloaded
            if self.log_record.version == last_download.version:
                # if it finished
                if last_download.finish_datetime:
                    self.log('Already downloaded.')
                    self.downloading = False

    if self.downloading:
        if self.test_mode:
            call_command(
                "downloadcalaccessrawdatatest",
                verbosity=self.verbosity,
            )
        else:
            call_command(
                "downloadcalaccessrawdata",
                keep_files=self.keep_files,
                verbosity=self.verbosity,
                noinput=True,
                restart=force_restart_download,
            )
        if self.verbosity:
            self.duration()

    # execute the other steps that haven't been skipped
    if options['clean']:
        self.clean()
        if self.verbosity:
            self.duration()

    if options['load']:
        self.load()
        if self.verbosity:
            self.duration()

    if self.verbosity:
        self.success("Done!")

    self.log_record.finish_datetime = now()
    self.log_record.save()
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    # get the dir where data goes from app settings
    self.data_dir = get_download_directory()
    # if data_dir doesn't exist, create it
    os.path.exists(self.data_dir) or os.makedirs(self.data_dir)
    # downloaded zip file will go in data_dir
    self.zip_path = os.path.join(self.data_dir, self.url.split('/')[-1])
    # raw tsv files go in same data_dir in tsv/
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")

    download_metadata = self.get_download_metadata()
    self.current_release_datetime = download_metadata['last-modified']
    self.current_release_size = download_metadata['content-length']

    self.last_started_download = self.get_last_log()
    self.last_finished_download = self.get_last_log(finished=True)

    if self.last_finished_download:
        last_release_datetime = self.last_finished_download.version.release_datetime
        since_prev_version = naturaltime(last_release_datetime)
    else:
        last_release_datetime = None
        since_prev_version = None

    if last_release_datetime == self.current_release_datetime:
        already_downloaded = True
    else:
        already_downloaded = False

    # can resume only if possible and not forcing re-start
    self.resume_download = self.check_can_resume() and not options['restart']

    if self.resume_download:
        # set current size to partially downloaded zip
        self.local_file_size = os.path.getsize(self.zip_path)
        # set the datetime of last download to last modified date
        # of zip file
        timestamp = os.path.getmtime(self.zip_path)
        self.local_file_datetime = datetime.fromtimestamp(timestamp, utc)
    else:
        self.local_file_size = 0
        self.local_file_datetime = None

    if not options['noinput'] and not options['restart']:
        # setting up the prompt
        prompt_context = dict(
            current_release_datetime=self.current_release_datetime,
            resuming=self.resume_download,
            already_downloaded=already_downloaded,
            expected_size=size(self.current_release_size),
            local_file_size=size(self.local_file_size),
            download_dir=self.data_dir,
            since_prev_version=since_prev_version,
            since_local_file_modified=naturaltime(self.local_file_datetime)
        )
        prompt = render_to_string(
            'calaccess_raw/downloadcalaccessrawdata.txt',
            prompt_context,
        )
        if not self.confirm_proceed(prompt):
            raise CommandError("Download cancelled")

    if self.resume_download:
        self.log_record = self.last_started_download
        self.version = self.log_record.version
    else:
        # get or create a version record
        # .get_or_create() throws IntegrityError
        try:
            self.version = self.raw_data_versions.get(
                release_datetime=self.current_release_datetime
            )
        except RawDataVersion.DoesNotExist:
            self.version = self.raw_data_versions.create(
                release_datetime=self.current_release_datetime,
                size=download_metadata['content-length']
            )
        # create a log record
        self.log_record = self.command_logs.create(
            version=self.version,
            command=self,
            called_by=self.get_caller_log()
        )

    self.download()
    self.unzip()
    self.prep()
    self.track_files()

    if getattr(settings, 'CALACCESS_STORE_ARCHIVE', False):
        self.archive()

    if not options['keep_files']:
        os.remove(self.zip_path)
        shutil.rmtree(os.path.join(self.data_dir, 'CalAccess'))

    self.log_record.finish_datetime = now()
    self.log_record.save()
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    # get the dir where data goes from app settings
    self.data_dir = get_download_directory()
    # if data_dir doesn't exist, create it
    os.path.exists(self.data_dir) or os.makedirs(self.data_dir)
    # downloaded zip file will go in data_dir
    self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
    # raw tsv files go in same data_dir in tsv/
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")

    download_metadata = self.get_download_metadata()
    self.current_release_datetime = download_metadata['last-modified']
    self.current_release_size = download_metadata['content-length']

    self.last_started_download = self.get_last_log()
    self.last_finished_download = self.get_last_log(finished=True)

    if self.last_finished_download:
        last_release_datetime = self.last_finished_download.version.release_datetime
        since_prev_version = naturaltime(last_release_datetime)
    else:
        last_release_datetime = None
        since_prev_version = None

    if last_release_datetime == self.current_release_datetime:
        already_downloaded = True
    else:
        already_downloaded = False

    # can resume only if possible and not forcing re-start
    self.resume_download = self.check_can_resume() and not options['restart']

    if self.resume_download:
        # set current size to partially downloaded zip
        self.local_file_size = os.path.getsize(self.zip_path)
        # set the datetime of last download to last modified date
        # of zip file
        timestamp = os.path.getmtime(self.zip_path)
        self.local_file_datetime = datetime.fromtimestamp(timestamp, utc)
    else:
        self.local_file_size = 0
        self.local_file_datetime = None

    if not options['noinput'] and not options['restart']:
        # setting up the prompt
        prompt_context = dict(
            current_release_datetime=self.current_release_datetime,
            resuming=self.resume_download,
            already_downloaded=already_downloaded,
            expected_size=size(self.current_release_size),
            local_file_size=size(self.local_file_size),
            download_dir=self.data_dir,
            since_prev_version=since_prev_version,
            since_local_file_modified=naturaltime(self.local_file_datetime)
        )
        prompt = render_to_string(
            'calaccess_raw/downloadcalaccessrawdata.txt',
            prompt_context,
        )
        if not self.confirm_proceed(prompt):
            raise CommandError("Download cancelled")

    if self.resume_download:
        self.log_record = self.last_started_download
    else:
        # get or create a version record
        # .get_or_create() throws IntegrityError
        try:
            version = self.raw_data_versions.get(
                release_datetime=self.current_release_datetime)
        except RawDataVersion.DoesNotExist:
            version = self.raw_data_versions.create(
                release_datetime=self.current_release_datetime,
                size=download_metadata['content-length'])
        # create a log record
        self.log_record = self.command_logs.create(
            version=version,
            command=self,
            called_by=self.get_caller_log())

    self.download()
    self.unzip()
    if not options['keep_files']:
        os.remove(self.zip_path)
    self.prep()
    if not options['keep_files']:
        shutil.rmtree(os.path.join(self.data_dir, 'CalAccess'))

    self.log_record.finish_datetime = datetime.now()
    self.log_record.save()
def set_options(self, *args, **kwargs):
    self.url = 'http://campaignfinance.cdn.sos.ca.gov/dbwebexport.zip'
    self.verbosity = int(kwargs['verbosity'])

    if kwargs['test_data']:
        self.data_dir = get_test_download_directory()
        settings.CALACCESS_DOWNLOAD_DIR = self.data_dir
    else:
        self.data_dir = get_download_directory()

    os.path.exists(self.data_dir) or os.makedirs(self.data_dir)
    self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")

    # Immediately check that the tsv directory exists when using test data,
    # so we can stop immediately.
    if kwargs['test_data']:
        if not os.path.exists(self.tsv_dir):
            raise CommandError("Data tsv directory does not exist "
                               "at %s" % self.tsv_dir)
        elif self.verbosity:
            self.log("Using test data")

    self.csv_dir = os.path.join(self.data_dir, "csv/")
    os.path.exists(self.csv_dir) or os.makedirs(self.csv_dir)

    if kwargs['download']:
        self.download_metadata = self.get_download_metadata()
        self.local_metadata = self.get_local_metadata()

        total_size = self.download_metadata['content-length']
        last_modified = self.download_metadata['last-modified']
        last_download = self.local_metadata['last-download']
        cur_size = 0

        self.resume_download = (kwargs['resume-download'] and
                                os.path.exists(self.zip_path))

        if self.resume_download:
            # Make sure the downloaded chunk is newer than the
            # last update to the remote data.
            timestamp = os.path.getmtime(self.zip_path)
            chunk_datetime = datetime.fromtimestamp(timestamp, utc)
            self.resume_download = chunk_datetime > last_modified

            if self.resume_download:
                last_download = chunk_datetime
                cur_size = os.path.getsize(self.zip_path)

        prompt_context = dict(
            resuming=self.resume_download,
            already_downloaded=last_modified == last_download,
            last_modified=last_modified,
            last_download=last_download,
            time_ago=naturaltime(last_download),
            total_size=size(total_size),
            cur_size=size(cur_size),
            download_dir=self.data_dir,
        )
        self.prompt = render_to_string(
            'calaccess_raw/downloadcalaccessrawdata.txt',
            prompt_context,
        )