def profile_as_job(database_file): profiler = Profiler.Profiler() global STOP_RUNNING if STOP_RUNNING: return try: pid = str(os.getpid()) print 'Begin: [' + pid + ']: ' + database_file profiler.profile(database_file) print 'End: [' + pid + ']: ' + database_file except KeyboardInterrupt: App.error('KeyboardInterrupt with: ' + database_file) STOP_RUNNING = ApplicationOptions.OPTIONS['stop_on_error'] except: msg = '[' + pid + '] ERROR in THREAD:\n' msg += '[' + pid + '] -----------------------------------------------------------------\n' for line in traceback.format_exc().split('\n'): msg += '[' + pid + '] ' + line + '\n' msg += '[' + pid + '] -----------------------------------------------------------------' # # Will print colored here instead of app.error as facilitates reading error output and debuging # print tc.RED + msg + tc.ENDC ApplicationOptions.error(msg) # raise finally: return profiler
def save_complete_dataset(row, count, summaries, types, column_metadata, gps_counts, report_progress=False): pid = str(os.getpid()) try: db = save_database(row, count, summaries) save_columns(db, types, column_metadata) if not IGNORE_INDEX: if COPY_GEO_INDEX: # link_gps_data(db.database_id, db.id) print ' -> GPS Data will be linked to datasets at end of processing.' else: save_gps_data(db, gps_counts) # return db except: msg = '[' + pid + '] ERROR in THREAD:\n' msg += '[' + pid + '] -----------------------------------------------------------------\n' msg += '[' + pid + '] ' + traceback.format_exc() + '\n' msg += '[' + pid + '] -----------------------------------------------------------------' ApplicationOptions.error(msg) raise if report_progress: report(db)
def run_bash(bash_command, folder=None):
    """Run an external command and return its combined stdout/stderr output.

    bash_command -- either a list of argv tokens or a whitespace-separated
                    command string (split naively; no shell quoting).
    folder       -- optional working directory for the command.

    Raises RuntimeError (after logging via App.error) when the command
    exits with a nonzero status.
    """
    cmd = bash_command
    if type(cmd) == str:
        cmd = cmd.split()
    # BUGFIX: the original only captured stderr when a folder was given, so
    # error output was silently dropped in the no-folder case.  cwd=None
    # means "inherit the current directory", so a single call covers both.
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT, cwd=folder)
    stdout_data, stderr_data = process.communicate()
    if process.returncode != 0:
        message = "%r failed, status code %s stdout %r stderr %r" % (
            cmd, process.returncode, stdout_data, stderr_data)
        App.error(message)
        raise RuntimeError(message)
    output = ''
    if stdout_data:
        output += stdout_data
    if stderr_data:
        # With stderr redirected into stdout this stays None; kept for
        # backward compatibility with callers of the old two-pipe form.
        output += stderr_data
    return output
def profile(self, file_name, skip_rows=0, n_rows=None):
    """Profile a single tabular file.

    Initializes metadata and row counts, then delegates the real work to
    ``do_profile``.  Every outcome (success, skip, empty file, error) leaves
    a summary in ``self.last_sumary`` so batch drivers can always collect a
    result row, even when this method fails.

    file_name -- path of the file to profile.
    skip_rows -- rows to skip when loading (forwarded to the loader).
    n_rows    -- maximum number of rows to load (None = all).
    """
    # print '-----------------> file_name:', file_name
    file_rows = None
    metadata = None
    self.db_name = None
    try:
        metadata, file_rows = ProfilerUtils.init_profiler(
            file_name, part=self.part, metadata_file=self.metadata_file,
            ignore_metadata=self.ignore_metadata)
        self.db_name = metadata['db_name']
        # Temporary fix for any uncaught error that can occur: pre-seed the
        # summary with a generic failure so it is never left unset.
        # TODO: improve this.
        self.last_sumary = self.create_error_sumary(
            'Something went wrong!', file_rows, metadata=metadata)
        # file_rows presumably includes a header line, hence the -1.
        self.printv('File rows: {0:n}'.format(file_rows - 1))
        if file_rows == 0:
            self.last_sumary = self.create_error_sumary(
                'Error: Empty file.', None)
            return
        self.printv('Getting metadata.')
        # metadata = SocrataUtils.metadata_of(self.db_name)
        if self.check_if_skip_dataset(metadata):
            # Not a primary Socrata database: record a "skipped" summary
            # instead of profiling.
            App.warn(
                'Skipping database analisys: Not primary Socrata Database')
            self.last_sumary = self.create_error_sumary(
                Profiler.MSG_SKIP_VIEW, file_rows, metadata)
        else:
            self.printv("Loading file: " + file_name)
            database = PandasUtils.load_database(file_name, skiprows=skip_rows, nrows=n_rows)
            self.do_profile(database, file_name, skip_rows, n_rows, metadata)
    except (KeyboardInterrupt, SystemError) as e:
        # NOTE(review): SystemError here looks like it was meant to be
        # SystemExit -- confirm intent before changing.
        ex_type, ex, tb = sys.exc_info()
        error_msg = '\n'.join(traceback.format_tb(tb))
        self.last_sumary = self.create_error_sumary(
            'Interrupted by Keyboard', file_rows, metadata=metadata)
        # logging.exception(e)
        App.error('Interrupted by Keyboard \n' + error_msg)
        raise
    except Exception as e:
        # Ordinary failure: record the traceback in the summary; only
        # propagate when stop_on_error is configured.
        ex_type, ex, tb = sys.exc_info()
        error_msg = '\n'.join(traceback.format_tb(tb))
        App.error(error_msg)
        self.last_sumary = self.create_error_sumary('Error: ' + error_msg,
                                                    file_rows, metadata=metadata)
        if self.stop_on_error:
            raise
    except:
        # Anything that is not an Exception subclass (e.g. SystemExit)
        # lands here.
        self.last_sumary = self.create_error_sumary(
            'Unknown Error: {0}'.format(sys.exc_info()), file_rows,
            metadata=metadata)
        # logging.exception(sys.exc_info())
        App.error(sys.exc_info())
        if self.stop_on_error:
            raise
def main():
    """Entry point for the batch profiler.

    Builds the list of database files to profile -- from a metadata CSV, the
    CUSP data warehouse, an id file, a file-of-references, or the single
    file given in the options -- then profiles them, aggregates the results
    and saves the output files.
    """
    print "===> ETL - Profiler <==="
    # ---------------------> Initialize batch processing
    begin_time = TimeUtils.current_time_formated()
    global opts
    opts = process_args()
    # print '1>>>>>>>>>>>>>>>>>>>> ', ApplicationOptions.OPTIONS
    ApplicationOptions.OPTIONS = opts
    # print '2>>>>>>>>>>>>>>>>>>>> ', ApplicationOptions.OPTIONS
    verb = ApplicationOptions.get_option('verbose', default=False)
    print '[verbose]=', verb
    # print '3>>>>>>>>>>>>>>>>>>>> ', ApplicationOptions.OPTIONS
    database_files = []
    if opts['sources_in_metadata']:
        # Source files are listed in a 'source_file' column of the metadata CSV.
        print 'Processing metadata csv to get source files.'
        database_files = pandas.read_csv(opts['metadata_file'], header=0, index_col=None)
        if 'source_file' not in database_files.columns:
            message = 'metadata file should have a source_file column when "sources_in_metadata" option is used.'
            ApplicationOptions.error(message)
            raise Exception(message)
        database_files = database_files['source_file'].tolist()
    elif opts['cusp_dw']:
        # Enumerate ids straight from the CUSP Data Warehouse folder, save
        # them to an id file and stop -- profiling happens on a later run.
        print 'Generating file list based on CUSP DW folder.'
        database_files = CuspDW.tabular_ids_in_DW()
        opts['db_ids_file'] = True
        print 'Found ', len(database_files), ' ids.'
        filename = CuspDW.OUTPUT_FILE_NAME
        if opts['to_folder'] is not None:
            filename = opts['to_folder'] + filename
        CuspDW.save_to_id_file(database_files, filename)
        print 'Saved file with updated ids: ', filename
        # ApplicationOptions  # TODO: why is ApplicationOptions here?
        return
    elif opts['file_refs'] is None and opts['db_ids_file'] is None:
        # No list given: profile the single file from the 'file' option.
        database_files = [opts['file']]
    elif opts['db_ids_file'] is not None:
        # One dataset id per line.
        print 'Generation file list based on: db_ids_file.'
        for id in open(opts['db_ids_file']).readlines():
            database_files.append(id.strip())
    else:
        # file_refs: one path per line; blank lines and '#' comments ignored.
        print 'Generation file list based on: file_refs'
        lines = open(opts['file_refs']).readlines()
        for file in lines:
            if file.strip() == '' or file.startswith('#'):
                continue
            database_files.append(file.rstrip())
    print 'Databases to profile: {0}'.format(len(database_files))
    profile_databases(database_files)
    # ---------------------> Process Profile data
    process_profiles()
    show_summaries()
    end_time = TimeUtils.current_time_formated()
    # ---------------------> Save Data
    save_files()
    print '\nBegin: ', begin_time
    print 'End: ', end_time
    print '\n\n==> The End. <=='
def process_profiles():
    """Aggregate the results of all finished profilers into module globals.

    Rebuilds the global accumulators (summaries, ZIP/GPS row counts,
    per-database counts, geo index, type and column-metadata tables) from
    scratch, then folds in each profiler's last run.  Errors are swallowed
    unless the 'stop_on_error' option is set.
    """
    try:
        global summaries
        global all_gps_rows
        global geo_index_by_dataset
        global all_gps_rows_by_database
        global all_zip_rows
        global gps_db_count
        global zip_db_count
        global profiled_database_types
        profiled_database_types = None
        global profiled_column_metadata
        profiled_column_metadata = None
        summaries = pandas.DataFrame()
        # Count of rows by ZIP and GPS
        all_gps_rows = pandas.DataFrame()
        all_zip_rows = pandas.DataFrame()
        geo_index_by_dataset = pandas.DataFrame()
        all_gps_rows_by_database = pandas.DataFrame()
        # Count of databases that have ZIP and GPS
        gps_db_count = pandas.DataFrame()
        zip_db_count = pandas.DataFrame()
        i = 0
        for profiler in profilers:
            i += 1
            # Every profiler contributes its summary row, successful or not.
            summaries = pandas.concat([summaries[:], profiler.last_sumary])
            # Only successful runs contribute to the geo/type accumulators.
            if profiler.last_sumary['ETL-Profiler Status'][0] == 'OK':
                print 'Counting zip and gps data: ', i, '/', len(profilers)
                print ' by Rows'
                all_zip_rows = PandasUtils.merge_series_summing_values(
                    all_zip_rows, profiler.last_zip_rows)
                all_gps_rows = PandasUtils.merge_series_summing_values(
                    all_gps_rows, profiler.last_gps_rows)
                print ' rows by Databases'
                # NOTE(review): .ix was removed in modern pandas; this file
                # targets an older pandas release.
                all_gps_rows_by_database = PandasUtils.merge_by_database(
                    all_gps_rows_by_database, profiler.last_gps_rows,
                    profiler.last_sumary.ix[0].Name)
                if not opts['ignore_index']:
                    geo_index_by_dataset = geo_index_by_dataset.append(
                        profiler.last_geo_index, ignore_index=True)
                print ' by Databases'
                # To consider that this database counts only by one, even if it appears more
                temp = pandas.DataFrame(profiler.last_zip_rows, columns=['count'])
                temp['count'] = 1
                temp = temp['count']
                zip_db_count = PandasUtils.merge_series_summing_values(
                    zip_db_count, temp)
                temp = pandas.DataFrame(profiler.last_gps_rows, columns=['count'])
                temp['count'] = 1
                temp = temp['count']
                gps_db_count = PandasUtils.merge_series_summing_values(
                    gps_db_count, temp)
                # First successful profiler seeds the tables; later ones append.
                if profiled_database_types is None:
                    profiled_database_types = profiler.types_summary.copy()
                else:
                    profiled_database_types = profiled_database_types.append(
                        profiler.types_summary, ignore_index=True)
                if profiled_column_metadata is None:
                    profiled_column_metadata = profiler.column_metadata.copy()
                else:
                    profiled_column_metadata = profiled_column_metadata.append(
                        profiler.column_metadata, ignore_index=True)
    except:
        if opts['stop_on_error']:
            raise
        ApplicationOptions.error(Exception('Error processing profilers'))
    if 'stop_on_error' in opts and opts['stop_on_error'] and has_error_on(
            summaries):
        ApplicationOptions.error(Exception('Error on summaries'))
def run(*args): begin = current_time_formated() # if len(args) != 1: raise Exception('Summaries file must be passed as param. Only this param should be used.') print '---------- Considered Configuration ------------' print 'Args = ', args global COPY_GEO_INDEX CLEAN = args[0] == 'clean' or (len(args) > 1 and args[1] == 'clean') global IMPORT_ONLY for arg in args: if arg.startswith("import-only="): IMPORT_ONLY = arg.split('=')[1] IGNORE_INDEX = 'ignore-index' in args REFRESH_VIEWS = 'skip-refresh-views' not in args COPY_GEO_INDEX_ONLY = 'copy-index-only' in args COPY_GEO_INDEX = 'copy-geo-index' in args or COPY_GEO_INDEX_ONLY print 'CLEAN = ', CLEAN print 'IGNORE_INDEX = ', IGNORE_INDEX print 'COPY_GEO_INDEX = ', COPY_GEO_INDEX print 'COPY_GEO_INDEX_ONLY = ', COPY_GEO_INDEX_ONLY print 'REFRESH_VIEWS = ', REFRESH_VIEWS print 'IMPORT_ONLY dataset = ', IMPORT_ONLY print '----------------------\n\n' # link_missing_gpsdata() # stop=1/0 if CLEAN: print 'Cleaning database before import.' ColumnData.objects.all().delete() Column.objects.all().delete() GpsData.objects.all().delete() Database.objects.all().delete() summaires_file = args[0] column_metadata_file = summaires_file.replace('.csv', '_columns_metadata.csv') if not os.path.isfile(column_metadata_file): column_metadata_file = None gps_counts_file = summaires_file.replace('.csv', '_index.csv') if not os.path.isfile(gps_counts_file): gps_counts_file = None if args[0] not in ['update_alarms_only'] and not COPY_GEO_INDEX_ONLY: ####################### Open files: summaries, columns, geo_index print 'Loading summaries...' summaries = pandas.read_csv(summaires_file, na_values="", dtype=object) print 'Loading columns...' types_file = summaires_file.replace('.csv', '_columns.csv') types = pandas.read_csv(types_file) print ' > ', len(types) print 'Loading column data...' 
if column_metadata_file: column_metadata = pandas.read_csv(column_metadata_file) print ' > ', len(column_metadata_file) gps_counts = None if IGNORE_INDEX: print 'IGNORING Geo Index...' else: if COPY_GEO_INDEX: print 'Geo Index will be COPIED and Linked after insert datasets ...' else: print 'Loading Geo Index...' if gps_counts_file: gps_counts = pandas.read_csv(gps_counts_file) else: gps_counts = pandas.DataFrame() print 'Summaries to import: ', len(summaries), '\n\n' columns = summaries.columns # pool = Pool(WORKERS) # print 'Created pool with size:', WORKERS global total_databases total_databases = len(summaries) for count, row in summaries.iterrows(): if IMPORT_ONLY and row.Name != IMPORT_ONLY: continue # if count < 903: continue # if row.Name != 'hh8s-wfy6': continue try: save_complete_dataset(row, count + 1, summaries, types, column_metadata, gps_counts, report_progress=True) # print ' + Creating job for ', row.Name # pool.apply_async(save_complete_dataset, # args=(row, count, summaries, types, column_metadata, gps_counts), # callback= report) except Exception as e: ApplicationOptions.error(e) if STOP_ON_ERROR: raise e else: ex_type, ex, tb = sys.exc_info() ERRORS.append('{0}: {1}\n{2}\n'.format( row.Name, e, traceback.format_tb(tb))) # break # print 'All jobs created. Waiting for results...' # pool.close() # pool.join() # print 'All profilers done.' print 'All imported !!!' print '==============================' print ' Datasets = ', len(summaries) print ' Columns = ', len(types) print ' Column Data = ', len(column_metadata) # print ' GPS Data = ', len(gps_counts) print '==============================' if COPY_GEO_INDEX: print 'Copying GEO INDEX. (This can take some minutes)' copy_from_geo_index(gps_counts_file, summaries_file=summaires_file) print 'Updating alarms. (This can take sometime)' update_alarms() if REFRESH_VIEWS and gps_counts_file is not None: print 'Refreshing Materialized views. 
(This can take sometime)' refresh_materialized_views() else: print 'Materialized views were not refreshed.' print 'Updating System Control...' system = System() if args[0] == 'update_alarms_only': system.source_file = 'Updated Index Only' elif args[0] == 'clean': system.source_file = 'Clean DB only' else: system.source_file = summaires_file system.save() print 'Clearing cache so it will be updated on demand.' # clear_cache() print 'clear cache to update Urban Profiler' print ' -----------------------------------< ERRORS >------------------------------------------' for e in ERRORS: print ApplicationOptions.error(e) print ' ------------------------------------< END >------------------------------------------' end = current_time_formated() print '\nBegin:', begin print 'End: ', end print '!!! END OF SCRIPT !!!'