def get_geo_columns_for_index(column_types, dataset):
    App.debug('Preparing Geo Index')
    geo_cols = {}
    lat_col_name = None
    lon_col_name = None
    for count_cols, row in column_types.iterrows():
        App.debug('row= ', row['profiler-most-detected'], row['column-name'])
        # print "row['profiler-most-detected'] in INDEX_ACCEPTED_TYPES_GEO=", row['profiler-most-detected'] in INDEX_ACCEPTED_TYPES_GEO
        if row['profiler-most-detected'] in INDEX_ACCEPTED_TYPES_GEO:
            col_name = row['column-name']
            App.debug('> Found:', col_name)
            # Handle LATITUDE and LONGITUDE detected in separate columns
            if row['profiler-most-detected'] == TypeDetector.GEO_GPS_LATLON:
                if 'LONGITUDE' in col_name.upper():
                    lon_col_name = col_name
                elif 'LATITUDE' in col_name.upper():
                    lat_col_name = col_name
            else:
                geo_cols[col_name] = row['profiler-most-detected']
    if lat_col_name and lon_col_name:
        new_gps_col = NEW_GEO_COL_GPS_PREFIX
        dataset[new_gps_col] = PandasUtils.join_lat_lon_into_gps(dataset, lat_col_name, lon_col_name)
        geo_cols[new_gps_col] = TypeDetector.GEO_GPS
        App.debug('CREATED GPS COL:', dataset[new_gps_col])
    App.debug('Geo cols to index:', geo_cols)
    return geo_cols
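# Example (hypothetical sketch): what get_geo_columns_for_index() expects and returns.
# Assumes column_types is the profiler summary DataFrame with 'column-name' and
# 'profiler-most-detected' columns; TypeDetector.GEO_ZIP below is an illustrative type name.
#
#   column_types = pandas.DataFrame([
#       {'column-name': 'LATITUDE',  'profiler-most-detected': TypeDetector.GEO_GPS_LATLON},
#       {'column-name': 'LONGITUDE', 'profiler-most-detected': TypeDetector.GEO_GPS_LATLON},
#       {'column-name': 'Zip',       'profiler-most-detected': TypeDetector.GEO_ZIP},
#   ])
#   geo_cols = get_geo_columns_for_index(column_types, dataset)
#   # geo_cols maps 'Zip' to its detected type and adds a new joined GPS column
#   # (named NEW_GEO_COL_GPS_PREFIX) built from the LATITUDE/LONGITUDE pair.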
def save_complete_dataset(row, count, summaries, types, column_metadata, gps_counts, report_progress=False):
    pid = str(os.getpid())
    try:
        db = save_database(row, count, summaries)
        save_columns(db, types, column_metadata)
        if not IGNORE_INDEX:
            if COPY_GEO_INDEX:
                # link_gps_data(db.database_id, db.id)
                print ' -> GPS Data will be linked to datasets at end of processing.'
            else:
                save_gps_data(db, gps_counts)
        # return db
    except:
        msg = '[' + pid + '] ERROR in THREAD:\n'
        msg += '[' + pid + '] -----------------------------------------------------------------\n'
        msg += '[' + pid + '] ' + traceback.format_exc() + '\n'
        msg += '[' + pid + '] -----------------------------------------------------------------'
        ApplicationOptions.error(msg)
        raise
    if report_progress:
        report(db)
def is_us_address(value):
    try:
        # usaddress.tag() returns a (tagged_fields, address_type) pair; compare the
        # address_type string with != (identity comparison with 'is not' is unreliable).
        tag = usaddress.tag(value)
        return len(tag) > 2 or tag[-1] != 'Ambiguous'
    except usaddress.RepeatedLabelError as ex:
        App.debug('Error detecting Geo-Address:', ex)
        return False
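# Example (hypothetical sketch): how is_us_address() reacts to typical inputs.
# usaddress labels a value it cannot classify with the address type 'Ambiguous'.
#
#   is_us_address('123 Main St, Brooklyn, NY 11201')   # -> True (tagged as a street address)
#   is_us_address('not an address at all')             # -> likely False, or False via the
#                                                       #    RepeatedLabelError branch above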
def profile_as_job(database_file):
    profiler = Profiler.Profiler()
    global STOP_RUNNING
    if STOP_RUNNING:
        return
    try:
        pid = str(os.getpid())
        print 'Begin: [' + pid + ']: ' + database_file
        profiler.profile(database_file)
        print 'End: [' + pid + ']: ' + database_file
    except KeyboardInterrupt:
        App.error('KeyboardInterrupt with: ' + database_file)
        STOP_RUNNING = ApplicationOptions.OPTIONS['stop_on_error']
    except:
        msg = '[' + pid + '] ERROR in THREAD:\n'
        msg += '[' + pid + '] -----------------------------------------------------------------\n'
        for line in traceback.format_exc().split('\n'):
            msg += '[' + pid + '] ' + line + '\n'
        msg += '[' + pid + '] -----------------------------------------------------------------'
        # Print colored here instead of App.error, as it makes the error output easier to read and debug
        print tc.RED + msg + tc.ENDC
        ApplicationOptions.error(msg)
        # raise
    finally:
        return profiler
def detect_null(column):
    App.debug('Detecting: Null')
    not_null_indexes = column.astype(str).apply(
        lambda x: x.lower() not in NULL_VALUES)
    not_null = column[not_null_indexes]
    null_indexes = column[not_null_indexes == False]
    App.debug('   detected: ', len(null_indexes))
    # App.debug('   null_indexes: \n', null_indexes)
    # if len(null_indexes) > 0:
    return NULL, null_indexes, not_null
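# Example (hypothetical sketch): how detect_null() splits a column.
# Assumes NULL_VALUES contains lowercase markers such as 'null' and 'n/a'.
#
#   col = pandas.Series(['12', 'null', 'N/A', 'hello'])
#   type_name, nulls, not_nulls = detect_null(col)
#   # type_name == NULL
#   # nulls.tolist()     -> ['null', 'N/A']  (values whose lowercase form is in NULL_VALUES)
#   # not_nulls.tolist() -> ['12', 'hello']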
def simplify(column_types_count):
    simple_types_count = {}
    for prefix in TYPE_PREFIXES:
        simple_types_count[prefix] = 0
    for type_name in column_types_count:
        App.debug('Computing [{0}]: {1}'.format(type_name, column_types_count[type_name]))
        prefix = type_name.split('-')[0]
        simple_types_count[prefix] += column_types_count[type_name]
    App.debug('Simple types count: ', simple_types_count)
    return simple_types_count
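# Example (hypothetical sketch): simplify() collapses detailed type counts into their prefixes.
# Assumes type names follow the '<prefix>-<detail>' convention and that TYPE_PREFIXES
# contains entries such as 'geo' and 'temporal'.
#
#   counts = {'geo-gps': 40.0, 'geo-zip': 10.0, 'temporal-date': 25.0}
#   simplify(counts)
#   # -> {'geo': 50.0, 'temporal': 25.0, ...remaining prefixes left at 0}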
def prepare_location_columns(database, metadata_types):
    for col in database.columns:
        # print 'Checking:' + col + ' -type:' + metadata_types[col]
        col_is_string = database[col].dtype == object
        if col_is_string and col in metadata_types.keys() and metadata_types[col].lower() == 'location':
            # Is it a composite location (values holding an HTML break or newline)?
            if True in database[col].astype(str).apply(lambda x: '<br />' in x or '\n' in x).value_counts():
                App.debug('Separating location column: ', col)
                # Split it into multiple columns
                new_col = col + PREFIX_NEW_COLUMN + 'gps'
                database[new_col] = database[col].apply(lambda x: extract_gps_from_composite_location(x))
def process_args(value_opts={}, boolean_opts={}):
    v_opts = value_opts.copy()
    b_opts = boolean_opts.copy()
    b_opts['debug'] = False
    opts = {}
    args = ARGS or sys.argv[1:]
    print 'args=', args
    for arg in args:
        param = arg.split('=')[0][2:] if '=' in arg else arg[2:]
        value = arg.split('=')[1] if '=' in arg else None
        if param == 'help':
            print 'Possible simple params are:'
            for opt in b_opts.keys():
                print '   --' + opt
            print 'And params that take a value are:'
            for opt in v_opts.keys():
                print '   --' + opt
            sys.exit(0)
        elif value is None:
            if param not in b_opts:
                print 'param: {0}'.format(param)
                raise SystemExit('ERROR: Invalid arg "{0}".\nValid options are: {1} with preceding --.'.format(arg, b_opts.keys()))
            else:
                b_opts[param] = True
        elif param in v_opts:
            v_opts[param] = value
        else:
            raise SystemExit('ERROR: Invalid value arg "{0}".\nValid options are: {1} with preceding -- and a value, like --nrows=10.'.format(arg, v_opts.keys()))
    opts.update(b_opts)
    opts.update(v_opts)
    if opts['debug']:
        App.start_debuging()
        print 'Considered options:'
        for k in opts:
            print '  - {0} = {1}'.format(k, opts[k])
    return opts
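# Example (hypothetical sketch): a typical invocation of process_args().
# The option names below are illustrative; the real ones come from the caller's dicts.
#
#   # python profiler.py --nrows=100 --debug
#   opts = process_args(value_opts={'nrows': None}, boolean_opts={'stop_on_error': False})
#   # -> {'nrows': '100', 'debug': True, 'stop_on_error': False}
#   # Note: values arrive as strings; callers convert them as needed.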
def most_detected(detected_types):
    # Guard against None/empty input before looking anything up
    if detected_types is None or len(detected_types) == 0:
        return NULL, 100  # %
    App.debug('Detected types: ', detected_types.keys())
    most_detected_type = NULL
    precision = detected_types[most_detected_type]
    for key in detected_types.keys():
        current = detected_types[key]
        App.debug('Current: [{0}]={1}'.format(key, current))
        if current > 0 and current >= precision:
            most_detected_type = key
            precision = detected_types[most_detected_type]
            App.debug('most_detected updated with key:', key)
    App.debug('[most_detected] detected_types=', detected_types)
    App.debug('[most_detected]:', most_detected_type)
    return most_detected_type, precision
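# Example (hypothetical sketch): most_detected() returns the type with the highest percentage.
# Assumes the NULL constant is the string 'null' and is always present as a key.
#
#   most_detected({'geo-gps': 80.0, 'text': 15.0, 'null': 5.0})
#   # -> ('geo-gps', 80.0)
#   most_detected({})    # empty or None input -> (NULL, 100)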
def run_bash(bash_command, folder=None):
    cmd = bash_command
    if type(cmd) == str:
        cmd = cmd.split()
    if folder is None:
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    else:
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, cwd=folder, stderr=subprocess.STDOUT)
    stdout_data, stderr_data = process.communicate()
    if process.returncode != 0:
        message = "%r failed, status code %s stdout %r stderr %r" % (
            cmd, process.returncode, stdout_data, stderr_data)
        App.error(message)
        raise RuntimeError(message)
    output = ''
    if stdout_data:
        output += stdout_data
    if stderr_data:
        output += stderr_data
    return output
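# Example (hypothetical sketch): running simple commands with run_bash().
#
#   run_bash('ls -la', folder='/tmp')   # -> the captured stdout of `ls -la` run inside /tmp
#   run_bash(['wc', '-l', 'data.csv'])  # a list is passed straight to subprocess.Popen
#   # A non-zero exit status raises RuntimeError after logging the message via App.error.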
def get_temp_columns_for_index(column_types, dataset):
    App.debug('Preparing Temporal Index')
    temp_cols = {}
    date_col_name = None
    date_col_type = None
    time_col_name = None
    for count_cols, row in column_types.iterrows():
        if row['profiler-most-detected'] in INDEX_ACCEPTED_TYPES_TEMP:
            col_name = row['column-name']
            App.debug('> Found:', col_name)
            if col_name.upper() == 'DATE':
                date_col_name = col_name
                # Remember the detected type now; 'row' changes on every iteration
                date_col_type = row['profiler-most-detected']
            elif col_name.upper() == 'TIME':
                time_col_name = col_name
            else:
                temp_cols[col_name] = row['profiler-most-detected']
    # print 'date_col_name and time_col_name=', date_col_name, time_col_name
    # If the dataset has both TIME and DATE, join them into a single DATETIME column
    if date_col_name and time_col_name:
        new_col_name = NEW_TEMP_COL_DATETIME_PREFIX
        dataset[new_col_name] = TimeUtils.join_date_and_time(dataset[date_col_name], dataset[time_col_name])
        temp_cols[new_col_name] = TypeDetector.TEMPORAL_DATE_TIME
    # Has only DATE, not both. If it has only TIME, disregard it.
    elif date_col_name:
        temp_cols[date_col_name] = date_col_type
    App.debug('Temp cols to index', temp_cols)
    return temp_cols
def detect_text(column):
    App.debug('Detecting: Text')
    nulls, not_nulls = detect_null(column)[1:]
    App.debug('Text Values:', not_nulls.values[:10])
    App.debug('Non Text Values:', nulls.values[:10])
    return TEXTUAL, not_nulls, nulls
def valid_values_of_type(type_name, column_values):
    App.debug('valid_values_of_type()')
    detectors_type, type_detectors = data_detectors()
    for detector in type_detectors:
        if detector['name'] == type_name:
            App.debug('Detecting valid values for:', type_name)
            detected, not_detected, type_name = detect_type(
                detector, detectors_type, column_values)
            # type_name, detected, not_detected = detect_using_dynamic(detector, column_values)
            App.debug('Detected: ', len(detected))
            return detected
    return None
def detect_us_address(column):
    type_name = GEO_ADDRESS
    prepared_col_data = column.dropna()
    is_address = prepared_col_data.astype(str).str.upper().apply(
        lambda x: is_us_address(x))
    detected = prepared_col_data[is_address == True]
    not_detected = prepared_col_data[is_address == False]
    App.debug('Detected Type:', type_name)
    App.debug('Detected Values:', len(detected), ' - ', detected.values[:10])
    App.debug('Non Detected Values:', len(not_detected), ' - ', not_detected.values[:10])
    return type_name, detected, not_detected
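# Example (hypothetical sketch): detect_us_address() splits a column into address-like and
# non-address values by applying is_us_address() to each (upper-cased) entry.
#
#   col = pandas.Series(['123 Main St, Brooklyn, NY 11201', 'hello world', None])
#   type_name, detected, not_detected = detect_us_address(col)
#   # type_name == GEO_ADDRESS; NaNs are dropped first, then each value is classified,
#   # so 'hello world' is expected to land in not_detected.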
def generate_index_on(index_geo_cols, index_temp_cols, dataset, db_name):
    index = pandas.DataFrame(columns=INDEX_COLUMNS)
    # No columns to generate an index on
    if len(index_geo_cols.keys()) == 0 and len(index_temp_cols.keys()) == 0:
        return index
    # Prepare the lists of cols.
    # If one of them is empty, add the phantom column just so the nested loop still runs
    # and generate_partial_index_on is called for the other dimension.
    if index_geo_cols is None or len(index_geo_cols) == 0:
        index_geo_cols[PHANTON_COL] = None
    if index_temp_cols is None or len(index_temp_cols) == 0:
        index_temp_cols[PHANTON_COL] = None
    # Clean the dataset before creating the partial index
    print 'Cleaning dataset to process index'
    print 'dataset size:', len(dataset)
    cols_to_clean = index_geo_cols.copy()
    cols_to_clean.update(index_temp_cols)
    for col in cols_to_clean:
        print '   > {0} - {1}'.format(col, cols_to_clean[col]).ljust(50) + '@' + TimeUtils.current_time_formated()
        # If the current col is the phantom col, skip it
        if col == PHANTON_COL:
            continue
        clean_invalid_values(cols_to_clean[col], col, dataset)
        print '   dataset size:', len(dataset)
    for geo_col in index_geo_cols.keys():
        geo_type = index_geo_cols[geo_col]
        for temp_col in index_temp_cols.keys():
            temp_type = index_temp_cols[temp_col]
            an_index = generate_partial_index_on(geo_col, geo_type, temp_col, temp_type, dataset, db_name)
            App.info('   Adding to index... '.ljust(50) + '@' + TimeUtils.current_time_formated())
            index = index.append(an_index, ignore_index=True)
    App.info('Index created with {0} rows'.format(len(index)))
    App.debug('>>>>> INDEX <<<<<\n', index)
    return index
def process_profiles():
    try:
        global summaries
        global all_gps_rows
        global geo_index_by_dataset
        global all_gps_rows_by_database
        global all_zip_rows
        global gps_db_count
        global zip_db_count
        global profiled_database_types
        profiled_database_types = None
        global profiled_column_metadata
        profiled_column_metadata = None
        summaries = pandas.DataFrame()
        # Count of rows by ZIP and GPS
        all_gps_rows = pandas.DataFrame()
        all_zip_rows = pandas.DataFrame()
        geo_index_by_dataset = pandas.DataFrame()
        all_gps_rows_by_database = pandas.DataFrame()
        # Count of databases that have ZIP and GPS
        gps_db_count = pandas.DataFrame()
        zip_db_count = pandas.DataFrame()
        i = 0
        for profiler in profilers:
            i += 1
            summaries = pandas.concat([summaries[:], profiler.last_sumary])
            if profiler.last_sumary['ETL-Profiler Status'][0] == 'OK':
                print 'Counting zip and gps data: ', i, '/', len(profilers)
                print '   by Rows'
                all_zip_rows = PandasUtils.merge_series_summing_values(
                    all_zip_rows, profiler.last_zip_rows)
                all_gps_rows = PandasUtils.merge_series_summing_values(
                    all_gps_rows, profiler.last_gps_rows)
                print '   rows by Databases'
                all_gps_rows_by_database = PandasUtils.merge_by_database(
                    all_gps_rows_by_database, profiler.last_gps_rows, profiler.last_sumary.ix[0].Name)
                if not opts['ignore_index']:
                    geo_index_by_dataset = geo_index_by_dataset.append(
                        profiler.last_geo_index, ignore_index=True)
                print '   by Databases'
                # Count each database only once, even if it appears more than once
                temp = pandas.DataFrame(profiler.last_zip_rows, columns=['count'])
                temp['count'] = 1
                temp = temp['count']
                zip_db_count = PandasUtils.merge_series_summing_values(
                    zip_db_count, temp)
                temp = pandas.DataFrame(profiler.last_gps_rows, columns=['count'])
                temp['count'] = 1
                temp = temp['count']
                gps_db_count = PandasUtils.merge_series_summing_values(
                    gps_db_count, temp)
                if profiled_database_types is None:
                    profiled_database_types = profiler.types_summary.copy()
                else:
                    profiled_database_types = profiled_database_types.append(
                        profiler.types_summary, ignore_index=True)
                if profiled_column_metadata is None:
                    profiled_column_metadata = profiler.column_metadata.copy()
                else:
                    profiled_column_metadata = profiled_column_metadata.append(
                        profiler.column_metadata, ignore_index=True)
    except:
        if opts['stop_on_error']:
            raise
        ApplicationOptions.error(Exception('Error processing profilers'))
    if 'stop_on_error' in opts and opts['stop_on_error'] and has_error_on(summaries):
        ApplicationOptions.error(Exception('Error on summaries'))
def types_of(column):
    App.debug('Detecting types of: ', column.name)
    App.debug('   size: ', len(column))
    detectors_type, detectors = data_detectors()
    App.debug('   Initializing detected_types. ')
    detected_types = {}
    # Initialize with all zeros
    for detector in detectors:
        detected_types[detector[DETECTOR_NAME]] = 0.0
    if len(column) == 0:
        App.debug('Empty column!')
        return detected_types
    remaining_values_to_detect_type = column.copy()
    ## If the column holds unicode, transform it to ASCII to avoid errors during processing.
    ## Check for unicode in the column values.
    unicode_values = remaining_values_to_detect_type.apply(
        lambda x: (type(x) is unicode))
    unicode_values_counts = unicode_values.value_counts()
    ## Transform the unicode values into ASCII if there are any
    if True in unicode_values_counts.keys() and unicode_values_counts[True] > 0:
        App.info('Recoding values... (this can take some time)')
        remaining_values_to_detect_type = remaining_values_to_detect_type.apply(
            lambda x: TextUtils.reencode_text_if_not_ascii(x))
    for detector in detectors:
        detected, not_detected, type_name = detect_type(
            detector, detectors_type, remaining_values_to_detect_type)
        detected_types[type_name] = round(
            len(detected) * 100.0 / len(column), PERCENTUAL_PRECISION)
        remaining_values_to_detect_type = not_detected
        App.debug('   Remaining: ', len(not_detected))
        # if len(remaining_values_to_detect_type) == 0:
        #     break
    return detected_types
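# Example (hypothetical sketch): types_of() returns the percentage of values matched by each
# registered detector. The detector names in the result below are illustrative.
#
#   col = pandas.Series(['10025', '11201', 'n/a', 'hello'], name='Zip')
#   types_of(col)
#   # -> e.g. {'geo-zip': 50.0, 'null': 25.0, 'text': 25.0, ...remaining detectors at 0.0}
#   # Each detector only sees the values not claimed by the detectors that ran before it.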
def metadata_of(database_id, first=True, portal_url=NYC_OPENDATA_URL_BASE):
    App.debug('   SocrataUtils.metadata_of({0})'.format(database_id))
    url = portal_url + '/views/' + database_id + JSON_EXTENSION + APP_TOKEN_PARAM
    # App.debug('url: ', url)
    metadata = {'source': 'Socrata'}
    # try:
    if True:
        App.debug('Url to get metadata from: ' + url)
        response = urllib.urlopen(url)
        data = json.loads(response.read())
        if 'id' in data and data['id'] == database_id:
            App.debug('   -> Success retrieving metadata!')
            App.debug('Retrieved metadata Keys:\n - ' + '\n - '.join(data.keys()))
            App.debug('Retrieved metadata:\n' + json.dumps(data, indent=4, sort_keys=True))
            App.debug('==========================================')
            if 'rowIdentifierColumnId' in data:
                id_column_id = data['rowIdentifierColumnId']
                for col in data['columns']:
                    if col['id'] == id_column_id:
                        metadata[MetadataConstants.ID_COLUMN] = col['name']
            else:
                metadata[MetadataConstants.ID_COLUMN] = None
            # The source URL and name are literal values, not keys into the response
            metadata[MetadataConstants.METADATA_SOURCE_URL] = url
            metadata[MetadataConstants.METADATA_SOURCE_NAME] = 'Socrata Portal ' + portal_url
            metadata[MetadataConstants.NAME] = key_as_str(data, 'name')
            metadata[MetadataConstants.PREFIX + 'Description'] = key_as_str(data, 'description')
            metadata[MetadataConstants.DISPLAY_TYPE_KEY] = key_as_str(data, 'displayType')
            metadata[MetadataConstants.PREFIX + 'Category'] = key_as_str(data, 'category')
            metadata[MetadataConstants.PREFIX + 'Owner'] = key_as_str(data['owner'], 'displayName')
            metadata[MetadataConstants.PREFIX + 'Download Count'] = key_as_str(data, 'downloadCount')
            metadata[MetadataConstants.PREFIX + 'View Count'] = key_as_str(data, 'viewCount')
            metadata[MetadataConstants.PREFIX + 'Comments'] = key_as_str(data, 'numberOfComments')
            metadata[MetadataConstants.PREFIX + 'Author'] = key_as_str(data['tableAuthor'], 'displayName')
            metadata[MetadataConstants.PREFIX + 'Id'] = key_as_str(data, 'id')
            metadata[MetadataConstants.PREFIX + 'Attribution'] = key_as_str(data, 'attribution')
            metadata[MetadataConstants.PREFIX + 'View Type'] = key_as_str(data, 'viewType')
            metadata[MetadataConstants.PREFIX + 'Display Type'] = key_as_str(data, 'displayType')
            metadata[MetadataConstants.PREFIX + 'Number of Coments'] = key_as_str(data, 'numberOfComments')
            ##> Discover if this dataset is a view
            if 'modifyingViewUid' not in data:
                metadata[MetadataConstants.PREFIX + 'View From'] = None
            else:
                metadata[MetadataConstants.PREFIX + 'View From'] = key_as_str(data, 'modifyingViewUid')
            timestamp = int(data['createdAt'].__str__())
            metadata[MetadataConstants.PREFIX + 'Created At'] = datetime.datetime.fromtimestamp(timestamp).__str__()
            timestamp = int(data['viewLastModified'].__str__())
            metadata[MetadataConstants.PREFIX + 'Last Modified'] = datetime.datetime.fromtimestamp(timestamp).__str__()
            timestamp = int(data['publicationDate'].__str__())
            metadata[MetadataConstants.PREFIX + 'Publication Date'] = datetime.datetime.fromtimestamp(timestamp).__str__()
            metadata['Tags'] = key_as_str(data, 'tags')
            if metadata['Tags'] == 'None':
                metadata['Tags'] = None
            if 'metadata' in data and 'custom_fields' in data['metadata']:
                custom_fields = data['metadata']['custom_fields']
                if 'Update' in custom_fields and 'Update Frequency' in custom_fields['Update']:
                    metadata[MetadataConstants.PREFIX + 'Update Frequency'] = custom_fields['Update']['Update Frequency'].__str__()
                if 'Dataset Information' in custom_fields and 'Agency' in custom_fields['Dataset Information']:
                    metadata[MetadataConstants.PREFIX + 'Agency'] = custom_fields['Dataset Information']['Agency'].__str__()
            types = {}
            columns = data['columns']
            for col in columns:
                col_name = col['name'].strip(' ').encode('ascii', 'ignore')
                col_type = col['dataTypeName']
                types[col_name] = col_type
            metadata[MetadataConstants.PREFIX + 'Types'] = types
            metadata[MetadataConstants.STATUS] = MetadataConstants.STATUS_SUCCESS
        else:
            if 'Cannot find view with id' in data['message']:
                metadata[MetadataConstants.STATUS] = MetadataConstants.STATUS_ERROR_VIEW_NOT_FOUND
            else:
                metadata[MetadataConstants.STATUS] = 'Error'
                metadata['message'] = data['message']
    # except e:
    #     # This means that it is not from Socrata, or that some other error occurred.
    #     # Just return None.
    #     if first:
    #         App.debug('Waiting to try again')
    #         sleep(0.5)
    #         return metadata_of(database_id, first=False)
    #     metadata[MetadataConstants.STATUS] = 'Error Exception'
    #     metadata['message'] = 'Error accessing a Socrata Portal with url: {0}'.format(url)
    if metadata[MetadataConstants.STATUS] != MetadataConstants.STATUS_SUCCESS:
        # logging.warn(metadata[STATUS])
        App.debug('WARNING: ', metadata[MetadataConstants.STATUS])
    # Before returning, convert any unicode values to plain str (assign the result back)
    for k in metadata.keys():
        metadata[k] = TextUtils.reencode_text_if_not_ascii(metadata[k])
    # if type(metadata[k]) is unicode:
    #     # print '   Unicode info found on key: ', k, '=', metadata[k]
    #     metadata[k] = metadata[k].encode('ascii', 'ignore')
    ##> If there was an error, show the url so the user can check it
    if metadata[MetadataConstants.STATUS] == MetadataConstants.STATUS_ERROR_VIEW_NOT_FOUND:
        App.info('   Metadata not found on Socrata with url: ' + url)
    ##> Show the retrieved dataset name to indicate success
    if metadata[MetadataConstants.STATUS] == MetadataConstants.STATUS_SUCCESS:
        App.info('   OK. Dataset Retrieved Name: ' + metadata[MetadataConstants.NAME])
    App.debug('Retrieved Metadata: \n' + json.dumps(metadata, ensure_ascii=False, indent=4, sort_keys=True))
    return metadata
def load_database(database_file, skiprows=None, nrows=None):
    # It is a Socrata CSV database. The wget on compute is not getting the extension as it should.
    file_type = 'CSV'  # default if no extension is found
    if database_file.endswith('.csv'):
        file_type = 'CSV'
    if database_file.endswith('.json'):
        file_type = 'JSON'
    file_encoding = get_encoding(database_file)
    App.info('   > File encoding: %s' % file_encoding)
    if file_type == 'CSV':
        App.debug('CSV: Reading column headers from first line.')
        cols = FileUtils.get_cols_from_csv_header(database_file)
        App.debug('Preparing column types for pandas.')
        dtypes = prepare_dtypes_for_loading(cols)
        try:
            App.debug('Trying to read csv...')
            return pandas.read_csv(database_file, skiprows=skiprows, nrows=nrows, low_memory=LOW_MEMORY,
                                   encoding=file_encoding, dtype=dtypes)
        except:
            App.debug('Default CSV did not work.')
            App.debug('Trying to read with tab as separator...')
            # The error can happen because the file is tab-separated instead of comma-separated
            return pandas.read_csv(database_file, skiprows=skiprows, nrows=nrows, low_memory=LOW_MEMORY,
                                   encoding=file_encoding, sep='\t', dtype=dtypes)
    elif file_type == 'JSON':
        # This works for JSON in the Socrata format, which has a 'data' field.
        # If it is not in that format, assume the file already holds the data.
        json_file = open(database_file)
        json_data = json.load(json_file)
        if 'data' in json_data.keys():
            App.debug('JSON: Read data from data field. (Socrata format)')
            data = json_data['data']
            cols = []
            cols_with_sub_cols = []
            App.debug('Getting column names from metadata...')
            for col in json_data['meta']['view']['columns']:
                cols.append(col['name'])
                if 'subColumnTypes' in col.keys():
                    print '   (!) Column ', col['name'], ' has sub columns: ', col['subColumnTypes']
                    cols_with_sub_cols.append(col)
            dtypes = prepare_dtypes_for_loading(cols)
            df = pandas.DataFrame(data, columns=cols)
            # Create sub-column data
            for col in cols_with_sub_cols:
                print '   Fetching sub columns of ', col['name']
                i = 0
                for sub_col in col['subColumnTypes']:
                    print '     >', sub_col
                    df[col['name'] + NEW_COLUMN_NAME_SEPARATOR + sub_col] = df[col['name']].apply(lambda x: x[i])
                    i += 1
                print '   Removing source column ', col['name'], ' from data frame.'
                # Then remove the multi-valued column
                df.drop(col['name'], axis=1, inplace=True)
            return df
        else:
            App.debug('JSON: There is no data field. Getting column names from JSON keys.')
            # Get the list of cols from the JSON
            cols = list(json_data.keys())
            dtypes = prepare_dtypes_for_loading(cols)
            # Note: DataFrame() takes no per-column dtype mapping, so build the frame from the columns only
            return pandas.DataFrame(json_data, columns=cols)
    else:
        print '===> PandasUtilError: Invalid database file: [{0}]'.format(database_file)
        # raise ApplicationExecption('File must be json or csv!'.format(database_file))
        raise RuntimeError(
            'File must be json (with data inside a data field) or csv! File: {0}'.format(database_file))
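# Example (hypothetical sketch): loading the two supported formats with load_database().
# The file names are illustrative.
#
#   df = load_database('data/erm2-nwe9.csv', nrows=1000)   # CSV: header read first, dtypes prepared
#   df = load_database('data/erm2-nwe9.json')              # Socrata JSON: columns come from meta.view.columns,
#                                                          # multi-valued columns are expanded into sub-columns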