def get_metadata_from_file(database_name, file_name, metadata_file):
    # Read metadata from file
    metadata = {}
    App.info("Reading metadata from file.")
    metadata_from_file = pandas.read_csv(metadata_file)
    ds_metadata = metadata_from_file.loc[
        metadata_from_file[METADATA_FILE_SOURCE_FILE] == file_name]
    # print '=============================================\n'
    # print 'ds_metadata\n', ds_metadata.T
    # print '=============================================\n'
    if len(ds_metadata) == 0:
        metadata[MetadataConstants.STATUS] = MetadataConstants.STATUS_ERROR_METADATA_NOT_FOUND
        return metadata
    ds_metadata = ds_metadata.iloc[0]
    metadata[MetadataConstants.ORGANIZATION] = ds_metadata[METADATA_FILE_ORGANIZATION]

    ##> Load SOURCE info and check if we should continue or return
    metadata[SOURCE] = ds_metadata[SOURCE]
    # print '=========metadata[SOURCE]=================>', metadata[SOURCE]
    if metadata_source_is_socrata(metadata[SOURCE]):
        metadata[MetadataConstants.SOURCE_URL] = ds_metadata[SOURCE_URL]
        return metadata

    metadata[MetadataConstants.AGENCY] = ds_metadata[METADATA_FILE_ORGANIZATION]
    metadata[MetadataConstants.CATEGORY] = ds_metadata[METADATA_FILE_CATEGORY]
    metadata[MetadataConstants.OWNER] = ds_metadata[METADATA_FILE_ORGANIZATION]
    metadata[MetadataConstants.AUTHOR] = ds_metadata[METADATA_FILE_MAINTAINER]
    metadata[MetadataConstants.UPDATE_FREQUENCY] = ds_metadata[METADATA_FILE_UPDATE_FREQUENCY]
    metadata[MetadataConstants.TAGS] = ds_metadata[METADATA_FILE_TAGS]
    metadata[MetadataConstants.ACCESS_TYPE] = ds_metadata[METADATA_FILE_ACCESS_TYPE]
    metadata[MetadataConstants.ACCESS_TYPE] += ' - ' + ds_metadata[METADATA_FILE_CUSP_COLOR]
    metadata[MetadataConstants.DISPLAY_TYPE] = 'Table'
    metadata[MetadataConstants.VIEW_TYPE] = 'Tabular'
    # print '-------------__> metadata:', metadata
    metadata[MetadataConstants.STATUS] = MetadataConstants.STATUS_SUCCESS
    return metadata
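
# --- Illustrative sketch (not part of the pipeline) ----------------------------------------------
# A minimal, self-contained example of the lookup pattern used above: filter a metadata table by a
# source-file column and take the first matching row. The column names ('source_file',
# 'organization') are hypothetical stand-ins for the METADATA_FILE_* constants.
def _example_metadata_lookup():
    import pandas
    metadata_table = pandas.DataFrame({
        'source_file': ['311_calls.csv', 'taxi_trips.csv'],
        'organization': ['NYC 311', 'TLC'],
    })
    match = metadata_table.loc[metadata_table['source_file'] == '311_calls.csv']
    if len(match) == 0:
        return None  # same idea as returning STATUS_ERROR_METADATA_NOT_FOUND above
    return match.iloc[0]['organization']  # -> 'NYC 311'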
def types_of(column):
    App.debug('Detecting types of: ', column.name)
    App.debug(' size: ', len(column))
    detectors_type, detectors = data_detectors()
    App.debug(' Initializing detected_types.')
    detected_types = {}
    # Initialize with all zeros
    for detector in detectors:
        detected_types[detector[DETECTOR_NAME]] = 0.0
    if len(column) == 0:
        App.debug('Empty column!')
        return detected_types
    remaining_values_to_detect_type = column.copy()

    ## If column is in unicode, transform to ASCII to avoid errors during processing.
    ## Check for unicode in column values
    unicode_values = remaining_values_to_detect_type.apply(
        lambda x: (type(x) is unicode))
    unicode_values_counts = unicode_values.value_counts()
    ## Transform the unicode values into ASCII if there are any
    if True in unicode_values_counts.keys() and unicode_values_counts[True] > 0:
        App.info('Recoding values... (this can take some time)')
        remaining_values_to_detect_type = remaining_values_to_detect_type.apply(
            lambda x: TextUtils.reencode_text_if_not_ascii(x))

    for detector in detectors:
        detected, not_detected, type_name = detect_type(
            detector, detectors_type, remaining_values_to_detect_type)
        detected_types[type_name] = round(
            len(detected) * 100.0 / len(column), PERCENTUAL_PRECISION)
        remaining_values_to_detect_type = not_detected
        App.debug(' Remaining: ', len(not_detected))
        # if len(remaining_values_to_detect_type) == 0:
        #     break
    return detected_types
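
# --- Illustrative sketch (not part of the pipeline) ----------------------------------------------
# A minimal, self-contained sketch of the progressive detection loop above: each detector claims the
# values it recognizes, the remainder is handed to the next detector, and percentages are computed
# over the full column. The detectors here are hypothetical stand-ins for data_detectors().
def _example_progressive_detection():
    import pandas
    column = pandas.Series(['10025', '11201', '3.14', 'hello', '42'])
    detectors = [
        ('zip', lambda v: v.isdigit() and len(v) == 5),
        ('numeric', lambda v: v.replace('.', '', 1).isdigit()),
    ]
    detected_types = dict((name, 0.0) for name, _ in detectors)
    remaining = column.copy()
    for name, matches in detectors:
        mask = remaining.apply(matches)
        detected_types[name] = round(mask.sum() * 100.0 / len(column), 2)
        remaining = remaining[~mask]  # only undetected values go to the next detector
    return detected_types  # e.g. {'zip': 40.0, 'numeric': 40.0}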
def generate_index_on(index_geo_cols, index_temp_cols, dataset, db_name):
    index = pandas.DataFrame(columns=INDEX_COLUMNS)
    # No columns to generate index on
    if len(index_geo_cols.keys()) == 0 and len(index_temp_cols.keys()) == 0:
        return index
    # Prepare the list of cols.
    # If one of them is empty, add None just so we can loop over it and call generate_partial_index_on.
    if index_geo_cols is None or len(index_geo_cols) == 0:
        index_geo_cols[PHANTON_COL] = None
    if index_temp_cols is None or len(index_temp_cols) == 0:
        index_temp_cols[PHANTON_COL] = None

    # Clean dataset before creating the partial index
    print 'Cleaning dataset to process index'
    print 'dataset size:', len(dataset)
    cols_to_clean = index_geo_cols.copy()
    cols_to_clean.update(index_temp_cols)
    for col in cols_to_clean:
        print ' > {0} - {1}'.format(col, cols_to_clean[col]).ljust(50) + '@' + TimeUtils.current_time_formated()
        # If the current col is the PHANTON col, skip it
        if col is PHANTON_COL:
            continue
        clean_invalid_values(cols_to_clean[col], col, dataset)
    print ' dataset size:', len(dataset)

    for geo_col in index_geo_cols.keys():
        geo_type = index_geo_cols[geo_col]
        for temp_col in index_temp_cols.keys():
            temp_type = index_temp_cols[temp_col]
            an_index = generate_partial_index_on(geo_col, geo_type, temp_col, temp_type, dataset, db_name)
            App.info(' Adding to index... '.ljust(50) + '@' + TimeUtils.current_time_formated())
            index = index.append(an_index, ignore_index=True)
    App.info('Index created with {0} rows'.format(len(index)))
    App.debug('>>>>> INDEX <<<<<\n', index)
    return index
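
# --- Illustrative sketch (not part of the pipeline) ----------------------------------------------
# A minimal sketch of the "phantom column" trick above: when one side (geo or temporal) has no
# columns, a placeholder key is inserted so the nested loops still run and emit a partial index for
# every geo/temporal combination. The names here are hypothetical.
def _example_phantom_cross_product(geo_cols, temp_cols, phantom='__none__'):
    if not geo_cols:
        geo_cols = {phantom: None}
    if not temp_cols:
        temp_cols = {phantom: None}
    pairs = []
    for geo_col, geo_type in geo_cols.items():
        for temp_col, temp_type in temp_cols.items():
            pairs.append((geo_col, temp_col))
    return pairs
# _example_phantom_cross_product({'zip': 'geo_zip'}, {})  ->  [('zip', '__none__')]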
def metadata_of(database_id, first=True, portal_url=NYC_OPENDATA_URL_BASE):
    App.debug(' SocrataUtils.metadata_of({0})'.format(database_id))
    url = portal_url + '/views/' + database_id + JSON_EXTENSION + APP_TOKEN_PARAM
    # App.debug('url: ', url)
    metadata = {'source': 'Socrata'}
    # try:
    if True:
        App.debug('Url to get metadata from: ' + url)
        response = urllib.urlopen(url)
        data = json.loads(response.read())
        if 'id' in data and data['id'] == database_id:
            App.debug(' -> Success retrieving metadata!')
            App.debug('Retrieved metadata Keys:\n - ' + '\n - '.join(data.keys()))
            App.debug('Retrieved metadata:\n' + json.dumps(data, indent=4, sort_keys=True))
            App.debug('==========================================')
            if 'rowIdentifierColumnId' in data:
                id_column_id = data['rowIdentifierColumnId']
                for col in data['columns']:
                    if col['id'] == id_column_id:
                        metadata[MetadataConstants.ID_COLUMN] = col['name']
            else:
                metadata[MetadataConstants.ID_COLUMN] = None
            metadata[MetadataConstants.METADATA_SOURCE_URL] = key_as_str(data, url)
            metadata[MetadataConstants.METADATA_SOURCE_NAME] = key_as_str(data, 'Socrata Portal ' + portal_url)
            metadata[MetadataConstants.NAME] = key_as_str(data, 'name')
            metadata[MetadataConstants.PREFIX + 'Description'] = key_as_str(data, 'description')
            metadata[MetadataConstants.DISPLAY_TYPE_KEY] = key_as_str(data, 'displayType')
            metadata[MetadataConstants.PREFIX + 'Category'] = key_as_str(data, 'category')
            metadata[MetadataConstants.PREFIX + 'Owner'] = key_as_str(data['owner'], 'displayName')
            metadata[MetadataConstants.PREFIX + 'Download Count'] = key_as_str(data, 'downloadCount')
            metadata[MetadataConstants.PREFIX + 'View Count'] = key_as_str(data, 'viewCount')
            metadata[MetadataConstants.PREFIX + 'Comments'] = key_as_str(data, 'numberOfComments')
            metadata[MetadataConstants.PREFIX + 'Author'] = key_as_str(data['tableAuthor'], 'displayName')
            metadata[MetadataConstants.PREFIX + 'Id'] = key_as_str(data, 'id')
            metadata[MetadataConstants.PREFIX + 'Attribution'] = key_as_str(data, 'attribution')
            metadata[MetadataConstants.PREFIX + 'View Type'] = key_as_str(data, 'viewType')
            metadata[MetadataConstants.PREFIX + 'Display Type'] = key_as_str(data, 'displayType')
            metadata[MetadataConstants.PREFIX + 'Number of Coments'] = key_as_str(data, 'numberOfComments')

            ##> Discover if this dataset is a view
            if 'modifyingViewUid' not in data:
                metadata[MetadataConstants.PREFIX + 'View From'] = None
            else:
                metadata[MetadataConstants.PREFIX + 'View From'] = key_as_str(data, 'modifyingViewUid')

            timestamp = int(data['createdAt'].__str__())
            metadata[MetadataConstants.PREFIX + 'Created At'] = datetime.datetime.fromtimestamp(timestamp).__str__()
            timestamp = int(data['viewLastModified'].__str__())
            metadata[MetadataConstants.PREFIX + 'Last Modified'] = datetime.datetime.fromtimestamp(timestamp).__str__()
            timestamp = int(data['publicationDate'].__str__())
            metadata[MetadataConstants.PREFIX + 'Publication Date'] = datetime.datetime.fromtimestamp(timestamp).__str__()

            metadata['Tags'] = key_as_str(data, 'tags')
            if metadata['Tags'] == 'None':
                metadata['Tags'] = None

            if 'metadata' in data and 'custom_fields' in data['metadata']:
                custom_fields = data['metadata']['custom_fields']
                if 'Update' in custom_fields and 'Update Frequency' in custom_fields['Update']:
                    metadata[MetadataConstants.PREFIX + 'Update Frequency'] = custom_fields['Update']['Update Frequency'].__str__()
                if 'Dataset Information' in custom_fields and 'Agency' in custom_fields['Dataset Information']:
                    metadata[MetadataConstants.PREFIX + 'Agency'] = custom_fields['Dataset Information']['Agency'].__str__()

            types = {}
            columns = data['columns']
            for col in columns:
                col_name = col['name'].strip(' ').encode('ascii', 'ignore')
                col_type = col['dataTypeName']
                types[col_name] = col_type
            metadata[MetadataConstants.PREFIX + 'Types'] = types
            metadata[MetadataConstants.STATUS] = MetadataConstants.STATUS_SUCCESS
        else:
            if 'Cannot find view with id' in data['message']:
                metadata[MetadataConstants.STATUS] = MetadataConstants.STATUS_ERROR_VIEW_NOT_FOUND
            else:
                metadata[MetadataConstants.STATUS] = 'Error'
                metadata['message'] = data['message']
    # except e:
    #     raise e
    #     # This means that it is not from Socrata, or that some other error occurred; just return None.
    #     if first:
    #         App.debug('Waiting to try again')
    #         sleep(0.5)
    #         return metadata_of(database_id, first=False)
    #     metadata[MetadataConstants.STATUS] = 'Error Exception'
    #     metadata['message'] = 'Error accessing a Socrata Portal with url: {0}'.format(url)

    if metadata[MetadataConstants.STATUS] != MetadataConstants.STATUS_SUCCESS:
        # logging.warn(metadata[STATUS])
        App.debug('WARNING: ', metadata[MetadataConstants.STATUS])

    # Before returning, turn the unicodes into normal str
    # (the helper returns the re-encoded value, so assign it back)
    for k in metadata.keys():
        metadata[k] = TextUtils.reencode_text_if_not_ascii(metadata[k])
        # if type(metadata[k]) is unicode:
        #     # print ' Unicode info found on key: ', k, '=', metadata[k]
        #     metadata[k] = metadata[k].encode('ascii', 'ignore')

    ##> If there was an error, show the url so the user can check it
    if metadata[MetadataConstants.STATUS] == MetadataConstants.STATUS_ERROR_VIEW_NOT_FOUND:
        App.info(' Metadata not found on Socrata with url: ' + url)
    ##> Show the retrieved dataset name to indicate success
    if metadata[MetadataConstants.STATUS] == MetadataConstants.STATUS_SUCCESS:
        App.info(' OK. Dataset Retrieved Name: ' + metadata[MetadataConstants.NAME])
    App.debug('Retrieved Metadata: \n' + json.dumps(metadata, ensure_ascii=False, indent=4, sort_keys=True))
    return metadata
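
# --- Illustrative sketch (not part of the pipeline) ----------------------------------------------
# A minimal example of the timestamp handling above: Socrata returns fields such as 'createdAt' as
# epoch seconds, which are converted to human-readable strings. The sample value is made up.
def _example_epoch_to_str():
    import datetime
    created_at = 1420070400  # hypothetical epoch seconds from a 'createdAt' field
    # Local-time string, e.g. '2015-01-01 00:00:00' depending on the machine's timezone
    return datetime.datetime.fromtimestamp(created_at).__str__()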
def load_database(database_file, skiprows=None, nrows=None):
    # It is a Socrata CSV database. The wget on the compute node is not getting the extension as it should.
    file_type = 'CSV'  # default if no extension is found.
    if database_file.endswith('.csv'):
        file_type = 'CSV'
    if database_file.endswith('.json'):
        file_type = 'JSON'
    file_encoding = get_encoding(database_file)
    App.info(' > File encoding: %s' % file_encoding)

    if file_type == 'CSV':
        App.debug('CSV: Reading column headers from first line.')
        cols = FileUtils.get_cols_from_csv_header(database_file)
        App.debug('Preparing column types for pandas.')
        dtypes = prepare_dtypes_for_loading(cols)
        try:
            App.debug('Trying to read csv...')
            return pandas.read_csv(database_file, skiprows=skiprows, nrows=nrows,
                                   low_memory=LOW_MEMORY, encoding=file_encoding, dtype=dtypes)
        except:
            App.debug('Default CSV did not work.')
            App.debug('Trying to read with tab as separator...')
            # This error can happen because the file is tab-separated instead of comma-separated.
            return pandas.read_csv(database_file, skiprows=skiprows, nrows=nrows,
                                   low_memory=LOW_MEMORY, encoding=file_encoding, sep='\t', dtype=dtypes)
    elif file_type == 'JSON':
        # This works for JSON in the Socrata format, which has a 'data' field.
        # If it is not in that format, assume the file already contains the data itself.
        json_file = open(database_file)
        json_data = json.load(json_file)
        if 'data' in json_data.keys():
            App.debug('JSON: Read data from data field. (Socrata format)')
            data = json_data['data']
            cols = []
            cols_with_sub_cols = []
            App.debug('Getting column names from metadata...')
            for col in json_data['meta']['view']['columns']:
                cols.append(col['name'])
                if 'subColumnTypes' in col.keys():
                    print ' (!) Column ', col['name'], ' has sub columns: ', col['subColumnTypes']
                    cols_with_sub_cols.append(col)
            dtypes = prepare_dtypes_for_loading(cols)
            df = pandas.DataFrame(data, columns=cols)
            # Create sub-column data
            for col in cols_with_sub_cols:
                print ' Fetching sub columns of ', col['name']
                i = 0
                for sub_col in col['subColumnTypes']:
                    print ' >', sub_col
                    df[col['name'] + NEW_COLUMN_NAME_SEPARATOR + sub_col] = df[col['name']].apply(lambda x: x[i])
                    i += 1
                print ' Removing source column ', col['name'], ' from data frame.'
                # Then remove the multi-valued column
                df.drop(col['name'], axis=1, inplace=True)
            return df
        else:
            App.debug('JSON: There is no data field. Getting column names from JSON keys.')
            # Get the list of cols from the JSON
            cols = list(json_data.keys())
            dtypes = prepare_dtypes_for_loading(cols)
            # Note: pandas.DataFrame() has no 'dtypes' keyword; apply the prepared dtypes afterwards.
            return pandas.DataFrame(json_data).astype(dtypes)
    else:
        print '===> PandasUtilError: Invalid database file: [{0}]'.format(database_file)
        # raise ApplicationException('File must be json or csv!')
        raise RuntimeError(
            'File must be json (with data inside a data field) or csv! File: [{0}]'.format(database_file))
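
# --- Illustrative sketch (not part of the pipeline) ----------------------------------------------
# A minimal, self-contained example of the sub-column expansion above: a list-valued column (as in
# Socrata JSON exports with 'subColumnTypes') is split into one column per sub-field and the source
# column is then dropped. The column and sub-column names are hypothetical.
def _example_expand_sub_columns():
    import pandas
    df = pandas.DataFrame({'location': [['40.7', '-74.0'], ['40.8', '-73.9']]})
    sub_cols = ['latitude', 'longitude']
    for i, sub_col in enumerate(sub_cols):
        df['location' + '_' + sub_col] = df['location'].apply(lambda x: x[i])
    df.drop('location', axis=1, inplace=True)
    return df  # columns: location_latitude, location_longitude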
def generate_partial_index_on(geo_col, geo_type, temp_col, temp_type, dataset, db_name):
    App.info('Generating index for ({0}) and ({1})'.format(geo_col, temp_col), TimeUtils.current_time_formated())
    App.debug(geo_type)
    an_index = pandas.DataFrame(columns=INDEX_COLUMNS)
    countby = []

    # ## 1. ADD GEO VALUES TO INDEX
    if geo_type:
        App.info(' Processing geo part... '.ljust(50) + '@' + TimeUtils.current_time_formated())
        # TODO: ENHANCE GEO INDEX
        if geo_type == TypeDetector.GEO_GPS:
            an_index.lat, an_index.lon = PandasUtils.get_lat_lon_from_gps(dataset[geo_col])
            countby += ['lat', 'lon']
        elif geo_type in [TypeDetector.GEO_ZIP, TypeDetector.GEO_ZIP_9]:
            an_index.zipcode = dataset[geo_col]
            countby += ['zipcode']

    # ## 2. ADD TEMPORAL VALUES TO INDEX
    if temp_type:
        App.info(' Processing temporal part... '.ljust(50) + '@' + TimeUtils.current_time_formated())
        datetimes = dataset[temp_col].apply(lambda x: TimeUtils.datetime_from_str_date(x))
        if temp_type in [TypeDetector.TEMPORAL_DATE, TypeDetector.TEMPORAL_DATE_TIME]:
            # an_index['epoch_secs'] = dataset[temp_col].apply(lambda x: TimeUtils.epoch_from_str_date(x))
            an_index['year'] = datetimes.apply(lambda x: str(x.year) if x else Constants.MISSING_DATA_SYMBOL)
            an_index['month'] = datetimes.apply(lambda x: str(x.month) if x else Constants.MISSING_DATA_SYMBOL)
            an_index['day'] = datetimes.apply(lambda x: str(x.day) if x else Constants.MISSING_DATA_SYMBOL)
            # countby += ['epoch_secs']
            countby += ['year', 'month', 'day']
        # if temp_type == TypeDetector.TEMPORAL_DATE_TIME:
        #     an_index['hour'] = datetimes.apply(lambda x: str(x.hour) if x else Constants.MISSING_DATA_SYMBOL)

    App.info(' Counting... '.ljust(50) + '@' + TimeUtils.current_time_formated())
    # This order cannot change unless the algorithm changes: first count, then clean.
    # --------- Count rows for Index ---------------------------------------------------------------
    # 3. Create index counts
    # print '-------------------- countby=', countby
    temp = an_index[countby].reset_index().groupby(countby).agg(['count'])
    temp.columns = ['count']
    temp.reset_index(inplace=True)
    # Join with the real dataset and add to index
    merged = pandas.merge(an_index, temp, how='inner', on=countby)
    # Add count to an_index
    an_index['count'] = merged['count']

    # --------- 4. Clean Index: null and invalid values --------------------------------------------
    # print '<><><><><><><><><><><><><> an_index.count()=', an_index.count()
    App.info(' Cleaning... '.ljust(50) + '@' + TimeUtils.current_time_formated())
    used_index_cols = list(an_index.count()[an_index.count() > 0].index)
    for col in used_index_cols:
        # geo
        if col in ['lat', 'lon']:
            col_type = TypeDetector.GEO_GPS_LATLON
        if col == 'zipcode':
            col_type = TypeDetector.GEO_ZIP
        if col == 'address':
            col_type = TypeDetector.GEO_ADDRESS
        if col == 'borough':
            col_type = TypeDetector.GEO_BOROUGH
        # temp
        if col in ['epoch_secs', 'year', 'month', 'day', 'hour']:
            col_type = TypeDetector.NUMERIC_INT
        App.info(' > {0}: {1}'.format(col, col_type).ljust(50) + '@' + TimeUtils.current_time_formated())
        # clean_invalid_values(col_type, col, an_index)
        an_index = an_index[an_index[col].apply(lambda x: PandasUtils.is_valid(x))]

    App.debug('>>>>> an_index (len 20) <<<<<')
    App.debug(an_index[:20])
    App.info(' Partial Index created with {0} rows'.format(len(an_index)))
    # 5. Return the index to be added to the main index
    return an_index
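
# --- Illustrative sketch (not part of the pipeline) ----------------------------------------------
# A minimal, self-contained example of the count-then-merge pattern used above: group the partial
# index by its key columns, count the rows per group, and attach that count back to every row.
# The toy data and column names are hypothetical.
def _example_count_then_merge():
    import pandas
    an_index = pandas.DataFrame({'zipcode': ['10025', '10025', '11201'],
                                 'year': ['2015', '2015', '2016']})
    countby = ['zipcode', 'year']
    counts = an_index[countby].reset_index().groupby(countby).agg(['count'])
    counts.columns = ['count']
    counts.reset_index(inplace=True)
    merged = pandas.merge(an_index, counts, how='inner', on=countby)
    return merged  # each row now carries the count of its (zipcode, year) group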
def reduce_summaries(first, second):
    """
    This method joins two profile summaries into one that has information from both.
    If more than two summaries should be joined, join them two by two.

    This method relies on a naming convention for the variables to know how to join their values.
    For example, if the variable is a count, then we just sum both counts. However, if the variable
    is 'unique', then to join we need to consider the sets of values. Other examples are: sum, std,
    min, max.

    :param first: a summary to be joined
    :param second: another summary to be joined
    :return: a joined summary
    """
    # Init
    reprocess_column_types = False
    # print '\n\n------------------------------------------- reduce -------------------------------------------'
    # print '1st =>', first
    # print '\n2nd =>', second
    # return '(' + first + ' <_> ' + second + ')'

    # Verify that the structure is the same and the dataset too
    if first['Name'] != second['Name']:
        raise Exception('Summaries are not from the same dataset.')
    elif first['Columns'] != second['Columns']:
        raise Exception('Number of columns is not the same.')

    joined = {}
    # We'll assume both summaries have the same keys.
    # TODO: Protect against the case when both don't have the same keys
    all_keys = first.keys()
    # all_keys = [TextUtils.reencode_if_not_ascii(k) for k in first.keys()]

    # Join values based on key naming conventions or specific keys
    App.info('Processing all keys: %s' % all_keys)
    for key in all_keys:
        App.debug('- Key: %s' % key)
        if key.lower().endswith('min') or key.lower().endswith('begin'):
            joined[key] = min(first[key], second[key])
        elif key.lower().endswith('max') or key.lower().endswith('end'):
            joined[key] = max(first[key], second[key])
        # If the keys are not max, min, std, mean or unique, just use first -- we're assuming both are the same.
        # After we join the dataset metadata we still have to join the column metadata.
        elif key == Profiler.COLUMN_METADATA:
            joined['Column Metadata'] = reduce_column_metadata(first, second)
        # TODO: join geo-temp index
        # elif key == 'Geo-Temp Index':
        elif key in ['Rows', 'Values', 'Values Missing',
                     'ETL-Profiler Processing Time (sec)',
                     'ETL-Profiler Total Memory (MB)', 'GPS Values']:
            joined[key] = first[key] + second[key]
        elif key in ['Values Missing Percent']:
            total = int(first['Rows']) + int(second['Rows'])
            temp = (float(first[key]) * int(first['Rows']) +
                    float(second[key]) * int(second['Rows'])) / total
            joined[key] = round(temp, 2)
        elif key in ['Column Names Geo', 'Column Names Numeric', 'Column Names Temporal',
                     'Column Names Text', 'Columns Names Null']:
            if first[key] == second[key]:
                joined[key] = first[key]
            else:
                # TODO: should be reprocessed later based on column types
                reprocess_column_types = True
        else:
            App.debug(' first["%s"]= %s' % (key, first[key]))
            if first[key]:
                joined[key] = first[key]
            else:
                App.debug(' -> Ignoring null Value')
    return joined
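
# --- Illustrative sketch (not part of the pipeline) ----------------------------------------------
# A minimal, hand-worked example of the joining conventions above, on two hypothetical summaries:
# keys ending in 'min' take the minimum, count-like keys are summed, and 'Values Missing Percent'
# is a row-weighted average. The key names used here are illustrative, not the full summary schema.
def _example_reduce_conventions():
    first = {'Rows': 100, 'Values Missing Percent': 10.0, 'Temporal Min': '2015-01-01'}
    second = {'Rows': 300, 'Values Missing Percent': 20.0, 'Temporal Min': '2014-06-01'}
    total = first['Rows'] + second['Rows']
    joined = {
        'Rows': first['Rows'] + second['Rows'],  # 400
        'Temporal Min': min(first['Temporal Min'], second['Temporal Min']),  # '2014-06-01'
        'Values Missing Percent': round(
            (first['Values Missing Percent'] * first['Rows'] +
             second['Values Missing Percent'] * second['Rows']) / float(total), 2),  # 17.5
    }
    return joined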