Example 1
def get_metadata_from_file(database_name, file_name, metadata_file):
    # Read metadata from file
    metadata = {}
    App.info("Reading metadata from file.")

    metadata_from_file = pandas.read_csv(metadata_file)
    ds_metadata = metadata_from_file.loc[
        metadata_from_file[METADATA_FILE_SOURCE_FILE] == file_name]
    # print '=============================================\n'
    # print 'ds_metadata\n', ds_metadata.T
    # print '=============================================\n'
    if len(ds_metadata) == 0:
        metadata[MetadataConstants.STATUS] = MetadataConstants.STATUS_ERROR_METADATA_NOT_FOUND
        return metadata

    ds_metadata = ds_metadata.iloc[0]

    metadata[MetadataConstants.ORGANIZATION] = ds_metadata[METADATA_FILE_ORGANIZATION]
    ##> Load SOURCE info and check if we should continue or return
    metadata[SOURCE] = ds_metadata[SOURCE]
    # print '=========metadata[SOURCE]=================>', metadata[SOURCE]
    if metadata_source_is_socrata(metadata[SOURCE]):
        metadata[MetadataConstants.SOURCE_URL] = ds_metadata[SOURCE_URL]
        return metadata

    metadata[MetadataConstants.AGENCY] = ds_metadata[METADATA_FILE_ORGANIZATION]
    metadata[MetadataConstants.CATEGORY] = ds_metadata[METADATA_FILE_CATEGORY]
    metadata[MetadataConstants.OWNER] = ds_metadata[METADATA_FILE_ORGANIZATION]
    metadata[MetadataConstants.AUTHOR] = ds_metadata[METADATA_FILE_MAINTAINER]
    metadata[MetadataConstants.UPDATE_FREQUENCY] = ds_metadata[METADATA_FILE_UPDATE_FREQUENCY]
    metadata[MetadataConstants.TAGS] = ds_metadata[METADATA_FILE_TAGS]

    metadata[MetadataConstants.ACCESS_TYPE] = ds_metadata[METADATA_FILE_ACCESS_TYPE]
    metadata[MetadataConstants.ACCESS_TYPE] += ' - ' + ds_metadata[METADATA_FILE_CUSP_COLOR]

    metadata[MetadataConstants.DISPLAY_TYPE] = 'Table'
    metadata[MetadataConstants.VIEW_TYPE] = 'Tabular'

    # print '-------------__> metadata:', metadata
    metadata[MetadataConstants.STATUS] = MetadataConstants.STATUS_SUCCESS

    return metadata
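
The lookup above boils down to filtering the metadata frame by source-file name and flattening the first matching row into a plain dict. Below is a minimal, self-contained sketch of that pattern; the in-memory frame, column names, and status strings are invented for illustration and are not the module's real constants.

import pandas

metadata_from_file = pandas.DataFrame({
    'source_file': ['311_requests.csv', 'permits.csv'],
    'organization': ['DOITT', 'DOB'],
})

def lookup_metadata(file_name):
    # Select the rows whose source_file matches, exactly like the .loc filter above
    rows = metadata_from_file.loc[metadata_from_file['source_file'] == file_name]
    if len(rows) == 0:
        return {'Status': 'metadata not found'}
    row = rows.iloc[0]
    return {'Organization': row['organization'], 'Status': 'success'}

print(lookup_metadata('permits.csv'))  # {'Organization': 'DOB', 'Status': 'success'}
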
Example 2
def types_of(column):
    App.debug('Detecting types of: ', column.name)
    App.debug('    size: ', len(column))
    detectors_type, detectors = data_detectors()
    App.debug('    Initializing detected_types. ')
    detected_types = {}
    # Initialize with all zeros
    for detector in detectors:
        detected_types[detector[DETECTOR_NAME]] = 0.0
    if len(column) == 0:
        App.debug('Empty column!')
        return detected_types

    remaining_values_to_detect_type = column.copy()

    ## If the column contains unicode, transform it to ASCII to avoid errors during processing.
    ## Check for unicode in column values
    unicode_values = remaining_values_to_detect_type.apply(
        lambda x: (type(x) is unicode))
    unicode_values_counts = unicode_values.value_counts()
    ## Transform the unicode values into ascii if there are any
    if True in unicode_values_counts.keys() and unicode_values_counts[True] > 0:
        App.info('Recoding values... (this can take some time)')
        remaining_values_to_detect_type = remaining_values_to_detect_type.apply(
            lambda x: TextUtils.reencode_text_if_not_ascii(x))

    for detector in detectors:
        detected, not_detected, type_name = detect_type(
            detector, detectors_type, remaining_values_to_detect_type)
        detected_types[type_name] = round(
            len(detected) * 100.0 / len(column), PERCENTUAL_PRECISION)
        remaining_values_to_detect_type = not_detected
        App.debug('    Remaining: ', len(not_detected))


        # if len(remaining_values_to_detect_type) == 0:
        #     break
    return detected_types
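
The core of types_of is a detect-and-pass-on loop: each detector keeps the values it recognizes and only the remainder is offered to the next one, so the percentages never double-count a value. Here is a small, self-contained sketch of that loop, with toy detectors standing in for data_detectors() and detect_type():

import pandas

def detect(predicate, values):
    # Split a Series into (matching, remaining) according to a predicate
    mask = values.apply(predicate)
    return values[mask], values[~mask]

column = pandas.Series(['10', '3.5', 'abc', '7', ''])
detectors = [('int', lambda v: v.isdigit()),
             ('float', lambda v: v.replace('.', '', 1).isdigit())]

detected_types = {}
remaining = column
for name, predicate in detectors:
    detected, remaining = detect(predicate, remaining)
    detected_types[name] = round(len(detected) * 100.0 / len(column), 2)

print(detected_types)  # {'int': 40.0, 'float': 20.0}
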
Example 3
def generate_index_on(index_geo_cols, index_temp_cols, dataset, db_name):
    index = pandas.DataFrame(columns=INDEX_COLUMNS)

    # No columns to generate index
    if len(index_geo_cols.keys()) == 0 and len(index_temp_cols.keys()) == 0: return index

    # Prepare the list of cols
    # If either dict is empty, add a placeholder entry (None) just so the loop runs and generate_partial_index_on is still called
    if index_geo_cols is None or len(index_geo_cols) == 0: index_geo_cols[PHANTON_COL] = None
    if index_temp_cols is None or len(index_temp_cols) == 0: index_temp_cols[PHANTON_COL] = None

    # Clean dataset before creating the partial index
    print 'Cleaning dataset to process index'
    print 'dataset size:', len(dataset)
    cols_to_clean = index_geo_cols.copy()
    cols_to_clean.update(index_temp_cols)
    for col in cols_to_clean:
        print '     > {0} - {1}'.format(col, cols_to_clean[col]).ljust(50) + '@' + TimeUtils.current_time_formated()
        # If current col is the PHANTON col, skip it
        if col == PHANTON_COL: continue
        clean_invalid_values(cols_to_clean[col], col, dataset)
        print '          dataset size:', len(dataset)

    for geo_col in index_geo_cols.keys():
        geo_type = index_geo_cols[geo_col]

        for temp_col in index_temp_cols.keys():
            temp_type = index_temp_cols[temp_col]

            an_index = generate_partial_index_on(geo_col, geo_type, temp_col, temp_type, dataset, db_name)
            App.info('	Adding to index... '.ljust(50) + '@' + TimeUtils.current_time_formated())
            index = index.append(an_index, ignore_index=True)

    App.info('Index created with {0} rows'.format(len(index)))
    App.debug('>>>>> INDEX <<<<<\n', index)

    return index
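
The PHANTON_COL assignment above is a placeholder trick: when one of the two dicts is empty, a dummy key keeps the nested geo x temporal loop running so the other dimension is still indexed. A minimal sketch of that pattern (the names below are illustrative, not the module's real constants):

PHANTOM_COL = '__none__'

def index_pairs(geo_cols, temp_cols):
    # Empty dicts get a placeholder entry so the nested loop still executes
    geo_cols = dict(geo_cols) or {PHANTOM_COL: None}
    temp_cols = dict(temp_cols) or {PHANTOM_COL: None}
    for geo_col, geo_type in geo_cols.items():
        for temp_col, temp_type in temp_cols.items():
            yield geo_col, geo_type, temp_col, temp_type

print(list(index_pairs({'zip': 'geo_zip'}, {})))
# [('zip', 'geo_zip', '__none__', None)]
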
Example 4
def metadata_of(database_id, first=True, portal_url=NYC_OPENDATA_URL_BASE):
    App.debug(' SocrataUtils.metadata_of({0})'.format(database_id))
    
    url = portal_url + '/views/' + database_id + JSON_EXTENSION + APP_TOKEN_PARAM
    # App.debug('url: ', url)
    metadata = {'source':'Socrata'}
    # try:
    if True:
        App.debug('Url to get metadata from: ' + url)
        response = urllib.urlopen(url)
        data = json.loads(response.read())
        
        if 'id' in data and data['id'] == database_id:
            App.debug('    -> Success retrieving metadata!')
            App.debug('Retrieved metadata Keys:\n - ' + '\n - '.join(data.keys() ))
            App.debug('Retrieved metadata:\n' + json.dumps(data, indent=4, sort_keys=True))
            App.debug('==========================================')
            
            if 'rowIdentifierColumnId' in data:
                id_column_id = data['rowIdentifierColumnId']
                for col in data['columns']:
                    if col['id'] == id_column_id: 
                        metadata[MetadataConstants.ID_COLUMN] = col['name']
            else:
                metadata[MetadataConstants.ID_COLUMN] = None
                

            metadata[MetadataConstants.METADATA_SOURCE_URL] = key_as_str(data, url)
            metadata[MetadataConstants.METADATA_SOURCE_NAME] = key_as_str(data, 'Socrata Portal ' + portal_url)
                
            metadata[MetadataConstants.NAME] = key_as_str(data, 'name')
            metadata[MetadataConstants.PREFIX + 'Description'] = key_as_str(data, 'description')
            metadata[MetadataConstants.DISPLAY_TYPE_KEY] = key_as_str(data, 'displayType')
            metadata[MetadataConstants.PREFIX + 'Category'] = key_as_str(data, 'category')
            metadata[MetadataConstants.PREFIX + 'Owner'] = key_as_str(data['owner'], 'displayName')
            metadata[MetadataConstants.PREFIX + 'Download Count'] = key_as_str(data, 'downloadCount')
            metadata[MetadataConstants.PREFIX + 'View Count'] = key_as_str(data, 'viewCount')
            metadata[MetadataConstants.PREFIX + 'Comments'] = key_as_str(data, 'numberOfComments')
            metadata[MetadataConstants.PREFIX + 'Author'] = key_as_str(data['tableAuthor'], 'displayName')
            metadata[MetadataConstants.PREFIX + 'Id'] = key_as_str(data, 'id')
            metadata[MetadataConstants.PREFIX + 'Attribution'] = key_as_str(data, 'attribution')
            metadata[MetadataConstants.PREFIX + 'View Type'] = key_as_str(data, 'viewType')
            metadata[MetadataConstants.PREFIX + 'Display Type'] = key_as_str(data, 'displayType')
            metadata[MetadataConstants.PREFIX + 'Number of Comments'] = key_as_str(data, 'numberOfComments')
            ##> Discover if this dataset is a view
            if 'modifyingViewUid' not in data: metadata[MetadataConstants.PREFIX + 'View From'] = None
            else: metadata[MetadataConstants.PREFIX + 'View From'] = key_as_str(data,'modifyingViewUid')
            
            timestamp = int(data['createdAt'])
            metadata[MetadataConstants.PREFIX + 'Created At'] = str(datetime.datetime.fromtimestamp(timestamp))
            timestamp = int(data['viewLastModified'])
            metadata[MetadataConstants.PREFIX + 'Last Modified'] = str(datetime.datetime.fromtimestamp(timestamp))
            timestamp = int(data['publicationDate'])
            metadata[MetadataConstants.PREFIX + 'Publication Date'] = str(datetime.datetime.fromtimestamp(timestamp))
            metadata['Tags'] = key_as_str(data, 'tags')
            if metadata['Tags'] == 'None': metadata['Tags'] = None
            
            if 'metadata' in data and 'custom_fields' in data['metadata']:
                custom_fields = data['metadata']['custom_fields']
                if 'Update' in custom_fields and 'Update Frequency' in custom_fields['Update']: 
                    metadata[MetadataConstants.PREFIX + 'Update Frequency'] = custom_fields['Update']['Update Frequency'].__str__()
                if 'Dataset Information' in custom_fields and 'Agency' in custom_fields['Dataset Information']: 
                    metadata[MetadataConstants.PREFIX + 'Agency'] = custom_fields['Dataset Information']['Agency'].__str__()

            types = {}
            columns = data['columns']
            for col in columns:
                col_name = col['name'].strip(' ').encode('ascii','ignore')
                col_type = col['dataTypeName']
                types[col_name] = col_type
            metadata[MetadataConstants.PREFIX + 'Types'] = types

            metadata[MetadataConstants.STATUS] = MetadataConstants.STATUS_SUCCESS
        else:
            if 'Cannot find view with id' in data['message']:
                metadata[MetadataConstants.STATUS] = MetadataConstants.STATUS_ERROR_VIEW_NOT_FOUND 
            else:
                metadata[MetadataConstants.STATUS] = 'Error'
            metadata['message'] = data['message']
    # except e:
    #     raise e
    #      #This means that it is not from socrata
    #      # Or that some other error occurred
    #      #just return None
    #     if first: 
    #         App.debug('Waiting to try again')
    #         sleep(0.5)
    #         return metadata_of(database_id, first=False)
    #     metadata[MetadataConstants.STATUS] = 'Error Exception'
    #     metadata['message'] = 'Error acessing a Socrata Portal with url: {0}'.format(url)
    
    if metadata[MetadataConstants.STATUS] != MetadataConstants.STATUS_SUCCESS:
        # logging.warn(metadata[STATUS])
        App.debug('WARNING: ', metadata[MetadataConstants.STATUS])
    
    # Before returning, turn any unicode values into plain str
    for k in metadata.keys():
        metadata[k] = TextUtils.reencode_text_if_not_ascii(metadata[k])
#         if type(metadata[k]) is unicode: 
# #            print '    Unicode info found on key: ' , k, '=', metadata[k] 
#             metadata[k] = metadata[k].encode('ascii','ignore')

    ##> If there was an error, show url so user can check
    if metadata[MetadataConstants.STATUS] == MetadataConstants.STATUS_ERROR_VIEW_NOT_FOUND: 
        App.info('    Metadata not found on Socrata with url: ' + url)
    
    ##> Show dataset retrieved name to indicate success
    if metadata[MetadataConstants.STATUS] == MetadataConstants.STATUS_SUCCESS: 
        App.info('    OK. Dataset Retrieved Name: ' + metadata[ MetadataConstants.NAME ] )

    App.debug('Retrieved Metadata: \n' + json.dumps(metadata, ensure_ascii=False, indent=4, sort_keys=True) )
    return metadata
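
Two details of the Socrata payload handled above are easy to miss: the response is plain JSON fetched over HTTP, and fields such as createdAt, viewLastModified, and publicationDate are Unix epoch seconds. A self-contained sketch of the timestamp conversion, using a stand-in payload rather than a real Socrata response:

import datetime
import json

# Stand-in for the body returned by <portal_url>/views/<id>.json
payload = '{"name": "311 Service Requests", "createdAt": 1317414528}'
data = json.loads(payload)

metadata = {
    'Name': data['name'],
    # Epoch seconds -> human-readable local time, as in the function above
    'Created At': str(datetime.datetime.fromtimestamp(int(data['createdAt']))),
}
print(metadata['Created At'])  # e.g. '2011-09-30 ...' (depends on local timezone)
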
Example 5
def load_database(database_file, skiprows=None, nrows=None):
    # It is a Socrata CSV database. The wget on compute is not getting the extension as it should.
    file_type = 'CSV'  # default if no extension is found
    if database_file.endswith('.csv'): file_type = 'CSV'
    if database_file.endswith('.json'): file_type = 'JSON'

    file_encoding = get_encoding(database_file)
    App.info('   > File encoding: %s' % file_encoding)

    if file_type == 'CSV':
        App.debug('CSV: Reading column headers from first line.')
        cols = FileUtils.get_cols_from_csv_header(database_file)
        App.debug('Preparing column types for pandas.')
        dtypes = prepare_dtypes_for_loading(cols)
        try:
            App.debug('Trying to read csv...')
            return pandas.read_csv(database_file,
                                   skiprows=skiprows,
                                   nrows=nrows,
                                   low_memory=LOW_MEMORY,
                                   encoding=file_encoding,
                                   dtype=dtypes)
        except Exception:
            App.debug('Default CSV read did not work.')
            App.debug('Trying to read with tab as separator...')
            # This error can happen because the file is tab-separated instead of comma-separated
            return pandas.read_csv(database_file,
                                   skiprows=skiprows,
                                   nrows=nrows,
                                   low_memory=LOW_MEMORY,
                                   encoding=file_encoding,
                                   sep='\t',
                                   dtype=dtypes)

    elif file_type == 'JSON':
        # This works for JSON in the Socrata format, which has a 'data' field.
        # If not, let's suppose it is already the data.
        json_file = open(database_file)
        json_data = json.load(json_file)

        if 'data' in json_data.keys():
            App.debug('JSON: Read data from data field. (Socrata format)')
            data = json_data['data']
            cols = []
            cols_with_sub_cols = []

            App.debug('Getting column names from metadata...')
            for col in json_data['meta']['view']['columns']:
                cols.append(col['name'])

                if 'subColumnTypes' in col.keys():
                    print '    (!) Column ', col['name'], ' has sub columns: ', col['subColumnTypes']
                    cols_with_sub_cols.append(col)

            dtypes = prepare_dtypes_for_loading(cols)
            df = pandas.DataFrame(data, columns=cols)

            #create subcolumn data
            for col in cols_with_sub_cols:
                print '    Fetching sub columns of ', col['name']
                i = 0
                for sub_col in col['subColumnTypes']:
                    print '         >', sub_col
                    df[col['name'] + NEW_COLUMN_NAME_SEPARATOR +
                       sub_col] = df[col['name']].apply(lambda x: x[i])
                    i += 1
                print '    Removing source column ', col['name'], ' from data frame.'
                #Then remove multivalored column
                df.drop(col['name'], axis=1, inplace=True)
            return df

        else:
            App.debug('JSON: There is no data field. Getting column names from JSON keys.')
            # Get the list of cols from the json
            cols = list(json_data.keys())
            dtypes = prepare_dtypes_for_loading(cols)
            # DataFrame() has no per-column dtype argument, so apply the dtypes after construction
            return pandas.DataFrame(json_data).astype(dtypes)
    else:
        print '===> PandasUtilError: Invalid database file: [{0}]'.format(database_file)
        #        raise ApplicationExecption('File must be json or csv!'.format(database_file))
        raise RuntimeError(
            'File must be json (with data inside a data field) or csv! File: [{0}]'.format(
                database_file))
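
For the Socrata JSON branch above, the layout is: row values under 'data' and column names under meta/view/columns. A self-contained sketch of that reconstruction with a toy payload in place of a real export:

import pandas

json_data = {
    'meta': {'view': {'columns': [{'name': 'Agency'}, {'name': 'Count'}]}},
    'data': [['DOT', 12], ['DOB', 7]],
}

# Column names come from the metadata block, values from the data block
cols = [col['name'] for col in json_data['meta']['view']['columns']]
df = pandas.DataFrame(json_data['data'], columns=cols)
print(df)  # two rows with columns 'Agency' and 'Count'
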
Example 6
def generate_partial_index_on(geo_col, geo_type, temp_col, temp_type, dataset, db_name):
    App.info('Generating index for ({0}) and ({1})'.format(geo_col, temp_col), TimeUtils.current_time_formated())
    App.debug('geo_type: ', geo_type)

    an_index = pandas.DataFrame(columns=INDEX_COLUMNS)
    countby = []

    # ## 1. ADD GEO VALUES TO INDEX
    if geo_type:
        App.info('	Processing geo part... '.ljust(50) + '@' + TimeUtils.current_time_formated())
        # TODO: ENHANCE GEO INDEX <-------- (TODO)
        if geo_type == TypeDetector.GEO_GPS:
            an_index.lat, an_index.lon = PandasUtils.get_lat_lon_from_gps(dataset[geo_col])
            countby += ['lat', 'lon']

        elif geo_type in [TypeDetector.GEO_ZIP, TypeDetector.GEO_ZIP_9]:
            an_index.zipcode = dataset[geo_col]
            countby += ['zipcode']

    # ## 2. ADD TEMPORAL VALUES TO INDEX
    if temp_type:
        App.info('	Processing temporal part... '.ljust(50) + '@' + TimeUtils.current_time_formated())
        datetimes = dataset[temp_col].apply(lambda x: TimeUtils.datetime_from_str_date(x))
        if temp_type in [TypeDetector.TEMPORAL_DATE, TypeDetector.TEMPORAL_DATE_TIME]:
            # an_index['epoch_secs'] = dataset[temp_col].apply(lambda x: TimeUtils.epoch_from_str_date(x))

            an_index['year'] = datetimes.apply(lambda x: str(x.year) if x else Constants.MISSING_DATA_SYMBOL)
            an_index['month'] = datetimes.apply(lambda x: str(x.month) if x else Constants.MISSING_DATA_SYMBOL)
            an_index['day'] = datetimes.apply(lambda x: str(x.day) if x else Constants.MISSING_DATA_SYMBOL)
            # countby += ['epoch_secs']
            countby += ['year', 'month', 'day']

        # if temp_type == TypeDetector.TEMPORAL_DATE_TIME:
        # 	an_index['hour'] = datetimes.apply(lambda x: str(x.hour) if x else Constants.MISSING_DATA_SYMBOL)

        App.info('	Counting... '.ljust(50) + '@' + TimeUtils.current_time_formated())

    # This order cannot change unless this algorithm changes! First count, then clean
    # --------- Count rows for Index ------------------------------------------------------------------
    # 3. create index counts
    # print '-------------------- countby=', countby
    temp = an_index[countby].reset_index().groupby(countby).agg(['count'])
    temp.columns = ['count']
    temp.reset_index(inplace=True)
    # join with real dataset and add to index
    merged = pandas.merge(an_index, temp, how='inner', on=countby)
    # Add count to an_index
    an_index['count'] = merged['count']

    # --------- 4. Clean Index: null and invalid values --------------------------------------------------
    # print '<><><><><><><><><><><><><> an_index.count()=', an_index.count()
    App.info('	Cleaning... '.ljust(50) + '@' + TimeUtils.current_time_formated())

    used_index_cols = list(an_index.count()[an_index.count() > 0].index)
    for col in used_index_cols:
        # geo
        if col in ['lat', 'lon']: col_type = TypeDetector.GEO_GPS_LATLON
        if col == 'zipcode': col_type = TypeDetector.GEO_ZIP
        if col == 'address': col_type = TypeDetector.GEO_ADDRESS
        if col == 'borough': col_type = TypeDetector.GEO_BOROUGH
        # temp
        if col in ['epoch_secs', 'year', 'month', 'day', 'hour']: col_type = TypeDetector.NUMERIC_INT
        App.info('	   > {0}: {1}'.format(col, col_type).ljust(50) + '@' + TimeUtils.current_time_formated())
        # clean_invalid_values(col_type, col, an_index)
        an_index = an_index[an_index[col].apply(lambda x: PandasUtils.is_valid(x))]

    App.debug('>>>>> an_index (len 20) <<<<<')
    App.debug(an_index[:20])
    App.info('     Partial Index created with {0} rows'.format(len(an_index)))
    # 5. return index to be added to the main index
    return an_index
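
The count step above (groupby, then merge back) attaches to every index row the number of rows that share its key. A simplified, self-contained sketch of the same idea, using groupby().size() instead of agg(['count']):

import pandas

an_index = pandas.DataFrame({'zipcode': ['10001', '10001', '11201'],
                             'year': ['2014', '2014', '2015']})
countby = ['zipcode', 'year']

# Count rows per (zipcode, year) key, then merge the counts back onto every row
counts = an_index.groupby(countby).size().reset_index(name='count')
an_index = pandas.merge(an_index, counts, how='inner', on=countby)
print(an_index)  # the two 10001/2014 rows get count=2, the 11201/2015 row gets count=1
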
Example 7
def reduce_summaries(first, second):
    """
    This method joins two profile summaries into one that has information from both.
    If more than two summaries need to be joined, join them two by two.

    This method relies on a naming convention for the variables to know how to join their values.
    For example, if the variable is a count, then we just sum both counts.
    However, if the variable is 'unique', then joining requires considering the sets of values.
    Other examples are: sum, std, min, max.

    :param first: a summary to be joined
    :param second: another summary to be joined
    :return: a joined summary
    """
    # Init
    reprocess_column_types = False
    # print '\n\n------------------------------------------- reduce -------------------------------------------'
    # print '1st =>', first
    # print '\n2nd =>', second

    # return '(' + first + ' <_> ' + second + ')'

    # Verify if structure is the same and dataset too
    if first['Name'] != second['Name']:
        raise Exception('Summaries are not from the same dataset.')
    elif first['Columns'] != second['Columns']:
        raise Exception('Number of columns is not the same.')

    joined = {}
    # we'll assume both summaries have the same keys
    # TODO: Protect to when both don't have the same keys
    all_keys = first.keys()
    # all_keys = [TextUtils.reencode_if_not_ascii(k) for k in first.keys()]

    # Join values based on key convention names or specific keys
    App.info('Processing all keys: %s' % all_keys)
    for key in all_keys:
        App.debug('- Key: %s' % key)
        if key.lower().endswith('min') or key.lower().endswith('begin'):
            joined[key] = min(first[key], second[key])

        elif key.lower().endswith('max') or key.lower().endswith('end'):
            joined[key] = max(first[key], second[key])

        # If the keys are not max, min, std, mean or unique, just use first -- we're assuming both are the same
        # After we join the dataset metadata we still have to join the column metadata
        elif key == Profiler.COLUMN_METADATA:
            joined['Column Metadata'] = reduce_column_metadata(first, second)

        # TODO: join geo-temp index
        # elif key == 'Geo-Temp Index':

        elif key in [
                'Rows', 'Values', 'Values Missing',
                'ETL-Profiler Processing Time (sec)',
                'ETL-Profiler Total Memory (MB)', 'GPS Values'
        ]:
            joined[key] = first[key] + second[key]

        elif key in ['Values Missing Percent']:
            total = int(first['Rows']) + int(second['Rows'])
            temp = (float(first[key]) * int(first['Rows']) +
                    float(second[key]) * int(second['Rows'])) / total
            joined[key] = round(temp, 2)

        elif key in [
                'Column Names Geo', 'Column Names Numeric',
                'Column Names Temporal', 'Column Names Text',
                'Columns Names Null'
        ]:
            if first[key] == second[key]:
                joined[key] = first[key]
            else:
                # TODO: should be processed later based on column types
                reprocess_column_types = True
        else:
            App.debug('    first["%s"]= %s' % (key, first[key]))
            if first[key]:
                joined[key] = first[key]
            else:
                App.debug('    -> Ignoring null Value')

    return joined
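
As the docstring says, more than two summaries are joined two by two; that is exactly a fold, so a list of partial summaries can be collapsed with functools.reduce. A toy sketch of that usage with invented keys and a much smaller joining rule than reduce_summaries itself:

from functools import reduce

def join_two(a, b):
    # Same naming-convention idea: sums for counts, min/max for bounds
    return {
        'Rows': a['Rows'] + b['Rows'],
        'Value Min': min(a['Value Min'], b['Value Min']),
        'Value Max': max(a['Value Max'], b['Value Max']),
    }

summaries = [{'Rows': 100, 'Value Min': 3, 'Value Max': 40},
             {'Rows': 50, 'Value Min': 1, 'Value Max': 25},
             {'Rows': 10, 'Value Min': 7, 'Value Max': 90}]
print(reduce(join_two, summaries))
# {'Rows': 160, 'Value Min': 1, 'Value Max': 90}  (key order may vary)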