Example #1
def get_geo_columns_for_index(column_types, dataset):
    App.debug('Preparing Geo Index')

    geo_cols = {}
    lat_col_name = None
    lon_col_name = None
    for count_cols, row in column_types.iterrows():
        App.debug('row= ', row['profiler-most-detected'], row['column-name'])
        # print "row['profiler-most-detected'] in INDEX_ACCEPTED_TYPES_GEO=", row['profiler-most-detected'] in INDEX_ACCEPTED_TYPES_GEO
        if row['profiler-most-detected'] in INDEX_ACCEPTED_TYPES_GEO:
            col_name = row['column-name']
            App.debug('> Found:', col_name)

            # Handle LATITUDE and LONGITUDE provided in separate columns
            if row['profiler-most-detected'] == TypeDetector.GEO_GPS_LATLON:
                if 'LONGITUDE' in col_name.upper():
                    lon_col_name = col_name
                elif 'LATITUDE' in col_name.upper():
                    lat_col_name = col_name
            else:
                geo_cols[col_name] = row['profiler-most-detected']

    if lat_col_name and lon_col_name:
        new_gps_col = NEW_GEO_COL_GPS_PREFIX
        dataset[new_gps_col] = PandasUtils.join_lat_lon_into_gps(dataset, lat_col_name, lon_col_name)
        geo_cols[new_gps_col] = TypeDetector.GEO_GPS
        App.debug('CREATED GPS COL:', dataset[new_gps_col])

    App.debug('Geo cols to index:', geo_cols)
    return geo_cols
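
For reference, a minimal standalone sketch of what joining separate LATITUDE/LONGITUDE columns into one GPS column can look like; the column names and the '(lat, lon)' output format are assumptions here, not the actual behavior of PandasUtils.join_lat_lon_into_gps.

import pandas

df = pandas.DataFrame({'LATITUDE': [40.7128, 40.7306],
                       'LONGITUDE': [-74.0060, -73.9866]})
# Assumed stand-in for PandasUtils.join_lat_lon_into_gps: one '(lat, lon)' string per row
df['generated-gps'] = df.apply(
    lambda r: '({0}, {1})'.format(r['LATITUDE'], r['LONGITUDE']), axis=1)
print df['generated-gps']
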
def save_complete_dataset(row,
                          count,
                          summaries,
                          types,
                          column_metadata,
                          gps_counts,
                          report_progress=False):
    pid = str(os.getpid())
    try:
        db = save_database(row, count, summaries)
        save_columns(db, types, column_metadata)
        if not IGNORE_INDEX:
            if COPY_GEO_INDEX:
                # link_gps_data(db.database_id, db.id)
                print '         -> GPS Data will be linked to datasets at end of processing.'
            else:
                save_gps_data(db, gps_counts)

        # return db
    except:
        msg = '[' + pid + '] ERROR in THREAD:\n'
        msg += '[' + pid + '] -----------------------------------------------------------------\n'
        msg += '[' + pid + '] ' + traceback.format_exc() + '\n'
        msg += '[' + pid + '] -----------------------------------------------------------------'
        ApplicationOptions.error(msg)
        raise
    if report_progress: report(db)
Example #3
def is_us_address(value):
    try:
        tag = usaddress.tag(value)
        return len(tag) > 2 or tag[-1] != 'Ambiguous'
    except usaddress.RepeatedLabelError as ex:
        App.debug('Error detecting Geo-Address:', ex)
        return False
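
For context, a standalone sketch of the underlying usaddress call (assuming the usaddress package is installed): usaddress.tag() returns a (tagged fields, address type) pair and raises RepeatedLabelError when a label repeats.

import usaddress

tagged, address_type = usaddress.tag('123 Main St Springfield IL 62701')
print address_type   # e.g. 'Street Address' when the parse is not 'Ambiguous'
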
Example #4
def profile_as_job(database_file):
    profiler = Profiler.Profiler()

    global STOP_RUNNING
    if STOP_RUNNING: return

    try:
        pid = str(os.getpid())
        print 'Begin: [' + pid + ']: ' + database_file
        profiler.profile(database_file)
        print 'End: [' + pid + ']: ' + database_file

    except KeyboardInterrupt:
        App.error('KeyboardInterrupt with: ' + database_file)
        STOP_RUNNING = ApplicationOptions.OPTIONS['stop_on_error']

    except:
        msg = '[' + pid + '] ERROR in THREAD:\n'
        msg += '[' + pid + '] -----------------------------------------------------------------\n'
        for line in traceback.format_exc().split('\n'):
            msg += '[' + pid + '] ' + line + '\n'
        msg += '[' + pid + '] -----------------------------------------------------------------'
        # # Printing in color here instead of App.error makes the error output easier to read and debug
        # print tc.RED + msg + tc.ENDC
        ApplicationOptions.error(msg)
        # raise
    finally:
        return profiler
Example #5
def detect_null(column):
    App.debug('Detecting: Null')
    not_null_indexes = column.astype(str).apply(
        lambda x: x.lower() not in NULL_VALUES)
    not_null = column[not_null_indexes]
    null_indexes = column[not_null_indexes == False]
    App.debug('   detected: ', len(null_indexes))
    # App.debug('   null_indexes: \n', null_indexes)
    # if len(null_indexes) > 0:
    return NULL, null_indexes, not_null
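
A minimal sketch of the same masking idea, with an assumed NULL_VALUES list (the real module constant may differ):

import pandas

NULL_VALUES = ['', 'nan', 'null', 'none', 'n/a']  # assumed sentinel list
column = pandas.Series(['A', 'N/A', 'B', None])
not_null_indexes = column.astype(str).apply(lambda x: x.lower() not in NULL_VALUES)
print column[not_null_indexes]           # values kept as non-null: A, B
print column[not_null_indexes == False]  # values treated as null: N/A, None
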
Example #6
def simplify(column_types_count):
    simple_types_count = {}
    for prefix in TYPE_PREFIXES:
        simple_types_count[prefix] = 0

    for type_name in column_types_count:
        App.debug('Computing [{0}]: {1}'.format(type_name,
                                                column_types_count[type_name]))
        prefix = type_name.split('-')[0]
        simple_types_count[prefix] += column_types_count[type_name]

    App.debug('Simple types count: ', simple_types_count)
    return simple_types_count
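
For illustration, the same prefix aggregation on made-up counts and an assumed TYPE_PREFIXES list; detector names follow the '<prefix>-<subtype>' convention used above.

TYPE_PREFIXES = ['geo', 'temporal', 'text']   # assumed prefixes
column_types_count = {'geo-gps': 60.0, 'geo-zip': 10.0, 'text-all': 30.0}

simple_types_count = dict((prefix, 0) for prefix in TYPE_PREFIXES)
for type_name in column_types_count:
    simple_types_count[type_name.split('-')[0]] += column_types_count[type_name]
print simple_types_count   # e.g. {'geo': 70.0, 'text': 30.0, 'temporal': 0}
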
def prepare_location_columns(database, metadata_types):
    
    for col in database.columns:
#        print 'Checking:' + col + ' -type:' + metadata_types[col]

        col_is_string = database[col].dtype == object
        if col_is_string and col in metadata_types.keys() and metadata_types[col].lower() == 'location':
            # Is it a composite value (embedded '<br />' or newline)?
            if True in database[col].astype(str).apply(lambda x: '<br />' in x or '\n' in x).value_counts():
                App.debug('Separating location column: ', col)
                # Split into multiple columns

                new_col = col + PREFIX_NEW_COLUMN + 'gps'
                database[new_col] = database[col].apply(lambda x: extract_gps_from_composite_location(x))
Example #8
def process_args(value_opts={}, boolean_opts={}):
    v_opts = value_opts.copy()
    b_opts = boolean_opts.copy()
    b_opts['debug'] = False
    
    opts = {}
    
    args = ARGS or sys.argv[1:]
    print 'args=', args
    
    for arg in args:
        param = arg.split('=')[0][2:] if '=' in arg else arg[2:]
        value = arg.split('=')[1] if '=' in arg else None
        
        if param == 'help':
            print 'Possible boolean flags are:'
            for opt in b_opts.keys():
                print '    --' + opt
            print 'Params that take a value (--name=value) are:'
            for opt in v_opts.keys():
                print '    --' + opt
            sys.exit(0)
            
        elif value is None:
            if param not in b_opts:
                print 'param: {0}'.format(param)
                raise SystemExit('ERROR: Invalid arg "{0}".\nValid options are: {1}, each preceded by --.'.format(arg, b_opts.keys()))
            else:
                b_opts[param] = True

        elif param in v_opts:
            v_opts[param] = value

        else:
            raise SystemExit('ERROR: Invalid value arg "{0}".\nValid options are: {1}, each preceded by -- and given a value, e.g. --nrows=10.'.format(arg, v_opts.keys()))
            
    opts.update(b_opts)
    opts.update(v_opts)
    
    if opts['debug']:
        App.start_debuging()
        print 'Considered options:'
        for k in opts:
            print '   - {0} = {1}'.format(k, opts[k])

    return opts
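
The flag/value split follows a simple convention: '--name' toggles a boolean option and '--name=value' sets a value option (values stay strings). A tiny standalone illustration of that split:

for arg in ['--debug', '--nrows=10']:
    param = arg.split('=')[0][2:] if '=' in arg else arg[2:]
    value = arg.split('=')[1] if '=' in arg else None
    print param, '->', value   # debug -> None, then nrows -> 10
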
Example #9
def most_detected(detected_types):
    if detected_types is None or len(detected_types) == 0:
        return NULL, 100  # %

    App.debug('Detected types: ', detected_types.keys())
    most_detected_type = NULL
    precision = detected_types[most_detected_type]

    for key in detected_types.keys():
        current = detected_types[key]
        App.debug('Current: [{0}]={1}'.format(key, current))
        if current > 0 and current >= precision:
            most_detected_type = key
            precision = detected_types[most_detected_type]
            App.debug('most_detected updated with key:', key)

    App.debug('[most_detected] detected_types=', detected_types)
    App.debug('[most_detected]:', most_detected_type)
    return most_detected_type, precision
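
Conceptually this picks the highest-scoring detector. With made-up percentages the same selection can be written as below (ties are broken differently than in the loop above):

detected_types = {'null': 0.0, 'geo-gps': 72.5, 'text-all': 27.5}   # illustrative values
most, precision = max(detected_types.items(), key=lambda kv: kv[1])
print most, precision   # geo-gps 72.5
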
Example #10
def run_bash(bash_command, folder=None):
    cmd = bash_command
    if type(cmd) == str: cmd = cmd.split()
    
    if folder is None:
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    else:
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, cwd=folder, stderr=subprocess.STDOUT)

    stdout_data, stderr_data = process.communicate()
    if process.returncode != 0:
        message = "%r failed, status code %s stdout %r stderr %r" % (
                       cmd, process.returncode, stdout_data, stderr_data)
        App.error(message)
        raise RuntimeError(message)
    output = ''
    if stdout_data: output += stdout_data
    if stderr_data: output += stderr_data
    return output
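
A standalone sketch of the same Popen/communicate pattern (assuming a POSIX 'ls' is available on the PATH); unlike run_bash it always merges stderr into stdout:

import subprocess

process = subprocess.Popen(['ls', '-l'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
stdout_data, stderr_data = process.communicate()
if process.returncode != 0:
    raise RuntimeError('command failed with status %s' % process.returncode)
print stdout_data
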
Example #11
def get_temp_columns_for_index(column_types, dataset):
    App.debug('Preparing Temporal Index')

    temp_cols = {}
    date_col_name = None
    date_col_type = None
    time_col_name = None
    for count_cols, row in column_types.iterrows():
        if row['profiler-most-detected'] in INDEX_ACCEPTED_TYPES_TEMP:
            col_name = row['column-name']
            App.debug('> Found:', col_name)

            if col_name.upper() == 'DATE':
                date_col_name = col_name
                date_col_type = row['profiler-most-detected']
            elif col_name.upper() == 'TIME':
                time_col_name = col_name
            else:
                temp_cols[col_name] = row['profiler-most-detected']

    # print 'date_col_name and time_col_name=', date_col_name, time_col_name
    # If dataset has TIME and DATE join both as one column
    if date_col_name and time_col_name:
        new_col_name = NEW_TEMP_COL_DATETIME_PREFIX
        dataset[new_col_name] = TimeUtils.join_date_and_time(dataset[date_col_name], dataset[time_col_name])
        temp_cols[new_col_name] = TypeDetector.TEMPORAL_DATE_TIME

    # Has only DATE, not both DATE and TIME. If it has only TIME, disregard it.
    elif date_col_name:
        temp_cols[date_col_name] = date_col_type

    App.debug('Temp cols to index', temp_cols)
    return temp_cols
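
A minimal sketch of joining separate DATE and TIME columns; the real TimeUtils.join_date_and_time may behave differently, this only shows the general pandas approach assumed here:

import pandas

df = pandas.DataFrame({'DATE': ['2015-03-01', '2015-03-02'],
                       'TIME': ['10:30', '23:45']})
# Assumed stand-in for TimeUtils.join_date_and_time: concatenate, then parse
df['generated-datetime'] = pandas.to_datetime(df['DATE'] + ' ' + df['TIME'])
print df['generated-datetime']
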
Example #12
def detect_text(column):
    App.debug('Detecting: Text')
    nulls, not_nulls = detect_null(column)[1:]

    App.debug('Text Values:', not_nulls.values[:10])
    App.debug('Non Text Values:', nulls.values[:10])

    return TEXTUAL, not_nulls, nulls
Example #13
def valid_values_of_type(type_name, column_values):
    App.debug('valid_values_of_type()')
    detectors_type, type_detectors = data_detectors()
    for detector in type_detectors:
        if detector['name'] == type_name:
            App.debug('Detecting valid values for:', type_name)
            detected, not_detected, type_name = detect_type(
                detector, detectors_type, column_values)
            # type_name, detected, not_detected = detect_using_dynamic(detector, column_values)
            App.debug('Detected: ', len(detected))
            return detected
    return None
Example #14
def detect_us_address(column):
    detected_type = GEO_ADDRESS
    prepared_col_data = column.dropna()

    is_address = prepared_col_data.astype(str).str.upper().apply(
        lambda x: is_us_address(x))
    detected = prepared_col_data[is_address == True]
    not_detected = prepared_col_data[is_address == False]

    App.debug('Detected Type:', detected_type)
    App.debug('Detected Values:', len(detected), ' - ', detected.values[:10])
    App.debug('Non Detected Values:', len(not_detected), ' - ',
              not_detected.values[:10])

    return detected_type, detected, not_detected
Example #15
def generate_index_on(index_geo_cols, index_temp_cols, dataset, db_name):
    index = pandas.DataFrame(columns=INDEX_COLUMNS)

    # No columns to build an index from
    if not index_geo_cols and not index_temp_cols: return index

    # Prepare the list of cols
    # If is empty add None just to loop into it and call the generate_partial_index function
    if index_geo_cols is None or len(index_geo_cols) == 0: index_geo_cols[PHANTON_COL] = None
    if index_temp_cols is None or len(index_temp_cols) == 0: index_temp_cols[PHANTON_COL] = None

    # Clean dataset before create partial index
    print 'Cleaning dataset to process index'
    print 'dataset size:', len(dataset)
    cols_to_clean = index_geo_cols.copy()
    cols_to_clean.update(index_temp_cols)
    for col in cols_to_clean:
        print '     > {0} - {1}'.format(col, cols_to_clean[col]).ljust(50) + '@' + TimeUtils.current_time_formated()
        # If the current col is the PHANTON col, skip it
        if col == PHANTON_COL: continue
        clean_invalid_values(cols_to_clean[col], col, dataset)
        print '          dataset size:', len(dataset)

    for geo_col in index_geo_cols.keys():
        geo_type = index_geo_cols[geo_col]

        for temp_col in index_temp_cols.keys():
            temp_type = index_temp_cols[temp_col]

            an_index = generate_partial_index_on(geo_col, geo_type, temp_col, temp_type, dataset, db_name)
            App.info('	Adding to index... '.ljust(50) + '@' + TimeUtils.current_time_formated())
            index = index.append(an_index, ignore_index=True)

    App.info('Index created with {0} rows'.format(len(index)))
    App.debug('>>>>> INDEX <<<<<\n', index)

    return index
Example #16
def process_profiles():
    try:
        global summaries
        global all_gps_rows
        global geo_index_by_dataset
        global all_gps_rows_by_database
        global all_zip_rows
        global gps_db_count
        global zip_db_count
        global profiled_database_types
        profiled_database_types = None
        global profiled_column_metadata
        profiled_column_metadata = None
        summaries = pandas.DataFrame()
        # Count of rows by ZIP and GPS
        all_gps_rows = pandas.DataFrame()
        all_zip_rows = pandas.DataFrame()
        geo_index_by_dataset = pandas.DataFrame()
        all_gps_rows_by_database = pandas.DataFrame()
        # Count of databases that have ZIP and GPS
        gps_db_count = pandas.DataFrame()
        zip_db_count = pandas.DataFrame()

        i = 0
        for profiler in profilers:
            i += 1
            summaries = pandas.concat([summaries[:], profiler.last_sumary])

            if profiler.last_sumary['ETL-Profiler Status'][0] == 'OK':
                print 'Counting zip and gps data: ', i, '/', len(profilers)
                print '    by Rows'
                all_zip_rows = PandasUtils.merge_series_summing_values(
                    all_zip_rows, profiler.last_zip_rows)
                all_gps_rows = PandasUtils.merge_series_summing_values(
                    all_gps_rows, profiler.last_gps_rows)

                print '    rows by Databases'
                all_gps_rows_by_database = PandasUtils.merge_by_database(
                    all_gps_rows_by_database, profiler.last_gps_rows,
                    profiler.last_sumary.ix[0].Name)
                if not opts['ignore_index']:
                    geo_index_by_dataset = geo_index_by_dataset.append(
                        profiler.last_geo_index, ignore_index=True)

                print '    by Databases'
                # Count this database only once, even if it appears multiple times
                temp = pandas.DataFrame(profiler.last_zip_rows,
                                        columns=['count'])
                temp['count'] = 1
                temp = temp['count']
                zip_db_count = PandasUtils.merge_series_summing_values(
                    zip_db_count, temp)

                temp = pandas.DataFrame(profiler.last_gps_rows,
                                        columns=['count'])
                temp['count'] = 1
                temp = temp['count']
                gps_db_count = PandasUtils.merge_series_summing_values(
                    gps_db_count, temp)

                if profiled_database_types is None:
                    profiled_database_types = profiler.types_summary.copy()
                else:
                    profiled_database_types = profiled_database_types.append(
                        profiler.types_summary, ignore_index=True)

                if profiled_column_metadata is None:
                    profiled_column_metadata = profiler.column_metadata.copy()
                else:
                    profiled_column_metadata = profiled_column_metadata.append(
                        profiler.column_metadata, ignore_index=True)
    except:
        if opts['stop_on_error']:
            raise
        ApplicationOptions.error(Exception('Error processing profilers'))

    if 'stop_on_error' in opts and opts['stop_on_error'] and has_error_on(
            summaries):
        ApplicationOptions.error(Exception('Error on summaries'))
Example #17
def types_of(column):
    App.debug('Detecting types of: ', column.name)
    App.debug('    size: ', len(column))
    detectors_type, detectors = data_detectors()
    App.debug('    Initializing detected_types. ')
    detected_types = {}
    # Initialize with all zeros
    for detector in detectors:
        detected_types[detector[DETECTOR_NAME]] = 0.0
    if len(column) == 0:
        App.debug('Empty column!')
        return detected_types

    remaining_values_to_detect_type = column.copy()

    ## If column is in unicode, transform to ASCII to avoid errors during processing.
    ## Check for unicode in column values
    unicode_values = remaining_values_to_detect_type.apply(
        lambda x: (type(x) is unicode))
    unicode_values_counts = unicode_values.value_counts()
    ## Transform the unicode values into ascii if there are any
    if True in unicode_values_counts.keys() and unicode_values_counts[True] > 0:
        App.info('Recoding values... (this can take some time)')
        remaining_values_to_detect_type = remaining_values_to_detect_type.apply(
            lambda x: TextUtils.reencode_text_if_not_ascii(x))

    for detector in detectors:
        detected, not_detected, type_name = detect_type(
            detector, detectors_type, remaining_values_to_detect_type)
        detected_types[type_name] = round(
            len(detected) * 100.0 / len(column), PERCENTUAL_PRECISION)
        remaining_values_to_detect_type = not_detected
        App.debug('    Remaining: ', len(not_detected))


    # if len(remaining_values_to_detect_type) == 0:
    #     break
    return detected_types
def metadata_of(database_id, first=True, portal_url=NYC_OPENDATA_URL_BASE):
    App.debug(' SocrataUtils.metadata_of({0})'.format(database_id))
    
    url = portal_url + '/views/' + database_id + JSON_EXTENSION + APP_TOKEN_PARAM
    # App.debug('url: ', url)
    metadata = {'source':'Socrata'}
    # try:
    if True:
        App.debug('Url to get metadata from: ' + url)
        response = urllib.urlopen(url)
        data = json.loads(response.read())
        
        if 'id' in data and data['id'] == database_id:
            App.debug('    -> Success retrieving metadata!')
            App.debug('Retrieved metadata Keys:\n - ' + '\n - '.join(data.keys() ))
            App.debug('Retrieved metadata:\n' + json.dumps(data, indent=4, sort_keys=True))
            App.debug('==========================================')
            
            if 'rowIdentifierColumnId' in data:
                id_column_id = data['rowIdentifierColumnId']
                for col in data['columns']:
                    if col['id'] == id_column_id: 
                        metadata[MetadataConstants.ID_COLUMN] = col['name']
            else:
                metadata[MetadataConstants.ID_COLUMN] = None
                

            metadata[MetadataConstants.METADATA_SOURCE_URL] = key_as_str(data, url)
            metadata[MetadataConstants.METADATA_SOURCE_NAME] = key_as_str(data, 'Socrata Portal ' + portal_url)
                
            metadata[MetadataConstants.NAME] = key_as_str(data, 'name')
            metadata[MetadataConstants.PREFIX + 'Description'] = key_as_str(data, 'description')
            metadata[MetadataConstants.DISPLAY_TYPE_KEY] = key_as_str(data, 'displayType')
            metadata[MetadataConstants.PREFIX + 'Category'] = key_as_str(data, 'category')
            metadata[MetadataConstants.PREFIX + 'Owner'] = key_as_str(data['owner'], 'displayName')
            metadata[MetadataConstants.PREFIX + 'Download Count'] = key_as_str(data, 'downloadCount')
            metadata[MetadataConstants.PREFIX + 'View Count'] = key_as_str(data, 'viewCount')
            metadata[MetadataConstants.PREFIX + 'Comments'] = key_as_str(data, 'numberOfComments')
            metadata[MetadataConstants.PREFIX + 'Author'] = key_as_str(data['tableAuthor'], 'displayName')
            metadata[MetadataConstants.PREFIX + 'Id'] = key_as_str(data, 'id')
            metadata[MetadataConstants.PREFIX + 'Attribution'] = key_as_str(data, 'attribution')
            metadata[MetadataConstants.PREFIX + 'View Type'] = key_as_str(data, 'viewType')
            metadata[MetadataConstants.PREFIX + 'Display Type'] = key_as_str(data, 'displayType')
            metadata[MetadataConstants.PREFIX + 'Number of Coments'] = key_as_str(data, 'numberOfComments')
            ##> Discover if this dataset is a view
            if 'modifyingViewUid' not in data: metadata[MetadataConstants.PREFIX + 'View From'] = None
            else: metadata[MetadataConstants.PREFIX + 'View From'] = key_as_str(data,'modifyingViewUid')
            
            timestamp = int(data['createdAt'].__str__())
            metadata[MetadataConstants.PREFIX + 'Created At'] = datetime.datetime.fromtimestamp(timestamp).__str__()
            timestamp = int(data['viewLastModified'].__str__())
            metadata[MetadataConstants.PREFIX + 'Last Modified'] = datetime.datetime.fromtimestamp(timestamp).__str__()
            timestamp = int(data['publicationDate'].__str__())
            metadata[MetadataConstants.PREFIX + 'Publication Date'] = datetime.datetime.fromtimestamp(timestamp).__str__()
            metadata['Tags'] = key_as_str(data, 'tags')
            if metadata['Tags'] == 'None': metadata['Tags'] = None
            
            if 'metadata' in data and 'custom_fields' in data['metadata']:
                custom_fields = data['metadata']['custom_fields']
                if 'Update' in custom_fields and 'Update Frequency' in custom_fields['Update']: 
                    metadata[MetadataConstants.PREFIX + 'Update Frequency'] = custom_fields['Update']['Update Frequency'].__str__()
                if 'Dataset Information' in custom_fields and 'Agency' in custom_fields['Dataset Information']: 
                    metadata[MetadataConstants.PREFIX + 'Agency'] = custom_fields['Dataset Information']['Agency'].__str__()

            types = {}
            columns = data['columns']
            for col in columns:
                col_name = col['name'].strip(' ').encode('ascii','ignore')
                col_type = col['dataTypeName']
                types[col_name] = col_type
            metadata[MetadataConstants.PREFIX + 'Types'] = types

            metadata[MetadataConstants.STATUS] = MetadataConstants.STATUS_SUCCESS
        else:
            if 'Cannot find view with id' in data['message']:
                metadata[MetadataConstants.STATUS] = MetadataConstants.STATUS_ERROR_VIEW_NOT_FOUND 
            else:
                metadata[MetadataConstants.STATUS] = 'Error'
            metadata['message'] = data['message']
    # except e:
    #     raise e
    #      #This means that it is not from socrata
    #      # Or that some other error occurred
    #      #just return None
    #     if first: 
    #         App.debug('Waiting to try again')
    #         sleep(0.5)
    #         return metadata_of(database_id, first=False)
    #     metadata[MetadataConstants.STATUS] = 'Error Exception'
    #     metadata['message'] = 'Error acessing a Socrata Portal with url: {0}'.format(url)
    
    if metadata[MetadataConstants.STATUS] != MetadataConstants.STATUS_SUCCESS: 
        # logging.warn(metadata[STATUS])
        App.debug('WARNING: ', metadata[MetadataConstants.STATUS])
    
    # Before returning, convert any unicode values to plain str
    for k in metadata.keys():
        metadata[k] = TextUtils.reencode_text_if_not_ascii(metadata[k])
#         if type(metadata[k]) is unicode: 
# #            print '    Unicode info found on key: ' , k, '=', metadata[k] 
#             metadata[k] = metadata[k].encode('ascii','ignore')

    ##> If there was an error, show url so user can check
    if metadata[MetadataConstants.STATUS] == MetadataConstants.STATUS_ERROR_VIEW_NOT_FOUND: 
        App.info('    Metadata not found on Socrata with url: ' + url)
    
    ##> Show dataset retrieved name to indicate success
    if metadata[MetadataConstants.STATUS] == MetadataConstants.STATUS_SUCCESS: 
        App.info('    OK. Dataset Retrieved Name: ' + metadata[ MetadataConstants.NAME ] )

    App.debug('Retrieved Metadata: \n' + json.dumps(metadata, ensure_ascii=False, indent=4, sort_keys=True) )
    return metadata
def load_database(database_file, skiprows=None, nrows=None):
    # Assume a Socrata CSV file by default; the wget on compute does not always keep the file extension as it should.
    file_type = 'CSV'  # Default if no extension is found.
    if database_file.endswith('.csv'): file_type = 'CSV'
    if database_file.endswith('.json'): file_type = 'JSON'

    file_encoding = get_encoding(database_file)
    App.info('   > File encoding: %s' % file_encoding)

    if file_type == 'CSV':
        App.debug('CSV: Reading column headers from first line.')
        cols = FileUtils.get_cols_from_csv_header(database_file)
        App.debug('Preparing column types for pandas.')
        dtypes = prepare_dtypes_for_loading(cols)
        try:
            App.debug('Trying to read csv...')
            return pandas.read_csv(database_file,
                                   skiprows=skiprows,
                                   nrows=nrows,
                                   low_memory=LOW_MEMORY,
                                   encoding=file_encoding,
                                   dtype=dtypes)
        except:
            App.debug('Default CSV did not work.')
            App.debug('Trying to read with tab as separator...')
            # This error may happen because the file is tab-separated instead of comma-separated
            return pandas.read_csv(database_file,
                                   skiprows=skiprows,
                                   nrows=nrows,
                                   low_memory=LOW_MEMORY,
                                   encoding=file_encoding,
                                   sep='\t',
                                   dtype=dtypes)

    elif file_type == 'JSON':
        # This works for JSON in the Socrata format, which has a 'data' field.
        # Otherwise, assume the file already holds the data.
        json_file = open(database_file)
        json_data = json.load(json_file)

        if 'data' in json_data.keys():
            App.debug('JSON: Read data from data field. (Socrata format)')
            data = json_data['data']
            cols = []
            cols_with_sub_cols = []

            App.debug('Getting column names from metadata...')
            for col in json_data['meta']['view']['columns']:
                cols.append(col['name'])

                if 'subColumnTypes' in col.keys():
                    print '    (!) Column ', col['name'], ' has sub columns: ', col['subColumnTypes']
                    cols_with_sub_cols.append(col)

            dtypes = prepare_dtypes_for_loading(cols)
            df = pandas.DataFrame(data, columns=cols)

            #create subcolumn data
            for col in cols_with_sub_cols:
                print '    Fetching sub columns of ', col['name']
                i = 0
                for sub_col in col['subColumnTypes']:
                    print '         >', sub_col
                    df[col['name'] + NEW_COLUMN_NAME_SEPARATOR +
                       sub_col] = df[col['name']].apply(lambda x: x[i])
                    i += 1
                print '    Removing source column ', col['name'], ' from data frame.'
                #Then remove multivalored column
                df.drop(col['name'], axis=1, inplace=True)
            return df

        else:
            App.debug(
                'JSON: There is no data field. Getting column names from JSON keys.'
            )
            #get the list of cols from the json
            cols = list(json_data.keys())
            dtypes = prepare_dtypes_for_loading(cols)
            # DataFrame() takes no per-column dtype mapping, so build the frame first and cast afterwards
            return pandas.DataFrame(json_data, columns=cols).astype(dtypes)
    else:
        print '===> PandasUtilError: Invalid database file: [{0}]'.format(database_file)
        #        raise ApplicationExecption('File must be json or csv!'.format(database_file))
        raise RuntimeError(
            'File must be json (with data inside a data field) or csv! Got: [{0}]'.format(
                database_file))