Example no. 1
def main(database_choice):
    """
    Main routine, calls all the other ones in this file as needed.

    Big picture:
        - Use manifest.csv to determine which files we want to load
        - Compare the current manifest.csv to the SQL manifest table, which reflects manifest.csv as
          of the last time this script was run and tells us whether each file has already been loaded into this database
        - Use meta.json to make sure each CSV file has all the fields we expect, and to decide the data type of each field
        - Load the csv into the database

    If there is an error loading a file (either a flagged error from fields not matching, or a parsing error from a wrong data type):
    - skip loading this file,
    - report the error to SQL using update_sql_manifest(status="error")
    - use logging.warning() to write the specific error encountered to the log file
    - at the end of loading, print an error message to the console
    """
    # Check if local database is running
    try:
        is_local_db_running = dbtools.check_for_local_database()
        if not is_local_db_running:
            dbtools.start_local_database_server()
            # Load manifest data into a table.
            dbtools.create_manifest_table('manifest_sample.csv')
    except Exception as e:
        print("Could not start postgres database. Is Docker running? ({})".format(e))

    meta = load_meta_data('meta_sample.json')
    database_connection = dbtools.get_database_connection(database_choice)

    manifest = ManifestReader('manifest_sample.csv')

    if not manifest.has_unique_ids():
        raise ValueError('Manifest has duplicate unique_data_id!')

    for manifest_row in manifest:
        logging.info("Preparing to load row {} from the manifest".format(
            len(manifest)))

        sql_manifest_row = get_sql_manifest_row(
            database_connection=database_connection, csv_row=manifest_row)
        csv_reader = DataReader(manifest_row=manifest_row)

        if csv_reader.should_file_be_loaded(sql_manifest_row=sql_manifest_row):
            if csv_reader.do_fields_match(meta):

                print("  Ready to clean {}".format(
                    csv_reader.destination_table))
                for data_row in csv_reader:
                    #clean rows and add them to cleaned.csv (see the sketch after this example)
                    #TODO normalize the date functions for all date field types
                    #TODO apply cleaning functions specific to this data file
                    #TODO write the row to a temporary cleaned.csv file
                    pass

                print("  Ready to load")

                #TODO write the cleaned.csv to the appropriate SQL table
                #TODO add/update the appropriate row in the SQL manifest table to indicate the new status
                pass
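
For reference, a minimal sketch of what the cleaning loop above might become. The meta.json structure (a 'fields' list with 'source_name' and 'type' keys per table), the 'destination_table' manifest key, and the normalize_date helper are all assumptions for illustration, not the project's actual API:

import csv
from datetime import datetime

def normalize_date(value):
    """Try a few common source formats; return an ISO 8601 date string."""
    for fmt in ('%Y-%m-%d', '%m/%d/%Y', '%m/%d/%y'):
        try:
            return datetime.strptime(value, fmt).date().isoformat()
        except ValueError:
            continue
    return value  #leave unparseable values unchanged for later inspection

def clean_and_write(csv_reader, meta, manifest_row, output_path='cleaned.csv'):
    """Normalize each row from csv_reader and write it to a temporary cleaned.csv."""
    #assumed meta.json shape: {table: {'fields': [{'source_name': ..., 'type': ...}, ...]}}
    fields = meta[manifest_row['destination_table']]['fields']
    fieldnames = [f['source_name'] for f in fields]

    with open(output_path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
        writer.writeheader()
        for data_row in csv_reader:
            for field in fields:
                if field['type'] == 'date':
                    data_row[field['source_name']] = normalize_date(data_row[field['source_name']])
            writer.writerow(data_row)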
Example no. 2
    def get_dist(self, unique_data_ids=None, sample=False, output_type='csv', db='local_database'):

        if unique_data_ids is not None and 'wmata_dist' in unique_data_ids:
            u = 'wmata_dist'
            #Variable to hold data until written to file
            self.distOutput = []
            self.distHeader = ('nlihc_id', 'type', 'stop_id_or_station_code',
                               'dist_in_miles', 'crow_distance', 'building_lat',
                               'building_lon', 'stop_or_station_lat',
                               'stop_or_station_lon')


            #First, find which projects we should be calculating from
            try:
                #Configure the connection
                engine = dbtools.get_database_engine(db)
                conn = dbtools.get_database_connection(db)
                logger.info("  Connected to Housing Insights database")
                columnset = conn.execute("select column_name from INFORMATION_SCHEMA.COLUMNS where TABLE_NAME='project'").fetchall()

                #Get the rows, fetching them before the connection is closed
                #so the result sets remain usable below
                proj_query = 'select * from project'
                if sample:
                    proj_query = proj_query + " limit 1"
                rows = conn.execute(proj_query).fetchall()

                conn.close()
                engine.dispose()
            except Exception as e:
                logger.error("I am unable to connect to the database: %s", e)
                return

            columns = [c for c in columnset]

            total_rows = len(rows)
            logger.info("Total rows: %s", total_rows)
            logger.info("Total rows: %s", total_rows)

            #Get the rail stations once (wmata provides no option to fetch only the closest ones)
            wmata_headers = self._get_wmata_headers()
            railResponse = requests.get("https://api.wmata.com/Rail.svc/json/jStations", headers=wmata_headers)
            self.railStations = railResponse.json()['Stations']

            #For every project, get nearby stations and walking distances
            radius = self._get_meters(0.5)  #search radius of half a mile (invariant, so computed once)
            for idx, row in enumerate(rows):

                project_details = self._get_project_info(row, columns)
                lat = project_details['lat']
                lon = project_details['lon']

                if lat is None or lon is None:
                    logger.warning("  Lat or lon not available for project {}".format(project_details['nlihcid']))
                else:
                    logger.info("  Processing project %s of %s", idx + 1, total_rows)

                    # find all metro stations within 0.5 miles
                    logger.info("  Starting processing rail stations for %s", project_details['nlihcid'])
                    self._find_rail_stations(self.railStations,project_details,radius,sample=sample,db=db)
                    logger.info("  Completed processing rail stations for project id %s", project_details['nlihcid'])

                    # find all bus stops within 0.5 miles
                    logger.info("  Starting processing bus stations for project id %s", project_details['nlihcid'])
                    self._find_bus_stations(project_details, radius,sample=sample,db=db)
                    logger.info("  Completed processing bus stations for project id %s", project_details['nlihcid'])

            #Save the data
            if output_type == 'csv':
                self._array_to_csv(self.distHeader, self.distOutput, self.output_paths[u])

            elif output_type == 'stdout':
                print("==========================================================================\n")
                print(self.distHeader)
                print("==========================================================================\n")
                for line in self.distOutput:
                    print(line)

            else:
                #Any other value is treated as an explicit output path
                self._array_to_csv(self.distHeader, self.distOutput, output_type)
        else:
            #not a unique data id supported by this class
            pass
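
get_dist relies on a few small helpers that are not shown here. A minimal sketch of what they might look like as methods on the same class, assuming a statute-mile conversion, WMATA's documented api_key request header, and a haversine great-circle computation for the 'crow_distance' column (all hypothetical implementations, not the project's actual code):

import math

METERS_PER_MILE = 1609.344  #statute mile

class WmataHelpersSketch(object):
    def _get_meters(self, miles):
        """Convert a radius in miles to meters for the WMATA API calls."""
        return miles * METERS_PER_MILE

    def _get_wmata_headers(self):
        """Request headers for the WMATA API; assumes the key is stored on self."""
        return {'api_key': self.wmata_api_key}

    def _crow_distance(self, lat1, lon1, lat2, lon2):
        """Straight-line ('as the crow flies') distance in miles via the haversine formula."""
        lat1, lon1, lat2, lon2 = map(math.radians, (lat1, lon1, lat2, lon2))
        a = (math.sin((lat2 - lat1) / 2) ** 2
             + math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2)
        return 2 * 3958.8 * math.asin(math.sqrt(a))  #3958.8 = Earth's mean radius in miles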
Example no. 3
    def _get_walking_distance(self, srcLat, srcLon, destLat, destLon, db='local_database'):
        """Returns the walking distance in meters between two locations.

           Parameters:
           srcLat - latitude of the source location
           srcLon - longitude of the source location
           destLat - latitude of the destination location
           destLon - longitude of the destination location
           db - database name used for the cached-distance lookup

           The Mapbox API key is read from self.mapbox_api_key.
           """

        if self.use_cached_distance:
            try:
                #Configure the connection
                engine = dbtools.get_database_engine(db)
                conn = dbtools.get_database_connection(db)

                #Pull columns to check that wmata_dist has the lat/lon columns needed for cache lookups
                columnset = conn.execute("select column_name from INFORMATION_SCHEMA.COLUMNS where TABLE_NAME='wmata_dist'")
                columns = [c[0] for c in columnset]

                if ('building_lat' in columns and 'building_lon' in columns
                        and 'stop_or_station_lat' in columns and 'stop_or_station_lon' in columns):
                    #See if a cached row exists for this source/destination pair
                    proj_query = ("select * from wmata_dist where building_lat='{}' and building_lon='{}'"
                                  " and stop_or_station_lat='{}' and stop_or_station_lon='{}'"
                                  ).format(srcLat, srcLon, destLat, destLon)
                    proxy = conn.execute(proj_query)
                    results = [dict(x) for x in proxy.fetchall()]

                    if len(results) != 0:
                        logger.info("  Found cached row!")
                        walking_distance = results[0]['dist_in_miles']

                        conn.close()
                        engine.dispose()

                        return float(walking_distance) * self.meters_per_mile
                    else:
                        logger.info("  Couldn't find cached row for %s", proj_query)
                else:
                    logger.info("Couldn't find all columns")

                conn.close()
                engine.dispose()
            except Exception as e:
                logger.error("Unable to connect to the database: %s", e)

        distReqCoords = '{},{};{},{}'.format(srcLon, srcLat, destLon, destLat)

        #self.mapbox_api_key holds the query parameters for the REST call (e.g. the access token)
        mapbox_params = self.mapbox_api_key

        # According to the documentation this doesn't work in the Python SDK, so we use the REST API instead
        walkDistResponse = requests.get("https://api.mapbox.com/directions/v5/mapbox/walking/" + distReqCoords, params=mapbox_params)
        time.sleep(0.8)

        # Mapbox rate-limits requests; retry up to 10 times, pausing between attempts
        i = 0
        while "Too Many Requests" in str(walkDistResponse.json()) and i < 10:
            walkDistResponse = requests.get("https://api.mapbox.com/directions/v5/mapbox/walking/" + distReqCoords, params=mapbox_params)
            i = i + 1
            time.sleep(0.8)
            if i == 10:
                raise Exception("Mapbox API still returning 'Too Many Requests' after 10 retries")

        walking_distance = walkDistResponse.json()['routes'][0]['legs'][0]['distance']
        print("Return value: ", walking_distance)
        return walking_distance
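
Note that _get_walking_distance returns meters while the wmata_dist cache stores dist_in_miles, so callers convert on the way back. A hypothetical call site (the station keys follow the WMATA jStations response already used in get_dist):

#Hypothetical usage inside a helper such as _find_rail_stations
dist_meters = self._get_walking_distance(
    project_details['lat'], project_details['lon'],
    station['Lat'], station['Lon'], db=db)
dist_in_miles = round(dist_meters / self.meters_per_mile, 2)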