Code example #1
def main():

    if len(sys.argv) < 3:
        try:
            start_date = datetime(2013,5,26,16)
            #load the current model by default
            file_name = "nn_current_model"
            fileObject = open(file_name,'rb')
        except Exception:
            raise Exception("Check the variables defined in the script: the default start date or the file name of the saved model is incorrect")
    else:
        try:
            start_date = datetime.strptime(sys.argv[2], "%Y-%m-%d %H:%M:%S")
            #load the model named on the command line
            file_name = sys.argv[1]
            fileObject = open(file_name,'rb')
        except Exception:
            raise Exception("Please enter the date in '%Y-%m-%d %H:%M:%S' format and the right name for the saved model")


    print("Using {}, for date: {}".format(file_name, start_date))
    pipeline = pickle.load(fileObject)

    #get data for model input
    sql_string = 'select date, location_name, if(WEEKDAY(date)<5, true, false) AS weekdays, WEEKDAY(date) AS dayoftheweek, co  from Samples where user_id=2 and date="{0}" and (location_name="Prospect" or location_name="Rozelle" or location_name="Liverpool" or location_name="Chullora") order by location_name;'.format(start_date)

    fixed_samples_data = data_from_db(sql_string, exit_on_zero=False, verbose=False)
    try:
        #at least 4 rows (one per station) must be returned;
        #sometimes 8 rows come back due to duplicate records
        assert fixed_samples_data is not None and len(fixed_samples_data) >= 4
    except AssertionError:
        raise Exception("No rows on {0}\n".format(start_date))

    try:
        specific_hour = start_date.hour

        if use_hour_simplification_feature:
            hour_feature = classify_hour(specific_hour)
        else:
            hour_feature = specific_hour

        FIXED_LOCATIONS = ['Chullora', 'Liverpool', 'Prospect', 'Rozelle']
        #first reading per station; fall back to the mean across stations when a value is NaN
        station_co = {location: fixed_samples_data[fixed_samples_data.location_name==location]['co'].iloc[0] for location in FIXED_LOCATIONS}
        mean_fixed = np.nanmean(station_co.values())
        co_chullora = station_co['Chullora'] if not np.isnan(station_co['Chullora']) else mean_fixed
        co_liverpool = station_co['Liverpool'] if not np.isnan(station_co['Liverpool']) else mean_fixed
        co_prospect = station_co['Prospect'] if not np.isnan(station_co['Prospect']) else mean_fixed
        co_rozelle = station_co['Rozelle'] if not np.isnan(station_co['Rozelle']) else mean_fixed

        #prepare data to be inserted into svm_estimates table
        data = [fixed_samples_data['weekdays'].iloc[0], hour_feature, get_season(start_date), 0, 0, co_liverpool, co_prospect, co_chullora, co_rozelle]
    except Exception as ex:
        raise Exception("Error on {}; SQL statement is {}; error is {}\n".format(start_date, sql_string, str(ex)))
Code example #2
def main():
    # Open database connection
    db = MySQLdb.connect("localhost","pollution","pollution","pollution_monitoring" )
    # prepare a cursor object using cursor() method
    cursor = db.cursor()

    #start_date = datetime(2015,5,1)
    start_date = datetime(2015,5,17)
    end_date = datetime(2016,5,2)


    #log file
    log_file_name = "populate_SVM_estimates_{0}_log.txt".format(datetime.now().strftime("%Y-%m-%d %H"))
    logObject = open(log_file_name,'w')
    logObject.write("Start date: {0}, End date: {1}\n".format(start_date, end_date))

    #load current model
    file_name = "svm_current_model"
    fileObject = open(file_name,'rb')
    pipeline = pickle.load(fileObject)

    #variable to store the number of rows committed to the db
    row_count = 0

    #precheck so the loop below only visits datetimes not already in the table
    sql_string = """select distinct datetime from {0}; """.format(svm_estimates_table)
    cursor.execute(sql_string)
    #fetchall() returns 1-tuples, so unpack the datetime from each row
    inserted_datetimes = {row[0] for row in cursor.fetchall()}

    total_hours = (end_date - start_date).days*24
    total_datetimes = {start_date + timedelta(seconds=i*3600) for i in xrange(total_hours)}

    remaining_datetimes = sorted(total_datetimes - inserted_datetimes)

    #iterate through all the remaining hours
    for start_date in remaining_datetimes:
        #remaining_datetimes already excludes inserted rows; skip hours before 08:00
        if start_date.hour < 8:
            continue

        #get data for model input
        sql_string = 'select date, location_name, if(WEEKDAY(date)<5, true, false) AS weekdays, WEEKDAY(date) AS dayoftheweek, co  from Samples where user_id=2 and date="{0}" and (location_name="Prospect" or location_name="Rozelle" or location_name="Liverpool" or location_name="Chullora") order by location_name;'.format(start_date)

        fixed_samples_data = data_from_db(sql_string, exit_on_zero=False)
        try:
            #at least 4 rows (one per station) must be returned;
            #sometimes 8 rows come back due to duplicate records
            assert fixed_samples_data is not None and len(fixed_samples_data) >= 4
        except AssertionError:
            logObject.write("No rows on {0}\n".format(start_date))
            continue

        try:
            specific_hour = start_date.hour

            if use_hour_simplification_feature:
                hour_feature = classify_hour(specific_hour)
            else:
                hour_feature = specific_hour

            FIXED_LOCATIONS = ['Chullora', 'Liverpool', 'Prospect', 'Rozelle']
            #first reading per station; fall back to the mean across stations when a value is NaN
            station_co = {location: fixed_samples_data[fixed_samples_data.location_name==location]['co'].iloc[0] for location in FIXED_LOCATIONS}
            mean_fixed = np.nanmean(station_co.values())
            co_chullora = station_co['Chullora'] if not np.isnan(station_co['Chullora']) else mean_fixed
            co_liverpool = station_co['Liverpool'] if not np.isnan(station_co['Liverpool']) else mean_fixed
            co_prospect = station_co['Prospect'] if not np.isnan(station_co['Prospect']) else mean_fixed
            co_rozelle = station_co['Rozelle'] if not np.isnan(station_co['Rozelle']) else mean_fixed

            #prepare data to be inserted into svm_estimates table
            data = [fixed_samples_data['weekdays'].iloc[0], hour_feature, get_season(start_date), 0, 0, co_liverpool, co_prospect, co_chullora, co_rozelle]
        except Exception as ex:
            logObject.write("Error on {}; SQL statement is {}; error is {}\n".format(start_date, sql_string, str(ex)))
            continue

        #build the model input as a single-row array
        X = np.float64([data])

        #go through 100x100 grid pixels
        for i in xrange(100):
            for j in xrange(100):
                X[0][3] = i 
                X[0][4] = j
                y_val = pipeline.predict(X)[0]

                insert_data = ['"{0}"'.format(start_date), '"{0}"'.format(start_date.date()), start_date.hour, fixed_samples_data['weekdays'].iloc[0], fixed_samples_data['dayoftheweek'].iloc[0], get_season(start_date), i, j, co_liverpool, co_prospect, co_chullora, co_rozelle, 5.7464*y_val+3.48652]

                insert_str = """insert ignore into {0} (datetime, date, time, weekdays, dayoftheweek, season, grid_location_row, grid_location_col, co_chullora, co_liverpool, co_prospect, co_rozelle, co_original) values ({1}); """.format(svm_estimates_table, ','.join([ str(x) for x in insert_data]))
                #print insert_str
                try:
                    cursor.execute(insert_str)
                except MySQLdb.Error:
                    print insert_str
                    pdb.set_trace()
        #commit once per timestamp (the 100x100 grid adds 10,000 rows)
        db.commit()
        row_count += 10000
        print("Committed date: {0} with row count: {1}".format(start_date, row_count))
Code example #3
def main():
    """

    Populating the Estimates table was the initial method, used first by the NN and
    then by the SVM. Here, interpolation was used to calculate values for all the
    grid cells, and this data was then fed into the model for training.

    This technique could be dropped in favour of populating samplesGridData and
    using the model to estimate points, i.e. no interpolation used for estimation.

    """

    # Open database connection
    db = MySQLdb.connect("localhost","pollution","pollution","pollution_monitoring" )

    # prepare a cursor object using cursor() method
    cursor = db.cursor()

    # get the oldest date
    #sql_str = """select distinct date from Samples  where user_id = 2 order by date asc limit 1;"""
    # start date
    #cursor.execute(sql_str)
    #start_date = cursor.fetchone()[0]
    start_date = datetime(2013,3,1)

    # get the newest date
    #sql_str = """select distinct date from Samples  where user_id = 2 order by date desc limit 1;"""
    # end date
    #cursor.execute(sql_str)
    #end_date = cursor.fetchone()[0]
    #override 
    end_date = datetime(2015,11,1)

    #Choose the data table to use
    zero_mean = True
    non_zero_grid_count_threshold = 10

    #table which has data inserted for model training
    data_table =  "Estimates_zeroMean" if zero_mean else  "Estimates_old"

    #epochs are the time periods to iterate over
    #provide some buffer time (extra epoch)
    epochs =  ((end_date - start_date).days*24 + 1)
    
    print "Start data and end date: {0} to {1}".format(start_date, end_date)
    print "Number of hours of data: {0}".format(epochs)
    
    first_date = start_date
    total_rows = 0
    no_epoch_count = 0
    skip_epoch_count = 0

    for _ in xrange(epochs):
        #do a quick check to see if data for a datetime exists, skip if it does
        sql_str = """ select datetime from {0} where datetime="{1}" limit 1;""".format(data_table, first_date)
        cursor.execute(sql_str)
        if cursor.rowcount > 0:
            skip_epoch_count += 1
            first_date += timedelta(seconds=3600)
            continue

        #is there sensor data, skip if not
        select_str = """select * from Samples where user_id != 2 and date like "{0}%" and co < 60 and co > 0 limit 1;""".format(first_date.strftime("%Y-%m-%d %H"))
        cursor.execute(select_str)
        if cursor.rowcount == 0:
            skip_epoch_count += 1
            print "Skipped {0} due to lack of sensor data".format(first_date)
            first_date += timedelta(seconds=3600)
            continue
    
        #get data for an hour
        select_str = """SELECT 
                            date as datetime, DATE_FORMAT(date,"%Y-%m-%d") AS date, DATE_FORMAT(date,"%H") as time, if(WEEKDAY(date)<5, true, false) AS weekdays, WEEKDAY(date) AS dayoftheweek, latitude, longitude, user_id, co 
                        FROM 
                            Samples 
                        WHERE 
                            user_id != 2 and date between "{0}" and date_add("{0}", interval 1 hour) and co is not null and latitude is not null and longitude is not null AND (latitude <= {1} AND latitude >= {2}) AND (longitude >= {3} AND longitude <= {4}) AND co > 0 AND co < 60
                        ORDER BY
                            date asc """.format(first_date, NW_BOUND[0], SW_BOUND[0], NW_BOUND[1], NE_BOUND[1])
        df_mysql = data_from_db(select_str, verbose=True, exit_on_zero=False)
        if df_mysql is None:
            print "No data returned for {0}".format(first_date)
            no_epoch_count += 1
            first_date += timedelta(seconds=3600)
            continue

        #check the number of bins populated
        _, non_zero_grid_count = create_mean_value_grid(df_mysql)

        #discount grid if it doesn't have enough pixels (i.e. less than threshold)
        if non_zero_grid_count < non_zero_grid_count_threshold:
            skip_epoch_count += 1
            print "Skipped {0} due to non zero grid count less than threshold".format(first_date)
            first_date += timedelta(seconds=3600)
            continue

        #interpolate to get a grid
        known, z, ask, _ = gridify_sydney(df_mysql, verbose=False, heatmap=False)
        
        if len(known) == 0:
            raise Exception("No data for {0}".format(first_date))

        columns = df_mysql.columns.values
        vals = list(df_mysql.iloc[0])
        row_dict = dict(zip(columns, vals))
        relevant_columns = ['time','weekdays','dayoftheweek']
        data_common = ['"{0}"'.format(row_dict['datetime'].strftime("%Y-%m-%d %H:00:00"))] + ['"{0}"'.format(row_dict['date'])] + ["{0}".format(row_dict[col]) for col in relevant_columns] + ["{0}".format(get_season(row_dict['datetime']))]

        if len(known) < 8:
            Nnear = len(known)
        else:
            Nnear = 8

        # do the interpolation
        (interpolation_grid, interpol_name) = idw_interpol(known, z, ask, Nnear=Nnear)

        #implement for the zero mean Estimates table
        if zero_mean:
            #normalise the grid to zero mean, unit variance (nan-aware)
            interpolation_grid = interpolation_grid.flatten()
            interpolation_grid = (interpolation_grid - np.nanmean(interpolation_grid))/np.nanstd(interpolation_grid)

        #add each element to the db as a row
        for i in xrange(len(interpolation_grid)):
            total_rows += 1
            # input data into sql; the values cover every column of the table in order
            data = data_common + ["{0}".format(x) for x in [i, interpolation_grid[i]]]
            insert_str = """insert ignore into {0} values ({1}); """.format(data_table, ','.join(data))
            cursor.execute(insert_str)
        
        print "At {0}, Number of rows considered in total: {1}".format(first_date, total_rows)
        # commit at each epoch, i.e. every 10000 rows
        db.commit()
        first_date += timedelta(seconds=3600)

    db.close()
    print "No epoch count: {0} and Skip epoch counts {1}".format(no_epoch_count,skip_epoch_count)
Code example #4
        co_rozelle = fixed_samples_data[fixed_samples_data.location_name=='Rozelle']['co'].iloc[0]  if not np.isnan(fixed_samples_data[fixed_samples_data.location_name=='Rozelle']['co'].iloc[0]) else mean_fixed

        #prepare data to be inserted into svm_estimates table
        data = [fixed_samples_data['weekdays'].iloc[0], hour_feature, get_season(start_date), 0, 0, co_liverpool, co_prospect, co_chullora, co_rozelle]
    except Exception as ex:
        raise Exception("Error on {}; SQL statement is {}; error is {}\n".format(start_date, sql_string, str(ex)))

    #build the model input as a single-row array
    X = np.float64([data])


    print("date,hour,weekdays,dayoftheweek,season,grid_row,grid_col,co_liverpool,co_prospect,co_chullora,co_rozelle,co")

    #go through 100x100 grid pixels
    for i in xrange(100):
        for j in xrange(100):
            X[0][3] = i
            X[0][4] = j
            y_val = pipeline.predict(X)[0]

            insert_data = ['"{0}"'.format(start_date), '"{0}"'.format(start_date.date()), start_date.hour, fixed_samples_data['weekdays'].iloc[0], 
                    fixed_samples_data['dayoftheweek'].iloc[0], get_season(start_date), i, j, co_liverpool, co_prospect, co_chullora, co_rozelle, 5.7464*y_val+3.48652]

            print(','.join([ str(x) for x in insert_data]))

if __name__ == "__main__":
    print("Starting script")
    # execute only if run as a script
    main()
    print("Script finished!")
Code example #5
def main():
    """

    The SVM uses the known data, stored in samplesGridData, training the model on
    this data and using the resulting model to infer unknown data points. There is
    no interpolation in this method.

    """

    # Open database connection
    db = MySQLdb.connect("localhost","pollution","pollution","pollution_monitoring" )

    # prepare a cursor object using cursor() method
    cursor = db.cursor()

    # get the oldest date
    #sql_str = """select distinct date from Samples  where user_id = 2 order by date asc limit 1;"""
    # start date
    #cursor.execute(sql_str)
    #start_date = cursor.fetchone()[0]
    start_date = datetime(2013,3,1)

    # get the newest date
    #sql_str = """select distinct date from Samples  where user_id = 2 order by date desc limit 1;"""
    # end date
    #cursor.execute(sql_str)
    #end_date = cursor.fetchone()[0]
    #override 
    end_date = datetime(2015,11,1)

    #Choose the data table to use
    zero_mean = True
    non_zero_grid_count_threshold = 10

    #second pass is needed for inputting the mean and stddev for all the rows of the table
    populate_initially = True
    populate_second_pass = True


    #table which has data inserted for model training
    data_table = "samplesGridData"

    #epochs are the time periods to iterate over
    #provide some buffer time (extra epoch)
    epochs =  ((end_date - start_date).days*24 + 1)
    
    print "Start data and end date: {0} to {1}".format(start_date, end_date)
    print "Number of hours of data: {0}".format(epochs)
    
    first_date = start_date
    total_rows = 0
    no_epoch_count = 0
    skip_epoch_count = 0

    #populate the first stage of the process for samplesGridData
    if populate_initially:
        for _ in xrange(epochs):
            #do a quick check to see if data for a datetime exists, skip if it does
            sql_str = """ select datetime from {0} where datetime="{1}" limit 1;""".format(data_table, first_date)
            cursor.execute(sql_str)
            if cursor.rowcount > 0:
                skip_epoch_count += 1
                first_date += timedelta(seconds=3600)
                continue

            #is there sensor data, skip if not
            select_str = """select * from Samples where user_id != 2 and date like "{0}%" and co < 60 and co > 0 limit 1;""".format(first_date.strftime("%Y-%m-%d %H"))
            cursor.execute(select_str)
            if cursor.rowcount == 0:
                skip_epoch_count += 1
                print "Skipped {0} due to lack of sensor data".format(first_date)
                first_date += timedelta(seconds=3600)
                continue
        
            #get data for an hour
            select_str = """SELECT 
                                date as datetime, DATE_FORMAT(date,"%Y-%m-%d") AS date, DATE_FORMAT(date,"%H") as time, if(WEEKDAY(date)<5, true, false) AS weekdays, WEEKDAY(date) AS dayoftheweek, latitude, longitude, user_id, co 
                            FROM 
                                Samples 
                            WHERE 
                                user_id != 2 and date between "{0}" and date_add("{0}", interval 1 hour) and co is not null and latitude is not null and longitude is not null AND (latitude <= {1} AND latitude >= {2}) AND (longitude >= {3} AND longitude <= {4}) AND co > 0 AND co < 60
                            ORDER BY
                                date asc """.format(first_date, NW_BOUND[0], SW_BOUND[0], NW_BOUND[1], NE_BOUND[1])

            df_mysql = data_from_db(select_str, verbose=True, exit_on_zero=False)
            if df_mysql is None:
                print "No data returned for {0}".format(first_date)
                no_epoch_count += 1
                first_date += timedelta(seconds=3600)
                continue

            #check the number of bins or grid locations populated
            _, non_zero_grid_count = create_mean_value_grid(df_mysql)

            #discount grid if it doesn't have enough pixels (i.e. less than threshold)
            if non_zero_grid_count < non_zero_grid_count_threshold:
                skip_epoch_count += 1
                print "Skipped {0} due to non zero grid count less than threshold".format(first_date)
                first_date += timedelta(seconds=3600)
                continue

            #interpolate to get a grid
            known, z, ask, _ = gridify_sydney(df_mysql, verbose=False, heatmap=False)
            
            if len(known) == 0:
                raise Exception("No data for {0}".format(first_date))

            columns = df_mysql.columns.values
            vals = list(df_mysql.iloc[0])
            row_dict = dict(zip(columns, vals))
            relevant_columns = ['time','weekdays','dayoftheweek']
            data_common = ['"{0}"'.format(row_dict['datetime'].strftime("%Y-%m-%d %H:00:00"))] + ['"{0}"'.format(row_dict['date'])] + ["{0}".format(row_dict[col]) for col in relevant_columns] + ["{0}".format(get_season(row_dict['datetime']))]

            if zero_mean:
                select_str = 'select date, location_name, co  from Samples where user_id=2 and date="{0}" and (location_name="Prospect" or location_name="Rozelle" or location_name="Liverpool" or location_name="Chullora") order by location_name;'.format(row_dict['datetime'].strftime("%Y-%m-%d %H:00:00"))
                fixed_samples_data = data_from_db(select_str, verbose=False, exit_on_zero=False)
                assert fixed_samples_data is not None and len(set(fixed_samples_data.location_name)) == 4
                FIXED_LOCATIONS = ['Chullora', 'Liverpool', 'Prospect', 'Rozelle']
                #first reading per station; fall back to the mean across stations when a value is NaN
                station_co = {location: fixed_samples_data[fixed_samples_data.location_name==location]['co'].iloc[0] for location in FIXED_LOCATIONS}
                mean_fixed = np.nanmean(station_co.values())
                co_chullora = station_co['Chullora'] if not np.isnan(station_co['Chullora']) else mean_fixed
                co_liverpool = station_co['Liverpool'] if not np.isnan(station_co['Liverpool']) else mean_fixed
                co_prospect = station_co['Prospect'] if not np.isnan(station_co['Prospect']) else mean_fixed
                co_rozelle = station_co['Rozelle'] if not np.isnan(station_co['Rozelle']) else mean_fixed

                for i, _ in enumerate(z):
                    total_rows += 1
                    # input data into sql
                    grid_location_row, grid_location_col = known[i]
                    data = data_common + ["{0}".format(x) for x in [grid_location_row, grid_location_col, co_chullora, co_liverpool, co_prospect, co_rozelle, z[i]]]
                    #print data
                    insert_str = """insert ignore into {0} (datetime, date, time, weekdays, dayoftheweek, season, grid_location_row, grid_location_col, co_chullora, co_liverpool, co_prospect, co_rozelle, co_original) values ({1}); """.format(data_table, ','.join(data))
                    try:
                        cursor.execute(insert_str)
                    except MySQLdb.Error:
                        print insert_str
                        pdb.set_trace()
            else:
                raise Exception("zero_mean should always be set when populating this table")
            
            print "At {0}, Number of rows considered in total: {1}".format(first_date, total_rows)
            # commit
            db.commit()
            first_date += timedelta(seconds=3600)

    #after all the rows have been populated with the original co, we need to populate the normalised value, mean and std
    if populate_second_pass:
        select_str = """ select * from {0};""".format(data_table)
        df_mysql = data_from_db(select_str, verbose=True, exit_on_zero=False)
        co_mean, co_stddev = df_mysql['co_original'].mean(), df_mysql['co_original'].std(ddof=0)
        df_mysql['co_mean'] = co_mean
        df_mysql['co_stddev'] = co_stddev
        df_mysql['co'] = (df_mysql['co_original']-co_mean)/co_stddev

        for index, row in df_mysql.iterrows():
            update_sql = "UPDATE {0} SET co={1}, co_mean={2}, co_stddev={3} WHERE datetime='{4}' AND grid_location_row={5} AND grid_location_col={6}".format(data_table, row['co'], row['co_mean'], row['co_stddev'], row['datetime'], row['grid_location_row'], row['grid_location_col'])
            cursor.execute(update_sql)
        db.commit()

    db.close()
    print "No epoch count: {0} and Skip epoch counts {1}".format(no_epoch_count,skip_epoch_count)
Code example #6
def main(granularity, start_date, end_date):
    """

    The SVM uses the known data, stored in samplesGridData, training the model on
    this data and using the resulting model to infer unknown data points. There is
    no interpolation in this method.

    """

    global total_rows, skip_epoch_count, non_zero_grid_count, no_epoch_count

    # Open database connection
    #db = MySQLdb.connect("localhost","pollution","pollution","pollution_monitoring" )

    # prepare a cursor object using cursor() method
    #cursor = db.cursor()

    non_zero_grid_count_threshold = 1

    errors = []

    #second pass is needed for inputting the mean and stddev for all the rows of the table
    populate_initially = True
    populate_second_pass = False

    interval = granularity['interval']
    data_table = granularity['data_table']
    interval_period = granularity['interval_period']
    date_format = granularity["date_format"]
    epoch_variable = granularity['epoch_variable']

    target_datetimes = get_time_periods_with_sensor_data(start_date, end_date, data_table, interval)

    #epochs are time period to iterate over
    epochs =  len(target_datetimes)

    print "Start data and end date: {} to {}".format(start_date, end_date)
    print "Number of time periods of data: {}, granularity is {}".format(epochs, interval)

    db = MySQLdb.connect("localhost","pollution","pollution","pollution_monitoring" )
    cursor = db.cursor()


    #populate the first stage of the process for samplesGridData
    if populate_initially:
        for target_datetime in target_datetimes:
            target_datetime = datetime.strptime(target_datetime, '%Y-%m-%d %H:%M')
            #get data for a time period
            select_str = """SELECT 
                                date as datetime, DATE_FORMAT(date,"%Y-%m-%d") AS date, 
                                DATE_FORMAT(date,"%H") as hour, 
                                DATE_FORMAT(date,"%i") as minute, 
                                if(WEEKDAY(date)<5, true, false) AS weekdays, 
                                WEEKDAY(date) AS dayoftheweek, 
                                latitude, longitude, user_id, co 
                            FROM 
                                Samples 
                            WHERE 
                                user_id != 2 AND date between "{0}" 
                                AND DATE_ADD("{0}", INTERVAL {5} SECOND) 
                                AND co is not null 
                                and latitude is not null and longitude is not null 
                                AND (latitude <= {1} AND latitude >= {2}) 
                                AND (longitude >= {3} AND longitude <= {4}) 
                                AND co > 0 AND co < 60
                            ORDER BY
                                date asc """.format(
                                    target_datetime, 
                                    NW_BOUND[0], SW_BOUND[0], NW_BOUND[1], NE_BOUND[1], 
                                    interval_period
                                )

            df_mysql = data_from_db(select_str, verbose=True, exit_on_zero=False)
            if df_mysql is None:
                print "No data returned for {0}".format(target_datetime)
                no_epoch_count += 1
                continue

            #check the number of bins or grid locations populated
            _, non_zero_grid_count = create_mean_value_grid(df_mysql)

            #discount grid if it doesn't have enough pixels (i.e. less than threshold)
            if non_zero_grid_count < non_zero_grid_count_threshold:
                skip_epoch_count += 1
                print "Skipped {0} due to non zero grid count less than threshold".format(target_datetime)
                continue

            #interpolate to get a grid
            known, z, ask, _ = gridify_sydney(df_mysql, verbose=False, heatmap=False)
            
            if len(known) == 0:
                raise Exception("No data for {0}".format(target_datetime))

            columns = df_mysql.columns.values
            vals = list(df_mysql.iloc[0])
            row_dict = dict(zip(columns, vals))
            relevant_columns = ['hour','minute', 'weekdays','dayoftheweek']
            data_common = ['"{}"'.format(row_dict['datetime'].strftime("%Y-%m-%d %H:00:00"))] + \
                    ['"{}"'.format(row_dict['date'])] + \
                    ["{}".format(row_dict[col]) for col in relevant_columns] + \
                    ["{}".format(get_season(row_dict['datetime']))]

            # hour always needs to be used here to retrieve fixed station values
            select_str = """select
                              date, location_name, co
                          from
                              Samples
                          where
                              user_id=2 and date="{0}" and
                              (location_name="Prospect" or location_name="Rozelle"
                              or location_name="Liverpool" or location_name="Chullora")
                          order by
                              location_name;""".format(row_dict['datetime'].strftime("%Y-%m-%d %H:00:00"))

            fixed_samples_data = data_from_db(select_str, verbose=False, exit_on_zero=False)

            try:
                assert len(set(fixed_samples_data.location_name)) == 4
            except AssertionError:
                print "error: 4 fixed station values not found for {}".format(target_datetime)
                errors.append(target_datetime)
                continue

            FIXED_LOCATIONS = ['Chullora', 'Liverpool', 'Prospect', 'Rozelle']
            #first reading per station; fall back to the mean across stations when a value is NaN
            station_co = {location: fixed_samples_data[fixed_samples_data.location_name==location]['co'].iloc[0] for location in FIXED_LOCATIONS}
            mean_fixed = np.nanmean(station_co.values())

            co_chullora = station_co['Chullora'] if not np.isnan(station_co['Chullora']) else mean_fixed
            co_liverpool = station_co['Liverpool'] if not np.isnan(station_co['Liverpool']) else mean_fixed
            co_prospect = station_co['Prospect'] if not np.isnan(station_co['Prospect']) else mean_fixed
            co_rozelle = station_co['Rozelle'] if not np.isnan(station_co['Rozelle']) else mean_fixed

            for i, _ in enumerate(z):
                total_rows += 1
                # input data into sql
                grid_location_row, grid_location_col = known[i]
                data = data_common + \
                        ["{0}".format(x) for x in [grid_location_row, grid_location_col, co_chullora, co_liverpool, co_prospect, co_rozelle, z[i]]]
                
                insert_str = """
                                 insert ignore into {0} 
                                     (datetime, date, hour, minute, weekdays, 
                                     dayoftheweek, season, 
                                     grid_location_row, 
                                     grid_location_col, 
                                     co_chullora, co_liverpool, co_prospect, co_rozelle, co_original) 
                                 values 
                                     ({1}); 
                             """.format(data_table, ','.join(data))
                try:
                    cursor.execute(insert_str)
                except MySQLdb.Error:
                    print insert_str
                    pdb.set_trace()
            
            print "At {0}, Number of rows considered in total: {1}".format(target_datetime, total_rows)
            # commit
            db.commit()

    print "No epoch count: {0} and Skip epoch counts {1}".format(no_epoch_count,skip_epoch_count)
    print "dates with no complete fixed station data are {}".format(errors)

    # after all the rows have been populated with the original co, 
    # we need to populate the normalised value, mean and std
    if populate_second_pass:
        select_str = """ select * from {};""".format(data_table)
        df_mysql = data_from_db(select_str, verbose=True, exit_on_zero=False)
        if df_mysql is None:
            print "no rows in {}. Script completed".format(data_table)
            return
        co_mean, co_stddev = df_mysql['co_original'].mean(), df_mysql['co_original'].std(ddof=0)
        df_mysql['co_mean'] = co_mean
        df_mysql['co_stddev'] = co_stddev
        df_mysql['co'] = (df_mysql['co_original']-co_mean)/co_stddev

        for index, row in df_mysql.iterrows():
            update_sql = """
            UPDATE 
                {0} 
            SET 
                co={1}, co_mean={2}, co_stddev={3} 
            WHERE 
                datetime='{4}' AND grid_location_row={5} AND grid_location_col={6}
            """.format(data_table, row['co'], row['co_mean'], row['co_stddev'], row['datetime'], row['grid_location_row'], row['grid_location_col'])
            cursor.execute(update_sql)
        db.commit()

    db.close()
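
For reference, a hypothetical granularity configuration and call matching the keys the function reads; every value here is an illustrative assumption:

from datetime import datetime

#hypothetical hourly configuration; the keys match those read in main()
hourly = {
    "interval": "hour",
    "data_table": "samplesGridData",
    "interval_period": 3600,           #seconds covered by one epoch
    "date_format": "%Y-%m-%d %H:%M",
    "epoch_variable": "hour",
}

main(hourly, datetime(2013, 3, 1), datetime(2015, 11, 1))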