Esempio n. 1
0
def add_single_login(login_timestamp):
    """Loads one client login data point i.e. "2012-03-01T00:05:55+00:00",
    into the database.  If hour entry exists, adds 1 to existing value.
    Returns error message if anything goes wrong.
    """
    login_dt = defo.validate_login_string(login_timestamp)
    if login_dt is None:
        return { 'error': 'Invalid timestamp', 
             'timestamp_example': '2012-03-01T00:05:55+00:00' }
    db = dbh.get_db()
    cur = db.cursor()
    cur.execute('SELECT * FROM login_history WHERE id=?', (login_dt,))
    match = cur.fetchone()
    print login_dt
    print 'Match is:'
    print match
    added_login = {}
    if match:
        # Update hour entry, add 1 to existing value
        print match['num_logins']
        cur.execute('UPDATE login_history SET num_logins=? WHERE id=?', (1+match['num_logins'], login_dt))
        added_login['update'] = 1
    else:
        # Entry does not exist
        cur.execute('INSERT INTO login_history ' + \
            '(id, day_name, hour, num_logins) ' + \
            'values (?, ?, ?, ?)', \
            (login_dt,defo.get_day_2char(login_dt), defo.get_hour(login_dt),1))
        added_login['insert'] = 1
    db.commit()
    added_login['timestamp'] = login_timestamp
    return added_login
Esempio n. 2
0
def add_single_login(login_timestamp):
    """Loads one client login data point i.e. "2012-03-01T00:05:55+00:00",
    into the database.  If hour entry exists, adds 1 to existing value.
    Returns error message if anything goes wrong.
    """
    login_dt = defo.validate_login_string(login_timestamp)
    if login_dt is None:
        return {
            'error': 'Invalid timestamp',
            'timestamp_example': '2012-03-01T00:05:55+00:00'
        }
    db = dbh.get_db()
    cur = db.cursor()
    cur.execute('SELECT * FROM login_history WHERE id=?', (login_dt, ))
    match = cur.fetchone()
    print login_dt
    print 'Match is:'
    print match
    added_login = {}
    if match:
        # Update hour entry, add 1 to existing value
        print match['num_logins']
        cur.execute('UPDATE login_history SET num_logins=? WHERE id=?',
                    (1 + match['num_logins'], login_dt))
        added_login['update'] = 1
    else:
        # Entry does not exist
        cur.execute('INSERT INTO login_history ' + \
            '(id, day_name, hour, num_logins) ' + \
            'values (?, ?, ?, ?)', \
            (login_dt,defo.get_day_2char(login_dt), defo.get_hour(login_dt),1))
        added_login['insert'] = 1
    db.commit()
    added_login['timestamp'] = login_timestamp
    return added_login
Esempio n. 3
0
def mark_outlier(outlier_id, reason='DefaultOutlier'):
    """Add analyzed (manual input) insights to database, marking outliers
    to ignore from predictions"""
    print "Marking outlier: %s" % outlier_id
    # Error check user input
    outlier_id = defo.validate_id(outlier_id)
    if outlier_id is None:
        return "Outlier ID Format Invalid"

    db = dbh.get_db()
    cur = db.cursor()
    cur.execute('SELECT * FROM login_history WHERE id=?', (outlier_id, ))
    match = cur.fetchone()
    if not match:
        return "ID not in database"
    else:
        print "Outlier Demand=%d" % (match['num_logins'])
        cur.execute('SELECT * FROM history_outliers WHERE id=?',
                    (outlier_id, ))
        match = cur.fetchone()
        if match:
            # Replace matching entry in outlier table
            #print 'Updating %s in Outlier DB' % (outlier_id,)
            cur.execute('UPDATE history_outliers SET reason=? WHERE id=?',
                        (str(reason), outlier_id))
        else:
            #print 'Adding %s in Outlier DB' % (str(outlier_id))
            cur.execute(
                'INSERT INTO history_outliers (id, reason) values (?, ?)',
                (outlier_id, str(reason)))
        db.commit()
    return None
Esempio n. 4
0
def mark_outlier(outlier_id, reason='DefaultOutlier'):
    """Add analyzed (manual input) insights to database, marking outliers
    to ignore from predictions"""
    print "Marking outlier: %s" % outlier_id
    # Error check user input
    outlier_id = defo.validate_id(outlier_id)
    if outlier_id is None:
        return "Outlier ID Format Invalid"
    
    db = dbh.get_db()
    cur = db.cursor()
    cur.execute('SELECT * FROM login_history WHERE id=?',(outlier_id,))
    match = cur.fetchone()
    if not match:
        return "ID not in database"
    else:
        print "Outlier Demand=%d"%(match['num_logins'])
        cur.execute('SELECT * FROM history_outliers WHERE id=?',(outlier_id,))
        match = cur.fetchone()
        if match:
            # Replace matching entry in outlier table
            #print 'Updating %s in Outlier DB' % (outlier_id,)
            cur.execute('UPDATE history_outliers SET reason=? WHERE id=?', (str(reason), outlier_id))
        else:
            #print 'Adding %s in Outlier DB' % (str(outlier_id))
            cur.execute('INSERT INTO history_outliers (id, reason) values (?, ?)', (outlier_id, str(reason)))
        db.commit()
    return None
Esempio n. 5
0
def fill_missing_hours():
    """Reads login data from database and fills in any missing hours.
    Inserts new entries with number of login counts set to 0."""
    db = dbh.get_db()
    cur = db.cursor()
    cur.execute('SELECT id FROM login_history ORDER BY id ASC')
    all_data = cur.fetchall()
    if all_data:
        hours_missing = False
        prev_id = defo.add_x_hours(all_data[0]['id'], -1) # start at 1 hour before first entry
        for hour in all_data:
            if hour['id'] != defo.add_x_hours(prev_id, 1):
                print 'prev: %s'%(prev_id)
                print 'next: %s'%(hour['id'])
                if defo.dy_subtract_ids(hour['id'], prev_id) < 3:
                    # Only insert 0's for missing entries when the gap
                    # between data is less than 3 days
                    missing_hour = defo.add_x_hours(prev_id, 1)
                    while missing_hour < hour['id']:
                        cur.execute('INSERT INTO login_history ' + \
                            '(id, day_name, hour, num_logins) ' + \
                            'values (?, ?, ?, ?)', \
                            (missing_hour, defo.get_day_2char(missing_hour), \
                             defo.get_hour(missing_hour), 0))
                        missing_hour = defo.add_x_hours(missing_hour, 1)
                    hours_missing = True
            prev_id = hour['id']
        if hours_missing:
            db.commit()
Esempio n. 6
0
def fill_missing_hours():
    """Reads login data from database and fills in any missing hours.
    Inserts new entries with number of login counts set to 0."""
    db = dbh.get_db()
    cur = db.cursor()
    cur.execute('SELECT id FROM login_history ORDER BY id ASC')
    all_data = cur.fetchall()
    if all_data:
        hours_missing = False
        prev_id = defo.add_x_hours(all_data[0]['id'],
                                   -1)  # start at 1 hour before first entry
        for hour in all_data:
            if hour['id'] != defo.add_x_hours(prev_id, 1):
                print 'prev: %s' % (prev_id)
                print 'next: %s' % (hour['id'])
                if defo.dy_subtract_ids(hour['id'], prev_id) < 3:
                    # Only insert 0's for missing entries when the gap
                    # between data is less than 3 days
                    missing_hour = defo.add_x_hours(prev_id, 1)
                    while missing_hour < hour['id']:
                        cur.execute('INSERT INTO login_history ' + \
                            '(id, day_name, hour, num_logins) ' + \
                            'values (?, ?, ?, ?)', \
                            (missing_hour, defo.get_day_2char(missing_hour), \
                             defo.get_hour(missing_hour), 0))
                        missing_hour = defo.add_x_hours(missing_hour, 1)
                    hours_missing = True
            prev_id = hour['id']
        if hours_missing:
            db.commit()
Esempio n. 7
0
def add_multiple_logins(login_data):
    login_dict = defo.datetimes_to_dict(login_data)
    if not login_dict:
        return { 'error': 'No valid timestamps', 
            'timestamps_example': '["2012-03-01T00:05:55+00:00", "2012-03-01T00:06:23+00:00"]'}
    else:
        latest_dt = None
        db = dbh.get_db()
        cur = db.cursor()
        added_logins = {}
        for id_str,hour in login_dict.items():
            #print "Read in hour: %s"%(id_str)
            cur_hour = len(hour) # simple count of logins in this hour
            cur.execute('SELECT * FROM login_history WHERE id=?',(id_str,))
            match = cur.fetchone()
            if match:
                print 'Updating hour count: %d + %d' % (match['num_logins'], cur_hour)
                cur.execute('UPDATE login_history SET num_logins=? WHERE id=?', (match['num_logins']+cur_hour, id_str))
                added_logins['update'] = added_logins.get('update',0) + 1
            else:
                print 'Adding %s with %d logins' % (id_str,len(hour))
                cur.execute('INSERT INTO login_history ' + \
                    '(id, day_name, hour, num_logins) ' + \
                    'values (?, ?, ?, ?)', \
                    (id_str, defo.get_day_2char(id_str), defo.get_hour(id_str), cur_hour))
                added_logins['insert'] = added_logins.get('insert',0) + 1
        # Commit changes
        db.commit()
        added_logins['timestamps'] = login_dict.keys()
        return added_logins
Esempio n. 8
0
def predict_demand(year,month,day,num_days,enable_plots=None):
    """Predict hourly login demand for num_days days starting (year, month, day).

    Fits per-hour linear regressions over login_history (via
    depr.lin_reg_by_hour, with history_outliers excluded), scales any
    hour flagged in prediction_outliers by its multiplier, and writes
    each day's 24 predictions into login_predictions (INSERT or
    REPLACE).  Returns a dict {hour_id: predicted_logins} on success,
    or an {'error': ...} dict when there is no or insufficient data.
    enable_plots is accepted but not used inside this function.
    """
    print "Predicting Demand for %d days starting on %d/%d/%d" % (num_days,month,day,year)
    db = dbh.get_db()
    cur = db.cursor()
    # For now (smaller dataset), loading all 3 tables in memory is not a problem
    cur.execute('SELECT * FROM history_outliers')
    outlier_data = cur.fetchall()
    cur.execute('SELECT * FROM prediction_outliers')
    predicted_outlier_data = cur.fetchall()
    cur.execute('SELECT * FROM login_history ORDER BY id ASC')
    all_data = cur.fetchall()
    if not all_data:
        return {'error':'No data in login_history DB'}
    # Require at least a full week of hourly history before predicting.
    if len(all_data) < 7*24:
        return {'error':'Not enough data to accurately predict demand'}
    predicted_ids,predictions,predicted_slopes=depr.lin_reg_by_hour(all_data,outlier_data)
    cur_pred_id = defo.get_id_str(year, month, day, 0)
    end_pred_id = defo.add_x_hours(cur_pred_id,24*(num_days+1))
    delta_days = defo.dy_delta_days(predicted_ids[0],cur_pred_id)
    # Filter predicted outlier ids to those within prediction timespan
    ol_dict = {}
    demand_predictions = {}
    if predicted_outlier_data:
        # Build {id: multiplier} for outliers in [cur_pred_id, end_pred_id).
        map(lambda y: ol_dict.update({str(y['id']):float(y['multiplier'])}), \
            filter(lambda x: x['id']>=cur_pred_id and x['id']<end_pred_id, predicted_outlier_data))
    for count in range(num_days):
        # Calculate the number of weeks to extrapolate on
        # NOTE(review): '/' binds tighter than '+', so this evaluates as
        # delta_days + (count/7) (integer division).  If whole weeks since
        # the regression window were intended, int((delta_days+count)/7)
        # may have been meant -- confirm before changing.
        extrap_weeks = int(delta_days+count/7)
        #print 'Predicting %s, %d weeks from (%s,%s) predictions'%(cur_pred_id,extrap_weeks,predicted_ids[0],predicted_ids[-1])
        pred_day_str = defo.get_day_str(year, month, day)
        prediction_list = []
        pred_id_list = []
        pred_data = []
        for hour in range(24):
            cur_pred_id = defo.get_id_str(year, month, day, hour)
            # Index into the hour-of-week model (wraps modulo 24*7 hours).
            offset = int(defo.hr_subtract_ids(cur_pred_id,predicted_ids[0])%(24*7))
            # Linear extrapolation: base prediction + weeks * slope.
            prediction = predictions[offset] + extrap_weeks*predicted_slopes[offset]
            if cur_pred_id in ol_dict.keys():
                # Scale by the manually-entered outlier multiplier.
                prediction *= ol_dict[cur_pred_id]
                #print 'Predicted (%fx) Multiplier'%ol_dict[cur_pred_id]
            #print 'Prediction ID: %s, Logins: %f (%s: %fx%dWeeks + %f)'%(cur_pred_id,prediction,
            #    predicted_ids[offset],predicted_slopes[offset],extrap_weeks,predictions[offset])
            pred_data.append((cur_pred_id,prediction))
            demand_predictions[cur_pred_id] = prediction
        # Add to database, doing predictions on a day at a time basis (always 24 entries/hours)
        cur.executemany("INSERT or REPLACE into login_predictions (id, num_logins) values (?, ?)",\
            pred_data)
        # Move to next day
        year, month, day = defo.tp_add_x_days(year, month, day, 1)
    db.commit()
    return demand_predictions
Esempio n. 9
0
def clear_existing_predictions(year, month, day):
    """Delete all predictions associated with the input day
    from the login_predictions database"""
    if defo.validate_id(defo.get_id_str(year,month,day,0)) is not None:
        db = dbh.get_db()
        cur = db.cursor()
        cur.execute("DELETE FROM login_predictions WHERE id LIKE '",\
            (str(defo.get_id_str(year, month, day, 00))[:-2]+"__'",))
        db.commit()
    else:
        print "Invalid Input to clear_existing_predictions"
Esempio n. 10
0
def clear_existing_predictions(year, month, day):
    """Delete all predictions associated with the input day
    from the login_predictions database"""
    if defo.validate_id(defo.get_id_str(year, month, day, 0)) is not None:
        db = dbh.get_db()
        cur = db.cursor()
        cur.execute("DELETE FROM login_predictions WHERE id LIKE '",\
            (str(defo.get_id_str(year, month, day, 00))[:-2]+"__'",))
        db.commit()
    else:
        print "Invalid Input to clear_existing_predictions"
Esempio n. 11
0
def mark_predicted_outlier(outlier_id, multiplier, reason='DefaultOutlier'):
    """Add analyzed (manual input) insights to database, marking predicted
    future outliers to adjust predictions"""
    # Error check user input
    outlier_id = defo.validate_id(outlier_id)
    if outlier_id is None:
        return "Outlier ID Format Invalid"
    multiplier = float(multiplier)
    print "Marking predicted (future) outlier: %s, with %f multiplier" %(outlier_id,multiplier)
    
    db = dbh.get_db()
    cur = db.cursor()
    cur.execute("INSERT or REPLACE into prediction_outliers (id, multiplier, reason) values (?, ?, ?)",\
            (outlier_id, float(multiplier), reason))
    db.commit()
    return None
Esempio n. 12
0
def delete_predictions_with_actuals():
    """Remove predictions for hours that now have real login data.

    Any id present in both login_predictions and login_history is
    deleted from login_predictions, since actual data supersedes it.
    Commits once after all deletions.
    """
    db = dbh.get_db()
    cur = db.cursor()
    cur.execute("SELECT id FROM login_predictions")
    pred_ids = cur.fetchall()
    cur.execute("SELECT id FROM login_history ORDER BY id ASC")
    hist_ids = cur.fetchall()
    if hist_ids:
        # Use a set for O(1) membership tests (the original list scan was
        # O(n) per prediction) and avoid shadowing the builtin id().
        actual_ids = set(x['id'] for x in hist_ids)
        for pred in pred_ids:
            if pred['id'] in actual_ids:
                cur.execute("DELETE FROM login_predictions WHERE id=?",
                            (pred['id'],))
        db.commit()
Esempio n. 13
0
def delete_predictions_with_actuals():
    """Drop predicted hours that already have actual login data.

    Every id appearing in both login_predictions and login_history is
    removed from login_predictions; real measurements win.  Commits
    once at the end.
    """
    db = dbh.get_db()
    cur = db.cursor()
    cur.execute("SELECT id FROM login_predictions")
    pred_ids = cur.fetchall()
    cur.execute("SELECT id FROM login_history ORDER BY id ASC")
    hist_ids = cur.fetchall()
    if hist_ids:
        # Set membership is O(1) vs the original O(n) list scan per
        # prediction; also stop shadowing the builtin id().
        actual_ids = set(x['id'] for x in hist_ids)
        for pred in pred_ids:
            if pred['id'] in actual_ids:
                cur.execute("DELETE FROM login_predictions WHERE id=?",
                            (pred['id'],))
        db.commit()
Esempio n. 14
0
def api_predict(num_days_to_predict):
    """Return predicted login counts currently stored in the database.

    If num_days_to_predict is None, all predictions are returned.
    Otherwise only predictions covering that many days (counted from
    the earliest predicted id) are returned; the count must be in
    1..99.  Returns a dict mapping hour id to predicted login count,
    or an {'error': ...} dict.
    """
    if num_days_to_predict is not None:
        if num_days_to_predict <= 0:
            return {'error': 'Number of days to predict must be positive'}
        if num_days_to_predict >= 100:
            return {'error': 'Cannot predict more than 99 days forward'}

    try:
        db = dbh.get_db()
        cur = db.cursor()
        if num_days_to_predict is not None:
            cur.execute('SELECT id FROM login_predictions ORDER BY id ASC')
            first_pred = cur.fetchone()  # earliest prediction id
            if not first_pred:
                return {'error': 'No predictions in DB - try to PUT api/predict resource first'}
            # Exclusive upper-bound id for the requested span.
            last_year, last_month, last_day = defo.tp_add_x_days_to_id(
                first_pred['id'], num_days_to_predict)
            last_pred = defo.get_id_str(last_year, last_month, last_day, 0)
            cur.execute('SELECT id, num_logins FROM login_predictions '
                        'WHERE id<? ORDER BY id ASC', (last_pred,))
            predictions = cur.fetchall()
        else:
            cur.execute('SELECT id, num_logins FROM login_predictions '
                        'ORDER BY id ASC')
            predictions = cur.fetchall()
        if predictions:
            # Build {id: num_logins} with a plain loop (the original
            # abused a list comprehension for its side effects).
            pred_dict = {}
            for pred in predictions:
                pred_dict[pred['id']] = pred['num_logins']
            return pred_dict
        return {'error': 'No predictions in DB - try to PUT api/predict resource first'}
    except ValueError:
        # Date arithmetic failed on a malformed id; report as no data.
        return {'error': 'No predictions in DB - try to PUT api/predict resource first'}
    # NOTE: the original had dead code here (a 'return predictions' after an
    # unconditional return, plus a login_history/predict_demand block after
    # the try/except whose every path returns); it has been removed.
Esempio n. 15
0
def run_analytics(debug=1):
    """Runs linear regression and smoothing models, outlier identification, and
    stores results in database for prediction to use.
    Pass empty array [] to turn off debug printouts"""
    db = dbh.get_db()
    cur = db.cursor()
    cur.execute('SELECT * FROM login_history ORDER BY id ASC')
    all_data = cur.fetchall()
    cur.execute('SELECT * FROM history_outliers')
    outlier_data = cur.fetchall()
    cur.execute('SELECT * FROM prediction_outliers')
    predicted_outlier_data = cur.fetchall()
    if not all_data:
        print "No data loaded in DB"
        return
    predicted_ids,predictions,predicted_slopes=depr.lin_reg_by_hour(all_data,outlier_data,debug)
    depl.scatter_plot(range(len(predicted_slopes)),predicted_slopes,'Predicted_Slopes','Hour','Slope',predicted_ids[-1])
Esempio n. 16
0
def mark_predicted_outlier(outlier_id, multiplier, reason='DefaultOutlier'):
    """Add analyzed (manual input) insights to database, marking predicted
    future outliers to adjust predictions"""
    # Error check user input
    outlier_id = defo.validate_id(outlier_id)
    if outlier_id is None:
        return "Outlier ID Format Invalid"
    multiplier = float(multiplier)
    print "Marking predicted (future) outlier: %s, with %f multiplier" % (
        outlier_id, multiplier)

    db = dbh.get_db()
    cur = db.cursor()
    cur.execute("INSERT or REPLACE into prediction_outliers (id, multiplier, reason) values (?, ?, ?)",\
            (outlier_id, float(multiplier), reason))
    db.commit()
    return None
Esempio n. 17
0
def run_analytics(debug=1):
    """Runs linear regression and smoothing models, outlier identification, and
    stores results in database for prediction to use.
    Pass empty array [] to turn off debug printouts"""
    db = dbh.get_db()
    cur = db.cursor()
    cur.execute('SELECT * FROM login_history ORDER BY id ASC')
    all_data = cur.fetchall()
    cur.execute('SELECT * FROM history_outliers')
    outlier_data = cur.fetchall()
    cur.execute('SELECT * FROM prediction_outliers')
    predicted_outlier_data = cur.fetchall()
    if not all_data:
        print "No data loaded in DB"
        return
    predicted_ids, predictions, predicted_slopes = depr.lin_reg_by_hour(
        all_data, outlier_data, debug)
    depl.scatter_plot(range(len(predicted_slopes)), predicted_slopes,
                      'Predicted_Slopes', 'Hour', 'Slope', predicted_ids[-1])
Esempio n. 18
0
def api_update_predictions(num_days_to_predict):
    """Refresh predictions starting the day after the newest real data.

    Removes stale predictions that now overlap actual logins, loads the
    predetermined outliers, then predicts num_days_to_predict days
    (must be in 1..99).  Returns predict_demand's result or an error
    dict.
    """
    if num_days_to_predict <= 0:
        return {'error': 'Number of days to predict must be positive'}
    if num_days_to_predict >= 100:
        return {'error': 'Cannot predict more than 99 days forward'}
    delete_predictions_with_actuals()
    mark_predetermined_outliers()
    db = dbh.get_db()
    cursor = db.cursor()
    cursor.execute('SELECT id FROM login_history ORDER BY id DESC')
    newest = cursor.fetchone()  # latest actual hour; predictions start next day
    if not newest:
        return {'error': 'No data in login_history DB'}
    next_year, next_month, next_day = defo.tp_add_x_days_to_id(newest['id'], 1)
    return predict_demand(next_year, next_month, next_day,
                          num_days_to_predict)
Esempio n. 19
0
def add_multiple_logins(login_data):
    login_dict = defo.datetimes_to_dict(login_data)
    if not login_dict:
        return {
            'error':
            'No valid timestamps',
            'timestamps_example':
            '["2012-03-01T00:05:55+00:00", "2012-03-01T00:06:23+00:00"]'
        }
    else:
        latest_dt = None
        db = dbh.get_db()
        cur = db.cursor()
        added_logins = {}
        for id_str, hour in login_dict.items():
            #print "Read in hour: %s"%(id_str)
            cur_hour = len(hour)  # simple count of logins in this hour
            cur.execute('SELECT * FROM login_history WHERE id=?', (id_str, ))
            match = cur.fetchone()
            if match:
                print 'Updating hour count: %d + %d' % (match['num_logins'],
                                                        cur_hour)
                cur.execute('UPDATE login_history SET num_logins=? WHERE id=?',
                            (match['num_logins'] + cur_hour, id_str))
                added_logins['update'] = added_logins.get('update', 0) + 1
            else:
                print 'Adding %s with %d logins' % (id_str, len(hour))
                cur.execute('INSERT INTO login_history ' + \
                    '(id, day_name, hour, num_logins) ' + \
                    'values (?, ?, ?, ?)', \
                    (id_str, defo.get_day_2char(id_str), defo.get_hour(id_str), cur_hour))
                added_logins['insert'] = added_logins.get('insert', 0) + 1
        # Commit changes
        db.commit()
        added_logins['timestamps'] = login_dict.keys()
        return added_logins
Esempio n. 20
0
def api_update_predictions(num_days_to_predict):
    """Rebuild predictions from the day following the latest actual data.

    Validates the requested span (1..99 days), drops predictions now
    covered by actuals, loads predetermined outliers, and delegates to
    predict_demand.  Returns its result or an error dict.
    """
    if num_days_to_predict <= 0:
        return {'error': 'Number of days to predict must be positive'}
    if num_days_to_predict >= 100:
        return {'error': 'Cannot predict more than 99 days forward'}
    delete_predictions_with_actuals()
    mark_predetermined_outliers()
    db = dbh.get_db()
    cursor = db.cursor()
    cursor.execute('SELECT id FROM login_history ORDER BY id DESC')
    # Latest id determines where predictions start (the following day).
    latest = cursor.fetchone()
    if not latest:
        return {'error': 'No data in login_history DB'}
    next_year, next_month, next_day = defo.tp_add_x_days_to_id(
        latest['id'], 1)
    return predict_demand(next_year, next_month, next_day,
                          num_days_to_predict)
Esempio n. 21
0
def api_predict(num_days_to_predict):
    """Return stored predicted login counts.

    When num_days_to_predict is None every prediction is returned;
    otherwise only those within that many days of the earliest
    predicted id (the count must be in 1..99).  Returns a dict mapping
    hour id to predicted login count, or an {'error': ...} dict.
    """
    if num_days_to_predict is not None:
        if num_days_to_predict <= 0:
            return {'error': 'Number of days to predict must be positive'}
        if num_days_to_predict >= 100:
            return {'error': 'Cannot predict more than 99 days forward'}

    try:
        db = dbh.get_db()
        cur = db.cursor()
        if num_days_to_predict is not None:
            cur.execute('SELECT id FROM login_predictions ORDER BY id ASC')
            first_pred = cur.fetchone()  # earliest prediction id
            if not first_pred:
                return {'error': 'No predictions in DB - try to PUT api/predict resource first'}
            # Exclusive upper-bound id for the requested number of days.
            last_year, last_month, last_day = defo.tp_add_x_days_to_id(
                first_pred['id'], num_days_to_predict)
            last_pred = defo.get_id_str(last_year, last_month, last_day, 0)
            cur.execute('SELECT id, num_logins FROM login_predictions '
                        'WHERE id<? ORDER BY id ASC', (last_pred,))
            predictions = cur.fetchall()
        else:
            cur.execute('SELECT id, num_logins FROM login_predictions '
                        'ORDER BY id ASC')
            predictions = cur.fetchall()
        if predictions:
            # Assemble {id: num_logins} with an explicit loop instead of
            # the original side-effecting list comprehension.
            pred_dict = {}
            for pred in predictions:
                pred_dict[pred['id']] = pred['num_logins']
            return pred_dict
        return {'error': 'No predictions in DB - try to PUT api/predict resource first'}
    except ValueError:
        # Malformed id broke the date arithmetic; report as no data.
        return {'error': 'No predictions in DB - try to PUT api/predict resource first'}
    # NOTE: the original had unreachable code here ('return predictions'
    # after an unconditional return, and a login_history/predict_demand
    # block after a try/except whose every path returns); removed.
Esempio n. 22
0
def plot_logins():
    """Use the loaded history of client login data to create plots,
    which are saved within the predict_demand/plots folder.
    Used for manual analysis"""
    print "Running analytics on DB\n"
    db = dbh.get_db()
    cur = db.cursor()
    cur.execute('SELECT * FROM login_history ORDER BY id ASC')
    all_data = cur.fetchall()
    if not all_data:
        print "No data loaded in DB"
        return
    
    # Tabulate by hour
    hour_x = []
    hour_y = []
    base_day = None
    
    ## Get first predicted day (1 day past last history day)
    pred_year,pred_month,pred_day = defo.tp_add_x_days_to_id(all_data[-1]['id'],1)
    
    ## Plot trends per day over time (for first week predictions)
    for i in range(7):
        pred_day_str = defo.get_day_str(pred_year, pred_month, pred_day)
        hist_day = filter(lambda x: x['day_name']==pred_day_str, all_data)
        pred_id = defo.get_id_str(pred_year, pred_month, pred_day, 00)
        depl.plot_day_trend(pred_id, hist_day)
        pred_year,pred_month,pred_day = defo.tp_add_x_days(pred_year,pred_month,pred_day,1)
    # Weekday analysis
    depl.plot_weekdays([(x['id'],x['num_logins']) for x in all_data \
        if x['day_name'] in ['Mo', 'Tu', 'We', 'Th']])
    
    ## Tabulate by day
    depl.plot_each_day(all_data)
    # Creates dictionaries for each day, 
    #  where the keys are each hour of that day,
    #  and values are a list of tuple pairs (id, count)
    cur.execute('SELECT MAX(num_logins) FROM login_history')
    max_login = cur.fetchone()[0]
    # Monday
    mo_dict = defo.get_hours_dict()
    map(lambda y: mo_dict[y['hour']].append((y['id'],y['num_logins'])), \
        filter(lambda x: x['day_name']=='Mo', all_data))
    depl.plot_day_dict(mo_dict, '1_Monday', max_login)
    # Tuesday
    tu_dict = defo.get_hours_dict()
    map(lambda y: tu_dict[y['hour']].append((y['id'],y['num_logins'])), \
        filter(lambda x: x['day_name']=='Tu', all_data))
    depl.plot_day_dict(tu_dict, '2_Tuesday', max_login)
    # Wednesday
    we_dict = defo.get_hours_dict()
    map(lambda y: we_dict[y['hour']].append((y['id'],y['num_logins'])), \
        filter(lambda x: x['day_name']=='We', all_data))
    depl.plot_day_dict(we_dict, '3_Wednesday', max_login)
    # Thursday
    th_dict = defo.get_hours_dict()
    map(lambda y: th_dict[y['hour']].append((y['id'],y['num_logins'])), \
        filter(lambda x: x['day_name']=='Th', all_data))
    depl.plot_day_dict(th_dict, '4_Thursday', max_login)
    # Friday
    fr_dict = defo.get_hours_dict()
    map(lambda y: fr_dict[y['hour']].append((y['id'],y['num_logins'])), \
        filter(lambda x: x['day_name']=='Fr', all_data))
    depl.plot_day_dict(fr_dict, '5_Friday', max_login)
    # Saturday
    sa_dict = defo.get_hours_dict()
    map(lambda y: sa_dict[y['hour']].append((y['id'],y['num_logins'])), \
        filter(lambda x: x['day_name']=='Sa', all_data))
    depl.plot_day_dict(sa_dict, '6_Saturday', max_login)
    # Sunday
    su_dict = defo.get_hours_dict()
    map(lambda y: su_dict[y['hour']].append((y['id'],y['num_logins'])), \
        filter(lambda x: x['day_name']=='Su', all_data))
    depl.plot_day_dict(su_dict, '7_Sunday', max_login)
    day_dict = {}
    base_year = None
    
    ## Tabulate by week
    # Creates list of most recent week (7 consecutive days)
    last_complete_week_id = None
    temp_complete = deque([])
    min_id = None
    
    for entry in all_data:
        # Getting keys of most recent consecutive week
        if not temp_complete:
            #print 'Starting w/ %d' % entry_doy
            temp_complete.append(entry['id']) # add subsequent day
        else:
            day_delta = defo.dy_subtract_ids(entry['id'],temp_complete[-1])
            if day_delta == 1:
                temp_complete.append(entry['id']) # add subsequent day
                if len(temp_complete) > 7: # can also use while
                    temp_complete.popleft()
                # Save id corresponding to day (hour 0) that has a complete weeks
                # worth of previous data
                if len(temp_complete) == 7:
                    last_complete_week_id = entry['id']
            elif day_delta != 0: # current entry is not the same or a subsequent day
                temp_complete = deque([entry['id']])
        # Save minimum (start) ID key
        if min_id is None or min_id > entry['id']:
            min_id = entry['id']
        
        # Create dictionary with key as the day of year
        # Handle multiple years in database
        year_str = defo.get_year(entry['id'])
        if base_year is None:
            base_year = year_str # if multiple years, assuming years are monotonically increasing
        if year_str != base_year:
            days = (int(year_str)-int(base_year))*365 # Doesn't account for leap years...
        else:
            days = 0
        days = days+int(defo.get_day_of_year(entry['id']))
        day_dict[days] = day_dict.get(days,0) + entry['num_logins']
        if base_day is None:
            base_day = days
        hours = (days - base_day)*24 + entry['hour']
        hour_x.append(hours)
        hour_y.append(entry['num_logins'])
    depl.scatter_plot(hour_x, hour_y)
    depl.plot_by_day(day_dict)
    
    # Need at least 1 consecutive week's worth of data
    if last_complete_week_id is not None: 
        print 'Plotting week data...'
        end_id = last_complete_week_id[:-2]+'23' # Last hour of the day
        start_id = defo.subtract_one_week(end_id)
        print "%s to %s"%(start_id,end_id)
        # Plot full weeks starting at the latest complete week,
        #  where complete means there is at least one data point for 7 consecutive days
        # Lexigraphical (default) string comparison should work with ID format yyyy-hh-ddThh
        while end_id > min_id:
            cur.execute('SELECT id, num_logins FROM login_history WHERE id>? AND id <=? ' \
                + 'ORDER BY id ASC', (start_id, end_id))
            wk_data = cur.fetchall() # Data from an entire week, sorted by most recent first
            # Find the time delta in hours (compute negative x values so  
            #  the most recent is on the right)
            if wk_data:
                last_time = wk_data[-1][0]
                time_delta = map(lambda entry: defo.hr_subtract_ids(entry[0], last_time), wk_data)
                id_list,val_list = [list(entry) for entry in zip(*wk_data)]
                depl.plot_by_week(time_delta, val_list, id_list, max_login)
                
            end_id = start_id
            start_id = defo.subtract_one_week(end_id)
    else:
        print('WARNING: Database does not have continuous week of data')
Esempio n. 23
0
def plot_predictions(update_plots=None):
    """Updates the predictions (if update_plots is not None) which will also plot
    the linear regression predictions with past data,
    and Plots (saved to file) each predicted day in login_predictions"""
    db = dbh.get_db()
    cur = db.cursor()
    if update_plots is not None:
        num_days_predicted=15
        delete_predictions_with_actuals()
        # Find start day for predictions (=1+last day of actuals)
        cur.execute("SELECT id FROM login_history ORDER BY id DESC")
        latest = cur.fetchone()
        if latest:
            start_year, start_month, start_day = defo.tp_add_x_days_to_id(latest[0], 1)
            predict_demand(start_year, start_month, start_day, num_days_predicted, 1)

    cur.execute("SELECT * FROM login_predictions ORDER BY id ASC")
    pred_data = cur.fetchall()
    if not pred_data:
        print "No predictions in database! Nothing to plot"
        return
    # Get list of days that have been predicted
    pred_dict = {}
    max_pred = 0 # Save max prediction to set identical y axis scales
    for hours in pred_data:
        if hours['num_logins'] > max_pred:
            max_pred = hours['num_logins']
        day_id = hours['id'][:-3]
        if day_id in pred_dict:
            pred_dict[day_id].append(hours)
        else:
            pred_dict[day_id] = [hours]
    for day_id,pred_list in pred_dict.items():
        depl.plot_single_day(pred_list, 'predicted/'+defo.get_year_month_day_str(day_id+'T00'), max_pred)
        
    pred_start = pred_data[0]['id']
    pred_end = pred_data[-1]['id']
    pred_week_start = defo.subtract_one_week(pred_end)
    while pred_week_start > pred_start:
        # Plot predicted weeks
        pred_y = [x['num_logins'] for x in pred_data if x['id']>pred_week_start and x['id']<=pred_end]
        pred_id = [x['id'] for x in pred_data if x['id']>pred_week_start and x['id']<=pred_end]
        if pred_id:
            depl.plot_by_week(x_list=range(-1*len(pred_y),0),y_list=pred_y,id_list=pred_id,
                fix_y=max_pred,predicted_color=1,savename='predicted/Week_'+pred_end[:10])
        pred_end = pred_week_start
        pred_week_start = defo.subtract_one_week(pred_end)
    
    if pred_end > pred_start: 
        # Print part predicted, part actual
        hist_start = defo.subtract_one_week(pred_end)
        cur.execute("SELECT id, num_logins FROM login_history WHERE id>? ORDER BY id ASC ",(hist_start,))
        hist_data = cur.fetchall()
        if hist_data:
            hist_y = [x['num_logins'] for x in hist_data] # Shouldn't have overlap between actual & predicted
            hist_id = [x['id'] for x in hist_data]
            pred_y = [x['num_logins'] for x in pred_data if x['id']<=pred_end]
            pred_id = [x['id'] for x in pred_data if x['id']<=pred_end]
            plot_y = hist_y+pred_y
            depl.plot_by_week(x_list=range(-1*len(plot_y),0),y_list=plot_y,id_list=hist_id+pred_id,
                fix_y=max_pred,savename='predicted/Week_'+pred_end[:10],split=len(hist_y))
Esempio n. 24
0
def plot_predictions(update_plots=None):
    """Updates the predictions (if update_plots is not None) which will also plot
    the linear regression predictions with past data,
    and Plots (saved to file) each predicted day in login_predictions"""
    db = dbh.get_db()
    cur = db.cursor()
    if update_plots is not None:
        num_days_predicted = 15
        delete_predictions_with_actuals()
        # Find start day for predictions (=1+last day of actuals)
        cur.execute("SELECT id FROM login_history ORDER BY id DESC")
        latest = cur.fetchone()
        if latest:
            start_year, start_month, start_day = defo.tp_add_x_days_to_id(
                latest[0], 1)
            predict_demand(start_year, start_month, start_day,
                           num_days_predicted, 1)

    cur.execute("SELECT * FROM login_predictions ORDER BY id ASC")
    pred_data = cur.fetchall()
    if not pred_data:
        print "No predictions in database! Nothing to plot"
        return
    # Get list of days that have been predicted
    pred_dict = {}
    max_pred = 0  # Save max prediction to set identical y axis scales
    for hours in pred_data:
        if hours['num_logins'] > max_pred:
            max_pred = hours['num_logins']
        # Strip the trailing 'Thh' hour suffix to get a day-level key
        day_id = hours['id'][:-3]
        if day_id in pred_dict:
            pred_dict[day_id].append(hours)
        else:
            pred_dict[day_id] = [hours]
    # One plot per predicted day, all sharing the same y-axis scale
    for day_id, pred_list in pred_dict.items():
        depl.plot_single_day(
            pred_list,
            'predicted/' + defo.get_year_month_day_str(day_id + 'T00'),
            max_pred)

    # Walk backwards from the newest prediction, plotting one week at a time
    pred_start = pred_data[0]['id']
    pred_end = pred_data[-1]['id']
    pred_week_start = defo.subtract_one_week(pred_end)
    while pred_week_start > pred_start:
        # Plot predicted weeks
        pred_y = [
            x['num_logins'] for x in pred_data
            if x['id'] > pred_week_start and x['id'] <= pred_end
        ]
        pred_id = [
            x['id'] for x in pred_data
            if x['id'] > pred_week_start and x['id'] <= pred_end
        ]
        if pred_id:
            depl.plot_by_week(x_list=range(-1 * len(pred_y), 0),
                              y_list=pred_y,
                              id_list=pred_id,
                              fix_y=max_pred,
                              predicted_color=1,
                              savename='predicted/Week_' + pred_end[:10])
        pred_end = pred_week_start
        pred_week_start = defo.subtract_one_week(pred_end)

    if pred_end > pred_start:
        # Print part predicted, part actual: a partial predicted week remains,
        # so pad it on the left with up to a week of actual history
        hist_start = defo.subtract_one_week(pred_end)
        cur.execute(
            "SELECT id, num_logins FROM login_history WHERE id>? ORDER BY id ASC ",
            (hist_start, ))
        hist_data = cur.fetchall()
        if hist_data:
            hist_y = [x['num_logins'] for x in hist_data
                      ]  # Shouldn't have overlap between actual & predicted
            hist_id = [x['id'] for x in hist_data]
            pred_y = [
                x['num_logins'] for x in pred_data if x['id'] <= pred_end
            ]
            pred_id = [x['id'] for x in pred_data if x['id'] <= pred_end]
            plot_y = hist_y + pred_y
            depl.plot_by_week(x_list=range(-1 * len(plot_y), 0),
                              y_list=plot_y,
                              id_list=hist_id + pred_id,
                              fix_y=max_pred,
                              savename='predicted/Week_' + pred_end[:10],
                              split=len(hist_y))
Esempio n. 25
0
def predict_demand(year, month, day, num_days, enable_plots=None):
    """
    Given a valid database DB with saved formatted *.json files,
    runs the enabled algorithms to produce predictions for all the days within
    the (inclusive) time span [(beg_year,beg_month,beg_day) to (end_year,end_month,end_day)]
    Returns error string if something goes wrong, None if prediction successful.
    """
    print "Predicting Demand for %d days starting on %d/%d/%d" % (
        num_days, month, day, year)
    db = dbh.get_db()
    cur = db.cursor()
    # For now (smaller dataset), loading all 3 tables in memory is not a problem
    cur.execute('SELECT * FROM history_outliers')
    outlier_data = cur.fetchall()
    cur.execute('SELECT * FROM prediction_outliers')
    predicted_outlier_data = cur.fetchall()
    cur.execute('SELECT * FROM login_history ORDER BY id ASC')
    all_data = cur.fetchall()
    if not all_data:
        return {'error': 'No data in login_history DB'}
    if len(all_data) < 7 * 24:
        return {'error': 'Not enough data to accurately predict demand'}
    predicted_ids, predictions, predicted_slopes = depr.lin_reg_by_hour(
        all_data, outlier_data)
    cur_pred_id = defo.get_id_str(year, month, day, 0)
    end_pred_id = defo.add_x_hours(cur_pred_id, 24 * (num_days + 1))
    delta_days = defo.dy_delta_days(predicted_ids[0], cur_pred_id)
    # Filter predicted outlier ids to those within prediction timespan
    ol_dict = {}
    demand_predictions = {}
    if predicted_outlier_data:
        map(lambda y: ol_dict.update({str(y['id']):float(y['multiplier'])}), \
            filter(lambda x: x['id']>=cur_pred_id and x['id']<end_pred_id, predicted_outlier_data))
    for count in range(num_days):
        # Calculate the number of weeks to extrapolate on
        extrap_weeks = int(delta_days + count / 7)
        #print 'Predicting %s, %d weeks from (%s,%s) predictions'%(cur_pred_id,extrap_weeks,predicted_ids[0],predicted_ids[-1])
        pred_day_str = defo.get_day_str(year, month, day)
        prediction_list = []
        pred_id_list = []
        pred_data = []
        for hour in range(24):
            cur_pred_id = defo.get_id_str(year, month, day, hour)
            offset = int(
                defo.hr_subtract_ids(cur_pred_id, predicted_ids[0]) % (24 * 7))
            prediction = predictions[
                offset] + extrap_weeks * predicted_slopes[offset]
            if cur_pred_id in ol_dict.keys():
                prediction *= ol_dict[cur_pred_id]
                #print 'Predicted (%fx) Multiplier'%ol_dict[cur_pred_id]
            #print 'Prediction ID: %s, Logins: %f (%s: %fx%dWeeks + %f)'%(cur_pred_id,prediction,
            #    predicted_ids[offset],predicted_slopes[offset],extrap_weeks,predictions[offset])
            pred_data.append((cur_pred_id, prediction))
            demand_predictions[cur_pred_id] = prediction
        # Add to database, doing predictions on a day at a time basis (always 24 entries/hours)
        cur.executemany("INSERT or REPLACE into login_predictions (id, num_logins) values (?, ?)",\
            pred_data)
        # Move to next day
        year, month, day = defo.tp_add_x_days(year, month, day, 1)
    db.commit()
    return demand_predictions
Esempio n. 26
0
def plot_logins():
    """Use the loaded history of client login data to create plots,
    which are saved within the predict_demand/plots folder.
    Used for manual analysis"""
    print "Running analytics on DB\n"
    db = dbh.get_db()
    cur = db.cursor()
    cur.execute('SELECT * FROM login_history ORDER BY id ASC')
    all_data = cur.fetchall()
    if not all_data:
        print "No data loaded in DB"
        return

    # Tabulate by hour
    hour_x = []
    hour_y = []
    base_day = None

    ## Get first predicted day (1 day past last history day)
    pred_year, pred_month, pred_day = defo.tp_add_x_days_to_id(
        all_data[-1]['id'], 1)

    ## Plot trends per day over time (for first week predictions)
    for i in range(7):
        pred_day_str = defo.get_day_str(pred_year, pred_month, pred_day)
        hist_day = filter(lambda x: x['day_name'] == pred_day_str, all_data)
        pred_id = defo.get_id_str(pred_year, pred_month, pred_day, 00)
        depl.plot_day_trend(pred_id, hist_day)
        pred_year, pred_month, pred_day = defo.tp_add_x_days(
            pred_year, pred_month, pred_day, 1)
    # Weekday analysis
    depl.plot_weekdays([(x['id'],x['num_logins']) for x in all_data \
        if x['day_name'] in ['Mo', 'Tu', 'We', 'Th']])

    ## Tabulate by day
    depl.plot_each_day(all_data)
    # Creates dictionaries for each day,
    #  where the keys are each hour of that day,
    #  and values are a list of tuple pairs (id, count)
    cur.execute('SELECT MAX(num_logins) FROM login_history')
    max_login = cur.fetchone()[0]
    # Monday
    mo_dict = defo.get_hours_dict()
    map(lambda y: mo_dict[y['hour']].append((y['id'],y['num_logins'])), \
        filter(lambda x: x['day_name']=='Mo', all_data))
    depl.plot_day_dict(mo_dict, '1_Monday', max_login)
    # Tuesday
    tu_dict = defo.get_hours_dict()
    map(lambda y: tu_dict[y['hour']].append((y['id'],y['num_logins'])), \
        filter(lambda x: x['day_name']=='Tu', all_data))
    depl.plot_day_dict(tu_dict, '2_Tuesday', max_login)
    # Wednesday
    we_dict = defo.get_hours_dict()
    map(lambda y: we_dict[y['hour']].append((y['id'],y['num_logins'])), \
        filter(lambda x: x['day_name']=='We', all_data))
    depl.plot_day_dict(we_dict, '3_Wednesday', max_login)
    # Thursday
    th_dict = defo.get_hours_dict()
    map(lambda y: th_dict[y['hour']].append((y['id'],y['num_logins'])), \
        filter(lambda x: x['day_name']=='Th', all_data))
    depl.plot_day_dict(th_dict, '4_Thursday', max_login)
    # Friday
    fr_dict = defo.get_hours_dict()
    map(lambda y: fr_dict[y['hour']].append((y['id'],y['num_logins'])), \
        filter(lambda x: x['day_name']=='Fr', all_data))
    depl.plot_day_dict(fr_dict, '5_Friday', max_login)
    # Saturday
    sa_dict = defo.get_hours_dict()
    map(lambda y: sa_dict[y['hour']].append((y['id'],y['num_logins'])), \
        filter(lambda x: x['day_name']=='Sa', all_data))
    depl.plot_day_dict(sa_dict, '6_Saturday', max_login)
    # Sunday
    su_dict = defo.get_hours_dict()
    map(lambda y: su_dict[y['hour']].append((y['id'],y['num_logins'])), \
        filter(lambda x: x['day_name']=='Su', all_data))
    depl.plot_day_dict(su_dict, '7_Sunday', max_login)
    day_dict = {}
    base_year = None

    ## Tabulate by week
    # Creates list of most recent week (7 consecutive days)
    last_complete_week_id = None
    temp_complete = deque([])
    min_id = None

    for entry in all_data:
        # Getting keys of most recent consecutive week
        if not temp_complete:
            #print 'Starting w/ %d' % entry_doy
            temp_complete.append(entry['id'])  # add subsequent day
        else:
            day_delta = defo.dy_subtract_ids(entry['id'], temp_complete[-1])
            if day_delta == 1:
                temp_complete.append(entry['id'])  # add subsequent day
                if len(temp_complete) > 7:  # can also use while
                    temp_complete.popleft()
                # Save id corresponding to day (hour 0) that has a complete weeks
                # worth of previous data
                if len(temp_complete) == 7:
                    last_complete_week_id = entry['id']
            elif day_delta != 0:  # current entry is not the same or a subsequent day
                temp_complete = deque([entry['id']])
        # Save minimum (start) ID key
        if min_id is None or min_id > entry['id']:
            min_id = entry['id']

        # Create dictionary with key as the day of year
        # Handle multiple years in database
        year_str = defo.get_year(entry['id'])
        if base_year is None:
            base_year = year_str  # if multiple years, assuming years are monotonically increasing
        if year_str != base_year:
            days = (int(year_str) -
                    int(base_year)) * 365  # Doesn't account for leap years...
        else:
            days = 0
        days = days + int(defo.get_day_of_year(entry['id']))
        day_dict[days] = day_dict.get(days, 0) + entry['num_logins']
        if base_day is None:
            base_day = days
        hours = (days - base_day) * 24 + entry['hour']
        hour_x.append(hours)
        hour_y.append(entry['num_logins'])
    depl.scatter_plot(hour_x, hour_y)
    depl.plot_by_day(day_dict)

    # Need at least 1 consecutive week's worth of data
    if last_complete_week_id is not None:
        print 'Plotting week data...'
        end_id = last_complete_week_id[:-2] + '23'  # Last hour of the day
        start_id = defo.subtract_one_week(end_id)
        print "%s to %s" % (start_id, end_id)
        # Plot full weeks starting at the latest complete week,
        #  where complete means there is at least one data point for 7 consecutive days
        # Lexigraphical (default) string comparison should work with ID format yyyy-hh-ddThh
        while end_id > min_id:
            cur.execute('SELECT id, num_logins FROM login_history WHERE id>? AND id <=? ' \
                + 'ORDER BY id ASC', (start_id, end_id))
            wk_data = cur.fetchall(
            )  # Data from an entire week, sorted by most recent first
            # Find the time delta in hours (compute negative x values so
            #  the most recent is on the right)
            if wk_data:
                last_time = wk_data[-1][0]
                time_delta = map(
                    lambda entry: defo.hr_subtract_ids(entry[0], last_time),
                    wk_data)
                id_list, val_list = [list(entry) for entry in zip(*wk_data)]
                depl.plot_by_week(time_delta, val_list, id_list, max_login)

            end_id = start_id
            start_id = defo.subtract_one_week(end_id)
    else:
        print('WARNING: Database does not have continuous week of data')