Beispiel #1
0
def test_process_explode():
    """Smoke-test ``process_explode`` on the loaded data set.

    Runs ``process_explode`` on the tracking and cycles tables and checks
    that the result holds more than 600000 entries.
    """
    dataset = load_binary()
    # All four tables are looked up (so a missing key fails here), although
    # only ``tracking`` and ``cycles`` are used below.
    users, cycles, active_days, tracking = (
        dataset[key] for key in ('users', 'cycles', 'active_days', 'tracking')
    )

    print("Going over all cycles ~20k:")
    exploded = process_explode(tracking, cycles)
    n_points = len(exploded)
    print(n_points, ' data points in tracking pivot')
    assert n_points > 600000
Beispiel #2
0
def run():
    """Fit one Lasso model per symptom and dump its predictions.

    Loads the data set, builds the feature matrices with ``process_level2``,
    writes the CSV header to ``result.txt`` and then, for every symptom,
    grid-searches the Lasso ``alpha`` and passes the fitted model to ``dump``.

    Raises:
        ValueError: if no column of ``Y`` can be matched to a symptom.
    """
    data = load_binary()

    # Extract features
    user_feat_matrix = process_level2(data)  # X

    # Drop the user_id column before taking the raw value matrices;
    # missing values are replaced by 0.
    del user_feat_matrix['X']['user_id']
    X = user_feat_matrix['X'].values
    X[np.isnan(X)] = 0
    Y = user_feat_matrix['Y']
    Y.fillna(0, inplace=True)
    del user_feat_matrix['X_all']['user_id']
    X_all = user_feat_matrix['X_all'].values
    X_all[np.isnan(X_all)] = 0

    cols = list(Y.columns.values)
    symptoms = [
        'happy', 'pms', 'sad', 'sensitive_emotion', 'energized', 'exhausted',
        'high_energy', 'low_energy', 'cramps', 'headache', 'ovulation_pain',
        'tender_breasts', 'acne_skin', 'good_skin', 'oily_skin', 'dry_skin'
    ]

    # Start the result file from scratch with just the header line.
    with open("result.txt", 'w') as f:
        f.write("user_id,day_in_cycle,symptom,probability\n")

    # Loop-invariant inputs of ``dump``: read the expected cycle lengths once
    # instead of re-reading the CSV for every symptom.
    data_dir = 'data'
    cycles0 = pd.read_csv(join(data_dir, 'cycles0.csv'))
    c_length = dict(zip(cycles0.user_id.values,
                        cycles0.expected_cycle_length))
    user_ids = data['users'].user_id

    param_grid = {'estimator__alpha': [.1, .3, .5, .7, .8]}

    for symptom in symptoms:
        print(symptom)

        # NOTE(review): the original code used ``s_Y`` without ever defining
        # it (NameError on the first iteration) and never used ``cols``.
        # The target selection is reconstructed here on the assumption that
        # Y's column labels are tuples containing the symptom name, e.g.
        # (day_in_cycle, symptom) -- confirm against process_level2.
        symptom_cols = [
            col for col in cols if isinstance(col, tuple) and symptom in col
        ]
        if not symptom_cols:
            raise ValueError(
                "no target columns found in Y for symptom %r" % (symptom,))
        s_Y = Y[symptom_cols]

        pipeline = Pipeline([
            ('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
            #('standard_scale', StandardScaler()),
            ('estimator', Lasso()),
        ])

        model = GridSearchCV(pipeline,
                             param_grid=param_grid,
                             n_jobs=4,
                             verbose=2)
        model.fit(X, s_Y.values)

        print("dumping...")
        dump(symptom, model, X_all, c_length, user_ids)
Beispiel #3
0
def check_probability_access(data):
    """Compute a per-day access ratio for days 1 through 29.

    For each day ``d`` the ratio is the number of rows in
    ``data['active_days']`` whose ``day_in_cycle`` equals ``d`` divided by
    the number of non-null ``cycle_length`` values in ``data['cycles']``
    that are at least ``d``.

    Returns a list of 29 ratios; index 0 corresponds to day 1.
    """
    active_days = data['active_days']
    cycles = data['cycles']

    def _access_ratio(day):
        # Active-day rows recorded on this day of the cycle.
        hits = (active_days['day_in_cycle'] == day).sum()
        # Cycles long enough to contain this day.
        lengths = cycles['cycle_length']
        eligible = lengths[cycles['cycle_length'] >= day].count()
        return hits / eligible

    return [_access_ratio(day) for day in range(1, 30)]


# Script section: plot the access ratio per day and, for every symptom, the
# predicted daily profile divided by that access ratio.
df = pd.read_csv('result.txt')
# Only the first 15 days are processed for now; afterwards our predictions
# are wrong.
daily_profiles = create_profile_for_symptoms(df, date_range=15)

data = load_binary()
access_profile = check_probability_access(data)
plt.plot(access_profile[0:29])  # probability of access

# NOTE(review): `symptoms` is not defined anywhere in the visible module-level
# code -- confirm it is defined or imported before this point.
for symptom in symptoms:
    # Work on a copy so daily_profiles itself is left unmodified.
    real_prob = daily_profiles[symptom].copy()
    # NOTE(review): access_profile[i] holds the ratio for day i + 1 (the list
    # starts at day 1), while real_prob.loc[i] is looked up by label i --
    # confirm the two are aligned on the same day.
    for i in range(15):
        real_prob.loc[i] = real_prob.loc[i] / access_profile[i]

    plt.plot(real_prob)
Beispiel #4
0
def test_load_binary():
    """Check that ``load_binary`` yields more than 3000 users."""
    dataset = load_binary()
    # All four tables are looked up (so a missing key fails here), although
    # only ``users`` is inspected.
    users, cycles, active_days, tracking = (
        dataset[key] for key in ('users', 'cycles', 'active_days', 'tracking')
    )

    user_count = len(users)
    print(user_count, 'users loaded')
    assert user_count > 3000