Example 1
def plot_lc_all():
    # Load test set learning curve values for all five models
    scores = {}
    models = ['linear', 'ridge', 'lasso', 'xgb_a', 'xgb_b']
    colors = ['green', 'c', 'orangered', 'mediumslateblue', 'saddlebrown']
    for model, color in zip(models, colors):
        filepath = get_abspath('{0}/{0}_LC_test.csv'.format(model), 'outputs')
        scores[model] = (np.mean(pd.read_csv(filepath), axis=1), color)

    # Set training sizes and intervals
    train_sizes = np.arange(0.01, 1.0, 0.05)

    # Create learning curve plot
    fig, ax = plt.subplots()
    for k, v in scores.items():
        ax.plot(train_sizes, v[0], marker='.', color=v[1], label=k)
    ax.legend(loc='best')
    ax.grid(linestyle='dotted')
    ax.set_title('Learning curves - test score comparison')
    ax.set_xlabel('Samples used for training as a percentage of total')
    ax.set_ylabel('RMSE (Negative)')

    # Save learning curve plot as PNG
    plotpath = get_abspath('models_LC.png', 'graphs')
    ensure_dir_exists(plotpath)
    plt.tight_layout(pad=1.0)
    plt.savefig(plotpath)
    plt.close()
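All of these examples lean on two small helpers, get_abspath and ensure_dir_exists, whose definitions are not shown on this page. Below is a minimal sketch consistent with how they are called (a filename plus an optional directory for get_abspath; a file path whose parent directory must exist for ensure_dir_exists); the PROJECT_ROOT anchor is an assumption.

import os

# Assumed anchor directory; the real helper may resolve paths differently.
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))


def get_abspath(filename, dirname=None):
    """Build an absolute path for filename, optionally inside dirname."""
    parts = [PROJECT_ROOT]
    if dirname is not None:
        parts.append(dirname)
    parts.append(filename)
    return os.path.join(*parts)


def ensure_dir_exists(filepath):
    """Create the parent directory of filepath if it is missing."""
    dirname = os.path.dirname(filepath)
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)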
Example 2
def main():
    config = db_config()
    conn = db_conn(config)

    # Create scorer to train models using RMSE
    scorer = make_scorer(rmse, greater_is_better=False)

    # Load features
    features_a = preprocess_features(conn)
    features_b = preprocess_features(conn, with_sa=False)

    # Get train and test sets for both feature sets
    X_train_a, X_test_a, y_train_a, y_test_a = split_data(features_a)
    X_train_b, X_test_b, y_train_b, y_test_b = split_data(features_b)

    # Train regression models with sentiment score features
    modelpath = 'models'
    linear_models = {
        'linear': LinearRegression(),
        'ridge': Ridge(),
        'lasso': Lasso()
    }

    for name, model in linear_models.items():
        grid = train_model(model_object=model,
                           model_type=name,
                           X_train=X_train_a,
                           y_train=y_train_a,
                           score_function=scorer,
                           cv=3)
        filename = get_abspath('{}.model'.format(name), modelpath)
        ensure_dir_exists(filename)
        save_pickled_model(grid, filename)

    # Train XGBoost model with sentiment score features
    xgb = XGBRegressor(objective='reg:linear')
    grid = train_model(model_object=xgb,
                       model_type='xgb',
                       X_train=X_train_a,
                       y_train=y_train_a,
                       score_function=scorer,
                       cv=3)
    filename = get_abspath('xgb_a.model', modelpath)
    save_pickled_model(grid, filename)

    # Train XGBoost model without sentiment score features
    grid = train_model(model_object=xgb,
                       model_type='xgb',
                       X_train=X_train_b,
                       y_train=y_train_b,
                       score_function=scorer,
                       cv=3)
    filename = get_abspath('xgb_b.model', modelpath)
    save_pickled_model(grid, filename)
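The rmse function wrapped by make_scorer above is not defined in this snippet. A minimal sketch, assuming it is the standard root-mean-squared-error metric built on scikit-learn; because greater_is_better=False, make_scorer negates the value, which is why the plots label the y-axis "RMSE (Negative)".

import numpy as np
from sklearn.metrics import mean_squared_error


def rmse(y_true, y_pred):
    """Root mean squared error between true and predicted targets."""
    return np.sqrt(mean_squared_error(y_true, y_pred))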
Example 3
def backup(args):
    db = args.db
    out = get_abspath(args.output_dir)
    now = datetime.datetime.now()
    out = os.path.join(out, '%s__%s' % (db, now.strftime('%Y_%m_%d_%H%M%S')))
    cmd = ['mongodump', '-o', out]
    execute(cmd, args)
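The execute helper is not shown here. A minimal sketch, assuming it simply shells out via subprocess; what it does with args (e.g. verbosity or dry-run flags) is an assumption.

import subprocess


def execute(cmd, args):
    """Run an external command given as a list of program arguments."""
    # args is accepted for parity with the callers above; how the real
    # helper uses it (e.g. a --dry-run flag) is an assumption.
    subprocess.check_call(cmd)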
Example 4
def plot_learning_curve(model_name, train_scores, test_scores):
    # Set training sizes and intervals
    train_sizes = np.arange(0.01, 1.0, 0.05)

    # Create learning curve plot
    fig, ax = plt.subplots()
    ax.plot(train_sizes,
            np.mean(train_scores, axis=1),
            marker='.',
            color='b',
            label='Training score')
    ax.plot(train_sizes,
            np.mean(test_scores, axis=1),
            marker='.',
            color='g',
            label='Cross-validation score')
    ax.legend(loc='best')
    ax.grid(linestyle='dotted')
    ax.set_title('Learning curve - {} model'.format(model_name))
    ax.set_xlabel('Samples used for training as a percentage of total')
    ax.set_ylabel('RMSE (Negative)')

    # Save learning curve plot as PNG
    plot_tgt = '{0}/{1}'.format('graphs', model_name)
    plotpath = get_abspath('{}_LC.png'.format(model_name), plot_tgt)
    ensure_dir_exists(plotpath)
    plt.tight_layout(pad=1.0)
    plt.savefig(plotpath)
    plt.close()
Example 5
def create_learning_curve(estimator,
                          scorer,
                          X_train,
                          y_train,
                          model_name,
                          cv=3,
                          save_file=True):
    # Set training sizes and intervals
    train_sizes = np.arange(0.01, 1.0, 0.05)

    # Set cross-validation strategy; note that StratifiedShuffleSplit
    # assumes a discrete target, so a continuous regression target would
    # call for ShuffleSplit here instead
    cv_strategy = StratifiedShuffleSplit(n_splits=cv, random_state=0)

    # Create learning curve object
    LC = learning_curve(estimator,
                        X_train,
                        y_train,
                        cv=cv_strategy,
                        train_sizes=train_sizes,
                        scoring=scorer,
                        n_jobs=-1)

    # Extract training and test scores as data frames
    train_scores = pd.DataFrame(index=LC[0], data=LC[1])
    test_scores = pd.DataFrame(index=LC[0], data=LC[2])

    # Save data frames to CSV
    if save_file:
        res_tgt = '{0}/{1}'.format('outputs', model_name)
        train_file = get_abspath('{}_LC_train.csv'.format(model_name), res_tgt)
        test_file = get_abspath('{}_LC_test.csv'.format(model_name), res_tgt)
        ensure_dir_exists(train_file)
        train_scores.to_csv(train_file, index=False)
        test_scores.to_csv(test_file, index=False)

    return train_scores, test_scores
Example 6
def plot_score_histograms(df, score_type, filename, filepath='graphs'):
    # Set figure parameters
    sns.set(font_scale=1.3, rc={'figure.figsize': (12, 8)})

    # Create plot and set parameters
    fig, ax = plt.subplots()
    ax.hist(df, bins=20)
    fig.suptitle('{} Polarity Score Frequency'.format(score_type))
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')

    # Save plot
    plotpath = get_abspath(filename, filepath)
    ensure_dir_exists(plotpath)
    plt.savefig(plotpath)
    plt.close()
Example 7
def basic_results(grid, X_test, y_test, model_name):
    # Get best score, test score, scoring function and best parameters
    bs = grid.best_score_
    ts = grid.score(X_test, y_test)
    sf = grid.scorer_
    bp = grid.best_params_

    # Calculate R^2
    y_pred = grid.predict(X_test)
    r2 = r2_score(y_test, y_pred)

    # Write results to a combined results file
    resfile = get_abspath('basic_results.csv', 'outputs')
    ensure_dir_exists(resfile)
    with open(resfile, 'a') as f:
        f.write('{0},{1},{2},"{3}","{4}",{5}\n'.format(model_name, bs, ts, sf,
                                                       bp, r2))
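Note that basic_results opens the combined CSV in append mode and writes no header row; the columns are, in order, the model name, best cross-validation score, test score, scoring function, best parameters, and R^2. This is also why Example 14 removes any existing basic_results.csv before looping over the models.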
Example 8
def get_feature_importances(model_name, grid, features, save_file=True):
    # Get feature importance values
    feat_importance = pd.Series(
        grid.best_estimator_.get_booster().get_fscore()).sort_values(
            ascending=False)

    # Map XGBoost feature keys ('f0', 'f1', ...) back to column names;
    # the indices must be parsed as integers so that 'f10' is not sorted
    # before 'f2' lexicographically
    feat_names = list(features.columns.values)
    feats = [feat_names[int(k.strip('f'))] for k in feat_importance.index]

    # Add feature names
    df = feat_importance.to_frame(name='f_score')
    df.reset_index(drop=True, inplace=True)
    df['name'] = feats

    # Save feature importance to CSV
    if save_file:
        file_tgt = '{0}/{1}'.format('outputs', model_name)
        feats_file = get_abspath('{}_FI.csv'.format(model_name), file_tgt)
        ensure_dir_exists(feats_file)
        df.to_csv(feats_file, index=False)

    return df
Example 9
def create_validation_curve(estimator, X_train, y_train, model_name,
                            param_name, param_range, scorer):
    # Generate validation curve results
    train_scores, test_scores = validation_curve(estimator,
                                                 X_train,
                                                 y_train,
                                                 param_name=param_name,
                                                 param_range=param_range,
                                                 cv=3,
                                                 scoring=scorer,
                                                 n_jobs=-1)

    # Generate validation curve plot
    fig, ax = plt.subplots()
    ax.plot(param_range,
            np.mean(train_scores, axis=1),
            marker='.',
            color='b',
            label='Train Score')
    ax.plot(param_range,
            np.mean(test_scores, axis=1),
            marker='.',
            color='g',
            label='Cross-validation Score')
    ax.legend(loc='best')
    ax.grid(linestyle='dotted')
    ax.set_title('Validation curve - {} model'.format(model_name))
    ax.set_xlabel(param_name)
    ax.set_ylabel('RMSE (Negative)')

    # Save validation curve plot as PNG
    plot_tgt = '{0}/{1}'.format('graphs', model_name)
    plotpath = get_abspath('{}_VC.png'.format(model_name), plot_tgt)
    ensure_dir_exists(plotpath)
    plt.tight_layout(pad=1.0)
    plt.savefig(plotpath)
    plt.close()
Example 10
def plot_feature_importances(model_name, feature_importances, nfeats=10):
    # Subset feature importances data frame
    subset = feature_importances.iloc[:nfeats, :]

    # Generate feature importance plot
    fig, ax = plt.subplots()
    sns.barplot(x='f_score',
                y='name',
                data=subset,
                ax=ax,
                palette=sns.color_palette('YlGnBu_r', n_colors=15),
                dodge=False)
    ax.tick_params(labelsize=8)
    ax.set_ylabel('')
    ax.set_xlabel('F score')
    ax.set_title('Feature importances - {} model'.format(model_name))

    # Save feature importance plot as PNG
    plot_tgt = '{0}/{1}'.format('graphs', model_name)
    plotpath = get_abspath('{}_FI.png'.format(model_name), plot_tgt)
    ensure_dir_exists(plotpath)
    plt.tight_layout(pad=1.0)
    plt.savefig(plotpath)
    plt.close()
Example 11
parser = argparse.ArgumentParser(
    prog='detect',
    description='Detect faces in an image.'
)

parser.add_argument(
    'path',
    type=str,
    help='path or URL of a photo where faces will be detected')

args = parser.parse_args()

# ----------------------------------------------------------------------
# Setup
# ----------------------------------------------------------------------

img_url = args.path if is_url(args.path) else get_abspath(args.path)
face_attrs = ['age', 'gender', 'glasses', 'emotion', 'occlusion']

# ----------------------------------------------------------------------
# Call face API to detect and describe faces
# ----------------------------------------------------------------------

# Request subscription key and endpoint from user.
subscription_key, endpoint = get_private()

credentials = CognitiveServicesCredentials(subscription_key)  # Set credentials
client = FaceClient(endpoint, credentials)  # Setup Azure face API client
faces = azface_detect(client, img_url, return_face_attributes=face_attrs)
print_detection_results(faces)
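The is_url helper used in this and the following examples to choose between a remote URL and a local file path is not shown. A plausible sketch using only the standard library:

from urllib.parse import urlparse


def is_url(path):
    """Return True if path looks like an http(s) URL rather than a file."""
    return urlparse(path).scheme in ('http', 'https')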
Example 12
# ----------------------------------------------------------------------

parser = argparse.ArgumentParser(prog='score',
                                 parents=[option_parser],
                                 description='Detect faces in an image.')

parser.add_argument('image', type=str, help='image path or URL')

args = parser.parse_args()

# Wrap face detection parameters.

face_params = FaceParams(args.scaleFactor, args.minNeighbors, args.minSize)

# ----------------------------------------------------------------------
# Face detection
# ----------------------------------------------------------------------

image = read_cv_image_from(
    args.image if is_url(args.image) else get_abspath(args.image))

faces = detect_faces(image, face_params=face_params)

print("Found {0} faces!".format(len(faces)))

result = mark_faces(image, faces)

image, result = convert_cv2matplot(image, result)

plot_side_by_side_comparison(image, result, rightlabel="Detected Faces")
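Neither detect_faces nor FaceParams is defined in this snippet, but the three wrapped parameters (scaleFactor, minNeighbors, minSize) are exactly the tuning knobs of OpenCV's detectMultiScale, so a plausible sketch is a Haar-cascade detector; the cascade file name below is the stock frontal-face model shipped with opencv-python.

from collections import namedtuple

import cv2

FaceParams = namedtuple('FaceParams',
                        ['scaleFactor', 'minNeighbors', 'minSize'])


def detect_faces(image, face_params):
    """Detect faces in a BGR image; returns an array of (x, y, w, h) boxes."""
    cascade = cv2.CascadeClassifier(
        cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return cascade.detectMultiScale(
        gray,
        scaleFactor=face_params.scaleFactor,
        minNeighbors=face_params.minNeighbors,
        # detectMultiScale expects a (width, height) tuple; the CLI is
        # assumed to pass a single integer, so it is duplicated here.
        minSize=(face_params.minSize, face_params.minSize))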
Example 13
parser = argparse.ArgumentParser(
    description='Find target faces in a candidate photo.'  # assumed; the snippet starts mid-statement
)

parser.add_argument(
    'target',
    help='path or URL of a photo of the faces to be found')

parser.add_argument(
    'candidate',
    help='path or URL of a photo to find expected target faces')

args = parser.parse_args()

# ----------------------------------------------------------------------
# Setup
# ----------------------------------------------------------------------
# Get the photo of the target faces and the photo to be checked
target_url = args.target if is_url(args.target) else get_abspath(args.target)
candidate_url = (args.candidate if is_url(args.candidate)
                 else get_abspath(args.candidate))

if os.path.isdir(target_url) or os.path.isdir(candidate_url):
    stop("Only one photo allowed!")

# ----------------------------------------------------------------------
# Prepare Face API client
# ----------------------------------------------------------------------

# Request subscription key and endpoint from user.
subscription_key, endpoint = get_private()

credentials = CognitiveServicesCredentials(subscription_key)  # Set credentials
client = FaceClient(endpoint, credentials)  # Setup Azure face API client
Example 14
def main():
    config = db_config()
    conn = db_conn(config)

    # Remove basic results file
    try:
        combined = get_abspath('basic_results.csv', 'outputs')
        os.remove(combined)
    except IOError:
        pass

    # Load features
    features_a = preprocess_features(conn)
    features_b = preprocess_features(conn, with_sa=False)

    # Create scorer to train models using RMSE
    scorer = make_scorer(rmse, greater_is_better=False)

    # Load models in a dict
    models = {
        'linear': load_pickled_model('models/linear.model'),
        'ridge': load_pickled_model('models/ridge.model'),
        'lasso': load_pickled_model('models/lasso.model'),
        'xgb_a': load_pickled_model('models/xgb_a.model'),
        'xgb_b': load_pickled_model('models/xgb_b.model')
    }

    # Validation curve parameter names and ranges
    vc_params = {
        'xgb_a': ('max_depth', np.arange(1, 20, 1)),
        'xgb_b': ('max_depth', np.arange(1, 20, 1))
    }

    # Split into train and test sets
    X_train_a, X_test_a, y_train_a, y_test_a = split_data(features_a)
    X_train_b, X_test_b, y_train_b, y_test_b = split_data(features_b)

    # Generate basic results and learning curves for all models
    for name, grid in models.items():
        if name in ['linear', 'ridge', 'lasso', 'xgb_a']:
            basic_results(grid, X_test_a, y_test_a, name)
            train_scores, test_scores = create_learning_curve(
                grid.best_estimator_,
                scorer,
                X_train_a,
                y_train_a,
                model_name=name,
                cv=3)
            plot_learning_curve(name, train_scores, test_scores)
        if name == 'xgb_b':
            basic_results(grid, X_test_b, y_test_b, name)
            train_scores, test_scores = create_learning_curve(
                grid.best_estimator_,
                scorer,
                X_train_b,
                y_train_b,
                model_name=name,
                cv=3)
            plot_learning_curve(name, train_scores, test_scores)

    # Generate validation curves for XGBoost models
    create_validation_curve(models['xgb_a'].best_estimator_,
                            X_train_a,
                            y_train_a,
                            model_name='xgb_a',
                            param_name=vc_params['xgb_a'][0],
                            param_range=vc_params['xgb_a'][1],
                            scorer=scorer)

    create_validation_curve(models['xgb_b'].best_estimator_,
                            X_train_b,
                            y_train_b,
                            model_name='xgb_b',
                            param_name=vc_params['xgb_b'][0],
                            param_range=vc_params['xgb_b'][1],
                            scorer=scorer)

    # Generate XGBoost feature importance plots and results
    fi_a = get_feature_importances('xgb_a', models['xgb_a'], features_a)
    fi_b = get_feature_importances('xgb_b', models['xgb_b'], features_b)
    plot_feature_importances('xgb_a', fi_a, nfeats=15)
    plot_feature_importances('xgb_b', fi_b, nfeats=15)

    # Plot test set learning curves of all five models
    plot_lc_all()
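save_pickled_model (Example 2) and load_pickled_model (above) are also left undefined on this page; a plausible sketch using the standard pickle module:

import pickle


def save_pickled_model(model, filepath):
    """Serialize a fitted model (here, a GridSearchCV object) to disk."""
    with open(filepath, 'wb') as f:
        pickle.dump(model, f)


def load_pickled_model(filepath):
    """Load a previously pickled model from disk."""
    with open(filepath, 'rb') as f:
        return pickle.load(f)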
Example 15
def restore(args):
    input_dir = get_abspath(args.input_dir)
    cmd = ['mongorestore', input_dir]
    execute(cmd, args)