# NOTE: the numpy/scikit-learn imports below are assumed; the project-local
# helper modules (ju for JSON I/O, du for date utilities, cu for CSV utilities,
# fi for feature metadata, wfcvutils for walk-forward cross validation) and the
# classifier-type constants (linsvm, rbfsvm, knn, ada, rf, dt) are expected to
# be imported/defined elsewhere in the project.
import numpy as np
from sklearn import svm, tree, ensemble, metrics
from sklearn import linear_model as linmod
from sklearn import neighbors as neigh
from sklearn import feature_selection as fs


def run_script(pdate_str, busjson, revjson, tipjson, senticsv, outfile):
    # convert pdate to seconds since the epoch
    pdate = du.date2int(du.str2date(pdate_str))

    # load business objects
    print('loading business objects from %s...' % busjson)
    all_buses, junk = ju.load_objects(busjson)

    # load review objects
    print('loading review objects from %s...' % revjson)
    all_reviews, junk = ju.load_objects(revjson)

    # load tip objects
    print('loading tip objects from %s...' % tipjson)
    all_tips, junk = ju.load_objects(tipjson)

    # load sentiment ranking data derived from tip and review data
    print('loading sentiment rankings from %s...' % senticsv)
    all_senti = cu.load_matrix(senticsv, has_hdr=False)

    # generate a data set for the specified prediction date
    print('generating data set for prediction date %s...' % pdate_str)
    buses = du.gen_dataset(pdate, all_buses, all_reviews, all_tips, all_senti)

    # write the generated data set to file
    print('writing generated data set to %s...' % outfile)
    ju.save_objects(buses, outfile)
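
# Example usage (hypothetical): generating a data-set snapshot for a single
# prediction date. The file paths and date string below are placeholders, and
# the date format is whatever du.str2date() expects.
#
#   run_script('2014-06-01',
#              'businesses.json', 'reviews.json', 'tips.json',
#              'sentiment_rankings.csv', 'dataset_2014-06-01.json')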

def run_script(busjson, revjson, tipjson, senticsv, init_pdate, delta,
               ctype=linsvm, usamp=True, binary=None, rfe=False, pca=-1,
               reg=False, feat_info=fi.data_feat_info, states=None):
    print('Initial prediction date: %s' % init_pdate)
    print('Time delta: %d months' % delta)
    if (states):
        print('limiting data to restaurants in: %s' % str(states))

    # convert pdate to seconds since the epoch
    pdate = du.date2int(du.str2date(init_pdate))

    # load business objects
    print('loading business objects from %s...' % busjson)
    all_buses, junk = ju.load_objects(busjson)

    # load review objects
    print('loading review objects from %s...' % revjson)
    all_reviews, junk = ju.load_objects(revjson)

    # load tip objects
    print('loading tip objects from %s...' % tipjson)
    all_tips, junk = ju.load_objects(tipjson)

    # load sentiment ranking data derived from tip and review data
    print('loading sentiment rankings from %s...' % senticsv)
    all_senti = cu.load_matrix(senticsv, has_hdr=False)

    # select the estimator; where supported, reduce the number of features
    # using recursive feature elimination (RFE)
    # - See http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html#example-plot-rfe-with-cross-validation-py
    # - See http://stackoverflow.com/questions/23815938/recursive-feature-elimination-and-grid-search-using-scikit-learn
    if (reg):
        # create the least squares linear regressor
        print('using least squares linear regression...')
        c = linmod.LinearRegression()
        # grid search not supported for linear regression (???)
        param_grid = None
    elif (ctype == rbfsvm):
        # create RBF SVM to test
        #c = svm.NuSVC(kernel='rbf')
        c = svm.SVC(kernel='rbf')
        # configure parameter grid for grid search
        C_range = 10.0 ** np.arange(-3, 5)
        gamma_range = 10.0 ** np.arange(-4, 3)
        if (rfe):
            print('RFE not currently supported for RBF SVM...')
            #c = fs.RFECV(c, step=1)
            #pgrid = []
            #for C in C_range:
            #    for gamma in gamma_range:
            #        pgrid.append({'C':C, 'gamma':gamma})
            #pgrid = [{'gamma':0.5},{'gamma':0.1},{'gamma':0.01},{'gamma':0.001},{'gamma':0.0001}]
            #param_grid = {'estimator_params': pgrid}
        print('using RBF SVM...')
        param_grid = dict(gamma=gamma_range, C=C_range)
    elif (ctype == knn):
        # create a KNN classifier
        c = neigh.KNeighborsClassifier()
        if (rfe):
            print('RFE not currently supported for k-nearest neighbors...')
        print('using k-nearest neighbors...')
        param_grid = {'n_neighbors': [1,2,3,4,5,6,7,8,9,10,15,20,25,30],
                      'weights': ['uniform','distance'],
                      'p': [1,2,3,4,5,6,7,8,9,10]}
    elif (ctype == ada):
        # create boosted classifier
        c = ensemble.AdaBoostClassifier()
        if (rfe):
            print('RFE not currently supported for AdaBoost...')
        print('using AdaBoost...')
        param_grid = {'n_estimators': [5, 10, 25, 40, 50, 60, 75, 85, 100],
                      'learning_rate': [0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0]}
    elif (ctype == rf):
        # create random forest classifier
        c = ensemble.RandomForestClassifier()
        if (rfe):
            print('RFE not currently supported for random forest...')
        print('using random forest...')
        param_grid = {'n_estimators': [5, 10, 25, 40, 50, 60, 75, 85, 100],
                      'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, None]}
    elif (ctype == dt):
        # create decision tree classifier
        c = tree.DecisionTreeClassifier()
        # max feats - subtract 1 because data feats includes the class label
        if (rfe):
            print('RFE not supported with decision trees...')
        print('using decision tree...')
        param_grid = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, None]}
    else:
        # create linear SVM to test
        c = svm.LinearSVC()
        # configure parameter grid for grid search
        C_range = 10.0 ** np.arange(-3, 5)
        if (rfe):
            print('using linear SVM with RFE...')
            c = fs.RFECV(c, step=1)
            pgrid = []
            for C in C_range:
                pgrid.append({'C': C})
            #pgrid = [{'C':0.01},{'C':0.1},{'C':1},{'C':10},{'C':100},{'C':1000},{'C':10000}]
            param_grid = {'estimator_params': pgrid}
        else:
            print('using linear SVM...')
            param_grid = {'C': C_range}

    # run the walk-forward cross validation and collect the results
    print('run walk-forward cross validation...')
    if (usamp):
        print(' under-sampling still-open class...')
    else:
        print(' NOT under-sampling still-open class...')
    results = wfcvutils.wfcv(c, param_grid, all_buses, all_reviews, all_tips,
                             all_senti, pdate, delta*du.month, pca=pca,
                             usamp=usamp, binary=binary, reg=reg,
                             feat_info=feat_info, states=states)

    # combine the results from all folds to produce overall metrics
    y_true = None
    y_pred = None
    for r in results:
        if (y_true is None):
            y_true = r[0]
        else:
            y_true = np.hstack((y_true, r[0]))
        if (y_pred is None):
            y_pred = r[1]
        else:
            y_pred = np.hstack((y_pred, r[1]))

    # print out an overall classification report
    print('\n=========================================')
    print('Overall metrics for all prediction dates:\n')
    if (len(results) != 0):
        if (reg):
            wfcvutils.print_reg_metrics(y_true, y_pred)
        else:
            cm = metrics.confusion_matrix(y_true, y_pred)
            wfcvutils.print_cm(cm)
            #print(metrics.classification_report(y_true, y_pred, target_names=fi.class_names))
    else:
        print(' NO RESULTS\n')
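
# Example usage (hypothetical): a walk-forward experiment with the default
# linear SVM, under-sampling the still-open class and stepping forward by a
# 6-month delta per fold. The paths, date string, and state list below are
# placeholders, not values from this project.
#
#   run_script('businesses.json', 'reviews.json', 'tips.json',
#              'sentiment_rankings.csv', '2013-01-01', 6,
#              ctype=linsvm, usamp=True, states=['AZ', 'NV'])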