def run_script(pdate_str, busjson, revjson, tipjson, senticsv, outfile):
    """Generate a data set for one prediction date and write it to a file.

    Args:
        pdate_str: prediction date as a string (format accepted by du.str2date).
        busjson: path to the business objects JSON file.
        revjson: path to the review objects JSON file.
        tipjson: path to the tip objects JSON file.
        senticsv: path to the sentiment-rankings CSV (no header row).
        outfile: path the generated data set is written to.
    """
    # convert pdate to seconds since the epoch
    pdate = du.date2int(du.str2date(pdate_str))

    # load business objects
    print('Loading business objects from %s...' % busjson)
    all_buses, junk = ju.load_objects(busjson)

    # load review objects
    print('loading review objects from %s...' % revjson)
    all_reviews, junk = ju.load_objects(revjson)

    # load tip objects
    print('loading tip objects from %s...' % tipjson)
    all_tips, junk = ju.load_objects(tipjson)

    # load sentiment ranking data derived from tip and review data
    print('loading sentiment rankings from %s...' % senticsv)
    all_senti = cu.load_matrix(senticsv, has_hdr=False)

    # generate a data set for the specified prediction date
    print('generate data set for prediction date %s...' % pdate_str)
    buses = du.gen_dataset(pdate, all_buses, all_reviews, all_tips, all_senti)

    # write data set to file
    print('writing generated data set to %s...' % outfile)
    ju.save_objects(buses, outfile)
def run_script(csvfile, periods, gap):
    """Plot the data columns of a CSV matrix and save the figure as a PNG.

    The matrix is structured as follows:
      -- the first row contains the column headers
      -- the first column contains the generation numbers
      -- the next eight columns contain data for the assessment module bits

    The PNG is written next to the CSV: 'foo.csv' -> 'foo.png'.

    Args:
        csvfile: path to the CSV file to plot (header row expected).
        periods: number of leading rows (periods) to plot; a negative value
            means plot all rows.
        gap: plot every gap-th data point.
    """
    # load CSV data
    print('Loading matrix from %s...' % csvfile)
    matrix, headers = cu.load_matrix(csvfile, True)

    # negative periods argument means plot all the data
    if periods < 0:
        periods = matrix.shape[0]
    print(' plot data for %d periods...' % periods)
    print(' plot every %dth data point...' % gap)

    # set the font for the legend
    matplotlib.rcParams.update({'font.size': 8})
    fontP = FontProperties()
    fontP.set_size(8)

    # clear the plot
    plt.figure(figsize=(8, 3))
    plt.clf()

    # line colors keyed by the (stripped) column header
    colors = {'b0': 'b', 'b1': 'g', 'b2': 'r', 'b3': 'c',
              'b4': 'm', 'b5': 'y', 'b6': 'k', 'b7': '0.75',
              'payout': 'b', 'max': 'g', '%max': 'r'}

    # plot every gap-th of the first `periods` rows for each data column;
    # the row-index selection is the same for every column, so compute it once
    indices = np.arange(0, periods)[::gap]
    for i in range(1, matrix.shape[1] - 3):
        plt.plot(indices, matrix[indices, i], linewidth=0.5,
                 color=colors[headers[i].strip()], aa=True)

    # label the axes
    ylab = "num of tribes"
    plt.ylabel(ylab)
    plt.xlabel("generations")

    # create the legend below the plot; shrink the axes to make room for it
    ax = plt.subplot(111)
    box = ax.get_position()
    ax.set_position([box.x0, box.y0 + box.height * 0.5,
                     box.width, box.height * 0.5])
    plt.legend(headers[1::], loc='upper center', bbox_to_anchor=(0.5, -0.3),
               ncol=matrix.shape[1] - 1, prop=fontP, frameon=False)

    # get file name for plot: replace the '.csv' suffix with '.png',
    # or append '.png' if the input name has no '.csv' in it
    idx = csvfile.find('.csv')
    if idx >= 0:
        pngfile = csvfile[0:idx] + '.png'
    else:
        pngfile = csvfile + '.png'

    # write the plot to a file
    plt.savefig(pngfile, bbox_inches='tight')
def _build_classifier(ctype, rfe, reg):
    """Build the estimator and grid-search parameter grid for a model type.

    Args:
        ctype: model-type constant (linsvm, rbfsvm, knn, ada, rf, dt).
        rfe: if True, wrap the estimator in recursive feature elimination
            (currently only supported for the linear SVM).
        reg: if True, ignore ctype and build a least squares linear regressor.

    Returns:
        (estimator, param_grid) tuple; param_grid is None when grid search
        is not supported (least squares linear regression).
    """
    # reduce the number of features using recursive feature elimination
    # - See http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html#example-plot-rfe-with-cross-validation-py
    # - See http://stackoverflow.com/questions/23815938/recursive-feature-elimination-and-grid-search-using-scikit-learn
    if reg:
        # create the least squares linear regressor
        print('using least squares linear regression...')
        # grid search not supported for linear regression (???)
        return linmod.LinearRegression(), None

    if ctype == rbfsvm:
        # create RBF SVM to test
        c = svm.SVC(kernel='rbf')
        # configure parameter grid for grid search
        C_range = 10.0 ** np.arange(-3, 5)
        gamma_range = 10.0 ** np.arange(-4, 3)
        if rfe:
            print('RFE not currently supported for RBF SVM...')
        print('using RBF SVM...')
        return c, dict(gamma=gamma_range, C=C_range)

    if ctype == knn:
        # create a KNN classifier
        c = neigh.KNeighborsClassifier()
        if rfe:
            print('RFE not currently supported for k-nearest neighbors...')
        print('using k-nearest neighbors...')
        return c, {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30],
                   'weights': ['uniform', 'distance'],
                   'p': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

    if ctype == ada:
        # create boosted classifier
        c = ensemble.AdaBoostClassifier()
        if rfe:
            print('RFE not currently supported for AdaBoost...')
        print('using AdaBoost...')
        return c, {'n_estimators': [5, 10, 25, 40, 50, 60, 75, 85, 100],
                   'learning_rate': [0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0]}

    if ctype == rf:
        # create random forest classifier
        c = ensemble.RandomForestClassifier()
        if rfe:
            print('RFE not currently supported for random forest...')
        print('using random forest...')
        return c, {'n_estimators': [5, 10, 25, 40, 50, 60, 75, 85, 100],
                   'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, None]}

    if ctype == dt:
        # create decision tree classifier
        c = tree.DecisionTreeClassifier()
        # max feats - subtract 1 because data feats includes the class label
        if rfe:
            print('RFE not supported with decision trees...')
        print('using decision tree...')
        return c, {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, None]}

    # default: create linear SVM to test
    c = svm.LinearSVC()
    # configure parameter grid for grid search
    C_range = 10.0 ** np.arange(-3, 5)
    if rfe:
        print('using linear SVM with RFE...')
        c = fs.RFECV(c, step=1)
        pgrid = [{'C': C} for C in C_range]
        return c, {'estimator_params': pgrid}
    print('using linear SVM...')
    return c, {'C': C_range}


def run_script(busjson, revjson, tipjson, senticsv, init_pdate, delta,
               ctype=linsvm, usamp=True, binary=None, rfe=False, pca=-1,
               reg=False, feat_info=fi.data_feat_info, states=None):
    """Run walk-forward cross validation and report overall metrics.

    Args:
        busjson: path to the business objects JSON file.
        revjson: path to the review objects JSON file.
        tipjson: path to the tip objects JSON file.
        senticsv: path to the sentiment-rankings CSV (no header row).
        init_pdate: initial prediction date string.
        delta: walk-forward step in months.
        ctype: model-type constant (linsvm, rbfsvm, knn, ada, rf, dt).
        usamp: under-sample the still-open class when True.
        binary: passed through to wfcvutils.wfcv.
        rfe: use recursive feature elimination (linear SVM only).
        pca: passed through to wfcvutils.wfcv.
        reg: use least squares linear regression instead of a classifier.
        feat_info: feature metadata passed through to wfcvutils.wfcv.
        states: optional list of states to limit the restaurants to.
    """
    print('Initial prediction date: %s' % init_pdate)
    print('Time delta: %d months' % delta)
    if states:
        print('limiting data to restaurants in: %s' % str(states))

    # convert pdate to seconds since the epoch
    pdate = du.date2int(du.str2date(init_pdate))

    # load business objects
    print('Loading business objects from %s...' % busjson)
    all_buses, junk = ju.load_objects(busjson)

    # load review objects
    print('loading review objects from %s...' % revjson)
    all_reviews, junk = ju.load_objects(revjson)

    # load tip objects
    print('loading tip objects from %s...' % tipjson)
    all_tips, junk = ju.load_objects(tipjson)

    # load sentiment ranking data derived from tip and review data
    print('loading sentiment rankings from %s...' % senticsv)
    all_senti = cu.load_matrix(senticsv, has_hdr=False)

    # create the estimator and parameter grid for the requested model type
    c, param_grid = _build_classifier(ctype, rfe, reg)

    # run the walk-forward cross validation and collect the results
    print('run walk-forward cross validation...')
    if usamp:
        print(' under-sampling still open class...')
    else:
        print(' NOT under-sampling still open class...')
    results = wfcvutils.wfcv(c, param_grid, all_buses, all_reviews,
                             all_tips, all_senti, pdate, delta * du.month,
                             pca=pca, usamp=usamp, binary=binary, reg=reg,
                             feat_info=feat_info, states=states)

    # combine the results from all prediction dates to produce overall metrics
    y_true = None
    y_pred = None
    for r in results:
        y_true = r[0] if y_true is None else np.hstack((y_true, r[0]))
        y_pred = r[1] if y_pred is None else np.hstack((y_pred, r[1]))

    # print out an overall classification report
    print('\n=========================================')
    print('Overall metrics for all prediction dates:\n')
    if len(results) != 0:
        if reg:
            wfcvutils.print_reg_metrics(y_true, y_pred)
        else:
            cm = metrics.confusion_matrix(y_true, y_pred)
            wfcvutils.print_cm(cm)
    else:
        print(' NO RESULTS\n')
def _collect_dated_objects(injson, first_review_dates, last_review_dates):
    """Load line-delimited JSON objects and track per-business date extrema.

    Reads one JSON object per line from injson, keeps only the objects whose
    business ID is a key of the date dictionaries, converts each kept object's
    date field to seconds since the epoch, and updates the per-business
    first/last review/tip dates in place.

    Args:
        injson: path to a file with one JSON object per line (review or tip).
        first_review_dates: dict of business ID -> earliest date seen (or None),
            updated in place.
        last_review_dates: dict of business ID -> latest date seen (or None),
            updated in place.

    Returns:
        The list of kept (and date-converted) objects.
    """
    kept = []
    with open(injson, 'r') as fin:
        # there is one JSON object per line; iterate and load each one
        for line in fin:
            obj = json.loads(line)

            # skip objects for businesses that are not being tracked
            bid = obj[fi.business_id]
            if bid not in last_review_dates:
                continue
            kept.append(obj)

            # convert the object's date to seconds since the epoch
            obj_date = date2int(str2date(obj[fi.date]))
            obj[fi.date] = obj_date

            # widen the first/last review/tip date window if necessary
            current_first = first_review_dates[bid]
            current_last = last_review_dates[bid]
            if current_first is None or current_first > obj_date:
                first_review_dates[bid] = obj_date
            if current_last is None or current_last < obj_date:
                last_review_dates[bid] = obj_date
    return kept


def process_review_tip_census_data(in_revjson, in_tipjson, in_demoeconcsv, buses):
    """Augment business objects with review/tip date and census data.

    Scans the review and tip files for entries belonging to the given
    businesses, records each business's first and last review/tip date,
    attaches demographic/economic (census) data where available, and — for
    closed businesses — uses the last review/tip date as the close date.

    Args:
        in_revjson: path to the reviews file (one JSON object per line).
        in_tipjson: path to the tips file (one JSON object per line).
        in_demoeconcsv: path to the demographic/economic CSV (no header row).
        buses: list of business-object dicts to augment in place.

    Returns:
        (buses, reviews, tips): the augmented business objects plus the
        lists of matching reviews and tips (with dates converted to ints).
    """
    # load the census tracts
    print('loading demographic and economic data from %s...' % in_demoeconcsv)
    demo_econ_data = csvutils.load_matrix(in_demoeconcsv, False)

    # initialize dictionaries to hold the first and last review dates and
    # add lookup data for demo & econ data
    print('initialize dictionaries...')
    first_review_dates = {}
    last_review_dates = {}
    demo_econ_lookup = {}
    for bus in buses:
        # add the business IDs for restaurants to the dictionaries
        bid = bus[fi.business_id]
        first_review_dates[bid] = None
        last_review_dates[bid] = None
        demo_econ_lookup[bid] = -1

    # initialize lookup table for demo and econ data
    print('initialize lookup table for demographic and economic data...')
    for i in range(demo_econ_data.shape[0]):
        bid = demo_econ_data[i, fi.census_bus_id_idx]
        if bid:
            demo_econ_lookup[bid] = i

    # collect the reviews that were written for one of the businesses in the
    # list of businesses and identify the first/last review/tip dates
    print('processing reviews from %s...' % in_revjson)
    reviews = _collect_dated_objects(in_revjson, first_review_dates,
                                     last_review_dates)

    # collect the tips that were written for one of the businesses in the
    # list of businesses and update the first/last review/tip dates
    print('processing tips from %s...' % in_tipjson)
    tips = _collect_dated_objects(in_tipjson, first_review_dates,
                                  last_review_dates)

    # copy the last review dates and census tracts into the business objects
    print('adding first/last review date and census tract to business objects...')
    for bus in buses:
        bid = bus[fi.business_id]
        first_review_date = first_review_dates[bid]
        last_review_date = last_review_dates[bid]
        is_closed = not bus[fi.is_open]
        demo_econ_idx = demo_econ_lookup[bid]
        if first_review_date is not None:
            bus[fi.first_review_date] = first_review_date
        if last_review_date is not None:
            bus[fi.last_review_date] = last_review_date
        if is_closed:
            # approximate a closed business's close date by its last
            # review/tip date
            bus[fi.close_date] = last_review_date
        if demo_econ_idx >= 0:
            add_demo_econ_data(bus, demo_econ_data[demo_econ_idx, :])

    # return the augmented business objects, list of reviews and list of tips
    return buses, reviews, tips