def run_script(pdate_str, busjson, revjson, tipjson, senticsv, outfile):
    # convert pdate to seconds since the epoch
    pdate = du.date2int(du.str2date(pdate_str))

    # load business objects
    print 'Loading business objects from %s...' % busjson
    all_buses, junk = ju.load_objects(busjson)

    # load review objects
    print 'loading review objects from %s...' % revjson
    all_reviews, junk = ju.load_objects(revjson)

    # load tip objects
    print 'loading tip objects from %s...' % tipjson
    all_tips, junk = ju.load_objects(tipjson)
    
    # load sentiment ranking data derived from tip and review data
    print 'loading sentiment rankings from %s...' % senticsv
    all_senti = cu.load_matrix(senticsv, has_hdr=False)

    # generate a data set the specified prediction date
    print('generate data set for prediction date %s...' % pdate_str)
    buses = du.gen_dataset(pdate, all_buses, all_reviews, all_tips, all_senti)
    
    # write data set to file
    print('writing generated data set to %s...' % outfile)
    ju.save_objects(buses, outfile)
# Example #2
# 0
def run_script(csvfile, periods, gap):
    # load CSV data
    print 'Loading matrix from %s...' % csvfile
    matrix, headers = cu.load_matrix(csvfile, True)
    
    # negative periods argument means plot all the data
    if (periods < 0):
        periods = matrix.shape[0]

    print '  plot data for %d periods...' % periods
    print '  plot every %dth data point...' % gap

    # the matrix is structured as follows:
    # -- the first row contains the column headers
    # -- the first column contains the generation numbers
    # -- the next eight columns contain data for the assessment module bits

    # set the font for the legend
    matplotlib.rcParams.update({'font.size': 8})
    fontP = FontProperties()
    fontP.set_size(8)

    # clear the plot
    plt.figure(figsize=(8,3))
    plt.clf()
    
    colors = {'b0':'b','b1':'g','b2':'r','b3':'c',
              'b4':'m','b5':'y','b6':'k','b7':'0.75',
              'payout':'b','max':'g','%max':'r'}
    for i in range(1,matrix.shape[1]-3):
        indices = np.arange(0,periods)
        indices = indices[::gap]
        plt.plot(indices, matrix[indices,i], linewidth=0.5, color=colors[headers[i].strip()], aa=True)

    # label the axes
    #if (csvfile.find('scounts') >=0):
    #    ylab = "strategy frequencies (%)"
    #elif (csvfile.find('spayouts') >= 0):
    #    ylab = "average payouts"
    #elif (csvfile.find('sfitness') >= 0):
    #    ylab = "average fitness"
    #else:
    #    ylab = "unknown data"
    ylab = "num of tribes"
    
    plt.ylabel(ylab)
    plt.xlabel("generations")
    
    # create the legend
    ax = plt.subplot(111)
    box = ax.get_position()
    ax.set_position([box.x0, box.y0 + box.height * 0.5,
                     box.width, box.height*0.5])
    plt.legend(headers[1::], loc='upper center', bbox_to_anchor=(0.5, -0.3),
               ncol=matrix.shape[1]-1, prop=fontP, frameon=False)

    # get file name for plot
    idx = csvfile.find('.csv')
    if (idx >= 0):
        pngfile = csvfile[0:idx] + '.png'
    else:
        pngfile = csvfile + '.png'

    # write the plot to a file
    plt.savefig(pngfile, bbox_inches='tight');
def run_script(busjson, revjson, tipjson, senticsv, init_pdate, delta, ctype=linsvm,
               usamp=True, binary=None, rfe=False, pca=-1, reg=False, feat_info=fi.data_feat_info,
               states=None):
    print 'Initial prediction date: %s' % init_pdate
    print 'Time delta: %d months' % delta
    if (states):
        print 'limiting data to restaurants in: %s' % str(states)

    # convert pdate to secondds since the epoch
    pdate = du.date2int(du.str2date(init_pdate))

    # load business objects
    print 'Loading business objects from %s...' % busjson
    all_buses, junk = ju.load_objects(busjson)

    # load review objects
    print 'loading review objects from %s...' % revjson
    all_reviews, junk = ju.load_objects(revjson)

    # load tip objects
    print 'loading tip objects from %s...' % tipjson
    all_tips, junk = ju.load_objects(tipjson)
    
    # load sentiment ranking data derived from tip and review data
    print 'loading sentiment rankings from %s...' % senticsv
    all_senti = cu.load_matrix(senticsv, has_hdr=False)

    # reduce the number of features using recursive feature elimination
    # - See http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html#example-plot-rfe-with-cross-validation-py
    # - See http://stackoverflow.com/questions/23815938/recursive-feature-elimination-and-grid-search-using-scikit-learn

    if (reg):
        # create the least squares linear regressor
        print 'using least squares linear regression...'
        c = linmod.LinearRegression()
        # grid search not supported for linear regression (???)
        param_grid = None
    elif (ctype==rbfsvm):
        # create RBF SVM to test
        #c = svm.NuSVC(kernel='rbf')
        c = svm.SVC(kernel='rbf')
        # configure parameter grid for grid search
        C_range = 10.0 ** np.arange(-3, 5)
        gamma_range = 10.0 ** np.arange(-4, 3)
        if (rfe):
            print 'RFE not currently supported for RBF SVM...'
            #c = fs.RFECV(c, step=1)
            #pgrid = []
            #for C in C_range:
            #    for gamma in gamma_range:
            #        pgrid.append({'C':C,'gamma':gamma})
            #pgrid = [{'gamma':0.5},{'gamma':0.1},{'gamma':0.01},{'gamma':0.001},{'gamma':0.0001}]
            #param_grid = {'estimator_params': pgrid}
        print 'using RBF SVM...'
        param_grid = dict(gamma=gamma_range, C=C_range)
    elif (ctype==knn):
        # create a KNN classifier
        c = neigh.KNeighborsClassifier()
        if (rfe):
            print 'RFE not currently supported for k-nearesrt neighbors...'
        print 'using k-mearest neighbors...'
        param_grid = {'n_neighbors':[1,2,3,4,5,6,7,8,9,10,15,20,25,30],
                      'weights':['uniform','distance'],
                      'p':[1,2,3,4,5,6,7,8,9,10]}
    elif (ctype==ada):
        # create boosted classifier
        c = ensemble.AdaBoostClassifier()
        if (rfe):
            print 'RFE not currently supported for AdaBoost...'
        print 'using AdaBoost...'
        param_grid = {'n_estimators':[5, 10, 25, 40, 50, 60, 75, 85, 100],
                      'learning_rate':[0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0]}
    elif (ctype==rf):
        # create random forest classifier
        c = ensemble.RandomForestClassifier()
        if (rfe):
            print 'RFE not currently supported for random forest...'
        print 'using random forest...'
        param_grid = {'n_estimators':[5, 10, 25, 40, 50, 60, 75, 85, 100],
                      'max_depth':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, None]}
    elif (ctype==dt):
        # create decision tree classifier
        c = tree.DecisionTreeClassifier()
        # max feats - subtract 1 because data feats includes the class label
        if (rfe):
            print 'RFE not supported with decision trees...'
        print 'using decision tree...'
        param_grid = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, None]}
    else:
        # create linear SVM to test
        c = svm.LinearSVC()
        # configure parameter grid for grid search
        C_range = 10.0 ** np.arange(-3, 5)
        if (rfe):
            print 'using linear SVM with RFE...'
            c = fs.RFECV(c, step=1)
            pgrid = []
            for C in C_range:
                pgrid.append({'C':C})
            #pgrid = [{'C':0.01},{'C':0.1},{'C':1},{'C':10},{'C':100},{'C':1000},{'C':10000}]
            param_grid = {'estimator_params': pgrid}
        else:
            print 'using linear SVM...'
            param_grid = {'C': C_range}

    # run the walk-forward cross validation and collect the results
    print('run walk-forward cross validation...')
    if (usamp):
        print('  under-sampling still open class...')
    else:
        print('  NOT under-sampling still open class...')
    results = wfcvutils.wfcv(c, param_grid, all_buses, all_reviews, all_tips, all_senti,
                             pdate, delta*du.month, pca=pca, usamp=usamp,
                             binary=binary, reg=reg, feat_info=feat_info, states=states)
    
    # combine the results to produce overall metrics
    y_true = None
    y_pred = None
    for r in results:
        if (y_true is None):
            y_true = r[0]
        else:
            y_true = np.hstack((y_true, r[0]))
        if (y_pred is None):
            y_pred = r[1]
        else:
            y_pred = np.hstack((y_pred, r[1]))

    # print out an overall classification report
    print('\n=========================================')
    print('Overall metrics for all prediction dates:\n')
    if (len(results) != 0):
        if (reg):
            wfcvutils.print_reg_metrics(y_true, y_pred)
        else:
            cm = metrics.confusion_matrix(y_true, y_pred)
            wfcvutils.print_cm(cm)
            #print(metrics.classification_report(y_true, y_pred, target_names=fi.class_names))
    else:
        print '  NO RESULTS\n'
def process_review_tip_census_data(in_revjson, in_tipjson, in_demoeconcsv, buses):
    # load the census tracts
    print 'loading demographic and economic data from %s...' % in_demoeconcsv
    demo_econ_data = csvutils.load_matrix(in_demoeconcsv,False)

    # initialize  dictionaries to hold the first and last review dates and
    # add lookup data for demo & econ data
    print 'initialize dictionaries...'
    first_review_dates = {}
    last_review_dates = {}
    demo_econ_lookup = {}
    for bus in buses:
        # add the business IDs for restaurants to the dictionaries
        bid = bus[fi.business_id]
        first_review_dates[bid] = None
        last_review_dates[bid] = None
        demo_econ_lookup[bid] = -1

    # initialize lookup table for demo and econ data
    print 'initialize lookup table for demographic and economic data...'
    for i in xrange(demo_econ_data.shape[0]):
        bid = demo_econ_data[i,fi.census_bus_id_idx]
        if (bid):
            demo_econ_lookup[bid] = i

    # collect the reviews that were written for one of the businesses in the list
    # of businesses and identify the first/last review/tip dates for each business
    reviews = []
    print 'processing reviews from %s...' % in_revjson
    with open(in_revjson, 'r') as fin:
        # there is one JSON object per line, iterate over the lines and load the JSON
        for line in fin:
            # load the JSON object as a dictionary
            review = json.loads(line)

            # if the review is for one of the requested businesses then update
            # the current first/last review/tip date for that business if necessary
            bid = review[fi.business_id]
            if (bid in last_review_dates):
                # append this review to the list of reviews
                reviews.append(review)
                # process review dates
                review_date = date2int(str2date(review[fi.date]))
                review[fi.date] = review_date
                # process first and last review/tip dates
                current_first = first_review_dates[bid]
                current_last = last_review_dates[bid]
                # if this review date is earlier than the current first review/tip
                # date then set the first review/tip date to this review date
                if (current_first is None or current_first > review_date):
                    first_review_dates[bid] = review_date
                # if this review date is more recent than the current last review/tip
                # date then set the last review/tip date to this review date
                if (current_last is None or current_last < review_date):
                    last_review_dates[bid] = review_date

    # collect the tips that were written for one of the businesses in the list
    # of businesses and update the first/last review/tip dates for each business
    tips = []
    print 'processing tips from %s...' % in_tipjson
    with open(in_tipjson, 'r') as fin:
        # there is one JSON object per line, iterate over the lines and load the JSON
        for line in fin:
            # load the JSON object as a dictionary
            tip = json.loads(line)

            # if the tip is for one of the requested businesses then update
            # the current first/last review/tip date for that business if necessary
            bid = tip[fi.business_id]
            if (bid in last_review_dates):
                # append this tip to the list of tips
                tips.append(tip)
                # process tip dates
                tip_date = date2int(str2date(tip[fi.date]))
                tip[fi.date] = tip_date
                # process first and last review/tip dates
                current_first = first_review_dates[bid]
                current_last = last_review_dates[bid]
                # if this tip date is earlier than the current first review/tip
                # date then set the first review/tip date to this review date
                if (current_first is None or current_first > tip_date):
                    first_review_dates[bid] = tip_date
                # if this tip date is more recent than the current last review/tip
                # date then set the last review/tip date to this review date
                if (current_last is None or current_last < tip_date):
                    last_review_dates[bid] = tip_date

    # copy the last review dates and census tracts into the business objects
    print 'adding first/last review date and census tract to business objects...'
    for bus in buses:
        bid = bus[fi.business_id]
        first_review_date = first_review_dates[bid]
        last_review_date = last_review_dates[bid]
        is_closed = not bus[fi.is_open]
        demo_econ_idx = demo_econ_lookup[bid]
        if (first_review_date is not None):
            bus[fi.first_review_date] = first_review_date
        if (last_review_date is not None):
            bus[fi.last_review_date] = last_review_date
            if (is_closed):
                bus[fi.close_date] = last_review_date
        if (demo_econ_idx >= 0):
            add_demo_econ_data(bus, demo_econ_data[demo_econ_idx,:])

    # return the augmented business objects, list of reviews and list of tips
    return buses, reviews, tips