def gen_dataset_files(pdates, busjson, revjson, tipjson, outdir):
    """Generate and save one dataset file per prediction date.

    The business, review and tip objects are loaded once and reused for
    every prediction date.  For each date string in ``pdates`` a dataset is
    built with ``gen_dataset`` and written to ``<pdatestr>.json`` inside
    ``outdir``.

    Args:
        pdates: iterable of prediction-date strings; each one is parsed with
            ``str2date`` and is also used as the output file's base name.
        busjson: path handed to ``jsonutils.load_objects`` for the business
            objects.
        revjson: path handed to ``jsonutils.load_objects`` for the review
            objects.
        tipjson: path handed to ``jsonutils.load_objects`` for the tip
            objects.
        outdir: directory that receives the generated dataset files.
    """
    # Imported locally so this function does not depend on a module-level
    # ``import os`` being present.
    import os

    # load business objects (the second value returned is not used here)
    print('Loading business objects from %s...' % busjson)
    all_buses, _ = jsonutils.load_objects(busjson)

    # load review objects
    print('loading review objects from %s...' % revjson)
    all_reviews, _ = jsonutils.load_objects(revjson)

    # load tip objects
    print('loading tip objects from %s...' % tipjson)
    all_tips, _ = jsonutils.load_objects(tipjson)

    # generate the datasets
    for pdatestr in pdates:
        # convert prediction date to int (seconds since epoch)
        pdate = date2int(str2date(pdatestr))

        # generate the dataset for the specified prediction date
        print('generating dataset for prediction date %s (%d)...'
              % (pdatestr, pdate))
        buses = gen_dataset(pdate, all_buses, all_reviews, all_tips)

        # Build the output path with os.path.join rather than '+' so that an
        # empty ``outdir`` yields a relative path (not one under '/') and a
        # trailing separator on ``outdir`` is not doubled.
        outfile = os.path.join(outdir, pdatestr + '.json')

        # write dataset to file
        print('writing %d JSON objects to %s...' % (len(buses), outfile))
        jsonutils.save_objects(buses, outfile)
def run_script(pdate_str, busjson, revjson, tipjson, senticsv, outfile):
    """Generate the dataset for a single prediction date and save it.

    Loads the business, review and tip objects plus the sentiment ranking
    matrix, passes them to ``du.gen_dataset`` together with the prediction
    date, and writes the result to ``outfile``.

    Args:
        pdate_str: prediction date as a string parsed by ``du.str2date``.
        busjson: path handed to ``ju.load_objects`` for the business objects.
        revjson: path handed to ``ju.load_objects`` for the review objects.
        tipjson: path handed to ``ju.load_objects`` for the tip objects.
        senticsv: path handed to ``cu.load_matrix`` for the sentiment
            rankings; it is loaded with ``has_hdr=False``.
        outfile: path the generated data set is written to.
    """
    # convert pdate to seconds since the epoch
    pdate = du.date2int(du.str2date(pdate_str))

    # load business objects (the second value returned is not used here)
    print('Loading business objects from %s...' % busjson)
    all_buses, _ = ju.load_objects(busjson)

    # load review objects
    print('loading review objects from %s...' % revjson)
    all_reviews, _ = ju.load_objects(revjson)

    # load tip objects
    print('loading tip objects from %s...' % tipjson)
    all_tips, _ = ju.load_objects(tipjson)

    # load sentiment ranking data derived from tip and review data
    print('loading sentiment rankings from %s...' % senticsv)
    all_senti = cu.load_matrix(senticsv, has_hdr=False)

    # generate a data set for the specified prediction date
    print('generate data set for prediction date %s...' % pdate_str)
    buses = du.gen_dataset(pdate, all_buses, all_reviews, all_tips, all_senti)

    # write data set to file
    print('writing generated data set to %s...' % outfile)
    ju.save_objects(buses, outfile)
def filter_yelp_data(in_busjson, out_busjson, in_revjson, out_revjson,
                     in_tipjson, out_tipjson, in_demoeconcsv):
    """Load the Yelp business/review/tip data and write filtered copies.

    The restaurant objects are loaded with ``load_restaurants`` and then
    passed, together with the review, tip and demographic/economic inputs,
    to ``process_review_tip_census_data``.  The three resulting collections
    are saved with ``jsonutils.save_objects``, each with the matching
    feature-name list from ``fi`` supplied as ``attfilt``.

    Args:
        in_busjson: path of the input business JSON file.
        out_busjson: path the business objects are written to.
        in_revjson: path of the input review JSON file.
        out_revjson: path the review objects are written to.
        in_tipjson: path of the input tip JSON file.
        out_tipjson: path the tip objects are written to.
        in_demoeconcsv: path handed to ``process_review_tip_census_data``
            (presumably the census demographic/economic CSV -- confirm).
    """
    # feature-name lists passed as ``attfilt`` when saving
    # (presumably they restrict which attributes are written -- confirm
    # against jsonutils.save_objects)
    bus_feats = fi.bus_feat_names
    rev_feats = fi.rev_feat_names
    tip_feats = fi.tip_feat_names

    # load the restaurant objects (the second value returned is not used)
    print('loading business JSON objects from %s...' % in_busjson)
    objects, _ = load_restaurants(in_busjson)

    # load the review and tip objects and add first/last review/tip date
    # and census tract to objects
    objects, reviews, tips = process_review_tip_census_data(
        in_revjson, in_tipjson, in_demoeconcsv, objects)

    # NOTE: the CSV feature-matrix export (get_feature_matrix /
    # write_feature_matrix_csv) that used to run here is disabled.

    # write business data to file
    print('writing business JSON object to %s...' % out_busjson)
    jsonutils.save_objects(objects, out_busjson, attfilt=bus_feats)

    # write review data to file
    print('writing review JSON objects to %s...' % out_revjson)
    jsonutils.save_objects(reviews, out_revjson, attfilt=rev_feats)

    # write tip data to file
    print('writing tip JSON objects to %s...' % out_tipjson)
    jsonutils.save_objects(tips, out_tipjson, attfilt=tip_feats)