def file_download(self, sc, sqlContext, Row, cl, sz, links, local_path, hdfs_path, querydata_path):
    try:
        logging.info('entered into file_download function')
        import wget, os
        l = str()
        for i in links:
            if spark_conf.values_311data['file_download'] == '1':
                print '++++++++++++entered++++++++++++'
                # wget.download returns the name of the file it saved locally;
                # push that file to HDFS and then remove the local copy.
                file_name = wget.download('http://hfdapp.houstontx.gov/311/' + i)
                call(['hdfs', 'dfs', '-put', '-f', file_name, hdfs_path])
                os.remove(file_name)
        return_value, return_date = self.log_checking(sc, sqlContext, Row, sz)
        if return_value in (0, 1):
            df_anc = self.dataframe_creation(sc, sqlContext, Row, cl, links, local_path,
                                             hdfs_path, querydata_path, return_date)
            print df_anc.count()
            if df_anc.count() > 0:
                print 'Not Empty'
                if 1 == cl.nlc(df_anc):
                    if 1 == cl.confident_classifier(sqlContext, df_anc)[1]:
                        print 'finished'
                        # Move command here
            # Nine-field, pipe-delimited zip_table log record:
            # quered_zipcode | original_zipcode | lat | long | source | retrieved_time | restaurant_id | count | check_value
            l = l + ('Nan' + '|' + 'Nan' + '|' + 'Nan' + '|' + 'Nan' + '|' + '311_data' + '|'
                     + str(spark_conf.utctime.replace(':', '-')) + '|' + 'Nan' + '|' + 'Nan' + '|' + '3')
            print l
            sc.parallelize([l]).saveAsTextFile(
                spark_conf.hdfs_path['zip_table'] + '%s' % (spark_conf.utc_time()[1]))
    except Exception as e:
        logging.info('error in file_download func: %s' % str(e))
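
# --- illustrative sketch, not part of the original module --------------------
# The nine-field, pipe-delimited log record assembled above by repeated string
# concatenation can equivalently be built with a join; build_log_record is a
# hypothetical helper named here only for illustration.
def build_log_record(fields):
    """Join log fields with '|' after coercing each one to str."""
    return '|'.join(str(f) for f in fields)

# Example mirroring the 311 record written to zip_table above:
#   build_log_record(['Nan', 'Nan', 'Nan', 'Nan', '311_data',
#                     spark_conf.utctime.replace(':', '-'), 'Nan', 'Nan', '3'])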
def confident_classifier(self, sqlContext, cc_input):
    # try:
    logging.info('Adding Confidence and class to the file')
    # Classifier output for the current batch, one row per review, parsed back from JSON.
    df3 = pd.read_json(spark_conf.file_path['otherfiles_path'] + 'review_{}.json'.format('classifier'))
    df4 = df3.copy(deep=True)
    df5 = cc_input.toPandas()
    df6 = df5.copy(deep=True)
    if len(df6.review_text) > 0:
        class_name_list = []
        confidence_list = []
        # Keep only the top-ranked class and its confidence for every review.
        for rows in df4.iterrows():
            class_name_list.append(rows[1]['classes'][0]['class_name'])
            confidence_list.append(rows[1]['classes'][0]['confidence'])
        df6['class_name'] = class_name_list
        df6['confidence'] = confidence_list
        print df6.head()
        if spark_conf.program_execution['311_data'] == '1':
            df_311 = sqlContext.createDataFrame(df6)
            df_311.rdd.map(lambda x: list(x)).map(lambda y: filter_data(y)).saveAsTextFile(
                spark_conf.hdfs_path['classifier_output'] + '311_data_%s.txt' % spark_conf.utc_time()[1])
        # df6.to_csv(spark_conf.classifier_input['output_file'] + 'Classified_Output_test.txt',
        #            sep='|', index=False, encoding='ISO-8859-1')
        return df6, 1
    else:
        print 'No Classification Available'
        # Return the same (dataframe, flag) shape as the success path so callers
        # that index [1] do not crash when nothing was classified.
        return None, 0
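
# --- illustrative sketch, not part of the original module --------------------
# The class_name / confidence loop above walks the parsed classifier JSON frame
# row by row; the same extraction written as list comprehensions, assuming each
# row's 'classes' list is ordered by descending confidence (which the code
# above already assumes by taking index 0). top_classes is a hypothetical name.
def top_classes(df_json):
    """Return parallel lists of the top class_name and confidence per review."""
    names = [row['classes'][0]['class_name'] for _, row in df_json.iterrows()]
    confs = [row['classes'][0]['confidence'] for _, row in df_json.iterrows()]
    return names, confs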
def linear_regression(self, lr_input, sqlContext, cc_output):
    # try:
    logging.info('Performing Regression')
    # Historical encoded/classified reviews used as the training set.
    dfi = pd.read_csv(spark_conf.file_path['otherfiles_path'] + 'Encoded_Classified_1.txt',
                      sep='|', encoding='ISO-8859-1')
    dfi_test_new = cc_output
    dfi_test = lr_input
    input_list = list(dfi_test)
    corr = dfi.corr()
    # sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)
    feature_cols = [
        'likes', 'comment_count', 'user_level_num', 'Average', 'Avoid!', 'Blah!',
        'Good Enough', 'Great!', 'Insane!', 'Not rated', 'Very Bad', 'Well...',
        'Big Foodie', 'Connoisseur', 'Foodie', 'Super Foodie', 'Bad Ambience',
        'Bad Food', 'Bad Service', 'Good Ambience', 'Good Food', 'Good Service',
        'Not Worthy', 'binarized_user_foodie_level', 'binarized_rating_text',
        'binarized_class_name'
    ]
    # Only use the features that actually exist in this batch's encoded frame.
    feature_cols_1 = list(set(input_list).intersection(feature_cols))
    X_train = dfi[:-1]
    X_test = dfi_test[0:]
    y_train = dfi.confidence[:-1]
    y_test = dfi_test.confidence[0:]
    X = X_train[feature_cols_1]
    y = y_train
    Xtest = X_test[feature_cols_1]
    regr = linear_model.Lasso(alpha=0.0000000001, fit_intercept=True, normalize=False,
                              precompute=False, copy_X=True, max_iter=1000, tol=0.0001,
                              warm_start=False, positive=False, random_state=None,
                              selection='cyclic')
    regr.fit(X, y)
    shuffle = cross_validation.KFold(len(X), n_folds=10, shuffle=True, random_state=0)
    scores = cross_validation.cross_val_score(regr, X, y, cv=shuffle)
    # print 'Accuracy: %.3f%% (%.3f%%)' % (scores.mean() * 100.0, scores.std() * 100.0)
    # print mean_squared_error(regr.predict(Xtest), y_test) ** 0.5
    se = pd.Series(regr.predict(Xtest))
    dfi_test_new['score'] = se.values
    print list(dfi_test_new)
    df_s = sqlContext.createDataFrame(dfi_test_new)
    df_s.select(
        'comment_count', 'foodie_color', 'likes', 'profile_image', 'profile_url', 'rating',
        'rating_color', 'rating_text', 'rating_time_friendly', 'restaurant_id',
        'retrieved_time', 'review_id', 'review_text', 'time_stamp', 'user_foodie_level',
        'user_level_num', 'user_name', 'user_zomatohandle', 'class_name', 'confidence',
        'score').rdd.map(lambda x: list(x)).map(lambda y: filter_data(y)).saveAsTextFile(
            spark_conf.hdfs_path['classifier_output'] + '%s' % spark_conf.utc_time()[1])
    # dfi_test.to_csv(spark_conf.classifier_input['output_file'] + 'final_Output.txt',
    #                 sep='|', encoding='ISO-8859-1')
    return 1
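
# --- illustrative sketch, not part of the original module --------------------
# Self-contained demo of the Lasso + 10-fold scoring pattern used above, on
# synthetic data, for testing the regression step without Encoded_Classified_1.txt.
# It uses the same (old) sklearn.cross_validation API as the method above; on
# current scikit-learn the equivalent lives in sklearn.model_selection.
def _demo_lasso_cv():
    import numpy as np
    from sklearn import linear_model, cross_validation
    X_demo = np.random.rand(100, 3)
    y_demo = X_demo.dot([0.5, -0.2, 0.1]) + 0.01 * np.random.rand(100)
    demo_model = linear_model.Lasso(alpha=0.0000000001, max_iter=1000)
    demo_model.fit(X_demo, y_demo)
    folds = cross_validation.KFold(len(X_demo), n_folds=10, shuffle=True, random_state=0)
    scores = cross_validation.cross_val_score(demo_model, X_demo, y_demo, cv=folds)
    print 'mean CV R^2: %.3f (+/- %.3f)' % (scores.mean(), scores.std())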
def dataframe_creation(self, sc, sqlContext, Row, cl, links, local_path, hdfs_path, querydata_path, date_311):
    try:
        logging.info('entered into dataframe_creation function')
        print date_311
        rdd_data = sc.textFile(hdfs_path)
        header = rdd_data.first()
        rdd_new = rdd_data.filter(lambda line: line != header).map(lambda x: x.split('|')).map(
            lambda z: Row(case_number=z[0], sr_location=z[1], county=z[2], district=z[3],
                          neighborhood=z[4], tax_id=z[5], trash_quad=z[6], recycle_quad=z[7],
                          trash_day=z[8], heavytrash_day=z[9], recycle_day=z[10], key_map=z[11],
                          management_district=z[12], department=z[13], division=z[14],
                          sr_type=z[15], queue=z[16], sla=z[17], status=z[18],
                          sr_createdate=z[19], due_date=z[20], date_closed=z[21], overdue=z[22],
                          title=z[23], x=z[24], y=z[25], latitude=z[26], longitude=z[27],
                          channel_type=z[28]))
        sqlContext.createDataFrame(rdd_new).registerTempTable('311data')
        # Keep only the service-request types of interest that were created after date_311.
        df_new = sqlContext.sql(
            'select * from 311data where regexp_replace(lower(trim(sr_type))," ","") in '
            '("waterleak","poordrainage","airpollution","crisiscleanup","deadanimalcollection",'
            '"drainage","drainagesystemviolation","healthcode","heavytrashviolation",'
            '"majorwaterleak","missedgarbagepickup","missedheavytrashpickup",'
            '"missedyardewastepickup","pestcontrol","poolwaterqualitycontrol","sewermanhole",'
            '"sewerwastewater","sidewalkrepair","streethazard","waterplaygroundrepair",'
            '"waterquality","waterservice","waterorgroundpollution") '
            'and to_utc_timestamp(sr_createdate,"US/Central") > "%s" ' % date_311).select(
                'case_number', 'sr_location', 'county', 'district', 'neighborhood', 'tax_id',
                'trash_quad', 'recycle_quad', 'trash_day', 'heavytrash_day', 'recycle_day',
                'key_map', 'management_district', 'department', 'division', 'sr_type', 'queue',
                'sla', 'status', 'sr_createdate', 'due_date', 'date_closed', 'overdue', 'title',
                'x', 'y', 'latitude', 'longitude', 'channel_type')
        df_new.show()
        df_new.rdd.map(lambda y: list(y)).map(lambda z: rdd_structure(z)).coalesce(1).saveAsTextFile(
            querydata_path + '311query_data_%s.txt' % (spark_conf.utc_time()[1]))
        # Pad the 311 records with placeholder review columns so they share the
        # classifier's input schema, with sr_type standing in for review_text.
        df_anc = df_new
        for c in ['311_rating', '311_color', '311_rating_text', '311_likes', '311_comment_count',
                  '311_user_name', '311_user_handle', '311_user_foodie_level',
                  '311_user_level_num', '311_foodie_color', '311_profile_url', '311_user_image']:
            df_anc = df_anc.withColumn(c, lit('Nan'))
        df_anc = df_anc.withColumn(
            '311_retrieved_time', lit(spark_conf.retrieved_time.replace(':', '-'))
        ).withColumnRenamed('sr_type', 'review_text').select(
            'tax_id', '311_rating', 'case_number', 'review_text', '311_color', 'date_closed',
            '311_rating_text', 'sr_createdate', '311_likes', '311_comment_count',
            '311_user_name', '311_user_handle', '311_user_foodie_level', '311_user_level_num',
            '311_foodie_color', '311_profile_url', '311_user_image', '311_retrieved_time')
        return df_anc
    except Exception as e:
        logging.info('error in dataframe_creation func: %s' % str(e))
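
# --- illustrative sketch, not part of the original module --------------------
# The 29-keyword Row(...) call above can also be driven by a column list; the
# names below are copied from dataframe_creation, and to_311_row is a
# hypothetical helper shown only to document the record layout.
from pyspark.sql import Row as _Row

COLS_311 = ['case_number', 'sr_location', 'county', 'district', 'neighborhood', 'tax_id',
            'trash_quad', 'recycle_quad', 'trash_day', 'heavytrash_day', 'recycle_day',
            'key_map', 'management_district', 'department', 'division', 'sr_type', 'queue',
            'sla', 'status', 'sr_createdate', 'due_date', 'date_closed', 'overdue', 'title',
            'x', 'y', 'latitude', 'longitude', 'channel_type']
Row311 = _Row(*COLS_311)

def to_311_row(line):
    """Split one pipe-delimited 311 record and build a Row in column order."""
    return Row311(*line.split('|'))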
def reviews_data(self, sc, cl, sqlContext, zip_code, api_link, j, apikey, comp_date, lt):
    # try:
    logging.info('entered into reviews data function')
    print api_link + 'res_id=' + str(j) + '&apikey=' + str(apikey)
    reviews = json.loads(requests.get(api_link + 'res_id=' + str(j) + '&apikey=' + str(apikey)).text)
    rd = []

    def fmt(container, key, kind='str'):
        # Shared formatter for the pipe-delimited record: 'unknown' when the key is
        # missing, otherwise the same cleaning the field-by-field code applied.
        if key not in container:
            return 'unknown'
        value = container[key]
        if kind == 'text':
            return value.replace('\r', '').replace('\n', '').encode('utf-8').strip()
        if kind == 'utf8':
            return value.encode('utf-8').strip()
        if kind == 'raw':
            return value
        return str(value)

    if 'user_reviews' in reviews and len(reviews['user_reviews']) > 0:
        for k in reviews['user_reviews']:
            if 'review' not in k:
                continue
            review = k['review']
            created = datetime.utcfromtimestamp(review['timestamp']).replace(tzinfo=pytz.UTC)
            print comp_date.replace(tzinfo=pytz.UTC)
            print created.strftime('%Y-%m-%d %H:%M:%S')
            # Only keep reviews newer than the last retrieval date.
            if created > comp_date.replace(tzinfo=pytz.UTC):
                fields = [str(j),
                          fmt(review, 'rating'),
                          fmt(review, 'id'),
                          fmt(review, 'review_text', 'text'),
                          fmt(review, 'rating_color', 'utf8'),
                          fmt(review, 'rating_time_friendly', 'raw'),
                          fmt(review, 'rating_text', 'text'),
                          created.strftime('%Y-%m-%d %H:%M:%S'),
                          fmt(review, 'likes'),
                          fmt(review, 'comments_count')]
                user = review.get('user', {})  # a missing user block now yields 'unknown' fields
                fields += [fmt(user, 'name', 'utf8'),
                           fmt(user, 'zomato_handle', 'utf8'),
                           fmt(user, 'foodie_level', 'utf8'),
                           fmt(user, 'foodie_level_num'),
                           fmt(user, 'foodie_color', 'utf8'),
                           fmt(user, 'profile_url', 'utf8'),
                           fmt(user, 'profile_image', 'utf8')]
                fields.append(str(spark_conf.retrieved_time.replace(':', '-')))
                rd.append('|'.join(fields))
    print rd
    print len(rd)
    # zip_table log record: retrieval time, restaurant id, review count, check_value 1 (Zomato).
    lt = lt + str(spark_conf.utctime.replace(':', '-')) + '|' + str(j) + '|' + str(len(rd)) + '|' + '1'
    print lt
    sc.parallelize([lt.strip()]).saveAsTextFile(
        spark_conf.hdfs_path['zip_table'] + '%s_%s_%s.txt' % (zip_code, j, spark_conf.utc_time()[1]))
    if len(rd) > 0:
        if not sc.parallelize(rd).isEmpty():
            sc.parallelize(rd).filter(lambda y: y != '').map(lambda x: x.split('|')).map(
                lambda x: '|'.join(x[:18])).saveAsTextFile(
                    spark_conf.hdfs_path['temp_review_data'] + '%s_%s_%s.txt'
                    % (zip_code, j, spark_conf.utc_time()[1]))
            logging.info('completed saving rdd datafiles in hdfs')
        else:
            return 'None'
    else:
        return 'None'
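
# --- illustrative sketch, not part of the original module --------------------
# The freshness check above converts Zomato's epoch-seconds timestamp into an
# aware UTC datetime before comparing it with comp_date; the same conversion in
# isolation (epoch_to_utc is a hypothetical name).
import pytz
from datetime import datetime

def epoch_to_utc(epoch_seconds):
    """Turn Zomato's integer review timestamp into a timezone-aware UTC datetime."""
    return datetime.utcfromtimestamp(epoch_seconds).replace(tzinfo=pytz.UTC)

# Example: epoch_to_utc(1512157979) > comp_date.replace(tzinfo=pytz.UTC)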
def zomato_location(self, sc, sqlContext, zipcode, api_link, lat_value, long_value, radius,
                    category_by, sort_by, order_by, apikey):
    # try:
    logging.info('entered into zomato location')
    lc = str()
    p = []
    url = (api_link + 'lat=' + str(lat_value) + '&lon=' + str(long_value) + '&radius=' +
           str(radius) + '&category=' + str(category_by) + '&sort=' + str(sort_by) +
           '&order=' + str(order_by) + '&apikey=' + apikey)
    print url
    search = json.loads(requests.get(url).text)

    def fmt(container, key, kind='str'):
        # Shared formatter for the 19-field pipe-delimited restaurant record.
        if key not in container:
            return 'unknown'
        value = container[key]
        if kind == 'ascii':
            return str(value.encode('ascii', 'ignore').decode('ascii'))
        if kind == 'raw':
            return value
        if kind == 'text':
            return str(value.replace('\r', '').replace('\n', ''))
        return str(value)

    if 'restaurants' in search and len(search['restaurants']) > 0:
        for i in search['restaurants']:
            if 'restaurant' not in i:
                continue
            rest = i['restaurant']
            loc = rest.get('location', {})        # missing sub-dicts now produce 'unknown'
            rating = rest.get('user_rating', {})  # fields instead of a short record
            fields = [fmt(rest, 'apikey'),
                      fmt(rest, 'id'),
                      fmt(rest, 'name', 'ascii'),
                      fmt(rest, 'url', 'ascii'),
                      fmt(rest, 'utm_source'),
                      fmt(loc, 'address', 'raw'),
                      fmt(loc, 'locality', 'raw'),
                      fmt(loc, 'city', 'raw'),
                      fmt(loc, 'city_id'),
                      fmt(loc, 'latitude', 'raw'),
                      fmt(loc, 'longitude', 'raw'),
                      fmt(loc, 'zipcode', 'raw'),
                      fmt(loc, 'country_id'),
                      fmt(loc, 'locality_verbose', 'raw'),
                      fmt(rating, 'aggregate_rating'),
                      fmt(rating, 'rating_text', 'text'),
                      fmt(rating, 'rating_color'),
                      fmt(rating, 'votes'),
                      str(spark_conf.retrieved_time.replace(':', '-'))]
            p.append('|'.join(fields).encode('utf-8', 'replace'))
    if len(p) > 0:
        sc.parallelize(p).filter(lambda y: y != '').map(lambda x: x.split('|')).map(
            lambda x: '|'.join(x[:19])).saveAsTextFile(
                spark_conf.hdfs_path['restaurant_data'] + '%s_%s.txt'
                % (zipcode, spark_conf.utc_time()[1]))
        rdd_rest = sc.parallelize(p).map(lambda x: x.split('|')).map(
            lambda y: Row(apikey=y[0], rest_id=int(y[1]), rest_name=y[2], rest_url=y[3],
                          rest_utmsource=y[4], rest_addr=y[5], rest_loclty=y[6], rest_city=y[7],
                          res_cityid=y[8], rest_lat=y[9], rest_long=y[10], rest_zpcode=y[11],
                          rest_countryid=y[12], loc_verbose=y[13], agg_rat=y[14],
                          rating_text=y[15], rating_color=y[16], votes=y[17], rt=y[18]))
        df_rest = sqlContext.createDataFrame(rdd_rest)
        # Build the 'zip,rest_id,rest_zip,lat,long|...' index consumed by the reviews loop.
        for i in df_rest.select('rest_id', 'rest_zpcode', 'rest_lat', 'rest_long').collect():
            lc = lc + (str(zipcode) + ',' + str(i['rest_id']) + ',' + str(i['rest_zpcode']) + ','
                       + str(i['rest_lat']) + ',' + str(i['rest_long']) + '|')
    print lc
    return lc
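
# --- illustrative sketch, not part of the original module --------------------
# The search URL above is assembled by string concatenation; the same query
# string built from a parameter list (zomato_search_url is a hypothetical name,
# and it assumes api_link already ends with '?').
def zomato_search_url(api_link, lat, lon, radius, category, sort, order, apikey):
    """Rebuild the Zomato /search URL used by zomato_location."""
    params = [('lat', lat), ('lon', lon), ('radius', radius), ('category', category),
              ('sort', sort), ('order', order), ('apikey', apikey)]
    return api_link + '&'.join('%s=%s' % (name, str(value)) for name, value in params)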
def yelp_looping(self, sc, sqlContext, jsondata, cl, zip_code, lat_v, long_v):
    try:
        logging.info('entered into yelp looping')
        url_params = {}
        r_id = self.yelp_restaurant(sc, sqlContext, jsondata)
        print r_id

        def fetch_reviews(compare_date, loop_tag):
            # One pass over the 'res_id,zip|...' index returned by yelp_restaurant,
            # pulling reviews newer than compare_date for every restaurant.
            for i in r_id.split('|'):
                print i
                print i.decode('ascii', 'ignore').encode('ascii')
                print loop_tag
                if i != '':
                    review_path = spark_conf.yelp['REVIEW_PATH'].replace(
                        'id', i.split(',')[0].decode('ascii', 'ignore').encode('ascii'))
                    print review_path
                    yl = str()
                    yl = yl + (str(zip_code) + '|' + i.split(',')[1] + '|' + str(lat_v) + '|'
                               + str(long_v) + '|' + spark_conf.yelp['API_HOST'] + '|')
                    print yl
                    review_data = self.request(spark_conf.yelp['API_HOST'], review_path,
                                               spark_conf.yelp['access_token'],
                                               url_params=url_params)
                    self.yelp_reviews(sc, sqlContext, review_data, i.split(',')[0], zip_code,
                                      yl, compare_date)

        res = call(['hdfs', 'dfs', '-test', '-f', spark_conf.hdfs_path['zip_table'] + '*/p*'])
        if res == 0:
            zp_log = sc.textFile(spark_conf.hdfs_path['zip_table'] + '*').map(
                lambda x: x.split('|')).map(lambda l: filter(None, l)).map(
                    lambda z: Row(quered_zipcode=(z[0]), original_zipcode=(z[1]), lat_value=z[2],
                                  long_value=z[3], sm_name=z[4], retrieving_time=z[5],
                                  restaurant_id=z[6], count_val=z[7], check_value=int(z[8])))
            sqlContext.createDataFrame(zp_log).registerTempTable('temp_yelp_log')
            f = sqlContext.sql('select count(*) as count from temp_yelp_log').collect()
            if f[0]['count'] > 0:
                q_str = ("select count(*) as count from temp_yelp_log where trim(lat_value) = "
                         "trim('%s') and trim(long_value) = trim('%s') and check_value = 2"
                         % (lat_v, long_v))
                print q_str
                c = sqlContext.sql(q_str).collect()
                print c[0]['count']
                if c[0]['count'] > 0:
                    # This location was pulled before: only fetch reviews newer than the
                    # latest logged retrieval time.
                    dt = sqlContext.sql(
                        "select max(cast(from_unixtime(unix_timestamp(retrieving_time,"
                        "'yyyy-MM-dd HH:mm:ss')) as timestamp)) as max_date from temp_yelp_log "
                        "where trim(lat_value) = trim('%s') and trim(long_value) = trim('%s') "
                        "and check_value = 2" % (lat_v, long_v)).collect()
                    print dt
                    previous_date = dt[0]['max_date']
                    print previous_date
                    fetch_reviews(previous_date, '1st loop')
                elif len(r_id) > 0:
                    fetch_reviews(self.input_date(), '2nd loop')
            elif len(r_id) > 0:
                fetch_reviews(self.input_date(), '3rd loop')
        d = call(['hdfs', 'dfs', '-test', '-f',
                  spark_conf.hdfs_path['yelp_temp_review_data'] + '*/p*'])
        if d == 0:
            r_path = spark_conf.hdfs_path['yelp_temp_review_data'] + '*'
            print r_path
            rdd_data = sc.textFile(r_path, use_unicode=False)
            if rdd_data.count() > 0:
                print rdd_data.count()
                rdd_r = rdd_data.filter(lambda y: y != '').map(lambda x: x.split('|')).map(
                    lambda z: Row(restaurant_id=z[0], time_created=z[1], review_id=z[2],
                                  rating=z[3], url=z[4], review_text=z[5], user_name=z[6],
                                  user_image=z[7], retrieved_time=z[8]))
                df_old = sqlContext.createDataFrame(rdd_r)
                # Pad the Yelp records with the Zomato-only columns so both sources share
                # the classifier's input schema.
                df_new = df_old.select('*')
                for c in ['rating_color', 'rating_time_friendly', 'rating_text', 'likes',
                          'comment_count', 'user_zomatohandle', 'user_foodie_level',
                          'user_level_num', 'foodie_color', 'profile_url']:
                    df_new = df_new.withColumn(c, lit('None').cast(StringType()))
                data_r = df_new.select(
                    'restaurant_id', 'rating', 'review_id', 'review_text', 'rating_color',
                    'rating_time_friendly', 'rating_text', 'time_created', 'likes',
                    'comment_count', 'user_name', 'user_zomatohandle', 'user_foodie_level',
                    'user_level_num', 'foodie_color', 'profile_url', 'user_image',
                    'retrieved_time')
                data_r.show()
                if 1 == cl.nlc(data_r):
                    # Classify once and reuse both the dataframe and the status flag.
                    classified_df, status = cl.confident_classifier(sqlContext, data_r)
                    if 1 == status:
                        sqlContext.createDataFrame(classified_df).withColumn(
                            'score', lit('None').cast(StringType())).select(
                                'comment_count', 'foodie_color', 'likes', 'user_image',
                                'profile_url', 'rating', 'rating_color', 'rating_text',
                                'rating_time_friendly', 'restaurant_id', 'retrieved_time',
                                'review_id', 'review_text', 'time_created',
                                'user_foodie_level', 'user_level_num', 'user_name',
                                'user_zomatohandle', 'class_name', 'confidence',
                                'score').rdd.map(lambda x: list(x)).map(
                                    lambda y: filter_data(y)).saveAsTextFile(
                                        spark_conf.hdfs_path['classifier_output'] + '%s.txt'
                                        % spark_conf.utc_time()[1])
                        call(['hdfs', 'dfs', '-mv',
                              spark_conf.hdfs_path['yelp_temp_review_data'] + '*',
                              spark_conf.hdfs_path['yelp_final_review_data']])
    except Exception as e:
        logging.info('error in yelp loop function %s' % str(e))
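
# --- illustrative sketch, not part of the original module --------------------
# Both existence probes above shell out to `hdfs dfs -test -f`, which exits 0
# when the path (or glob, as used here) matches; hdfs_path_exists is a
# hypothetical wrapper for that pattern.
from subprocess import call as _call

def hdfs_path_exists(path_glob):
    """Return True when `hdfs dfs -test -f` reports that the path/glob exists."""
    return _call(['hdfs', 'dfs', '-test', '-f', path_glob]) == 0

# Example: hdfs_path_exists(spark_conf.hdfs_path['zip_table'] + '*/p*')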
def yelp_reviews(self, sc, sqlContext, review_data, r_id, zip_code, yl, c_date):
    try:
        logging.info('entered into yelp reviews')
        yr = []
        if 'reviews' in review_data and len(review_data['reviews']) > 0:
            for i in review_data['reviews']:
                print 'entered reviews loop'
                created = self.convert_datetime_timezone(
                    i['time_created'], spark_conf.yelp['timezone_initial'],
                    spark_conf.yelp['timezone_final'])
                print created
                print c_date.replace(tzinfo=pytz.UTC).strftime('%Y-%m-%d %H:%M:%S')
                # Only keep reviews newer than the comparison date.
                if created > c_date.replace(tzinfo=pytz.UTC):
                    rd = str()
                    rd = rd + r_id + '|'
                    if 'time_created' in i:
                        rd = rd + created.strftime('%Y-%m-%d %H:%M:%S') + '|'
                    else:
                        rd = rd + 'unknown' + '|'
                    # Yelp's payload carries no review id, so one is minted from uuid4
                    # plus the converted creation time.
                    rd = rd + str(uuid.uuid4()) + '_' + created.strftime('%Y-%m-%d %H:%M:%S') + '|'
                    if 'rating' in i:
                        rd = rd + str(i['rating']) + '|'
                    else:
                        rd = rd + 'unknown' + '|'
                    if 'url' in i:
                        rd = rd + i['url'].encode('utf-8') + '|'
                    else:
                        rd = rd + 'unknown' + '|'
                    if 'text' in i:
                        rd = rd + i['text'].replace('\r', '').replace('\n', '').encode('utf-8').strip() + '|'
                    else:
                        rd = rd + 'unknown' + '|'
                    user = i.get('user', {})  # a missing user block now yields 'unknown' fields
                    if 'name' in user:
                        rd = rd + user['name'].encode('utf-8') + '|'
                    else:
                        rd = rd + 'unknown' + '|'
                    if 'image_url' in user:
                        rd = rd + str(user['image_url']) + '|'
                    else:
                        rd = rd + 'unknown' + '|'
                    rd = rd + str(spark_conf.utctime)
                    yr.append(rd)
        print len(yr)
        # zip_table log record: retrieval time, restaurant id, review count, check_value 2 (Yelp).
        yl = yl + str(spark_conf.utctime) + '|' + r_id + '|' + str(len(yr)) + '|' + '2'
        print yl
        sc.parallelize([yl.strip()]).saveAsTextFile(
            spark_conf.hdfs_path['zip_table'] + '%s_%s_%s.txt'
            % (zip_code, r_id, spark_conf.utc_time()[1]))
        if len(yr) > 0:
            if not sc.parallelize(yr).isEmpty():
                sc.parallelize(yr).filter(lambda y: y != '').map(lambda x: x.split('|')).map(
                    lambda x: '|'.join(x[:9])).saveAsTextFile(
                        spark_conf.hdfs_path['yelp_temp_review_data'] + '%s_%s.txt'
                        % (zip_code, spark_conf.utc_time()[1]))
    except Exception as e:
        logging.info('error in yelp reviews function %s' % str(e))
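
# --- illustrative sketch, not part of the original module --------------------
# The synthetic review-id scheme used above, in isolation (make_review_id is a
# hypothetical name): a uuid4 joined to the converted creation timestamp.
import uuid as _uuid

def make_review_id(created_ts_string):
    """Mint a synthetic review id of the form '<uuid4>_<YYYY-mm-dd HH:MM:SS>'."""
    return '%s_%s' % (_uuid.uuid4(), created_ts_string)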
def yelp_restaurant(self, sc, sqlContext, jsondata):
    try:
        logging.info('entered into yelp restaurant')
        yr = []
        ylp = str()

        def fmt(container, key, kind='str'):
            # Shared formatter for the 19-field pipe-delimited restaurant record.
            if key not in container:
                return 'unknown'
            value = container[key]
            if kind == 'ascii':
                return value.encode('ascii', 'ignore').strip()
            if kind == 'utf8':
                return value.encode('utf-8').strip()
            if kind == 'strip':
                return value.strip()
            return str(value)

        if 'businesses' in jsondata and len(jsondata['businesses']) > 0:
            print 'entered into business'
            # The region centre is reported once per response, not per business.
            center = jsondata.get('region', {}).get('center', {})
            for i in jsondata['businesses']:
                print 'entered loop'
                loc = i.get('location', {})  # a missing location now yields 'unknown' fields
                fields = [str(spark_conf.yelp['access_token']),
                          fmt(i, 'id'),
                          fmt(i, 'name', 'ascii'),
                          fmt(i, 'url', 'utf8'),
                          fmt(i, 'image_url', 'utf8'),
                          fmt(loc, 'address', 'utf8'),
                          fmt(loc, 'state', 'utf8'),
                          fmt(loc, 'city', 'utf8'),
                          fmt(loc, 'city_id', 'strip'),
                          fmt(center, 'latitude'),
                          fmt(center, 'longitude'),
                          fmt(loc, 'zip_code'),
                          fmt(loc, 'country', 'utf8'),
                          fmt(i, 'is_closed'),
                          fmt(i, 'rating'),
                          fmt(i, 'rating_text'),
                          fmt(i, 'rating_color'),
                          fmt(i, 'review_count'),
                          str(spark_conf.retrieved_time.replace(':', '-'))]
                # Other payload fields considered but not carried through:
                # total, price, distance, phone, display_phone, coordinates.latitude/longitude.
                yr.append('|'.join(fields).encode('utf-8'))
        if len(yr) > 0:
            sc.parallelize(yr).filter(lambda y: y != '').map(lambda x: x.split('|')).map(
                lambda x: '|'.join(x[:19])).saveAsTextFile(
                    spark_conf.hdfs_path['yelp_restaurnt_data'] + '%s.txt'
                    % (spark_conf.utc_time()[1]))
            # Column names deliberately mirror the Zomato restaurant schema so the two
            # sources line up downstream (e.g. res_id, zip_code).
            yr_r = sc.parallelize(yr).map(lambda x: x.split('|')).map(
                lambda y: Row(api_key=y[0], res_id=y[1], res_name=y[2], res_url=y[3],
                              utm_source=y[4], res_location=y[5], res_locality=y[6],
                              res_city=y[7], res_cityid=y[8], res_latitude=y[9],
                              res_longitude=y[10], zip_code=y[11], country_id=y[12],
                              locality_verbose=y[13], agg_rating=y[14], res_rating_text=y[15],
                              rating_color=y[16], votes=y[17], retrieved_time=y[18]))
            df_res = sqlContext.createDataFrame(yr_r)
            for i in df_res.select('res_id', 'zip_code').collect():
                ylp = ylp + i['res_id'] + ',' + i['zip_code'] + '|'
        return ylp.encode('utf-8')
    except Exception as e:
        logging.info('error in yelp restaurant function %s' % str(e))
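
# --- illustrative sketch, not part of the original module --------------------
# yelp_restaurant returns a '<res_id>,<zip_code>|...' string that yelp_looping
# later splits on '|' and ','; the same index built with a join
# (restaurant_index is a hypothetical name).
def restaurant_index(df_res):
    """Build the pipe-separated 'res_id,zip_code' index from the restaurant frame."""
    rows = df_res.select('res_id', 'zip_code').collect()
    return ''.join('%s,%s|' % (r['res_id'], r['zip_code']) for r in rows)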