Example #1
	def file_download(self,sc,sqlContext,Row,cl,sz,links,local_path,hdfs_path,querydata_path):
		try:
			logging.info('entered into file_download function')
			import wget, os
			l = str()
			for i in links:
				if spark_conf.values_311data['file_download'] == '1':
					print '++++++++++++entered++++++++++++'
					file_name = wget.download('http://hfdapp.houstontx.gov/311/'+i)
					# upload and remove the file wget actually wrote locally, not the remote path suffix
					call(['hdfs','dfs','-put','-f',file_name,hdfs_path])
					os.remove(file_name)
				return_value,return_date=self.log_checking(sc,sqlContext,Row,sz)
				if return_value in (0,1):
					df_anc = self.dataframe_creation(sc,sqlContext,Row,cl,links,local_path,hdfs_path,querydata_path,return_date)
					cnt = df_anc.count()  # cache the count: Spark recomputes it on every call
					print cnt
					if cnt > 0:
						print 'Not Empty'
						if 1 == cl.nlc(df_anc):
							if 1 == cl.confident_classifier(sqlContext,df_anc)[1]:
								print 'finished'
								# Move command here

					l = l + 'Nan' + '|' + 'Nan' + '|' + 'Nan' + '|' + 'Nan' + '|' + '311_data' + '|' + str(spark_conf.utctime.replace(':','-')) + '|' + 'Nan' + '|' + 'Nan' + '|' + '3'
					#l = l + str(00) + '|' + str(00) + '|' + 'Nan' + '|' + 'Nan' + '|' + '311_data' + '|' + str('2017-12-1 19-32-59') + '|' + 'Nan' + '|' + 'Nan' + '|' + 'Nan' 
					print (l)
					sc.parallelize([l]).saveAsTextFile(spark_conf.hdfs_path['zip_table']+'%s' %(spark_conf.utc_time()[1]))
			
		except Exception as e:
			logging.info('error in file_download func: %s' %str(e))				
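A hedged aside on Example #1: call returns the subprocess exit code, and the loop above deletes the local file without checking it, so a failed -put silently loses the download. A minimal sketch of a safer upload (the helper name put_then_remove is hypothetical; it assumes the same subprocess.call and hdfs CLI used above):

    from subprocess import call
    import os

    def put_then_remove(local_file, hdfs_path):
        # only delete the local copy once the HDFS upload reports success (exit code 0)
        rc = call(['hdfs', 'dfs', '-put', '-f', local_file, hdfs_path])
        if rc == 0:
            os.remove(local_file)
        return rc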
Example #2
    def confident_classifier(self, sqlContext, cc_input):
        # try:
        logging.info('Adding Confidence and class to the file')
        df3 = pd.read_json(spark_conf.file_path['otherfiles_path'] +
                           'review_{}.json'.format('classifier'))
        #print df3
        df4 = df3.copy(deep=True)  # deep expects a boolean, not the string 'True'

        #df5 = pd.read_csv(spark_conf.classifier_input['input_file']+'reviews_%s.txt'% spark_conf.retrieved_time.replace(':','-'),sep='|',encoding="ISO-8859-1")
        df5 = cc_input.toPandas()
        df6 = df5.copy(deep=True)
        #print df6
        if (len(df6.review_text) > 0):
            class_name_list = []
            confidence_list = []
            for rows in df4.iterrows():
                class_name_list.append(rows[1]['classes'][0]['class_name'])
                confidence_list.append(rows[1]['classes'][0]['confidence'])
            df6['class_name'] = class_name_list
            df6["confidence"] = confidence_list
            print df6
            print df6.head()  # head is a method; without parens this printed the bound method object
            if spark_conf.program_execution['311_data'] == '1':
                df_311 = sqlContext.createDataFrame(df6)
                df_311.rdd.map(lambda x: list(x)).map(
                    lambda y: filter_data(y)).saveAsTextFile(
                        spark_conf.hdfs_path['classifier_output'] +
                        '311_data_%s.txt' % spark_conf.utc_time()[1])

            #df6.to_csv(spark_conf.classifier_input['output_file']+'Classified_Output_test.txt',sep='|',index=False,encoding="ISO-8859-1")
            return df6, 1

        else:
            print(" No Classification Available")
            return None, 0  # keep the (frame, flag) shape so callers can safely index [1]
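One assumption worth making explicit in Example #2: the classifier JSON (df4) must contain exactly one record per row of the review frame (df6), in the same order, or the column assignments above either mis-align silently or raise a length-mismatch error. A hedged guard one could add right after the iterrows() loop (the error wording is hypothetical):

    if len(class_name_list) != len(df6):
        raise ValueError('classifier rows (%d) do not match review rows (%d)'
                         % (len(class_name_list), len(df6)))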
Example #3
    def linear_regression(self, lr_input, sqlContext, cc_output):

        # try:
        logging.info('Performing Regression')
        dfi = pd.read_csv(spark_conf.file_path['otherfiles_path'] +
                          'Encoded_Classified_1.txt',
                          sep='|',
                          encoding='ISO-8859-1')
        #dfi_test = pd.read_csv(spark_conf.classifier_input['output_file']+'Encoded_Classified_test.txt', sep  = '|', encoding = 'ISO-8859-1')
        dfi_test_new = cc_output
        dfi_test = lr_input
        input_list = list(dfi_test)
        #print input_list
        corr = dfi.corr()
        #sns.heatmap(corr,xticklabels=corr.columns,yticklabels=corr.columns)
        feature_cols = [
            'likes', 'comment_count', 'user_level_num', 'Average', 'Avoid!',
            'Blah!', 'Good Enough', 'Great!', 'Insane!', 'Not rated',
            'Very Bad', 'Well...', 'Big Foodie', 'Connoisseur', 'Foodie',
            'Super Foodie', 'Bad Ambience', 'Bad Food', 'Bad Service',
            'Good Ambience', 'Good Food', 'Good Service', 'Not Worthy',
            'binarized_user_foodie_level', 'binarized_rating_text',
            'binarized_class_name'
        ]

        feature_cols_1 = list(set(input_list).intersection(feature_cols))
        # print feature_cols_1

        X_train = dfi[:-1]
        # print len(X_train)
        X_test = dfi_test[0:]
        # print len(X_test)
        y_train = dfi.confidence[:-1]
        # print len(y_train)
        y_test = dfi_test.confidence[0:]
        #print len(y_test)

        X = X_train[feature_cols_1]
        y = y_train
        Xtest = X_test[feature_cols_1]

        regr = linear_model.Lasso(alpha=0.0000000001,
                                  fit_intercept=True,
                                  normalize=False,
                                  precompute=False,
                                  copy_X=True,
                                  max_iter=1000,
                                  tol=0.0001,
                                  warm_start=False,
                                  positive=False,
                                  random_state=None,
                                  selection='cyclic')
        regr.fit(X, y)

        shuffle = cross_validation.KFold(len(X),
                                         n_folds=10,
                                         shuffle=True,
                                         random_state=0)
        scores = cross_validation.cross_val_score(regr, X, y, cv=shuffle)
        #print("Accuracy: %.3f%% (%.3f%%)") % (scores.mean()*100.0, scores.std()*100.0)

        #print regr.intercept_
        #print (regr.coef_)

        #print mean_squared_error(regr.predict(Xtest), y_test)**0.5
        #print regr.predict(Xtest)
        #print regr.score(X,y)

        se = pd.Series(regr.predict(Xtest))
        dfi_test_new['score'] = se.values
        # dfi_test['xyz'] =  se.values
        print list(dfi_test_new)
        df_s = sqlContext.createDataFrame(dfi_test_new)
        #df_s.show()
        #print df_s.count()
        df_s.select(
            'comment_count', 'foodie_color', 'likes', 'profile_image',
            'profile_url', 'rating', 'rating_color', 'rating_text',
            'rating_time_friendly', 'restaurant_id', 'retrieved_time',
            'review_id', 'review_text', 'time_stamp', 'user_foodie_level',
            'user_level_num', 'user_name', 'user_zomatohandle', 'class_name',
            'confidence',
            'score').rdd.map(lambda x: list(x)).map(lambda y: filter_data(
                y)).saveAsTextFile(spark_conf.hdfs_path['classifier_output'] +
                                   '%s' % spark_conf.utc_time()[1])
        #  dfi_test.to_csv(spark_conf.classifier_input['output_file']+'final_Output.txt',sep='|',encoding="ISO-8859-1")
        return 1
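A portability note on Example #3: the sklearn.cross_validation module used here was deprecated in scikit-learn 0.18 and removed in 0.20, and the normalize argument has since been removed from Lasso as well. A minimal sketch of the same cross-validation on a current scikit-learn, assuming X and y as built above:

    from sklearn import linear_model
    from sklearn.model_selection import KFold, cross_val_score

    # KFold now takes only the number of splits; the data length is inferred at split time
    regr = linear_model.Lasso(alpha=1e-9, fit_intercept=True, max_iter=1000, tol=0.0001)
    shuffle = KFold(n_splits=10, shuffle=True, random_state=0)
    scores = cross_val_score(regr, X, y, cv=shuffle)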
Example #4
	def dataframe_creation(self,sc,sqlContext,Row,cl,links,local_path,hdfs_path,querydata_path,date_311):
		try:
			logging.info('entered into dataframe_creation function')
			print date_311
			rdd_data = sc.textFile(hdfs_path)
			header = rdd_data.first()
			rdd_new = rdd_data.filter(lambda line: line != header).map(lambda x: x.split('|')).map(
				lambda z: Row(case_number=z[0], sr_location=z[1], county=z[2], district=z[3],
					neighborhood=z[4], tax_id=z[5], trash_quad=z[6], recycle_quad=z[7],
					trash_day=z[8], heavytrash_day=z[9], recycle_day=z[10], key_map=z[11],
					management_district=z[12], department=z[13], division=z[14], sr_type=z[15],
					queue=z[16], sla=z[17], status=z[18], sr_createdate=z[19], due_date=z[20],
					date_closed=z[21], overdue=z[22], title=z[23], x=z[24], y=z[25],
					latitude=z[26], longitude=z[27], channel_type=z[28]))
			#print rdd_new.map(lambda d: date_conversion(d)).take(10)
			# registerTempTable returns None, so bind the DataFrame before registering it
			df_data = sqlContext.createDataFrame(rdd_new)
			df_data.registerTempTable('311data')
			df_new = sqlContext.sql(
				'select * from 311data where regexp_replace(lower(trim(sr_type))," ","") in '
				'("waterleak","poordrainage","airpollution","crisiscleanup","deadanimalcollection",'
				'"drainage","drainagesystemviolation","healthcode","heavytrashviolation","majorwaterleak",'
				'"missedgarbagepickup","missedheavytrashpickup","missedyardewastepickup","pestcontrol",'
				'"poolwaterqualitycontrol","sewermanhole","sewerwastewater","sidewalkrepair",'
				'"streethazard","waterplaygroundrepair","waterquality","waterservice",'
				'"waterorgroundpollution") and to_utc_timestamp(sr_createdate,"US/Central") > "%s"'
				% date_311).select(
					'case_number', 'sr_location', 'county', 'district', 'neighborhood', 'tax_id',
					'trash_quad', 'recycle_quad', 'trash_day', 'heavytrash_day', 'recycle_day',
					'key_map', 'management_district', 'department', 'division', 'sr_type',
					'queue', 'sla', 'status', 'sr_createdate', 'due_date', 'date_closed',
					'overdue', 'title', 'x', 'y', 'latitude', 'longitude', 'channel_type')
			df_new.show()
			df_new.rdd.map(lambda y: list(y)).map(lambda z: rdd_structure(z)).coalesce(1).saveAsTextFile(querydata_path+'311query_data_%s.txt' %(spark_conf.utc_time()[1]))
			df_anc = (df_new
				.withColumn('311_rating', lit('Nan'))
				.withColumn('311_color', lit('Nan'))
				.withColumn('311_rating_text', lit('Nan'))
				.withColumn('311_likes', lit('Nan'))
				.withColumn('311_comment_count', lit('Nan'))
				.withColumn('311_user_name', lit('Nan'))
				.withColumn('311_user_handle', lit('Nan'))
				.withColumn('311_user_foodie_level', lit('Nan'))
				.withColumn('311_user_level_num', lit('Nan'))
				.withColumn('311_foodie_color', lit('Nan'))
				.withColumn('311_profile_url', lit('Nan'))
				.withColumn('311_user_image', lit('Nan'))
				.withColumn('311_retrieved_time', lit(spark_conf.retrieved_time.replace(':','-')))
				.withColumnRenamed("sr_type", "review_text")
				.select('tax_id', '311_rating', 'case_number', 'review_text', '311_color',
					'date_closed', '311_rating_text', 'sr_createdate', '311_likes',
					'311_comment_count', '311_user_name', '311_user_handle',
					'311_user_foodie_level', '311_user_level_num', '311_foodie_color',
					'311_profile_url', '311_user_image', '311_retrieved_time'))
			return df_anc
		except Exception as e:
			logging.info('error in dataframe_creation func: %s' %str(e))
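The 29-keyword Row construction in Example #4 is easy to miscount by one. A hedged alternative sketch (columns and to_row are hypothetical names) that zips the split fields against a single column list, so a bad index fails loudly instead of silently shifting every later column:

    columns = ['case_number', 'sr_location', 'county', 'district', 'neighborhood',
               'tax_id', 'trash_quad', 'recycle_quad', 'trash_day', 'heavytrash_day',
               'recycle_day', 'key_map', 'management_district', 'department', 'division',
               'sr_type', 'queue', 'sla', 'status', 'sr_createdate', 'due_date',
               'date_closed', 'overdue', 'title', 'x', 'y', 'latitude', 'longitude',
               'channel_type']

    def to_row(fields):
        # refuse malformed records instead of building a shifted Row
        assert len(fields) == len(columns), 'expected %d fields, got %d' % (len(columns), len(fields))
        return Row(**dict(zip(columns, fields)))

    rdd_new = rdd_data.filter(lambda line: line != header) \
                      .map(lambda x: x.split('|')) \
                      .map(to_row)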
Example #5
    def reviews_data(self, sc, cl, sqlContext, zip_code, api_link, j, apikey,
                     comp_date, lt):
        #try:
        logging.info('entered into reviews data function')
        print api_link + 'res_id=' + str(j) + '&apikey=' + str(apikey)
        reviews = json.loads(
            requests.get(api_link + 'res_id=' + str(j) + '&apikey=' +
                         str(apikey)).text)
        rd = []
        if 'user_reviews' in reviews:
            if len(reviews['user_reviews']) > 0:
                for k in reviews['user_reviews']:
                    rev = str()
                    if 'review' in k:
                        #print type(datetime.utcfromtimestamp(k['review']['timestamp']));
                        #print type(comp_date);
                        print comp_date.replace(tzinfo=pytz.UTC)
                        print datetime.utcfromtimestamp(
                            k['review']['timestamp']).replace(
                                tzinfo=pytz.UTC).strftime('%Y-%m-%d %H:%M:%S')
                        if datetime.utcfromtimestamp(
                                k['review']['timestamp']).replace(
                                    tzinfo=pytz.UTC) > (comp_date).replace(
                                        tzinfo=pytz.UTC):
                            rev = rev + str(j) + '|'
                            if 'rating' in k['review']:
                                rev = rev + str(k['review']['rating']) + '|'
                            else:
                                rev = rev + 'unknown' + '|'
                            if 'id' in k['review']:
                                rev = rev + str(k['review']['id']) + '|'
                            else:
                                rev = rev + 'unknown' + '|'
                            if 'review_text' in k['review']:
                                rev = rev + (
                                    k['review']['review_text']).replace(
                                        '\r', '').replace(
                                            '\n',
                                            '').encode('utf-8').strip() + '|'
                            else:
                                rev = rev + 'unknown' + '|'
                            if 'rating_color' in k['review']:
                                rev = rev + (k['review']['rating_color']
                                             ).encode('utf-8').strip() + '|'
                            else:
                                rev = rev + 'unknown' + '|'
                            if 'rating_time_friendly' in k['review']:
                                rev = rev + (
                                    k['review']['rating_time_friendly']) + '|'
                            else:
                                rev = rev + 'unknown' + '|'
                            if 'rating_text' in k['review']:
                                rev = rev + (
                                    k['review']['rating_text']).replace(
                                        '\r', '').replace(
                                            '\n',
                                            '').encode('utf-8').strip() + '|'
                            else:
                                rev = rev + 'unknown' + '|'
                            if 'timestamp' in k['review']:
                                rev = rev + (datetime.utcfromtimestamp(
                                    k['review']['timestamp']).replace(
                                        tzinfo=pytz.UTC).strftime(
                                            '%Y-%m-%d %H:%M:%S')) + '|'
                            else:
                                rev = rev + 'unknown' + '|'
                            if 'likes' in k['review']:
                                rev = rev + str(k['review']['likes']) + '|'
                            else:
                                rev = rev + 'unknown' + '|'
                            if 'comments_count' in k['review']:
                                rev = rev + str(
                                    k['review']['comments_count']) + '|'
                            else:
                                rev = rev + 'unknown' + '|'
                            if 'user' in k['review']:
                                if 'name' in k['review']['user']:
                                    rev = rev + (
                                        k['review']['user']['name']
                                    ).encode('utf-8').strip() + '|'
                                else:
                                    rev = rev + 'unknown' + '|'
                                if 'zomato_handle' in k['review']['user']:
                                    rev = rev + (
                                        k['review']['user']['zomato_handle']
                                    ).encode('utf-8').strip() + '|'
                                else:
                                    rev = rev + 'unknown' + '|'
                                if 'foodie_level' in k['review']['user']:
                                    rev = rev + (
                                        k['review']['user']['foodie_level']
                                    ).encode('utf-8').strip() + '|'
                                else:
                                    rev = rev + 'unknown' + '|'
                                if 'foodie_level_num' in k['review']['user']:
                                    rev = rev + str(
                                        k['review']['user']
                                        ['foodie_level_num']).strip() + '|'
                                else:
                                    rev = rev + 'unknown' + '|'
                                if 'foodie_color' in k['review']['user']:
                                    rev = rev + (
                                        k['review']['user']['foodie_color']
                                    ).encode('utf-8').strip() + '|'
                                else:
                                    rev = rev + 'unknown' + '|'
                                if 'profile_url' in k['review']['user']:
                                    rev = rev + (
                                        k['review']['user']['profile_url']
                                    ).encode('utf-8').strip() + '|'
                                else:
                                    rev = rev + 'unknown' + '|'
                                if 'profile_image' in k['review']['user']:
                                    rev = rev + (
                                        k['review']['user']['profile_image']
                                    ).encode('utf-8').strip() + '|'
                                else:
                                    rev = rev + 'unknown' + '|'
                            rev = rev + str(
                                spark_conf.retrieved_time.replace(':', '-'))
                            #f = open(spark_conf.file_path['reviewdata_path']+'reviews_%s.txt'% spark_conf.retrieved_time.replace(':','-'),'a')
                            #f.write('%s\n' %rev);f.close()
                            rd.append(rev)

        print rd
        print len(rd)
        lt = lt + str(spark_conf.utctime.replace(
            ':', '-')) + '|' + str(j) + '|' + str(len(rd)) + '|' + '1'
        print lt
        sc.parallelize([
            lt.strip()
        ]).saveAsTextFile(spark_conf.hdfs_path['zip_table'] + '%s_%s_%s.txt' %
                          (zip_code, j, spark_conf.utc_time()[1]))

        if len(rd) > 0:

            if not sc.parallelize(rd).isEmpty():
                # saveAsTextFile returns None, so there is nothing useful to bind
                sc.parallelize(rd).filter(lambda y: y != '').map(
                    lambda x: x.split('|')).map(
                        lambda x: str(x[0]) + '|' + str(x[1]) + '|' + str(x[2])
                        + '|' + str(x[3]) + '|' + str(x[4]) + '|' + str(x[5]) +
                        '|' + str(x[6]) + '|' + str(x[7]) + '|' + str(x[8]) +
                        '|' + str(x[9]) + '|' + str(x[10]) + '|' + str(x[11]) +
                        '|' + str(x[12]) + '|' + str(x[13]) + '|' + str(x[
                            14]) + '|' + str(x[15]) + '|' + str(x[
                                16]) + '|' + str(x[17])).saveAsTextFile(
                                    spark_conf.hdfs_path['temp_review_data'] +
                                    '%s_%s_%s.txt' %
                                    (zip_code, j, spark_conf.utc_time()[1]))
                logging.info('completed saving rdd datafiles in hdfs')

            else:
                return 'None'
        else:
            return 'None'
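Examples #5, #6, #8 and #9 all repeat the same two-branch pattern, "if key in dict use it, else 'unknown'", dozens of times. A hedged sketch of a helper (the name field is hypothetical) that collapses each pair; text fields would still need the newline-stripping and utf-8 encoding the originals apply:

    def field(d, key, default='unknown'):
        # dict.get replaces the if/else pair for every optional key
        return str(d.get(key, default))

    # e.g. the rating and id branches above become
    # rev = rev + field(k['review'], 'rating') + '|' + field(k['review'], 'id') + '|'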
Example #6
    def zomato_location(self, sc, sqlContext, zipcode, api_link, lat_value,
                        long_value, radius, category_by, sort_by, order_by,
                        apikey):
        #try:
        logging.info('entered into zomato location')
        lc = str()
        p = []
        print api_link + 'lat=' + str(lat_value) + '&lon=' + str(
            long_value) + '&radius=' + str(radius) + '&category=' + str(
                category_by) + '&sort=' + str(sort_by) + '&order=' + str(
                    order_by) + '&apikey=' + apikey
        search = json.loads(
            requests.get(api_link + 'lat=' + str(lat_value) + '&lon=' +
                         str(long_value) + '&radius=' + str(radius) +
                         '&category=' + str(category_by) + '&sort=' +
                         str(sort_by) + '&order=' + str(order_by) +
                         '&apikey=' + apikey).text)
        #print search
        if 'restaurants' in search:
            if len(search['restaurants']) > 0:

                for i in search['restaurants']:
                    ser = str()
                    if 'restaurant' in i:
                        if 'apikey' in i['restaurant']:
                            ser = ser + str(i['restaurant']['apikey']) + '|'
                        else:
                            ser = ser + 'unknown' + '|'
                        if 'id' in i['restaurant']:
                            ser = ser + str(i['restaurant']['id']) + '|'
                        else:
                            ser = ser + 'unknown' + '|'
                        if 'name' in i['restaurant']:
                            ser = ser + str(i['restaurant']['name'].encode(
                                'ascii', 'ignore').decode('ascii')) + '|'
                        else:
                            ser = ser + 'unknown' + '|'
                        if 'url' in i['restaurant']:
                            ser = ser + str(i['restaurant']['url'].encode(
                                'ascii', 'ignore').decode('ascii')) + '|'
                        else:
                            ser = ser + 'unknown' + '|'
                        if 'utm_source' in i['restaurant']:
                            ser = ser + str(
                                i['restaurant']['utm_source']) + '|'
                        else:
                            ser = ser + 'unknown' + '|'
                    if 'location' in i.get('restaurant', {}):  # avoid KeyError when 'restaurant' is missing
                        if 'address' in i['restaurant']['location']:
                            ser = ser + i['restaurant']['location'][
                                'address'] + '|'
                        else:
                            ser = ser + 'unknown' + '|'
                        if 'locality' in i['restaurant']['location']:
                            ser = ser + i['restaurant']['location'][
                                'locality'] + '|'
                        else:
                            ser = ser + 'unknown' + '|'
                        if 'city' in i['restaurant']['location']:
                            ser = ser + i['restaurant']['location'][
                                'city'] + '|'
                        else:
                            ser = ser + 'unknown' + '|'
                        if 'city_id' in i['restaurant']['location']:
                            ser = ser + str(
                                i['restaurant']['location']['city_id']) + '|'
                        else:
                            ser = ser + 'unknown' + '|'
                        if 'latitude' in i['restaurant']['location']:
                            ser = ser + i['restaurant']['location'][
                                'latitude'] + '|'
                        else:
                            ser = ser + 'unknown' + '|'
                        if 'longitude' in i['restaurant']['location']:
                            ser = ser + i['restaurant']['location'][
                                'longitude'] + '|'
                        else:
                            ser = ser + 'unknown' + '|'
                        if 'zipcode' in i['restaurant']['location']:
                            ser = ser + i['restaurant']['location'][
                                'zipcode'] + '|'
                        else:
                            ser = ser + 'unknown' + '|'
                        if 'country_id' in i['restaurant']['location']:
                            ser = ser + str(i['restaurant']['location']
                                            ['country_id']) + '|'
                        else:
                            ser = ser + 'unknown' + '|'
                        if 'locality_verbose' in i['restaurant']['location']:
                            ser = ser + i['restaurant']['location'][
                                'locality_verbose'] + '|'
                        else:
                            ser = ser + 'unknown' + '|'
                    if 'user_rating' in i.get('restaurant', {}):
                        if 'aggregate_rating' in i['restaurant'][
                                'user_rating']:
                            ser = ser + str(i['restaurant']['user_rating']
                                            ['aggregate_rating']) + '|'
                        else:
                            ser = ser + 'unknown' + '|'
                        if 'rating_text' in i['restaurant']['user_rating']:
                            ser = ser + str(
                                i['restaurant']['user_rating']['rating_text'].
                                replace('\r', '').replace('\n', '')) + '|'
                        else:
                            ser = ser + 'unknown' + '|'
                        if 'rating_color' in i['restaurant']['user_rating']:
                            ser = ser + str(i['restaurant']['user_rating']
                                            ['rating_color']) + '|'
                        else:
                            ser = ser + 'unknown' + '|'
                        if 'votes' in i['restaurant']['user_rating']:
                            ser = ser + str(
                                i['restaurant']['user_rating']['votes']) + '|'
                        else:
                            ser = ser + 'unknown' + '|'
                    ser = ser + str(spark_conf.retrieved_time.replace(
                        ':', '-'))
                    p.append(ser.encode('utf-8', 'replace'))
                    #s  = open(spark_conf.file_path['searchdata_path']+'search_%s.txt'% spark_conf.retrieved_time.replace(':','-'),'a');s.write('%s' %ser.encode('utf-8','replace'));s.close()

            if len(p) > 0:
                # saveAsTextFile returns None, so there is nothing useful to bind
                sc.parallelize(p).filter(lambda y: y != '').map(
                    lambda x: x.split('|')).map(
                        lambda x: str(x[0]) + '|' + str(x[1]) + '|' + str(x[2])
                        + '|' + str(x[3]) + '|' + str(x[4]) + '|' + str(x[5]) +
                        '|' + str(x[6]) + '|' + str(x[7]) + '|' + str(x[8]) +
                        '|' + str(x[9]) + '|' + str(x[10]) + '|' + str(x[11]) +
                        '|' + str(x[12]) + '|' + str(x[13]) + '|' + str(x[
                            14]) + '|' + str(x[15]) + '|' + str(x[16]) + '|'
                        + str(x[17]) + '|' + str(x[18])).saveAsTextFile(
                            spark_conf.hdfs_path['restaurant_data'] +
                            '%s_%s.txt' % (zipcode, spark_conf.utc_time()[1]))
                rdd_rest = sc.parallelize(p).map(lambda x: x.split('|')).map(
                    lambda y: Row(apikey=y[0],
                                  rest_id=int(y[1]),
                                  rest_name=y[2],
                                  rest_url=y[3],
                                  rest_utmsource=y[4],
                                  rest_addr=y[5],
                                  rest_loclty=y[6],
                                  rest_city=y[7],
                                  res_cityid=y[8],
                                  rest_lat=y[9],
                                  rest_long=y[10],
                                  rest_zpcode=(y[11]),
                                  rest_countryid=y[12],
                                  loc_verbose=y[13],
                                  agg_rat=y[14],
                                  rating_text=y[15],
                                  rating_color=y[16],
                                  votes=y[17],
                                  rt=y[18]))
                df_rest = sqlContext.createDataFrame(rdd_rest)

                for i in df_rest.select('rest_id', 'rest_zpcode', 'rest_lat',
                                        'rest_long').collect():
                    lc = lc + str(zipcode) + ',' + str(
                        i['rest_id']) + ',' + str(
                            i['rest_zpcode']) + ',' + str(
                                i['rest_lat']) + ',' + str(
                                    i['rest_long']) + '|'

            # print lc
            return lc
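The collect()-and-concatenate tail of Example #6 reads more directly as a join over formatted records. A hedged, behavior-equivalent sketch (keeping the trailing '|' the caller appears to rely on):

    rows = df_rest.select('rest_id', 'rest_zpcode', 'rest_lat', 'rest_long').collect()
    lc = ''.join('%s,%s,%s,%s,%s|' % (zipcode, r['rest_id'], r['rest_zpcode'],
                                      r['rest_lat'], r['rest_long'])
                 for r in rows)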
Example #7
    def yelp_looping(self, sc, sqlContext, jsondata, cl, zip_code, lat_v,
                     long_v):
        try:
            logging.info('entered into yelp looping')
            url_params = {}
            r_id = self.yelp_restaurant(sc, sqlContext, jsondata)
            print r_id
            res = call([
                'hdfs', 'dfs', '-test', '-f',
                spark_conf.hdfs_path['zip_table'] + '*/p*'
            ])
            if res == 0:
                zp_log = sc.textFile(
                    spark_conf.hdfs_path['zip_table'] +
                    '*').map(lambda x: x.split('|')).map(lambda l: filter(
                        None, l)).map(lambda z: Row(quered_zipcode=(z[0]),
                                                    original_zipcode=(z[1]),
                                                    lat_value=z[2],
                                                    long_value=z[3],
                                                    sm_name=z[4],
                                                    retrieving_time=z[5],
                                                    restaurant_id=z[6],
                                                    count_val=z[7],
                                                    check_value=int(z[8])))
                sqlContext.createDataFrame(zp_log).registerTempTable(
                    'temp_yelp_log')
                f = sqlContext.sql(
                    'select count(*) as count from temp_yelp_log').collect()
                if f[0]['count'] > 0:
                    q_str = "select count(*) as count from temp_yelp_log where trim(lat_value) = trim('%s') and trim(long_value) = trim('%s') and check_value = 2" % (
                        lat_v, long_v)
                    print q_str
                    c = sqlContext.sql(q_str).collect()
                    print c[0]['count']
                    if c[0]['count'] > 0:
                        dt = sqlContext.sql(
                            "select max(cast(from_unixtime(unix_timestamp(retrieving_time,'yyyy-MM-dd HH:mm:ss')) as timestamp)) as max_date from temp_yelp_log where trim(lat_value) = trim('%s') and trim(long_value) = trim('%s') and check_value = 2"
                            % (lat_v, long_v)).collect()
                        print dt
                        previous_date = dt[0]['max_date']
                        print previous_date
                        for i in r_id.split('|'):
                            print i
                            print i.decode('ascii', 'ignore').encode('ascii')
                            print '1st loop'
                            if i != '':
                                review_path = spark_conf.yelp[
                                    'REVIEW_PATH'].replace(
                                        'id',
                                        i.split(',')[0].decode(
                                            'ascii', 'ignore').encode('ascii'))
                                print review_path
                                yl = str()
                                yl = yl + str(zip_code) + '|' + i.split(
                                    ',')[1] + '|' + str(lat_v) + '|' + str(
                                        long_v
                                    ) + '|' + spark_conf.yelp['API_HOST'] + '|'
                                print yl
                                review_data = self.request(
                                    spark_conf.yelp['API_HOST'],
                                    review_path,
                                    spark_conf.yelp['access_token'],
                                    url_params=url_params)
                                self.yelp_reviews(sc, sqlContext, review_data,
                                                  i.split(',')[0], zip_code,
                                                  yl, previous_date)
                    else:

                        if len(r_id) > 0:
                            for i in r_id.split('|'):
                                print i
                                print i.decode('ascii',
                                               'ignore').encode('ascii')
                                print '2nd loop'
                                if i != '':
                                    review_path = spark_conf.yelp[
                                        'REVIEW_PATH'].replace(
                                            'id',
                                            i.split(',')[0].decode(
                                                'ascii',
                                                'ignore').encode('ascii'))
                                    print review_path
                                    yl = str()
                                    yl = yl + str(zip_code) + '|' + i.split(
                                        ',')[1] + '|' + str(lat_v) + '|' + str(
                                            long_v) + '|' + spark_conf.yelp[
                                                'API_HOST'] + '|'
                                    review_data = self.request(
                                        spark_conf.yelp['API_HOST'],
                                        review_path,
                                        spark_conf.yelp['access_token'],
                                        url_params=url_params)
                                    self.yelp_reviews(sc, sqlContext,
                                                      review_data,
                                                      i.split(',')[0],
                                                      zip_code, yl,
                                                      self.input_date())

            else:
                if len(r_id) > 0:
                    for i in r_id.split('|'):
                        print i
                        print i.decode('ascii', 'ignore').encode('ascii')
                        print '3rd loop'
                        if i != '':
                            review_path = spark_conf.yelp[
                                'REVIEW_PATH'].replace(
                                    'id',
                                    i.split(',')[0].decode(
                                        'ascii', 'ignore').encode('ascii'))
                            print review_path
                            yl = str()
                            yl = yl + str(zip_code) + '|' + i.split(
                                ',')[1] + '|' + str(lat_v) + '|' + str(
                                    long_v
                                ) + '|' + spark_conf.yelp['API_HOST'] + '|'
                            review_data = self.request(
                                spark_conf.yelp['API_HOST'],
                                review_path,
                                spark_conf.yelp['access_token'],
                                url_params=url_params)
                            self.yelp_reviews(sc, sqlContext, review_data,
                                              i.split(',')[0], zip_code, yl,
                                              self.input_date())

            d = call([
                'hdfs', 'dfs', '-test', '-f',
                spark_conf.hdfs_path['yelp_temp_review_data'] + '*/p*'
            ])
            if d == 0:
                r_path = spark_conf.hdfs_path['yelp_temp_review_data'] + '*'
                print r_path
                rdd_data = sc.textFile(r_path, use_unicode=False)
                cnt = rdd_data.count()  # cache: each count() is a full pass over the RDD
                if cnt > 0:
                    print cnt
                    rdd_r = rdd_data.filter(lambda y: y != '').map(
                        lambda x: x.split('|')).map(
                            lambda z: Row(restaurant_id=z[0],
                                          time_created=z[1],
                                          review_id=z[2],
                                          rating=z[3],
                                          url=z[4],
                                          review_text=z[5],
                                          user_name=z[6],
                                          user_image=z[7],
                                          retrieved_time=z[8]))
                    df_old = sqlContext.createDataFrame(rdd_r)
                    df_new = (df_old.select('*')
                        .withColumn('rating_color', lit('None').cast(StringType()))
                        .withColumn('rating_time_friendly', lit('None').cast(StringType()))
                        .withColumn('rating_text', lit('None').cast(StringType()))
                        .withColumn('likes', lit('None').cast(StringType()))
                        .withColumn('comment_count', lit('None').cast(StringType()))
                        .withColumn('user_zomatohandle', lit('None').cast(StringType()))
                        .withColumn('user_foodie_level', lit('None').cast(StringType()))
                        .withColumn('user_level_num', lit('None').cast(StringType()))
                        .withColumn('foodie_color', lit('None').cast(StringType()))
                        .withColumn('profile_url', lit('None').cast(StringType())))
                    data_r = df_new.select(
                        'restaurant_id', 'rating', 'review_id', 'review_text',
                        'rating_color', 'rating_time_friendly', 'rating_text',
                        'time_created', 'likes', 'comment_count', 'user_name',
                        'user_zomatohandle', 'user_foodie_level',
                        'user_level_num', 'foodie_color', 'profile_url',
                        'user_image', 'retrieved_time')
                    data_r.show()
                    if 1 == cl.nlc(data_r):
                        # run the classifier once and unpack; the original invoked it twice
                        df_cc, flag = cl.confident_classifier(sqlContext, data_r)
                        if 1 == flag:
                            sqlContext.createDataFrame(df_cc).withColumn(
                                'score',
                                lit('None').cast(StringType())).select(
                                    'comment_count', 'foodie_color', 'likes',
                                    'user_image', 'profile_url', 'rating',
                                    'rating_color', 'rating_text',
                                    'rating_time_friendly', 'restaurant_id',
                                    'retrieved_time', 'review_id',
                                    'review_text', 'time_created',
                                    'user_foodie_level', 'user_level_num',
                                    'user_name', 'user_zomatohandle',
                                    'class_name', 'confidence', 'score'
                                ).rdd.map(lambda x: list(x)).map(
                                    lambda y: filter_data(y)
                                ).saveAsTextFile(
                                    spark_conf.hdfs_path['classifier_output'] +
                                    '%s.txt' % spark_conf.utc_time()[1])
                            call([
                                'hdfs', 'dfs', '-mv',
                                spark_conf.hdfs_path['yelp_temp_review_data'] +
                                '*',
                                spark_conf.hdfs_path['yelp_final_review_data']
                            ])

        except Exception as e:
            logging.info('error in yelp loop function  %s' % str(e))
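The three branches of Example #7 run the same review-fetch loop and differ only in the comparison date passed to yelp_reviews. A hedged consolidation sketch (fetch_reviews is a hypothetical inner function, defined inside yelp_looping so it can close over sc, sqlContext, zip_code, lat_v, long_v and url_params):

    def fetch_reviews(r_id_str, comp_date):
        for i in r_id_str.split('|'):
            if i == '':
                continue
            rest_id = i.split(',')[0].decode('ascii', 'ignore').encode('ascii')
            review_path = spark_conf.yelp['REVIEW_PATH'].replace('id', rest_id)
            yl = '%s|%s|%s|%s|%s|' % (zip_code, i.split(',')[1], lat_v, long_v,
                                      spark_conf.yelp['API_HOST'])
            review_data = self.request(spark_conf.yelp['API_HOST'], review_path,
                                       spark_conf.yelp['access_token'],
                                       url_params=url_params)
            self.yelp_reviews(sc, sqlContext, review_data, i.split(',')[0],
                              zip_code, yl, comp_date)

    # the three loops then reduce to:
    #   fetch_reviews(r_id, previous_date)      (fresh log entries exist)
    #   fetch_reviews(r_id, self.input_date())  (first run for this location)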
Example #8
    def yelp_reviews(self, sc, sqlContext, review_data, r_id, zip_code, yl,
                     c_date):
        try:
            logging.info('entered into yelp reviews')
            yr = []
            if 'reviews' in review_data:
                if len(review_data['reviews']) > 0:
                    for i in review_data['reviews']:
                        print 'entered reviews loop'
                        rd = str()
                        # convert once and reuse; the original recomputed this timestamp four times
                        created = self.convert_datetime_timezone(
                            i['time_created'],
                            spark_conf.yelp['timezone_initial'],
                            spark_conf.yelp['timezone_final'])
                        print created
                        print c_date.replace(
                            tzinfo=pytz.UTC).strftime('%Y-%m-%d %H:%M:%S')
                        if created > c_date.replace(tzinfo=pytz.UTC):
                            rd = rd + r_id + '|'
                            # the original guarded 'time_created' here, but the key was
                            # already read unconditionally above, so the guard was moot
                            rd = rd + created.strftime('%Y-%m-%d %H:%M:%S') + '|'
                            rd = rd + str(uuid.uuid4()) + '_' + created.strftime(
                                '%Y-%m-%d %H:%M:%S') + '|'
                            if 'rating' in i: rd = rd + str(i['rating']) + '|'
                            else: rd = rd + 'unknown' + '|'
                            if 'url' in i:
                                rd = rd + i['url'].encode('utf-8') + '|'
                            else:
                                rd = rd + 'unknown' + '|'
                            if 'text' in i:
                                rd = rd + i['text'].replace('\r', '').replace(
                                    '\n', '').encode('utf-8').strip() + '|'
                            else:
                                rd = rd + 'unknown' + '|'
                            if 'user' in i:
                                if 'name' in i['user']:
                                    rd = rd + i['user']['name'].encode(
                                        'utf-8') + '|'
                                else:
                                    rd = rd + 'unknown' + '|'
                                if 'image_url' in i['user']:
                                    rd = rd + str(i['user']['image_url']) + '|'
                                else:
                                    rd = rd + 'unknown' + '|'
                            rd = rd + str(spark_conf.utctime)
                            yr.append(rd)
                        #f = open('/bdaas/log/res_rev.txt','a');f.write("%s" %rd.encode('utf-8'));f.close()

                print len(yr)
                yl = yl + str(spark_conf.utctime) + '|' + r_id + '|' + str(
                    len(yr)) + '|' + '2'
                print yl
                sc.parallelize([
                    yl.strip()
                ]).saveAsTextFile(spark_conf.hdfs_path['zip_table'] +
                                  '%s_%s_%s.txt' %
                                  (zip_code, r_id, spark_conf.utc_time()[1]))

                if len(yr) > 0:
                    if not sc.parallelize(yr).isEmpty():
                        # saveAsTextFile returns None, so there is nothing useful to bind
                        sc.parallelize(yr).filter(
                            lambda y: y != '').map(lambda x: x.split('|')).map(
                                lambda x: str(x[0]) + '|' + str(x[1]) + '|' +
                                str(x[2]) + '|' + str(x[3]) + '|' + str(x[
                                    4]) + '|' + str(x[5]) + '|' + str(x[
                                        6]) + '|' + str(x[7]) + '|' + str(x[8])
                            ).saveAsTextFile(
                                spark_conf.hdfs_path['yelp_temp_review_data'] +
                                '%s_%s.txt' %
                                (zip_code, spark_conf.utc_time()[1]))

        except Exception as e:
            logging.info('error in yelp reviews function  %s' % str(e))
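A data-integrity caveat on Example #8 (and the Zomato variant in Example #5): the records are '|'-delimited, and while the review text is stripped of carriage returns and newlines, a literal '|' inside the text would silently shift every later field when the record is split again downstream. A hedged sketch of a stricter cleaner (clean_text is hypothetical; replacing the delimiter with '/' is an arbitrary choice):

    def clean_text(s):
        # strip newlines as before, and also neutralize the field delimiter
        return s.replace('\r', '').replace('\n', '').replace('|', '/').encode('utf-8').strip()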
Example #9
    def yelp_restaurant(self, sc, sqlContext, jsondata):
        try:
            logging.info('entered into yelp restaurant')
            yr = []
            ylp = str()
            if 'businesses' in jsondata:
                if len(jsondata['businesses']) > 0:
                    if 'businesses' in jsondata:  # redundant: already checked above
                        print 'entered into business'
                        for i in jsondata['businesses']:
                            print 'entered loop'
                            yp = str()
                            yp = yp + str(
                                spark_conf.yelp['access_token']) + '|'
                            if 'id' in i: yp = yp + (i['id']) + '|'
                            else: yp = yp + 'unknown' + '|'
                            if 'name' in i:
                                yp = yp + (i['name']).encode(
                                    'ascii', 'ignore').strip() + '|'
                            else:
                                yp = yp + 'unknown' + '|'
                            if 'url' in i:
                                yp = yp + (
                                    i['url']).encode('utf-8').strip() + '|'
                            else:
                                yp = yp + 'unknown' + '|'
                            if 'image_url' in i:
                                yp = yp + (i['image_url']
                                           ).encode('utf-8').strip() + '|'
                            else:
                                yp = yp + 'unknown' + '|'
                            if 'location' in i:
                                print 'location'
                                if 'address' in i['location']:
                                    yp = yp + (i['location']['address']
                                               ).encode('utf-8').strip() + '|'
                                else:
                                    yp = yp + 'unknown' + '|'
                                if 'state' in i['location']:
                                    yp = yp + (i['location']['state']
                                               ).encode('utf-8').strip() + '|'
                                else:
                                    yp = yp + 'unknown' + '|'
                                if 'city' in i['location']:
                                    yp = yp + (i['location']['city']
                                               ).encode('utf-8').strip() + '|'
                                else:
                                    yp = yp + 'unknown' + '|'
                                if 'city_id' in i['location']:
                                    yp = yp + (
                                        i['location']['city_id']).strip() + '|'
                                else:
                                    yp = yp + 'unknown' + '|'
                            if 'region' in jsondata:
                                if 'latitude' in jsondata['region']['center']:
                                    yp = yp + str(jsondata['region']['center']
                                                  ['latitude']) + '|'
                                else:
                                    yp = yp + 'unknown' + '|'
                                if 'longitude' in jsondata['region']['center']:
                                    yp = yp + str(jsondata['region']['center']
                                                  ['longitude']) + '|'
                                else:
                                    yp = yp + 'unknown' + '|'
                            if 'location' in i:
                                if 'zip_code' in i['location']:
                                    yp = yp + str(
                                        i['location']['zip_code']) + '|'
                                else:
                                    yp = yp + 'unknown' + '|'
                                if 'country' in i['location']:
                                    yp = yp + (i['location']['country']
                                               ).encode('utf-8').strip() + '|'
                                else:
                                    yp = yp + 'unknown' + '|'
                            if 'is_closed' in i:
                                yp = yp + str(i['is_closed']) + '|'
                            else:
                                yp = yp + 'unknown' + '|'
                            if 'rating' in i: yp = yp + str(i['rating']) + '|'
                            else: yp = yp + 'unknown' + '|'
                            if 'rating_text' in i:
                                yp = yp + str(i['rating_text']) + '|'
                            else:
                                yp = yp + 'unknown' + '|'
                            if 'rating_color' in i:
                                yp = yp + str(i['rating_color']) + '|'
                            else:
                                yp = yp + 'unknown' + '|'
                            if 'review_count' in i:
                                yp = yp + str(i['review_count']) + '|'
                            else:
                                yp = yp + 'unknown' + '|'
                            yp = yp + str(
                                spark_conf.retrieved_time.replace(':', '-'))
                            #if 'total' in jsondata:yp = yp + str(jsondata['total']) + '|'
                            #else: yp = yp + 'unknown' + '|'
                            # if 'price' in i: yp = yp + str(i['price']) + '|'
                            # else: yp = yp + 'unknown' + '|'
                            # if 'distance' in i: yp = yp + str(i['distance']) + '|'
                            # else: yp = yp + 'unknown' + '|'
                            # if 'phone' in i: yp = yp + str(i['phone']) + '|'
                            # else: yp = yp + 'unknown' + '|'

                            # if 'display_phone' in i: yp = yp + (i['display_phone']).encode('utf-8').strip() + '|'
                            # else: yp = yp + 'unknown' + '|'

                            # if 'coordinates' in i:
                            # 	print 'entered coordinates'
                            # 	if 'latitude' in i['coordinates']:yp = yp + str(i['coordinates']['latitude']) + '|'
                            # 	else: yp = yp + 'unknown' + '|'
                            # 	if 'longitude' in i['coordinates']:yp = yp + str(i['coordinates']['longitude']) + '|'
                            # 	else: yp = yp + 'unknown' + '|'

                            yr.append(yp.encode('utf-8'))

                    #print yr
                    #f = open('/bdaas/log/res_yelp.txt','a');f.write("%s" %yp.encode('utf-8'));f.close()
                    if len(yr) > 0:
                        # saveAsTextFile returns None, so there is nothing useful to bind
                        sc.parallelize(yr).filter(
                            lambda y: y != '').map(lambda x: x.split('|')).map(
                                lambda x: str(x[0]) + '|' + str(x[1]) + '|' +
                                str(x[2]) + '|' + str(x[3]) + '|' + str(x[
                                    4]) + '|' + str(x[5]) + '|' + str(x[6]) +
                                '|' + str(x[7]) + '|' + str(x[8]) + '|' + str(
                                    x[9]) + '|' + str(x[10]) + '|' + str(x[11])
                                + '|' + str(x[12]) + '|' + str(x[13]) + '|'
                                + str(x[14]) + '|' + str(x[15]) + '|' + str(x[
                                    16]) + '|' + str(x[17]) + '|' + str(x[18])
                            ).saveAsTextFile(
                                spark_conf.hdfs_path['yelp_restaurnt_data'] +
                                '%s.txt' % (spark_conf.utc_time()[1]))
                        yr_r = sc.parallelize(yr).map(lambda x: x.split(
                            '|')).map(lambda y: Row(api_key=y[0],
                                                    res_id=y[1],
                                                    res_name=y[2],
                                                    res_url=y[3],
                                                    utm_source=y[4],
                                                    res_location=y[5],
                                                    res_locality=y[6],
                                                    res_city=y[7],
                                                    res_cityid=y[8],
                                                    res_latitude=y[9],
                                                    res_longitude=y[10],
                                                    zip_code=y[11],
                                                    country_id=y[12],
                                                    locality_verbose=y[13],
                                                    agg_rating=y[14],
                                                    res_rating_text=y[15],
                                                    rating_color=y[16],
                                                    votes=y[17],
                                                    retrieved_time=y[18]))
                        df_res = sqlContext.createDataFrame(yr_r)
                        for i in df_res.select('res_id', 'zip_code').collect():
                            ylp = ylp + i['res_id'] + ',' + i['zip_code'] + '|'

                        #print ylp.encode('utf-8')
                        return ylp.encode('utf-8')
        except Exception as e:
            logging.info('error in yelp restaurant function  %s' % str(e))
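A closing note on the split-and-rejoin maps in Examples #5, #8 and #9: splitting each record on '|' and concatenating a fixed number of indexed fields back together is an identity transform for well-formed rows; its only effects are to truncate extra fields and to crash the job with an IndexError on short ones. A hedged sketch that makes the validation explicit instead, for the 19-field records of Example #9 (dropping malformed rows rather than failing is a design choice, not the original behavior):

    expected = 19
    yr_rdd = sc.parallelize(yr) \
               .map(lambda x: x.split('|')) \
               .filter(lambda f: len(f) >= expected) \
               .map(lambda f: '|'.join(f[:expected]))
    yr_rdd.saveAsTextFile(spark_conf.hdfs_path['yelp_restaurnt_data'] +
                          '%s.txt' % spark_conf.utc_time()[1])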