def run(self):
    """Load one downloaded JSON file into the ``restaurant_inbox`` table.

    Every line of ``self.filename`` becomes one
    ``(file_name, rest_desc_json)`` row.  An empty file produces a single
    row whose JSON column is ``None``.  Rows already stored under the same
    file name are deleted before the insert, and ``"Done"`` is written to
    ``self.output()`` afterwards.

    Exceptions raised by the database calls or by writing the output are
    caught and printed rather than re-raised.  Errors while reading the
    input file are not caught.
    """
    # The last 31 characters of the path are used as the stored file name.
    # NOTE(review): presumably the length of
    # "restaurant_data_YYYY-MM-DD.json" - confirm against the downloader.
    jsonfilename = self.filename[-31:]

    # Context manager guarantees the handle is closed even if reading fails.
    with open(self.filename, 'r') as json_file:
        json_content = json_file.readlines()

    if json_content:
        insert_list = [(jsonfilename, line) for line in json_content]
    else:
        # Still record the file so it shows up as seen in the inbox table.
        insert_list = [(jsonfilename, None)]

    qry_string = "insert into restaurant_inbox (file_name, rest_desc_json) values (%s, %s)"
    dbobj = dbUtil(config.db_config['host'],
                   config.db_config['username'],
                   config.db_config['password'],
                   config.db_config['database_name'])
    try:
        if dbobj:
            # NOTE(review): this statement is assembled by string
            # concatenation; switch to a parameterized call if
            # dbUtil.executeQuery supports bind parameters.
            dbobj.executeQuery("delete from restaurant_inbox where file_name = '" + jsonfilename + "'")
            dbobj.executeManyQuery(qry_string, insert_list)
            # NOTE(review): the completion marker is written only when a
            # database handle exists - confirm this matches the original
            # layout, which was lost with the file's indentation.
            with self.output().open('w') as output:
                output.write("Done")
    except Exception as e:
        print('Failed Insert ', self.filename, ': ', e)
def main(self, sc, *args):
    """Recompute the best-ranked restaurant per borough and store it.

    Reads ``vw_restaurant_inbox`` over JDBC, drops rows with a null
    ``boro``, and computes the average ``grade`` and the row count for each
    ``(boro, dba)`` pair.  Within each ``boro`` the pairs are ranked by
    average grade (descending), then by count (descending), and only the
    first-ranked pair is kept.  Those rows are inserted into
    ``recommendation_boro``: existing rows are first set to
    ``is_latest = 0``, and after the insert every row whose ``is_latest``
    is NULL is set to ``is_latest = 1``.

    Args:
        sc: Object passed to ``SQLContext`` to create the SQL context.
        *args: Not used by this method.
    """
    import time

    sqlContext = SQLContext(sc)

    # Connect to the MySQL view and load it as a data frame.
    df_rest_data = sqlContext.read.format("jdbc").options(
        url="jdbc:mysql://192.168.1.221:3306/restaurantdb",
        driver="com.mysql.jdbc.Driver",
        dbtable="vw_restaurant_inbox",
        user="******",
        password="******").load()

    # Aggregate: one row per (boro, dba) with average grade and row count.
    df_rest_data2 = df_rest_data.filter("boro is not null").groupBy(
        "boro", "dba").agg(
            F.avg("grade").alias("grade_avg"),
            F.count("*").alias("grade_count"))

    # Rank inside each boro and keep only the top-ranked dba.
    ranking = Window.partitionBy("boro").orderBy(
        F.desc("grade_avg"), F.desc("grade_count"))
    df_rest_data3 = df_rest_data2.select(
        "boro", "dba", "grade_avg", "grade_count",
        F.row_number().over(ranking).alias("row_num")).filter("row_num = 1")

    rows = df_rest_data3.collect()

    # Take one timestamp for the whole batch so date_created and
    # date_modified always agree, within a row and across rows.
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
    insert_list = [
        (r.boro, r.dba, r.grade_avg, r.grade_count, timestamp, timestamp)
        for r in rows
    ]

    # NOTE(review): connection details are hard-coded here; consider
    # reading them from shared configuration if one is available to this
    # module.
    dbobj = dbUtil("192.168.1.221", "restuser", "restuser", "restaurantdb")
    if dbobj:
        dbobj.executeQuery('update recommendation_boro set is_latest = 0;')
        dbobj.executeManyQuery(
            "insert into recommendation_boro (boro, dba, grade_average, grade_count, date_created, date_modified) values (%s, %s, %s, %s, %s, %s)",
            insert_list)
        dbobj.executeQuery(
            'update recommendation_boro set is_latest = 1 where is_latest is NULL;'
        )
        dbobj.executeCommit()
        # NOTE(review): the completion marker is written only when a
        # database handle exists - confirm this matches the original
        # layout, which was lost with the file's indentation.
        with self.output().open('w') as output:
            output.write("Done")

    # NOTE(review): startTime is not defined in this function; presumably a
    # module-level start timestamp - confirm.
    print(datetime.now() - startTime)
def test_conn(self):
    """Check that a trivial query on the configured database returns rows."""
    settings = config.db_config
    connection = dbUtil(
        settings['host'],
        settings['username'],
        settings['password'],
        settings['database_name'],
    )

    rows = connection.executeQuery("select 1 col1")

    # A non-empty result set is treated as a successful connection.
    got_rows = len(rows) > 0
    self.assertEqual(got_rows, True)
def requires(self):
    """Return one ``SaveRestaurantData`` task per file not yet in the inbox.

    Steps:

    1. Glob ``restaurant_data_*.json`` under
       ``config.download_root_folder + '/data/'``.
    2. Keep files whose embedded date (characters ``[-15:-5]`` of the
       path) is in ``date_util.getDateListString(self.dt)``.  According to
       the original comments this covers files downloaded within the last
       30 days.
    3. Drop files whose name (last 31 characters of the path) already
       appears in ``restaurant_inbox.file_name``.

    Returns:
        list: ``SaveRestaurantData(filename)`` for each remaining path, in
        ascending path order.
    """
    valid_datelist = date_util.getDateListString(self.dt)

    file_folder = os.path.abspath(config.download_root_folder + '/data/')
    file_pattern = 'restaurant_data_*.json'

    # Every JSON file that has been downloaded and saved.
    saved_list = glob.glob(file_folder + '/' + file_pattern)

    # path[-15:-5] is the text between "restaurant_data_" and ".json" when
    # that text is 10 characters long.
    # NOTE(review): presumably a YYYY-MM-DD date - confirm.
    validated_datelist = sorted(
        path for path in saved_list if path[-15:-5] in valid_datelist
    )

    dbobj = dbUtil(config.db_config['host'],
                   config.db_config['username'],
                   config.db_config['password'],
                   config.db_config['database_name'])

    # Default to "nothing processed yet" so the name is always bound, even
    # when no database handle is available.
    resultobj = []
    if dbobj:
        resultobj = dbobj.executeQuery("select distinct file_name from restaurant_inbox order by file_name desc")

    # File names already inserted into the database.  A set gives O(1)
    # membership tests in the filter below.
    processed_files = {' '.join(item) for item in resultobj}

    # Files not yet inserted.  validated_datelist is already sorted and the
    # filter keeps its order, so no second sort is needed.
    dbinsert_datelist = [
        path for path in validated_datelist
        if path[-31:] not in processed_files
    ]

    return [SaveRestaurantData(filename) for filename in dbinsert_datelist]