def module_task(self, params):
    self.logger.info('Starting HBase-HBase ETL using Hadoop to clean billing data...')

    """CHECK INCONSISTENCIES IN params"""
    try:
        result_companyId = params['result_companyId']
        data_companyId = params['data_companyId'] if 'data_companyId' in params else []
        ts_to = params['ts_to']
        ts_from = params['ts_from'] if 'ts_from' in params else date_n_month(ts_to, -96)
        energyTypeList = params['type'] if 'type' in params else []
    except KeyError as e:
        raise Exception('Mandatory parameter not provided: {}'.format(e))

    ####################################################################################################
    """ GET DATA FROM MONGO TO MAKE QUERIES """
    ####################################################################################################

    if not energyTypeList:
        energyTypeList = list(set([
            x['type'] for x in self.mongo['readings'].find({}, {'type': 1})
        ]))
    if not data_companyId:
        data_companyId = list(set([
            x['companyId'] for x in self.mongo['companies'].find({}, {'companyId': 1})
        ]))

    ####################################################################################################
    """ HIVE QUERY TO GET HBASE DATA """
    ####################################################################################################

    for measure_config in self.config['measures']:
        # Create temp tables with HBase data; add them to the clean-up context so they are
        # deleted after execution.
        tables = []
        type_table_name = measure_config["hbase_table"]
        tables_source = []
        tables_energyType = []
        self.logger.info('creating {} tables for {} and {}'.format(
            type_table_name, energyTypeList, data_companyId))
        tables_list = self.hbase.tables()
        for energyType in energyTypeList:
            for companyId in data_companyId:
                try:
                    table_name = "{}_{}_{}".format(type_table_name, energyType, companyId)
                    if table_name not in tables_list:
                        continue
                    hbase_table_name = "{}{}{}".format(
                        self.config['hbase']['db'],
                        self.config['hbase']['db_separator'],
                        table_name)
                    keys = measure_config['hbase_keys']
                    columns = measure_config['hbase_columns']
                    temp_table = create_hive_table_from_hbase_table(
                        self.hive, table_name, hbase_table_name, keys, columns, self.task_UUID)
                    tables.append(temp_table)
                    self.context.add_clean_hive_tables(temp_table)
                    tables_energyType.append(energyType)
                    tables_source.append(companyId)
                    self.logger.debug("Created table: {}".format(temp_table))
                except Exception as e:
                    self.logger.debug("Error creating table: {}".format(e))
        self.logger.debug("Created {} temporary tables".format(len(tables)))

        fields = measure_config["hive_fields"]
        location = measure_config['measures'].format(UUID=self.task_UUID)
        self.context.add_clean_hdfs_file(location)
        input_table = create_hive_module_input_table(
            self.hive, measure_config['temp_input_table'], location, fields, self.task_UUID)
        # add input table to be deleted after execution
        self.context.add_clean_hive_tables(input_table)

        qbr = RawQueryBuilder(self.hive)
        select = ", ".join([f[0] for f in measure_config["sql_sentence_select"]])
        sentence = """
            INSERT OVERWRITE TABLE {input_table}
            SELECT {select} FROM
            ( """.format(select=select, input_table=input_table)
        letter = ["a{}".format(i) for i in range(len(tables) + 1)]
        text = []
        for index, tab in enumerate(tables):
            var = letter[index]
            energy_type = tables_energyType[index]
            source = tables_source[index]
            select = ", ".join([
                f[1] for f in measure_config["sql_sentence_select"]
            ]).format(var=var, energy_type=energy_type, source=source)
            where = measure_config["sql_where_select"].format(
                var=var, ts_from=ts_from, ts_to=ts_to)
            text.append(""" SELECT {select} FROM {tab} {var} WHERE {where} """.format(
                var=var, select=select, tab=tab, where=where))
        sentence += """UNION """.join(text)
        sentence += """) unionResult """
        self.logger.debug(sentence)
        try:
            qbr.execute_query(sentence)
        except Exception as e:
            self.logger.debug("Error executing the union query: {}".format(e))
            continue

    ####################################################################################################
    """ SETUP MAP REDUCE JOB """
    ####################################################################################################

    # remove previous raw_data results
    output_fields = self.config['output']['fields']
    clean_tables = []
    for measure_config in self.config['measures']:
        clean_file_name = measure_config['clean_output_file'].format(UUID=self.task_UUID)
        self.context.add_clean_hdfs_file(clean_file_name)
        clean_table_name = measure_config['clean_output_table']
        self.logger.debug('Launching MR job to clean the daily data')
        try:
            # Launch MapReduce job
            self.launcher_hadoop_job(
                measure_config['type'],
                measure_config['measures'].format(UUID=self.task_UUID),
                clean_file_name,
                result_companyId)
        except Exception as e:
            raise Exception('MRJob process has failed: {}'.format(e))
        clean_table = create_hive_module_input_table(
            self.hive, clean_table_name, clean_file_name, output_fields, self.task_UUID)
        self.context.add_clean_hive_tables(clean_table)
        clean_tables.append([clean_table, measure_config['type']])
        self.logger.debug("MRJob finished for {}".format(measure_config['type']))

    ####################################################################################################
    """ Join the output in a hive table """
    ####################################################################################################

    output_file_name = self.config['output']['output_file_name']
    output_hive_name = self.config['output']['output_hive_table']
    output_hive_table = create_hive_module_input_table(
        self.hive, output_hive_name, output_file_name, output_fields)
    try:
        # self.hdfs.delete returns a generator; iterate it so the deletion actually runs,
        # ignoring errors if the previous output does not exist yet
        for _ in self.hdfs.delete([output_file_name], recurse=True):
            pass
    except Exception:
        pass

    select = ", ".join([f[0] for f in self.config['output']["sql_sentence_select"]])
    sentence = """
        INSERT OVERWRITE TABLE {output_table}
        SELECT {select} FROM
        ( """.format(select=select, output_table=output_hive_table)
    letter = ["a{}".format(i) for i in range(len(clean_tables) + 1)]
    text = []
    for index, tab in enumerate(clean_tables):
        var = letter[index]
        select = ", ".join([
            f[1] for f in self.config['output']["sql_sentence_select"]
        ]).format(var=var, data_type=tab[1])
        text.append(""" SELECT {select} FROM {tab} {var} """.format(
            var=var, select=select, tab=tab[0]))
    sentence += """UNION """.join(text)
    sentence += """) unionResult """
    self.logger.debug(sentence)
    qbr = RawQueryBuilder(self.hive)
    qbr.execute_query(sentence)
    self.logger.info('HBase-HBase ETL clean billing data execution finished...')
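
# ----------------------------------------------------------------------------------------------------
# Minimal standalone sketch (an assumption, not part of the module) of the UNION-query assembly used
# above: one aliased SELECT per temporary HBase-backed table, merged into a single INSERT OVERWRITE.
# Table names, the fixed key/value select list and the timestamps are made up for illustration.
def _sketch_build_union_sentence(input_table, tables, ts_from, ts_to):
    """Return an INSERT OVERWRITE ... SELECT ... FROM ( ... UNION ... ) unionResult sentence."""
    aliases = ["a{}".format(i) for i in range(len(tables))]
    parts = []
    for alias, table in zip(aliases, tables):
        parts.append(
            "SELECT {a}.key, {a}.value FROM {t} {a} "
            "WHERE {a}.ts >= {f} AND {a}.ts <= {t_to}".format(
                a=alias, t=table, f=ts_from, t_to=ts_to))
    return ("INSERT OVERWRITE TABLE {it} SELECT key, value FROM ( ".format(it=input_table)
            + " UNION ".join(parts) + " ) unionResult")

# Example with made-up table names:
# print(_sketch_build_union_sentence("tmp_billing_input",
#                                    ["billing_electricityConsumption_1", "billing_gasConsumption_1"],
#                                    1262304000, 1514678400))
# ----------------------------------------------------------------------------------------------------
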
def module_task(self, params):
    self.logger.info('Starting Module for edinet baseline...')

    """CHECK INCONSISTENCIES IN params"""
    try:
        result_companyId = params['result_companyId']
        ts_to = params['ts_to']
        ts_from = params['ts_from'] if 'ts_from' in params else date_n_month(ts_to, -24)
        energyTypeList = params['type'] if 'type' in params else []
    except KeyError as e:
        raise Exception('Not enough parameters provided to module: {}'.format(e))

    ####################################################################################################
    """ GET DATA FROM MONGO TO MAKE QUERIES """
    ####################################################################################################

    if not energyTypeList:
        energyTypeList = list(set([
            x['type'] for x in self.mongo['readings'].find({}, {'type': 1})
        ]))

    ####################################################################################################
    """ LOAD DATA FROM HIVE """
    ####################################################################################################

    self.logger.info('Extracting data from mongodb')

    # setting variables for readability
    collection = self.config['mongodb']['modelling_units_collection']

    self.logger.debug('Querying for modelling units in MongoDB')
    cursor = self.mongo[collection].find({})
    device_key = {}
    stations = {}
    for item in cursor:
        if len(item['devices']) > 0:  # to avoid an empty list of devices
            for dev in item['devices']:
                stations[str(dev['deviceId'].encode('utf-8'))] = \
                    str(item['stationId']) if 'stationId' in item else None
                if str(dev['deviceId'].encode('utf-8')) in device_key.keys():
                    device_key[str(dev['deviceId'].encode('utf-8'))].append(
                        str(item['modellingUnitId']) + '~' + str(item['devices']))
                else:
                    device_key[str(dev['deviceId'].encode('utf-8'))] = [
                        str(item['modellingUnitId']) + '~' + str(item['devices'])
                    ]
    self.logger.info('A mongo query process has loaded {} devices'.format(len(device_key.keys())))

    ####################################################################################################
    """ HIVE QUERY TO PREPARE DATA FOR MRJOB """
    ####################################################################################################

    # create a table to link devices with stations
    device_stations_df = pd.DataFrame(
        data={"deviceId": stations.keys(), "stationId": stations.values()},
        columns=["deviceId", "stationId"])
    f = NamedTemporaryFile(delete=False, suffix='.csv')
    device_stations_df.to_csv(f.name, header=None, index=None)
    f.close()
    call(["hadoop", "fs", "-mkdir", "-p", f.name, self.config['paths']['stations']])
    call(["hadoop", "fs", "-copyFromLocal", f.name, self.config['paths']['stations']])
    f.unlink(f.name)
    device_stations = create_hive_module_input_table(
        self.hive, 'edinet_device_stations_table',
        self.config['paths']['stations'],
        [('deviceId', 'string'), ('stationId', 'string')],
        self.task_UUID, sep=",")
    self.context.add_clean_hive_tables(device_stations)

    # create a table with the devices values
    fields = [('deviceId', 'string'), ('ts', 'int'), ('value', 'float'),
              ('energyType', 'string'), ('source', 'string'), ('temperature', 'string')]
    location = self.config['paths']['measures']
    input_table = create_hive_module_input_table(
        self.hive, 'edinet_baseline_input', location, fields, self.task_UUID)
    # add input table to be deleted after execution
    self.context.add_clean_hive_tables(input_table)

    qbr = RawQueryBuilder(self.hive)
    sentence = """
        INSERT OVERWRITE TABLE {input_table}
        SELECT a.deviceId, a.ts, a.value, a.energyType, a.source, c.temperature FROM
            (SELECT ai.deviceid as deviceId, ai.ts as ts, ai.value as value,
                    ai.energyType as energyType, ai.source as source
             FROM edinet_hourly_consumption ai
             WHERE ai.ts >= UNIX_TIMESTAMP("{ts_from}","yyyy-MM-dd HH:mm:ss") AND
                   ai.ts <= UNIX_TIMESTAMP("{ts_to}","yyyy-MM-dd HH:mm:ss")) a
            JOIN {device_stations} b on a.deviceId==b.deviceId
            JOIN edinet_meteo c on b.stationId==c.stationId and
                 SUBSTR(FROM_UNIXTIME(a.ts), 1, 13) == SUBSTR(FROM_UNIXTIME(c.ts), 1, 13)
        """.format(input_table=input_table, ts_from=ts_from, ts_to=ts_to,
                   device_stations=device_stations)
    self.logger.debug(sentence)
    qbr.execute_query(sentence)

    ####################################################################################################
    """ SETUP MAP REDUCE JOB """
    ####################################################################################################

    self.logger.info('Launching the MapReduce job')
    try:
        # Launch MapReduce job: align buffered measures to HBase
        self.logger.debug('MRJob Align')
        self.launcher_hadoop_job('align', location, result_companyId, device_key, stations)
    except Exception as e:
        raise Exception('MRJob ALIGN process job has failed: {}'.format(e))
    self.logger.info('Module EDINET_baseline execution finished...')
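
# ----------------------------------------------------------------------------------------------------
# Hypothetical standalone sketch of the "stage a small lookup as a CSV on HDFS" pattern used above
# (NamedTemporaryFile + `hadoop fs -copyFromLocal`). It assumes the hadoop CLI is on the PATH, as in
# the module above; the HDFS directory in the usage comment is made up.
import os
from subprocess import call
from tempfile import NamedTemporaryFile

import pandas as pd


def _sketch_stage_lookup_csv(mapping, hdfs_dir):
    """Write a {deviceId: stationId} dict to a temporary CSV and copy it to hdfs_dir."""
    df = pd.DataFrame({"deviceId": list(mapping.keys()),
                       "stationId": list(mapping.values())},
                      columns=["deviceId", "stationId"])
    tmp = NamedTemporaryFile(delete=False, suffix='.csv')
    df.to_csv(tmp.name, header=None, index=None)
    tmp.close()
    call(["hadoop", "fs", "-mkdir", "-p", hdfs_dir])
    call(["hadoop", "fs", "-copyFromLocal", tmp.name, hdfs_dir])
    os.unlink(tmp.name)  # drop the local temp file once it is on HDFS

# _sketch_stage_lookup_csv({"dev1": "station1"}, "/tmp/edinet/stations_example")
# ----------------------------------------------------------------------------------------------------
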
def module_task(self, params):
    self.logger.info('Starting Module for edinet baseline...')

    """CHECK INCONSISTENCIES IN params"""
    try:
        result_companyId = params['result_companyId']
        ts_to = params['ts_to']
        ts_from = params['ts_from'] if 'ts_from' in params else date_n_month(ts_to, -24)
        energyTypeList = params['type'] if 'type' in params else []
        save_data_debug = True if 'debug' in params and params['debug'] else False
    except KeyError as e:
        raise Exception('Not enough parameters provided to module: {}'.format(e))

    ####################################################################################################
    """ LOAD DATA FROM MONGO MODELLING UNITS """
    ####################################################################################################

    self.logger.info('Extracting modelling_units from mongo')

    # setting variables for readability
    modelling_units_collection = self.config['mongodb']['modelling_units_collection']
    weather_stations_collection = self.config['mongodb']['weather_stations_collection']

    lon_lat_tz_dict = {}
    tf = TimezoneFinder(in_memory=True)
    self.logger.debug('Querying for weather station info in MongoDB')
    cursor = self.mongo[weather_stations_collection].find({})
    for station in cursor:
        lon_lat_tz_dict[station['stationId']] = {
            "lat": station['latitude'],
            "lon": station['longitude'],
            "tz": tf.timezone_at(lat=station['latitude'], lng=station['longitude'])
        }
    cursor.close()
    tf = None

    device_key = {}
    stations = {}
    solar_station = {}
    self.logger.debug('Querying for modelling unit info in MongoDB')
    cursor = self.mongo[modelling_units_collection].find({})
    for item in cursor:
        # skip empty device lists and unknown stations
        if len(item['devices']) > 0 and item['stationId'] != "Unknown":
            for dev in item['devices']:
                stations[dev['deviceId']] = item['stationId'] if 'stationId' in item else None
                solar_station[dev['deviceId']] = \
                    item['solar_station'] if 'solar_station' in item else None
                key_str = "{modelling}~{devices}~{lat}~{lon}~{tz}".format(
                    modelling=item['modellingUnitId'],
                    devices=item['devices'],
                    lat=lon_lat_tz_dict[item['stationId']]['lat'],
                    lon=lon_lat_tz_dict[item['stationId']]['lon'],
                    tz=lon_lat_tz_dict[item['stationId']]['tz'])
                if dev['deviceId'] in device_key.keys():
                    device_key[dev['deviceId']].append(key_str)
                else:
                    device_key[dev['deviceId']] = [key_str]
    cursor.close()
    self.logger.info('A mongo query process has loaded {} devices'.format(len(device_key.keys())))

    ####################################################################################################
    """ CREATE INPUT DATA FROM HIVE TABLES """
    ####################################################################################################

    # create a table to link devices with weather stations
    self.logger.debug('creating weather hive table')
    weather_stations_df = pd.DataFrame(
        data={"deviceId": list(stations.keys()), "stationId": list(stations.values())},
        columns=["deviceId", "stationId"])
    f_station = NamedTemporaryFile(delete=False, suffix='.csv')
    weather_stations_df.to_csv(f_station.name, header=None, index=None)
    call(["hadoop", "fs", "-mkdir", "-p", f_station.name, self.config['paths']['stations']])
    call(["hadoop", "fs", "-copyFromLocal", f_station.name, self.config['paths']['stations']])
    weather_stations = create_hive_module_input_table(
        self.hive, 'edinet_weather_stations_table',
        self.config['paths']['stations'],
        [('deviceId', 'string'), ('stationId', 'string')],
        self.task_UUID, sep=",")
    self.context.add_clean_hive_tables(weather_stations)

    # create a table to link devices with solar stations
    self.logger.debug('creating solar hive table')
    solar_stations_df = pd.DataFrame(
        data={"deviceId": list(solar_station.keys()), "stationId": list(solar_station.values())},
        columns=["deviceId", "stationId"])
    f_solar_station = NamedTemporaryFile(delete=False, suffix='.csv')
    solar_stations_df.to_csv(f_solar_station.name, header=None, index=None)
    call(["hadoop", "fs", "-mkdir", "-p", f_solar_station.name,
          self.config['paths']['solar_stations']])
    call(["hadoop", "fs", "-copyFromLocal", f_solar_station.name,
          self.config['paths']['solar_stations']])
    solar_stations = create_hive_module_input_table(
        self.hive, 'edinet_solar_stations_table',
        self.config['paths']['solar_stations'],
        [('deviceId', 'string'), ('stationId', 'string')],
        self.task_UUID, sep=",")
    self.context.add_clean_hive_tables(solar_stations)

    # create a table with the devices values
    self.logger.debug('creating input table')
    final_table_fields = [[x[0], x[1]] for x in self.config['hive']['final_table_fields']]
    location = self.config['paths']['measures']
    input_table = create_hive_module_input_table(
        self.hive, self.config['hive']['job_table_name'], location,
        final_table_fields, self.task_UUID)
    # add input table to be deleted after execution
    self.context.add_clean_hive_tables(input_table)

    self.logger.debug('creating hive query')
    qbr = RawQueryBuilder(self.hive)
    total_select_joint = ", ".join(
        ["{}.{}".format(x[2], x[0]) for x in self.config['hive']['final_table_fields']])
    sentence = """
        INSERT OVERWRITE TABLE {input_table}
        SELECT {total_select_joint} FROM
            (SELECT ai.deviceid as deviceId, ai.ts as ts, ai.value as value,
                    ai.energyType as energyType
             FROM edinet_hourly_consumption ai
             WHERE ai.ts >= UNIX_TIMESTAMP("{ts_from}","yyyy-MM-dd HH:mm:ss") AND
                   ai.ts <= UNIX_TIMESTAMP("{ts_to}","yyyy-MM-dd HH:mm:ss") AND
                   ai.deviceid IN ({devices})) a
            JOIN {weather_stations} b on a.deviceId==b.deviceId
            JOIN {solar_stations} b1 on a.deviceId==b1.deviceId
            JOIN edinet_meteo c on b.stationId==c.stationId and
                 SUBSTR(FROM_UNIXTIME(a.ts), 1, 13) == SUBSTR(FROM_UNIXTIME(c.ts), 1, 13)
            JOIN edinet_meteo d on b1.stationId==d.stationId and
                 SUBSTR(FROM_UNIXTIME(a.ts), 1, 13) == SUBSTR(FROM_UNIXTIME(d.ts), 1, 13)
        """.format(input_table=input_table, total_select_joint=total_select_joint,
                   ts_from=ts_from, ts_to=ts_to,
                   weather_stations=weather_stations, solar_stations=solar_stations,
                   devices=", ".join("\"{}\"".format(x) for x in list(device_key.keys())))
    self.logger.debug(sentence)
    qbr.execute_query(sentence)

    ####################################################################################################
    """ LOAD from MONGO to HBASE """
    ####################################################################################################

    self.logger.info('Launching the MapReduce job')
    try:
        # Launch MapReduce job: baseline calculation
        self.logger.debug('Baseline Calculation')
        self.launcher_hadoop_job(location, device_key, result_companyId, save_data_debug)
    except Exception as e:
        raise Exception('MRJob baseline process has failed: {}'.format(e))
    self.logger.info('Module EDINET_baseline execution finished...')
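
# ----------------------------------------------------------------------------------------------------
# Hypothetical sketch of the station -> (lat, lon, tz) lookup built above from the weather stations
# collection; it only relies on the timezonefinder package already used by this module. The station
# id and coordinates in the usage comment are made up.
from timezonefinder import TimezoneFinder


def _sketch_station_timezones(stations):
    """stations: iterable of (stationId, latitude, longitude); returns {stationId: {lat, lon, tz}}."""
    tf = TimezoneFinder(in_memory=True)
    out = {}
    for station_id, lat, lon in stations:
        out[station_id] = {"lat": lat, "lon": lon,
                           "tz": tf.timezone_at(lat=lat, lng=lon)}
    return out

# _sketch_station_timezones([("station_bcn", 41.39, 2.16)])  # tz -> "Europe/Madrid"
# ----------------------------------------------------------------------------------------------------
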
def module_task(self, params):
    self.logger.info('Starting Module for edinet comparisons...')

    """CHECK INCONSISTENCIES IN params"""
    try:
        result_companyId = params['result_companyId']
        ts_to = params['ts_to']
        ts_from = params['ts_from'] if 'ts_from' in params else date_n_month(ts_to, -48)
        energyTypeDict = params['type'] if 'type' in params else {
            'heatConsumption': 'gasConsumption',
            'gasConsumption': 'gasConsumption',
            'monthlyElectricityConsumption': 'electricityConsumption',
            'electricityConsumption': 'electricityConsumption'
        }
    except KeyError as e:
        raise Exception('Not enough parameters provided to module: {}'.format(e))

    ####################################################################################################
    """ LOAD from MONGO """
    ####################################################################################################

    # Get the link between the devices and the modelling units, as a dict
    # {deviceId: ["modellingUnit~devices~area", ...]}
    self.logger.info('Extracting data from mongodb')
    modelling_units_collection = self.config['mongodb']['modelling_units_collection']
    cursor = self.mongo[modelling_units_collection].find({})
    device_key = {}

    def get_building(modelling_unit, mongo, building_collection, reporting_collection):
        building = mongo[building_collection].find_one({"modellingUnits": modelling_unit})
        if not building:
            reporting = mongo[reporting_collection].find_one({"modelling_Units": modelling_unit})
            if reporting and "reportingUnitId" in reporting:
                building = mongo[building_collection].find_one(
                    {"buildingId": reporting['reportingUnitId']})
        if not building:
            return None
        return building

    building_collection = self.config['mongodb']['buildings_collection']
    reporting_collection = self.config['mongodb']['reporting_collection']
    self.logger.debug("generating the device_key dict")
    for item in cursor:
        #self.logger.debug(item)
        #self.logger.debug("getting item building {}".format(item['modellingUnitId']))
        building = get_building(item['modellingUnitId'], self.mongo,
                                building_collection, reporting_collection)
        #self.logger.debug("obtained building {}".format(building))
        if building and 'data' in building and 'areaBuild' in building['data']:
            surface = building["data"]["areaBuild"]
        else:
            surface = None
        #self.logger.debug("area of building: {}".format(surface))
        if len(item['devices']) > 0 and surface:  # to avoid an empty list of devices
            #self.logger.debug("list of devices {}".format(item['devices']))
            for dev in item['devices']:
                key_str = "{modelling}~{devices}~{area}".format(
                    modelling=item['modellingUnitId'],
                    devices=item['devices'],
                    area=surface)
                if dev['deviceId'] in device_key.keys():
                    device_key[dev['deviceId']].append(key_str)
                else:
                    device_key[dev['deviceId']] = [key_str]
        #self.logger.debug("finished for {}".format(item['modellingUnitId']))
    cursor.close()
    self.logger.info('A mongo query process has loaded {} devices'.format(len(device_key.keys())))

    ####################################################################################################
    """ HIVE QUERY TO PREPARE DATA THAT HAS TO BE LOADED INTO MONGO """
    ####################################################################################################

    # create a table with the devices values that will be the input of the MRJob that creates
    # the monthly datatable
    self.logger.debug('creating input table to aggregate monthly')
    final_table_fields = [[x[0], x[1]] for x in self.config['hive']['final_table_fields']]
    location = self.config['paths']['monthly_aggregation']
    input_table = create_hive_module_input_table(
        self.hive, self.config['hive']['job_table_name'], location,
        final_table_fields, self.task_UUID)
    # add input table to be deleted after execution
    self.context.add_clean_hive_tables(input_table)

    self.logger.debug('creating hive query')
    qbr = RawQueryBuilder(self.hive)
    total_select_joint = ", ".join(
        ["{}.{}".format(x[2], x[0]) for x in self.config['hive']['final_table_fields']])
    sentence = """
        INSERT OVERWRITE TABLE {input_table}
        SELECT {total_select_joint} FROM
            (SELECT ai.deviceid as deviceId, ai.ts as ts, ai.value as value,
                    ai.energyType as energyType, ai.source as source
             FROM edinet_daily_consumption ai
             WHERE ai.ts >= UNIX_TIMESTAMP("{ts_from}","yyyy-MM-dd HH:mm:ss") AND
                   ai.ts <= UNIX_TIMESTAMP("{ts_to}","yyyy-MM-dd HH:mm:ss") AND
                   ai.deviceid IN ({devices})) a
        """.format(input_table=input_table, total_select_joint=total_select_joint,
                   ts_from=ts_from, ts_to=ts_to,
                   devices=", ".join("\"{}\"".format(x) for x in list(device_key.keys())))
    self.logger.debug(sentence)
    qbr.execute_query(sentence)
    self.hive.close()
    gc.collect()

    ####################################################################################################
    """ MAPREDUCE TO AGGREGATE MONTHLY DATA """
    ####################################################################################################

    self.logger.info('Running MapReduce for monthly aggregation')
    output_location = self.config['paths']['output_monthly_aggregation']
    try:
        # Launch MapReduce job: monthly aggregation
        self.logger.debug('Monthly Aggregation')
        self.aggregate_hadoop_job(location, output_location, device_key, result_companyId)
    except Exception as e:
        raise Exception('MRJob aggregation process has failed: {}'.format(e))

    output_fields = [["modellingUnit", "string"], ["ts", "bigint"],
                     ["value", "float"], ["energyType", "string"]]
    aggregated_table_name = self.config['hive']['output_monthly_aggregation']
    aggregated_table = create_hive_module_input_table(
        self.hive, aggregated_table_name, output_location, output_fields, self.task_UUID)
    self.context.add_clean_hive_tables(aggregated_table)
    self.logger.debug("MRJob for monthly aggregation finished")

    ####################################################################################################
    """ MAPREDUCE TO CALCULATE BENCHMARKING """
    ####################################################################################################

    self.logger.debug('creating benchmarking information table')
    building_collection = self.config['mongodb']['buildings_collection']
    cursor = self.mongo[building_collection].find({})
    buildings_list = []
    for item in cursor:
        if not 'modellingUnits' in item or not 'data' in item:
            continue
        if not 'useType' in item['data'] or not 'organizationLevel1' in item['data']:
            continue
        for modelling in item['modellingUnits']:
            b_dic = {
                "modellingunit": modelling,
                "type": item['data']['useType'],
                "organization": item['data']['organizationLevel1']
            }
            buildings_list.append(b_dic)
    cursor.close()

    buildings_df = pd.DataFrame.from_records(
        buildings_list, columns=['modellingunit', 'type', 'organization'])
    f_station = NamedTemporaryFile(delete=False, suffix='.csv')
    buildings_df.to_csv(f_station.name, header=None, index=None)
    call(["hadoop", "fs", "-mkdir", "-p", f_station.name, self.config['paths']['building_info']])
    call(["hadoop", "fs", "-copyFromLocal", f_station.name, self.config['paths']['building_info']])
    building_table = create_hive_module_input_table(
        self.hive, self.config['hive']['building_info_table'],
        self.config['paths']['building_info'],
        [('modellingunit', 'string'), ('type', 'string'), ('organization', 'string')],
        self.task_UUID, sep=",")
    self.context.add_clean_hive_tables(building_table)

    self.logger.debug('creating hive query to join data with information')
    qbr = RawQueryBuilder(self.hive)
    location = self.config['paths']['benchmarking_data']
    benchmarking_field = self.config['hive']['benchmarking_table_fields']
    benchmarking_table = create_hive_module_input_table(
        self.hive, self.config['hive']['benchmarking_table'], location,
        benchmarking_field, self.task_UUID)
    total_select_joint = ", ".join(["{}.{}".format(x[2], x[0]) for x in benchmarking_field])
    sentence = """
        INSERT OVERWRITE TABLE {input_table}
        SELECT {total_select_joint} FROM
            (SELECT * FROM {aggregated_table}) a
            JOIN {building_table} b on a.modellingUnit==b.modellingUnit
        """.format(input_table=benchmarking_table, total_select_joint=total_select_joint,
                   aggregated_table=aggregated_table, building_table=building_table)
    self.logger.debug(sentence)
    qbr.execute_query(sentence)

    self.logger.info('Running MapReduce for benchmarking')
    try:
        # Launch MapReduce job: benchmarking calculation
        self.logger.debug('Benchmarking calculation')
        self.benchmarking_hadoop_job(location, energyTypeDict, result_companyId)
    except Exception as e:
        raise Exception('MRJob benchmarking process has failed: {}'.format(e))
    self.logger.debug("MRJob for benchmarking finished")
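
# ----------------------------------------------------------------------------------------------------
# Hypothetical standalone sketch of the device_key mapping built above: every deviceId points to the
# "modellingUnit~devices~area" strings of the modelling units that contain it. The records in the
# usage comment are made up.
def _sketch_build_device_key(modelling_units):
    """modelling_units: iterable of dicts with 'modellingUnitId', 'devices' and 'area' keys."""
    device_key = {}
    for item in modelling_units:
        for dev in item['devices']:
            key_str = "{modelling}~{devices}~{area}".format(
                modelling=item['modellingUnitId'],
                devices=item['devices'],
                area=item['area'])
            device_key.setdefault(dev['deviceId'], []).append(key_str)
    return device_key

# _sketch_build_device_key([{'modellingUnitId': 'mu1', 'area': 1200.0,
#                            'devices': [{'deviceId': 'dev1', 'multiplier': 1}]}])
# ----------------------------------------------------------------------------------------------------
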