def init(self, input_dict=None, block_params=None, program_arguments=None):
    try:
        self.init_log_and_metric_handlers(block_params)
        self.logger.info('Read stream block')
        self.logger.info('Input params:')
        self.logger.info(json.dumps(input_dict, sort_keys=True, indent=2))
        sdk.resource_monitor_handler(block_params).post_cpu_mem_usage()
        return
    except Exception as e:
        self.logger.error(traceback.format_exc())
        self.block_status = "FAILED"
        raise e
def run(self, input_dict=None, block_params=None, program_arguments=None):
    try:
        t1 = time.time()
        output_dict = dict()
        kafka_handler = sdk.kafka_handler(None)
        kafka_api_instance = kafka_handler.get_api_instance()
        channels = {}
        for count in range(5):
            self.logger.info("For count: " + str(count))
            # Create one consumer pool per input topic.
            for key, data in input_dict.items():
                topic = data['queueTopicName']
                if not topic:
                    continue
                self.logger.info('queueTopicName: ' + topic)
                consumer_pool = {
                    "count": 1000,
                    "bufferSizeInMB": 1,
                    "groupId": str(uuid.uuid4()),
                    "registerId": "",
                    "topicsListToSubscribe": [topic]
                }
                try:
                    consumer_pool_res = kafka_api_instance.create_consumer_list_using_post(
                        consumer_pool)
                    channels[key] = consumer_pool_res.result
                except Exception as e:
                    self.logger.error(
                        "Error trying to create a consumer for topic: " + str(topic))
                    self.block_status = "FAILED"
                    raise e
            optional_param = {}
            optional_param["api_instance"] = kafka_api_instance
            optional_param["channels"] = channels
            self.field_list = {}
            array_of_threads = []
            # Fetch each topic's schema, then read its records on a worker thread.
            for key, data in input_dict.items():
                topic = data['queueTopicName']
                if not topic:
                    continue
                req = {"topicName": topic}
                try:
                    schema = kafka_api_instance.get_topic_meta_using_post(req)
                    schema = json.loads(json.loads(schema.result)["schema"])
                    optional_param['schema'] = schema
                    self.logger.debug("Schema received")
                except Exception as e:
                    self.logger.error("Error fetching schema")
                    self.logger.error(str(e))
                    self.logger.error(traceback.format_exc())
                    self.block_status = "FAILED"
                    raise e
                col_names = schema.keys()
                parsed_schema_dict = {}
                for name in col_names:
                    values = schema.get(name)
                    parsed_schema_dict[name] = values['type']
                self.logger.info("Schema:")
                self.logger.info(schema)
                self.logger.info(parsed_schema_dict)
                self.list_of_struct_fields = []
                self.field_list[key] = []
                fpath = '/bigbrain/' + str(key)
                if os.path.exists(fpath):
                    rmtree(fpath)
                os.makedirs(fpath, exist_ok=True)
                t = self.ReadRecords(topic, key, input_dict, block_params,
                                     optional_param, self.field_list)
                t.start()
                # Reads are currently serialized; to read topics concurrently,
                # collect the threads and join them after the loop instead:
                # array_of_threads.append(t)
                t.join()
        self.logger.info('All topics read done')
        output_dict["queueTopicName"] = ""
        output_dict['readerInfo'] = None
        output_dict['readerInfoError'] = None
        output_dict["infoKeys"] = None
        self.logger.info("Output:")
        self.logger.info(json.dumps(output_dict, indent=2))
        return output_dict
    except Exception as e:
        self.logger.error(traceback.format_exc())
        self.block_status = "FAILED"
        raise e
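# The ReadRecords worker used above is not defined in this section. Judging by
# its constructor arguments and the start()/join() calls, it is presumably a
# threading.Thread subclass that drains one consumer channel and spills records
# under /bigbrain/<key>. The sketch below is an illustrative assumption only:
# the class name, constructor signature, channel object, and record format are
# all hypothetical, not the block's actual implementation.
import json
import os
import threading


class ReadRecordsSketch(threading.Thread):
    """Illustrative stand-in for the ReadRecords thread (hypothetical)."""

    def __init__(self, topic, key, channel, out_dir='/bigbrain'):
        super().__init__()
        self.topic = topic
        self.key = key
        self.channel = channel  # hypothetical consumer handle for the topic
        self.out_path = os.path.join(out_dir, str(key), 'records.jsonl')

    def run(self):
        # Append every batch the consumer returns until the stream is exhausted.
        with open(self.out_path, 'a') as out:
            for batch in self.channel:  # assumes the channel is iterable
                for record in batch:
                    out.write(json.dumps(record) + '\n')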
def run(self, input_dict=None, block_params=None, program_arguments=None):
    try:
        t1 = time.time()
        output_dict = dict()
        configs = input_dict["Config"]
        queue_dict = {}
        queue_dict['left_df'] = input_dict['leftData']['queueTopicName']
        queue_dict['right_df'] = input_dict['rightData']['queueTopicName']
        kafka_handler = sdk.kafka_handler(None)
        kafka_api_instance = kafka_handler.get_api_instance()
        channels = {}
        for key, topic in queue_dict.items():
            consumer_pool = {
                "count": 1,
                "groupId": str(uuid.uuid4()),
                "registerId": "",
                "topicsListToSubscribe": [topic]
            }
            try:
                consumer_pool_res = kafka_api_instance.create_consumer_list_using_post(
                    consumer_pool)
                channels[key] = consumer_pool_res.result
            except Exception as e:
                self.logger.error(
                    "Error trying to create a consumer for topic: " + str(topic))
                self.block_status = "FAILED"
                raise e
        optional_param = {}
        optional_param['queue_dict'] = queue_dict
        optional_param["api_instance"] = kafka_api_instance
        optional_param["channels"] = channels
        self.spark = SparkConfCustom().get_spark_session()
        self.spark.sparkContext.setLogLevel('ERROR')
        self.spark_schema = {}
        self.field_list = {}
        array_of_threads = []
        for key, topic in queue_dict.items():
            req = {"topicName": topic}
            try:
                schema = kafka_api_instance.get_topic_meta_using_post(req)
                schema = json.loads(json.loads(schema.result)["schema"])
                optional_param['schema'] = schema
                self.logger.debug("Schema received")
            except Exception as e:
                self.logger.error("Error fetching schema")
                self.logger.error(str(e))
                self.logger.error(traceback.format_exc())
                self.block_status = "FAILED"
                raise e
            col_names = schema.keys()
            parsed_schema_dict = {}
            for name in col_names:
                values = schema.get(name)
                parsed_schema_dict[name] = values['type']
            self.logger.info("Schema:")
            self.logger.info(schema)
            self.logger.info(parsed_schema_dict)
            self.list_of_struct_fields = []
            self.field_list[key] = []
            # Map the serialized Spark type names from topic metadata onto StructFields.
            for name in parsed_schema_dict.keys():
                if parsed_schema_dict[name] == 'FloatType()':
                    self.field_list[key].append('float')
                    self.list_of_struct_fields.append(
                        StructField(name, FloatType(), True))
                elif parsed_schema_dict[name] == 'IntegerType()':
                    self.field_list[key].append('int')
                    self.list_of_struct_fields.append(
                        StructField(name, IntegerType(), True))
                elif parsed_schema_dict[name] == 'DoubleType()':
                    self.field_list[key].append('float')
                    self.list_of_struct_fields.append(
                        StructField(name, DoubleType(), True))
                else:
                    self.field_list[key].append('string')
                    self.list_of_struct_fields.append(
                        StructField(name, StringType(), True))
            self.spark_schema[key] = StructType(self.list_of_struct_fields)
            fpath = '/bigbrain/' + str(key)
            if os.path.exists(fpath):
                rmtree(fpath)
            os.makedirs(fpath, exist_ok=True)
            t = self.ReadRecords(self.spark, topic, key, self.spark_schema[key],
                                 input_dict, block_params, optional_param,
                                 self.field_list)
            t.start()
            array_of_threads.append(t)
        for t in array_of_threads:
            t.join()
        self.logger.info('Both topics read done')

        self.left_df = self.spark.read.parquet('/bigbrain/left_df')
        self.logger.info(self.left_df.count())
        self.right_df = self.spark.read.parquet('/bigbrain/right_df')
        self.logger.info(self.right_df.count())
        # Join directly on the configured keys instead of building an exec() string.
        self.resultant_join_df = self.left_df.join(
            self.right_df,
            self.left_df[configs['unique_key_left']] ==
            self.right_df[configs['unique_key_right']],
            how=configs['join_type'])
        self.logger.info(self.left_df.rdd.getNumPartitions())
        self.logger.info(self.right_df.rdd.getNumPartitions())
        # Deduplicate the join output's column names: the first occurrence of
        # each duplicated name is prefixed with 'bbl_'.
        new_column_name_list = self.resultant_join_df.columns
        for col in new_column_name_list:
            count = new_column_name_list.count(col)
            if count > 1:
                idx = new_column_name_list.index(col)
                new_column_name_list[idx] = 'bbl_' + col
        self.logger.info(self.resultant_join_df.columns)
        self.resultant_join_df = self.resultant_join_df.toDF(*new_column_name_list)
        self.logger.info(self.resultant_join_df.columns)
        self.resultant_join_df.write.csv('/tmp/test')
        self.logger.info(self.resultant_join_df.rdd.getNumPartitions())
        self.logger.info("*****************************")
        self.logger.info("Join Completed")

        # Create a topic for the result to be stored.
        kafka_handler = sdk.kafka_handler(None)
        api_instance = kafka_handler.get_api_instance()
        # Detect the result schema and attach it as topic metadata.
        block_start_time = time.time()
        schema_new = self.get_df_schema(self.resultant_join_df)
        operationalParams = {}
        operationalParams["api_instance"] = api_instance
        operationalParams["block_start_time"] = block_start_time
        operationalParams["dataframe_name"] = "resultant_join_df"
        operationalParams["data_frame"] = self.resultant_join_df
        operationalParams["schema"] = schema_new
        resultant_topic = str(uuid.uuid4())
        try:
            topic = {
                "name": resultant_topic,
                "identifiers": block_params,
                "displayName": 'resultantQueueTopic'
            }
            topic_res = api_instance.create_topic_using_post(topic)
            topic_name = {
                "name": resultant_topic,
                "metaData": {
                    "schema": json.dumps(schema_new)
                }
            }
            api_instance.update_topic_meta_using_post(topic_name)
            self.logger.info(f"Schema added for topic {resultant_topic}")
        except Exception as e:
            self.logger.error("Error creating resultant topic " + str(resultant_topic))
            raise e
        operationalParams["topic_id"] = resultant_topic
        if input_dict['Config']['streamOutput']:
            self.stream_block2(input_dict=input_dict,
                               optional_arg=operationalParams,
                               block_params=block_params)
        else:
            self.data_target_params = input_dict["DataTarget"]
            try:
                self.validate_target_params()
            except Exception as e:
                self.logger.error(str(e))
                raise e
            try:
                self.client = self.validate_hdfs_connection(
                    input_dict=input_dict, block_params=block_params)
                self.data_target_params['fileWithFullPath'] = \
                    self.data_target_params['filePath']
                if not self.data_target_params['overwrite']:
                    exists = self.file_exits(hdfs_connection=self.client)
                    if exists:
                        raise FileExistsError(
                            "File already exists: " +
                            str(self.data_target_params['filePath']))
            except Exception as e:
                self.logger.error(str(e))
                raise e
            self.block_write(self.client, '/tmp/test')
        output_dict["queueTopicName"] = resultant_topic
        output_dict['readerInfo'] = None
        output_dict['readerInfoError'] = None
        output_dict["infoKeys"] = None
        self.logger.info("Output:")
        self.logger.info(json.dumps(output_dict, indent=2))
        return output_dict
    except Exception as e:
        self.logger.error(traceback.format_exc())
        self.block_status = "FAILED"
        raise e
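# The if/elif ladder above translates the serialized type names stored in
# topic metadata ('FloatType()', 'IntegerType()', ...) into Spark StructFields,
# falling back to StringType for anything unrecognized. A minimal dict-driven
# sketch of the same mapping; build_spark_schema is a hypothetical helper name,
# not part of the block:
from pyspark.sql.types import (DoubleType, FloatType, IntegerType, StringType,
                               StructField, StructType)

_TYPE_MAP = {
    'FloatType()': FloatType(),
    'IntegerType()': IntegerType(),
    'DoubleType()': DoubleType(),
}


def build_spark_schema(parsed_schema_dict):
    """Translate {column: serialized type name} into a StructType."""
    fields = [StructField(name, _TYPE_MAP.get(type_name, StringType()), True)
              for name, type_name in parsed_schema_dict.items()]
    return StructType(fields)


# Example: build_spark_schema({'id': 'IntegerType()', 'name': 'StringType()'})
# -> StructType([StructField('id', IntegerType(), True),
#                StructField('name', StringType(), True)])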
def init_log_and_metric_handlers(self, block_params=None):
    self.logger = sdk.block_log_handler(block_params)
    self.metrics_handler = sdk.metrics_api()
    self.metrics["appId"] = MetricConfig.GRAPH_APP_ID
    self.metrics["keys"] = block_params
    return
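# init_log_and_metric_handlers assigns into self.metrics without creating the
# dict, so the block class presumably initializes it elsewhere, e.g. in
# __init__. A minimal sketch of that assumed setup (names and defaults are
# assumptions, not the actual base class):
class BlockBaseSketch:
    def __init__(self):
        self.metrics = {}         # populated by init_log_and_metric_handlers
        self.block_status = None  # set to "FAILED" by the error paths above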
def run(self, input_dict=None, block_params=None, program_arguments=None):
    try:
        t1 = time.time()
        output_dict = dict()
        configs = input_dict["Config"]
        queue_dict = {}
        queue_dict['left_df'] = input_dict['leftData']['queueTopicName']
        queue_dict['right_df'] = input_dict['rightData']['queueTopicName']
        kafka_handler = sdk.kafka_handler(None)
        kafka_api_instance = kafka_handler.get_api_instance()
        channels = {}
        for key, topic in queue_dict.items():
            consumer_pool = {
                "count": 1,
                "groupId": str(uuid.uuid4()),
                "registerId": "",
                "topicsListToSubscribe": [topic]
            }
            try:
                consumer_pool_res = kafka_api_instance.create_consumer_list_using_post(
                    consumer_pool)
                channels[key] = consumer_pool_res.result
            except Exception as e:
                self.logger.error(
                    "Error trying to create a consumer for topic: " + str(topic))
                self.block_status = "FAILED"
                raise e
        optional_param = {}
        optional_param['queue_dict'] = queue_dict
        optional_param["api_instance"] = kafka_api_instance
        optional_param["channels"] = channels
        self.spark = SparkConfCustom().get_spark_session()
        self.spark.sparkContext.setLogLevel('ERROR')
        self.spark_schema = {}
        self.field_list = {}
        array_of_threads = []
        for key, topic in queue_dict.items():
            req = {"topicName": topic}
            try:
                schema = kafka_api_instance.get_topic_meta_using_post(req)
                schema = json.loads(json.loads(schema.result)["schema"])
                optional_param['schema'] = schema
                self.logger.debug("Schema received")
            except Exception as e:
                self.logger.error("Error fetching schema")
                self.logger.error(str(e))
                self.logger.error(traceback.format_exc())
                self.block_status = "FAILED"
                raise e
            col_names = schema.keys()
            parsed_schema_dict = {}
            for name in col_names:
                values = schema.get(name)
                parsed_schema_dict[name] = values['type']
            self.logger.info("Schema:")
            self.logger.info(schema)
            self.logger.info(parsed_schema_dict)
            self.list_of_struct_fields = []
            self.field_list[key] = []
            # Map the serialized Spark type names from topic metadata onto StructFields.
            for name in parsed_schema_dict.keys():
                if parsed_schema_dict[name] == 'FloatType()':
                    self.field_list[key].append('float')
                    self.list_of_struct_fields.append(
                        StructField(name, FloatType(), True))
                elif parsed_schema_dict[name] == 'IntegerType()':
                    self.field_list[key].append('int')
                    self.list_of_struct_fields.append(
                        StructField(name, IntegerType(), True))
                elif parsed_schema_dict[name] == 'DoubleType()':
                    self.field_list[key].append('float')
                    self.list_of_struct_fields.append(
                        StructField(name, DoubleType(), True))
                else:
                    self.field_list[key].append('string')
                    self.list_of_struct_fields.append(
                        StructField(name, StringType(), True))
            self.spark_schema[key] = StructType(self.list_of_struct_fields)
            fpath = '/bigbrain/' + str(key)
            if os.path.exists(fpath):
                rmtree(fpath)
            os.makedirs(fpath, exist_ok=True)
            t = self.ReadRecords(self.spark, topic, key, self.spark_schema[key],
                                 input_dict, block_params, optional_param,
                                 self.field_list)
            t.start()
            array_of_threads.append(t)
        for t in array_of_threads:
            t.join()
        self.logger.info('Both topics read done')

        self.left_df = self.spark.read.parquet('/bigbrain/left_df')
        self.logger.info(self.left_df.count())
        self.right_df = self.spark.read.parquet('/bigbrain/right_df')
        self.logger.info(self.right_df.count())
        # Join directly on the configured keys instead of building an exec() string.
        self.resultant_join_df = self.left_df.join(
            self.right_df,
            self.left_df[configs['unique_key_left']] ==
            self.right_df[configs['unique_key_right']],
            how=configs['join_type'])
        self.logger.info(self.left_df.rdd.getNumPartitions())
        self.logger.info(self.right_df.rdd.getNumPartitions())
        # Deduplicate the join output's column names: the first occurrence of
        # each duplicated name gets a '_1' suffix.
        new_column_name_list = self.resultant_join_df.columns
        for col in new_column_name_list:
            count = new_column_name_list.count(col)
            if count > 1:
                idx = new_column_name_list.index(col)
                new_column_name_list[idx] = col + '_1'
        self.logger.info(self.resultant_join_df.columns)
        self.resultant_join_df = self.resultant_join_df.toDF(*new_column_name_list)
        self.logger.info(self.resultant_join_df.columns)
        temp_fp = '/bigbrain/' + str(t1) + '.csv'
        self.logger.info(temp_fp)
        temp_join_time_st = time.time()
        self.resultant_join_df.write.mode("overwrite").option(
            "header", "true").csv(temp_fp)
        temp_join_time_end = time.time()
        self.logger.info('Time for join: ' +
                         str(temp_join_time_end - temp_join_time_st) +
                         ', file partitions: ' +
                         str(self.resultant_join_df.rdd.getNumPartitions()))
        self.logger.info("*****************************")
        self.logger.info("Join Completed")

        self.data_target_params = input_dict["DataTarget"]
        try:
            self.validate_target_params()
        except Exception as e:
            self.logger.error(str(e))
            raise e
        try:
            self.client = self.validate_hdfs_connection(
                input_dict=input_dict, block_params=block_params)
            self.data_target_params['fileWithFullPath'] = \
                self.data_target_params['filePath']
            exists = self.file_exits(hdfs_connection=self.client)
            if exists:
                if self.data_target_params['overwrite']:
                    # Remove the existing file before writing.
                    self.delete_file(hdfs_connection=self.client)
                    self.append = False
                else:
                    raise FileExistsError(
                        "File already exists: " +
                        str(self.data_target_params['filePath']))
        except Exception as e:
            self.logger.error(str(e))
            raise e
        write_start_time = time.time()
        self.logger.info("Writing to HDFS:")
        self.block_folder_write(self.client, temp_fp)
        self.logger.info("Time taken to write to HDFS: " +
                         str(time.time() - write_start_time))
        output_dict["queueTopicName"] = ''
        output_dict['readerInfo'] = None
        output_dict['readerInfoError'] = None
        output_dict["infoKeys"] = None
        self.logger.info("Output:")
        self.logger.info(json.dumps(output_dict, indent=2))
        return output_dict
    except Exception as e:
        self.logger.error(traceback.format_exc())
        self.block_status = "FAILED"
        raise e
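# Both run methods deduplicate join output columns by retagging the first
# occurrence found via list.index(); once the first duplicate is renamed, the
# remaining occurrence keeps its original name because its count drops to one.
# A standalone sketch of that exact behavior (dedupe_columns is a hypothetical
# helper name, not part of the block):
def dedupe_columns(columns, suffix='_1'):
    """Rename the first occurrence of each duplicated column name."""
    columns = list(columns)
    for col in columns:
        if columns.count(col) > 1:
            columns[columns.index(col)] = col + suffix
    return columns


# Example: dedupe_columns(['id', 'name', 'id'])
# -> ['id_1', 'name', 'id']  (the first 'id' is renamed, the second kept)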