Example no. 1
    def init(self, input_dict=None, block_params=None, program_arguments=None):
        try:
            # Set up log and metric handlers, echo the input parameters, and report current CPU/memory usage.
            self.init_log_and_metric_handlers(block_params)
            self.logger.info('Read stream block')
            self.logger.info('Input params:')
            self.logger.info(json.dumps(input_dict, sort_keys=True, indent=2))
            sdk.resource_monitor_handler(block_params).post_cpu_mem_usage()
            return
        except Exception as e:
            self.logger.error(traceback.format_exc())
            self.block_status = "FAILED"
            raise e
Example no. 2
    def run(self, input_dict=None, block_params=None, program_arguments=None):
        try:
            t1 = time.time()
            output_dict = dict()

            kafka_handler = sdk.kafka_handler(None)
            kafka_api_instance = kafka_handler.get_api_instance()
            channels = {}

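            # Repeat the read cycle five times; each pass subscribes a fresh consumer group to every input topic.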
            for count in range(5):
                self.logger.info("For Count: " + str(count))
                for key, data in input_dict.items():
                    topic = data['queueTopicName']
                    if not topic:
                        continue
                    print('queueTopicName: ' + topic)
                    consumer_pool = {
                        "count": 1000,
                        "bufferSizeInMB": 1,
                        "groupId": str(uuid.uuid4()),
                        "registerId": "",
                        "topicsListToSubscribe": [topic]
                    }

                    try:
                        consumer_pool_res = kafka_api_instance.create_consumer_list_using_post(
                            consumer_pool)
                        channels[key] = consumer_pool_res.result
                    except Exception as e:
                        self.logger.error(
                            "Error Trying To Create a Consumer Of Topic:" +
                            str(topic))
                        self.block_status = "FAILED"
                        raise e

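                # Shared parameters handed to each ReadRecords thread.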
                optional_param = {}
                optional_param["api_instance"] = kafka_api_instance
                optional_param["channels"] = channels

                self.field_list = {}

                array_of_threads = []
                for key, data in input_dict.items():
                    topic = data['queueTopicName']
                    if not topic:
                        continue
                    req = {"topicName": topic}
                    try:
                        schema = kafka_api_instance.get_topic_meta_using_post(
                            req)
                        schema = json.loads(
                            json.loads(schema.result)["schema"])
                        optional_param['schema'] = schema
                        self.logger.debug("Schema Received")
                    except Exception as e:
                        self.logger.error("Error Fetching Schema")
                        self.logger.error(str(e))
                        self.logger.error(traceback.format_exc())
                        self.block_status = "FAILED"
                        raise e

                    col_names = schema.keys()
                    parsed_schema_dict = {}

                    for name in col_names:
                        values = schema.get(name)
                        parsed_schema_dict[name] = values['type']

                    self.logger.info("schema")
                    self.logger.info(schema)
                    self.logger.info(parsed_schema_dict)

                    self.list_of_struct_fields = []
                    self.field_list[key] = []

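                    # Recreate a clean local staging directory for this topic's records.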
                    fpath = '/bigbrain/' + str(key)
                    if os.path.exists(fpath):
                        rmtree(fpath)
                    os.makedirs(fpath, exist_ok=True)
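                    # Read this topic on a worker thread; calling join() immediately after start() processes the topics sequentially.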
                    t = self.ReadRecords(topic, key, input_dict, block_params,
                                         optional_param, self.field_list)
                    t.start()
                    t.join()
                    # array_of_threads.append(t)

                # for t in array_of_threads:
                #     t.join()

                print('All topics read done')

            self.logger.info("Output:")
            self.logger.info(json.dumps(output_dict, indent=2))

            output_dict["queueTopicName"] = ""
            output_dict['readerInfo'] = None
            output_dict['readerInfoError'] = None
            output_dict["infoKeys"] = None

            self.logger.info("Output:")
            self.logger.info(json.dumps(output_dict, indent=2))

            return output_dict

        except Exception as e:
            self.logger.error(traceback.format_exc())
            self.block_status = "FAILED"
            raise e
Example no. 3
    def run(self, input_dict=None, block_params=None, program_arguments=None):
        try:
            t1 = time.time()
            output_dict = dict()

            configs = input_dict["Config"]
            # test_args = {'spark.app.name': 'spark_app_test', 'spark.shuffle.service.enabled': 'true', 'spark.dynamicAllocation.minExecutors': '1', 'spark.dynamicAllocation.enabled': 'true'}

            queue_dict = {}

            queue_dict['left_df'] = input_dict['leftData']['queueTopicName']
            queue_dict['right_df'] = input_dict['rightData']['queueTopicName']

            kafka_handler = sdk.kafka_handler(None)
            kafka_api_instance = kafka_handler.get_api_instance()
            channels = {}

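            # Create one Kafka consumer (with a fresh group id) for each side of the join.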
            for key, topic in queue_dict.items():
                consumer_pool = {
                    "count": 1,
                    "groupId": str(uuid.uuid4()),
                    "registerId": "",
                    "topicsListToSubscribe": [topic]
                }

                try:
                    consumer_pool_res = kafka_api_instance.create_consumer_list_using_post(
                        consumer_pool)
                    channels[key] = consumer_pool_res.result
                except Exception as e:
                    self.logger.error(
                        "Error Trying To Create a Consumer Of Topic:" +
                        str(topic))
                    self.block_status = "FAILED"
                    raise e

            optional_param = {}
            optional_param['queue_dict'] = queue_dict
            optional_param["api_instance"] = kafka_api_instance
            optional_param["channels"] = channels

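            # Start the Spark session used to stage and join the topic data.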
            self.spark = SparkConfCustom().get_spark_session()
            self.spark.sparkContext.setLogLevel('ERROR')

            self.spark_schema = {}
            self.field_list = {}
            print('waiting')
            # time.sleep(200)

            array_of_threads = []
            for key, topic in queue_dict.items():
                req = {"topicName": topic}
                try:
                    schema = kafka_api_instance.get_topic_meta_using_post(req)
                    schema = json.loads(json.loads(schema.result)["schema"])
                    optional_param['schema'] = schema
                    self.logger.debug("Schema Received")
                except Exception as e:
                    self.logger.error("Error Fetching Schema")
                    self.logger.error(str(e))
                    self.logger.error(traceback.format_exc())
                    self.block_status = "FAILED"
                    raise e

                col_names = schema.keys()
                parsed_schema_dict = {}

                for name in col_names:
                    values = schema.get(name)
                    parsed_schema_dict[name] = values['type']

                self.logger.info("schema")
                self.logger.info(schema)
                self.logger.info(parsed_schema_dict)

                self.list_of_struct_fields = []
                self.field_list[key] = []

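                # Map the reported column types to Spark SQL types; unrecognized types fall back to StringType.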
                for name in parsed_schema_dict.keys():
                    if parsed_schema_dict[name] == 'FloatType()':
                        self.field_list[key].append(('float'))
                        self.list_of_struct_fields.append(
                            StructField(name, FloatType(), True))
                    elif parsed_schema_dict[name] == 'IntegerType()':
                        self.field_list[key].append('int')
                        self.list_of_struct_fields.append(
                            StructField(name, IntegerType(), True))
                    elif parsed_schema_dict[name] == 'DoubleType()':
                        self.field_list[key].append('float')
                        self.list_of_struct_fields.append(
                            StructField(name, DoubleType(), True))
                    else:
                        self.field_list[key].append('string')
                        self.list_of_struct_fields.append(
                            StructField(name, StringType(), True))

                self.spark_schema[key] = StructType(self.list_of_struct_fields)
                fpath = '/bigbrain/' + str(key)
                if os.path.exists(fpath):
                    rmtree(fpath)
                os.makedirs(fpath, exist_ok=True)
                t = self.ReadRecords(self.spark, topic, key,
                                     self.spark_schema[key], input_dict,
                                     block_params, optional_param,
                                     self.field_list)
                t.start()
                array_of_threads.append(t)

            for t in array_of_threads:
                t.join()

            print('Both topics read done')

            # self.stream_block(input_dict=input_dict, block_params=block_params, optional_arg=optional_param)

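            # Read back the data each reader thread staged under /bigbrain/<key> as Parquet.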
            self.left_df = self.spark.read.parquet('/bigbrain/left_df')
            print(self.left_df.count())
            self.right_df = self.spark.read.parquet('/bigbrain/right_df')
            print(self.right_df.count())
            # Join the two sides on the configured key columns, using the configured join type.
            self.resultant_join_df = self.left_df.join(
                self.right_df,
                self.left_df[configs['unique_key_left']] ==
                self.right_df[configs['unique_key_right']],
                how=configs['join_type'])

            print(self.left_df.rdd.getNumPartitions())
            print(self.right_df.rdd.getNumPartitions())

            # De-duplicate column names after the join: the first occurrence of any
            # duplicated name gets a 'bbl_' prefix so toDF() receives unique columns.
            new_column_name_list = self.resultant_join_df.columns
            for col in new_column_name_list:
                if new_column_name_list.count(col) > 1:
                    idx = new_column_name_list.index(col)
                    new_column_name_list[idx] = 'bbl_' + col

            print(self.resultant_join_df.columns)
            self.resultant_join_df = self.resultant_join_df.toDF(
                *new_column_name_list)
            print(self.resultant_join_df.columns)

            self.resultant_join_df.write.csv('/tmp/test')
            print(self.resultant_join_df.rdd.getNumPartitions())

            self.logger.info("*****************************")
            self.logger.info("Join Completed")

            # create topic for the result to be stored
            kafka_handler = sdk.kafka_handler(None)
            api_instance = kafka_handler.get_api_instance()

            # preprocess and detect schema, add meta
            block_start_time = time.time()

            schema_new = self.get_df_schema(self.resultant_join_df)

            operationalParams = {}
            operationalParams["api_instance"] = api_instance
            operationalParams["block_start_time"] = block_start_time
            operationalParams["dataframe_name"] = "resultant_join_df"
            operationalParams["data_frame"] = self.resultant_join_df
            operationalParams["schema"] = schema_new

            try:
                resultant_topic = str(uuid.uuid4())

                topic = {
                    "name": resultant_topic,
                    "identifiers": block_params,
                    "displayName": 'resultantQueueTopic'
                }
                topic_res = api_instance.create_topic_using_post(topic)

                topic_name = {
                    "name": resultant_topic,
                    "metaData": {
                        "schema": json.dumps(schema_new)
                    }
                }
                api_instance.update_topic_meta_using_post(topic_name)

                self.logger.info(f"Schema Added For Topic {resultant_topic}")

            except Exception as e:
                self.logger.error("Error creating resultant topic " +
                                  str(resultant_topic))
                raise e

            operationalParams["topic_id"] = resultant_topic

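            # Either stream the joined result to the newly created topic or persist it to HDFS, depending on the streamOutput flag.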
            if input_dict['Config']['streamOutput']:
                self.stream_block2(input_dict=input_dict,
                                   optional_arg=operationalParams,
                                   block_params=block_params)
            else:
                self.data_target_params = input_dict["DataTarget"]
                try:
                    self.validate_target_params()
                except Exception as e:
                    self.logger.error(str(e))
                    raise e

                try:
                    self.client = self.validate_hdfs_connection(
                        input_dict=input_dict, block_params=block_params)
                    self.data_target_params[
                        'fileWithFullPath'] = self.data_target_params[
                            'filePath']
                    if not self.data_target_params['overwrite']:
                        exists = self.file_exits(hdfs_connection=self.client)
                        if exists:
                            raise FileExistsError(
                                "File Already Exists: " +
                                str(self.data_target_params['filePath']))
                except Exception as e:
                    self.logger.error(str(e))
                    raise e

                self.block_write(self.client, '/tmp/test')

            self.logger.info("Output:")
            self.logger.info(json.dumps(output_dict, indent=2))

            output_dict["queueTopicName"] = resultant_topic
            output_dict['readerInfo'] = None
            output_dict['readerInfoError'] = None
            output_dict["infoKeys"] = None

            self.logger.info("Output:")
            self.logger.info(json.dumps(output_dict, indent=2))

            return output_dict

        except Exception as e:
            self.logger.error(traceback.format_exc())
            self.block_status = "FAILED"
            raise e
Example no. 4
    def init_log_and_metric_handlers(self, block_params=None):
        # Attach the SDK block log handler and metrics API, and tag the metrics with the graph app id and block identifiers.
        self.logger = sdk.block_log_handler(block_params)
        self.metrics_handler = sdk.metrics_api()
        self.metrics["appId"] = MetricConfig.GRAPH_APP_ID
        self.metrics["keys"] = block_params
        return
Example no. 5
    def run(self, input_dict=None, block_params=None, program_arguments=None):
        try:
            t1 = time.time()
            output_dict = dict()

            configs = input_dict["Config"]
            # test_args = {'spark.app.name': 'spark_app_test', 'spark.shuffle.service.enabled': 'true', 'spark.dynamicAllocation.minExecutors': '1', 'spark.dynamicAllocation.enabled': 'true'}

            queue_dict = {}

            queue_dict['left_df'] = input_dict['leftData']['queueTopicName']
            queue_dict['right_df'] = input_dict['rightData']['queueTopicName']

            kafka_handler = sdk.kafka_handler(None)
            kafka_api_instance = kafka_handler.get_api_instance()
            channels = {}

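            # Create one Kafka consumer (with a fresh group id) for each side of the join.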
            for key, topic in queue_dict.items():
                consumer_pool = {
                    "count": 1,
                    "groupId": str(uuid.uuid4()),
                    "registerId": "",
                    "topicsListToSubscribe": [topic]
                }

                try:
                    consumer_pool_res = kafka_api_instance.create_consumer_list_using_post(
                        consumer_pool)
                    channels[key] = consumer_pool_res.result
                except Exception as e:
                    self.logger.error(
                        "Error Trying To Create a Consumer Of Topic:" +
                        str(topic))
                    self.block_status = "FAILED"
                    raise e

            optional_param = {}
            optional_param['queue_dict'] = queue_dict
            optional_param["api_instance"] = kafka_api_instance
            optional_param["channels"] = channels

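            # Start the Spark session used to stage and join the topic data.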
            self.spark = SparkConfCustom().get_spark_session()
            self.spark.sparkContext.setLogLevel('ERROR')

            self.spark_schema = {}
            self.field_list = {}
            print('waiting')
            # time.sleep(200)

            array_of_threads = []
            for key, topic in queue_dict.items():
                req = {"topicName": topic}
                try:
                    schema = kafka_api_instance.get_topic_meta_using_post(req)
                    schema = json.loads(json.loads(schema.result)["schema"])
                    optional_param['schema'] = schema
                    self.logger.debug("Schema Received")
                except Exception as e:
                    self.logger.error("Error Fetching Schema")
                    self.logger.error(str(e))
                    self.logger.error(traceback.format_exc())
                    self.block_status = "FAILED"
                    raise e

                col_names = schema.keys()
                parsed_schema_dict = {}

                for name in col_names:
                    values = schema.get(name)
                    parsed_schema_dict[name] = values['type']

                self.logger.info("schema")
                self.logger.info(schema)
                self.logger.info(parsed_schema_dict)

                self.list_of_struct_fields = []
                self.field_list[key] = []

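                # Map the reported column types to Spark SQL types; unrecognized types fall back to StringType.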
                for name in parsed_schema_dict.keys():
                    if parsed_schema_dict[name] == 'FloatType()':
                        self.field_list[key].append(('float'))
                        self.list_of_struct_fields.append(
                            StructField(name, FloatType(), True))
                    elif parsed_schema_dict[name] == 'IntegerType()':
                        self.field_list[key].append('int')
                        self.list_of_struct_fields.append(
                            StructField(name, IntegerType(), True))
                    elif parsed_schema_dict[name] == 'DoubleType()':
                        self.field_list[key].append('float')
                        self.list_of_struct_fields.append(
                            StructField(name, DoubleType(), True))
                    else:
                        self.field_list[key].append('string')
                        self.list_of_struct_fields.append(
                            StructField(name, StringType(), True))

                self.spark_schema[key] = StructType(self.list_of_struct_fields)
                fpath = '/bigbrain/' + str(key)
                if os.path.exists(fpath):
                    rmtree(fpath)
                os.makedirs(fpath, exist_ok=True)
                t = self.ReadRecords(self.spark, topic, key,
                                     self.spark_schema[key], input_dict,
                                     block_params, optional_param,
                                     self.field_list)
                t.start()
                array_of_threads.append(t)

            for t in array_of_threads:
                t.join()

            print('Both topics read done')

            # self.stream_block(input_dict=input_dict, block_params=block_params, optional_arg=optional_param)

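            # Read back the data each reader thread staged under /bigbrain/<key> as Parquet.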
            self.left_df = self.spark.read.parquet('/bigbrain/left_df')
            print(self.left_df.count())
            self.right_df = self.spark.read.parquet('/bigbrain/right_df')
            print(self.right_df.count())
            # Join the two sides on the configured key columns, using the configured join type.
            self.resultant_join_df = self.left_df.join(
                self.right_df,
                self.left_df[configs['unique_key_left']] ==
                self.right_df[configs['unique_key_right']],
                how=configs['join_type'])

            print(self.left_df.rdd.getNumPartitions())
            print(self.right_df.rdd.getNumPartitions())

            # De-duplicate column names after the join: the first occurrence of any
            # duplicated name gets a '_1' suffix so toDF() receives unique columns.
            new_column_name_list = self.resultant_join_df.columns
            for col in new_column_name_list:
                if new_column_name_list.count(col) > 1:
                    idx = new_column_name_list.index(col)
                    new_column_name_list[idx] = col + '_1'

            print(self.resultant_join_df.columns)
            self.resultant_join_df = self.resultant_join_df.toDF(
                *new_column_name_list)
            print(self.resultant_join_df.columns)

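            # Write the joined frame to a temporary local CSV folder and time how long the join output takes to materialize.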
            temp_fp = '/bigbrain/' + str(t1) + '.csv'
            print(temp_fp)
            # os.makedirs(temp_fp, exist_ok=True)
            temp_join_time_st = time.time()
            self.resultant_join_df.write.mode("overwrite").option(
                "header", "true").csv(temp_fp)
            temp_join_time_end = time.time()
            print('Time for Join: ' +
                  str(temp_join_time_end - temp_join_time_st) +
                  ', File Partitions' +
                  str(self.resultant_join_df.rdd.getNumPartitions()))

            self.logger.info("*****************************")
            self.logger.info("Join Completed")
            # self.logger.info("Count: " + str(self.resultant_join_df.count()))

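            # Validate the HDFS target; if the file already exists, delete it when overwrite is enabled, otherwise fail.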
            self.data_target_params = input_dict["DataTarget"]
            try:
                self.validate_target_params()
            except Exception as e:
                self.logger.error(str(e))
                raise e

            try:
                self.client = self.validate_hdfs_connection(
                    input_dict=input_dict, block_params=block_params)
                self.data_target_params[
                    'fileWithFullPath'] = self.data_target_params['filePath']
                exists = self.file_exits(hdfs_connection=self.client)
                if exists:
                    if self.data_target_params['overwrite']:
                        # remove file
                        self.delete_file(hdfs_connection=self.client)
                        self.append = False
                    else:
                        raise FileExistsError(
                            "File Already Exists: " +
                            str(self.data_target_params['filePath']))
            except Exception as e:
                self.logger.error(str(e))
                raise e

            write_start_time = time.time()
            self.logger.info("Writing to HDFS:")
            self.block_folder_write(self.client, temp_fp)
            # if os.path.isdir(temp_fp):
            #     self.logger.info("dir")
            #     for filename in os.listdir(temp_fp):
            #         print(filename)
            #         if filename.endswith(".csv"):
            #             csv_path = temp_fp + '/' + filename
            #             print(csv_path)
            #             self.block_line_write(self.client, csv_path)
            # else:
            #     self.block_line_write(self.client, temp_fp)

            print('Time for Join: ' +
                  str(temp_join_time_end - temp_join_time_st))

            self.logger.info("Time taken to write to HDFS: " +
                             str(time.time() - write_start_time))

            self.logger.info("Output:")
            self.logger.info(json.dumps(output_dict, indent=2))

            output_dict["queueTopicName"] = ''
            output_dict['readerInfo'] = None
            output_dict['readerInfoError'] = None
            output_dict["infoKeys"] = None

            self.logger.info("Output:")
            self.logger.info(json.dumps(output_dict, indent=2))

            return output_dict

        except Exception as e:
            self.logger.error(traceback.format_exc())
            self.block_status = "FAILED"
            raise e