Ejemplo n.º 1
0
    def __init__(self, execution_system, algorithm_instance, algorithm_params):
        """
        Construct a generic Algorithm object.

        :param execution_system: an instance of execution system
        :param algorithm_instance: name of the algorithm instance
        :param algorithm_params: algorithm configuration
        """
        self._execution_system = execution_system
        self._parameters = algorithm_params.get(
            AlgorithmConfigurationHadoop.Keys.PARAMETERS, {})

        # Assemble the parameter-file name from system metadata plus a
        # UTC timestamp so each run gets a distinct file.
        name_fields = {
            "system": self._execution_system.source_system,
            "database": self._execution_system.database,
            "environment": self._execution_system.environment,
            "algorithm": algorithm_instance,
            "time": Util.get_formatted_utc_now(EMRSystem.DATETIME_FORMAT),
            "extension": ConfigService.Extensions.JSON,
        }
        param_file_basename = (
            "{system}-{database}-{environment}.{algorithm}.{time}{extension}"
            .format(**name_fields)
        )

        # derived: where the parameter file lives on the cluster vs. locally
        instance_dir = os.path.join(
            self._execution_system.dir_apps_algorithm,
            algorithm_instance,
        )
        self._params_uri_cluster = os.path.join(instance_dir, param_file_basename)
        self._params_uri_local = os.path.join(
            self._execution_system.config_service.dir_exec,
            param_file_basename,
        )
Ejemplo n.º 2
0
    def execute_hive(self, hql, return_output=False):
        """
        Execute an HQL statement as a Hive step on the EMR cluster.

        The statement is written to a local file and uploaded to S3,
        since it can be longer than the allowed length of an EMR step
        parameter.

        :param hql: HQL statement to run
        :param return_output: if True, wait for the step's output file on
            S3 and return its (gzip-decompressed) content; otherwise None
        :return: step output content when return_output is True, else None
        """
        timestamp = Util.get_formatted_utc_now(EMRSystem.DATETIME_FORMAT)
        step_id_token = EMRSystem._generate_random_id()

        # Stage the HQL statement on S3 via a local temp file.
        hql_filename = "{}.{}{}".format(timestamp, step_id_token,
                                        ConfigService.Extensions.HQL)
        local_hql = os.path.join(self.dir_tmp_local, hql_filename)
        s3_hql = os.path.join(self.dir_tmp_s3, hql_filename)

        with open(local_hql, "w") as out_file:
            out_file.write(hql)
        self.s3_util.upload_object(local_hql, s3_hql)

        # Submit the Hive command as an EMR step and block until it finishes.
        hive_cmd = "hive --silent -f {}".format(s3_hql)
        step_name = "Hive EMR Step: datetime=\"{}\", id=\"{}\"".format(
            timestamp, step_id_token)
        step_id = self.emr_cluster_client.add_step(step_name, hive_cmd)
        self.emr_cluster_client.wait_for_step_completion(step_id)

        if not return_output:
            return None

        # Fetch the step's output once it appears on S3.
        output_file = self.emr_cluster_client.get_step_output_path(step_id)
        logging.info(
            "Waiting for availability of output file: '{}'.".format(
                output_file))
        self.s3_util.wait_for_file_availability(
            output_file,
            self.emr_cluster_client.polling_interval_seconds,
            EMRClusterClient.AWSConstants.S3_FILE_AVAILABILITY_TIMEOUT_SECONDS,
        )
        return self.s3_util.read_gzip_file_content(output_file)