def get_metrics(self,
                    day,
                    aws_conn_id=DEFAULT_AWS_CONN_ID,
                    region_name=None,
                    metrics=None):
        """
        Get the metrics for the given day.

        :param day:             Date to be exported as string in YYYY-MM-DD format or date/datetime instance (default: yesterday)
        :type day:              str, date or datetime
        :param aws_conn_id:     AWS connection id (default: aws_default)
        :type aws_conn_id:      str
        :param region_name:     AWS Region
        :type region_name:      str
        :param metrics:         Metrics
        :type metrics:          list
        """
        aws_hook = AwsHook(aws_conn_id)
        region_name = region_name or aws_hook.get_session().region_name
        if not day or (isinstance(day, str) and day.lower() == 'yesterday'):
            ds = datetime.today() - timedelta(days=1)
        elif isinstance(day, date):  # datetime is a subclass of date
            ds = day
        else:
            ds = dateutil.parser.parse(day)
        self.metrics = metrics.split(',') if isinstance(metrics, str) else metrics
        self.log.info(
            'ds: {ds:%Y-%m-%d} aws_conn_id: {aws_conn_id} region_name: {region_name} metrics: {metrics}'
            .format(ds=ds,
                    aws_conn_id=aws_conn_id,
                    region_name=region_name,
                    metrics=metrics))
        return self.get_metrics_perform_query(ds, self.metrics, aws_hook,
                                              region_name)
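The day and metrics arguments accept several shapes: None or 'yesterday', a date string, or a date/datetime instance, plus either a comma-separated string or a list of metric names. A minimal standalone sketch of the same normalization (the function names here are illustrative, not part of the hook):

from datetime import date, datetime, timedelta

import dateutil.parser


def normalize_day(day=None):
    # None or the literal string 'yesterday' falls back to yesterday's date
    if not day or (isinstance(day, str) and day.lower() == 'yesterday'):
        return datetime.today() - timedelta(days=1)
    if isinstance(day, date):  # datetime is a subclass of date
        return day
    return dateutil.parser.parse(day)


def normalize_metrics(metrics=None):
    # a comma-separated string becomes a list; an existing list passes through
    return metrics.split(',') if isinstance(metrics, str) else metrics


print(normalize_day('2021-06-01'))           # 2021-06-01 00:00:00
print(normalize_metrics('Requests,Errors'))  # ['Requests', 'Errors']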
Example #2
    def test_get_session_returns_a_boto3_session(self):
        hook = AwsHook(aws_conn_id='aws_default')
        session_from_hook = hook.get_session()
        resource_from_session = session_from_hook.resource('dynamodb')
        table = resource_from_session.create_table(
            TableName='test_airflow',
            KeySchema=[
                {
                    'AttributeName': 'id',
                    'KeyType': 'HASH'
                },
            ],
            AttributeDefinitions=[
                {
                    'AttributeName': 'name',
                    'AttributeType': 'S'
                }
            ],
            ProvisionedThroughput={
                'ReadCapacityUnits': 10,
                'WriteCapacityUnits': 10
            })

        table.meta.client.get_waiter('table_exists').wait(
            TableName='test_airflow')

        self.assertEqual(table.item_count, 0)
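The test above creates a table through whatever credentials the aws_default connection resolves to; in practice this kind of test is usually pointed at moto's in-memory DynamoDB mock. A minimal sketch, assuming the moto package (a version providing mock_dynamodb2) is installed and an aws_default connection is configured:

from airflow.contrib.hooks.aws_hook import AwsHook
from moto import mock_dynamodb2


@mock_dynamodb2
def create_test_table():
    # resolve a boto3 session through the hook, then talk to the mocked service
    session = AwsHook(aws_conn_id='aws_default').get_session()
    dynamodb = session.resource('dynamodb', region_name='us-east-1')
    table = dynamodb.create_table(
        TableName='test_airflow',
        # key attributes should also appear in AttributeDefinitions
        KeySchema=[{'AttributeName': 'id', 'KeyType': 'HASH'}],
        AttributeDefinitions=[{'AttributeName': 'id', 'AttributeType': 'S'}],
        ProvisionedThroughput={'ReadCapacityUnits': 10, 'WriteCapacityUnits': 10})
    table.meta.client.get_waiter('table_exists').wait(TableName='test_airflow')
    return table.item_count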
Example #3
    def execute(self, context):
        hook = AwsHook(aws_conn_id=self.aws_credentials_id)
        s3 = hook.get_session(region_name=self.region).resource('s3')
        my_bucket = s3.Bucket(self.bucket)
        logging.info("upload file from %s to %s/%s", self.src, self.bucket, self.dst)
        my_bucket.upload_file(self.src, self.dst)
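This execute reads several attributes from self, but the surrounding operator class is not shown. A minimal hypothetical wrapper (the class name and constructor arguments are assumptions) could look like:

import logging

from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.models import BaseOperator


class UploadFileToS3Operator(BaseOperator):
    # hypothetical operator wrapping the execute() shown above
    def __init__(self, aws_credentials_id, region, bucket, src, dst, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.aws_credentials_id = aws_credentials_id
        self.region = region
        self.bucket = bucket
        self.src = src
        self.dst = dst

    def execute(self, context):
        hook = AwsHook(aws_conn_id=self.aws_credentials_id)
        s3 = hook.get_session(region_name=self.region).resource('s3')
        s3.Bucket(self.bucket).upload_file(self.src, self.dst)
        logging.info("uploaded %s to s3://%s/%s", self.src, self.bucket, self.dst)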
Example #4
    def test_get_session_returns_a_boto3_session(self):
        hook = AwsHook(aws_conn_id='aws_default')
        session_from_hook = hook.get_session()
        resource_from_session = session_from_hook.resource('dynamodb')
        table = resource_from_session.create_table(
            TableName='test_airflow',
            KeySchema=[
                {
                    'AttributeName': 'id',
                    'KeyType': 'HASH'
                },
            ],
            AttributeDefinitions=[
                {
                    'AttributeName': 'name',
                    'AttributeType': 'S'
                }
            ],
            ProvisionedThroughput={
                'ReadCapacityUnits': 10,
                'WriteCapacityUnits': 10
            }
        )

        table.meta.client.get_waiter(
            'table_exists').wait(TableName='test_airflow')

        self.assertEqual(table.item_count, 0)
Example #5
    def execute(self, context):
        """Extract id list from local DB and upload id list to S3
        """
        if self.only_current_date:
            start_date = context["execution_date"] + datetime.timedelta(days=-1)
            end_date = start_date + datetime.timedelta(days=1)
        else:
            start_date, end_date = self.min_max_timestamp()

        logging.info("%s, %s", start_date, end_date)
        n_date = (end_date - start_date).days + 1
        date_fmt = "%Y-%m-%d"
        hook = AwsHook(aws_conn_id=self.aws_credentials_id)
        s3 = hook.get_session(region_name=self.region).resource('s3')
        my_bucket = s3.Bucket(self.bucket)
        table_name = 'themoviedb'
        logging.info("%s date", n_date)
        with tempfile.TemporaryDirectory() as tmp:
            for i in range(n_date):
                dt1 = start_date + timedelta(days=i)
                dt2 = dt1 + timedelta(days=1)
                res = self.extract_json_data(table_name, dt1, dt2)
                logging.info("result length %s", len(res))
                output_file = "{}.json".format(dt1.strftime(date_fmt))
                local_path = tmp + "/" + output_file
                remote_path = self.tmdb_data_path + "/" + output_file
                logging.info("pathes %s to %s", local_path, remote_path)
                try:
                    with open(local_path, "w") as f:
                        logging.info("write %s", local_path)
                        f.writelines([json.dumps(d[0]) + "\n" for d in res])

                    # upload after the file is closed so its contents are flushed to disk
                    logging.info("upload from %s to %s", local_path, remote_path)
                    my_bucket.upload_file(local_path, remote_path)
                except Exception as e:
                    logging.error(e)
                    raise ValueError("", e)

        logging.info("uploaded finished")
Example #6
def get_sagemaker_role_arn(role_name, region_name):
    iam = boto3.client('iam', region_name=region_name)
    response = iam.get_role(RoleName=role_name)
    return response["Role"]["Arn"]


# =============================================================================
# setting up training, tuning and transform configuration
# =============================================================================

# read config file
config = cfg.config

# set configuration for tasks
hook = AwsHook(aws_conn_id='airflow-sagemaker')
region = config["job_level"]["region_name"]
sess = hook.get_session(region_name=region)
role = get_sagemaker_role_arn(config["train_model"]["sagemaker_role"],
                              sess.region_name)
container = get_image_uri(sess.region_name, 'factorization-machines')
hpo_enabled = is_hpo_enabled()

# create estimator
fm_estimator = Estimator(image_name=container,
                         role=role,
                         sagemaker_session=sagemaker.session.Session(sess),
                         **config["train_model"]["estimator_config"])

# train_config specifies SageMaker training configuration
train_config = training_config(estimator=fm_estimator,
                               inputs=config["train_model"]["inputs"])
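The resulting train_config is typically passed to Airflow's SageMaker training operator when the DAG tasks are declared; a minimal sketch, assuming an existing dag object and the contrib operator shipped with Airflow 1.10:

from airflow.contrib.operators.sagemaker_training_operator import SageMakerTrainingOperator

train_model_task = SageMakerTrainingOperator(
    task_id='model_training',
    config=train_config,
    aws_conn_id='airflow-sagemaker',
    wait_for_completion=True,
    dag=dag)  # `dag` is assumed to be defined elsewhere in the DAG file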
Example #7
    def boto3_session(self, aws_creds_name: str) -> 'boto3.session.Session':
        from airflow.contrib.hooks.aws_hook import AwsHook
        aws_hook = AwsHook(aws_creds_name)
        return aws_hook.get_session()
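The returned object is a plain boto3 Session, so a caller inside the same class can build clients or resources from it directly, for example (the connection name is illustrative):

s3_client = self.boto3_session('aws_default').client('s3')
buckets = s3_client.list_buckets()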
Example #8
# must create a SageMaker team role that also has Glue access - must add this instruction in the blog
def get_sagemaker_role_arn(role_name, region_name):
    iam = boto3.client("iam", region_name=region_name)
    response = iam.get_role(RoleName=role_name)
    return response["Role"]["Arn"]


# =============================================================================
# setting up training, model creation and endpoint deployment configuration
# =============================================================================

# set configuration for tasks
hook = AwsHook(aws_conn_id="airflow-sagemaker")
sess = hook.get_session(
    region_name=config.REGION_NAME
)  # how is this session different from the SageMaker session - necessary?
sagemaker_role = get_sagemaker_role_arn(config.SAGEMAKER_ROLE_NAME,
                                        config.REGION_NAME)
container = get_image_uri(sess.region_name, "xgboost")

# initialize training hyperparameters
hyperparameters = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.8",
    "objective": "binary:logistic",
    "num_round": "100"
}
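These hyperparameters would then be attached to an XGBoost estimator built from the container and role resolved above, mirroring the factorization-machines setup in the earlier example. A minimal sketch, assuming the SageMaker Python SDK v1-style API already used in these snippets, with illustrative instance settings and a placeholder output path:

import sagemaker
from sagemaker.estimator import Estimator

xgb_estimator = Estimator(
    image_name=container,                    # XGBoost image URI resolved above
    role=sagemaker_role,
    train_instance_count=1,                  # illustrative instance settings
    train_instance_type='ml.m5.xlarge',
    output_path='s3://your-bucket/xgboost/output',  # placeholder bucket
    sagemaker_session=sagemaker.session.Session(sess))
xgb_estimator.set_hyperparameters(**hyperparameters)

# xgb_estimator can then be handed to training_config(...) just like fm_estimator above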