def download_file(bucket_name: str, key_prefix: str) -> str:
    """Download an S3 object into the local folder and return its local path.

    The file is written to ``local_folder_path`` (per the original author's
    note, the /tmp folder provided by the Lambda runtime) under the base name
    of ``key_prefix``. Objects larger than ``max_file_size`` are rejected
    before any download is attempted.

    Args:
        bucket_name: Name of the S3 bucket that holds the object.
        key_prefix: Key of the object to download.

    Returns:
        Path of the downloaded file on the local file system.

    Raises:
        FileSizeTooBigException: If the object's size exceeds ``max_file_size``.
        botocore.exceptions.ClientError: If S3 rejects the metadata lookup or
            the download. The error is logged and re-raised unchanged.
    """
    s3 = get_service_resource("s3")
    logger.debug(f"prefix is {key_prefix}")
    local_file_name = os.path.basename(key_prefix)  # file name without the key's folder part
    local_file_path = os.path.join(local_folder_path, local_file_name)  # destination file path
    bucket = s3.Bucket(bucket_name)

    try:
        # Reading content_length makes boto3 load the object's metadata from S3,
        # which can itself raise ClientError (for example when the key does not
        # exist). It lives inside the try so that failure is logged the same
        # way as a failed download.
        size = bucket.Object(key_prefix).content_length
        if size > max_file_size:
            # NOTE(review): the message hard-codes "10 MB" while the limit comes
            # from max_file_size - confirm the two agree.
            err_msg = f"File {key_prefix} in bucket {bucket_name} too big to process. Max file size allowed is 10 MB"
            logger.error(err_msg)
            raise FileSizeTooBigException(err_msg)

        bucket.download_file(key_prefix, local_file_path)
    except botocore.exceptions.ClientError as e:
        logger.error(
            f"When downloading file from bucket: {bucket_name} and prefix: {key_prefix} following error occured: {e}"
        )
        # Use .get so an unexpected response shape cannot raise KeyError here
        # and hide the original S3 error.
        error_code = e.response.get("Error", {}).get("Code")
        if error_code:
            logger.error(
                f"The service returned following error code: {error_code}"
            )
        raise  # bare raise re-raises the active exception unchanged
    return local_file_path
def get_config(dynamodb=None, **scan_kwargs):
    """Return every item of the config table whose "enabled" attribute is True.

    The table name is read from the ``DDB_CONFIG_TABLE_NAME`` environment
    variable. The table is scanned page by page until the response no longer
    carries a ``LastEvaluatedKey``.

    Args:
        dynamodb: Optional DynamoDB service resource. When falsy, one is
            obtained through ``service_helper.get_service_resource``.
        **scan_kwargs: Extra keyword arguments forwarded to ``table.scan``.
            Any ``FilterExpression`` supplied here is overwritten.

    Returns:
        List of the items collected from all scanned pages.
    """
    resource = dynamodb if dynamodb else service_helper.get_service_resource("dynamodb")
    config_table = resource.Table(os.environ["DDB_CONFIG_TABLE_NAME"])

    enabled_configs = []
    next_key = None

    while True:
        # keep only the items whose "enabled" attribute equals True
        scan_kwargs["FilterExpression"] = Attr("enabled").eq(True)
        if next_key:
            # resume the scan where the previous page stopped
            scan_kwargs["ExclusiveStartKey"] = next_key

        page = config_table.scan(**scan_kwargs)
        enabled_configs.extend(page["Items"])

        # a response without "LastEvaluatedKey" means there are no more pages
        next_key = page.get("LastEvaluatedKey", None)
        if next_key is None:
            break

    logger.debug(f"Dumping config_list: {enabled_configs}")
    return enabled_configs
Beispiel #3
0
    def test_get_service_resource(self):
        """Check that the helper returns a DynamoDB resource that can address a table."""
        service_resource = service_helper.get_service_resource("dynamodb")
        self.assertIsNotNone(service_resource)

        table_name = os.environ["DDB_TABLE_NAME"]
        # ddb_setup is defined elsewhere; presumably it creates the table under test.
        ddb_setup(table_name)
        table = service_resource.Table(table_name)
        # Use the TestCase assertion instead of a bare assert: it is not stripped
        # under "python -O" and reports both values when the check fails.
        self.assertEqual(table_name, table.table_name)
def update_query(item, **put_item_kwargs):
    """Store the details of a 'query' (data searched through APIs) in DynamoDB.

    Per the original author's note, this information is used for tracking the
    last time a job was run for an account. The item is written to the table
    named by the ``TARGET_DDB_TABLE`` environment variable.

    Args:
        item: The item to write; passed to ``put_item`` as ``Item``.
        **put_item_kwargs: Extra keyword arguments forwarded to ``put_item``.
    """
    ddb_resource = service_helper.get_service_resource("dynamodb")

    # The item's hash key should be account#url#topic#search_query and its
    # range key should be the timestamp.
    logger.info(f"Inserting item: {item}")
    target_table_name = os.environ["TARGET_DDB_TABLE"]
    ddb_resource.Table(target_table_name).put_item(Item=item, **put_item_kwargs)
def get_query_timestamp(video_id):
    """Fetch the tracker item stored for a video.

    Looks up the key ``{"VIDEO_ID": video_id}`` in the table named by the
    ``TARGET_DDB_TABLE`` environment variable.

    Args:
        video_id: Value of the ``VIDEO_ID`` key to look up.

    Returns:
        The ``Item`` entry of the ``get_item`` response, or ``None`` when the
        response carries no ``Item``.

    Raises:
        ClientError: If the lookup fails; the error message is logged and the
            exception is re-raised.
    """
    tracker_table = get_service_resource("dynamodb").Table(os.environ["TARGET_DDB_TABLE"])
    lookup_key = {"VIDEO_ID": video_id}

    try:
        result = tracker_table.get_item(Key=lookup_key)
    except ClientError as e:
        logger.error(
            f'Error in getting tracker {e.response["Error"]["Message"]}')
        raise e
    else:
        return result.get("Item")
def get_query_tracker(account, url, search_query, topic=None, **item_kwargs):
    """Return the tracker item for a query, or a default when none exists.

    The lookup ID is ``account#url``, followed by ``#topic`` and
    ``#search_query`` when those values are truthy. The table named by the
    ``TARGET_DDB_TABLE`` environment variable is queried for a single item
    with ``ScanIndexForward=False``.

    Args:
        account: First component of the ID.
        url: Second component of the ID.
        search_query: Appended to the ID when truthy.
        topic: Optional; appended to the ID (before ``search_query``) when truthy.
        **item_kwargs: Accepted but not used by this function.

    Returns:
        The first item of the query response. When the response has no items,
        a dict whose ``LAST_PUBLISHED_TIMESTAMP`` is the ISO-format UTC time
        30 days before now.
    """
    ddb_resource = service_helper.get_service_resource("dynamodb")
    tracker_table = ddb_resource.Table(os.environ["TARGET_DDB_TABLE"])

    # account and url are always part of the ID; the optional parts only when set
    id_parts = [account, url]
    id_parts.extend(part for part in (topic, search_query) if part)
    query = "#".join(id_parts)
    logger.info(f"Query to retrieve tracker is {query}")

    response = tracker_table.query(
        KeyConditionExpression=Key("ID").eq(query),
        Limit=1,
        ScanIndexForward=False,
    )
    items = response["Items"]
    if items:
        # Limit=1 means at most one record comes back, so take the first entry
        return items[0]

    logger.warning("Query tracker is empty")
    fallback_timestamp = datetime.now(timezone.utc) - timedelta(days=30)
    return {"LAST_PUBLISHED_TIMESTAMP": fallback_timestamp.isoformat()}