Example #1
import os

import boto3
from boto3.session import Session


def download_pub_key_from_s3(bucket_location, bucket_key, path):
    s3client = Session().client('s3')
    response = s3client.list_objects(Bucket=bucket_location, Prefix=bucket_key)
    if 'Contents' in response:  # 'Contents' is absent when there is no matching key
        keys = [content['Key'] for content in response['Contents']]
        for key in keys:
            base, ext = os.path.splitext(key)
            if ext == '.pub':
                dirname, filename = os.path.split(key)
                bucket = boto3.resource('s3').Bucket(bucket_location)
                bucket.download_file(key, path + filename)
                return True
    return False
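A minimal way to exercise this helper is sketched below; the bucket name, prefix, and destination directory are hypothetical placeholders. Note that `path` is concatenated directly with the file name, so it should end with a trailing slash.

# Hypothetical call; 'my-keys-bucket', 'ssh-keys/' and '/tmp/' are placeholders.
if download_pub_key_from_s3('my-keys-bucket', 'ssh-keys/', '/tmp/'):
    print('downloaded the first .pub object to /tmp/')
else:
    print('no .pub object found under the given prefix')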
Example #2
class S3Manager:
    def __init__(self, bucket_name, location):
        self.conn = boto3.resource('s3')
        self.bucket_name = bucket_name
        self.s3client = Session().client('s3')
        self.location = location

    def save(self, data, file_name):
        obj = self.conn.Object(self.bucket_name, file_name)
        obj.put(Body=json.dumps(data, ensure_ascii=False))

    def download(self, file_key):
        obj = self.conn.Object(self.bucket_name, file_key)
        response = obj.get()["Body"].read()
        buf = BytesIO(response)
        gzip_f = gzip.GzipFile(fileobj=buf)
        body = gzip_f.read().decode('utf-8')
        return body

    def get_created_at(self, file_key):
        obj = self.conn.Object(self.bucket_name, file_key)
        return obj.last_modified

    def get_all_file_names(self, prefix='', keys=None, marker=''):
        # Avoid a shared mutable default: the accumulated list would persist across calls otherwise.
        keys = [] if keys is None else keys
        response = self.s3client.list_objects(Bucket=self.bucket_name,
                                              Prefix=prefix,
                                              Marker=marker)
        if 'Contents' in response:
            keys.extend([content['Key'] for content in response['Contents']])
            if response.get('IsTruncated'):  # 'IsTruncated' is always present; check its value
                return self.get_all_file_names(prefix=prefix,
                                               keys=keys,
                                               marker=keys[-1])
        return keys

    def upload(self, remote_file_key, local_file_name, is_public=None):
        if is_public:
            self.s3client.upload_file(local_file_name,
                                      self.bucket_name,
                                      remote_file_key,
                                      ExtraArgs={
                                          'ContentType': "image/png",
                                          'ACL': 'public-read'
                                      })
            return "Ok"

        self.s3client.upload_file(local_file_name, self.bucket_name,
                                  remote_file_key)
        return "Ok"
Example #3
class Thunderbolt():
    def __init__(self, file_path: str, task_filters=''):
        self.s3client = None
        self.file_path = file_path
        self.task_filters = [task_filters] if type(task_filters) == str else task_filters
        self.bucket_name = file_path.replace('s3://', '').split('/')[0] if file_path.startswith('s3://') else None
        self.prefix = '/'.join(file_path.replace('s3://', '').split('/')[1:]) if file_path.startswith('s3://') else None
        self.resource = boto3.resource('s3') if file_path.startswith('s3://') else None
        self.s3client = Session().client('s3') if file_path.startswith('s3://') else None
        self.tasks = self._get_tasks_from_s3() if file_path.startswith('s3://') else self._get_tasks()

    def _get_tasks(self):
        """Get task parameters."""
        files = {str(path) for path in Path(os.path.join(self.file_path, 'log/task_log')).rglob('*')}
        tasks = {}
        for i, x in enumerate(tqdm(files)):
            n = x.split('/')[-1]
            if self.task_filters and not [x for x in self.task_filters if x in n]:
                continue
            n = n.split('_')
            modified = datetime.fromtimestamp(os.stat(x).st_mtime)
            with open(x, 'rb') as f:
                task_log = pickle.load(f)
            with open(x.replace('task_log', 'task_params'), 'rb') as f:
                task_params = pickle.load(f)
            tasks[i] = {
                'task_name': '_'.join(n[:-1]),
                'task_params': task_params,
                'task_log': task_log,
                'last_modified': modified,
                'task_hash': n[-1].split('.')[0],
            }
        return tasks

    def _get_tasks_from_s3(self):
        """Get task parameters from S3."""
        files = self._get_s3_keys([], '')
        tasks = {}
        for i, x in enumerate(tqdm(files)):
            n = x['Key'].split('/')[-1]
            if self.task_filters and not [x for x in self.task_filters if x in n]:
                continue
            n = n.split('_')
            tasks[i] = {
                'task_name': '_'.join(n[:-1]),
                'task_params': pickle.loads(self.resource.Object(self.bucket_name, x['Key'].replace('task_log', 'task_params')).get()['Body'].read()),
                'task_log': pickle.loads(self.resource.Object(self.bucket_name, x['Key']).get()['Body'].read()),
                'last_modified': x['LastModified'],
                'task_hash': n[-1].split('.')[0]
            }
        return tasks

    def _get_s3_keys(self, keys: list = [], marker: str = '') -> list:
        """Recursively get Key from S3."""
        response = self.s3client.list_objects(Bucket=self.bucket_name, Prefix=os.path.join(self.prefix, 'log/task_log'), Marker=marker)
        if 'Contents' in response:
            keys.extend([{'Key': content['Key'], 'LastModified': content['LastModified']} for content in response['Contents']])
            if response.get('IsTruncated'):
                return self._get_s3_keys(keys=keys, marker=keys[-1]['Key'])
        return keys

    def get_task_df(self, all_data: bool = False) -> pd.DataFrame:
        """Get task's pandas data frame."""
        df = pd.DataFrame([{
            'task_id': k,
            'task_name': v['task_name'],
            'last_modified': v['last_modified'],
            'task_params': v['task_params'],
            'task_hash': v['task_hash'],
            'task_log': v['task_log']
        } for k, v in self.tasks.items()])
        if all_data:
            return df
        return df[['task_id', 'task_name', 'last_modified', 'task_params']]

    def load(self, task_id: int) -> list:
        """Load File."""
        return [gokart.target.make_target(file_path=x).load() for x in self.tasks[task_id]['task_log']['file_path']]
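Typical usage of this class might look like the sketch below; the S3 workspace path, filter string, and task ID are placeholders, and the methods are the ones defined above.

# Hypothetical usage; the S3 workspace path and filter are placeholders.
tb = Thunderbolt('s3://my-bucket/workspace', task_filters='TrainModel')
df = tb.get_task_df()         # task_id, task_name, last_modified, task_params
outputs = tb.load(task_id=0)  # list of objects loaded via gokart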
Example #4
    y = str(i.year)
    m = str(i.month)
    d = str(i.day)
    if len(m) != 2:
        m = '0' + m
    if len(d) != 2:
        d = '0' + d

    dates.append(y + m + d)

s3client = Session().client('s3')
rdata = []

for date in dates:

    response = s3client.list_objects(Bucket='ld-rawdata-2',
                                     Prefix='TR_JISSEKI/' + date + 'XXXXXX/')

    if 'Contents' in response:  # the response contains no 'Contents' when there is no matching key
        keys = [content['Key'] for content in response['Contents']]
        key = keys[-1]  # the 23:00 data (last object of the day)

    records = b""

    sql = "SELECT s._1,s._2,s._4,s._5,s._9,s._12,s._13 FROM s3Object as s where (s._2=\'0001\' or s._2=\'0048\' or s._2=\'0052\') and (s._4=\'9263126700000\' or s._4=\'9261821300000\' or s._4=\'9262702400000\' or s._4=\'9285130600000\' or s._4=\'9264112900000\' or s._4=\'9264102000000\' or s._4=\'9264103700000\' or s._4=\'9261106100000\' or s._4=\'9263621700000\' or s._4=\'9262543300000\' or s._4=\'9263125000000\' or s._4=\'9265625300000\' or s._4=\'9264904000000\' or s._4=\'9261151100000\' or s._4=\'9266505700000\' or s._4=\'9285106100000\' or s._4=\'9265661100000\' or s._4=\'9266339800000\' or s._4=\'9262102200000\' or s._4=\'9261808400000\' or s._4=\'9285108500000\' or s._4=\'9264545500000\' or s._4=\'9262201200000\' or s._4=\'9286901100000\' or s._4=\'9261105400000\' or s._4=\'9264513400000\' or s._4=\'9265603100000\' or s._4=\'9262902800000\' or s._4=\'9263620000000\' or s._4=\'9264514100000\')"

    r = s3client.select_object_content(
        Bucket="ld-rawdata-2",
        Key=key,
        ExpressionType='SQL',
        Expression=sql,
        InputSerialization={'CSV': {
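The call above is cut off mid-argument. A complete select_object_content invocation also supplies the rest of InputSerialization plus an OutputSerialization, and the matching rows arrive as 'Records' events on the response's 'Payload' stream. The sketch below assumes an uncompressed, header-less CSV source; those serialization options are assumptions, not taken from the original snippet.

r = s3client.select_object_content(
    Bucket='ld-rawdata-2',
    Key=key,
    ExpressionType='SQL',
    Expression=sql,
    InputSerialization={'CSV': {'FileHeaderInfo': 'NONE'}},
    OutputSerialization={'CSV': {}})
for event in r['Payload']:
    if 'Records' in event:
        records += event['Records']['Payload']  # raw CSV bytes of the matching rows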
Example #5
class Thunderbolt():
    def __init__(self,
                 workspace_directory: str = '',
                 task_filters: Union[str, List[str]] = '',
                 use_tqdm: bool = False,
                 tmp_path: str = './tmp'):
        """Thunderbolt init.

        Set the path to the directory or S3.

        Args:
            workspace_directory: Gokart's TASK_WORKSPACE_DIRECTORY. If empty, $TASK_WORKSPACE_DIRECTORY from the environment is used.
            task_filters: Filter for task names.
                Only tasks whose names contain the specified string are loaded; a list of strings may also be given.
            use_tqdm: Flag for using tqdm. If False, the progress bar is not displayed (default=False).
            tmp_path: Temporary directory used when an external load function is needed.
        """
        self.tqdm_disable = not use_tqdm
        self.tmp_path = tmp_path
        self.s3client = None
        if not workspace_directory:
            env = os.getenv('TASK_WORKSPACE_DIRECTORY')
            workspace_directory = env if env else ''
        self.workspace_directory = workspace_directory if workspace_directory.startswith(
            's3://') else os.path.abspath(workspace_directory)
        self.task_filters = [task_filters
                             ] if type(task_filters) == str else task_filters
        self.bucket_name = workspace_directory.replace('s3://', '').split(
            '/')[0] if workspace_directory.startswith('s3://') else None
        self.prefix = '/'.join(
            workspace_directory.replace('s3://', '').split('/')
            [1:]) if workspace_directory.startswith('s3://') else None
        self.resource = boto3.resource('s3') if workspace_directory.startswith(
            's3://') else None
        self.s3client = Session().client(
            's3') if workspace_directory.startswith('s3://') else None
        self.tasks = self._get_tasks_from_s3(
        ) if workspace_directory.startswith('s3://') else self._get_tasks()

    def _get_tasks(self):
        """Load all task_log from workspace_directory."""
        files = {
            str(path)
            for path in Path(
                os.path.join(self.workspace_directory, 'log/task_log')).rglob(
                    '*')
        }
        tasks = {}
        for i, x in enumerate(tqdm(files, disable=self.tqdm_disable)):
            n = x.split('/')[-1]
            if self.task_filters and not [
                    x for x in self.task_filters if x in n
            ]:
                continue
            n = n.split('_')
            modified = datetime.fromtimestamp(os.stat(x).st_mtime)
            with open(x, 'rb') as f:
                task_log = pickle.load(f)
            with open(x.replace('task_log', 'task_params'), 'rb') as f:
                task_params = pickle.load(f)
            tasks[i] = {
                'task_name': '_'.join(n[:-1]),
                'task_params': task_params,
                'task_log': task_log,
                'last_modified': modified,
                'task_hash': n[-1].split('.')[0],
            }
        return tasks

    def _get_tasks_from_s3(self):
        """Load all task_log from S3"""
        files = self._get_s3_keys([], '')
        tasks = {}
        for i, x in enumerate(tqdm(files, disable=self.tqdm_disable)):
            n = x['Key'].split('/')[-1]
            if self.task_filters and not [
                    x for x in self.task_filters if x in n
            ]:
                continue
            n = n.split('_')
            tasks[i] = {
                'task_name':
                '_'.join(n[:-1]),
                'task_params':
                pickle.loads(
                    self.resource.Object(
                        self.bucket_name,
                        x['Key'].replace('task_log',
                                         'task_params')).get()['Body'].read()),
                'task_log':
                pickle.loads(
                    self.resource.Object(self.bucket_name,
                                         x['Key']).get()['Body'].read()),
                'last_modified':
                x['LastModified'],
                'task_hash':
                n[-1].split('.')[0]
            }
        return tasks

    def _get_s3_keys(self, keys=[], marker=''):
        """Recursively get Key from S3.

        Uses the boto3 S3 client API.
        Reference: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html

        Args:
            keys: Accumulator of object keys collected so far; grows with each recursive call.
            marker: S3 pagination marker. The recursion ends when the listing is no longer truncated.

        Returns:
            One dict per S3 object, e.g. [{'Key': ..., 'LastModified': ...}, ...].
        """
        response = self.s3client.list_objects(Bucket=self.bucket_name,
                                              Prefix=os.path.join(
                                                  self.prefix, 'log/task_log'),
                                              Marker=marker)
        if 'Contents' in response:
            keys.extend([{
                'Key': content['Key'],
                'LastModified': content['LastModified']
            } for content in response['Contents']])
            if response.get('IsTruncated'):
                return self._get_s3_keys(keys=keys, marker=keys[-1]['Key'])
        return keys

    def get_task_df(self, all_data: bool = False) -> pd.DataFrame:
        """Get task's pandas DataFrame.

        Args:
            all_data: If True, add `task unique hash` and `task log data` to DataFrame.

        Returns:
            A pandas.DataFrame with all gokart task information.
        """
        df = pd.DataFrame([{
            'task_id': k,
            'task_name': v['task_name'],
            'last_modified': v['last_modified'],
            'task_params': v['task_params'],
            'task_hash': v['task_hash'],
            'task_log': v['task_log']
        } for k, v in self.tasks.items()])
        if all_data:
            return df
        return df[['task_id', 'task_name', 'last_modified', 'task_params']]

    def load(self, task_id: int) -> Union[list, Any]:
        """Load File using gokart.load.

        Args:
            task_id: The ID assigned by Thunderbolt; the corresponding data is read into memory.
                Check available IDs with Thunderbolt.get_task_df.

        Returns:
            A single object or a list of objects, since gokart may split the data into multiple files when dumping.
        """
        data = [
            self._target_load(x)
            for x in self.tasks[task_id]['task_log']['file_path']
        ]
        data = data[0] if len(data) == 1 else data
        return data

    def _target_load(self, file_name: str) -> Any:
        """Select gokart load_function and load model.

        Args:
            file_name: Path to gokart's output file.

        Returns:
            Loaded data.
        """
        file_path = os.path.join(os.path.dirname(self.workspace_directory),
                                 file_name)
        if file_path.endswith('.zip'):
            tmp_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                    os.path.abspath(self.tmp_path))
            zip_client = gokart.zip_client_util.make_zip_client(
                file_path, tmp_path)
            zip_client.unpack_archive()
            load_function_path = os.path.join(tmp_path, 'load_function.pkl')
            load_function = gokart.target.make_target(
                load_function_path).load()
            model = load_function(os.path.join(tmp_path, 'model.pkl'))
            shutil.rmtree(tmp_path)
            return model
        return gokart.target.make_target(file_path=file_path).load()
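A hedged usage sketch of this variant; the workspace path, filters, and task ID are placeholders, and an empty workspace_directory falls back to $TASK_WORKSPACE_DIRECTORY as documented above.

# Hypothetical usage; paths, filters and the task id are placeholders.
tb = Thunderbolt(workspace_directory='s3://my-bucket/workspace',
                 task_filters=['TrainModel', 'Predict'],
                 use_tqdm=True)
print(tb.get_task_df(all_data=False).head())
result = tb.load(task_id=3)  # a single object or a list, depending on how gokart dumped it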
Example #6
class S3Client:
    def __init__(self, workspace_directory: str = '', task_filters: List[str] = [], tqdm_disable: bool = False, use_cache: bool = True):
        self.workspace_directory = workspace_directory
        self.task_filters = task_filters
        self.tqdm_disable = tqdm_disable
        self.bucket_name = workspace_directory.replace('s3://', '').split('/')[0]
        self.prefix = '/'.join(workspace_directory.replace('s3://', '').split('/')[1:])
        self.resource = boto3.resource('s3')
        self.s3client = Session().client('s3')
        self.local_cache = LocalCache(workspace_directory, use_cache)
        self.use_cache = use_cache

    def get_tasks(self) -> List[Dict[str, Any]]:
        """Load all task_log from S3"""
        files = self._get_s3_keys([], '')
        tasks_list = list()
        for x in tqdm(files, disable=self.tqdm_disable):
            n = x['Key'].split('/')[-1]
            if self.task_filters and not [x for x in self.task_filters if x in n]:
                continue
            n = n.split('_')

            if self.use_cache:
                cache = self.local_cache.get(x['Key'])
                if cache:
                    tasks_list.append(cache)
                    continue

            try:
                params = {
                    'task_name': '_'.join(n[:-1]),
                    'task_params': pickle.loads(self.resource.Object(self.bucket_name, x['Key'].replace('task_log', 'task_params')).get()['Body'].read()),
                    'task_log': pickle.loads(self.resource.Object(self.bucket_name, x['Key']).get()['Body'].read()),
                    'last_modified': x['LastModified'],
                    'task_hash': n[-1].split('.')[0]
                }
                tasks_list.append(params)
                if self.use_cache:
                    self.local_cache.dump(x['Key'], params)
            except Exception:
                continue

        if len(tasks_list) != len(files):
            warnings.warn(f'[NOT FOUND LOGS] target file: {len(files)}, found log file: {len(tasks_list)}')

        return tasks_list

    def _get_s3_keys(self, keys: List[Dict[str, Any]] = [], marker: str = '') -> List[Dict[str, Any]]:
        """Recursively get Key from S3.

        Uses the boto3 S3 client API.
        Reference: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html

        Args:
            keys: Accumulator of object keys collected so far; grows with each recursive call.
            marker: S3 pagination marker. The recursion ends when the listing is no longer truncated.

        Returns:
            One dict per S3 object, e.g. [{'Key': ..., 'LastModified': ...}, ...].
        """
        response = self.s3client.list_objects(Bucket=self.bucket_name, Prefix=os.path.join(self.prefix, 'log/task_log'), Marker=marker)
        if 'Contents' in response:
            keys.extend([{'Key': content['Key'], 'LastModified': content['LastModified']} for content in response['Contents']])
            if response.get('IsTruncated'):
                return self._get_s3_keys(keys=keys, marker=keys[-1]['Key'])
        return keys

    def to_absolute_path(self, x: str) -> str:
        """get S3 file path"""
        x = x.lstrip('.').lstrip('/')
        if self.workspace_directory.rstrip('/').split('/')[-1] == x.split('/')[0]:
            x = '/'.join(x.split('/')[1:])
        return x
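Assuming the LocalCache helper referenced in __init__ is importable, usage might look like the following; the S3 workspace path and filter are hypothetical placeholders.

# Hypothetical usage; the S3 workspace path and filter are placeholders.
client = S3Client(workspace_directory='s3://my-bucket/workspace',
                  task_filters=['TrainModel'],
                  use_cache=True)
tasks = client.get_tasks()  # list of dicts: task_name, task_params, task_log, ...
print(len(tasks))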
Example #7
import json

import boto3
from boto3.session import Session
from datetime import datetime, timedelta
from pyspark import SparkContext
from pyspark.sql import SparkSession

sc = SparkContext()
spark = SparkSession.builder.getOrCreate()

s3client = Session().client('s3')
s3 = boto3.resource('s3')
bucket = s3.Bucket('zozo-image-analyze-user-favorite')

now = datetime.now()
target_date = now - timedelta(hours=1)
prefix = target_date.strftime("%Y/%m/%d/%H")

response = s3client.list_objects(Bucket='zozo-image-analyze-user-favorite',
                                 Prefix=prefix)

keys = []
if 'Contents' in response:
    keys = [content['Key'] for content in response['Contents']]

timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")

json_file_name = '/home/ec2-user/close-clothes/user-favorite/' + timestamp + '.json'
parquet_file_name = '/home/ec2-user/close-clothes/user-favorite/' + timestamp + '.parquet'

f = open(json_file_name, 'w')
for key in keys:
    obj = bucket.Object(key)
    obj = obj.get()
    json_stream = obj['Body'].read()
    decoder = json.JSONDecoder()