import os

import boto3
from boto3.session import Session


def download_pub_key_from_s3(bucket_location, bucket_key, path):
    s3client = Session().client('s3')
    response = s3client.list_objects(Bucket=bucket_location, Prefix=bucket_key)
    if 'Contents' in response:  # 'Contents' is absent when no key matches the prefix
        keys = [content['Key'] for content in response['Contents']]
        for key in keys:
            base, ext = os.path.splitext(key)
            if ext == '.pub':
                dirname, filename = os.path.split(key)
                bucket = boto3.resource('s3').Bucket(bucket_location)
                bucket.download_file(key, path + filename)
                return True
    return False
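# A minimal usage sketch of download_pub_key_from_s3 above; the bucket name,
# prefix, and local path are hypothetical placeholders.
if download_pub_key_from_s3('my-keys-bucket', 'ssh-keys/', '/tmp/'):
    print('downloaded the first *.pub object found under the prefix')
else:
    print('no .pub object found under the prefix')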
import gzip
import json
from io import BytesIO

import boto3
from boto3.session import Session


class S3Manager:
    def __init__(self, bucket_name, location):
        self.conn = boto3.resource('s3')
        self.bucket_name = bucket_name
        self.s3client = Session().client('s3')
        self.location = location

    def save(self, data, file_name):
        obj = self.conn.Object(self.bucket_name, file_name)
        obj.put(Body=json.dumps(data, ensure_ascii=False))

    def download(self, file_key):
        obj = self.conn.Object(self.bucket_name, file_key)
        response = obj.get()["Body"].read()
        buf = BytesIO(response)
        gzip_f = gzip.GzipFile(fileobj=buf)
        body = gzip_f.read().decode('utf-8')
        return body

    def get_created_at(self, file_key):
        obj = self.conn.Object(self.bucket_name, file_key)
        return obj.last_modified

    def get_all_file_names(self, prefix='', keys=None, marker=''):
        keys = [] if keys is None else keys  # avoid a shared mutable default argument
        response = self.s3client.list_objects(Bucket=self.bucket_name, Prefix=prefix, Marker=marker)
        if 'Contents' in response:
            keys.extend([content['Key'] for content in response['Contents']])
            if response.get('IsTruncated'):  # test the value: the key itself is always present
                return self.get_all_file_names(prefix=prefix, keys=keys, marker=keys[-1])
        return keys

    def upload(self, remote_file_key, local_file_name, is_public=None):
        if is_public:
            self.s3client.upload_file(local_file_name, self.bucket_name, remote_file_key,
                                      ExtraArgs={'ContentType': "image/png", 'ACL': 'public-read'})
            return "Ok"
        self.s3client.upload_file(local_file_name, self.bucket_name, remote_file_key)
        return "Ok"
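# A short usage sketch of S3Manager; the bucket, region, and key names are
# hypothetical. Note that save() writes plain JSON while download() expects a
# gzip-compressed object, so the two methods are not symmetric round-trips.
manager = S3Manager('my-data-bucket', 'ap-northeast-1')
manager.save({'id': 1, 'name': 'example'}, 'records/example.json')
print(manager.get_all_file_names(prefix='records/'))             # paginated listing
manager.upload('images/logo.png', './logo.png', is_public=True)  # public-read PNG upload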
import os
import pickle
from datetime import datetime
from pathlib import Path

import boto3
import gokart
import pandas as pd
from boto3.session import Session
from tqdm import tqdm


class Thunderbolt:
    def __init__(self, file_path: str, task_filters=''):
        self.file_path = file_path
        self.task_filters = [task_filters] if isinstance(task_filters, str) else task_filters
        is_s3 = file_path.startswith('s3://')
        self.bucket_name = file_path.replace('s3://', '').split('/')[0] if is_s3 else None
        self.prefix = '/'.join(file_path.replace('s3://', '').split('/')[1:]) if is_s3 else None
        self.resource = boto3.resource('s3') if is_s3 else None
        self.s3client = Session().client('s3') if is_s3 else None
        self.tasks = self._get_tasks_from_s3() if is_s3 else self._get_tasks()

    def _get_tasks(self):
        """Get task parameters from the local workspace."""
        files = {str(path) for path in Path(os.path.join(self.file_path, 'log/task_log')).rglob('*')}
        tasks = {}
        for i, x in enumerate(tqdm(files)):
            n = x.split('/')[-1]
            if self.task_filters and not [flt for flt in self.task_filters if flt in n]:
                continue
            n = n.split('_')
            modified = datetime.fromtimestamp(os.stat(x).st_mtime)
            with open(x, 'rb') as f:
                task_log = pickle.load(f)
            with open(x.replace('task_log', 'task_params'), 'rb') as f:
                task_params = pickle.load(f)
            tasks[i] = {
                'task_name': '_'.join(n[:-1]),
                'task_params': task_params,
                'task_log': task_log,
                'last_modified': modified,
                'task_hash': n[-1].split('.')[0],
            }
        return tasks

    def _get_tasks_from_s3(self):
        """Get task parameters from S3."""
        files = self._get_s3_keys([], '')
        tasks = {}
        for i, x in enumerate(tqdm(files)):
            n = x['Key'].split('/')[-1]
            if self.task_filters and not [flt for flt in self.task_filters if flt in n]:
                continue
            n = n.split('_')
            tasks[i] = {
                'task_name': '_'.join(n[:-1]),
                'task_params': pickle.loads(
                    self.resource.Object(self.bucket_name,
                                         x['Key'].replace('task_log', 'task_params')).get()['Body'].read()),
                'task_log': pickle.loads(self.resource.Object(self.bucket_name, x['Key']).get()['Body'].read()),
                'last_modified': x['LastModified'],
                'task_hash': n[-1].split('.')[0],
            }
        return tasks

    def _get_s3_keys(self, keys=None, marker: str = '') -> list:
        """Recursively get keys from S3."""
        keys = [] if keys is None else keys  # avoid a shared mutable default argument
        response = self.s3client.list_objects(Bucket=self.bucket_name,
                                              Prefix=os.path.join(self.prefix, 'log/task_log'),
                                              Marker=marker)
        if 'Contents' in response:
            keys.extend([{'Key': content['Key'], 'LastModified': content['LastModified']}
                         for content in response['Contents']])
            if response.get('IsTruncated'):  # test the value: the key itself is always present
                return self._get_s3_keys(keys=keys, marker=keys[-1]['Key'])
        return keys

    def get_task_df(self, all_data: bool = False) -> pd.DataFrame:
        """Get the tasks as a pandas DataFrame."""
        df = pd.DataFrame([{
            'task_id': k,
            'task_name': v['task_name'],
            'last_modified': v['last_modified'],
            'task_params': v['task_params'],
            'task_hash': v['task_hash'],
            'task_log': v['task_log'],
        } for k, v in self.tasks.items()])
        if all_data:
            return df
        return df[['task_id', 'task_name', 'last_modified', 'task_params']]

    def load(self, task_id: int) -> list:
        """Load every file recorded in the task's log."""
        return [gokart.target.make_target(file_path=x).load() for x in self.tasks[task_id]['task_log']['file_path']]
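# A usage sketch of the Thunderbolt class above, assuming an existing gokart
# workspace; the path, filter string, and task_id are hypothetical.
tb = Thunderbolt('s3://my-bucket/workspace', task_filters='MyTask')
print(tb.get_task_df())    # task_id / task_name / last_modified / task_params
data = tb.load(task_id=0)  # loads every file_path recorded in that task's log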
from boto3.session import Session

# `date_list` (hypothetical) holds datetime objects; the original snippet begins mid-loop.
dates = []
for i in date_list:
    y = str(i.year)
    m = str(i.month)
    d = str(i.day)
    if len(m) != 2:
        m = '0' + m
    if len(d) != 2:
        d = '0' + d
    dates.append(y + m + d)  # equivalently: dates.append(i.strftime('%Y%m%d'))

s3client = Session().client('s3')
rdata = []
for date in dates:
    response = s3client.list_objects(Bucket='ld-rawdata-2', Prefix='TR_JISSEKI/' + date + 'XXXXXX/')
    if 'Contents' in response:  # 'Contents' is absent when no key matches the prefix
        keys = [content['Key'] for content in response['Contents']]
        key = keys[-1]  # the 23:00 file (last key of the day)
        records = b""
        store_codes = ('0001', '0048', '0052')
        jan_codes = ('9263126700000', '9261821300000', '9262702400000', '9285130600000',
                     '9264112900000', '9264102000000', '9264103700000', '9261106100000',
                     '9263621700000', '9262543300000', '9263125000000', '9265625300000',
                     '9264904000000', '9261151100000', '9266505700000', '9285106100000',
                     '9265661100000', '9266339800000', '9262102200000', '9261808400000',
                     '9285108500000', '9264545500000', '9262201200000', '9286901100000',
                     '9261105400000', '9264513400000', '9265603100000', '9262902800000',
                     '9263620000000', '9264514100000')
        # Build the same OR-chained S3 Select query as the original, from the code lists above.
        sql = ("SELECT s._1,s._2,s._4,s._5,s._9,s._12,s._13 FROM s3Object as s"
               " where (" + " or ".join("s._2='{}'".format(c) for c in store_codes) + ")"
               " and (" + " or ".join("s._4='{}'".format(c) for c in jan_codes) + ")")
        r = s3client.select_object_content(  # select_object_content is a client-level API
            Bucket="ld-rawdata-2",
            Key=key,
            ExpressionType='SQL',
            Expression=sql,
            InputSerialization={'CSV': {
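                # The original snippet breaks off mid-call above; what follows is a
                # hedged completion. The CSV serialization options are assumptions,
                # and the loop shows the usual way to drain the event stream that
                # select_object_content returns ('Records' events carry row bytes).
                'FieldDelimiter': ','}},
            OutputSerialization={'CSV': {}},
        )
        for event in r['Payload']:
            if 'Records' in event:
                records += event['Records']['Payload']  # accumulate matched CSV rows
        rdata.append(records.decode('utf-8'))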
import os
import pickle
import shutil
from datetime import datetime
from pathlib import Path
from typing import Any, List, Union

import boto3
import gokart
import pandas as pd
from boto3.session import Session
from tqdm import tqdm


class Thunderbolt:
    def __init__(self,
                 workspace_directory: str = '',
                 task_filters: Union[str, List[str]] = '',
                 use_tqdm: bool = False,
                 tmp_path: str = './tmp'):
        """Thunderbolt init.

        Set the path to a local directory or to S3.

        Args:
            workspace_directory: Gokart's TASK_WORKSPACE_DIRECTORY. If empty, $TASK_WORKSPACE_DIRECTORY from the environment is used.
            task_filters: Filter for task names. Only tasks whose names contain one of the given strings are loaded. A list of filters may also be given.
            use_tqdm: Flag for using tqdm. If False, the progress bar is not displayed (default=False).
            tmp_path: Temporary directory used by the external load function.
        """
        self.tqdm_disable = not use_tqdm
        self.tmp_path = tmp_path
        self.s3client = None
        if not workspace_directory:
            env = os.getenv('TASK_WORKSPACE_DIRECTORY')
            workspace_directory = env if env else ''
        is_s3 = workspace_directory.startswith('s3://')
        self.workspace_directory = workspace_directory if is_s3 else os.path.abspath(workspace_directory)
        self.task_filters = [task_filters] if isinstance(task_filters, str) else task_filters
        self.bucket_name = workspace_directory.replace('s3://', '').split('/')[0] if is_s3 else None
        self.prefix = '/'.join(workspace_directory.replace('s3://', '').split('/')[1:]) if is_s3 else None
        self.resource = boto3.resource('s3') if is_s3 else None
        self.s3client = Session().client('s3') if is_s3 else None
        self.tasks = self._get_tasks_from_s3() if is_s3 else self._get_tasks()

    def _get_tasks(self):
        """Load all task_log files from workspace_directory."""
        files = {str(path) for path in Path(os.path.join(self.workspace_directory, 'log/task_log')).rglob('*')}
        tasks = {}
        for i, x in enumerate(tqdm(files, disable=self.tqdm_disable)):
            n = x.split('/')[-1]
            if self.task_filters and not [flt for flt in self.task_filters if flt in n]:
                continue
            n = n.split('_')
            modified = datetime.fromtimestamp(os.stat(x).st_mtime)
            with open(x, 'rb') as f:
                task_log = pickle.load(f)
            with open(x.replace('task_log', 'task_params'), 'rb') as f:
                task_params = pickle.load(f)
            tasks[i] = {
                'task_name': '_'.join(n[:-1]),
                'task_params': task_params,
                'task_log': task_log,
                'last_modified': modified,
                'task_hash': n[-1].split('.')[0],
            }
        return tasks

    def _get_tasks_from_s3(self):
        """Load all task_log files from S3."""
        files = self._get_s3_keys([], '')
        tasks = {}
        for i, x in enumerate(tqdm(files, disable=self.tqdm_disable)):
            n = x['Key'].split('/')[-1]
            if self.task_filters and not [flt for flt in self.task_filters if flt in n]:
                continue
            n = n.split('_')
            tasks[i] = {
                'task_name': '_'.join(n[:-1]),
                'task_params': pickle.loads(
                    self.resource.Object(self.bucket_name,
                                         x['Key'].replace('task_log', 'task_params')).get()['Body'].read()),
                'task_log': pickle.loads(self.resource.Object(self.bucket_name, x['Key']).get()['Body'].read()),
                'last_modified': x['LastModified'],
                'task_hash': n[-1].split('.')[0],
            }
        return tasks

    def _get_s3_keys(self, keys=None, marker=''):
        """Recursively get keys from S3 using the boto3 s3 client API.

        Reference: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html

        Args:
            keys: The object keys collected so far. Grows with each recursion.
            marker: S3 marker. The recursion ends when no further pages remain.

        Returns:
            Object keys from S3, e.g. [{'Key': 'hoge', 'LastModified': datetime(...)}, ...]
        """
        keys = [] if keys is None else keys  # avoid a shared mutable default argument
        response = self.s3client.list_objects(Bucket=self.bucket_name,
                                              Prefix=os.path.join(self.prefix, 'log/task_log'),
                                              Marker=marker)
        if 'Contents' in response:
            keys.extend([{'Key': content['Key'], 'LastModified': content['LastModified']}
                         for content in response['Contents']])
            if response.get('IsTruncated'):  # test the value: the key itself is always present
                return self._get_s3_keys(keys=keys, marker=keys[-1]['Key'])
        return keys

    def get_task_df(self, all_data: bool = False) -> pd.DataFrame:
        """Get the tasks as a pandas DataFrame.

        Args:
            all_data: If True, also include each task's unique hash and log data.

        Returns:
            A pandas.DataFrame with all gokart task information.
        """
        df = pd.DataFrame([{
            'task_id': k,
            'task_name': v['task_name'],
            'last_modified': v['last_modified'],
            'task_params': v['task_params'],
            'task_hash': v['task_hash'],
            'task_log': v['task_log'],
        } for k, v in self.tasks.items()])
        if all_data:
            return df
        return df[['task_id', 'task_name', 'last_modified', 'task_params']]

    def load(self, task_id: int) -> Union[list, Any]:
        """Load files using gokart.load.

        Args:
            task_id: The ID assigned by Thunderbolt; look it up with Thunderbolt.get_task_df.

        Returns:
            The data, or a list of data, since gokart may split a dump across several files.
        """
        data = [self._target_load(x) for x in self.tasks[task_id]['task_log']['file_path']]
        data = data[0] if len(data) == 1 else data
        return data

    def _target_load(self, file_name: str) -> Any:
        """Select the appropriate gokart load function and load the target.

        Args:
            file_name: Path to gokart's output file.

        Returns:
            The loaded data.
        """
        file_path = os.path.join(os.path.dirname(self.workspace_directory), file_name)
        if file_path.endswith('.zip'):
            tmp_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.path.abspath(self.tmp_path))
            zip_client = gokart.zip_client_util.make_zip_client(file_path, tmp_path)
            zip_client.unpack_archive()
            load_function_path = os.path.join(tmp_path, 'load_function.pkl')
            load_function = gokart.target.make_target(load_function_path).load()
            model = load_function(os.path.join(tmp_path, 'model.pkl'))
            shutil.rmtree(tmp_path)
            return model
        return gokart.target.make_target(file_path=file_path).load()
import os
import pickle
import warnings
from typing import Any, Dict, List

import boto3
from boto3.session import Session
from tqdm import tqdm


class S3Client:
    def __init__(self,
                 workspace_directory: str = '',
                 task_filters: List[str] = None,
                 tqdm_disable: bool = False,
                 use_cache: bool = True):
        self.workspace_directory = workspace_directory
        self.task_filters = task_filters or []  # avoid a shared mutable default argument
        self.tqdm_disable = tqdm_disable
        self.bucket_name = workspace_directory.replace('s3://', '').split('/')[0]
        self.prefix = '/'.join(workspace_directory.replace('s3://', '').split('/')[1:])
        self.resource = boto3.resource('s3')
        self.s3client = Session().client('s3')
        self.local_cache = LocalCache(workspace_directory, use_cache)  # defined elsewhere; a sketch follows below
        self.use_cache = use_cache

    def get_tasks(self) -> List[Dict[str, Any]]:
        """Load all task_log files from S3."""
        files = self._get_s3_keys([], '')
        tasks_list = list()
        for x in tqdm(files, disable=self.tqdm_disable):
            n = x['Key'].split('/')[-1]
            if self.task_filters and not [flt for flt in self.task_filters if flt in n]:
                continue
            n = n.split('_')
            if self.use_cache:
                cache = self.local_cache.get(x['Key'])
                if cache:
                    tasks_list.append(cache)
                    continue
            try:
                params = {
                    'task_name': '_'.join(n[:-1]),
                    'task_params': pickle.loads(
                        self.resource.Object(self.bucket_name,
                                             x['Key'].replace('task_log', 'task_params')).get()['Body'].read()),
                    'task_log': pickle.loads(self.resource.Object(self.bucket_name, x['Key']).get()['Body'].read()),
                    'last_modified': x['LastModified'],
                    'task_hash': n[-1].split('.')[0],
                }
                tasks_list.append(params)
                if self.use_cache:
                    self.local_cache.dump(x['Key'], params)
            except Exception:
                continue
        if len(tasks_list) != len(files):
            warnings.warn(f'[NOT FOUND LOGS] target file: {len(files)}, found log file: {len(tasks_list)}')
        return tasks_list

    def _get_s3_keys(self, keys: List[Dict[str, Any]] = None, marker: str = '') -> List[Dict[str, Any]]:
        """Recursively get keys from S3 using the boto3 s3 client API.

        Reference: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html

        Args:
            keys: The object keys collected so far. Grows with each recursion.
            marker: S3 marker. The recursion ends when no further pages remain.

        Returns:
            Object keys from S3, e.g. [{'Key': 'hoge', 'LastModified': datetime(...)}, ...]
        """
        keys = [] if keys is None else keys  # avoid a shared mutable default argument
        response = self.s3client.list_objects(Bucket=self.bucket_name,
                                              Prefix=os.path.join(self.prefix, 'log/task_log'),
                                              Marker=marker)
        if 'Contents' in response:
            keys.extend([{'Key': content['Key'], 'LastModified': content['LastModified']}
                         for content in response['Contents']])
            if response.get('IsTruncated'):  # test the value: the key itself is always present
                return self._get_s3_keys(keys=keys, marker=keys[-1]['Key'])
        return keys

    def to_absolute_path(self, x: str) -> str:
        """Get the S3 file path, stripping './' and a duplicated workspace directory component."""
        x = x.lstrip('.').lstrip('/')
        if self.workspace_directory.rstrip('/').split('/')[-1] == x.split('/')[0]:
            x = '/'.join(x.split('/')[1:])
        return x
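# S3Client above depends on a LocalCache helper that is not shown. A minimal
# pickle-based sketch of the interface it uses (get/dump keyed by S3 object key);
# the on-disk layout below is an assumption, not the project's actual implementation.
import hashlib
import os
import pickle
from typing import Any, Optional


class LocalCache:
    def __init__(self, workspace_directory: str, use_cache: bool, cache_dir: str = './.thunderbolt_cache'):
        # one cache directory per workspace, derived from a hash of its path
        self.cache_dir = os.path.join(cache_dir, hashlib.md5(workspace_directory.encode()).hexdigest())
        if use_cache:
            os.makedirs(self.cache_dir, exist_ok=True)

    def _path(self, key: str) -> str:
        # one cache file per S3 key; the key is hashed to form a safe file name
        return os.path.join(self.cache_dir, hashlib.md5(key.encode()).hexdigest() + '.pkl')

    def get(self, key: str) -> Optional[Any]:
        path = self._path(key)
        if os.path.exists(path):
            with open(path, 'rb') as f:
                return pickle.load(f)
        return None

    def dump(self, key: str, params: Any) -> None:
        with open(self._path(key), 'wb') as f:
            pickle.dump(params, f)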
import json
from datetime import datetime, timedelta

import boto3
from boto3.session import Session
from pyspark import SparkContext
from pyspark.sql import SparkSession

sc = SparkContext()
spark = SparkSession.builder.getOrCreate()
s3client = Session().client('s3')
s3 = boto3.resource('s3')
bucket = s3.Bucket('zozo-image-analyze-user-favorite')

now = datetime.now()
target_date = now - timedelta(hours=1)
prefix = target_date.strftime("%Y/%m/%d/%H")
response = s3client.list_objects(Bucket='zozo-image-analyze-user-favorite', Prefix=prefix)
if 'Contents' in response:
    keys = [content['Key'] for content in response['Contents']]
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")
    json_file_name = '/home/ec2-user/close-clothes/user-favorite/' + timestamp + '.json'
    parquet_file_name = '/home/ec2-user/close-clothes/user-favorite/' + timestamp + '.parquet'
    f = open(json_file_name, 'w')
    for key in keys:
        obj = bucket.Object(key)
        obj = obj.get()
        json_stream = obj['Body'].read()
        decoder = json.JSONDecoder()
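        # The original snippet ends at the JSONDecoder. A hedged continuation:
        # each object is assumed to hold concatenated JSON documents (not a single
        # array), which is the usual reason to reach for raw_decode here.
        text = json_stream.decode('utf-8')
        pos = 0
        while pos < len(text):
            record, offset = decoder.raw_decode(text, pos)  # parse one document, get its end offset
            f.write(json.dumps(record, ensure_ascii=False) + '\n')  # re-emit as JSON Lines
            pos = offset
            while pos < len(text) and text[pos] in ' \r\n':  # skip separators between documents
                pos += 1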