# Imports shared by the snippets below (an editorial consolidation; helpers
# such as ric2filename, get_table_name, _get_table_name, flatten_tags, prune,
# wait_until_instance_profile_arn_exists, and the module-level logger/log
# objects are assumed to come from the surrounding modules).
import logging
import os
import re
import zipfile
from datetime import datetime, timedelta, timezone
from http import HTTPStatus
from pathlib import Path
from time import time
from typing import Dict, List, Optional, Tuple, Union

import boto3
import botocore
from boto3.dynamodb.conditions import Attr, Key
from boto3.resources.base import ServiceResource
from botocore.exceptions import ClientError


def download_prices_from_s3(bucket: ServiceResource, dir_prices: Path,
                            remote_dir_prices: Path, missing_rics: List[str],
                            logger: logging.Logger) -> None:
    dir_prices.mkdir(parents=True, exist_ok=True)
    for ric in missing_rics:
        remote_filename = ric2filename(remote_dir_prices, ric, 'csv.gz')
        basename = remote_filename.name
        dest = dir_prices / basename
        if dest.is_file():
            logger.debug('skip downloading {}'.format(basename))
        else:
            logger.debug('start downloading {}'.format(basename))
            try:
                bucket.download_file(Key=str(remote_filename), Filename=str(dest))
            except ClientError as e:
                # Only a 404 is reported; other client errors are ignored here.
                code = e.response.get('Error', {}).get('Code', '')
                if str(code) == str(HTTPStatus.NOT_FOUND.value):
                    logger.critical('{} is not found'.format(str(remote_filename)))
            logger.debug('end downloading {}'.format(basename))

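# Usage sketch for the download helpers: the "bucket" argument is a boto3
# S3 Bucket resource. The bucket name, paths, and RIC below are
# illustrative, not values from this codebase.
def _example_download_prices() -> None:
    bucket = boto3.resource('s3').Bucket('price-archive')
    download_prices_from_s3(bucket, Path('data/prices'), Path('prices'),
                            ['AAPL.O'], logging.getLogger(__name__))
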
def download_nikkei_bodies_from_s3(bucket: ServiceResource, dest_dirname: Path,
                                   remote_filenames: List[Path],
                                   logger: logging.Logger) -> None:
    dest_dirname.mkdir(parents=True, exist_ok=True)
    for remote_filename in remote_filenames:
        logger.info('start downloading {}'.format(remote_filename))
        dest = dest_dirname.joinpath(remote_filename.name)
        if not dest.is_file():
            bucket.download_file(Key=str(remote_filename), Filename=str(dest))
        logger.info('end downloading {}'.format(remote_filename))

def download_nikkei_headlines_from_s3(bucket: ServiceResource, cp932zip_dirname: str,
                                      utf8csv_dirname: str, remote_filenames: List[str],
                                      logger: logging.Logger) -> None:
    os.makedirs(cp932zip_dirname, exist_ok=True)
    os.makedirs(utf8csv_dirname, exist_ok=True)
    for remote_filename in remote_filenames:
        basename = os.path.basename(remote_filename).lower()
        temp_dest = os.path.join(cp932zip_dirname, basename)
        if os.path.isfile(temp_dest):
            logger.debug('skip downloading {}'.format(basename))
        else:
            logger.debug('start downloading {}'.format(basename))
            try:
                bucket.download_file(Key=remote_filename, Filename=temp_dest)
            except ClientError as e:
                code = e.response.get('Error', {}).get('Code', '')
                if str(code) == str(HTTPStatus.NOT_FOUND.value):
                    logger.info('{} is not found'.format(remote_filename))
            logger.debug('end downloading {}'.format(basename))
        infl_filename = re.sub(r'\.zip$', '.csv', basename, flags=re.IGNORECASE)
        match = re.search(r'_([12][0-9]{3})_', infl_filename)
        if match is None:
            raise ValueError('no year found in filename: {}'.format(infl_filename))
        year = int(match[1])
        utf8_filename = os.path.join(utf8csv_dirname,
                                     'nikkei_headlines_{}.csv'.format(year))
        if os.path.isfile(utf8_filename):
            logger.debug('skip converting {}'.format(utf8_filename))
            continue
        # Inflate the CP932-encoded CSV inside the zip and re-encode it as UTF-8.
        with zipfile.ZipFile(temp_dest, mode='r') as zf:
            logger.debug('start converting {}'.format(utf8_filename))
            with zf.open(infl_filename, mode='r') as cp932_file:
                text = cp932_file.read()
            with open(utf8_filename, mode='wb') as utf8_file:
                utf8_file.write(text.decode('cp932').encode('utf-8'))
            logger.debug('end converting {}'.format(utf8_filename))

def download_reuters_articles_from_s3(bucket: ServiceResource, dest_dirname: Path,
                                      remote_dirnames: List[Path],
                                      logger: logging.Logger) -> None:
    dest_dirname.mkdir(parents=True, exist_ok=True)
    for remote_dirname in remote_dirnames:
        logger.info('start downloading files in {}'.format(remote_dirname))
        summaries = bucket.objects.filter(Prefix=str(remote_dirname))
        for summary in summaries:
            dest = dest_dirname.joinpath(summary.key.split('/')[-1])
            if not summary.key.endswith('/') and not dest.is_file():
                bucket.download_file(Key=summary.key, Filename=str(dest))
        logger.info('end downloading files in {}'.format(remote_dirname))

def _create_topic_base(sqs: ServiceResource, topic: str) -> None:
    try:
        try:
            sqs.get_queue_by_name(QueueName='%s.fifo' % topic)
        except sqs.meta.client.exceptions.QueueDoesNotExist:
            sqs.create_queue(QueueName='%s.fifo' % topic,
                             Attributes={
                                 'DelaySeconds': '0',
                                 'MessageRetentionPeriod': '86400',
                                 'FifoQueue': 'true',
                             })
            logging.info('Queue %s created', topic)
    except ClientError:
        # Queue creation is best-effort; any other client error is ignored.
        pass

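# Minimal caller sketch for _create_topic_base, assuming default AWS
# credentials and region; the topic name is hypothetical. Because the queue
# is looked up before it is created, repeated calls are idempotent.
def _example_create_topic() -> None:
    sqs = boto3.resource('sqs')
    _create_topic_base(sqs, 'orders')  # creates orders.fifo only if absent
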
def update_metadata(option: dict, dynamodb_resource: ServiceResource) -> dict:
    """Update the metadata item and return its new attributes."""
    table = dynamodb_resource.Table(get_table_name())
    resp = table.update_item(**option)
    return resp['Attributes']

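# Sketch of the option dict update_metadata expects: plain update_item
# kwargs. The 'status' attribute is an assumption; 'id' matches the key
# used by get_a_metadata/fetch_a_metadata below. ReturnValues must be set,
# otherwise the response carries no 'Attributes'.
def _example_update_metadata() -> dict:
    option = {
        'Key': {'id': 'some-id'},
        'UpdateExpression': 'set #status = :status',
        'ExpressionAttributeNames': {'#status': 'status'},
        'ExpressionAttributeValues': {':status': 'done'},
        'ReturnValues': 'ALL_NEW',
    }
    return update_metadata(option, boto3.resource('dynamodb'))
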
def put_make_history_item(item: dict, table_name: str,
                          dynamodb_resource: ServiceResource):
    table = dynamodb_resource.Table(table_name)
    option = {"Item": item}
    resp = table.put_item(**option)
    logger.info("put make history item", item=item, resp=resp)

def upload_image_to_s3(
    # See https://github.com/python/typeshed/issues/2706
    bucket: ServiceResource,
    file_name: str,
    content_type: Optional[str],
    user_profile: UserProfile,
    contents: bytes,
) -> None:
    key = bucket.Object(file_name)
    metadata = {
        "user_profile_id": str(user_profile.id),
        "realm_id": str(user_profile.realm_id),
    }

    content_disposition = ""
    if content_type is None:
        content_type = ""
    if content_type not in INLINE_MIME_TYPES:
        content_disposition = "attachment"

    key.put(
        Body=contents,
        Metadata=metadata,
        ContentType=content_type,
        ContentDisposition=content_disposition,
    )

def put_metadata_item(metadata: dict, dynamodb_resource: ServiceResource):
    """Save the metadata item to DynamoDB."""
    table_name = _get_table_name()
    table = dynamodb_resource.Table(table_name)
    table.put_item(Item=metadata)

def upsert_user_make_history(user_id: str, user_name: str, amount: int,
                             table_name: str,
                             dynamodb_resource: ServiceResource) -> int:
    jst = timezone(offset=timedelta(hours=+9), name="jst")
    now = datetime.now(jst).strftime("%Y年%m月")
    table = dynamodb_resource.Table(table_name)
    key_user_times = f"{user_id}_times"
    option = {
        "Key": {"partitionId": "userMakeHistory", "sortId": now},
        "UpdateExpression": "add #user_times :user_times set #user_name = :user_name, #updatedAt = :updatedAt",
        "ExpressionAttributeNames": {
            "#user_times": key_user_times,
            "#user_name": f"{user_id}_name",
            "#updatedAt": "updatedAt",
        },
        "ExpressionAttributeValues": {
            ":user_times": amount,
            ":user_name": user_name,
            ":updatedAt": int(datetime.now(timezone.utc).timestamp() * 1000),
        },
        "ReturnValues": "ALL_NEW",
    }
    resp = table.update_item(**option)
    return int(resp["Attributes"][key_user_times])

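# Caller sketch for upsert_user_make_history; table name and user id are
# placeholders. Because the update uses "add", the per-user counter is
# created on first write and incremented atomically afterwards, so a single
# update_item call doubles as an upsert.
def _example_upsert_user_make_history() -> None:
    dynamodb = boto3.resource('dynamodb')
    times = upsert_user_make_history('U012345', 'alice', 1,
                                     'make-history-table', dynamodb)
    print(f'alice has made {times} this month')
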
def get_a_metadata(id: str, dynamodb_resource: ServiceResource) -> Optional[dict]:
    """Fetch a metadata item by its ID."""
    table = dynamodb_resource.Table(get_table_name())
    resp = table.get_item(Key={'id': id})
    return resp.get('Item')

def upload_prices_to_s3(bucket: ServiceResource, local_dir: Path,
                        remote_dir: Path, rics: List[str]) -> None:
    for ric in rics:
        local_filename = ric2filename(local_dir, ric, 'csv.gz')
        key = str(remote_dir / local_filename.name)
        # Skip files that already exist in the bucket.
        objs = list(bucket.objects.filter(Prefix=key).all())
        if len(objs) > 0 and objs[0].key == key:
            continue
        with local_filename.open(mode='rb') as body:
            bucket.put_object(Key=key, Body=body)

def fetch_a_metadata(id: str, dynamodb_resource: ServiceResource) -> Optional[dict]:
    """Fetch a single metadata item from DynamoDB; returns None if no
    matching metadata exists (not found)."""
    table = dynamodb_resource.Table(get_table_name())
    resp = table.get_item(Key={'id': id})
    return resp.get('Item')

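# Caller sketch for fetch_a_metadata; the id value is illustrative and
# get_table_name() is assumed to resolve to an existing table.
def _example_fetch_a_metadata() -> None:
    meta = fetch_a_metadata('20200101-0001', boto3.resource('dynamodb'))
    if meta is None:
        print('metadata not found')
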
def scan_table(table_name: str, dynamodb_resource: ServiceResource) -> List[dict]:
    table = dynamodb_resource.Table(table_name)
    resp = table.scan()
    table_items = resp['Items']
    # A single scan returns at most 1 MB of data, so follow
    # LastEvaluatedKey until the whole table has been read.
    while 'LastEvaluatedKey' in resp:
        resp = table.scan(ExclusiveStartKey=resp['LastEvaluatedKey'])
        table_items.extend(resp['Items'])
    return table_items

def delete_diff_items(table_name: str, partition_key: str, sort_key: str,
                      diff_keys: List[str],
                      dynamodb_resource: ServiceResource) -> None:
    table = dynamodb_resource.Table(table_name)
    for diff_key in diff_keys:
        key = {partition_key: diff_key.split(KEY_SPLITTER)[0]}
        if sort_key != '':
            key[sort_key] = diff_key.split(KEY_SPLITTER)[1]
        table.delete_item(Key=key)

def create_instances(ec2_resource: ServiceResource,
                     image_id: str,
                     key_name: str,
                     instance_type: str,
                     num_instances: int = 1,
                     security_group_ids: Optional[List] = None,
                     user_data: Optional[Union[str, bytes]] = None,
                     block_device_map: Optional[List[Dict]] = None,
                     instance_profile_arn: Optional[str] = None,
                     placement_az: Optional[str] = None,
                     subnet_id: Optional[str] = None,
                     tags: Optional[Dict[str, str]] = None) -> List:
    """
    Replaces create_ondemand_instances.  Uses boto3 and returns a list of
    Boto3 Instance resources.

    See "create_instances" (returns a list of ec2.Instance objects):
    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances

    Not to be confused with "run_instances" (same input args; returns a dictionary):
    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.Client.run_instances

    Tags, if given, are applied to the instances, and all volumes.
    """
    logger.info('Creating %s instance(s) ... ', instance_type)

    if isinstance(user_data, str):
        user_data = user_data.encode('utf-8')

    request = {'ImageId': image_id,
               'MinCount': num_instances,
               'MaxCount': num_instances,
               'KeyName': key_name,
               'SecurityGroupIds': security_group_ids,
               'InstanceType': instance_type,
               'UserData': user_data,
               'BlockDeviceMappings': block_device_map,
               'SubnetId': subnet_id}

    if instance_profile_arn:
        # We could just retry when we get an error because the ARN doesn't
        # exist, but we might as well wait for it.
        wait_until_instance_profile_arn_exists(instance_profile_arn)
        # Add it to the request
        request['IamInstanceProfile'] = {'Arn': instance_profile_arn}

    if placement_az:
        request['Placement'] = {'AvailabilityZone': placement_az}

    if tags:
        # Tag everything when we make it.
        flat_tags = flatten_tags(tags)
        request['TagSpecifications'] = [{'ResourceType': 'instance', 'Tags': flat_tags},
                                        {'ResourceType': 'volume', 'Tags': flat_tags}]

    return ec2_resource.create_instances(**prune(request))

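# Hedged usage sketch for create_instances: every concrete value below
# (AMI id, key pair, subnet) is a placeholder. Optional arguments left as
# None are pruned from the request before boto3 sees them.
def _example_create_instances() -> None:
    ec2 = boto3.resource('ec2')
    instances = create_instances(ec2,
                                 image_id='ami-0123456789abcdef0',
                                 key_name='my-keypair',
                                 instance_type='t3.medium',
                                 num_instances=2,
                                 subnet_id='subnet-0123456789abcdef0',
                                 tags={'project': 'demo'})
    print([instance.id for instance in instances])
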
def delete_file_from_s3(self, path_id: str, bucket: ServiceResource) -> bool:
    key = bucket.Object(path_id)

    try:
        key.load()
    except botocore.exceptions.ClientError:
        file_name = path_id.split("/")[-1]
        logging.warning("%s does not exist. Its entry in the database will be removed.", file_name)
        return False

    key.delete()
    return True

def update_sack_counter(amount: int, table_name: str,
                        dynamodb_resource: ServiceResource) -> int:
    table = dynamodb_resource.Table(table_name)
    option = {
        "Key": {"partitionId": "sackCounter", "sortId": "counter"},
        "UpdateExpression": "add #times :amount set #updatedAt = :updatedAt",
        "ExpressionAttributeNames": {"#times": "times", "#updatedAt": "updatedAt"},
        "ExpressionAttributeValues": {
            ":amount": amount,
            ":updatedAt": int(datetime.now(timezone.utc).timestamp() * 1000),
        },
        "ReturnValues": "ALL_NEW",
    }
    resp = table.update_item(**option)
    return int(resp["Attributes"]["times"])

def scan_metadata(dynamodb_resource: ServiceResource,
                  last_evaluated_key: Optional[dict] = None) -> List[dict]:
    """Fetch all metadata items from DynamoDB.

    A single scan may not return everything when the table holds a lot of
    data, so this recurses on LastEvaluatedKey until every item has been
    retrieved.
    """
    table = dynamodb_resource.Table(get_table_name())
    option = {}
    if last_evaluated_key is not None:
        option['ExclusiveStartKey'] = last_evaluated_key
    resp = table.scan(**option)
    result = resp.get('Items', [])
    if 'LastEvaluatedKey' in resp:
        result += scan_metadata(dynamodb_resource,
                                last_evaluated_key=resp['LastEvaluatedKey'])
    return result

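# Caller sketch: fetch everything in one call. This recursive pagination is
# equivalent to the iterative LastEvaluatedKey loop in scan_table above;
# the iterative form avoids deep recursion on very large tables.
def _example_scan_metadata() -> None:
    all_items = scan_metadata(boto3.resource('dynamodb'))
    print(len(all_items), 'metadata items')
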
def get_open_sack_history(table_name: str,
                          dynamodb_resource: ServiceResource) -> List[dict]:
    table = dynamodb_resource.Table(table_name)
    option = {
        "KeyConditionExpression": Key("partitionId").eq("sackHistory"),
        "Limit": 5,
        "ScanIndexForward": False,  # newest first
        "ProjectionExpression": "#sortId, #times",
        "ExpressionAttributeNames": {"#sortId": "sortId", "#times": "times"},
    }
    resp = table.query(**option)
    logger.info("get open sack history result", option=option, response=resp)
    return resp.get("Items", [])

def insert_sack_history(times: int, table_name: str,
                        dynamodb_resource: ServiceResource) -> None:
    table = dynamodb_resource.Table(table_name)
    jst = timezone(offset=timedelta(hours=+9), name="jst")
    now = str(datetime.now(jst))
    option = {
        "Item": {"partitionId": "sackHistory", "sortId": now, "times": times}
    }
    resp = table.put_item(**option)
    logger.info("insert sack history result", option=option, resp=resp)

def create_instances(ec2: ServiceResource,
                     image_id: str,
                     key_name: str,
                     instance_type: str,
                     instance_profile_arn: Dict,
                     num_instances: int = 1,
                     security_group_ids: Optional[List] = None,
                     user_data: Optional[bytes] = None,
                     block_device_map: Optional[List[Dict]] = None,
                     placement: Optional[Dict] = None,
                     subnet_id: Optional[str] = None):
    """
    Replaces create_ondemand_instances.  Uses boto3.

    See "create_instances" (returns a list of ec2.Instance objects):
    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances

    Not to be confused with "run_instances" (same input args; returns a dictionary):
    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.Client.run_instances
    """
    log.info('Creating %s instance(s) ... ', instance_type)
    wait_until_instance_profile_arn_exists(instance_profile_arn)
    request = {
        'ImageId': image_id,
        'MinCount': num_instances,
        'MaxCount': num_instances,
        'KeyName': key_name,
        'SecurityGroupIds': security_group_ids,
        'InstanceType': instance_type,
        'UserData': user_data,
        'Placement': placement,
        'BlockDeviceMappings': block_device_map,
        'IamInstanceProfile': instance_profile_arn,
        'SubnetId': subnet_id,
    }
    # Drop empty/None args so boto3 does not reject them.
    actual_request = {key: value for key, value in request.items() if value}
    return ec2.create_instances(**actual_request)

def reset_sack_counter(table_name: str, dynamodb_resource: ServiceResource) -> None:
    table = dynamodb_resource.Table(table_name)
    option = {
        "Key": {"partitionId": "sackCounter", "sortId": "counter"},
        "UpdateExpression": "set #times = :times, #updatedAt = :updatedAt",
        "ExpressionAttributeNames": {"#times": "times", "#updatedAt": "updatedAt"},
        "ExpressionAttributeValues": {
            ":times": 0,
            ":updatedAt": int(datetime.now(timezone.utc).timestamp() * 1000),
        },
    }
    resp = table.update_item(**option)
    logger.info("reset sack counter result", resp=resp)

def update_make_history(user_id: str, user_name: str, ts: str, table_name: str,
                        dynamodb_resource: ServiceResource) -> Tuple[int, int]:
    table = dynamodb_resource.Table(table_name)
    partition_id = "makeHistory"
    option = {
        "Key": {"partitionId": partition_id, "sortId": ts},
        "ConditionExpression": (
            Key("partitionId").eq(partition_id)
            & Key("sortId").eq(ts)
            & Attr("isDetected").eq(False)
        ),
        "UpdateExpression": "set #updatedAt = :updatedAt, #userName = :userName, #isDetected = :isDetected",
        "ExpressionAttributeNames": {
            "#updatedAt": "updatedAt",
            "#userName": f"{user_id}_name",
            "#isDetected": "isDetected",
        },
        "ExpressionAttributeValues": {
            ":updatedAt": int(datetime.now(timezone.utc).timestamp() * 1000),
            ":userName": user_name,
            ":isDetected": True,
        },
        "ReturnValues": "ALL_NEW",
    }
    try:
        resp = table.update_item(**option)
        logger.info("update make history result", option=option, response=resp)
        times = resp["Attributes"]["times"]
        amount = resp["Attributes"]["amount"]
        return int(times), int(amount)
    except ClientError as e:
        if e.response["Error"]["Code"] == "ConditionalCheckFailedException":
            raise NotTargetError()
        raise

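# Caller sketch for update_make_history; the timestamp and table name are
# placeholders. The ConditionExpression makes this a one-shot claim on a
# not-yet-detected record, with the conditional failure surfaced as
# NotTargetError.
def _example_update_make_history() -> None:
    try:
        times, amount = update_make_history('U012345', 'alice',
                                            '2021-01-01 00:00:00',
                                            'history-table',
                                            boto3.resource('dynamodb'))
    except NotTargetError:
        pass  # record already detected, or the key does not match
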
def read_s3(bucket: str, key: str, s3: Optional[ServiceResource] = None):
    """
    Read an object from S3 and return its body as a UTF-8 string.

    :type bucket: str
    :param bucket: S3 bucket name
    :type key: str
    :param key: object key within the bucket
    :param s3: S3 resource; created on the fly if not given
    :return: str
    """
    if not s3:
        log.warning('creating a S3 resource in read_s3() function')
        s3 = get_s3_resource()
    t0 = time()
    log.info("Downloading config file {0} from s3://{1}...".format(key, bucket))
    obj = s3.Object(bucket, key)
    # Read before logging the elapsed time, so the download is included.
    body = obj.get()['Body'].read().decode('utf-8')
    log.debug('ET for reading {} from S3: {} sec'.format(key, round(time() - t0, 4)))
    return body

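# Usage sketch for read_s3; bucket and key are placeholders. Passing an
# existing resource avoids the warning-and-create fallback inside.
def _example_read_s3() -> None:
    s3 = boto3.resource('s3')
    config_text = read_s3('my-config-bucket', 'app/config.yaml', s3=s3)
    print(config_text[:80])
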
def get_user_make_history(year: int, month: int, table_name: str,
                          dynamodb_resource: ServiceResource) -> Optional[dict]:
    table = dynamodb_resource.Table(table_name)
    option = {"Key": {"partitionId": "userMakeHistory",
                      "sortId": f"{year}年{month:02}月"}}
    resp = table.get_item(**option)
    logger.info("get user make history result", option=option, response=resp)
    return resp.get("Item")

def put_to_table(table_name: str, src_items: List[dict],
                 dynamodb_resource: ServiceResource) -> None:
    table = dynamodb_resource.Table(table_name)
    # batch_writer buffers puts and flushes them as BatchWriteItem requests.
    with table.batch_writer() as batch:
        for item in src_items:
            batch.put_item(Item=item)

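# Sketch for put_to_table; table name and items are illustrative.
# batch_writer takes care of chunking puts into BatchWriteItem requests
# (25 items each) and resending unprocessed items, so callers can hand
# over any number of items.
def _example_put_to_table() -> None:
    items = [{'partitionId': 'demo', 'sortId': str(i)} for i in range(100)]
    put_to_table('history-table', items, boto3.resource('dynamodb'))
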
def __init__(self, table_name: str, dynamodb_resource: ServiceResource):
    self.table_name = table_name
    self.dynamodb_resource = dynamodb_resource
    self.create()
    self.table = dynamodb_resource.Table(table_name)