Example #1
    def test_error_for_file_too_big(self):
        # Upload an oversized fixture to the mocked bucket, then expect
        # download_file to refuse to fetch it.
        client = boto3.client("s3", config=custom_boto_config.init())
        big_file = "mock_file_too_big.donotload"
        fixtures_dir = os.path.join(os.path.dirname(__file__), "fixtures")
        client.upload_file(Filename=f"{fixtures_dir}/{big_file}",
                           Bucket=MOCK_BUCKET,
                           Key=big_file)
        with pytest.raises(FileSizeTooBigException):
            download_file(MOCK_BUCKET, big_file)
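
The implementation of download_file is not shown on this page. As a minimal sketch of how the size guard exercised by this test could work, assuming a hypothetical MAX_FILE_SIZE_BYTES limit and the FileSizeTooBigException used above, the helper might check the object size with head_object before transferring anything:

import os

import boto3

# Assumed names: MAX_FILE_SIZE_BYTES and the exception class are placeholders,
# not values taken from the examples on this page.
MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024


class FileSizeTooBigException(Exception):
    """Raised when the S3 object exceeds the configured size limit."""


def download_file(bucket: str, key: str) -> str:
    client = boto3.client("s3")
    # head_object returns metadata only, so the size check costs no transfer
    content_length = client.head_object(Bucket=bucket, Key=key)["ContentLength"]
    if content_length > MAX_FILE_SIZE_BYTES:
        raise FileSizeTooBigException(
            f"{key} is {content_length} bytes; the limit is {MAX_FILE_SIZE_BYTES}")
    local_path = os.path.join("/tmp", os.path.basename(key))
    client.download_file(bucket, key, local_path)
    return local_path
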
Example #2
def generate_report_for_day_s3(s3bucket: str, previous_report_s3_key: str,
                               new_report_s3_key: str, report_date: date,
                               report_time: time) -> str:

    # Load the workbook
    print("downloading file from bucket: " + s3bucket + " with key: " +
          previous_report_s3_key)
    # https://stackoverflow.com/questions/17195569/using-a-variable-in-a-try-catch-finally-statement-without-declaring-it-outside
    try:
        file = download_file(bucket=s3bucket, key=previous_report_s3_key)
    except Exception as exc:
        raise RuntimeError('Failed to get report from previous day. Key: ' +
                           previous_report_s3_key) from exc

    wb = load_workbook(file)

    # Should update the wb itself
    update_workbook_with_new_record(wb=wb,
                                    report_date=report_date,
                                    report_time=report_time)

    # Save the workbook
    local_filepath = '/tmp/' + get_file_name(report_date)
    print("saving file locally to path: " + local_filepath)
    wb.save(filename=local_filepath)

    print("uploading file to bucket: " + s3bucket + " with key: " +
          new_report_s3_key)
    upload_workbook(workbook=wb, bucket=s3bucket, key=new_report_s3_key)

    return local_filepath
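
upload_workbook is a project helper that is not shown here. A plausible sketch, assuming it serialises the openpyxl workbook to an in-memory buffer and uploads it with put_object, could look like this:

import io

import boto3
from openpyxl import Workbook


def upload_workbook(workbook: Workbook, bucket: str, key: str) -> None:
    # openpyxl can save to a file-like object, so no temporary file is needed
    buffer = io.BytesIO()
    workbook.save(buffer)
    buffer.seek(0)
    boto3.client("s3").put_object(Bucket=bucket, Key=key, Body=buffer.getvalue())
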
Example #3
    def process_file(self, bucket_name: str, key_prefix: str):
        # download the file and build the array of json documents
        file_path = s3_util.download_file(bucket_name, key_prefix)
        with open(file_path) as json_file:
            # generate a parent_id to aggregate records coming from the same file; an
            # index is appended to it for each individual record using a '#' delimiter
            parent_id = None
            if not self.id_expression:
                parent_id = uuid.uuid4().hex

            source_json_data = json.load(json_file)

            # evaluate the expression once; a missing or empty match stays None
            lang_code = self.lang_expression.search(source_json_data) or None

            created_at = None
            if not self.create_date_expression:
                created_at = datetime.now(
                    timezone.utc).strftime(TIMESTAMP_FORMAT)

            if self.list_selector_expression:
                json_data = self.list_selector_expression.search(
                    source_json_data)
                for index, record in enumerate(json_data):
                    output_record = self.transform_row(record,
                                                       index,
                                                       lang=lang_code,
                                                       parent_id=parent_id,
                                                       created_at=created_at)
                    output_record["feed"]["source_file"] = os.path.basename(
                        file_path)
                    logger.debug(
                        f"JSON record is: {json.dumps(output_record)}")
                    # buffer the output_record into kinesis data stream
                    buffer_data_into_stream(
                        output_record,
                        partition_key=output_record["feed"]["id_str"])

                # the selector section of the json has already been processed above, so
                # remove that key from the json object to reduce the parameter size
                del source_json_data[self.list_selector_key]
                self.auxillary_processing_callback(source_json_data,
                                                   parent_id=parent_id,
                                                   created_at=created_at)
            else:
                output_record = self.transform_row(
                    source_json_data, 0
                )  # passing index as 0 since it has only 1 record in a file
                logger.debug(f"JSON record is: {json.dumps(output_record)}")
                buffer_data_into_stream(
                    output_record,
                    partition_key=output_record["feed"]["id_str"])
        # the file has been processed, so delete it; this ensures a reused lambda
        # instance does not keep a stale copy of the same file in /tmp
        os.remove(file_path)
        s3_util.tag_file_as_processed(
            bucket_name,
            key_prefix)  # tag file in s3 that processing is complete
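
buffer_data_into_stream is only called in these examples, never defined. A minimal sketch, assuming each record goes straight to a Kinesis data stream whose name comes from a hypothetical STREAM_NAME environment variable, might be:

import json
import os

import boto3

kinesis_client = boto3.client("kinesis")


def buffer_data_into_stream(record: dict, partition_key: str) -> None:
    # send one record per call; the real helper presumably batches, as its name suggests
    kinesis_client.put_record(
        StreamName=os.environ["STREAM_NAME"],  # assumed configuration
        Data=json.dumps(record).encode("utf-8"),
        PartitionKey=partition_key,
    )
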
Example #4
    def test_download_files(self):
        # Download every fixture from the mocked bucket and check that the
        # local file name matches the requested key.
        fixtures_dir = os.path.join(os.path.dirname(__file__), "fixtures")
        filenames = [
            fn for fn in os.listdir(fixtures_dir) if any(
                fn.endswith(ext) for ext in included_extensions)
        ]
        for filename in filenames:
            local_file_path = download_file(MOCK_BUCKET, filename)
            self.assertEqual(os.path.basename(local_file_path),
                             os.path.basename(filename))
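
These tests run against a mocked bucket. One way to set that up, assuming the moto library and uploading every fixture during setUp, is sketched below; MOCK_BUCKET, included_extensions and the class name are illustrative stand-ins for whatever the real test module defines:

import os
import unittest

import boto3
from moto import mock_aws  # moto >= 5; earlier releases expose mock_s3 instead

MOCK_BUCKET = "mock-bucket"                 # assumed bucket name
included_extensions = (".xlsx", ".json")    # assumed fixture extensions


@mock_aws
class DownloadFileTest(unittest.TestCase):
    def setUp(self):
        client = boto3.client("s3", region_name="us-east-1")
        client.create_bucket(Bucket=MOCK_BUCKET)
        fixtures_dir = os.path.join(os.path.dirname(__file__), "fixtures")
        for fn in os.listdir(fixtures_dir):
            if fn.endswith(included_extensions):
                client.upload_file(os.path.join(fixtures_dir, fn), MOCK_BUCKET, fn)
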
Example #5
def get_workbook_file_only_given_s3_event(event):
    bucket_and_key = get_bucket_and_key_from_event(event)
    source_bucket = bucket_and_key[0]
    source_key = bucket_and_key[1]

    # Load the workbook
    print("downloading file from bucket: " + source_bucket + " with key: " +
          source_key)
    # https://stackoverflow.com/questions/17195569/using-a-variable-in-a-try-catch-finally-statement-without-declaring-it-outside
    try:
        return download_file(bucket=source_bucket, key=source_key)
    except Exception as exc:
        raise RuntimeError('Failed to download file from bucket: ' + source_bucket +
                           ' with key: ' + source_key) from exc
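
get_bucket_and_key_from_event is another helper that is not shown. For a standard S3 event notification the bucket and key can be read straight from the payload, so a sketch along these lines seems plausible (keys arrive URL-encoded, hence the unquote_plus):

from urllib.parse import unquote_plus


def get_bucket_and_key_from_event(event: dict) -> tuple:
    # take the first record of the S3 event notification
    record = event["Records"][0]["s3"]
    bucket = record["bucket"]["name"]
    key = unquote_plus(record["object"]["key"])
    return bucket, key
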
Example #6
    def process_file(self, bucket_name: str, key_prefix: str):
        file_path = s3_util.download_file(bucket_name, key_prefix)

        """
        Open the excel file for reading worksheet. Only supports reading the first worksheet. Also the sheet is
        expected to have only data, for security reason any links, VBA scripts or calculation are disabled
        """
        workbook = openpyxl.load_workbook(filename=file_path, read_only=True, data_only=True)
        worksheet = workbook.worksheets[0]
        self.header_list = list(map(lambda x: x.value, worksheet[1]))

        """ Process each row of the worksheet. Assuming that it has a header, reading from 2nd row """
        for row in worksheet.iter_rows(min_row=2, values_only=True):
            data = self.transform_row(row)
            data["source_file"] = os.path.basename(file_path)
            buffer_data_into_stream(data, partition_key=data["feed"]["id_str"])

        # the file has been processed, so delete it; this ensures a reused lambda
        # instance does not keep a stale copy of the same file in /tmp
        os.remove(file_path)
        s3_util.tag_file_as_processed(bucket_name, key_prefix)  # tag file in s3 that processing is complete
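
s3_util.tag_file_as_processed is not shown either. A minimal sketch, assuming it applies a single "processed" object tag via put_object_tagging (the tag key and value are assumptions), would be:

import boto3


def tag_file_as_processed(bucket_name: str, key_prefix: str) -> None:
    # mark the source object so it is not picked up again
    boto3.client("s3").put_object_tagging(
        Bucket=bucket_name,
        Key=key_prefix,
        Tagging={"TagSet": [{"Key": "processed", "Value": "true"}]},
    )
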
Example #7
    def test_error_for_download_files(self):
        # Patch the underlying boto3 transfer call so it raises, and make sure
        # the ClientError propagates out of download_file.
        with pytest.raises(botocore.errorfactory.ClientError):
            with unittest.mock.patch(
                    "boto3.s3.transfer.S3Transfer.download_file",
                    side_effect=get_mock_client_exception()):
                download_file(MOCK_BUCKET, "error_prefix")
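
get_mock_client_exception builds the error injected by the patch above. One way to construct a realistic botocore ClientError, assuming a 404-style failure on GetObject, is:

import botocore.exceptions


def get_mock_client_exception() -> botocore.exceptions.ClientError:
    # botocore.errorfactory.ClientError is the same class re-exported, so
    # pytest.raises in the test above will catch this instance
    error_response = {"Error": {"Code": "404", "Message": "Not Found"}}
    return botocore.exceptions.ClientError(error_response, "GetObject")
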