Example no. 1
 def test_kinesis_put_upload_retrigger_event(self):
     events = KinesisEvents(service="Testing", mode="test")
     s3_bucket = "beep-input-data"
     obj = {
         "Key": "d3Batt/raw/arbin/FastCharge_000002_CH2_Metadata.csv",
         "LastModified": datetime.datetime(2019, 4, 4, 23, 19, 20,
                                           tzinfo=tzutc()),
         "ETag": '"37677ae6b73034197d59cf3075f6fb98"',
         "Size": 615,
         "StorageClass": "STANDARD",
         "Owner": {
             "DisplayName": "it-admin+materials-admin",
             "ID": "02d8b24e2f66c2b5937f391b7c87406d4eeab68cf887bd9933d6631536959f24",
         },
     }
     retrigger_data = {
         "filename": obj["Key"],
         "bucket": s3_bucket,
         "size": obj["Size"],
         "hash": obj["ETag"].strip('"'),
     }
     response_valid = events.put_upload_retrigger_event(
         "complete", retrigger_data)
     assert response_valid["ResponseMetadata"]["HTTPStatusCode"] == 200
Example no. 2
def scan(config):
    print("scanning")
    s3 = boto3.client("s3")
    all_objects = s3.list_objects_v2(Bucket=S3_BUCKET, Prefix=config.s3_prefix)

    objects = [obj for obj in all_objects['Contents'] if obj['Size'] > 1000]

    # db_objects = dim_run['file_path_data'].tolist()
    # print(db_objects)
    # print(len([obj for obj in objects if obj['Key'] not in db_objects]))
    # objects = [obj for obj in objects if obj['Key'] not in db_objects]
    objects = [
        obj for obj in objects if "PredictionDiagnostics" in obj['Key']
        and "x" not in obj['Key'] and "Complete" not in obj['Key']
        # and obj['LastModified'] < datetime.datetime(2020, 2, 15, 5, 35, 43, tzinfo=tzutc())]
        and "_000128_" in obj['Key']
    ]
    print(len(objects))

    events = KinesisEvents(service='S3Syncer', mode=config.mode)
    objects.reverse()
    for obj in objects:
        retrigger_data = {
            "filename": obj['Key'],
            "bucket": S3_BUCKET,
            "size": obj['Size'],
            "hash": obj["ETag"].strip('\"')
        }
        events.put_upload_retrigger_event('complete', retrigger_data)
        print(retrigger_data)
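
The config object passed to scan() is not defined in this snippet; a minimal stand-in, assuming only the two attributes the function actually reads (.s3_prefix and .mode, with placeholder values), could look like:

from types import SimpleNamespace

# Hypothetical stand-in for the real config; scan() above only reads
# config.s3_prefix and config.mode. Values are illustrative placeholders.
config = SimpleNamespace(s3_prefix="d3Batt/raw/arbin", mode="test")
# scan(config)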
Example no. 3
 def test_kinesis_put_service_event_stress(self):
     events = KinesisEvents(service='Testing', mode='test')
     for i in range(10):
         array = np.random.rand(5, 5, 3)
         print(array.tolist())
         response_valid = events.put_service_event('Test', 'starting', {"Stress test array": array.tolist()})
         assert response_valid['ResponseMetadata']['HTTPStatusCode'] == 200
Example no. 4
    def test_kinesis_put_analyzing_event(self):
        events = KinesisEvents(service='Testing', mode='test')
        processed_paths_list = [os.path.join(TEST_FILE_DIR, "2017-12-04_4_65C-69per_6C_features.json")]
        processed_run_list = [24]
        processed_result_list = ["success"]
        processed_message_list = [{'comment': '',
                                   'error': ''}]

        output_data = {"file_list": processed_paths_list,
                       "run_list": processed_run_list,
                       "result_list": processed_result_list,
                       "message_list": processed_message_list
                       }

        response_valid = events.put_analyzing_event(output_data, 'featurizing', 'complete')
        assert response_valid['ResponseMetadata']['HTTPStatusCode'] == 200

        processed_paths_list = [os.path.join(TEST_FILE_DIR, "2017-12-04_4_65C-69per_6C_predictions.json")]
        processed_run_list = [24]
        processed_result_list = ["success"]
        processed_message_list = [{'comment': '',
                                   'error': ''}]

        output_data = {"file_list": processed_paths_list,
                       "run_list": processed_run_list,
                       "result_list": processed_result_list,
                       "message_list": processed_message_list
                       }

        response_valid = events.put_analyzing_event(output_data, 'predicting', 'complete')
        assert response_valid['ResponseMetadata']['HTTPStatusCode'] == 200
Example no. 5
 def test_kinesis_put_upload_retrigger_event(self):
     events = KinesisEvents(service='Testing', mode='test')
     s3_bucket = "beep-input-data"
     obj = {
         'Key': 'd3Batt/raw/arbin/FastCharge_000002_CH2_Metadata.csv',
         'LastModified': datetime.datetime(2019, 4, 4, 23, 19, 20,
                                           tzinfo=tzutc()),
         'ETag': '"37677ae6b73034197d59cf3075f6fb98"',
         'Size': 615,
         'StorageClass': 'STANDARD',
         'Owner': {
             'DisplayName': 'it-admin+materials-admin',
             'ID': '02d8b24e2f66c2b5937f391b7c87406d4eeab68cf887bd9933d6631536959f24'
         }
     }
     retrigger_data = {
         "filename": obj['Key'],
         "bucket": s3_bucket,
         "size": obj['Size'],
         "hash": obj["ETag"].strip('\"')
     }
     response_valid = events.put_upload_retrigger_event(
         'complete', retrigger_data)
     assert response_valid['ResponseMetadata']['HTTPStatusCode'] == 200
Example no. 6
 def test_kinesis_put_service_event_stress(self):
     events = KinesisEvents(service="Testing", mode="test")
     for i in range(10):
         array = np.random.rand(5, 5, 3)
         print(array.tolist())
         response_valid = events.put_service_event(
             "Test", "starting", {"Stress test array": array.tolist()})
         assert response_valid["ResponseMetadata"]["HTTPStatusCode"] == 200
Example no. 7
 def test_get_file_size(self):
     events = KinesisEvents(service='Testing', mode='test')
     file_list = [os.path.join(TEST_FILE_DIR, "2017-05-09_test-TC-contact_CH33.csv"),
                  os.path.join(TEST_FILE_DIR, "2017-12-04_4_65C-69per_6C_CH29.csv"),
                  os.path.join(TEST_FILE_DIR, "xTESLADIAG_000019_CH70.070")]
     file_sizes = events.get_file_size(file_list)
     print(file_sizes)
     assert file_sizes[0] == 54620
     assert file_sizes[1] == 37878198
     assert file_sizes[2] == 3019440
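
The get_file_size helper exercised by this test is not shown; a plausible minimal implementation, assuming it simply maps os.path.getsize over the input paths (an assumption, not confirmed by the source), would be:

import os

def get_file_size(file_list):
    # Return the on-disk size in bytes of each file, in input order
    return [os.path.getsize(path) for path in file_list]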
Example no. 8
 def test_kinesis_put_validation_event(self):
     events = KinesisEvents(service='Testing', mode='test')
     file_list = [os.path.join(TEST_FILE_DIR, "2017-12-04_4_65C-69per_6C_CH29.csv")]
     file_list_data = {"run_list": [24]}
     validity = ["valid"]
     messages = [{'comment': '',
                  'error': ''}]
     output_json = {'file_list': file_list, 'run_list': file_list_data['run_list'],
                    'validity': validity, 'message_list': messages}
     response_valid = events.put_validation_event(output_json, 'complete')
     assert response_valid['ResponseMetadata']['HTTPStatusCode'] == 200
Example no. 9
    def test_kinesis_put_service_event(self):
        events = KinesisEvents(service='Testing', mode='test')
        response_valid = events.put_service_event('Test', 'starting',
                                                  {"String": "test"})
        assert response_valid['ResponseMetadata']['HTTPStatusCode'] == 200

        # A payload that is not JSON-serializable (a raw np.ndarray) should raise
        self.assertRaises(TypeError, events.put_service_event,
                          'Test', 'starting', np.array([1, 2, 3]))
        # Test list variable type
        response_valid = events.put_service_event('Test', 'starting',
                                                  {"List": [1, 2, 3]})
        assert response_valid['ResponseMetadata']['HTTPStatusCode'] == 200
        # Test float variable type
        response_valid = events.put_service_event('Test', 'starting',
                                                  {"Float": 1238.1231234})
        assert response_valid['ResponseMetadata']['HTTPStatusCode'] == 200
        # Test np.array variable type
        response_valid = events.put_service_event(
            'Test', 'starting', {"Array": np.random.rand(10, 10).tolist()})
        assert response_valid['ResponseMetadata']['HTTPStatusCode'] == 200
        # Test dictionary variable type
        response_valid = events.put_service_event('Test', 'starting',
                                                  {"Dict": {
                                                      "key": "value"
                                                  }})
        assert response_valid['ResponseMetadata']['HTTPStatusCode'] == 200
Example no. 10
 def test_kinesis_put_validation_event(self):
     events = KinesisEvents(service="Testing", mode="test")
     file_list = [
         os.path.join(TEST_FILE_DIR, "2017-12-04_4_65C-69per_6C_CH29.csv")
     ]
     file_list_data = {"run_list": [24]}
     validity = ["valid"]
     messages = [{"comment": "", "error": ""}]
     output_json = {
         "file_list": file_list,
         "run_list": file_list_data["run_list"],
         "validity": validity,
         "message_list": messages,
     }
     response_valid = events.put_validation_event(output_json, "complete")
     assert response_valid["ResponseMetadata"]["HTTPStatusCode"] == 200
Example no. 11
    def test_kinesis_put_structuring_event(self):
        events = KinesisEvents(service='Testing', mode='test')
        processed_file_list = [os.path.join(TEST_FILE_DIR, "2017-06-30_2C-10per_6C_CH10_structure.json")]
        processed_run_list = [24]
        processed_result_list = ["success"]
        processed_message_list = [{'comment': '',
                                   'error': ''}]
        invalid_file_list = []
        output_json = {"file_list": processed_file_list,
                       "run_list": processed_run_list,
                       "result_list": processed_result_list,
                       "message_list": processed_message_list,
                       "invalid_file_list": invalid_file_list}

        response_valid = events.put_structuring_event(output_json, 'complete')
        assert response_valid['ResponseMetadata']['HTTPStatusCode'] == 200
Example no. 12
    def test_kinesis_put_generate_event(self):
        events = KinesisEvents(service='Testing', mode='test')

        all_output_files = \
            ['/data-share/protocols/procedures/name_000000.000',
             '/data-share/protocols/procedures/name_000007.000',
             '/data-share/protocols/procedures/name_000014.000']
        result = 'success'
        message = {'comment': '', 'error': ''}

        output_data = {
            "file_list": all_output_files,
            "result": result,
            "message": message
        }

        response_valid = events.put_generate_event(output_data, 'complete')
        assert response_valid['ResponseMetadata']['HTTPStatusCode'] == 200
Example no. 13
def scan(config):
    print("scanning")
    s3 = boto3.client("s3")
    all_objects = s3.list_objects_v2(Bucket=S3_BUCKET_IN,
                                     Prefix=config.s3_prefix)

    objects = [obj for obj in all_objects['Contents'] if obj['Size'] > 1000]

    objects = [
        obj for obj in objects if "PredictionDiagnostics" in obj['Key']
        and "x" not in obj['Key'] and "Complete" not in obj['Key']
        # and obj['LastModified'] < datetime.datetime(2020, 3, 24, 5, 35, 43, tzinfo=tzutc())
        # and "_000175_" in obj['Key']
    ]

    old_objects = []
    old = datetime.datetime.now(pytz.utc) - datetime.timedelta(hours=6)
    for obj in objects:
        name = config.s3_output + '/' + get_structure_name(obj)
        structure_objects = s3.list_objects_v2(Bucket=S3_BUCKET_OUT,
                                               Prefix=name)
        # print(structure_objects)
        if 'Contents' in structure_objects.keys() and len(
                structure_objects['Contents']) == 1:
            if structure_objects['Contents'][0]['LastModified'] < old:
                old_objects.append(obj)
        else:
            old_objects.append(obj)

    objects = old_objects
    print(len(objects))

    events = KinesisEvents(service='S3Syncer', mode=config.mode)
    objects.reverse()
    for obj in objects:
        retrigger_data = {
            "filename": obj['Key'],
            "bucket": S3_BUCKET_IN,
            "size": obj['Size'],
            "hash": obj["ETag"].strip('\"')
        }
        events.put_upload_retrigger_event('complete', retrigger_data)
        print(retrigger_data)
        time.sleep(0.1)
Example no. 14
    def test_kinesis_put_generate_event(self):
        events = KinesisEvents(service="Testing", mode="test")

        all_output_files = [
            "/data-share/protocols/procedures/name_000000.000",
            "/data-share/protocols/procedures/name_000007.000",
            "/data-share/protocols/procedures/name_000014.000",
        ]
        result = "success"
        message = {"comment": "", "error": ""}

        output_data = {
            "file_list": all_output_files,
            "result": result,
            "message": message,
        }

        response_valid = events.put_generate_event(output_data, "complete")
        assert response_valid["ResponseMetadata"]["HTTPStatusCode"] == 200
Example no. 15
def process_csv_file_list_from_json(
    file_list_json, processed_dir="data-share/protocols/"
):
    """

    Args:
        file_list_json (str):
        processed_dir (str):

    Returns:
        str:
    """
    # Get file list from json; if the string ends with .json,
    # assume it's a file, otherwise treat it as a json string
    if file_list_json.endswith(".json"):
        file_list_data = loadfn(file_list_json)
    else:
        file_list_data = json.loads(file_list_json)

    # Setup Events
    events = KinesisEvents(service="ProtocolGenerator", mode=file_list_data["mode"])
    outputs = WorkflowOutputs()

    file_list = file_list_data["file_list"]
    all_output_files = []
    protocol_dir = os.path.join(
        os.environ.get("BEEP_PROCESSING_DIR", "/"), processed_dir
    )
    for filename in file_list:
        output_files, result, message = generate_protocol_files_from_csv(
            filename, output_directory=protocol_dir
        )
        all_output_files.extend(output_files)

    output_data = {"file_list": all_output_files, "result": result, "message": message}

    events.put_generate_event(output_data, "complete")

    # Workflow outputs
    outputs.put_generate_outputs_list(output_data, "complete")

    return json.dumps(output_data)
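
A minimal sketch of the JSON payload this function consumes, covering only the keys it reads above ("mode" and "file_list"); the CSV path is a hypothetical placeholder:

import json

file_list_json = json.dumps({
    "mode": "test",
    "file_list": ["/data-share/raw/parameters/protocol_params.csv"],  # placeholder path
})
# process_csv_file_list_from_json(file_list_json)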
Example no. 16
    def test_kinesis_put_structuring_event(self):
        events = KinesisEvents(service="Testing", mode="test")
        processed_file_list = [
            os.path.join(TEST_FILE_DIR,
                         "2017-06-30_2C-10per_6C_CH10_structure.json")
        ]
        processed_run_list = [24]
        processed_result_list = ["success"]
        processed_message_list = [{"comment": "", "error": ""}]
        invalid_file_list = []
        output_json = {
            "file_list": processed_file_list,
            "run_list": processed_run_list,
            "result_list": processed_result_list,
            "message_list": processed_message_list,
            "invalid_file_list": invalid_file_list,
        }

        response_valid = events.put_structuring_event(output_json, "complete")
        assert response_valid["ResponseMetadata"]["HTTPStatusCode"] == 200
Example no. 17
    def test_kinesis_put_analyzing_event(self):
        events = KinesisEvents(service="Testing", mode="test")
        processed_paths_list = [
            os.path.join(TEST_FILE_DIR,
                         "2017-12-04_4_65C-69per_6C_features.json")
        ]
        processed_run_list = [24]
        processed_result_list = ["success"]
        processed_message_list = [{"comment": "", "error": ""}]

        output_data = {
            "file_list": processed_paths_list,
            "run_list": processed_run_list,
            "result_list": processed_result_list,
            "message_list": processed_message_list,
        }

        response_valid = events.put_analyzing_event(output_data, "featurizing",
                                                    "complete")
        assert response_valid["ResponseMetadata"]["HTTPStatusCode"] == 200

        processed_paths_list = [
            os.path.join(TEST_FILE_DIR,
                         "2017-12-04_4_65C-69per_6C_predictions.json")
        ]
        processed_run_list = [24]
        processed_result_list = ["success"]
        processed_message_list = [{"comment": "", "error": ""}]

        output_data = {
            "file_list": processed_paths_list,
            "run_list": processed_run_list,
            "result_list": processed_result_list,
            "message_list": processed_message_list,
        }

        response_valid = events.put_analyzing_event(output_data, "predicting",
                                                    "complete")
        assert response_valid["ResponseMetadata"]["HTTPStatusCode"] == 200
Example no. 18
def validate_file_list_from_json(file_list_json,
                                 record_results=False,
                                 skip_existing=False,
                                 validator_class=SimpleValidator):
    """
    Validates a list of files from json input

    Args:
        file_list_json (str): input for validation files, should be a json string
            with attribute "file_list" or a filename (e. g. something.json)
            corresponding to a json object with a similar attribute.
        record_results (bool): Whether to record the validation results locally
            or not (defaults to False).
        skip_existing (bool): Whether to skip already validated files. This
            is done by checking if the file is in the validation_records.
            skip_existing only matters if record_results is True. (defaults to False)
        validator_class (ValidatorBeep or SimpleValidator): validator class
            to use in validation.

    Returns:
        str: json dump of the validator results.

    """
    # Process input json
    if file_list_json.endswith(".json"):
        file_list_data = loadfn(file_list_json)
    else:
        file_list_data = json.loads(file_list_json)

    # Setup Events
    events = KinesisEvents(service='DataValidator',
                           mode=file_list_data['mode'])

    file_list = file_list_data['file_list']

    validator = validator_class()
    all_results = validator.validate_from_paths(
        file_list,
        record_results=record_results,
        skip_existing=skip_existing,
    )

    # Get validities and recast to strings (valid/invalid) based on result
    validity = [
        all_results[os.path.split(file)[-1]]['validated'] for file in file_list
    ]

    validity = list(map(lambda x: 'valid' if x else 'invalid', validity))

    # Get errors
    errors = [
        all_results[os.path.split(file)[-1]]['errors'] for file in file_list
    ]
    messages = [{'comment': '', 'error': error} for error in errors]
    output_json = {
        'file_list': file_list,
        'run_list': file_list_data['run_list'],
        'validity': validity,
        'message_list': messages
    }

    events.put_validation_event(output_json, 'complete')

    return json.dumps(output_json)
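
For reference, a minimal input sketch for validate_file_list_from_json, built from the keys the function reads ("mode", "file_list", "run_list"); the file path and run id are placeholders:

import json

file_list_json = json.dumps({
    "mode": "test",
    "file_list": ["/path/to/2017-12-04_4_65C-69per_6C_CH29.csv"],  # placeholder path
    "run_list": [24],
})
# validate_file_list_from_json(file_list_json, record_results=False)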
Example no. 19
def validate_file_list_from_json(
    file_list_json,
    record_results=False,
    skip_existing=False,
    validator_class=SimpleValidator,
):
    """
    Validates a list of files from json input

    Args:
        file_list_json (str): input for validation files, should be a json string
            with attribute "file_list" or a filename (e. g. something.json)
            corresponding to a json object with a similar attribute.
        record_results (bool): Whether to record the validation results locally
            or not (defaults to False).
        skip_existing (bool): Whether to skip already validated files. This
            is done by checking if the file is in the validation_records.
            skip_existing only matters if record_results is True. (defaults to False)
        validator_class (ValidatorBeep or SimpleValidator): validator class
            to use in validation.

    Returns:
        str: json dump of the validator results.

    """
    # Process input json
    if file_list_json.endswith(".json"):
        file_list_data = loadfn(file_list_json)
    else:
        file_list_data = json.loads(file_list_json)

    # Setup Events
    events = KinesisEvents(service="DataValidator",
                           mode=file_list_data["mode"])
    outputs = WorkflowOutputs()

    file_list = file_list_data["file_list"]

    validator = validator_class()
    all_results = validator.validate_from_paths(
        file_list,
        record_results=record_results,
        skip_existing=skip_existing,
    )

    # Get validities and recast to strings (valid/invalid) based on result
    validity = [
        all_results[os.path.split(file)[-1]]["validated"] for file in file_list
    ]

    validity = list(map(lambda x: "valid" if x else "invalid", validity))

    # Get errors
    errors = [
        all_results[os.path.split(file)[-1]]["errors"] for file in file_list
    ]
    messages = [{"comment": "", "error": error} for error in errors]
    output_json = {
        "file_list": file_list,
        "run_list": file_list_data["run_list"],
        "validity": validity,
        "message_list": messages,
    }

    events.put_validation_event(output_json, "complete")

    # Workflow outputs
    file_list_size = len(output_json["file_list"])
    if file_list_size != 1:
        logger.warning("%d files being validated, should be 1", file_list_size)

    output_data = {
        "filename": output_json["file_list"][0],
        "run_id": output_json["run_list"][0],
        "result": output_json["validity"][0],
    }

    outputs.put_workflow_outputs(output_data, "validating")

    return json.dumps(output_json)
Example no. 20
 def test_kinesis_put_basic_event(self):
     events = KinesisEvents(service='Testing', mode='test')
     response = events.put_basic_event('test_events',
                                       'This is a basic event test')
     assert response['ResponseMetadata']['HTTPStatusCode'] == 200
Example no. 21
 def test_kinesis_put_basic_event(self):
     events = KinesisEvents(service="Testing", mode="test")
     response = events.put_basic_event("test_events",
                                       "This is a basic event test")
     assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
Example no. 22
def process_file_list_from_json(file_list_json,
                                processed_dir='data-share/features/',
                                features_label='full_model',
                                predict_only=False,
                                prediction_type="multi",
                                predicted_quantity="cycle"):
    """
    Function to take a json file containing processed cycler run file locations,
    extract features, dump the processed file into a predetermined directory,
    and return a jsonable dict of feature file locations.

    Args:
        file_list_json (str): json string or json filename corresponding
            to a dictionary with a file_list attribute,
            if this string ends with ".json", a json file is assumed
            and loaded, otherwise interpreted as a json string.
        processed_dir (str): location for processed cycler run output files
            to be placed.
        features_label (str): name of feature generation method.
        predict_only (bool): whether to calculate predictions or not.
        prediction_type (str): Single or multi-point predictions.
        predicted_quantity (str): quantity being predicted - cycle or capacity.

    Returns:
        str: json string of feature files (with key "file_list").

    """
    # Get file list from json; if the string ends with .json,
    # assume it's a file, otherwise treat it as a json string
    if file_list_json.endswith(".json"):
        file_list_data = loadfn(file_list_json)
    else:
        file_list_data = json.loads(file_list_json)

    # Setup Events
    events = KinesisEvents(service='DataAnalyzer', mode=file_list_data['mode'])

    # Add root path to processed_dir
    processed_dir = os.path.join(os.environ.get("BEEP_ROOT", "/"),
                                 processed_dir)
    file_list = file_list_data['file_list']
    run_ids = file_list_data['run_list']
    processed_run_list = []
    processed_result_list = []
    processed_message_list = []
    processed_paths_list = []

    required_cycle_num = 100  # for full model

    for path, run_id in zip(file_list, run_ids):
        logger.info('run_id=%s featurizing=%s', str(run_id), path, extra=s)

        # Check whether there is enough data to attempt featurization
        if len(loadfn(path).summary) <= required_cycle_num:
            logger.info("run_id=%s Insufficient data for featurization",
                        str(run_id),
                        extra=s)
            processed_paths_list.append(path)
            processed_run_list.append(run_id)
            processed_result_list.append("incomplete")
            processed_message_list.append({
                'comment': 'Insufficient data for featurization',
                'error': ''
            })

        else:
            processed_data = DegradationPredictor.from_processed_cycler_run_file(
                path,
                features_label=features_label,
                predict_only=predict_only,
                prediction_type=prediction_type,
                predicted_quantity=predicted_quantity)
            new_filename = os.path.basename(path)
            new_filename = scrub_underscore_suffix(new_filename)

            # Append model_name along with "features" to demarcate
            # different models when saving the feature vectors.
            new_filename = add_suffix_to_filename(
                new_filename,
                "_" + features_label + "_" + prediction_type + "_features")
            processed_path = os.path.join(processed_dir, new_filename)
            processed_path = os.path.abspath(processed_path)
            dumpfn(processed_data, processed_path)
            processed_paths_list.append(processed_path)
            processed_run_list.append(run_id)
            processed_result_list.append("success")
            processed_message_list.append({'comment': '', 'error': ''})

    output_data = {
        "file_list": processed_paths_list,
        "run_list": processed_run_list,
        "result_list": processed_result_list,
        "message_list": processed_message_list
    }

    events.put_analyzing_event(output_data, 'featurizing', 'complete')
    # Return jsonable file list
    return json.dumps(output_data)
Example no. 23
def process_file_list_from_json(file_list_json,
                                processed_dir='data-share/features/',
                                features_label='full_model',
                                predict_only=False,
                                prediction_type="multi",
                                predicted_quantity="cycle"):
    """
    Function to take a json file containing processed cycler run file locations,
    extract features, dump the processed file into a predetermined directory,
    and return a jsonable dict of feature file locations.

    Args:
        file_list_json (str): json string or json filename corresponding
            to a dictionary with a file_list attribute,
            if this string ends with ".json", a json file is assumed
            and loaded, otherwise interpreted as a json string.
        processed_dir (str): location for processed cycler run output files
            to be placed.
        features_label (str): name of feature generation method.
        predict_only (bool): whether to calculate predictions or not.
        prediction_type (str): Single or multi-point predictions.
        predicted_quantity (str): quantity being predicted - cycle or capacity.

    Returns:
        str: json string of feature files (with key "file_list").

    """
    # Get file list from json; if the string ends with .json,
    # assume it's a file, otherwise treat it as a json string
    if file_list_json.endswith(".json"):
        file_list_data = loadfn(file_list_json)
    else:
        file_list_data = json.loads(file_list_json)

    # Setup Events
    events = KinesisEvents(service='DataAnalyzer', mode=file_list_data['mode'])

    # Add root path to processed_dir
    processed_dir = os.path.join(os.environ.get("BEEP_ROOT", "/"),
                                 processed_dir)
    file_list = file_list_data['file_list']
    run_ids = file_list_data['run_list']
    processed_run_list = []
    processed_result_list = []
    processed_message_list = []
    processed_paths_list = []

    for path, run_id in zip(file_list, run_ids):
        logger.info('run_id=%s featurizing=%s', str(run_id), path, extra=s)
        processed_cycler_run = loadfn(path)

        featurizer_classes = [DeltaQFastCharge, TrajectoryFastCharge]
        for featurizer_class in featurizer_classes:
            featurizer = featurizer_class.from_run(path, processed_dir,
                                                   processed_cycler_run)
            if featurizer:
                dumpfn(featurizer, featurizer.name)
                processed_paths_list.append(featurizer.name)
                processed_run_list.append(run_id)
                processed_result_list.append("success")
                processed_message_list.append({'comment': '', 'error': ''})
                logger.info('Successfully generated %s',
                            featurizer.name,
                            extra=s)
            else:
                processed_paths_list.append(path)
                processed_run_list.append(run_id)
                processed_result_list.append("incomplete")
                processed_message_list.append({
                    'comment':
                    'Insufficient or incorrect data for featurization',
                    'error': ''
                })
                logger.info('Unable to featurize %s', path, extra=s)

    output_data = {
        "file_list": processed_paths_list,
        "run_list": processed_run_list,
        "result_list": processed_result_list,
        "message_list": processed_message_list
    }

    events.put_analyzing_event(output_data, 'featurizing', 'complete')
    # Return jsonable file list
    return json.dumps(output_data)
Example no. 24
def process_file_list_from_json(file_list_json, model_dir="/data-share/models/",
                                processed_dir='data-share/predictions/',
                                hyperparameters=None, model_name=None, predict_only=True):
    """
    Function to take a json file containing featurized json locations,
    train a new model if necessary, write files containing predictions into a
    predetermined directory, and return a jsonable dict of prediction file locations

    Args:
        file_list_json (str): json string or json filename corresponding
            to a dictionary with a file_list attribute,
            if this string ends with ".json", a json file is assumed
            and loaded, otherwise interpreted as a json string
        model_dir (str): location where models are serialized and stored
        processed_dir (str): location for processed cycler run output files
            to be placed
        hyperparameters (dict): dictionary of hyperparameters to optimize/use for training
        model_name (str): name of feature generation method
        predict_only (bool): if True, use a pre-trained serialized model to
            predict only; if False, train a new model first.

    Returns:
        str: json string of feature files (with key "feature_file_list").

    """
    # Get file list from json; if the string ends with .json,
    # assume it's a file, otherwise treat it as a json string
    if file_list_json.endswith(".json"):
        file_list_data = loadfn(file_list_json)
    else:
        file_list_data = json.loads(file_list_json)

    # Setup Events
    events = KinesisEvents(service='DataAnalyzer', mode=file_list_data['mode'])

    # Add BEEP_ROOT to processed_dir
    processed_dir = os.path.join(os.environ.get("BEEP_ROOT", "/"),
                                 processed_dir)
    file_list = file_list_data['file_list']
    run_ids = file_list_data['run_list']
    processed_run_list = []
    processed_result_list = []
    processed_message_list = []
    processed_paths_list = []
    project_name = get_project_name_from_list(file_list)
    if predict_only:
        features = loadfn(file_list[0])
        if model_name is None and project_name in DEFAULT_MODEL_PROJECTS:

            if features.prediction_type == 'multi':
                model = DegradationModel.from_serialized_model(model_dir=model_dir,
                                                               serialized_model='d3batt_multi_point.model')
            else:
                model = DegradationModel.from_serialized_model(model_dir=model_dir,
                                                               serialized_model='d3batt_single_point.model')

        elif model_name is None and project_name not in DEFAULT_MODEL_PROJECTS:
            output_data = {"file_list": [],
                           "run_list": [],
                           "result_list": [],
                           "message_list": []
                           }

            events.put_analyzing_event(output_data, 'predicting', 'error')

            # Return jsonable file list
            return json.dumps(output_data)

        else:
            model = DegradationModel.from_serialized_model(model_dir=model_dir,
                                                           serialized_model=model_name)

    else:
        if hyperparameters is None:
            hyperparameters = {'random_state': 1,
                               'test_size': .3,
                               'k_fold': 5,
                               'tol': 0.001,
                               'l1_ratio': [.1, .5, .7, .9, .95, .99, 1]
                               }

        dataset_id = file_list_data.get("dataset_id")
        model = DegradationModel.train(file_list_json, dataset_id=dataset_id,
                                       model_type='linear', regularization_type='elasticnet',
                                       model_name=model_name, hyperparameters=hyperparameters)
        logger.warning('fitting=%s dataset=%s', model.name, str(dataset_id), extra=s)

    for path, run_id in zip(file_list, run_ids):
        logger.info('model=%s run_id=%s predicting=%s', model.name, str(run_id), path, extra=s)
        features = loadfn(path)
        prediction = model.predict(features)
        prediction_dict = model.prediction_to_dict(prediction, features.nominal_capacity)
        new_filename = os.path.basename(path)
        new_filename = scrub_underscore_suffix(new_filename)
        new_filename = add_suffix_to_filename(new_filename, "_predictions")
        processed_path = os.path.join(processed_dir, new_filename)
        processed_path = os.path.abspath(processed_path)
        dumpfn(prediction_dict, processed_path)

        # Append file loc to list to be returned
        processed_paths_list.append(processed_path)
        processed_run_list.append(run_id)
        processed_result_list.append("success")
        processed_message_list.append({'comment': '',
                                       'error': ''})

    output_data = {"file_list": processed_paths_list,
                   "run_list": processed_run_list,
                   "result_list": processed_result_list,
                   "message_list": processed_message_list
                   }

    events.put_analyzing_event(output_data, 'predicting', 'complete')

    # Return jsonable file list
    return json.dumps(output_data)
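
A minimal sketch of how the training branch (predict_only=False) might be invoked, using only the keys the function reads above ("mode", "file_list", "run_list", and the optional "dataset_id"); the path, run id, and dataset id are placeholders:

import json

file_list_json = json.dumps({
    "mode": "test",
    "file_list": ["/data-share/features/PredictionDiagnostics_000128_features.json"],  # placeholder
    "run_list": [128],
    "dataset_id": "d3batt_example",  # optional; only read when training
})
# process_file_list_from_json(file_list_json, predict_only=False)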