def test_text_pipe(self):
    Insert = extract.Insert(server_name, driver_name, database_name)
    customer_id = 15
    sel = sqlalchemy.select([Points]).where(
        Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(
        drop_attributes=DROP_ATTRIBUTES,
        nan_replace_dict=NAN_REPLACE_DICT,
        dtype_dict=TYPE_DICT,
        unit_dict=UNIT_DICT,
        dupe_cols=DUPE_COLS,
        remove_dupe=REMOVE_DUPE,
        replace_numbers=REPLACE_NUMBERS,
        remove_virtual=REMOVE_VIRTUAL,
        text_clean_attributes=TEXT_CLEAN_ATTRS)

    # Create pipeline specifically for clustering text features
    name_vocabulary = VocabularyText.read_vocabulary_disc(
        POINTNAME_VOCABULARY_FILENAME)
    name_text_pipe = Transform.text_pipeline_label(
        attributes=['NAME'],
        vocabulary=name_vocabulary)

    full_pipeline = Pipeline([
        ('clean_pipe', clean_pipe),
        ('text_pipe', name_text_pipe),
    ])
    dataset = full_pipeline.fit_transform(dataset_raw)

    return None
def test_numeric_pipe(self):
    Insert = extract.Insert(server_name, driver_name, database_name)
    customer_id = 15
    sel = sqlalchemy.select([Points]).where(
        Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(
        drop_attributes=DROP_ATTRIBUTES,
        nan_replace_dict=NAN_REPLACE_DICT,
        dtype_dict=TYPE_DICT,
        unit_dict=UNIT_DICT,
        dupe_cols=DUPE_COLS,
        remove_dupe=REMOVE_DUPE,
        replace_numbers=REPLACE_NUMBERS,
        remove_virtual=REMOVE_VIRTUAL,
        text_clean_attributes=TEXT_CLEAN_ATTRS)

    numeric_pipe = Transform.numeric_pipeline(
        numeric_attributes=NUM_ATTRIBUTES)

    df_clean = clean_pipe.fit_transform(dataset_raw)
    df_numeric = numeric_pipe.fit_transform(df_clean)

    return None
def test_read_categories(self):
    # Initialize
    categories = Transform._read_categories(CATEGORICAL_ATTRIBUTES,
                                            CATEGORIES_FILE)
    replaceNone = ReplaceNone(CATEGORICAL_ATTRIBUTES)
    dataFrameSelector = DataFrameSelector(CATEGORICAL_ATTRIBUTES)
    oneHotEncoder = OneHotEncoder(categories=categories,
                                  handle_unknown='ignore')

    # Get raw data from the database
    Insert = extract.Insert(server_name, driver_name, database_name)
    customer_id = 15
    sel = sqlalchemy.select([Points]).where(
        Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    clean_pipe = Transform.cleaning_pipeline(
        drop_attributes=DROP_ATTRIBUTES,
        nan_replace_dict=NAN_REPLACE_DICT,
        dtype_dict=TYPE_DICT,
        unit_dict=UNIT_DICT,
        dupe_cols=DUPE_COLS,
        remove_dupe=REMOVE_DUPE,
        replace_numbers=REPLACE_NUMBERS,
        remove_virtual=REMOVE_VIRTUAL,
        text_clean_attributes=TEXT_CLEAN_ATTRS)
    df_clean1 = clean_pipe.fit_transform(dataset_raw)

    # Transform
    df0 = replaceNone.fit_transform(df_clean1)
    df1_array = dataFrameSelector.fit_transform(df0)
    ohearray = oneHotEncoder.fit_transform(df1_array).toarray()

    return None
def test_categorical_pipe(self):
    Insert = extract.Insert(server_name, driver_name, database_name)
    customer_id = 15
    sel = sqlalchemy.select([Points]).where(
        Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(
        drop_attributes=DROP_ATTRIBUTES,
        nan_replace_dict=NAN_REPLACE_DICT,
        dtype_dict=TYPE_DICT,
        unit_dict=UNIT_DICT,
        dupe_cols=DUPE_COLS,
        remove_dupe=REMOVE_DUPE,
        replace_numbers=REPLACE_NUMBERS,
        remove_virtual=REMOVE_VIRTUAL,
        text_clean_attributes=TEXT_CLEAN_ATTRS)

    categorical_pipe = Transform.categorical_pipeline(
        categorical_attributes=CATEGORICAL_ATTRIBUTES,
        handle_unknown='ignore',
        categories_file=CATEGORIES_FILE)

    df_clean = clean_pipe.fit_transform(dataset_raw)
    ohe_array = categorical_pipe.fit_transform(df_clean).toarray()
    print("Example OneHotEncoded Array: ", ohe_array[0])

    # Inspect the categorical pipe
    ohe = categorical_pipe.named_steps['OneHotEncoder']
    print("Categories used for OneHotEncoder", ohe.categories)

    return None
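# Hedged follow-up sketch (not part of the original test): a one-hot row can be
# mapped back to readable category values with the fitted encoder's standard
# sklearn inverse_transform. `ohe` and `ohe_array` are the names used in
# test_categorical_pipe above.
#
#     ohe = categorical_pipe.named_steps['OneHotEncoder']
#     print(ohe.inverse_transform(ohe_array[:1]))  # original category values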
def test__transform_list(self):
    # Transform from a list of dictionaries
    input_data_list = self.input_data_list
    input_list = [x.__dict__ for x in input_data_list]
    dfraw = pd.DataFrame(input_list)
    bag = Transform.categorical_transform_pipeline_MIL()\
        .fit_transform(dfraw)

    # Transform from a list of pydantic models
    input_data_list_pydantic = self.input_data_list_pydantic
    input_list = [x.__dict__ for x in input_data_list_pydantic]
    dfraw = pd.DataFrame(input_list)
    bag = Transform.categorical_transform_pipeline_MIL()\
        .fit_transform(dfraw)

    return None
def get_single_mil_bag(self, pipeline='whole'):
    """Return a bag of commonly labeled data. Bags are defined in SQL
    Server in the Points table on the group_id foreign key"""

    # Retrieve a single unique bag label
    sql = """SELECT top(1) group_id
    FROM {}
    WHERE IsNumeric(group_id) = 1
    ORDER BY group_id ASC""".format(Points.__tablename__)
    sel = sqltext(sql)
    # List; remove a single item from the list
    group_ids = self.Insert.core_select_execute(sel)
    group_id = group_ids.pop().group_id

    # Create the pipeline
    if pipeline == 'whole':
        full_pipeline = Transform.numeric_transform_pipeline_MIL()
    elif pipeline == 'categorical':
        full_pipeline = Transform.categorical_transform_pipeline_MIL()
    else:
        raise ValueError('pipeline must be one of ["whole","categorical"]')

    # Retrieve the bag label for the group_id
    sel = sqlalchemy.select([Labeling]).where(Labeling.id.__eq__(group_id))
    with self.Insert.engine.connect() as connection:
        res = connection.execute(sel)
        label = res.fetchone().bag_label

    # Load the dataset
    sel = sqlalchemy.select([Points]).where(
        Points.group_id.__eq__(group_id))
    dfraw = self.Insert.pandas_select_execute(sel)

    # Transform the dataset
    try:
        bag = full_pipeline.fit_transform(dfraw)
    except ValueError as e:
        print('Transform error, Skipped Group ID : ', group_id)
        traceback.print_exc()
        print(dfraw)
        raise e

    # Validate the cleaned dataset
    if not self.validate_bag(bag):
        print("Invalid cleaned bag:\n")
        print(bag)

    return dfraw, bag, label
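# Hedged usage sketch (illustrative, not from the original source): assumes an
# instance of the enclosing class, here called `loader`, constructed with a
# working `Insert` database connection.
#
#     dfraw, bag, label = loader.get_single_mil_bag(pipeline='whole')
#     print("Raw points:", dfraw.shape)   # untransformed rows for the group
#     print("Bag matrix:", bag.shape)     # (n_instances, n_features)
#     print("Bag label:", label)          # single label shared by the bag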
def test_calc_categories_dict(self):
    # Generate data to find categories
    Insert = extract.Insert(server_name, driver_name, database_name)
    sel = sqlalchemy.select([Points])
    dataset_raw = Insert.pandas_select_execute(sel)

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(
        drop_attributes=DROP_ATTRIBUTES,
        nan_replace_dict=NAN_REPLACE_DICT,
        dtype_dict=TYPE_DICT,
        unit_dict=UNIT_DICT,
        dupe_cols=DUPE_COLS,
        remove_dupe=REMOVE_DUPE,
        replace_numbers=REPLACE_NUMBERS,
        remove_virtual=REMOVE_VIRTUAL,
        text_clean_attributes=TEXT_CLEAN_ATTRS)
    string_pipe = SetDtypes(
        type_dict={
            'TYPE': str,
            'ALARMTYPE': str,
            'FUNCTION': str,
            'VIRTUAL': str,
            'CS': str,
            'SENSORTYPE': str,
            'DEVUNITS': str
        })
    categories_clean_pipe = Pipeline([('clean_pipe', clean_pipe),
                                      ('string_pipe', string_pipe)])
    df_clean = categories_clean_pipe.fit_transform(dataset_raw)

    # Calculate categories to be used later
    Encoding = EncodingCategories()
    columns = [
        'TYPE', 'ALARMTYPE', 'FUNCTION', 'VIRTUAL', 'CS', 'SENSORTYPE',
        'DEVUNITS'
    ]
    categories_dict_calc = Encoding.calc_categories_dict(df_clean, columns)

    if not os.path.exists(CATEGORIES_FILE):
        raise OSError("Categories file not found: {}".format(CATEGORIES_FILE))

    # Compare categories read from disc to those calculated above
    categories_dict_read = Encoding.read_categories_from_disc(
        CATEGORIES_FILE)
    for key in set(
            (*categories_dict_calc.keys(), *categories_dict_read.keys())):
        self.assertEqual(set(categories_dict_calc[key]),
                         set(categories_dict_read[key]))

    return None
def test__transform_data(self):
    # The loaded bag (self.bag_load) should match the manually
    # transformed bag
    bag_manual = Transform.numeric_transform_pipeline_MIL()\
        .fit_transform(self.dfraw_load)
    self.assertTrue(
        np.equal(bag_manual.toarray(), self.bag_load.toarray()).all())

    # Transform raw data (from input class)
    bag_input = Transform.numeric_transform_pipeline_MIL().fit_transform(
        self.dfraw_input)
    msg = ("The transformed bag has {} features, and the predictor " +
           "has {} features")
    print(
        msg.format(bag_input.shape[1],
                   self.predictor.classifier.n_features_in_))
    self.assertEqual(bag_input.shape[1],
                     self.predictor.classifier.n_features_in_)

    return None
def test__transform_data_pydantic(self):
    # Transform raw data (from input class)
    bag_input_pydantic = Transform.numeric_transform_pipeline_MIL()\
        .fit_transform(self.dfraw_input_pydantic)
    msg = ("The transformed bag has {} features, and the predictor " +
           "has {} features")
    print(
        msg.format(bag_input_pydantic.shape[1],
                   self.predictor.classifier.n_features_in_))
    self.assertEqual(bag_input_pydantic.shape[1],
                     self.predictor.classifier.n_features_in_)

    return None
def _load_pipeline(self, classifier_type: str) -> Pipeline:
    """Determine which pipeline to load based on the chosen classifier
    and the passed classifier_type
    inputs
    ------
    classifier_type: (str) one of ['numeric','categorical'] for numeric
        or categorical pipelines, depending on the requirements of the
        classifier. If 'numeric', Transform.numeric_transform_pipeline_MIL()
        is returned; if 'categorical',
        Transform.categorical_transform_pipeline_MIL() is returned"""
    if classifier_type == 'numeric':
        return Transform.numeric_transform_pipeline_MIL()
    elif classifier_type == 'categorical':
        return Transform.categorical_transform_pipeline_MIL()
    else:
        msg = ("classifier_type must be one of ['numeric','categorical']. " +
               "Got {}")
        raise ValueError(msg.format(classifier_type))
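# Hedged usage sketch: `_load_pipeline` simply dispatches on the string, so a
# caller (assumed here to be a BasePredictor-style instance named `predictor`;
# the names are illustrative) can do:
#
#     pipeline = predictor._load_pipeline(classifier_type='categorical')
#     bag = pipeline.fit_transform(dfraw)  # dfraw: raw points DataFrame
#
# Any other string raises ValueError.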
def __init__(self, classifier_filename: Union[str, bytes]):
    """inputs
    ------
    classifier_filename: (str) name of file for pickled sklearn
        classifier

    Example usage
    basePredictorL1 = BasePredictor(
        classifier_filename=SVMC_l1_classifier_filename)
    # Somehow, create some raw data
    input_data = RawInputData(
        # Required numeric attributes
        DEVICEHI=122.0,
        DEVICELO=32.0,
        SIGNALHI=10,
        SIGNALLO=0,
        SLOPE=1.2104,
        INTERCEPT=0.01,
        # Required categorical attributes
        TYPE="LAI",
        ALARMTYPE="Standard",
        FUNCTION="Value",
        VIRTUAL=0,
        CS="AE",
        SENSORTYPE="VOLTAGE",
        DEVUNITS="VDC",
        # Required text attributes
        NAME="SHLH.AHU-ED.RAT",
        DESCRIPTOR="RETURN TEMP",
        NETDEVID='test-value',
        SYSTEM='test-system'
        )
    # Load raw data
    dfraw_input = pd.DataFrame(data=[input_data])
    # Create predictions from raw input data
    results_l1 = basePredictorL1.predict(dfraw_input)
    """

    # Load classifier
    self.classifier = self._load_predictor(classifier_filename)
    # Load transform pipeline
    self.numeric_transform_pipeline_MIL = \
        Transform.numeric_transform_pipeline_MIL()
    # Load embedding class member
    self.MILESEmbedder = MILESEmbedding(CONCEPT_CLASS_FILENAME)

    return None
def bag_data_generator(self, pipeline, verbose=False):
    """Yield bags of commonly labeled data. Bags are defined in SQL
    Server in the Points table on the group_id foreign key"""

    # Retrieve all unique bag labels
    sql = """SELECT distinct group_id
    FROM {}
    WHERE IsNumeric(group_id) = 1
    ORDER BY group_id ASC""".format(Points.__tablename__)
    sel = sqltext(sql)
    group_ids = self.Insert.core_select_execute(sel)

    # Retrieve bag label for each group_id
    sql_bag = """SELECT id, bag_label
    FROM {}
    WHERE id = {}"""

    # Create the pipeline
    if pipeline == 'whole':
        full_pipeline = Transform.numeric_transform_pipeline_MIL()
    elif pipeline == 'categorical':
        full_pipeline = Transform.categorical_transform_pipeline_MIL()
    else:
        raise ValueError('pipeline must be one of ["whole","categorical"]')

    for row in group_ids:
        group_id = row.group_id
        sel = sqltext(sql_bag.format(Labeling.__tablename__, group_id))
        with self.Insert.engine.connect() as connection:
            res = connection.execute(sel)
            label = res.fetchone().bag_label

        # Load the dataset
        sel = sqlalchemy.select([Points]).where(
            Points.group_id.__eq__(group_id))
        dfraw = self.Insert.pandas_select_execute(sel)

        # Validate the raw dataset
        if not self.validate_bag(dfraw):
            continue

        # Transform the dataset
        try:
            bag = full_pipeline.fit_transform(dfraw)
        except ValueError as e:
            print('Transform error, Skipped Group ID : ', group_id)
            if verbose:
                traceback.print_exc()
                print(dfraw)
                x = input("Do you want to continue and discard this bag? : ")
                if x in ['y', 'yes', 'Y', 'Yes', 'True', 'TRUE']:
                    continue
                else:
                    raise e
            else:
                continue

        # Validate the cleaned dataset
        if not self.validate_bag(bag):
            continue

        yield bag, label
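# Hedged usage sketch: accumulating a training set by iterating the generator.
# Assumes an instance `loader` of the enclosing class; names are illustrative.
#
#     bags, labels = [], []
#     for bag, label in loader.bag_data_generator(pipeline='categorical',
#                                                 verbose=False):
#         bags.append(bag)
#         labels.append(label)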
def calc_save_categories_vocabulary():
    # Read raw data from database
    Insert = extract.Insert(server_name, driver_name, database_name)
    sel = sqlalchemy.select([Points])
    dataset_raw = Insert.pandas_select_execute(sel)

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(
        drop_attributes=DROP_ATTRIBUTES,
        nan_replace_dict=NAN_REPLACE_DICT,
        dtype_dict=TYPE_DICT,
        unit_dict=UNIT_DICT,
        dupe_cols=DUPE_COLS,
        remove_dupe=REMOVE_DUPE,
        replace_numbers=REPLACE_NUMBERS,
        remove_virtual=REMOVE_VIRTUAL,
        text_clean_attributes=TEXT_CLEAN_ATTRS)
    string_pipe = SetDtypes(
        type_dict={
            'TYPE': str,
            'ALARMTYPE': str,
            'FUNCTION': str,
            'VIRTUAL': str,
            'CS': str,
            'SENSORTYPE': str,
            'DEVUNITS': str
        })
    categories_clean_pipe = Pipeline([('clean_pipe', clean_pipe),
                                      ('string_pipe', string_pipe)])

    # Process raw data with pipeline
    df_clean = categories_clean_pipe.fit_transform(dataset_raw)

    # Calculate categories to be used later
    Encoding = EncodingCategories()
    columns = [
        'TYPE', 'ALARMTYPE', 'FUNCTION', 'VIRTUAL', 'CS', 'SENSORTYPE',
        'DEVUNITS'
    ]
    categories_dict_calc = Encoding.calc_categories_dict(df_clean, columns)

    # Save categories in a numpy array to be used later
    Encoding.save_categories_to_disc(categories_dict_calc, CATEGORIES_FILE)

    # Save vocabularies to file, one per categorical attribute
    VOCAB_ALARMTYPE_PATH = '../data/vocab_alarmtype.txt'
    save_numpy_string_array_to_text(categories_dict_calc['ALARMTYPE'],
                                    VOCAB_ALARMTYPE_PATH)
    VOCAB_CS_PATH = '../data/vocab_cs.txt'
    save_numpy_string_array_to_text(categories_dict_calc['CS'],
                                    VOCAB_CS_PATH)
    VOCAB_DEVUNITS_PATH = '../data/vocab_devunits.txt'
    save_numpy_string_array_to_text(categories_dict_calc['DEVUNITS'],
                                    VOCAB_DEVUNITS_PATH)
    VOCAB_FUNCTION_PATH = '../data/vocab_function.txt'
    save_numpy_string_array_to_text(categories_dict_calc['FUNCTION'],
                                    VOCAB_FUNCTION_PATH)
    VOCAB_SENSORTYPE_PATH = '../data/vocab_sensortype.txt'
    save_numpy_string_array_to_text(categories_dict_calc['SENSORTYPE'],
                                    VOCAB_SENSORTYPE_PATH)
    VOCAB_TYPE_PATH = '../data/vocab_type.txt'
    save_numpy_string_array_to_text(categories_dict_calc['TYPE'],
                                    VOCAB_TYPE_PATH)
    VOCAB_VIRTUAL_PATH = '../data/vocab_virtual.txt'
    save_numpy_string_array_to_text(categories_dict_calc['VIRTUAL'],
                                    VOCAB_VIRTUAL_PATH)

    return None
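# Hedged sketch of the `save_numpy_string_array_to_text` helper used above.
# Its real definition lives elsewhere in this repository; this is only an
# illustration, assuming it writes one category per line via numpy:
#
#     import numpy as np
#
#     def save_numpy_string_array_to_text(string_array, filename):
#         """Write a 1-D array of strings to disk, one value per line."""
#         np.savetxt(filename, np.asarray(string_array, dtype=str), fmt='%s')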