def test_text_pipe(self):

        Insert = extract.Insert(server_name, driver_name, database_name)

        customer_id = 15
        sel = sqlalchemy.select([Points]).where(
            Points.customer_id.__eq__(customer_id))
        dataset_raw = Insert.pandas_select_execute(sel)

        # Create 'clean' data processing pipeline
        clean_pipe = Transform.cleaning_pipeline(
            drop_attributes=DROP_ATTRIBUTES,
            nan_replace_dict=NAN_REPLACE_DICT,
            dtype_dict=TYPE_DICT,
            unit_dict=UNIT_DICT,
            dupe_cols=DUPE_COLS,
            remove_dupe=REMOVE_DUPE,
            replace_numbers=REPLACE_NUMBERS,
            remove_virtual=REMOVE_VIRTUAL,
            text_clean_attributes=TEXT_CLEAN_ATTRS)

        # Create pipeline specifically for clustering text features
        name_vocabulary = VocabularyText.read_vocabulary_disc(
            POINTNAME_VOCABULARY_FILENAME)
        name_text_pipe = Transform.text_pipeline_label(
            attributes=['NAME'], vocabulary=name_vocabulary)

        full_pipeline = Pipeline([
            ('clean_pipe', clean_pipe),
            ('text_pipe', name_text_pipe),
        ])

        dataset = full_pipeline.fit_transform(dataset_raw)

        return None

    def test_numeric_pipe(self):

        Insert = extract.Insert(server_name, driver_name, database_name)

        customer_id = 15
        sel = sqlalchemy.select([Points]).where(
            Points.customer_id.__eq__(customer_id))
        dataset_raw = Insert.pandas_select_execute(sel)

        # Create 'clean' data processing pipeline
        clean_pipe = Transform.cleaning_pipeline(
            drop_attributes=DROP_ATTRIBUTES,
            nan_replace_dict=NAN_REPLACE_DICT,
            dtype_dict=TYPE_DICT,
            unit_dict=UNIT_DICT,
            dupe_cols=DUPE_COLS,
            remove_dupe=REMOVE_DUPE,
            replace_numbers=REPLACE_NUMBERS,
            remove_virtual=REMOVE_VIRTUAL,
            text_clean_attributes=TEXT_CLEAN_ATTRS)

        numeric_pipe = Transform.numeric_pipeline(
            numeric_attributes=NUM_ATTRIBUTES)

        df_clean = clean_pipe.fit_transform(dataset_raw)
        df_numeric = numeric_pipe.fit_transform(df_clean)

        return None

    def test_read_categories(self):

        # Initialize
        categories = Transform._read_categories(CATEGORICAL_ATTRIBUTES,
                                                CATEGORIES_FILE)

        replaceNone = ReplaceNone(CATEGORICAL_ATTRIBUTES)
        dataFrameSelector = DataFrameSelector(CATEGORICAL_ATTRIBUTES)
        oneHotEncoder = OneHotEncoder(categories=categories,
                                      handle_unknown='ignore')

        # Get raw database
        Insert = extract.Insert(server_name, driver_name, database_name)
        customer_id = 15
        sel = sqlalchemy.select([Points]).where(
            Points.customer_id.__eq__(customer_id))
        dataset_raw = Insert.pandas_select_execute(sel)
        clean_pipe = Transform.cleaning_pipeline(
            drop_attributes=DROP_ATTRIBUTES,
            nan_replace_dict=NAN_REPLACE_DICT,
            dtype_dict=TYPE_DICT,
            unit_dict=UNIT_DICT,
            dupe_cols=DUPE_COLS,
            remove_dupe=REMOVE_DUPE,
            replace_numbers=REPLACE_NUMBERS,
            remove_virtual=REMOVE_VIRTUAL,
            text_clean_attributes=TEXT_CLEAN_ATTRS)
        df_clean1 = clean_pipe.fit_transform(dataset_raw)

        # Transform
        df0 = replaceNone.fit_transform(df_clean1)
        df1_array = dataFrameSelector.fit_transform(df0)
        ohearray = oneHotEncoder.fit_transform(df1_array).toarray()

        return None

    def test_categorical_pipe(self):

        Insert = extract.Insert(server_name, driver_name, database_name)

        customer_id = 15
        sel = sqlalchemy.select([Points]).where(
            Points.customer_id.__eq__(customer_id))
        dataset_raw = Insert.pandas_select_execute(sel)

        # Create 'clean' data processing pipeline
        clean_pipe = Transform.cleaning_pipeline(
            drop_attributes=DROP_ATTRIBUTES,
            nan_replace_dict=NAN_REPLACE_DICT,
            dtype_dict=TYPE_DICT,
            unit_dict=UNIT_DICT,
            dupe_cols=DUPE_COLS,
            remove_dupe=REMOVE_DUPE,
            replace_numbers=REPLACE_NUMBERS,
            remove_virtual=REMOVE_VIRTUAL,
            text_clean_attributes=TEXT_CLEAN_ATTRS)

        categorical_pipe = Transform.categorical_pipeline(
            categorical_attributes=CATEGORICAL_ATTRIBUTES,
            handle_unknown='ignore',
            categories_file=CATEGORIES_FILE)

        df_clean = clean_pipe.fit_transform(dataset_raw)
        ohe_array = categorical_pipe.fit_transform(df_clean).toarray()
        print("Example OneHotEcoded Array: ", ohe_array[0])

        # Inspect the fitted categorical pipe
        ohe = categorical_pipe.named_steps['OneHotEncoder']
        print("Categories used for OneHotEncoder: ", ohe.categories)

        return None
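Note: the snippet above prints the `categories` constructor argument; once the
encoder is fitted, scikit-learn also exposes the categories actually used via
the trailing-underscore attribute. A quick check on the same objects:

ohe = categorical_pipe.named_steps['OneHotEncoder']
# Fitted categories, one array per encoded column
print("Fitted categories: ", ohe.categories_)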
Example #5
    def test__transform_list(self):
        # Transform from a list of plain input-data objects (via __dict__)
        input_data_list = self.input_data_list
        input_list = [x.__dict__ for x in input_data_list]
        dfraw = pd.DataFrame(input_list)
        bag = Transform.categorical_transform_pipeline_MIL()\
            .fit_transform(dfraw)
        # Transform from a list of pydantic models (via __dict__)
        input_data_list_pydantic = self.input_data_list_pydantic
        input_list = [x.__dict__ for x in input_data_list_pydantic]
        dfraw = pd.DataFrame(input_list)
        bag = Transform.categorical_transform_pipeline_MIL()\
            .fit_transform(dfraw)

        return None

    def get_single_mil_bag(self, pipeline='whole'):
        """Return a bag of commonly labeled data
        Bags are defined in SQL Server in the Points table on the group_id
        Froeign Key"""
        
        # Retrieve a single unique bag label
        sql = """SELECT top(1) group_id
        FROM {}
        WHERE IsNumeric(group_id) = 1
        ORDER BY group_id ASC""".format(Points.__tablename__)
        sel = sqltext(sql)
        # core_select_execute returns a list; pop a single row from it
        group_ids = self.Insert.core_select_execute(sel)
        group_id = group_ids.pop().group_id
        # Create the pipeline
        if pipeline == 'whole':
            full_pipeline = Transform.numeric_transform_pipeline_MIL()
        elif pipeline == 'categorical':
            full_pipeline = Transform.categorical_transform_pipeline_MIL()
        else:
            raise ValueError('pipeline must be one of ["whole","categorical"]')
        
        # Retrieve bag label for each group_id
        sel = sqlalchemy.select([Labeling]).where(Labeling.id.__eq__(group_id))
        with self.Insert.engine.connect() as connection:
            res = connection.execute(sel)
            label = res.fetchone().bag_label

        # Load the dataset
        sel = sqlalchemy.select([Points]).where(Points.group_id.__eq__(group_id))
        dfraw = self.Insert.pandas_select_execute(sel)

        # Transform the dataset
        try:
            bag = full_pipeline.fit_transform(dfraw)
        except ValueError as e:
            print('Transform error, Skipped Group ID : ', group_id)
            traceback.print_exc()
            print(dfraw)
            raise e

        # Validate cleaned dataset
        if not self.validate_bag(bag):
            print("Invalid cleaned bag:\n")
            print(bag)

        return dfraw, bag, label
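A minimal usage sketch for the method above; `tester` stands in for a
hypothetical instance of the surrounding class:

dfraw, bag, label = tester.get_single_mil_bag(pipeline='whole')
print("Raw instances: {}, bag shape: {}, label: {}".format(
    dfraw.shape[0], bag.shape, label))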

    def test_calc_categories_dict(self):

        # Generate data to find categories
        Insert = extract.Insert(server_name, driver_name, database_name)

        sel = sqlalchemy.select([Points])
        dataset_raw = Insert.pandas_select_execute(sel)

        # Create 'clean' data processing pipeline
        clean_pipe = Transform.cleaning_pipeline(
            drop_attributes=DROP_ATTRIBUTES,
            nan_replace_dict=NAN_REPLACE_DICT,
            dtype_dict=TYPE_DICT,
            unit_dict=UNIT_DICT,
            dupe_cols=DUPE_COLS,
            remove_dupe=REMOVE_DUPE,
            replace_numbers=REPLACE_NUMBERS,
            remove_virtual=REMOVE_VIRTUAL,
            text_clean_attributes=TEXT_CLEAN_ATTRS)

        string_pipe = SetDtypes(
            type_dict={
                'TYPE': str,
                'ALARMTYPE': str,
                'FUNCTION': str,
                'VIRTUAL': str,
                'CS': str,
                'SENSORTYPE': str,
                'DEVUNITS': str
            })

        categories_clean_pipe = Pipeline([('clean_pipe', clean_pipe),
                                          ('string_pipe', string_pipe)])

        df_clean = categories_clean_pipe.fit_transform(dataset_raw)

        # Calculate categories to be used later
        Encoding = EncodingCategories()
        columns = [
            'TYPE', 'ALARMTYPE', 'FUNCTION', 'VIRTUAL', 'CS', 'SENSORTYPE',
            'DEVUNITS'
        ]
        categories_dict_calc = Encoding.calc_categories_dict(df_clean, columns)

        if not os.path.exists(CATEGORIES_FILE):
            raise OSError(
                "Categories file not found: {}".format(CATEGORIES_FILE))

        # Compare calculated categories to those saved on disc
        categories_dict_read = Encoding.read_categories_from_disc(
            CATEGORIES_FILE)
        for key in set(
            (*categories_dict_calc.keys(), *categories_dict_read.keys())):
            self.assertEqual(set(categories_dict_calc[key]),
                             set(categories_dict_read[key]))

        return None
Example #8
    def test__transform_data(self):

        # The loaded bag (self.bag_load) should match the manually
        # transformed bag
        bag_manual = Transform.numeric_transform_pipeline_MIL()\
            .fit_transform(self.dfraw_load)
        self.assertTrue(
            np.equal(bag_manual.toarray(), self.bag_load.toarray()).all())

        # Transform raw data (from input class)
        bag_input = Transform.numeric_transform_pipeline_MIL().fit_transform(
            self.dfraw_input)
        msg = ("The transformed bag has {} features, and the predictor " +
               "has {} features")
        print(
            msg.format(bag_input.shape[1],
                       self.predictor.classifier.n_features_in_))
        self.assertEqual(bag_input.shape[1],
                         self.predictor.classifier.n_features_in_)

        return None
Example #9
    def test__transform_data_pydantic(self):
        # Transform raw data (from input class)
        bag_input_pydantic = Transform.numeric_transform_pipeline_MIL()\
            .fit_transform(self.dfraw_input_pydantic)
        msg = ("The transformed bag has {} features, and the predictor " +
               "has {} features")
        print(
            msg.format(bag_input_pydantic.shape[1],
                       self.predictor.classifier.n_features_in_))
        self.assertEqual(bag_input_pydantic.shape[1],
                         self.predictor.classifier.n_features_in_)
        return None

    def _load_pipeline(self, classifier_type: str) -> Pipeline:
        """Determine which pipeline to load based on the chosen classifier
        and passed classifier_type
        inputs
        ------
        classifier_type: (str) one of ['numeric','categorical'] for numeric or 
        categorical pipelines depening on requirements of the classifier
        If 'numeric' then return Transform.numeric_transform_pipeline_MIL()
        or Transform.categorical_transform_pipeline_MIL()"""

        if classifier_type == 'numeric':
            return Transform.numeric_transform_pipeline_MIL()
        elif classifier_type == 'categorical':
            return Transform.categorical_transform_pipeline_MIL()
        else:
            msg = (
                "classifier_type must be one of ['numeric','categorical']. " +
                "Got {}")
            raise ValueError(msg.format(classifier_type))

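A hedged call-site sketch for `_load_pipeline`; `predictor` and `dfraw` are
hypothetical stand-ins:

pipeline = predictor._load_pipeline(classifier_type='categorical')
bag = pipeline.fit_transform(dfraw)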
Example #11
    def __init__(self, classifier_filename: Union[str, bytes]):
        """inputs
        ------
        classifier_filename: (str) name of file for pickled sklearn classifier
        
        Example usage
        basePredictorL1 = BasePredictor(
            classifier_filename=SVMC_l1_classifier_filename)
        # Somehow, create some raw data
        input_data = RawInputData(
            # Required numeric attributes
            DEVICEHI=122.0,
            DEVICELO=32.0,
            SIGNALHI=10,
            SIGNALLO=0,
            SLOPE=1.2104,
            INTERCEPT=0.01,
            # Required categorical attributes
            TYPE="LAI",
            ALARMTYPE="Standard",
            FUNCTION="Value",
            VIRTUAL=0,
            CS="AE",
            SENSORTYPE="VOLTAGE",
            DEVUNITS="VDC",
            # Required text attributes
            NAME="SHLH.AHU-ED.RAT",
            DESCRIPTOR="RETURN TEMP",
            NETDEVID='test-value',
            SYSTEM='test-system'
            )
        # Load raw data
        dfraw_input = pd.DataFrame(data=[input_data])
        # Create predictions from raw input data
        results_l1 = basePredictorL1.predict(dfraw_input)
        """
        # Load classifier
        self.classifier = self._load_predictor(classifier_filename)
        # Load transform pipeline
        self.numeric_transform_pipeline_MIL = \
            Transform.numeric_transform_pipeline_MIL()
        # Load embedding class member
        self.MILESEmbedder = MILESEmbedding(CONCEPT_CLASS_FILENAME)

        return None
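The docstring above calls `basePredictorL1.predict(dfraw_input)`, but the
method itself is not shown. A minimal sketch of how the three loaded members
could fit together; the `embed_data` name and the final reshape are
assumptions, not the project's confirmed API:

    def predict(self, dfraw: pd.DataFrame):
        """Sketch only: transform raw rows into a bag, embed, classify."""
        bag = self.numeric_transform_pipeline_MIL.fit_transform(dfraw)
        embedded = self.MILESEmbedder.embed_data(bag)  # hypothetical method
        return self.classifier.predict(embedded.reshape(1, -1))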
Example #12
    def bag_data_generator(self, pipeline, verbose=False):
        """Return a bag of commonly labeled data
        Bags are defined in SQL Server in the Points table on the group_id
        Froeign Key"""

        # Retrieve all unique bag labels
        sql = """SELECT distinct group_id
        FROM {}
        WHERE IsNumeric(group_id) = 1
        ORDER BY group_id ASC""".format(Points.__tablename__)
        sel = sqltext(sql)
        group_ids = self.Insert.core_select_execute(sel)

        # Retrieve bag label for each group_id
        sql_bag = """SELECT id, bag_label
                FROM {}
                WHERE id = {}"""

        # Create the pipeline
        if pipeline == 'whole':
            full_pipeline = Transform.numeric_transform_pipeline_MIL()
        elif pipeline == 'categorical':
            full_pipeline = Transform.categorical_transform_pipeline_MIL()
        else:
            raise ValueError('pipeline must be one of ["whole","categorical"]')

        for row in group_ids:
            group_id = row.group_id

            sel = sqltext(sql_bag.format(Labeling.__tablename__, group_id))
            with self.Insert.engine.connect() as connection:
                res = connection.execute(sel)
                label = res.fetchone().bag_label

            # Load the dataset
            sel = sqlalchemy.select([Points]).where(Points.group_id.__eq__(group_id))
            dfraw = self.Insert.pandas_select_execute(sel)

            # Validate raw dataset
            if not self.validate_bag(dfraw):
                continue

            # Transform the dataset
            try:
                bag = full_pipeline.fit_transform(dfraw)
            except ValueError as e:
                print('Transform error, Skipped Group ID : ', group_id)

                if verbose:
                    traceback.print_exc()
                    print(dfraw)
                    x = input("Do you want to continue and discard this bag? : ")
                    if x in ['y','yes','Y','Yes','True','TRUE']:
                        continue
                    else:
                        raise e
                else:
                    continue

            # Validate cleaned dataset
            if not self.validate_bag(bag):
                continue

            yield bag, label
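A minimal consumption sketch for the generator above; `tester` stands in for
a hypothetical instance of the surrounding class:

bags, labels = [], []
for bag, label in tester.bag_data_generator(pipeline='whole'):
    bags.append(bag)
    labels.append(label)
print("Collected {} bags for MIL training".format(len(bags)))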
Example #13
def calc_save_categories_vocabulary():

    # Read raw data from database
    Insert = extract.Insert(server_name, driver_name, database_name)
    sel = sqlalchemy.select([Points])
    dataset_raw = Insert.pandas_select_execute(sel)

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(
        drop_attributes=DROP_ATTRIBUTES,
        nan_replace_dict=NAN_REPLACE_DICT,
        dtype_dict=TYPE_DICT,
        unit_dict=UNIT_DICT,
        dupe_cols=DUPE_COLS,
        remove_dupe=REMOVE_DUPE,
        replace_numbers=REPLACE_NUMBERS,
        remove_virtual=REMOVE_VIRTUAL,
        text_clean_attributes=TEXT_CLEAN_ATTRS)

    string_pipe = SetDtypes(
        type_dict={
            'TYPE': str,
            'ALARMTYPE': str,
            'FUNCTION': str,
            'VIRTUAL': str,
            'CS': str,
            'SENSORTYPE': str,
            'DEVUNITS': str
        })

    categories_clean_pipe = Pipeline([('clean_pipe', clean_pipe),
                                      ('string_pipe', string_pipe)])

    # Process raw data with pipeline
    df_clean = categories_clean_pipe.fit_transform(dataset_raw)

    # Calculate categories to be used later
    Encoding = EncodingCategories()
    columns = [
        'TYPE', 'ALARMTYPE', 'FUNCTION', 'VIRTUAL', 'CS', 'SENSORTYPE',
        'DEVUNITS'
    ]
    categories_dict_calc = Encoding.calc_categories_dict(df_clean, columns)

    # Save categories in numpy array to be used later
    Encoding.save_categories_to_disc(categories_dict_calc, CATEGORIES_FILE)

    # Save each vocabulary to its own text file
    vocab_paths = {
        'ALARMTYPE': '../data/vocab_alarmtype.txt',
        'CS': '../data/vocab_cs.txt',
        'DEVUNITS': '../data/vocab_devunits.txt',
        'FUNCTION': '../data/vocab_function.txt',
        'SENSORTYPE': '../data/vocab_sensortype.txt',
        'TYPE': '../data/vocab_type.txt',
        'VIRTUAL': '../data/vocab_virtual.txt',
    }
    for column, vocab_path in vocab_paths.items():
        save_numpy_string_array_to_text(categories_dict_calc[column],
                                        vocab_path)

    return None
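`save_numpy_string_array_to_text` is imported rather than defined in this
file; a plausible minimal implementation with numpy (an assumption, not the
project's actual helper):

import numpy as np

def save_numpy_string_array_to_text(array, path):
    """Sketch: write one string per line (assumed behavior)."""
    np.savetxt(path, array, fmt='%s')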