Ejemplo n.º 1
0
def test_fit():
    """Fit a MultiIdBinarizer on a list-like string column and check the result.

    The frame has one column of bracketed id lists ("col1") plus two
    unrelated columns. After fit/transform, the test expects one
    "col1_<id>" column per id, holding 1 on the rows that listed that id.
    """
    frame = pds.DataFrame(data={"col1": ["[1,2,3]", "[5,6]", "[8]"]})
    frame["col2"] = [1, 2, 3]
    frame["col4"] = ["a", "b", "c"]

    binarizer = MultiIdBinarizer()
    binarizer.fit(frame["col1"], "col1")
    encoded = binarizer.transform(frame)
    print("\n")
    print(encoded)

    # Row 0 holds "[1,2,3]", so "col1_1", "col1_2" and "col1_3" must be 1
    # on that row; rows 1 and 2 are checked the same way for their own ids.
    expected_flags = [
        (0, "col1_3"),
        (0, "col1_2"),
        (0, "col1_1"),
        (1, "col1_5"),
        (1, "col1_6"),
        (2, "col1_8"),
    ]
    for row, column in expected_flags:
        assert encoded.loc[row, column] == 1
    print("Done1")
Ejemplo n.º 2
0
def test_is_multi_selection():
    """Check which string samples MultiIdBinarizer.is_multi_selection accepts.

    Each case pairs a list of strings with whether the detector is expected
    to report it as a multi-selection column.
    """
    cases = [
        (["[abnc", "efg]", "[123]"], False),
        (["a[1,2,3]"], False),
        (["[1]", "[1,2,3,4"], True),
        (["[1,2,3]", "[12]"], True),
        (["[]", "[12]"], True),
    ]
    for sample, expected in cases:
        # Compare on truthiness, exactly as a bare `assert` / `assert not` would.
        assert bool(MultiIdBinarizer.is_multi_selection(sample)) == expected
Ejemplo n.º 3
0
def get_default_encoder(features, csv_file, separator=',', in_log=None):
    """
    Get the default preprocess for selected features

    The csv is loaded with pandas and each selected column is mapped to a
    ``(dtype name, encoder name)`` tuple chosen from the column dtype.

    :param features:        selected features from the csv_file
    :param csv_file:        csv file containing samples
    :param separator:       field separator passed to ``read_csv``
    :param in_log:          logger to report failures to; when falsy a
                            module logger is used instead
    :return:                dict keyed by column name. Any exception is
                            logged and whatever was collected so far
                            (possibly an empty dict) is returned.
    """
    log = in_log if in_log else logging.getLogger(__name__)
    preprocess = {}

    try:
        frame = pds.read_csv(
            csv_file,
            sep=separator,
            usecols=features,
            skipinitialspace=True,
            quotechar='"',
        )
        selected = frame[features]

        # Python 2 only offers iteritems(); Python 3 uses items().
        running_py2 = sys.version_info[0] < 3
        columns = selected.iteritems() if running_py2 else selected.items()

        for col_name, col in columns:
            dtype_name = col.dtype.name

            if dtype_name == "float64":
                preprocess[col_name] = u"float64", u"Label Encoder"
                continue

            if dtype_name != "object":
                # Every other dtype is reported as-is with no encoder.
                preprocess[col_name] = dtype_name, u"None"
                continue

            # Object columns: list-like strings get the binarizer,
            # everything else a plain label encoder.
            if MultiIdBinarizer.is_multi_selection(col):
                preprocess[col_name] = (
                    u"object", u"Resilient Binarizer for multiple select")
            else:
                preprocess[col_name] = u"object", u"Label Encoder"

    except Exception as e:
        log.error("Failed to read_csv: {f}\n Error: {err}".format(f=csv_file,
                                                                  err=str(e)))

    return preprocess
Ejemplo n.º 4
0
#
# Maps each encoded column name to the encoder instance that was fitted on it.
label_encoder = {}

# Python 2 only offers iteritems(); Python 3 uses items(). Either way the
# result yields (column name, column) pairs of X.
if sys.version_info[0] < 3:
    items = X.iteritems()
else:
    items = X.items()

for col_name, col in items:
    if col.dtype.name == "object":
        #
        # Ask MultiIdBinarizer whether this column is a multi-selection
        # column. Original note: the values are lists, and json is used
        # to load them and check that (not verifiable from this excerpt).
        #
        is_list = MultiIdBinarizer.is_multi_selection(col)

        if is_list:
            #
            # Multi select is 2 dimensional: transform() receives the whole
            # frame and its result replaces X.
            #
            le = MultiIdBinarizer()
            le.fit(col, col_name)
            X = le.transform(X)
        else:
            # Plain object column: encode it in place with a LabelEncoder.
            le = LabelEncoder()
            le.fit(col)
            X[col_name] = le.transform(X[col_name])
        # Remember the fitted encoder for this column.
        label_encoder[col_name] = le
    elif col.dtype.name == "float64":
        # NOTE(review): the float64 branch body is not visible in this excerpt.
Ejemplo n.º 5
0
    def transform_numerical(self):
        """
        Encode the columns of ``self.X`` and remember the encoder per column.

        Each categorical column needs its own label encoder, see
        https://stackoverflow.com/questions/28656736/using-scikits-labelencoder-correctly-across-multiple-programs

        Object columns are encoded with a MultiIdBinarizer (multi-selection
        columns) or a LabelEncoder; float64 columns with whatever
        ``self.get_encoder_for_float()`` returns. Columns of any other dtype
        are left untouched. Every fitted encoder is stored in
        ``self.label_encoder`` under its column name.
        :return:
        """

        # Python 2 only offers iteritems(); Python 3 uses items(). The
        # iterator is taken once, from the frame as it is on entry.
        frame = self.X
        if sys.version_info[0] < 3:
            columns = frame.iteritems()
        else:
            columns = frame.items()

        for col_name, col in columns:
            self.log.debug("Column {col_name} is {col_type}".format(
                col_name=col_name, col_type=col.dtype.name))

            # Label encoding is meant for categorical columns only, as advised in
            # https://www.analyticsvidhya.com/blog/2016/07/practical-guide-data-preprocessing-python-scikit-learn/
            # A normalized numerical column is likely to meet new (unseen)
            # labels at prediction time.
            dtype_name = col.dtype.name

            if dtype_name == "object":
                if MultiIdBinarizer.is_multi_selection(col):
                    # Multi select is 2 dimensional: the binarizer is handed
                    # the whole frame and its result replaces self.X.
                    encoder = MultiIdBinarizer()
                    encoder.fit(col, col_name)
                    self.X = encoder.transform(self.X)
                else:
                    encoder = LabelEncoder()
                    encoder.fit(col)
                    self.X[col_name] = encoder.transform(self.X[col_name])
            elif dtype_name == "float64":
                # Template method: the subclass decides which encoder is
                # used for float columns (original note: a label encoder
                # can normalize too and is good except for SVM), see
                # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
                encoder = self.get_encoder_for_float()
                encoder.fit(col)
                self.X[col_name] = encoder.transform(self.X[col_name])
            else:
                # No encoder for other dtypes.
                continue

            self.label_encoder[col_name] = encoder

        self.log.debug(self.X)
Ejemplo n.º 6
0
def test_union():
    """Check that MultiIdBinarizer.union merges two overlapping id lists.

    The expected result contains every id from both inputs exactly once,
    in ascending order.
    """
    first = [1, 2, 3, 4, 5]
    second = [1, 3, 4, 5, 6, 7, 8, 9, 10]

    merged = MultiIdBinarizer.union(first, second)
    expected = list(range(1, 11))
    assert merged == expected