Ejemplo n.º 1
0
    def process(self):
        """Load the CSV at ``self.filename`` and encode it into numeric arrays.

        The file is read with ``self.headers`` as column names; ``?`` marks a
        missing value and fields are quoted with single quotes.  Columns are
        then encoded as follows:

        * ``age``: missing values become 0.
        * ``gender``: one-hot encoded into ``is_<value>`` columns.
        * ``ethnicity``, ``contry_of_res``: missing values become ``''`` and
          each column is replaced by ``HashingEncoder`` output columns.
        * ``jundice``, ``austim``, ``used_app_before``: ``no``/``yes`` -> 0/1.
        * ``class``: ``NO``/``YES`` -> 0/1.
        * ``age_desc``: dropped.
        * ``relation``: missing values become ``''``, then label-encoded.

        Returns:
            ``self.processed`` with ``data`` (every encoded column as an
            array), ``target`` (the encoded ``class`` column) and
            ``target_names`` (class labels ordered by their integer code, so
            ``target_names[code]`` is the label for ``code``) filled in.
        """
        obj_df = pd.read_csv(
            self.filename,
            names=self.headers,
            na_values=["?"],
            quotechar="'",
        )

        # age {numeric}: treat an unknown age as 0.
        obj_df["age"] = obj_df["age"].fillna(0)

        # gender {f,m}: one-hot columns prefixed with "is".
        obj_df = pd.get_dummies(obj_df, columns=["gender"], prefix=["is"])

        # ethnicity {White-European,Latino,Others,Black,Asian,
        # 'Middle Eastern ',Pasifika,'South Asian',Hispanic,Turkish,others}:
        # hashed rather than one-hot encoded.
        obj_df["ethnicity"] = obj_df["ethnicity"].fillna('')
        ethnicity_encoder = HashingEncoder(cols=["ethnicity"])
        ethnicity_encoder.fit(obj_df)
        obj_df = ethnicity_encoder.transform(obj_df)

        # jundice / austim / used_app_before {no,yes} and class {NO,YES}
        # become 0/1.  Values outside these maps are left untouched.
        yes_no = {"no": 0, "yes": 1}
        class_codes = {"NO": 0, "YES": 1}
        replace_bool = {
            column: dict(yes_no)
            for column in ("jundice", "austim", "used_app_before")
        }
        replace_bool["class"] = class_codes
        obj_df = obj_df.replace(replace_bool)

        # contry_of_res holds several dozen distinct country names
        # ('United States', Brazil, Spain, ...), so it is hashed as well.
        obj_df["contry_of_res"] = obj_df["contry_of_res"].fillna('')
        country_encoder = HashingEncoder(cols=["contry_of_res"])
        country_encoder.fit(obj_df)
        obj_df = country_encoder.transform(obj_df)

        # age_desc {'18 and more'}: removed from the feature set.
        obj_df = obj_df.drop(columns=["age_desc"])

        # relation {Self,Parent,'Health care professional',Relative,Others}:
        # integer label per distinct value ('' stands in for missing).
        obj_df["relation"] = obj_df["relation"].fillna('')
        relation_encoder = LabelEncoder()
        obj_df["relation"] = relation_encoder.fit_transform(obj_df["relation"])

        # NOTE(review): obj_df still contains the encoded "class" column, so
        # the target is also present inside `data`.  Left as-is because
        # callers may rely on the current column layout -- confirm whether it
        # should be dropped before training on `data`.
        self.processed.data = obj_df.values
        self.processed.target = np.array(obj_df["class"])
        # Order the label names by their integer code.  Taking them from
        # `unique()` would follow first appearance in the file, so a file
        # whose first row is "YES" would yield target_names[0] == "YES"
        # while code 0 means "NO".
        self.processed.target_names = np.array(
            sorted(class_codes, key=class_codes.get), dtype=object
        )
        return self.processed
Ejemplo n.º 2
0
class _HashingEncoderImpl:
    """Thin wrapper that forwards ``fit``/``transform`` to ``SkHashingEncoder``."""

    def __init__(self, **hyperparams):
        """Keep the hyperparameters and build the wrapped encoder from them."""
        self._hyperparams = hyperparams
        self._wrapped_model = SkHashingEncoder(**hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped encoder on ``X`` (and ``y``) and return ``self``.

        When ``X`` is a pandas DataFrame its column index is stored on
        ``self._X_columns``; for any other input type that attribute is not
        set by this call.
        """
        self._wrapped_model.fit(X, y)
        if not isinstance(X, pd.DataFrame):
            return self
        self._X_columns = X.columns
        return self

    def transform(self, X):
        """Return the wrapped encoder's transformation of ``X``."""
        return self._wrapped_model.transform(X)