def load_full(self):
    """Load the train, validate, and test splits and return them concatenated.

    Returns a single DataFrame containing ``MachineIdentifier`` plus this
    group's feature columns for all three splits.
    """
    load_columns = ["MachineIdentifier"] + self.get_loadcols()
    frames = []
    for csv_path in (
            self.dataset_path / self.valid_dir / "train.csv",
            self.dataset_path / self.valid_dir / "validate.csv",
            self.dataset_path / "test.csv",
    ):
        frames.append(
            pd.read_csv(csv_path, dtype=get_csv_dtype(),
                        usecols=load_columns))
    return pd.concat(frames)
def load_part(self):
    """Load a single split (train, validate, or test) from ``self.input_path``.

    Returns a DataFrame with ``MachineIdentifier`` plus this group's
    feature columns.
    """
    load_columns = ["MachineIdentifier"] + self.get_loadcols()
    return pd.read_csv(self.input_path,
                       dtype=get_csv_dtype(),
                       usecols=load_columns)
def CountEncoding(self, df):
    """Replace the feature column with its occurrence count in the train split.

    Values never seen in train fall back to a count of 1.
    """
    feature_col = df.columns.tolist()[1]
    # Read only the one feature column from train (via dask, materialized
    # immediately) and count how often each value appears.
    train_series = dd.read_csv(
        self.dataset_path / self.valid_dir / "train.csv",
        dtype=get_csv_dtype(),
        usecols=[feature_col]).compute()[feature_col]
    count_by_value = train_series.value_counts().to_dict()
    df[feature_col] = df[feature_col].map(count_by_value).fillna(1)
    return df
def TargetEncoding(self, df):
    """Replace the feature column with the train-split mean of ``HasDetections``.

    Categories absent from train fall back to ``1 / len(train)``.
    """
    feature_col = df.columns.tolist()[1]
    train_df = pd.read_csv(self.dataset_path / self.valid_dir / "train.csv",
                           dtype=get_csv_dtype(),
                           usecols=[feature_col, "HasDetections"])
    # Mean target per category, as a plain dict for fast mapping.
    target_mean = train_df.groupby(feature_col).mean()["HasDetections"]
    df[feature_col] = df[feature_col].map(target_mean.to_dict()).fillna(
        1 / len(train_df))
    return df
def calcFratures(name, formula):
    """Compute one derived feature column for the current data part.

    NOTE(review): declared without ``self`` yet references ``self``,
    ``needCalculationFeatures``, ``dask_mode`` and ``calc_back_checker`` —
    presumably this is a closure nested inside a method where those names are
    in scope; confirm against the full file before moving it.

    Parameters:
        name: column name the computed feature is renamed to.
        formula: one- or two-element list. ``formula[0]`` is a Python
            expression string evaluated with ``eval`` (with ``part_df`` in
            scope); an optional ``formula[1]`` supplies extra names for the
            evaluation.

    Returns:
        A DataFrame with exactly the columns ``MachineIdentifier`` and
        ``name``, sorted by ``MachineIdentifier`` with a fresh index.
    """
    # Load the current part; dask is used when dask_mode() says so, but the
    # result is materialized to pandas either way.
    if dask_mode():
        part_df = dd.read_csv(self.input_path,
                              dtype=get_csv_dtype(),
                              usecols=["MachineIdentifier"] +
                              self.get_loadcols()).compute()
    else:
        part_df = pd.read_csv(self.input_path,
                              dtype=get_csv_dtype(),
                              usecols=["MachineIdentifier"] +
                              self.get_loadcols())
    # SECURITY NOTE: eval() executes arbitrary code from `formula`; this is
    # only acceptable because formulas are project-internal, never user input.
    if len(formula) == 1:
        new_feature_df = eval(formula[0], locals())
    elif len(formula) == 2:
        new_feature_df = eval(formula[0], locals(), formula[1])
    # Sanity-check that the computed frame covers the loaded identifiers.
    calc_back_checker(new_feature_df, set(part_df["MachineIdentifier"].values))
    # Free the (potentially large) source frame before further work.
    del part_df
    gc.collect()
    # Rename the single non-key column to the requested feature name.
    back_col_name = [
        n for n in new_feature_df.columns if n != "MachineIdentifier"
    ][0]
    new_feature_df = new_feature_df.rename(columns={back_col_name: name})
    new_feature_df = new_feature_df.loc[:, ["MachineIdentifier", name]]
    new_feature_df.sort_values("MachineIdentifier", inplace=True)
    new_feature_df.reset_index(drop=True, inplace=True)
    # Progress output is serialized across workers via the shared lock.
    self.lock.acquire()
    try:
        stdout.write(
            "\r[Group] {:<15} >> multiprocessing : Done {:>3} out of {:>3} latest {:<100}"
            .format(self.__class__.__name__, str(self.check_job_progress()),
                    str(len(needCalculationFeatures)), "\"" + name + "\""))
    finally:
        self.lock.release()
    return new_feature_df
def FactorizeEncoding(self, df):
    """Replace the feature column with integer codes learned from the train split.

    The train rows are sorted by ``MachineIdentifier`` before factorizing so
    the code assignment is deterministic across runs. Values that never occur
    in train are encoded as -1 (``Index.get_indexer``'s missing marker).
    """
    cols = df.columns.tolist()
    train_df = pd.read_csv(self.dataset_path / self.valid_dir / "train.csv",
                           dtype=get_csv_dtype(),
                           usecols=cols)
    # Fix the order in which categories are first seen -> reproducible codes.
    train_df.sort_values("MachineIdentifier", inplace=True)
    # Only the uniques index is needed; the per-row labels were unused.
    uniques = pd.factorize(train_df[cols[1]])[1]
    df[cols[1]] = uniques.get_indexer(df[cols[1]])
    return df
def RankEncoding(self, df):
    """Replace the feature column with its frequency rank in the train split.

    Rank 1 is the most frequent train value (ties share the minimum rank);
    values unseen in train receive the worst rank, i.e. the number of
    distinct train values.
    """
    feature_col = df.columns.tolist()[1]
    train_series = dd.read_csv(
        self.dataset_path / self.valid_dir / "train.csv",
        dtype=get_csv_dtype(),
        usecols=[feature_col]).compute()[feature_col]
    rank_by_value = train_series.value_counts().rank(method="min",
                                                     ascending=False)
    df[feature_col] = df[feature_col].map(rank_by_value.to_dict()).fillna(
        len(rank_by_value))
    return df
def FrequencyEncoding(self, df):
    """Replace the feature column with its relative frequency in the train split.

    Values unseen in train fall back to ``1 / len(train)``.
    """
    feature_col = df.columns.tolist()[1]
    train_df = pd.read_csv(self.dataset_path / self.valid_dir / "train.csv",
                           dtype=get_csv_dtype(),
                           usecols=[feature_col])
    n_train = len(train_df)
    freq_by_value = (train_df[feature_col].value_counts() / n_train).to_dict()
    df[feature_col] = df[feature_col].map(freq_by_value).fillna(1 / n_train)
    return df
def LabelEncoding(self, df):
    """Label-encode the feature column jointly over this part and its counterpart.

    NOTE(review): a second ``LabelEncoding`` is defined later in this file and
    shadows this one at class-definition time, so this dask-based version is
    dead code as written — confirm which implementation is intended.
    """
    cols = df.columns.tolist()
    # The "other" split: test when encoding train, train otherwise.
    another_part = "test" if self.part == "train" else "train"
    part_df = dd.from_pandas(df, 2)
    # NOTE(review): joins self.input_path with "<part>.csv", treating
    # input_path as a directory — load_part reads it as a file; verify.
    another_df = dd.read_csv(self.input_path / "{}.csv".format(another_part),
                             dtype=get_csv_dtype(),
                             usecols=cols)
    full_df = dd.concat([part_df, another_df]).compute()
    full_df.sort_values("MachineIdentifier", inplace=True)
    # Factorize over the combined data so codes are consistent across parts.
    labels, uniques = pd.factorize(full_df[cols[1]])
    full_df[cols[1]] = labels
    # Keep only the rows belonging to the part that was passed in.
    M_id = df["MachineIdentifier"].values.tolist()
    back_df = full_df[full_df["MachineIdentifier"].isin(M_id)]
    return back_df
def LabelEncoding(self, df):
    """Label-encode the feature column consistently across train/validate/test.

    All three splits are read and concatenated so ``pd.factorize`` assigns one
    shared integer code per category; only the rows whose
    ``MachineIdentifier`` appears in ``df`` are returned.
    """
    cols = df.columns.tolist()
    parts = []
    for part in ["train", "validate", "test"]:
        if part == "test":
            part_path = self.dataset_path / "test.csv"
        else:
            part_path = self.dataset_path / self.valid_dir / "{}.csv".format(
                part)
        parts.append(
            pd.read_csv(part_path, dtype=get_csv_dtype(), usecols=cols))
    # Concatenate once instead of re-concatenating inside the loop.
    full_df = pd.concat(parts)
    # Sort so category codes are assigned in a deterministic order.
    full_df.sort_values("MachineIdentifier", inplace=True)
    # Only the per-row labels are needed; the uniques index was unused.
    full_df[cols[1]] = pd.factorize(full_df[cols[1]])[0]
    # Return only the rows for the part that was passed in.
    M_id = df["MachineIdentifier"].values.tolist()
    back_df = full_df[full_df["MachineIdentifier"].isin(M_id)]
    return back_df
def load_train(self):
    """Load the train split with ``MachineIdentifier`` plus this group's columns."""
    csv_path = self.dataset_path / self.valid_dir / "train.csv"
    load_columns = ["MachineIdentifier"] + self.get_loadcols()
    return pd.read_csv(csv_path, dtype=get_csv_dtype(), usecols=load_columns)