import glob
import hashlib
import json
import os
from functools import reduce

import joblib
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from tpot import TPOTRegressor

# Config, Logger, UiPathUsageException and _UNTRAINED_HELP are assumed to come
# from this package's own helper modules.


class Model:
    def __init__(self, is_infer_only=False):
        self.config = Config()
        self.logger = Logger(__name__)
        self._model = self.load_model()
        self._label_encoder = self.load_labelencoder()

    def train(self, directory):
        dm = DataManager(directory)
        if not dm.validate():
            raise UiPathUsageException("No valid data to run this pipeline.")
        data_df = dm.get_data()
        X = data_df[dm.get_feature_columns()].values
        y = data_df[dm.get_target_column()].values
        help_string = (
            "Warning: You have retrained a model which was generated by a TPOT optimization pipeline."
            "\nFor optimal results please run the TPOT optimization pipeline from scratch by training package version [1.0].")
        if not self.is_trained() or self.config.warm_start:
            self._model = self.build_model(X, y)
        else:
            self._model.fit(X, y)
            self.logger.info("Finished retraining model.")
            self.logger.info(help_string)
        joblib.dump(self._model,
                    os.path.join(self.config.cur_dir, "model", "Model.sav"))

    def evaluate(self, evaluation_directory):
        dm = DataManager(evaluation_directory)
        if not dm.validate(for_train=False):
            self.logger.info("No valid test data to run this evaluation pipeline.")
        data_df = dm.get_data()
        X = data_df[dm.get_feature_columns()].values
        y = data_df[dm.get_target_column()].values
        if not self.is_trained():
            self.logger.info(_UNTRAINED_HELP)
        else:
            score = self._model.score(X, y)
            self.logger.info(f"Evaluation score = {score}")
            return score

    def process_data(self, directory):
        dm = DataManager(directory)
        if not dm.validate():
            raise UiPathUsageException("No valid data to run this pipeline.")
        all_data = dm.get_data()
        if not self.config.test_data_from_ui:
            # Random train/test split (no stratification is applied)
            percentage = self.config.process_data_split_percentage
            train, test = train_test_split(
                all_data,
                test_size=percentage,
                random_state=self.config.seed,
            )
            # Write train.csv
            DataManager.write_dataframe(train, 'train',
                                        self.config.train_data_directory)
            # Write test.csv
            DataManager.write_dataframe(test, 'test',
                                        self.config.test_data_directory)
        else:
            # Write train.csv containing all of the data
            DataManager.write_dataframe(all_data, 'train',
                                        self.config.train_data_directory)
            self.logger.info(
                "Did not split data into train and test sets. Model will be "
                "evaluated on data selected from UI.")

    def build_model(self, X, y):
        # Perform missing value imputation as scikit-learn models can't handle NaNs
        nan_imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
        X = nan_imputer.fit_transform(X)
        pipeline_optimizer = TPOTRegressor(
            generations=self.config.generations,
            population_size=self.config.population_size,
            offspring_size=self.config.offspring_size,
            mutation_rate=self.config.mutation_rate,
            crossover_rate=self.config.crossover_rate,
            scoring=self.config.scoring,
            cv=self.config.cv,
            subsample=self.config.subsample,
            n_jobs=-1,
            max_time_mins=self.config.max_time_mins,
            max_eval_time_mins=self.config.max_eval_time_mins,
            random_state=self.config.seed,
            config_dict=self.config.classifier_config_dict,
            warm_start=self.config.warm_start,
            memory=self.config.artifacts_directory,
            verbosity=1)
        # Fit TPOT to the data
        pipeline_optimizer.fit(X, y)
        self.logger.info("Finished running TPOT optimization pipeline.")
        # Export the fitted pipeline to the artifacts directory
        pipeline_path = os.path.join(self.config.artifacts_directory,
                                     "TPOT_pipeline.py")
        pipeline_optimizer.export(pipeline_path)
        self.logger.info(f"Saving best pipeline to {pipeline_path}")
        # Create a new pipeline which also contains the nan_imputer
        pipe = Pipeline([
            ("nan_imputer", nan_imputer),
            ("tpot_pipeline", pipeline_optimizer.fitted_pipeline_),
        ])
        return pipe

    def predict(self, mlskill_input):
        data = pd.read_json(mlskill_input)
        predictions = self._model.predict(data.values)
        return json.dumps(predictions.tolist())

    def load_model(self):
        if os.path.isfile(os.path.join(self.config.cur_dir, "model", "Model.sav")):
            self.logger.info("Loading pre-trained model...")
            return joblib.load(
                os.path.join(self.config.cur_dir, "model", "Model.sav"))
        else:
            return None

    def load_labelencoder(self):
        if os.path.isfile(
                os.path.join(self.config.cur_dir, "model", "LabelEncoder.sav")):
            self.logger.info("Loading label encoder...")
            return joblib.load(
                os.path.join(self.config.cur_dir, "model", "LabelEncoder.sav"))
        else:
            return None

    def is_trained(self):
        return self._model is not None
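# --- Usage sketch (illustrative; not part of the original package) ---
# Shows the intended process_data -> train -> evaluate -> predict flow. The
# "training_data" folder, the helper name below and the feature names in the
# JSON payload are hypothetical; real directories and column names come from
# Config and the CSV files.
def _demo_model_lifecycle():  # hypothetical helper, for illustration only
    model = Model()
    model.process_data("training_data")                       # split raw CSVs into train/test
    model.train(model.config.train_data_directory)            # run TPOT optimization, save Model.sav
    score = model.evaluate(model.config.test_data_directory)  # score the fitted pipeline
    payload = json.dumps([{"feature_1": 1.0, "feature_2": 2.0}])
    predictions = model.predict(payload)                      # JSON-encoded list of predictions
    return score, predictions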
# DataManager variant that label-encodes textual targets and enforces a minimum
# number of examples per class.
class DataManager:
    def __init__(self, directory):
        self.config = Config()
        self.logger = Logger(__name__)
        self.logger.info(f"Loading data from {directory}...")
        self.target_column_name = self.config.target_column
        self.feature_column_names = None
        self._label_encoder = self.load_labelencoder()
        self.is_single_file = self.config.csv_name is not None
        self.raw_data = self.read_all_data(directory)
        if self.raw_data is None:
            return
        nclasses = self.num_classes()
        self.logger.info(
            f"Finished reading [{len(self.raw_data)}] data points with [{nclasses}] classes.")
        if nclasses == 1:
            self.logger.info("Data must have at least 2 classes.")
            self.raw_data = None

    def read_all_data(self, directory):
        dataframe_from_csv = self.read_all_csv(directory)
        if self.is_single_file:
            if dataframe_from_csv is None:
                self.logger.info(
                    f"Unable to read any valid csv data from "
                    f"[{os.path.join(directory, self.config.csv_name)}]")
                return None
            return dataframe_from_csv
        else:
            if dataframe_from_csv is None:
                self.logger.info(
                    f"Unable to read any valid data from *.csv files in [{directory}]")
                return None
            # Encode a textual target column and persist the fitted LabelEncoder
            if dataframe_from_csv[self.target_column_name].dtype == 'object':
                label = dataframe_from_csv.loc[:, self.target_column_name]
                dataframe_from_csv.loc[:, self.target_column_name] = \
                    self._label_encoder.fit_transform(label)
                joblib.dump(
                    self._label_encoder,
                    os.path.join(self.config.cur_dir, "model", 'LabelEncoder.sav'))
                self.logger.info(
                    f"Encoded label column ['{self.target_column_name}']")
            return dataframe_from_csv

    def read_all_csv(self, directory):
        help_string = (
            " The csv file must contain a header, a target column and at least one feature column."
            " The target column name is set by the <input_column> variable of this run."
            " The default value is 'target'.")
        if self.is_single_file:
            paths = [os.path.join(directory, self.config.csv_name)]
        else:
            paths = glob.glob(os.path.join(directory, "*.csv"), recursive=True)
        frames = []
        for path in paths:
            try:
                self.logger.verbose(
                    f"Attempting to read data from csv [{path}]"
                    f" with delimiter [{self.config.delimiter}]")
                # error_bad_lines is the pre-pandas-2.0 option for skipping malformed rows
                frame = pd.read_csv(path,
                                    error_bad_lines=False,
                                    delimiter=self.config.delimiter,
                                    encoding=self.config.encoding)
            except Exception as e:
                self.logger.info(f"Failed to read csv [{path}], exception:\n{e}")
                continue
            if self.target_column_name not in frame.columns:
                self.logger.info(
                    f"File [{path}] does not have column [{self.target_column_name}] in header "
                    f"{list(frame.columns)}, skipping this file." + help_string)
                continue
            frames.append(frame)
            self.logger.verbose(f"Read [{len(frame)}] data points from [{path}]\n")
        if len(frames) == 0:
            return None
        # Concatenate all frames into one (DataFrame.append is the pre-pandas-2.0 API)
        coalesced = reduce(lambda a, b: a.append(b), frames[1:], frames[0])
        return coalesced

    def validate(self, for_train=True):
        if self.raw_data is None:
            return False
        if not for_train:
            return True
        # Validate that every class has at least the minimum number of training examples
        min_count_per_class = int(
            len(self.get_data()) * self.config.class_percentage_validation)
        target_value_counts = self.get_data()[
            self.target_column_name].value_counts()
        not_enough_examples = target_value_counts[
            target_value_counts < min_count_per_class]
        if len(not_enough_examples) == 0:
            return True
        help_string = (
            f"Provided data does not have enough training examples to train; "
            f"you must provide at least {min_count_per_class} training examples of each class.")
        for class_name, number_examples in not_enough_examples.iteritems():
            self.logger.info(
                f"Class [{class_name}] has [{number_examples}] data points.")
        self.logger.info(help_string)
        return False

    @staticmethod
    def write_dataframe(frame, name, directory=None):
        config = Config()
        if directory is None:
            path = os.path.join(config.artifacts_directory, f"{name}.csv")
        else:
            path = os.path.join(directory, f"{name}.csv")
        frame.to_csv(path, index=False, sep=config.delimiter)
        checksum = DataManager.checksum(path)
        return checksum

    @staticmethod
    def checksum(path):
        hasher = hashlib.md5()
        with open(path, 'rb') as infile:
            hasher.update(infile.read())
        return hasher.hexdigest()

    def get_classes(self):
        return self.raw_data[self.target_column_name].unique()

    def num_classes(self):
        return len(self.get_classes())

    def get_data(self):
        return self.raw_data

    def get_target_column(self):
        return self.target_column_name

    def get_feature_columns(self):
        self.feature_column_names = list(
            self.raw_data.drop(self.target_column_name, axis=1).columns)
        return self.feature_column_names

    def load_labelencoder(self):
        if os.path.isfile(
                os.path.join(self.config.cur_dir, "model", "LabelEncoder.sav")):
            self.logger.info("Loading label encoder...")
            return joblib.load(
                os.path.join(self.config.cur_dir, "model", "LabelEncoder.sav"))
        else:
            return LabelEncoder()
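# --- Usage sketch (illustrative; not part of the original package) ---
# Exercises the DataManager above on a hypothetical "training_data" folder of
# CSVs; the helper name is hypothetical as well.
def _demo_datamanager():  # hypothetical helper, for illustration only
    dm = DataManager("training_data")
    if not dm.validate():
        # validate() has already logged which classes fall below the minimum count
        return None, None
    # If the data spans several CSVs and the target column was textual,
    # read_all_data() has already label-encoded it and saved LabelEncoder.sav.
    X = dm.get_data()[dm.get_feature_columns()].values
    y = dm.get_data()[dm.get_target_column()].values
    return X, y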
# A second, simplified DataManager: no label encoding and no per-class validation.
class DataManager:
    def __init__(self, directory):
        self.config = Config()
        self.logger = Logger(__name__)
        self.logger.info(f"Loading data from {directory}...")
        self.target_column_name = self.config.target_column
        self.feature_column_names = None
        self.is_single_file = self.config.csv_name is not None
        self.raw_data = self.read_all_data(directory)
        if self.raw_data is None:
            return
        self.logger.info(f"Finished reading [{len(self.raw_data)}] data points.")

    def read_all_data(self, directory):
        dataframe_from_csv = self.read_all_csv(directory)
        if dataframe_from_csv is None:
            if self.is_single_file:
                self.logger.info(
                    f"Unable to read any valid csv data from "
                    f"[{os.path.join(directory, self.config.csv_name)}]")
            else:
                self.logger.info(
                    f"Unable to read any valid data from *.csv files in [{directory}]")
            return None
        return dataframe_from_csv

    def read_all_csv(self, directory):
        help_string = (
            " The csv file must contain a header, a target column and at least one feature column."
            " The target column name is set by the <input_column> variable of this run."
            " The default value is 'target'.")
        if self.is_single_file:
            paths = [os.path.join(directory, self.config.csv_name)]
        else:
            paths = glob.glob(os.path.join(directory, "*.csv"), recursive=True)
        frames = []
        for path in paths:
            try:
                self.logger.verbose(
                    f"Attempting to read data from csv [{path}]"
                    f" with delimiter [{self.config.delimiter}]")
                frame = pd.read_csv(path,
                                    error_bad_lines=False,
                                    delimiter=self.config.delimiter,
                                    encoding=self.config.encoding)
            except Exception as e:
                self.logger.info(f"Failed to read csv [{path}], exception:\n{e}")
                continue
            if self.target_column_name not in frame.columns:
                self.logger.info(
                    f"File [{path}] does not have column [{self.target_column_name}] in header "
                    f"{list(frame.columns)}, skipping this file." + help_string)
                continue
            frames.append(frame)
            self.logger.verbose(f"Read [{len(frame)}] data points from [{path}]\n")
        if len(frames) == 0:
            return None
        coalesced = reduce(lambda a, b: a.append(b), frames[1:], frames[0])
        return coalesced

    def validate(self, for_train=True):
        # Any successfully read data is considered valid; no per-class checks here.
        return self.raw_data is not None

    @staticmethod
    def write_dataframe(frame, name, directory=None):
        config = Config()
        if directory is None:
            path = os.path.join(config.artifacts_directory, f"{name}.csv")
        else:
            path = os.path.join(directory, f"{name}.csv")
        frame.to_csv(path, index=False, sep=config.delimiter)
        checksum = DataManager.checksum(path)
        return checksum

    @staticmethod
    def checksum(path):
        hasher = hashlib.md5()
        with open(path, 'rb') as infile:
            hasher.update(infile.read())
        return hasher.hexdigest()

    def get_data(self):
        return self.raw_data

    def get_target_column(self):
        return self.target_column_name

    def get_feature_columns(self):
        self.feature_column_names = list(
            self.raw_data.drop(self.target_column_name, axis=1).columns)
        return self.feature_column_names
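# --- Usage sketch (illustrative; not part of the original package) ---
# Demonstrates write_dataframe() and checksum(): the frame is written as
# "<name>.csv" with the configured delimiter and the MD5 digest of the written
# file is returned. "output_dir", the column names and the helper name are
# hypothetical.
def _demo_write_and_checksum():  # hypothetical helper, for illustration only
    frame = pd.DataFrame({"feature_1": [1.0, 2.0], "target": [0.5, 1.5]})
    digest = DataManager.write_dataframe(frame, "train", directory="output_dir")
    # write_dataframe() returns the checksum of the file it just wrote
    assert digest == DataManager.checksum(os.path.join("output_dir", "train.csv"))
    return digest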