class PreparationTask(luigi.Task):
    """Clean a raw comments CSV and write a ``*_prepared`` copy.

    Reads ``input_df_file``, removes line breaks from the ``comment_text``
    column and writes the result into ``output_df_folder`` under
    ``<name>_prepared.<ext>``.
    """

    input_df_file = luigi.Parameter(globalconfig().train_data_path)
    output_df_folder = luigi.Parameter(globalconfig().preprocessed_data_folder)

    @property
    def output_file_name(self):
        """Name of the prepared file, derived from the input file name.

        Computed on demand instead of being set as a side effect of
        ``output()`` (the original required ``output()`` to run before
        ``run()`` for the attribute to exist). ``basename``/``splitext``
        replace the ``os.path.sep`` split so both '/' and '\\' separated
        paths work.
        """
        base, extension = os.path.splitext(os.path.basename(self.input_df_file))
        return f'{base}_prepared{extension}'

    def clean_df(self, df, column_name):
        """Remove line breaks from the specified column.

        Arguments:
            df {pd.DataFrame} -- data to process
            column_name {string} -- dataFrame column to clean

        Returns:
            pd.DataFrame -- processed dataframe (modified in place)
        """
        df[column_name] = df[column_name].fillna('').str.replace('\n', ' ')
        return df

    def output(self):
        return luigi.LocalTarget(
            os.path.join(self.output_df_folder, self.output_file_name))

    def run(self):
        df = pd.read_csv(self.input_df_file)
        logger.info('processing {}'.format(self.input_df_file))
        df_clean = self.clean_df(df, 'comment_text')
        logger.info('writing {} to {}'.format(self.output_file_name,
                                              self.output_df_folder))
        with self.output().open('w') as f:
            df_clean.to_csv(f, index=False, encoding='utf-8')
def get_artefacts():
    """Download the latest model artefacts from MLFlow into local folders.

    Fetches the tfidf vectorizer, plus one MNB featurizer and one logistic
    regression model per toxicity category. The triplicated download logic
    of the original is folded into a single local helper.
    """
    client = mlflow.tracking.MlflowClient()
    try_mkdir(globalconfig().featurizers_artefacts_folder)
    try_mkdir(globalconfig().model_artefacts_folder)

    def _download_latest(experiment_name, artifact_name, destination):
        # Take run [0] of the experiment, as the original did — presumably
        # the most recent run; verify list_run_infos ordering if it matters.
        experiment = client.get_experiment_by_name(experiment_name)
        run_id = client.list_run_infos(experiment.experiment_id)[0].run_id
        client.download_artifacts(run_id, artifact_name, destination)

    _download_latest('/tfidf', 'tfidf_vectorizer.pkl',
                     globalconfig().featurizers_artefacts_folder)
    for category in ['toxic', 'severe_toxic', 'obscene', 'threat',
                     'insult', 'identity_hate']:
        _download_latest(f'/mnb_category_{category}',
                         f'mnb_featurizer_{category}.pkl',
                         globalconfig().featurizers_artefacts_folder)
        _download_latest(f'/lr_category_{category}',
                         f'{category}_lr.pkl',
                         globalconfig().model_artefacts_folder)
class TrainLogRegAllWrapperTask(luigi.WrapperTask):
    """Wrapper task that fans out one TrainLogRegTask per toxicity category."""

    input_file_path = luigi.Parameter(default='./data/prepared/train_prepared.csv')
    input_features_path = luigi.Parameter(globalconfig().featurized_data_folder)
    output_artefact_path = luigi.Parameter(globalconfig().model_artefacts_folder)

    def requires(self):
        categories = ('toxic', 'severe_toxic', 'obscene',
                      'threat', 'insult', 'identity_hate')
        for label in categories:
            yield TrainLogRegTask(
                input_file_path=self.input_file_path,
                input_features_path=self.input_features_path,
                output_artefact_path=self.output_artefact_path,
                category_name=label)
def build_pipeline(self, category: str) -> Pipeline:
    """Assemble the inference pipeline for *category*.

    Loads the pickled MNB featurizer and logistic regression model from
    the configured artefact folders and chains them into a Pipeline.
    """
    featurizer_path = os.path.join(
        globalconfig().featurizers_artefacts_folder,
        f'mnb_featurizer_{category}.pkl')
    model_path = os.path.join(
        globalconfig().model_artefacts_folder,
        f'{category}_lr.pkl')
    with open(featurizer_path, 'rb') as handle:
        featurizer = pickle.load(handle)
    with open(model_path, 'rb') as handle:
        model = pickle.load(handle)
    return Pipeline([('mnb', featurizer), ('lr', model)])
class TrainLogRegTask(luigi.Task):
    """Train a cross-validated logistic regression for one toxicity category
    on pre-computed MNB features and pickle the fitted model."""

    input_file_path = luigi.Parameter(default='./data/prepared/train_prepared.csv')
    input_features_path = luigi.Parameter(globalconfig().featurized_data_folder)
    category_name = luigi.Parameter()
    output_artefact_path = luigi.Parameter(globalconfig().model_artefacts_folder)

    def requires(self):
        return GenerateMNBFeaturesTask(
            input_file_path=self.input_file_path,
            input_artefact_path=globalconfig().featurizers_artefacts_folder,
            data_output_path=globalconfig().featurized_data_folder,
            category_name=self.category_name)

    def output(self):
        output_path = os.path.join(self.output_artefact_path,
                                   f'{self.category_name}_lr.pkl')
        return luigi.LocalTarget(output_path)

    def run(self):
        logger.info(f'Reading data from {self.input_file_path}')
        data_df = pd.read_csv(self.input_file_path)
        try_mkdir(self.output_artefact_path)
        # basename/splitext instead of split('/') so Windows-style
        # separators also work.
        features_file_name = os.path.splitext(
            os.path.basename(self.input_file_path))[0]
        features_path = os.path.join(
            self.input_features_path,
            f'{features_file_name}_{self.category_name}_features.pkl')
        with open(features_path, 'rb') as f:
            features = pickle.load(f)
        C_s = [0.01, 0.1, 1, 10, 100]
        model = LogisticRegressionCV(C_s, cv=5, n_jobs=-1, max_iter=1000,
                                     scoring=make_scorer(roc_auc_score))
        logger.info(f"Fitting lr for category {self.category_name}")
        model.fit(features, data_df[self.category_name])
        with open(self.output().path, 'wb') as f:
            logger.info(f"Saving predictor locally for category {self.category_name}")
            pickle.dump(model, f)
        try:
            mlflow.set_experiment(f'/lr_category_{self.category_name}')
            with mlflow.start_run():
                logger.info("Sending cv parameters and scores to ML Flow")
                # BUG FIX: the original enumerated model.C_ (the chosen C per
                # class — a single value for a binary target), while indexing
                # C_s and the score columns with the same i, so only the first
                # candidate was ever logged. Iterate the candidate grid itself.
                for i, c in enumerate(C_s):
                    mlflow.log_param(f'C_{i}', c)
                    # scores_[1]: folds x candidate-Cs scores for the
                    # positive class, as in the original indexing.
                    mlflow.log_metric(f"mean_roc_auc_C_{c}",
                                      np.mean(model.scores_[1][:, i]))
                logger.info("Sending model artifact to ML Flow")
                mlflow.log_artifact(self.output().path)
        except Exception:
            # The original logger.error("...: ", e) dropped e entirely (no
            # %s placeholder); logger.exception logs the full traceback.
            logger.exception(
                "Something went wrong while trying to use MLFlow tracking")
class GenerateFeaturesWrapperTask(luigi.WrapperTask):
    """Wrapper task spawning one GenerateMNBFeaturesTask per category."""

    input_file_path = luigi.Parameter('./data/prepared/train_prepared.csv')
    input_artefact_path = luigi.Parameter(
        globalconfig().featurizers_artefacts_folder)
    data_output_path = luigi.Parameter(globalconfig().featurized_data_folder)

    def requires(self):
        categories = ('toxic', 'severe_toxic', 'obscene',
                      'threat', 'insult', 'identity_hate')
        for label in categories:
            yield GenerateMNBFeaturesTask(
                input_file_path=self.input_file_path,
                input_artefact_path=self.input_artefact_path,
                data_output_path=self.data_output_path,
                category_name=label)
def requires(self):
    """Declare the upstream tasks for batch prediction.

    Returns a dict with the prepared test set, the per-category MNB
    feature files for it, and the wrapper that trains all LR models.
    """
    prepared = PreparationTask(
        input_df_file=self.input_batch_data,
        output_df_folder=globalconfig().preprocessed_data_folder)
    requirements = {'prepared_test': prepared}
    categories = ("toxic", "severe_toxic", "obscene",
                  "threat", "insult", "identity_hate")
    for label in categories:
        requirements[f'generated_test_features_{label}'] = \
            GenerateMNBFeaturesTask(
                input_file_path=prepared.output().path,
                input_artefact_path=globalconfig().featurizers_artefacts_folder,
                data_output_path=globalconfig().featurized_data_folder,
                category_name=label)
    requirements['trained_log_reg'] = TrainLogRegAllWrapperTask(
        input_file_path=globalconfig().prepared_train_data_path,
        input_features_path=globalconfig().featurized_data_folder,
        output_artefact_path=globalconfig().model_artefacts_folder)
    return requirements
class TrainTfidfTask(luigi.Task):
    """Fit a tfidf vectorizer (with BPE subword tokenization) on the
    prepared training comments and pickle it."""

    input_file_path = luigi.Parameter(globalconfig().prepared_train_data_path)
    artefact_output_path = luigi.Parameter(
        globalconfig().featurizers_artefacts_folder)

    def requires(self):
        return PreparationTask(
            input_df_file=globalconfig().train_data_path,
            output_df_folder=globalconfig().preprocessed_data_folder)

    def output(self):
        output_name = os.path.join(self.artefact_output_path,
                                   'tfidf_vectorizer.pkl')
        return luigi.LocalTarget(output_name)

    def run(self):
        logger.info('Reading data from {}'.format(self.input_file_path))
        data_df = pd.read_csv(self.input_file_path)
        # English byte-pair-encoding model used as the tfidf tokenizer.
        bpemb_en = BPEmb(lang="en", dim=50, vs=200000,
                         cache_dir='./bpemb_cache')
        tfidf = TfidfVectorizer(tokenizer=bpemb_en.encode)
        logger.info("Fitting tfidf")
        tfidf.fit(data_df['comment_text'])
        try_mkdir(self.artefact_output_path)
        with open(self.output().path, 'wb') as f:
            pickle.dump(tfidf, f)
        try:
            mlflow.set_experiment('/tfidf')
            with mlflow.start_run():
                logger.info("Sending tfidf artefact to MLFlow")
                mlflow.log_artifact(self.output().path)
        except Exception:
            # BUG FIX: logger.error("...", e) passed e as a %-format arg with
            # no placeholder, so the exception text was never logged.
            logger.exception(
                "Something went wrong while trying to use MLFlow tracking")
class GenerateMNBFeaturesTask(luigi.Task):
    """Transform the input comments with the trained per-category MNB
    featurizer and pickle the resulting feature matrix."""

    input_file_path = luigi.Parameter('./data/prepared/train_prepared.csv')
    input_artefact_path = luigi.Parameter(
        globalconfig().featurizers_artefacts_folder)
    data_output_path = luigi.Parameter(globalconfig().featurized_data_folder)
    category_name = luigi.Parameter()

    def requires(self):
        return TrainMNBTask(
            input_file_path=globalconfig().prepared_train_data_path,
            artefact_output_path=self.input_artefact_path,
            category_name=self.category_name)

    def output(self):
        # basename/splitext instead of split('/') so Windows-style
        # separators also work.
        file_name = os.path.splitext(
            os.path.basename(self.input_file_path))[0]
        output_name = os.path.join(
            self.data_output_path,
            f'{file_name}_{self.category_name}_features.pkl')
        return luigi.LocalTarget(output_name)

    def run(self):
        logger.info("Generating features")
        logger.info("Reading data at {}".format(self.input_file_path))
        data_df = pd.read_csv(self.input_file_path)
        featurizer_name = os.path.join(
            self.input_artefact_path,
            f'mnb_featurizer_{self.category_name}.pkl')
        featurizer = MNBFeaturizer.load(featurizer_name)
        # Typo fix: was "Transofrming".
        logger.info('Transforming data')
        transformed = featurizer.transform(data_df['comment_text'])
        try_mkdir(self.data_output_path)
        with open(self.output().path, 'wb') as f:
            pickle.dump(transformed, f)
class TrainMNBTask(luigi.Task):
    """Fit an MNBFeaturizer (tfidf + multinomial naive Bayes) for one
    toxicity category and save it."""

    input_file_path = luigi.Parameter(globalconfig().prepared_train_data_path)
    artefact_output_path = luigi.Parameter(
        globalconfig().featurizers_artefacts_folder)
    category_name = luigi.Parameter()

    def requires(self):
        return TrainTfidfTask(input_file_path=self.input_file_path,
                              artefact_output_path=self.artefact_output_path)

    def output(self):
        output_name = os.path.join(self.artefact_output_path,
                                   f'mnb_featurizer_{self.category_name}.pkl')
        return luigi.LocalTarget(output_name)

    def run(self):
        data_df = pd.read_csv(self.input_file_path)
        # Use self.input() (the declared dependency's target) rather than
        # re-instantiating the requirement via self.requires().output().
        with open(self.input().path, 'rb') as f:
            tfidf = pickle.load(f)
        featurizer = MNBFeaturizer(tfidf)
        logger.info("Fitting MNB for category {}".format(self.category_name))
        featurizer.fit(data_df['comment_text'], data_df[self.category_name])
        try_mkdir(self.artefact_output_path)
        featurizer.save(self.output().path)
        try:
            mlflow.set_experiment(f'/mnb_category_{self.category_name}')
            with mlflow.start_run():
                logger.info("Sending MNB artefact to MLFlow")
                mlflow.log_artifact(self.output().path)
        except Exception:
            # BUG FIX: logger.error("...", e) passed e as a %-format arg with
            # no placeholder, so the exception text was never logged.
            logger.exception(
                "Something went wrong while trying to use MLFlow tracking")
def run(self):
    """Score the input batch with each category's trained LR model and
    write a single prediction CSV (one probability column per category)."""
    logger.info(f'Reading data from {self.input_batch_data}')
    data_df = pd.read_csv(self.input_batch_data)
    # BUG FIX: .copy() — the original assigned new columns onto a slice of
    # data_df, which triggers pandas' chained-assignment warning and relies
    # on unspecified copy-vs-view behavior.
    pred_df = data_df[['id']].copy()
    try_mkdir(self.output_prediction_path)
    for category in ["toxic", "severe_toxic", "obscene", "threat",
                     "insult", "identity_hate"]:
        filename = self.input()[f'generated_test_features_{category}'].path
        with open(filename, 'rb') as f:
            features = pickle.load(f)
        model_path = os.path.join(globalconfig().model_artefacts_folder,
                                  f'{category}_lr.pkl')
        with open(model_path, 'rb') as f:
            lr = pickle.load(f)
        # Column 1 of predict_proba = probability of the positive class.
        pred_df[category] = lr.predict_proba(features)[:, 1]
    pred_df.to_csv(self.output().path, index=False)
def requires(self):
    """Depend on the data-preparation step for the raw training CSV."""
    preparation = PreparationTask(
        input_df_file=globalconfig().train_data_path,
        output_df_folder=globalconfig().preprocessed_data_folder)
    return preparation
def requires(self):
    """Depend on the trained MNB featurizer for this task's category."""
    mnb_training = TrainMNBTask(
        input_file_path=globalconfig().prepared_train_data_path,
        artefact_output_path=self.input_artefact_path,
        category_name=self.category_name)
    return mnb_training
class PredictLogRegTask(luigi.Task):
    """Batch-predict toxicity probabilities for a CSV of comments using the
    trained per-category logistic regression models."""

    input_batch_data = luigi.Parameter(default=globalconfig().test_data_path)
    output_prediction_path = luigi.Parameter(
        default=globalconfig().output_prediction_path)

    def output(self):
        # basename/splitext instead of split('/') so Windows-style
        # separators also work.
        base = os.path.splitext(os.path.basename(self.input_batch_data))[0]
        output_path = os.path.join(self.output_prediction_path,
                                   base + '_prediction.csv')
        return luigi.LocalTarget(output_path)

    def requires(self):
        """Prepared test set, per-category MNB features for it, and the
        wrapper that trains all LR models, keyed by purpose."""
        test_preparation = PreparationTask(
            input_df_file=self.input_batch_data,
            output_df_folder=globalconfig().preprocessed_data_folder)
        requirements_dict = {'prepared_test': test_preparation}
        for category in ["toxic", "severe_toxic", "obscene",
                         "threat", "insult", "identity_hate"]:
            requirements_dict[f'generated_test_features_{category}'] = \
                GenerateMNBFeaturesTask(
                    input_file_path=test_preparation.output().path,
                    input_artefact_path=globalconfig().featurizers_artefacts_folder,
                    data_output_path=globalconfig().featurized_data_folder,
                    category_name=category)
        requirements_dict['trained_log_reg'] = TrainLogRegAllWrapperTask(
            input_file_path=globalconfig().prepared_train_data_path,
            input_features_path=globalconfig().featurized_data_folder,
            output_artefact_path=globalconfig().model_artefacts_folder)
        return requirements_dict

    def run(self):
        logger.info(f'Reading data from {self.input_batch_data}')
        data_df = pd.read_csv(self.input_batch_data)
        # BUG FIX: .copy() — the original assigned new columns onto a slice
        # of data_df, triggering pandas' chained-assignment warning and
        # relying on unspecified copy-vs-view behavior.
        pred_df = data_df[['id']].copy()
        try_mkdir(self.output_prediction_path)
        for category in ["toxic", "severe_toxic", "obscene",
                         "threat", "insult", "identity_hate"]:
            filename = self.input()[f'generated_test_features_{category}'].path
            with open(filename, 'rb') as f:
                features = pickle.load(f)
            with open(os.path.join(globalconfig().model_artefacts_folder,
                                   f'{category}_lr.pkl'), 'rb') as f:
                lr = pickle.load(f)
            # Column 1 of predict_proba = probability of the positive class.
            pred_df[category] = lr.predict_proba(features)[:, 1]
        pred_df.to_csv(self.output().path, index=False)
def requires(self):
    """Depend on the MNB feature generation step for this category."""
    feature_generation = GenerateMNBFeaturesTask(
        input_file_path=self.input_file_path,
        input_artefact_path=globalconfig().featurizers_artefacts_folder,
        data_output_path=globalconfig().featurized_data_folder,
        category_name=self.category_name)
    return feature_generation