def get_pickled_data(file_path):
    """Load a pickled object from the working folder, falling back to the
    published clean-pickle Kaggle input dataset when it is absent locally.
    """
    cached = get_obj_or_dump(file_path)
    if cached is not None:
        return cached
    # Not present locally; retry against the Kaggle dataset copy.
    fallback_path = f"{ROOT_PATH}/input/clean-pickle-for-jigsaw-toxicity/{file_path}"
    return get_obj_or_dump(fallback_path)
def get_open_subtitles():
    """Load the Open Subtitles pseudo-labeled toxicity dataframe.

    Tries the cached pickle ("ot.pkl") first; otherwise reads the raw CSV,
    drops missing/duplicate comments, cleans the text per language, binarizes
    the soft toxicity labels, and caches the result for subsequent runs.

    Returns:
        DataFrame with columns ['comment_text', 'toxic', 'lang'] — presumably
        one row per unique cleaned comment (verify against downstream users).
    """
    df_ot = get_pickled_data("ot.pkl")
    if df_ot is None:
        df_ot = pd.read_csv(
            f'{ROOT_PATH}/input/open-subtitles-toxic-pseudo-labeling/open-subtitles-synthesic.csv',
            index_col='id')[['comment_text', 'toxic', 'lang']]
        # Rows without text cannot be cleaned or labeled meaningfully.
        df_ot = df_ot[~df_ot['comment_text'].isna()]
        df_ot['comment_text'] = df_ot.parallel_apply(
            lambda x: clean_text(x['comment_text'], x['lang']), axis=1)
        df_ot = df_ot.drop_duplicates(subset='comment_text')
        # FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin int is the documented replacement and is equivalent here.
        df_ot['toxic'] = df_ot['toxic'].round().astype(int)
        get_obj_or_dump("ot.pkl", default=df_ot)  # cache for the next run
    return df_ot
def prepare_train_dev_data(self):
    """Build the train / validation-tune / validation datasets.

    Loads the (cached) multilingual train and validation dataframes, cleans
    their comment text, and wraps them in ``DatasetRetriever`` instances
    stored on ``self.train_dataset``, ``self.validation_tune_dataset`` and
    ``self.validation_dataset``. The dataframes are deleted afterwards to
    release memory; only the extracted numpy arrays are retained.
    """
    # Training data: cached pickle first, else raw CSV with aux toxicity columns.
    df_train = get_pickled_data("train_XLM.pkl")
    if df_train is None:
        df_train = pd.read_csv(f'{ROOT_PATH}/input/jigsaw-toxicity-train-data-with-aux/train_data.csv')
        # parallel_apply: pandarallel-style parallel row-wise cleaning — TODO confirm it is initialized by the caller.
        df_train['comment_text'] = df_train.parallel_apply(lambda x: clean_text(x['comment_text'], x['lang']), axis=1)
        get_obj_or_dump("train_XLM.pkl", default=df_train)  # cache the cleaned frame
    #supliment_toxic = get_toxic_comments(df_train)
    # Train split carries the auxiliary toxicity sub-labels and train-time transforms.
    self.train_dataset = DatasetRetriever(
        labels_or_ids=df_train['toxic'].values,
        comment_texts=df_train['comment_text'].values,
        langs=df_train['lang'].values,
        severe_toxic=df_train['severe_toxic'].values,
        obscene=df_train['obscene'].values,
        threat=df_train['threat'].values,
        insult=df_train['insult'].values,
        identity_hate=df_train['identity_hate'].values,
        use_train_transforms=True,
        transformers=self.transformers
    )
    # Validation data: cached pickle first, else the competition validation CSV.
    df_val = get_pickled_data("val_XLM.pkl")
    if df_val is None:
        df_val = pd.read_csv(f'{ROOT_PATH}/input/jigsaw-multilingual-toxic-comment-classification/validation.csv',
                             index_col='id')
        df_val['comment_text'] = df_val.parallel_apply(lambda x: clean_text(x['comment_text'], x['lang']), axis=1)
        get_obj_or_dump("val_XLM.pkl", default=df_val)  # cache the cleaned frame
    # Same validation rows twice: once with train-time transforms (for fine-tuning
    # on validation data) and once without (for plain evaluation).
    self.validation_tune_dataset = DatasetRetriever(
        labels_or_ids=df_val['toxic'].values,
        comment_texts=df_val['comment_text'].values,
        langs=df_val['lang'].values,
        use_train_transforms=True,
        transformers=self.transformers
    )
    self.validation_dataset = DatasetRetriever(
        labels_or_ids=df_val['toxic'].values,
        comment_texts=df_val['comment_text'].values,
        langs=df_val['lang'].values,
        use_train_transforms=False,
        transformers=self.transformers
    )
    # Free the large dataframes; the datasets keep only the extracted arrays.
    del df_val
    gc.collect();
    del df_train
    gc.collect();
def check_predict_details(self):
    """Inspect dev-set predictions via a forward hook on ``roi_heads``.

    Uses cached activations from "dev_output_results.pkl" when available;
    otherwise registers a forward hook, runs dev evaluation to capture the
    roi_heads outputs, and dumps them for reuse. Finally feeds the collected
    per-image detections to the analyzer's threshold test.
    """
    # Thanks https://discuss.pytorch.org/t/how-can-l-load-my-best-model-as-a-feature-extractor-evaluator/17254/6
    assert self.model_ft is not None
    activations = kernel_utils.get_obj_or_dump("dev_output_results.pkl")
    self.analyzer = TorchModelAnalyzer(self)
    analyzer = self.analyzer
    if activations is not None:
        # Cached run: skip the (expensive) evaluation pass entirely.
        self.analyzer.activation = activations
    else:
        analyzer.register_forward_hook(
            self.model_ft.roi_heads, analyzer.get_output_saved("roi_heads"))
        # for mask_head output, size torch.Size([40, 256, 14, 14]),
        # (mask_fcn4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        # (relu4): ReLU(inplace)
        # for mask_predictor, torch.Size([46, 2, 28, 28]) (in evaluate mode)
        # for roi_heads, output.shape, tuple length 2, ([{'boxes', 'labels', 'scores':torch.Size([7]), 'masks':torch.Size([7, 1, 28, 28])},T,T,T],[ PLACE_FOR_LOSSES ])
        # for roi_heads, in training mode, output different number of test result, according to threshold thing
        self.eval_model_loss(
            self.model_ft,
            self.data_loader_dev,
            self.device,
            self.metric_logger,
            print_freq=150,
        )
        # Persist the captured activations so the next call can skip evaluation.
        kernel_utils.dump_obj(analyzer.activation, "dev_output_results.pkl", force=True)
    # Flatten: each hooked call stored (detections_list, losses); take the
    # detections (acts[0]) and concatenate across batches.
    roi_acts = []
    for acts in analyzer.activation["roi_heads"]:
        roi_acts += acts[0]
    self.analyzer.test_out_threshold(roi_acts)
def get_pickled_data(file_path):
    """get_pickled_data from current folder or kaggle data input folder.

    Args:
        file_path: pickle file name (or relative path) to look up.

    Returns:
        The unpickled object, or whatever the Kaggle-input fallback lookup
        yields (possibly None) when the local file is missing.
    """
    local_obj = get_obj_or_dump(file_path)
    if local_obj is not None:
        return local_obj
    #may_debug(True)
    kaggle_copy = f"{ROOT_PATH}/input/clean-pickle-for-jigsaw-toxicity/{file_path}"
    return get_obj_or_dump(kaggle_copy)
def load_state_data_only(self, stage, file_name="run_state.pkl"):
    """Restore plain-data run state from a pickled snapshot.

    Args:
        stage: optional stage label; when given, the snapshot file becomes
            ``run_state_{stage}.pkl`` instead of ``file_name``.
        file_name: snapshot file to load (Default value = "run_state.pkl").

    Only data members are restored here; model and optimizer are intentionally
    left out (see the commented lines) — presumably they are rebuilt elsewhere.
    """
    if stage is not None:
        file_name = f"run_state_{stage}.pkl"
    kernel_utils.logger.debug(f"restore from {file_name}")
    self_data = kernel_utils.get_obj_or_dump(filename=file_name)
    self._stage = self_data["_stage"]
    # self.model_ft = self_data['model_ft']
    self.num_epochs = self_data["num_epochs"]
    # self.optimizer = self_data['optimizer']
    self.data_loader = self_data["data_loader"]
    self.data_loader_dev = self_data["data_loader_dev"]
    self.device = self_data["device"]
def _load_state(cls, stage=None, file_name="run_state.pkl"):
    """Restore a pickled kernel object so a previous run can continue.

    Args:
        stage: optional stage label; when set, loads ``run_state_{stage}.pkl``
            instead of ``file_name`` (Default value = None).
        file_name: snapshot to restore (Default value = "run_state.pkl").

    Returns:
        The unpickled kernel object (or None when nothing was found).
    """
    if stage is not None:
        file_name = f"run_state_{stage}.pkl"
    logger.debug(f"restore from {file_name}")
    restored = get_obj_or_dump(filename=file_name)
    return restored
def prepare_test_data(self):
    """Build ``self.test_dataset`` from the competition test CSV.

    On Colab (detected via the /content folder) a cached cleaned pickle is
    tried first; elsewhere the CSV is always re-read and cleaned. The raw
    dataframe is released after the dataset captures its arrays.
    """
    if os.path.exists('/content'):  # colab
        df_test = get_pickled_data("test_XLM.pkl")
    else:
        df_test = None
    if df_test is None:
        df_test = pd.read_csv(f'{ROOT_PATH}/input/jigsaw-multilingual-toxic-comment-classification/test.csv',
                              index_col='id')
        # NOTE: the raw column is 'content' here (not 'comment_text' as in
        # train/val); the cleaned text is written to 'comment_text'.
        df_test['comment_text'] = df_test.parallel_apply(lambda x: clean_text(x['content'], x['lang']), axis=1)
        get_obj_or_dump("test_XLM.pkl", default=df_test)  # cache the cleaned frame
    self.test_dataset = DatasetRetriever(
        labels_or_ids=df_test.index.values,  ## here different!!! test has no labels, so row ids are carried instead
        comment_texts=df_test['comment_text'].values,
        langs=df_test['lang'].values,
        use_train_transforms=False,
        test=True,
        transformers=self.transformers
    )
    # Free the dataframe; the dataset keeps only the extracted arrays.
    del df_test
    gc.collect();
def _load_state(cls, stage=None, file_name="run_state.pkl", logger=None):
    """Restore a pickled kernel object so a previous run can continue.

    Args:
        stage: optional stage label; when set, loads ``run_state_{stage}.pkl``
            instead of ``file_name`` (Default value = None).
        file_name: snapshot to restore (Default value = "run_state.pkl").
        logger: optional logger; attached to the restored object
            (Default value = None).

    Returns:
        The unpickled kernel object, ready to continue running.
    """
    if stage is not None:
        file_name = f"run_state_{stage}.pkl"
    if logger is not None:
        logger.debug(f"restore from {file_name}")
    restored = kernel_utils.get_obj_or_dump(filename=file_name)
    # A missing/corrupt snapshot is a hard error for resuming a run.
    assert restored is not None
    restored.logger = logger
    return restored
TRAIN_PATH = DATA_PATH + "jigsaw-toxic-comment-train.csv" val_data = None test_data = None train_data = None # + train_dataset = None test_dataset = None valid_dataset = None if False: data_package = get_kaggle_dataset_input( "jigsaw-multilingula-toxicity-token-encoded/toxic_fast_tok_512.pk") try: csv_data_package = get_obj_or_dump("toxic_csv.pk") if csv_data_package is None: csv_data_package = get_kaggle_dataset_input( "jigsaw-multilingula-toxicity-token-encoded/toxic_csv.pk") except ModuleNotFoundError as e: logger.error("%s", e) csv_data_package = None if csv_data_package is None: val_data = pd.read_csv(VAL_PATH) test_data = pd.read_csv(TEST_PATH) train_data = pd.read_csv(TRAIN_PATH) csv_data_package = (val_data, test_data, train_data) get_obj_or_dump("toxic_csv.pk", default=csv_data_package) else:
return text.strip() # - MODEL = 'gpt2-medium' tokenizer = AutoTokenizer.from_pretrained(MODEL) # !git clone https://github.com/pennz/kaggle_runner k # !rsync -r k/* . # !python3 -m pip install -e . # + X, y, X_valid, Y_valid, X_test = None, None, None, None, None y = get_obj_or_dump("y.pkl", default=y) X = get_obj_or_dump("x.pkl", default=X) X_valid = get_obj_or_dump("xval.pkl", default=X_valid) Y_valid = get_obj_or_dump("yval.pkl", default=Y_valid) # - X_test = get_obj_or_dump("xtest.pkl", default=X_test) len(X), len(y) # + def focal_loss(gamma=2., alpha=.2): def focal_loss_fixed(y_true, y_pred):