Example #1
def get_pickled_data(file_path):
    obj = get_obj_or_dump(file_path)

    if obj is None:
        #may_debug(True)

        return get_obj_or_dump(f"{ROOT_PATH}/input/clean-pickle-for-jigsaw-toxicity/{file_path}")

    return obj
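`get_obj_or_dump` is a project helper from kernel_utils; the examples only show its call sites. As a rough sketch of the contract they imply (load when called with just a path, dump and return when given `default=`; this is an assumption inferred from usage, not the project's actual code):

import os
import pickle

def get_obj_or_dump(file_path, default=None):
    # Guessed stand-in for the kernel_utils helper, inferred from its call sites.
    if default is None:
        # Load mode: return the unpickled object, or None if the file is absent.
        if not os.path.exists(file_path):
            return None
        with open(file_path, "rb") as f:
            return pickle.load(f)
    # Dump mode: cache `default` to disk and hand it back.
    with open(file_path, "wb") as f:
        pickle.dump(default, f)
    return default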
Example #2
def get_open_subtitles():
    df_ot = get_pickled_data("ot.pkl")

    if df_ot is None:
        df_ot = pd.read_csv(f'{ROOT_PATH}/input/open-subtitles-toxic-pseudo-labeling/open-subtitles-synthesic.csv',
                            index_col='id')[['comment_text', 'toxic', 'lang']]
        df_ot = df_ot[~df_ot['comment_text'].isna()]
        df_ot['comment_text'] = df_ot.parallel_apply(
            lambda x: clean_text(x['comment_text'], x['lang']), axis=1)
        df_ot = df_ot.drop_duplicates(subset='comment_text')
        df_ot['toxic'] = df_ot['toxic'].round().astype(int)  # np.int was removed from NumPy; use the builtin int
        get_obj_or_dump("ot.pkl", default=df_ot)

    return df_ot
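`parallel_apply` is not stock pandas; it comes from the pandarallel package, which has to be initialized once per process before any of these snippets will run. A minimal setup (the lambda is illustrative):

import pandas as pd
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=False)  # spawns the worker pool

df = pd.DataFrame({"comment_text": ["Hello THERE", "Bonjour"], "lang": ["en", "fr"]})
# Same calling pattern as above: one row in, one cleaned string out.
df["comment_text"] = df.parallel_apply(lambda x: x["comment_text"].lower(), axis=1)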
Example #3
    def prepare_train_dev_data(self):
        """prepare_train_dev_data."""
        df_train = get_pickled_data("train_XLM.pkl")

        if df_train is None:
            df_train = pd.read_csv(f'{ROOT_PATH}/input/jigsaw-toxicity-train-data-with-aux/train_data.csv')
            df_train['comment_text'] = df_train.parallel_apply(lambda x: clean_text(x['comment_text'], x['lang']), axis=1)
            get_obj_or_dump("train_XLM.pkl", default=df_train)

        #supliment_toxic = get_toxic_comments(df_train)
        self.train_dataset = DatasetRetriever(
            labels_or_ids=df_train['toxic'].values,
            comment_texts=df_train['comment_text'].values,
            langs=df_train['lang'].values,
            severe_toxic=df_train['severe_toxic'].values,
            obscene=df_train['obscene'].values,
            threat=df_train['threat'].values,
            insult=df_train['insult'].values,
            identity_hate=df_train['identity_hate'].values,
            use_train_transforms=True,
            transformers=self.transformers
        )
        df_val = get_pickled_data("val_XLM.pkl")

        if df_val is None:
            df_val = pd.read_csv(f'{ROOT_PATH}/input/jigsaw-multilingual-toxic-comment-classification/validation.csv', index_col='id')
            df_val['comment_text'] = df_val.parallel_apply(lambda x: clean_text(x['comment_text'], x['lang']), axis=1)
            get_obj_or_dump("val_XLM.pkl", default=df_val)

        self.validation_tune_dataset = DatasetRetriever(
            labels_or_ids=df_val['toxic'].values,
            comment_texts=df_val['comment_text'].values,
            langs=df_val['lang'].values,
            use_train_transforms=True,
            transformers=self.transformers
        )
        self.validation_dataset = DatasetRetriever(
            labels_or_ids=df_val['toxic'].values,
            comment_texts=df_val['comment_text'].values,
            langs=df_val['lang'].values,
            use_train_transforms=False,
            transformers=self.transformers
        )

        del df_val
        gc.collect()

        del df_train
        gc.collect()
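`DatasetRetriever` is project code that is not shown here. A bare-bones torch `Dataset` with a compatible surface, assuming `transformers` holds a Hugging Face tokenizer (field names and `max_len` are guesses):

import torch
from torch.utils.data import Dataset

class MinimalDatasetRetriever(Dataset):
    # Hypothetical stand-in: text in, token ids / attention mask / label out.
    def __init__(self, labels_or_ids, comment_texts, langs,
                 transformers, max_len=224, **kwargs):
        self.labels_or_ids = labels_or_ids
        self.comment_texts = comment_texts
        self.langs = langs
        self.tokenizer = transformers
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.comment_texts[idx],
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
            return_tensors="pt",
        )
        label = torch.tensor(self.labels_or_ids[idx])
        return enc["input_ids"].squeeze(0), enc["attention_mask"].squeeze(0), label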
Example #4
    def check_predict_details(self):
        # Thanks https://discuss.pytorch.org/t/how-can-l-load-my-best-model-as-a-feature-extractor-evaluator/17254/6
        assert self.model_ft is not None
        activations = kernel_utils.get_obj_or_dump("dev_output_results.pkl")
        self.analyzer = TorchModelAnalyzer(self)
        analyzer = self.analyzer

        if activations is not None:
            self.analyzer.activation = activations
        else:
            analyzer.register_forward_hook(
                self.model_ft.roi_heads,
                analyzer.get_output_saved("roi_heads"))
            # Observed output shapes:
            #   mask_head: torch.Size([40, 256, 14, 14])
            #     (mask_fcn4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            #     (relu4): ReLU(inplace)
            #   mask_predictor: torch.Size([46, 2, 28, 28]) (in evaluate mode)
            #   roi_heads: a 2-tuple of (detections, losses); detections is a list of
            #     per-image dicts like {'boxes', 'labels', 'scores': torch.Size([7]),
            #     'masks': torch.Size([7, 1, 28, 28])}
            #   In training mode, roi_heads returns a varying number of detections,
            #     depending on the score threshold.

            self.eval_model_loss(
                self.model_ft,
                self.data_loader_dev,
                self.device,
                self.metric_logger,
                print_freq=150,
            )
            kernel_utils.dump_obj(analyzer.activation,
                                  "dev_output_results.pkl",
                                  force=True)

        roi_acts = []
        for acts in analyzer.activation["roi_heads"]:
            roi_acts += acts[0]
        self.analyzer.test_out_threshold(roi_acts)
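`register_forward_hook` / `get_output_saved` here are analyzer wrappers around PyTorch's own hook mechanism. The underlying pattern, in plain PyTorch:

import torch
import torch.nn as nn

activation = {}

def save_output(name):
    # Build a hook that appends the module's output under `name`.
    def hook(module, inputs, output):
        activation.setdefault(name, []).append(output.detach())
    return hook

model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))
handle = model[1].register_forward_hook(save_output("relu"))

model(torch.randn(3, 4))   # the hook fires during this forward pass
handle.remove()            # detach once the activations are captured
print(activation["relu"][0].shape)  # torch.Size([3, 8])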
Example #5
def get_pickled_data(file_path):
    """get_pickled_data from current folder or kaggle data input folder.

    Args:
      file_path: 

    Returns:

    """
    obj = get_obj_or_dump(file_path)

    if obj is None:
        #may_debug(True)

        return get_obj_or_dump(f"{ROOT_PATH}/input/clean-pickle-for-jigsaw-toxicity/{file_path}")

    return obj
Example #6
    def load_state_data_only(self, stage, file_name="run_state.pkl"):
        if stage is not None:
            file_name = f"run_state_{stage}.pkl"
        kernel_utils.logger.debug(f"restore from {file_name}")
        self_data = kernel_utils.get_obj_or_dump(filename=file_name)

        self._stage = self_data["_stage"]

        # self.model_ft = self_data['model_ft']
        self.num_epochs = self_data["num_epochs"]
        # self.optimizer = self_data['optimizer']
        self.data_loader = self_data["data_loader"]
        self.data_loader_dev = self_data["data_loader_dev"]
        self.device = self_data["device"]
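Something must have dumped `run_state.pkl` for this to restore. A hypothetical save-side counterpart, writing exactly the keys read above with the `kernel_utils.dump_obj` call seen in Example #4:

    def save_state_data_only(self, stage, file_name="run_state.pkl"):
        # Hypothetical counterpart to load_state_data_only (not in the source).
        if stage is not None:
            file_name = f"run_state_{stage}.pkl"
        self_data = {
            "_stage": self._stage,
            "num_epochs": self.num_epochs,
            "data_loader": self.data_loader,
            "data_loader_dev": self.data_loader_dev,
            "device": self.device,
        }
        kernel_utils.dump_obj(self_data, file_name, force=True)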
Example #7
    def _load_state(cls, stage=None, file_name="run_state.pkl"):
        """

        Args:
          file_name: return: the kernel object, need to continue (Default value = "run_state.pkl")
          stage: (Default value = None)

        Returns:
          : the kernel object, need to continue

        """
        if stage is not None:
            file_name = f"run_state_{stage}.pkl"
        logger.debug(f"restore from {file_name}")
        return get_obj_or_dump(filename=file_name)
Example #8
    def prepare_test_data(self):
        """prepare_test_data."""

        if os.path.exists('/content'): # colab
            df_test = get_pickled_data("test_XLM.pkl")
        else:
            df_test = None

        if df_test is None:
            df_test = pd.read_csv(f'{ROOT_PATH}/input/jigsaw-multilingual-toxic-comment-classification/test.csv', index_col='id')
            df_test['comment_text'] = df_test.parallel_apply(lambda x: clean_text(x['content'], x['lang']), axis=1)
            get_obj_or_dump("test_XLM.pkl", default=df_test)

        self.test_dataset = DatasetRetriever(
            labels_or_ids=df_test.index.values,  # test set: row ids instead of labels
            comment_texts=df_test['comment_text'].values,
            langs=df_test['lang'].values,
            use_train_transforms=False,
            test=True,
            transformers=self.transformers
        )

        del df_test
        gc.collect()
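Note `labels_or_ids=df_test.index.values`: the test split has no toxicity labels, so the row ids ride along instead, keeping predictions aligned with the submission file. Downstream consumption would look roughly like this (batch size and loop body are assumptions):

from torch.utils.data import DataLoader

test_loader = DataLoader(self.test_dataset, batch_size=16,
                         shuffle=False, num_workers=2)  # shuffle=False keeps id order

for batch in test_loader:
    ...  # run inference; ids come back in submission order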
Example #9
    def _load_state(cls, stage=None, file_name="run_state.pkl", logger=None):
        """

        Args:
          file_name: return: the kernel object, need to continue (Default value = "run_state.pkl")
          stage: (Default value = None)
          logger: (Default value = None)

        Returns:
          : the kernel object, need to continue

        """

        if stage is not None:
            file_name = f"run_state_{stage}.pkl"

        if logger is not None:
            logger.debug(f"restore from {file_name}")
        self = kernel_utils.get_obj_or_dump(filename=file_name)
        assert self is not None
        self.logger = logger

        return self
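Given the `cls` parameter, `_load_state` is presumably an `@classmethod` in the full source, called on the kernel class itself. Typical usage would be (the class name is hypothetical):

kernel = KaggleKernel._load_state(stage="train", logger=kernel_utils.logger)
# `kernel` is the restored object; the run continues from its saved stage.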
Example #10
TRAIN_PATH = DATA_PATH + "jigsaw-toxic-comment-train.csv"

val_data = None
test_data = None
train_data = None

# +
train_dataset = None
test_dataset = None
valid_dataset = None

if False:
    data_package = get_kaggle_dataset_input(
        "jigsaw-multilingula-toxicity-token-encoded/toxic_fast_tok_512.pk")
    try:
        csv_data_package = get_obj_or_dump("toxic_csv.pk")

        if csv_data_package is None:
            csv_data_package = get_kaggle_dataset_input(
                "jigsaw-multilingula-toxicity-token-encoded/toxic_csv.pk")
    except ModuleNotFoundError as e:
        logger.error("%s", e)
        csv_data_package = None

    if csv_data_package is None:
        val_data = pd.read_csv(VAL_PATH)
        test_data = pd.read_csv(TEST_PATH)
        train_data = pd.read_csv(TRAIN_PATH)
        csv_data_package = (val_data, test_data, train_data)
        get_obj_or_dump("toxic_csv.pk", default=csv_data_package)
    else:
        val_data, test_data, train_data = csv_data_package  # unpack the cached frames
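`get_kaggle_dataset_input` is another project helper; judging by the `{ROOT_PATH}/input/...` paths used throughout these examples, it plausibly reads a pickle out of the Kaggle input folder. A guessed sketch:

import pickle

def get_kaggle_dataset_input(relative_path):
    # Assumed behavior: unpickle a file from the mounted Kaggle input directory.
    with open(f"{ROOT_PATH}/input/{relative_path}", "rb") as f:
        return pickle.load(f)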
Example #11
    return text.strip()


# -

MODEL = 'gpt2-medium'
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# !git clone https://github.com/pennz/kaggle_runner k

# !rsync -r  k/* .
# !python3 -m pip install -e .

# +
X, y, X_valid, Y_valid, X_test = None, None, None, None, None
y = get_obj_or_dump("y.pkl", default=y)
X = get_obj_or_dump("x.pkl", default=X)

X_valid = get_obj_or_dump("xval.pkl", default=X_valid)
Y_valid = get_obj_or_dump("yval.pkl", default=Y_valid)
# -

X_test = get_obj_or_dump("xtest.pkl", default=X_test)

len(X), len(y)

# +


def focal_loss(gamma=2., alpha=.2):
    def focal_loss_fixed(y_true, y_pred):
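The excerpt cuts off inside `focal_loss`. For reference, a common Keras-style binary focal loss body that fits the `gamma`/`alpha` signature above (a sketch of the standard formulation, not necessarily the author's exact code):

import tensorflow as tf
from tensorflow.keras import backend as K

def focal_loss(gamma=2., alpha=.2):
    def focal_loss_fixed(y_true, y_pred):
        # Down-weight easy examples so training focuses on the hard ones.
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
        return (-K.mean(alpha * K.pow(1. - pt_1, gamma) * K.log(K.epsilon() + pt_1))
                - K.mean((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0 + K.epsilon())))
    return focal_loss_fixed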