Example #1
def tokenize_character(X_text_list_train, X_text_list_test, max_sent_len=800):
    """
    Tokenizes characters at word level
    :param X_text_list_train:
    :param X_text_list_test:
    :param max_sent_len:
    :return: x_char_encoder, x_char_padded_train, x_char_padded_test, max_word_length
    """
    X_text_list_train = [
        lst[:max_sent_len] + (max_sent_len - len(lst)) * ["<end>"]
        for lst in X_text_list_train
    ]
    X_text_list_test = [
        lst[:max_sent_len] + (max_sent_len - len(lst)) * ["<end>"]
        for lst in X_text_list_test
    ]

    x_char_encoder = CharacterEncoder(
        sample=[" ".join(sent) for sent in X_text_list_train], append_eos=False,
    )

    # encode each word of each sentence as a tensor of character indices
    x_char_encoded_train = [
        [x_char_encoder.encode(word) for word in sent] for sent in X_text_list_train
    ]
    x_char_encoded_test = [
        [x_char_encoder.encode(word) for word in sent] for sent in X_text_list_test
    ]

    # longest word (in characters) across the training set
    max_word_length = max(
        max(word.shape[0] for word in sent) for sent in x_char_encoded_train
    )

    outer_list = []
    for lst in x_char_encoded_train:
        inner_list = []
        for ten in lst:
            res = torch.zeros(max_word_length, dtype=torch.long)
            res[: ten.shape[0]] = ten[:max_word_length]
            inner_list.append(res)
        outer_list.append(inner_list)

    x_char_padded_train = torch.stack([torch.stack(lst) for lst in outer_list])

    outer_list = []
    for lst in x_char_encoded_test:
        inner_list = []
        for ten in lst:
            res = torch.zeros(max_word_length, dtype=torch.long)
            res[: ten.shape[0]] = ten[:max_word_length]
            inner_list.append(res)
        outer_list.append(inner_list)

    x_char_padded_test = torch.stack([torch.stack(lst) for lst in outer_list])
    return x_char_encoder, x_char_padded_train, x_char_padded_test, max_word_length
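A minimal usage sketch, assuming torch and torchnlp are installed and the tokenize_character function above is in scope; the toy sentences are made up.

import torch  # required by tokenize_character
from torchnlp.encoders.text import CharacterEncoder  # required by tokenize_character

train_sents = [["hello", "world"], ["character", "level", "tokens"]]
test_sents = [["hello", "again"]]

x_char_encoder, chars_train, chars_test, max_word_len = tokenize_character(
    train_sents, test_sents, max_sent_len=8
)
print(chars_train.shape)  # (num_sentences, max_sent_len, max_word_len)
print(chars_test.shape)
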
Example #2
    def train(
        self, pair_dataset, append_eos=True, append_sos=True, min_occurrences=1000
    ):
        """train a tokenizer"""

        # create generator based on slice of data (3*10^5 sentences)
        dataset_example_gen = (
            ex["correct"] + " " + ex["incorrect"]
            for ex in itr.islice(pair_dataset, self._tokenizer_max_seq)
        )

        self.tokenizer = CharacterEncoder(
            dataset_example_gen,
            append_eos=append_eos,
            append_sos=append_sos,
            min_occurrences=min_occurrences,
        )

        # after training set the variables
        self.vocab_size = self.tokenizer.vocab_size
        self.padding_index = self.tokenizer.padding_index  # =0
Example #3
def tokenize_character(X_text_list_train, x_padded_train, x_padded_test, x_encoder):
    x_char_encoder = CharacterEncoder(
        sample=X_text_list_train,
        append_eos=False,
    )
    x_char_encoded_train = [[
        x_char_encoder.encode(x_encoder.index_to_token[word.item()])
        for word in text
    ] for text in x_padded_train]
    MAX_WORD_LENGTH = max([
        max([internal.shape[0] for internal in external])
        for external in x_char_encoded_train
    ])
    # x_char_padded = max([max([internal.shape[0] for internal in external]) for external in x_char_encoded])
    # x_char_padded = torch.LongTensor(pad_sequence(x_char_encoded, MAX_SENTENCE_LEN+1))
    outer_list = []
    for lst in x_char_encoded_train:
        inner_list = []
        for ten in lst:
            res = torch.zeros(MAX_WORD_LENGTH, dtype=torch.long)
            res[:ten.shape[0]] = ten
            inner_list.append(res)
        outer_list.append(inner_list)

    x_char_padded_train = torch.stack([torch.stack(lst) for lst in outer_list])

    x_char_encoded_test = [[
        x_char_encoder.encode(x_encoder.index_to_token[word.item()]) for word in text
    ] for text in x_padded_test]
    outer_list = []
    for lst in x_char_encoded_test:
        inner_list = []
        for ten in lst:
            res = torch.zeros(MAX_WORD_LENGTH, dtype=torch.long)
            # truncate in case a test word is longer than the longest training word
            res[:ten.shape[0]] = ten[:MAX_WORD_LENGTH]
            inner_list.append(res)
        outer_list.append(inner_list)

    x_char_padded_test = torch.stack([torch.stack(lst) for lst in outer_list])

    return x_char_encoder, x_char_padded_train, x_char_padded_test
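This variant looks words up from an already-padded id tensor through x_encoder.index_to_token before character-encoding them. A rough sketch of how it might be driven, with hypothetical toy data (WhitespaceEncoder and pad_sequence are just one way to produce the word-level ids):

import torch  # required by tokenize_character
from torch.nn.utils.rnn import pad_sequence
from torchnlp.encoders.text import WhitespaceEncoder

sentences = ["hello brave new world", "hello again"]
x_encoder = WhitespaceEncoder(sentences)

# word-level ids padded to equal length (index 0 decodes to the padding token)
x_padded_train = pad_sequence([x_encoder.encode(s) for s in sentences], batch_first=True)
x_padded_test = pad_sequence([x_encoder.encode("hello world")], batch_first=True)

char_enc, char_train, char_test = tokenize_character(
    sentences, x_padded_train, x_padded_test, x_encoder
)
print(char_train.shape)  # (batch, words, MAX_WORD_LENGTH)
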
Example #4
class CharTokenizerEncoder(TokenizerEncoder):
    def __init__(self):

        self.vocab_size = None
        self.padding_index = None

        # determine how many sequences we take to build the vocabulary
        self._tokenizer_max_seq = 3 * 10 ** 5

        # pickle tokenizer file name
        self.tokenizer_name = "char_corrector_tokenizer"

    def train(
        self, pair_dataset, append_eos=True, append_sos=True, min_occurrences=1000
    ):
        """train a tokenizer"""

        # create generator based on slice of data (3*10^5 sentences)
        dataset_example_gen = (
            ex["correct"] + " " + ex["incorrect"]
            for ex in itr.islice(pair_dataset, self._tokenizer_max_seq)
        )

        self.tokenizer = CharacterEncoder(
            dataset_example_gen,
            append_eos=append_eos,
            append_sos=append_sos,
            min_occurrences=min_occurrences,
        )

        # after training set the variables
        self.vocab_size = self.tokenizer.vocab_size
        self.padding_index = self.tokenizer.padding_index  # =0

    def encode(self, text):
        pass

    def encode_batch(self, samples):
        """
        Encodes list of strings

        Args:
        -----------
        samples: list of strings
        """

        # it is compatible with pytorch-nlp (torchnlp)
        tokens, lengths = self.tokenizer.batch_encode(samples)

        return tokens, lengths

    def decode(self, text):
        pass
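A hedged usage sketch for CharTokenizerEncoder; the TokenizerEncoder base class and the itr alias for itertools come from the surrounding project, and the pair data below is made up.

import itertools as itr  # the class refers to itertools as itr
from torchnlp.encoders.text import CharacterEncoder

# toy pair_dataset, repeated often enough that every character clears min_occurrences=1000
pairs = [{"correct": "abcdef", "incorrect": "abdcef"} for _ in range(2000)]

enc = CharTokenizerEncoder()
enc.train(pairs)  # builds a CharacterEncoder over "correct incorrect" strings
tokens, lengths = enc.encode_batch(["abdcef", "abc"])
print(enc.vocab_size, enc.padding_index, tokens.shape)
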
Example #5
    def __init__(
        self,
        markdown_lines: List[str],
        tokenizer,
        seq_len=128,
    ):
        self.intent_dict = {}
        self.entity_dict = {}
        self.entity_dict["O"] = 0  # using BIO tagging

        self.dataset = []
        self.seq_len = seq_len

        intent_value_list = []
        entity_type_list = []

        current_intent_focus = ""

        text_list = []

        for line in tqdm(
                markdown_lines,
                desc=
                "Organizing Intent & Entity dictionary in NLU markdown file ...",
        ):
            if len(line.strip()) < 2:
                current_intent_focus = ""
                continue

            if "## " in line:
                if "intent:" in line:
                    intent_value_list.append(line.split(":")[1].strip())
                    current_intent_focus = line.split(":")[1].strip()
                else:
                    current_intent_focus = ""

            else:
                if current_intent_focus != "":
                    text = line[2:].strip().lower()

                    for type_str in re.finditer(r"\([a-zA-Z_1-2]+\)", text):
                        entity_type = (text[type_str.start() +
                                            1:type_str.end() - 1].replace(
                                                "(", "").replace(")", ""))
                        entity_type_list.append(entity_type)

                    text = re.sub(r"\([a-zA-Z_1-2]+\)", "",
                                  text)  # remove (...) str
                    text = text.replace("[", "").replace(
                        "]", "")  # remove '[',']' special char

                    if len(text) > 0:
                        text_list.append(text.strip())

        #dataset tokenizer setting
        if "ElectraTokenizer" in str(type(tokenizer)):
            self.tokenizer = tokenizer
            self.pad_token_id = 0
            self.unk_token_id = 1
            self.eos_token_id = 3  #[SEP] token
            self.bos_token_id = 2  #[CLS] token

        else:
            if tokenizer == 'char':
                self.tokenizer = CharacterEncoder(text_list)

                # torchnlp base special token indices
                self.pad_token_id = 0
                self.unk_token_id = 1
                self.eos_token_id = 2
                self.bos_token_id = 3
            elif tokenizer == 'space':
                self.tokenizer = WhitespaceEncoder(text_list)

                # torchnlp base special token indices
                self.pad_token_id = 0
                self.unk_token_id = 1
                self.eos_token_id = 2
                self.bos_token_id = 3
            elif tokenizer == 'kobert':
                self.tokenizer = kobert_tokenizer()
                self.pad_token_id = 1
                self.unk_token_id = 0
                self.eos_token_id = 3  #[SEP] token
                self.bos_token_id = 2  #[CLS] token
            else:
                raise ValueError('not supported tokenizer type')

        intent_value_list = sorted(intent_value_list)
        for intent_value in intent_value_list:
            if intent_value not in self.intent_dict.keys():
                self.intent_dict[intent_value] = len(self.intent_dict)

        entity_type_list = sorted(entity_type_list)
        for entity_type in entity_type_list:
            if entity_type + '_B' not in self.entity_dict.keys():
                self.entity_dict[str(entity_type) + '_B'] = len(
                    self.entity_dict)
            if entity_type + '_I' not in self.entity_dict.keys():
                self.entity_dict[str(entity_type) + '_I'] = len(
                    self.entity_dict)

        current_intent_focus = ""

        for line in tqdm(
                markdown_lines,
                desc="Extracting Intent & Entity in NLU markdown files...",
        ):
            if len(line.strip()) < 2:
                current_intent_focus = ""
                continue

            if "## " in line:
                if "intent:" in line:
                    current_intent_focus = line.split(":")[1].strip()
                else:
                    current_intent_focus = ""
            else:
                if current_intent_focus != "":  # intent & entity sentence occur case
                    text = line[2:].strip().lower()

                    entity_value_list = []
                    for value in re.finditer(r"\[(.*?)\]", text):
                        entity_value_list.append(
                            text[value.start() + 1:value.end() - 1].replace(
                                "[", "").replace("]", ""))

                    entity_type_list = []
                    for type_str in re.finditer(r"\([a-zA-Z_1-2]+\)", text):
                        entity_type = (text[type_str.start() +
                                            1:type_str.end() - 1].replace(
                                                "(", "").replace(")", ""))
                        entity_type_list.append(entity_type)

                    text = re.sub(r"\([a-zA-Z_1-2]+\)", "",
                                  text)  # remove (...) str
                    text = text.replace("[", "").replace(
                        "]", "")  # remove '[',']' special char

                    if len(text) > 0:
                        each_data_dict = {}
                        each_data_dict["text"] = text.strip()
                        each_data_dict["intent"] = current_intent_focus
                        each_data_dict["intent_idx"] = self.intent_dict[
                            current_intent_focus]
                        each_data_dict["entities"] = []

                        for value, type_str in zip(entity_value_list,
                                                   entity_type_list):
                            for entity in re.finditer(value, text):
                                entity_tokens = self.tokenize(value)

                                for i, entity_token in enumerate(
                                        entity_tokens):
                                    if i == 0:
                                        BIO_type_str = type_str + '_B'
                                    else:
                                        BIO_type_str = type_str + '_I'

                                    each_data_dict["entities"].append({
                                        "start":
                                        text.find(entity_token, entity.start(),
                                                  entity.end()),
                                        "end":
                                        text.find(entity_token, entity.start(),
                                                  entity.end()) +
                                        len(entity_token),
                                        "entity":
                                        type_str,
                                        "value":
                                        entity_token,
                                        "entity_idx":
                                        self.entity_dict[BIO_type_str],
                                    })

                        self.dataset.append(each_data_dict)

        print(f"Intents: {self.intent_dict}")
        print(f"Entities: {self.entity_dict}")
Example #6
class ABCSec2SeqDataModule(pl.LightningDataModule):
    '''Simple pytorch-lightning data module which generates artificial data:
    a simple translation task from a permuted alphabet to the normal alphabet,
    as a list of dicts
     [
         { "correct" : "...", "incorrect": "..."}
         { "correct" : "...", "incorrect": "..."}
     ]
    '''
    def __init__(
        self, batch_size=4, N_random_samples=1000, N_valid_size=200, num_workers=1
    ):
        super().__init__()

        self.batch_size = batch_size

        self.vocab_size = -1

        self.padding_index = -1

        assert N_random_samples > N_valid_size

        self.N_random_samples = N_random_samples
        self.N_valid_size = N_valid_size
        self.num_workers = num_workers

    def prepare_data(self):
        # stuff here is done once at the very beginning of training
        # before any distributed training starts

        # https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html#prepare-data

        pass

    def _setup_task1(self, N_random_samples):
        """generate simple translation task from permuted alphabet to normal alphabet.
        Fixed length only permuted characters
        """

        # If you want to play with it:
        # - you can try a shorter vocab for faster training, e.g. 'abcdefghij' (only 10 chars)
        init_string = "abcdefghijklmnopqrstuwxyz"

        dataset = []
        for i in range(N_random_samples):

            l = list(init_string)
            random.shuffle(l)
            dataset.append(
                {"correct": f"{i}-{init_string}", "incorrect": f'{i}-{"".join(l)}'}
            )

        return dataset

    def _setup_task2(self, N_random_samples):
        """generates simple translation task random length, random letters"""

        init_string = "abcdefghijklmnoprstuwxyz"
        str_len = len(init_string)

        dataset = []
        for i in range(N_random_samples):
            # random seq len in [3, str_len]
            rnd_len = random.randint(3, str_len)
            # random characters chosen with replacement
            t = random.choices(init_string, k=rnd_len)
            t_sort = sorted(t)

            dataset.append({"correct": "".join(t_sort), "incorrect": "".join(t)})

        return dataset

    def _bucket_train_sort_func(self, i):
        """defines sort key for bucketBatchSampler, should be defined in top scope because
        in distributed mode lambda can't be pickled
        """
        return -len(self.train_ds[i]["incorrect"])

    def _bucket_val_sort_func(self, i):
        """defines sort key for bucketBatchSampler, should be defined in top scope because
        in distributed mode lambda can't be pickled
        """
        return -len(self.valid_ds[i]["incorrect"])

    def _sampler_sort_func(self, x):
        """
        function for SortedSampler, sort in reverse orderby incorrect sequence lenght,
        added random value for hashing the rows in distributed scenario, each epoch get slight different
        set of sentences
        sequences arent sorted exacly, but it does not matter much,
        """

        return -len(x["incorrect"]) + random.randint(0, 4)

    def setup(self, stage):
        # https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html#setup

        N_valid_size = self.N_valid_size

        # dataset = self._setup_task1(self.N_random_samples)
        dataset = self._setup_task2(self.N_random_samples)

        # list of dicts
        self.train_ds = dataset[0:-N_valid_size]

        self.valid_ds = dataset[-N_valid_size:]

        # load dataset build vocab and numericalize

        # TODO: change this, bad design! only for prototyping and learning
        dataset_example_gen = (ex["correct"] + " " + ex["incorrect"] for ex in dataset)

        
        self.tokenizer = CharacterEncoder(
            dataset_example_gen, append_eos=True, append_sos=True
        )
        pickle.dump(
            self.tokenizer,
            open(f"./abc_data_character_encoder.p", "wb"),
        )

        self.train_sampler = SortedSampler(
            self.train_ds, sort_key=self._sampler_sort_func
        )

        self.val_sampler = SortedSampler(
            self.valid_ds, sort_key=self._sampler_sort_func
        )

        # samplers from torchnlp; did not work with DistributedDataParallel
        # self.train_sampler = BucketBatchSampler(
        #     sampler=SequentialSampler(self.train_ds),
        #     # bucket_size_multiplier=1000,
        #     batch_size=self.batch_size,
        #     drop_last=True,
        #     sort_key=self._bucket_train_sort_func,
        #     #sort_key=lambda i: -len(self.train_ds[i]["incorrect"]),
        # )

        # self.val_sampler = BucketBatchSampler(
        #     sampler=SequentialSampler(self.valid_ds),
        #     batch_size=self.batch_size,
        #     drop_last=True,
        #     sort_key = self._bucket_val_sort_func,
        #     #sort_key=lambda i: -len(self.valid_ds[i]["incorrect"]),
        # )

        # samplers from catalyst
        # DistributedWrapperSampler
        # DynamicBatchLensampler
        # https://github.com/catalyst-team/catalyst/blob/master/catalyst/data/sampler.py

        # DynamicLenBatchSampler, DistributedSamplerWrapper

        # train_sampler = RandomSampler(self.train_ds)
        # train_sampler = DynamicLenBatchSampler(train_sampler, self.batch_size, drop_last=True)

        # self.train_sampler = train_sampler
        # self.train_sampler = DistributedSamplerWrapper(train_sampler)

        # valid_sampler = RandomSampler(self.valid_ds)
        # valid_sampler = DynamicLenBatchSampler(valid_sampler, self.batch_size, drop_last=True)
        # self.val_sampler = valid_sampler
        # self.valid_sampler = DistributedSamplerWrapper(valid_sampler)

        ### TODO: to be replaced
        self.vocab_size = self.tokenizer.vocab_size
        self.padding_index = self.tokenizer.padding_index  # =0

    def __collate_fn(self, sample: list, prepare_target=True):
        """
        torch.utils.Dataloader collate_fn

        change layout of data from list of dicts to dict of tensors
         [
           {text: 'a', label:'0'}
           {text: 'b', label:'1'}
           {text: 'c', label:'2'}
         ]
         to
         { text: ['a', 'b', 'c'], label:[0,1,2] }

         and encode tokens to its ids in vocab, do also 0 padding
        """

        # sort in reverse order, needed for packed sequences

        sorted_sample = sorted(sample, key=lambda x: -len(x["incorrect"]))

        collate_sample = collate_tensors(
            sorted_sample, stack_tensors=stack_and_pad_tensors
        )

        ### TODO: to be replaced
        src_tokens, src_lengths = self.tokenizer.batch_encode(
            collate_sample["incorrect"]
        )

        # can't change the layout here, because when using DistributedDataParallel (multi-GPU)
        # it will divide the first dim by the number of GPUs,
        # change from [batch, seq_len] -> to [seq_len, batch]
        # src_tokens = src_tokens.transpose(0, 1)

        inputs = {"src_ids": src_tokens, "src_lengths": src_lengths}

        ### TODO: to be replaced
        ### encode tokens based on the vocab
        trg_tokens, trg_lengths = self.tokenizer.batch_encode(collate_sample["correct"])

        # change from [batch, seq_len] -> to [seq_len, batch]
        # trg_tokens = trg_tokens.transpose(0, 1)
        targets = {"trg_ids": trg_tokens, "trg_lengths": trg_lengths}

        return inputs, targets

    def train_dataloader(self):

        # dataloader with a custom sampler; for distributed training the trainer should set replace_sampler_ddp=False
        self._train_dl = DataLoader(
            dataset=self.train_ds,
            num_workers=self.num_workers,
            shuffle=False,
            sampler=self.train_sampler,
            collate_fn=self.__collate_fn,
            batch_size=self.batch_size,
        )

        return self._train_dl

    def val_dataloader(self):

        # with normal sampler
        self._val_dl = DataLoader(
            dataset=self.valid_ds,
            collate_fn=self.__collate_fn,
            num_workers=self.num_workers,
            sampler=self.val_sampler,
            batch_size=self.batch_size,
            shuffle=False,
        )

        return self._val_dl
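A quick smoke test of the data module above, assuming pytorch_lightning, torchnlp and the other imports the class relies on are available (num_workers=0 keeps everything single-process):

dm = ABCSec2SeqDataModule(batch_size=4, N_random_samples=100, N_valid_size=20, num_workers=0)
dm.setup(stage="fit")  # builds the datasets, the CharacterEncoder and the samplers

inputs, targets = next(iter(dm.train_dataloader()))
print(inputs["src_ids"].shape)   # [batch, padded_src_len]
print(targets["trg_ids"].shape)  # [batch, padded_trg_len]
print(dm.vocab_size, dm.padding_index)
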
Example #7
    def setup(self, stage):
        # https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html#setup

        N_valid_size = self.N_valid_size

        # dataset = self._setup_task1(self.N_random_samples)
        dataset = self._setup_task2(self.N_random_samples)

        # list of dicts
        self.train_ds = dataset[0:-N_valid_size]

        self.valid_ds = dataset[-N_valid_size:]

        # load dataset build vocab and numericalize

        # TODO: change this, bad design! only for prototyping and learning
        dataset_example_gen = (ex["correct"] + " " + ex["incorrect"] for ex in dataset)

        
        self.tokenizer = CharacterEncoder(
            dataset_example_gen, append_eos=True, append_sos=True
        )
        pickle.dump(
            self.tokenizer,
            open(f"./abc_data_character_encoder.p", "wb"),
        )

        self.train_sampler = SortedSampler(
            self.train_ds, sort_key=self._sampler_sort_func
        )

        self.val_sampler = SortedSampler(
            self.valid_ds, sort_key=self._sampler_sort_func
        )

        # samplers from torchnlp; did not work with DistributedDataParallel
        # self.train_sampler = BucketBatchSampler(
        #     sampler=SequentialSampler(self.train_ds),
        #     # bucket_size_multiplier=1000,
        #     batch_size=self.batch_size,
        #     drop_last=True,
        #     sort_key=self._bucket_train_sort_func,
        #     #sort_key=lambda i: -len(self.train_ds[i]["incorrect"]),
        # )

        # self.val_sampler = BucketBatchSampler(
        #     sampler=SequentialSampler(self.valid_ds),
        #     batch_size=self.batch_size,
        #     drop_last=True,
        #     sort_key = self._bucket_val_sort_func,
        #     #sort_key=lambda i: -len(self.valid_ds[i]["incorrect"]),
        # )

        # samplers from catalyst
        # DistributedWrapperSampler
        # DynamicBatchLensampler
        # https://github.com/catalyst-team/catalyst/blob/master/catalyst/data/sampler.py

        # DynamicLenBatchSampler, DistributedSamplerWrapper

        # train_sampler = RandomSampler(self.train_ds)
        # train_sampler = DynamicLenBatchSampler(train_sampler, self.batch_size, drop_last=True)

        # self.train_sampler = train_sampler
        # self.train_sampler = DistributedSamplerWrapper(train_sampler)

        # valid_sampler = RandomSampler(self.valid_ds)
        # valid_sampler = DynamicLenBatchSampler(valid_sampler, self.batch_size, drop_last=True)
        # self.val_sampler = valid_sampler
        # self.valid_sampler = DistributedSamplerWrapper(valid_sampler)

        ### TODO: to be replaced
        self.vocab_size = self.tokenizer.vocab_size
        self.padding_index = self.tokenizer.padding_index  # =0
    def __init__(
        self,
        markdown_lines: List[str],
        seq_len=128,
        pad_token_id=0,
        unk_token_id=1,
        eos_token_id=2,
        bos_token_id=3,
        tokenizer=None
    ):
        self.intent_dict = {}
        self.entity_dict = {}
        self.entity_dict[
            "O"
        ] = 0  # based on XO tagging (each entity_type is assigned to one class)

        self.dataset = []
        self.seq_len = seq_len

        # following torchnlp encoder preset
        self.pad_token_id = pad_token_id
        self.unk_token_id = unk_token_id
        self.eos_token_id = eos_token_id
        self.bos_token_id = bos_token_id

        current_intent_focus = ""

        for line in tqdm(
            markdown_lines, desc="Extracting Intent & Entity in NLU markdown files...",
        ):
            if len(line.strip()) < 2:
                continue

            if "## " in line:
                if "intent:" in line:
                    current_intent_focus = line.split(":")[1].strip()

                    if current_intent_focus not in self.intent_dict.keys():
                        self.intent_dict[current_intent_focus] = len(
                            self.intent_dict.keys()
                        )

                else:
                    current_intent_focus = ""
            else:
                if current_intent_focus != "":  # intent & entity sentence occur case
                    text = line[2:]

                    entity_value_list = []
                    for value in re.finditer(r"\[[^)]*\]", text):
                        entity_value_list.append(
                            text[value.start() + 1 : value.end() - 1].replace('[','').replace(']','')
                        )

                    entity_type_list = []
                    for type_str in re.finditer(r"\([^)]*\)", text):
                        entity_type = text[type_str.start() + 1 : type_str.end() - 1].replace('(','').replace(')','')
                        entity_type_list.append(entity_type)

                        if entity_type not in self.entity_dict.keys():
                            self.entity_dict[entity_type] = len(self.entity_dict.keys())

                    text = re.sub(r"\([^)]*\)", "", text)
                    text = text.replace("[", "").replace("]", "")

                    each_data_dict = {}
                    each_data_dict["text"] = text.strip()
                    each_data_dict["intent"] = current_intent_focus
                    each_data_dict["intent_idx"] = self.intent_dict[
                        current_intent_focus
                    ]
                    each_data_dict["entities"] = []

                    for value, type_str in zip(entity_value_list, entity_type_list):
                        try:
                            for entity in re.finditer(value, text):
                                each_data_dict["entities"].append(
                                    {
                                        "start": entity.start(),
                                        "end": entity.end(),
                                        "entity": type_str,
                                        "entity_idx": self.entity_dict[type_str],
                                    }
                                )
                        except Exception as ex:
                            print(f'error occurred: {ex}')
                            print(f'value: {value}')
                            print(f'text: {text}')

                    self.dataset.append(each_data_dict)

        # encoder(tokenizer) definition
        self.encoder = CharacterEncoder([data["text"] for data in self.dataset])
        self.tokenizer = tokenizer
class RasaIntentEntityDataset(torch.utils.data.Dataset):
    """
    RASA NLU markdown file lines based Custom Dataset Class

    Dataset Example in nlu.md

    ## intent:intent_데이터_자동_선물하기_멀티턴                <- intent name
    - T끼리 데이터 주기적으로 보내기                            <- utterance without entity
    - 인터넷 데이터 [달마다](Every_Month)마다 보내줄 수 있어?    <- utterance with entity
    
    """

    def __init__(
        self,
        markdown_lines: List[str],
        seq_len=128,
        pad_token_id=0,
        unk_token_id=1,
        eos_token_id=2,
        bos_token_id=3,
        tokenizer=None
    ):
        self.intent_dict = {}
        self.entity_dict = {}
        self.entity_dict[
            "O"
        ] = 0  # based on XO tagging (each entity_type is assigned to one class)

        self.dataset = []
        self.seq_len = seq_len

        # following torchnlp encoder preset
        self.pad_token_id = pad_token_id
        self.unk_token_id = unk_token_id
        self.eos_token_id = eos_token_id
        self.bos_token_id = bos_token_id

        current_intent_focus = ""

        for line in tqdm(
            markdown_lines, desc="Extracting Intent & Entity in NLU markdown files...",
        ):
            if len(line.strip()) < 2:
                continue

            if "## " in line:
                if "intent:" in line:
                    current_intent_focus = line.split(":")[1].strip()

                    if current_intent_focus not in self.intent_dict.keys():
                        self.intent_dict[current_intent_focus] = len(
                            self.intent_dict.keys()
                        )

                else:
                    current_intent_focus = ""
            else:
                if current_intent_focus != "":  # intent & entity sentence occur case
                    text = line[2:]

                    entity_value_list = []
                    for value in re.finditer(r"\[[^)]*\]", text):
                        entity_value_list.append(
                            text[value.start() + 1 : value.end() - 1].replace('[','').replace(']','')
                        )

                    entity_type_list = []
                    for type_str in re.finditer(r"\([^)]*\)", text):
                        entity_type = text[type_str.start() + 1 : type_str.end() - 1].replace('(','').replace(')','')
                        entity_type_list.append(entity_type)

                        if entity_type not in self.entity_dict.keys():
                            self.entity_dict[entity_type] = len(self.entity_dict.keys())

                    text = re.sub(r"\([^)]*\)", "", text)
                    text = text.replace("[", "").replace("]", "")

                    each_data_dict = {}
                    each_data_dict["text"] = text.strip()
                    each_data_dict["intent"] = current_intent_focus
                    each_data_dict["intent_idx"] = self.intent_dict[
                        current_intent_focus
                    ]
                    each_data_dict["entities"] = []

                    for value, type_str in zip(entity_value_list, entity_type_list):
                        try:
                            for entity in re.finditer(value, text):
                                each_data_dict["entities"].append(
                                    {
                                        "start": entity.start(),
                                        "end": entity.end(),
                                        "entity": type_str,
                                        "entity_idx": self.entity_dict[type_str],
                                    }
                                )
                        except Exception as ex:
                            print(f'error occurred: {ex}')
                            print(f'value: {value}')
                            print(f'text: {text}')

                    self.dataset.append(each_data_dict)

        # encoder(tokenizer) definition
        self.encoder = CharacterEncoder([data["text"] for data in self.dataset])
        self.tokenizer = tokenizer

    def tokenize(self, text: str, padding: bool = True, return_tensor: bool = True):
        # bos_token=3, eos_token=2, unk_token=1, pad_token=0
        if self.tokenizer is not None:
            tokens = self.tokenizer.encode(text)
            if type(tokens) == list:
                tokens = torch.tensor(tokens)

        else:
            tokens = self.encoder.encode(text)

            bos_tensor = torch.tensor([self.bos_token_id])
            eos_tensor = torch.tensor([self.eos_token_id])
            tokens = torch.cat((bos_tensor, tokens, eos_tensor), 0)

        if padding:
            if len(tokens) > self.seq_len:
                tokens = tokens[:self.seq_len]
            else:
                pad_tensor = torch.tensor(
                    [self.pad_token_id] * (self.seq_len - len(tokens))
                )
                tokens = torch.cat((tokens, pad_tensor), 0)

        if return_tensor:
            return tokens
        else:
            return tokens.numpy()

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        tokens = self.tokenize(self.dataset[idx]["text"])

        intent_idx = torch.tensor([self.dataset[idx]["intent_idx"]])

        entity_idx = np.zeros(self.seq_len)
        for entity_info in self.dataset[idx]["entities"]:
            for i in range(entity_info["start"], entity_info["end"] + 1):
                entity_idx[i] = entity_info["entity_idx"]
        entity_idx = torch.from_numpy(entity_idx)

        return tokens, intent_idx, entity_idx

    def get_intent_idx(self):
        return self.intent_dict

    def get_entity_idx(self):
        return self.entity_dict

    def get_vocab_size(self):
        if self.tokenizer is not None:
            return len(self.tokenizer)

        return len(self.encoder.vocab)

    def get_seq_len(self):
        return self.seq_len
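A short, hypothetical usage of RasaIntentEntityDataset with made-up NLU lines, assuming the imports the class relies on (torch, numpy, re, tqdm, torchnlp) are available; with no tokenizer passed it falls back to the internal CharacterEncoder.

nlu_lines = [
    "## intent:ask_weather",
    "- how is the weather in [seoul](city)",
    "- will it rain in [busan](city) tomorrow",
]
ds = RasaIntentEntityDataset(markdown_lines=nlu_lines, seq_len=32)
tokens, intent_idx, entity_idx = ds[0]
print(tokens.shape, intent_idx)  # padded character ids and the intent index
print(ds.get_intent_idx(), ds.get_entity_idx())
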
def test_character_encoder_min_occurrences(sample):
    encoder = CharacterEncoder(sample, min_occurrences=10)
    input_ = 'English-language pangram'
    output = encoder.encode(input_)
    assert encoder.decode(output) == ''.join([DEFAULT_UNKNOWN_TOKEN] * len(input_))
def encoder(sample):
    return CharacterEncoder(sample)
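Finally, a tiny round-trip check of CharacterEncoder itself, complementing the min_occurrences test above (with default settings nothing is filtered out, so encode/decode is lossless here):

from torchnlp.encoders.text import CharacterEncoder

enc = CharacterEncoder(["the quick brown fox", "jumps over the lazy dog"])
ids = enc.encode("quick fox")
print(ids)              # 1-D LongTensor of character indices
print(enc.decode(ids))  # "quick fox"
print(enc.vocab_size, enc.padding_index)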