Example #1
    def __init__(
        self,
        input_file: str,
        tokenizer: PreTrainedTokenizerBase,
        mode: str,
        do_basic_tokenize: bool,
        tagger_data_augmentation: bool,
    ):
        assert mode in constants.MODES
        self.mode = mode
        raw_insts = read_data_file(input_file)

        # Convert raw instances to TaggerDataInstance
        insts = []
        for (_, w_words, s_words) in tqdm(raw_insts):
            for inst_dir in constants.INST_DIRECTIONS:
                if inst_dir == constants.INST_BACKWARD and mode == constants.TN_MODE:
                    continue
                if inst_dir == constants.INST_FORWARD and mode == constants.ITN_MODE:
                    continue
                # Create a new TaggerDataInstance
                inst = TaggerDataInstance(w_words, s_words, inst_dir,
                                          do_basic_tokenize)
                insts.append(inst)
                # Data Augmentation (if enabled)
                if tagger_data_augmentation:
                    filtered_w_words, filtered_s_words = [], []
                    for w, s in zip(w_words, s_words):
                        if s not in constants.SPECIAL_WORDS:
                            filtered_w_words.append(w)
                            filtered_s_words.append(s)
                    if len(filtered_s_words) > 1:
                        inst = TaggerDataInstance(filtered_w_words,
                                                  filtered_s_words, inst_dir)
                        insts.append(inst)

        self.insts = insts
        texts = [inst.input_words for inst in insts]
        tags = [inst.labels for inst in insts]

        # Tags Mapping
        self.tag2id = {
            tag: id
            for id, tag in enumerate(constants.ALL_TAG_LABELS)
        }

        # Finalize
        self.encodings = tokenizer(texts,
                                   is_split_into_words=True,
                                   padding=False,
                                   truncation=True)
        self.labels = self.encode_tags(tags, self.encodings)
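
The encode_tags helper referenced above is not part of this snippet. A minimal sketch of the usual word-to-subword label alignment, assuming a fast Hugging Face tokenizer (so the BatchEncoding exposes word_ids) and the conventional -100 ignore index; the project's actual implementation may differ:

    def encode_tags(self, tags, encodings):
        # Map word-level tag strings to sub-word token labels; special tokens
        # and continuation sub-words receive the ignore index -100.
        encoded_labels = []
        for i, sent_tags in enumerate(tags):
            word_ids = encodings.word_ids(batch_index=i)
            labels, prev_word_id = [], None
            for word_id in word_ids:
                if word_id is None or word_id == prev_word_id:
                    labels.append(-100)
                else:
                    labels.append(self.tag2id[sent_tags[word_id]])
                prev_word_id = word_id
            encoded_labels.append(labels)
        return encoded_labels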
Example #2
    def __init__(self,
                 input_file: str,
                 mode: str,
                 lang: str,
                 keep_puncts: bool = False):
        self.lang = lang
        insts = read_data_file(input_file)

        # Build inputs and targets
        self.directions, self.inputs, self.targets = [], [], []
        for (_, w_words, s_words) in insts:
            # Filter out punctuation (kept only when keep_puncts is True)
            processed_w_words, processed_s_words = [], []
            for w_word, s_word in zip(w_words, s_words):
                if s_word == constants.SIL_WORD:
                    if keep_puncts:
                        processed_w_words.append(w_word)
                        processed_s_words.append(w_word)
                    continue
                if s_word == constants.SELF_WORD:
                    processed_s_words.append(w_word)
                if s_word not in constants.SPECIAL_WORDS:
                    processed_s_words.append(s_word)
                processed_w_words.append(w_word)
            # Create examples
            for direction in constants.INST_DIRECTIONS:
                if direction == constants.INST_BACKWARD:
                    if mode == constants.TN_MODE:
                        continue
                    input_words = processed_s_words
                    output_words = processed_w_words
                if direction == constants.INST_FORWARD:
                    if mode == constants.ITN_MODE:
                        continue
                    input_words = w_words
                    output_words = processed_s_words
                # Basic tokenization
                input_words = basic_tokenize(' '.join(input_words), lang)
                output_words = basic_tokenize(' '.join(output_words), lang)
                # Update self.directions, self.inputs, self.targets
                self.directions.append(direction)
                self.inputs.append(' '.join(input_words))
                self.targets.append(' '.join(output_words))
        self.examples = list(zip(self.directions, self.inputs, self.targets))
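
The rest of this class is not shown. Since self.examples is a list of (direction, input, target) triples, a natural companion accessor pair would be the sketch below; these two methods are an assumption, not code from the snippet:

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        # Returns one (direction, input string, target string) triple.
        return self.examples[idx]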
Example #3
    def __init__(self, input_file: str, mode: str, lang: str):
        self.lang = lang
        insts = read_data_file(input_file, lang=lang)

        # Build inputs and targets
        self.directions, self.inputs, self.targets, self.classes, self.nb_spans, self.span_starts, self.span_ends = (
            [],
            [],
            [],
            [],
            [],
            [],
            [],
        )
        for (classes, w_words, s_words) in insts:
            # Build direction-specific inputs, targets and span annotations
            for direction in constants.INST_DIRECTIONS:
                if direction == constants.INST_BACKWARD:
                    if mode == constants.TN_MODE:
                        continue

                    # ITN mode
                    (
                        processed_w_words,
                        processed_s_words,
                        processed_classes,
                        processed_nb_spans,
                        processed_s_span_starts,
                        processed_s_span_ends,
                    ) = ([], [], [], 0, [], [])
                    s_word_idx = 0
                    for cls, w_word, s_word in zip(classes, w_words, s_words):
                        if s_word == constants.SIL_WORD:
                            continue
                        elif s_word == constants.SELF_WORD:
                            processed_s_words.append(w_word)
                        else:
                            processed_s_words.append(s_word)

                        processed_nb_spans += 1
                        processed_classes.append(cls)
                        processed_s_span_starts.append(s_word_idx)
                        s_word_idx += len(
                            basic_tokenize(processed_s_words[-1],
                                           lang=self.lang))
                        processed_s_span_ends.append(s_word_idx)
                        processed_w_words.append(w_word)

                    self.span_starts.append(processed_s_span_starts)
                    self.span_ends.append(processed_s_span_ends)
                    self.classes.append(processed_classes)
                    self.nb_spans.append(processed_nb_spans)
                    # Basic tokenization
                    input_words = basic_tokenize(' '.join(processed_s_words),
                                                 lang)
                    # Update self.directions, self.inputs, self.targets
                    self.directions.append(direction)
                    self.inputs.append(' '.join(input_words))
                    self.targets.append(
                        processed_w_words
                    )  # self.targets is a list of lists; each inner list contains target tokens (not words)

                # TN mode
                elif direction == constants.INST_FORWARD:
                    if mode == constants.ITN_MODE:
                        continue
                    (
                        processed_w_words,
                        processed_s_words,
                        processed_classes,
                        processed_nb_spans,
                        w_span_starts,
                        w_span_ends,
                    ) = ([], [], [], 0, [], [])
                    w_word_idx = 0
                    for cls, w_word, s_word in zip(classes, w_words, s_words):
                        # TN forward mode
                        if s_word in constants.SPECIAL_WORDS:
                            processed_s_words.append(w_word)
                        else:
                            processed_s_words.append(s_word)

                        w_span_starts.append(w_word_idx)
                        w_word_idx += len(
                            basic_tokenize(w_word, lang=self.lang))
                        w_span_ends.append(w_word_idx)
                        processed_nb_spans += 1
                        processed_classes.append(cls)
                        processed_w_words.append(w_word)

                    self.span_starts.append(w_span_starts)
                    self.span_ends.append(w_span_ends)
                    self.classes.append(processed_classes)
                    self.nb_spans.append(processed_nb_spans)
                    # Basic tokenization
                    input_words = basic_tokenize(' '.join(processed_w_words),
                                                 lang)
                    # Update self.directions, self.inputs, self.targets
                    self.directions.append(direction)
                    self.inputs.append(' '.join(input_words))
                    self.targets.append(
                        processed_s_words
                    )  # self.targets is a list of lists; each inner list contains target tokens (not words)

        self.examples = list(
            zip(
                self.directions,
                self.inputs,
                self.targets,
                self.classes,
                self.nb_spans,
                self.span_starts,
                self.span_ends,
            ))
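
To make the span bookkeeping in the backward (ITN) branch concrete, here is a small self-contained illustration. It uses a plain whitespace split in place of basic_tokenize and literal '<sil>' / '<self>' markers in place of constants.SIL_WORD / constants.SELF_WORD, so it only mirrors the indexing logic, not the real tokenizer:

w_words = ['hello', ',', '2020']
s_words = ['<self>', '<sil>', 'twenty twenty']

starts, ends, s_word_idx, processed_s_words = [], [], 0, []
for w_word, s_word in zip(w_words, s_words):
    if s_word == '<sil>':          # punctuation is dropped entirely in ITN
        continue
    spoken = w_word if s_word == '<self>' else s_word
    processed_s_words.append(spoken)
    starts.append(s_word_idx)
    s_word_idx += len(spoken.split())   # whitespace split stands in for basic_tokenize
    ends.append(s_word_idx)

print(processed_s_words)  # ['hello', 'twenty twenty']
print(starts, ends)       # [0, 1] [1, 3]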
Example #4
    def __init__(
        self,
        input_file: str,
        tokenizer: PreTrainedTokenizerBase,
        tokenizer_name: str,
        raw_instances: Optional[List[List[str]]] = None,
        mode: str = "joint",
        max_len: int = 512,
        decoder_data_augmentation: bool = False,
        lang: str = "en",
        do_basic_tokenize: bool = False,
        use_cache: bool = False,
        max_insts: int = -1,
        do_tokenize: bool = True,
        initial_shuffle: bool = False,
    ):
        assert mode in constants.MODES
        assert lang in constants.SUPPORTED_LANGS
        self.mode = mode
        self.lang = lang
        self.use_cache = use_cache
        self.max_insts = max_insts
        self.tokenizer = tokenizer
        self.max_seq_len = max_len

        # Get cache path
        data_dir, filename = os.path.split(input_file)
        tokenizer_name_normalized = tokenizer_name.replace('/', '_')
        cached_data_file = os.path.join(
            data_dir,
            f'cached_decoder_{filename}_{tokenizer_name_normalized}_{lang}_{max_insts}_{mode}.pkl'
        )

        if use_cache and os.path.exists(cached_data_file):
            logging.warning(
                f"Processing of {input_file} is skipped as caching is enabled and a cache file "
                f"{cached_data_file} already exists.")
            with open(cached_data_file, 'rb') as f:
                data = pickle.load(f)
                self.insts, self.inputs, self.examples, self.tn_count, self.itn_count, self.label_ids_semiotic = data
        else:
            if raw_instances is None:
                raw_instances = read_data_file(fp=input_file,
                                               lang=self.lang,
                                               max_insts=max_insts)
            else:
                raw_instances = raw_instances[:max_insts]

            if initial_shuffle:
                random.shuffle(raw_instances)

            logging.debug(
                f"Converting raw instances to DecoderDataInstance for {input_file}..."
            )
            self.insts, all_semiotic_classes = self.__process_raw_entries(
                raw_instances,
                decoder_data_augmentation=decoder_data_augmentation,
                do_basic_tokenize=do_basic_tokenize)
            logging.debug(
                f"Extracted {len(self.insts)} DecoderDataInstance objects out of {len(raw_instances)} raw instances."
            )
            self.label_ids_semiotic = OrderedDict(
                {l: idx
                 for idx, l in enumerate(all_semiotic_classes)})
            logging.debug(f'Label_ids: {self.label_ids_semiotic}')
            # Save the list of semiotic labels extracted from the training file next to the input file
            dir_name, file_name = os.path.split(input_file)
            if 'train' in file_name:
                with open(os.path.join(dir_name, f"label_ids_{file_name}"),
                          'w') as f:
                    f.write('\n'.join(self.label_ids_semiotic.keys()))

            if do_tokenize:
                logging.debug(
                    f'Processing samples, total number: {len(self.insts)}')
                self.__tokenize_samples(use_cache=use_cache,
                                        cached_data_file=cached_data_file)
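
__process_raw_entries and __tokenize_samples are not included in this snippet, so the cache write is not visible here. Judging from the tuple that the cache-hit branch above unpickles, and from the explicit cache writes in Examples #5 and #6, the write side plausibly looks like the hypothetical helper below (the function name and its standalone form are assumptions):

import pickle

def write_decoder_cache(dataset, cached_data_file: str):
    # Persist the same six attributes, in the same order, that the cache-hit
    # branch of __init__ unpickles into (insts, inputs, examples, tn_count,
    # itn_count, label_ids_semiotic).
    data = (
        dataset.insts,
        dataset.inputs,
        dataset.examples,
        dataset.tn_count,
        dataset.itn_count,
        dataset.label_ids_semiotic,
    )
    with open(cached_data_file, 'wb') as out_file:
        pickle.dump(data, out_file, protocol=pickle.HIGHEST_PROTOCOL)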
Example #5
    def __init__(
        self,
        input_file: str,
        tokenizer: PreTrainedTokenizerBase,
        tokenizer_name: str,
        mode: str,
        do_basic_tokenize: bool,
        tagger_data_augmentation: bool,
        lang: str,
        max_seq_length: int,
        use_cache: bool = False,
        max_insts: int = -1,
    ):
        assert mode in constants.MODES
        assert lang in constants.SUPPORTED_LANGS
        self.mode = mode
        self.lang = lang
        self.use_cache = use_cache
        self.max_insts = max_insts

        # Get cache path
        data_dir, filename = os.path.split(input_file)
        tokenizer_name_normalized = tokenizer_name.replace('/', '_')
        cached_data_file = os.path.join(
            data_dir,
            f'cached_tagger_{filename}_{tokenizer_name_normalized}_{lang}_{max_insts}_{max_seq_length}.pkl'
        )

        if use_cache and os.path.exists(cached_data_file):
            logging.warning(
                f"Processing of {input_file} is skipped as caching is enabled and a cache file "
                f"{cached_data_file} already exists.")
            with open(cached_data_file, 'rb') as f:
                data = pickle.load(f)
                self.insts, self.tag2id, self.encodings, self.labels = data
        else:
            # Read the raw input data file; returns a list of sentences, each parsed into (classes, w_words, s_words)
            raw_insts = read_data_file(input_file, lang=lang)
            if max_insts >= 0:
                raw_insts = raw_insts[:max_insts]

            # Convert raw instances to TaggerDataInstance
            insts = []
            for (_, w_words, s_words) in tqdm(raw_insts):
                for inst_dir in constants.INST_DIRECTIONS:
                    if inst_dir == constants.INST_BACKWARD and mode == constants.TN_MODE:
                        continue
                    if inst_dir == constants.INST_FORWARD and mode == constants.ITN_MODE:
                        continue

                    # filter out examples that are longer than the maximum sequence length value
                    if (len(
                            tokenizer(w_words,
                                      is_split_into_words=True,
                                      padding=False,
                                      truncation=True)['input_ids']) >=
                            max_seq_length or len(
                                tokenizer(s_words,
                                          is_split_into_words=True,
                                          padding=False,
                                          truncation=True)['input_ids']) >=
                            max_seq_length):
                        continue

                    # Create a new TaggerDataInstance
                    inst = TaggerDataInstance(w_words, s_words, inst_dir,
                                              do_basic_tokenize)
                    insts.append(inst)
                    # Data Augmentation (if enabled)
                    if tagger_data_augmentation:
                        filtered_w_words, filtered_s_words = [], []
                        for w, s in zip(w_words, s_words):
                            if s not in constants.SPECIAL_WORDS:
                                filtered_w_words.append(w)
                                filtered_s_words.append(s)
                        if len(filtered_s_words) > 1:
                            inst = TaggerDataInstance(filtered_w_words,
                                                      filtered_s_words,
                                                      inst_dir)
                            insts.append(inst)

            self.insts = insts
            texts = [inst.input_words for inst in insts]
            tags = [inst.labels for inst in insts]

            # Tags Mapping
            self.tag2id = {
                tag: id
                for id, tag in enumerate(constants.ALL_TAG_LABELS)
            }

            # Finalize
            self.encodings = tokenizer(texts,
                                       is_split_into_words=True,
                                       padding=False,
                                       truncation=True)
            self.labels = self.encode_tags(tags, self.encodings)

            # Write to cache (if use_cache)
            if use_cache:
                with open(cached_data_file, 'wb') as out_file:
                    data = self.insts, self.tag2id, self.encodings, self.labels
                    pickle.dump(data,
                                out_file,
                                protocol=pickle.HIGHEST_PROTOCOL)
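
A dataset with this encodings/labels layout is typically consumed through __getitem__/__len__ methods that wrap each field in a tensor. The pair below is a sketch of that convention (it assumes `import torch` at module level and is not taken from the snippet):

    def __getitem__(self, idx):
        # One tensor per tokenizer field (input_ids, attention_mask, ...),
        # plus the aligned tag label ids for this example.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)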
Example #6
    def __init__(
        self,
        input_file: str,
        tokenizer: PreTrainedTokenizerBase,
        tokenizer_name: str,
        mode: str,
        max_len: int,
        decoder_data_augmentation: bool,
        lang: str,
        do_basic_tokenize: bool,
        use_cache: bool = False,
        max_insts: int = -1,
    ):
        assert mode in constants.MODES
        assert lang in constants.SUPPORTED_LANGS
        self.mode = mode
        self.lang = lang
        self.use_cache = use_cache
        self.max_insts = max_insts

        # Get cache path
        data_dir, filename = os.path.split(input_file)
        tokenizer_name_normalized = tokenizer_name.replace('/', '_')
        cached_data_file = os.path.join(
            data_dir, f'cached_decoder_{filename}_{tokenizer_name_normalized}_{lang}_{max_insts}.pkl'
        )

        if use_cache and os.path.exists(cached_data_file):
            logging.warning(
                f"Processing of {input_file} is skipped as caching is enabled and a cache file "
                f"{cached_data_file} already exists."
            )
            with open(cached_data_file, 'rb') as f:
                data = pickle.load(f)
                self.insts, self.inputs, self.examples, self.tn_count, self.itn_count = data
        else:
            raw_insts = read_data_file(input_file)
            if max_insts >= 0:
                raw_insts = raw_insts[:max_insts]

            # Convert raw instances to DecoderDataInstance
            insts = []
            for (classes, w_words, s_words) in tqdm(raw_insts):
                for ix, (_class, w_word, s_word) in enumerate(zip(classes, w_words, s_words)):
                    if s_word in constants.SPECIAL_WORDS:
                        continue
                    for inst_dir in constants.INST_DIRECTIONS:
                        if inst_dir == constants.INST_BACKWARD and mode == constants.TN_MODE:
                            continue
                        if inst_dir == constants.INST_FORWARD and mode == constants.ITN_MODE:
                            continue
                        # Create a DecoderDataInstance
                        inst = DecoderDataInstance(
                            w_words,
                            s_words,
                            inst_dir,
                            start_idx=ix,
                            end_idx=ix + 1,
                            lang=self.lang,
                            semiotic_class=_class,
                            do_basic_tokenize=do_basic_tokenize,
                        )
                        insts.append(inst)
                        if decoder_data_augmentation:
                            noise_left = random.randint(1, 2)
                            noise_right = random.randint(1, 2)
                            inst = DecoderDataInstance(
                                w_words,
                                s_words,
                                inst_dir,
                                start_idx=ix - noise_left,
                                end_idx=ix + 1 + noise_right,
                                lang=self.lang,
                                do_basic_tokenize=do_basic_tokenize,
                            )
                            insts.append(inst)

            self.insts = insts
            inputs = [inst.input_str for inst in insts]
            targets = [inst.output_str for inst in insts]

            # Tokenization
            self.inputs, self.examples = [], []
            self.tn_count, self.itn_count, long_examples_filtered = 0, 0, 0
            input_max_len, target_max_len = 0, 0
            for idx in range(len(inputs)):
                # Input
                _input = tokenizer([inputs[idx]])
                input_len = len(_input['input_ids'][0])
                if input_len > max_len:
                    long_examples_filtered += 1
                    continue

                # Target
                _target = tokenizer([targets[idx]])
                target_len = len(_target['input_ids'][0])
                if target_len > max_len:
                    long_examples_filtered += 1
                    continue

                # Update
                self.inputs.append(inputs[idx])
                _input['labels'] = _target['input_ids']
                self.examples.append(_input)
                if inputs[idx].startswith(constants.TN_PREFIX):
                    self.tn_count += 1
                if inputs[idx].startswith(constants.ITN_PREFIX):
                    self.itn_count += 1
                input_max_len = max(input_max_len, input_len)
                target_max_len = max(target_max_len, target_len)
            print(f'long_examples_filtered: {long_examples_filtered}')
            print(f'input_max_len: {input_max_len} | target_max_len: {target_max_len}')

            # Write to cache (if use_cache)
            if use_cache:
                with open(cached_data_file, 'wb') as out_file:
                    data = self.insts, self.inputs, self.examples, self.tn_count, self.itn_count
                    pickle.dump(data, out_file, protocol=pickle.HIGHEST_PROTOCOL)
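
Because the stored examples are tokenized without padding, a DataLoader over this dataset needs a collate function. The sketch below is one hypothetical way to batch them; it assumes each example carries 1-element lists under 'input_ids', 'attention_mask' and 'labels' (as produced above) and uses the usual -100 ignore index for label padding:

import torch

def collate_fn(batch, pad_token_id, label_pad_id=-100):
    # Pad ragged integer lists to the longest sequence in the batch.
    def pad(seqs, pad_value):
        max_len = max(len(s) for s in seqs)
        return torch.tensor([s + [pad_value] * (max_len - len(s)) for s in seqs])

    return {
        'input_ids': pad([ex['input_ids'][0] for ex in batch], pad_token_id),
        'attention_mask': pad([ex['attention_mask'][0] for ex in batch], 0),
        'labels': pad([ex['labels'][0] for ex in batch], label_pad_id),
    }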
Example #7
    if not isdir(args.output_dir):
        mkdir(args.output_dir)

    # Read input datasets and combine them
    train, dev, test = [], [], []
    for split_name in constants.SPLIT_NAMES:
        if split_name == constants.TRAIN:
            cur_data = train
        if split_name == constants.DEV:
            cur_data = dev
        if split_name == constants.TEST:
            cur_data = test
        # Loop through each input directory
        for input_dir in args.input_dirs:
            input_fp = join(input_dir, f'{split_name}.tsv')
            insts = read_data_file(input_fp)
            cur_data.extend(insts)
    print('After combining the datasets:')
    print(f'len(train): {len(train)}')
    print(f'len(dev): {len(dev)}')
    print(f'len(test): {len(test)}')

    # Output
    for split_name in constants.SPLIT_NAMES:
        output_fp = join(args.output_dir, f'{split_name}.tsv')
        with open(output_fp, 'w+') as output_f:
            if split_name == constants.TRAIN:
                cur_data = train
            if split_name == constants.DEV:
                cur_data = dev
            if split_name == constants.TEST:
                cur_data = test
Example #8
    if not isdir(args.output_dir):
        mkdir(args.output_dir)

    # Read input datasets and combine them
    train, dev, test = [], [], []
    for split_name in constants.SPLIT_NAMES:
        if split_name == constants.TRAIN:
            cur_data = train
        if split_name == constants.DEV:
            cur_data = dev
        if split_name == constants.TEST:
            cur_data = test
        # Loop through each input directory
        for input_dir in args.input_dirs:
            input_fp = join(input_dir, f'{split_name}.tsv')
            insts = read_data_file(input_fp, lang=args.language)
            cur_data.extend(insts)
    print('After combining the datasets:')
    print(f'len(train): {len(train)}')
    print(f'len(dev): {len(dev)}')
    print(f'len(test): {len(test)}')

    # Output
    for split_name in constants.SPLIT_NAMES:
        output_fp = join(args.output_dir, f'{split_name}.tsv')
        with open(output_fp, 'w+') as output_f:
            if split_name == constants.TRAIN:
                cur_data = train
            if split_name == constants.DEV:
                cur_data = dev
            if split_name == constants.TEST:
                cur_data = test
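            # NOTE: the original snippet ends at the if-chain above. The loop
            # below is a hedged reconstruction of the write-out: it assumes
            # read_data_file expects one "<class>\t<written>\t<spoken>" line
            # per token plus a sentence-separator line, which is not shown in
            # this file and may differ from the real format.
            for classes, w_words, s_words in cur_data:
                for cls, w_word, s_word in zip(classes, w_words, s_words):
                    output_f.write(f'{cls}\t{w_word}\t{s_word}\n')
                output_f.write('<eos>\t<eos>\n')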
Example #9
    def __init__(
        self,
        input_file: str,
        tokenizer: PreTrainedTokenizerBase,
        mode: str,
        max_len: int,
        decoder_data_augmentation: bool,
    ):
        assert mode in constants.MODES
        self.mode = mode
        raw_insts = read_data_file(input_file)

        # Convert raw instances to DecoderDataInstance
        insts = []
        for (classes, w_words, s_words) in tqdm(raw_insts):
            for ix, (_class, w_word,
                     s_word) in enumerate(zip(classes, w_words, s_words)):
                if s_word in constants.SPECIAL_WORDS:
                    continue
                for inst_dir in constants.INST_DIRECTIONS:
                    if inst_dir == constants.INST_BACKWARD and mode == constants.TN_MODE:
                        continue
                    if inst_dir == constants.INST_FORWARD and mode == constants.ITN_MODE:
                        continue
                    # Create a DecoderDataInstance
                    inst = DecoderDataInstance(w_words,
                                               s_words,
                                               inst_dir,
                                               start_idx=ix,
                                               end_idx=ix + 1,
                                               semiotic_class=_class)
                    insts.append(inst)
                    if decoder_data_augmentation:
                        noise_left = random.randint(1, 2)
                        noise_right = random.randint(1, 2)
                        inst = DecoderDataInstance(w_words,
                                                   s_words,
                                                   inst_dir,
                                                   start_idx=ix - noise_left,
                                                   end_idx=ix + 1 +
                                                   noise_right)
                        insts.append(inst)

        self.insts = insts
        inputs = [inst.input_str for inst in insts]
        targets = [inst.output_str for inst in insts]

        # Tokenization
        self.inputs, self.examples = [], []
        self.tn_count, self.itn_count, long_examples_filtered = 0, 0, 0
        input_max_len, target_max_len = 0, 0
        for idx in range(len(inputs)):
            # Input
            _input = tokenizer([inputs[idx]])
            input_len = len(_input['input_ids'][0])
            if input_len > max_len:
                long_examples_filtered += 1
                continue

            # Target
            _target = tokenizer([targets[idx]])
            target_len = len(_target['input_ids'][0])
            if target_len > max_len:
                long_examples_filtered += 1
                continue

            # Update
            self.inputs.append(inputs[idx])
            _input['labels'] = _target['input_ids']
            self.examples.append(_input)
            if inputs[idx].startswith(constants.TN_PREFIX):
                self.tn_count += 1
            if inputs[idx].startswith(constants.ITN_PREFIX):
                self.itn_count += 1
            input_max_len = max(input_max_len, input_len)
            target_max_len = max(target_max_len, target_len)
        print(f'long_examples_filtered: {long_examples_filtered}')
        print(
            f'input_max_len: {input_max_len} | target_max_len: {target_max_len}'
        )
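
To see what decoder_data_augmentation adds, consider one concrete window. With the target token at position ix and random context sizes noise_left / noise_right, the augmented instance is assumed to cover the token range [start_idx, end_idx) around it (whether DecoderDataInstance clips negative start indices is not visible in this snippet):

ix, noise_left, noise_right = 3, 1, 2
start_idx, end_idx = ix - noise_left, ix + 1 + noise_right
# The original instance covers tokens [3, 4); the augmented one covers [2, 6),
# i.e. the semiotic span plus one word of left and two words of right context.
print(start_idx, end_idx)  # 2 6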
Example #10
    def __init__(self, input_file: str, mode: str, lang: str):
        self.lang = lang
        insts = read_data_file(input_file, lang=lang)
        processor = MosesProcessor(lang_id=lang)
        # Build inputs and targets
        self.directions, self.inputs, self.targets, self.classes, self.nb_spans, self.span_starts, self.span_ends = (
            [],
            [],
            [],
            [],
            [],
            [],
            [],
        )
        for (classes, w_words, s_words) in insts:
            # Build direction-specific inputs, targets and span annotations
            for direction in constants.INST_DIRECTIONS:
                if direction == constants.INST_BACKWARD:
                    if mode == constants.TN_MODE:
                        continue
                    # ITN mode
                    (
                        processed_w_words,
                        processed_s_words,
                        processed_classes,
                        processed_nb_spans,
                        processed_s_span_starts,
                        processed_s_span_ends,
                    ) = ([], [], [], 0, [], [])
                    s_word_idx = 0
                    for cls, w_word, s_word in zip(classes, w_words, s_words):
                        if s_word == constants.SIL_WORD:
                            continue
                        elif s_word == constants.SELF_WORD:
                            processed_s_words.append(w_word)
                        else:
                            processed_s_words.append(s_word)

                        s_word_last = processor.tokenize(
                            processed_s_words.pop()).split()
                        processed_s_words.append(" ".join(s_word_last))
                        num_tokens = len(s_word_last)
                        processed_nb_spans += 1
                        processed_classes.append(cls)
                        processed_s_span_starts.append(s_word_idx)
                        s_word_idx += num_tokens
                        processed_s_span_ends.append(s_word_idx)
                        processed_w_words.append(w_word)

                    self.span_starts.append(processed_s_span_starts)
                    self.span_ends.append(processed_s_span_ends)
                    self.classes.append(processed_classes)
                    self.nb_spans.append(processed_nb_spans)
                    input_words = ' '.join(processed_s_words)
                    # Update self.directions, self.inputs, self.targets
                    self.directions.append(direction)
                    self.inputs.append(input_words)
                    self.targets.append(
                        processed_w_words
                    )  # self.targets is a list of lists; each inner list contains target tokens (not words)
                # TN mode
                elif direction == constants.INST_FORWARD:
                    if mode == constants.ITN_MODE:
                        continue
                    (
                        processed_w_words,
                        processed_s_words,
                        processed_classes,
                        processed_nb_spans,
                        w_span_starts,
                        w_span_ends,
                    ) = ([], [], [], 0, [], [])
                    w_word_idx = 0
                    for cls, w_word, s_word in zip(classes, w_words, s_words):
                        # TN forward mode
                        # Handle cases like `do n't`: after Moses tokenization this w_word becomes 2 tokens
                        w_word = processor.tokenize(w_word).split()
                        num_tokens = len(w_word)
                        if s_word in constants.SPECIAL_WORDS:
                            processed_s_words.append(" ".join(w_word))
                        else:
                            processed_s_words.append(s_word)
                        w_span_starts.append(w_word_idx)
                        w_word_idx += num_tokens
                        w_span_ends.append(w_word_idx)
                        processed_nb_spans += 1
                        processed_classes.append(cls)
                        processed_w_words.extend(w_word)

                    self.span_starts.append(w_span_starts)
                    self.span_ends.append(w_span_ends)
                    self.classes.append(processed_classes)
                    self.nb_spans.append(processed_nb_spans)
                    input_words = ' '.join(processed_w_words)
                    # Update self.directions, self.inputs, self.targets
                    self.directions.append(direction)
                    self.inputs.append(input_words)
                    self.targets.append(
                        processed_s_words
                    )  # self.targets is a list of lists; each inner list contains target tokens (not words)

        self.examples = list(
            zip(
                self.directions,
                self.inputs,
                self.targets,
                self.classes,
                self.nb_spans,
                self.span_starts,
                self.span_ends,
            ))
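
The `do n't` comment is the key subtlety in this variant: one written word can become several tokens after Moses tokenization, and the span indices count tokens, not words. A small illustration, with the token split hard-coded in place of a real MosesProcessor:

w_words = ['I', "don't", 'know']
tokenized = [['I'], ['do', "n't"], ['know']]   # stands in for processor.tokenize(w).split()

w_span_starts, w_span_ends, w_word_idx = [], [], 0
for tokens in tokenized:
    w_span_starts.append(w_word_idx)
    w_word_idx += len(tokens)
    w_span_ends.append(w_word_idx)

print(w_span_starts)  # [0, 1, 3]
print(w_span_ends)    # [1, 3, 4]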