Example #1
 def generate(self):
     for row in self._data_df.itertuples(index=False, name='tweet'):
         text: str = row.text.lower()
         if self._augment and random.random() > 0.5:
             text_list = text.split()
             n = random.choice([0, 1, 2, 3])
             if n == 0:
                 text_list, change_logs = synonym_replacement(text_list, 2)
                 text = ' '.join(text_list)
             elif n == 1:
                 text_list, change_logs = random_char_repeat(text_list)
                 text = ' '.join(text_list)
             elif n == 2:
                 text_list, change_logs = random_char_deletion(text_list)
                 text = ' '.join(text_list)
             else:
                 text = ' '.join(text_list)
         text = ' ' + ' '.join(text.split())
         encoded_text = self._tokenizer.encode(text)
         sentiment_id = self._sentiment_ids[row.sentiment]
         # below, 2 is the token id for the </s> token (0 is the start-of-sequence token)
         input_ids = [0] + encoded_text.ids + [2]
         len_encoded_ids = len(encoded_text.ids)
         attention_mask = [1] * (len_encoded_ids + 2)
         token_type_ids = [0] * (len_encoded_ids + 2)
         yield ({'ids': input_ids, 'att': attention_mask, 'tti': token_type_ids},
                {'sentiment': keras.utils.to_categorical(sentiment_id, num_classes=3)})
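Each example calls augmentation helpers (synonym_replacement, random_char_repeat, random_char_deletion) that are not shown. From the call sites they return the modified word list plus a change log mapping each original word to its modified form, so callers can patch selected_text to match. A minimal sketch of one such helper under that assumption (the real implementations may differ):

    import random

    def random_char_repeat(words, p=0.3):
        """Hypothetical helper: duplicate one random character in some words.

        Returns (modified word list, {original word: modified word}).
        """
        out, change_logs = [], {}
        for w in words:
            if len(w) > 1 and random.random() < p:
                i = random.randrange(len(w))
                new_w = w[:i] + w[i] + w[i:]  # repeat the character at position i
                change_logs[w] = new_w
                out.append(new_w)
            else:
                out.append(w)
        return out, change_logs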
Example #2
    def generate(self):
        for row in self._data_df.itertuples(index=False, name='tweet'):
            text: str = row.text.lower()
            selected_text: str = row.selected_text.lower()
            if self._augment and random.random() > 0.5:
                text_list = text.split()
                n = random.choice([0, 1, 2, 3])
                if n == 0:
                    text_list, change_logs = synonym_replacement(text_list, 2)
                    text = ' '.join(text_list)
                    for k, v in change_logs.items():
                        selected_text = selected_text.replace(k, v)
                elif n == 1:
                    text_list, change_logs = random_char_repeat(text_list)
                    text = ' '.join(text_list)
                    for k, v in change_logs.items():
                        selected_text = selected_text.replace(k, v)
                elif n == 2:
                    text_list, change_logs = random_char_deletion(text_list)
                    text = ' '.join(text_list)
                    for k, v in change_logs.items():
                        selected_text = selected_text.replace(k, v)
                else:
                    text = ' '.join(text_list)
            # find overlap
            text = ' '.join(text.split())
            selected_text = ' '.join(selected_text.split())
            idx_selected_text = text.find(selected_text)
            chars = np.zeros((len(text)))
            chars[idx_selected_text:idx_selected_text + len(selected_text)] = 1

            # Character offsets of each word
            offsets = []
            idx = 0
            for w in text.split():
                len_w = len(w)
                offsets.append((idx, idx + len_w))
                idx += len_w + 1

            # Indices of words that overlap the selected text
            tokens = []
            for i, (a, b) in enumerate(offsets):
                sm = np.sum(chars[a:b])
                if sm > 0:
                    tokens.append(i)

            if len(tokens) > 0:
                self.mask.append(True)
                start_tokens = np.zeros(len(text.split()), dtype='int')
                start_tokens[tokens[0]] = 1
                end_tokens = np.zeros(len(text.split()), dtype='int')
                end_tokens[tokens[-1]] = 1
                start_tokens = start_tokens.tolist()
                end_tokens = end_tokens.tolist()
                text = f'{text} <senti> {row.sentiment}'
                inputs = np.zeros((len(text.split()), Config.ft_embeddings_size))
                for i, word in enumerate(text.split()):
                    try:
                        inputs[i] = self._embeddings.wv[word]
                    except KeyError:
                        inputs[i] = self._embeddings.wv['<unk>']
            else:
                self.count += 1
                self.exceptions.append({'text': text, 'selected_text': selected_text, 'sentiment': row.sentiment})
                self.mask.append(False)
                continue
            yield {'inputs': inputs.tolist()}, {'sts': start_tokens, 'ets': end_tokens}
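Since generate() yields plain Python dicts, a common way to consume it is tf.data.Dataset.from_generator. A rough sketch, assuming the enclosing class is instantiated as gen and that Config.ft_embeddings_size is the embedding width used above; sequences are padded to the longest in each batch:

    import tensorflow as tf

    dataset = tf.data.Dataset.from_generator(
        gen.generate,  # bound method that returns the generator above
        output_signature=(
            {'inputs': tf.TensorSpec(shape=(None, Config.ft_embeddings_size), dtype=tf.float32)},
            {'sts': tf.TensorSpec(shape=(None,), dtype=tf.int32),
             'ets': tf.TensorSpec(shape=(None,), dtype=tf.int32)},
        ),
    )
    dataset = dataset.padded_batch(32)  # pad each component to the batch maximum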
Example #3
 def generate(self):
     for row in self._data_df.itertuples(index=False, name='tweet'):
         text: str = row.text.lower()
         selected_text: str = row.selected_text.lower()
         if self._augment and random.random() > 0.5:
             text_list = text.split()
             n = random.choice([0, 1, 2, 3])
             if n == 0:
                 text_list, change_logs = synonym_replacement(text_list, 2)
                 text = ' '.join(text_list)
                 for k, v in change_logs.items():
                     selected_text = selected_text.replace(k, v)
             elif n == 1:
                 text_list, change_logs = random_char_repeat(text_list)
                 text = ' '.join(text_list)
                 for k, v in change_logs.items():
                     selected_text = selected_text.replace(k, v)
             elif n == 2:
                 text_list, change_logs = random_char_deletion(text_list)
                 text = ' '.join(text_list)
                 for k, v in change_logs.items():
                     selected_text = selected_text.replace(k, v)
             else:
                 text = ' '.join(text_list)
         # find overlap
         text = ' ' + ' '.join(text.split())
         selected_text = ' '.join(selected_text.split())
         idx_selected_text = text.find(selected_text)
         chars = np.zeros((len(text)))
         chars[idx_selected_text:idx_selected_text + len(selected_text)] = 1
         if text[idx_selected_text - 1] == ' ':
             chars[idx_selected_text - 1] = 1
         encoded_text = self._tokenizer.encode(text)
         # Character offsets of each decoded token
         offsets = []
         idx = 0
         for t in encoded_text.ids:
             w = self._tokenizer.decode([t])
             len_w = len(w)
             offsets.append((idx, idx + len_w))
             idx += len_w
         # Indices of tokens that overlap the selected text
         tokens = []
         for i, (a, b) in enumerate(offsets):
             sm = np.sum(chars[a:b])
             if sm > 0:
                 tokens.append(i)
         sentiment_token = self._sentiment_ids[row.sentiment]
         # below, 2 is the token id for the </s> token; it also serves as the separator around the sentiment token
         input_ids = [0] + encoded_text.ids + [2, 2] + [sentiment_token] + [2]
         len_encoded_ids = len(encoded_text.ids)
         attention_mask = [1] * (len_encoded_ids + 5)
         token_type_ids = [0] * (len_encoded_ids + 5)
         if len(tokens) > 0:
             self.exception_mask.append(True)
             start_tokens = np.zeros((len_encoded_ids + 5), dtype='int')
             start_tokens[tokens[0] + 1] = 1
             end_tokens = np.zeros((len_encoded_ids + 5), dtype='int')
             end_tokens[tokens[-1] + 1] = 1
             start_tokens = start_tokens.tolist()
             end_tokens = end_tokens.tolist()
         else:
             self.exception_count += 1
             self.exceptions = {'text': text, 'selected_text': selected_text, 'sentiment': row.sentiment}
             self.exception_mask.append(False)
             continue
         yield ({'ids': input_ids, 'att': attention_mask, 'tti': token_type_ids},
                {'sts': start_tokens, 'ets': end_tokens})
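At inference time the one-hot start/end targets built here have to be mapped back to a substring. A minimal sketch of that inverse step, assuming the model outputs one probability per position for start and end, and that the tokenizer is a fast tokenizer whose decode accepts a list of ids (the helper name and arguments are hypothetical):

    import numpy as np

    def decode_selected_text(text, encoded_text, start_probs, end_probs, tokenizer):
        # Undo the +1 shift applied when the targets were built (leading special token).
        a = int(np.argmax(start_probs)) - 1
        b = int(np.argmax(end_probs)) - 1
        if a > b:  # degenerate prediction: fall back to the whole tweet
            return text.strip()
        return tokenizer.decode(encoded_text.ids[a:b + 1]).strip()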
Example #4
    def generate(self):
        for row in self._data_df.itertuples(index=False, name='tweet'):
            text: str = row.text.lower()
            selected_text: str = row.selected_text.lower()
            if self._augment and random.random() > 0.5:
                text_list = text.split()
                n = random.choice([0, 1, 2, 3])
                if n == 0:
                    text_list, change_logs = synonym_replacement(text_list, 2)
                    text = ' '.join(text_list)
                    for k, v in change_logs.items():
                        selected_text = selected_text.replace(k, v)
                elif n == 1:
                    text_list, change_logs = random_char_repeat(text_list)
                    text = ' '.join(text_list)
                    for k, v in change_logs.items():
                        selected_text = selected_text.replace(k, v)
                elif n == 2:
                    text_list, change_logs = random_char_deletion(text_list)
                    text = ' '.join(text_list)
                    for k, v in change_logs.items():
                        selected_text = selected_text.replace(k, v)
                else:
                    text = ' '.join(text_list)
            # find overlap
            text = ' '.join(text.split())
            selected_text = ' '.join(selected_text.split())
            # find the intersection between text and selected text
            idx_start = text.find(selected_text)

            # calculate offsets
            text_tokens = self._tokenizer.tokenize(text)
            selected_text_tokens = self._tokenizer.tokenize(selected_text)
            chars = np.zeros((len(''.join(text_tokens))))
            chars[idx_start:idx_start + len(''.join(selected_text_tokens))] = 1
            offsets = []
            idx = 0
            for t in text_tokens:
                len_t = len(t)
                offsets.append((idx, idx + len_t))
                idx += len_t

            # compute targets
            target_idx = []
            for i, (o1, o2) in enumerate(offsets):
                if sum(chars[o1: o2]) > 0:
                    target_idx.append(i)

            start_tokens = target_idx[0]
            end_tokens = target_idx[-1]

            input_ids_orig = self._tokenizer.encode(text, add_special_tokens=False)
            # one leading special token, then the text tokens, a separator, the sentiment id, and a final separator
            input_ids = [2] + input_ids_orig + [3] + [self._sentiment_ids[row.sentiment]] + [3]
            token_type_ids = [0] * (len(input_ids_orig) + 2) + [1, 1]
            attention_mask = [1] * (len(input_ids_orig) + 4)
            np_start_tokens = np.zeros((len(input_ids)), dtype='int')
            np_start_tokens[start_tokens + 1] = 1  # +1: text tokens start at position 1 because of the leading special token
            np_end_tokens = np.zeros((len(input_ids)), dtype='int')
            np_end_tokens[end_tokens + 1] = 1
            start_tokens = np_start_tokens.tolist()
            end_tokens = np_end_tokens.tolist()
            yield ({'ids': input_ids, 'att': attention_mask, 'tti': token_type_ids},
                   {'sts': start_tokens, 'ets': end_tokens})
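None of the generators pad their outputs, so batching needs a fixed or padded length downstream. A small padding helper for the ids/att/tti format, with MAX_LEN and the pad id chosen here purely for illustration:

    MAX_LEN = 96  # hypothetical fixed sequence length

    def pad_example(features, targets, max_len=MAX_LEN, pad_id=0):
        # Right-pad ids/att/tti and the start/end target vectors to max_len.
        pad = max_len - len(features['ids'])
        if pad < 0:
            raise ValueError('sequence longer than max_len')
        padded_features = {
            'ids': features['ids'] + [pad_id] * pad,
            'att': features['att'] + [0] * pad,
            'tti': features['tti'] + [0] * pad,
        }
        padded_targets = {key: value + [0] * pad for key, value in targets.items()}
        return padded_features, padded_targets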
Example #5
    def generate(self):
        for row in self._data_df.itertuples(index=False, name='tweet'):
            text: str = row.text.lower()
            selected_text: str = row.selected_text.lower()
            if self._augment and random.random() > 0.5:
                text_list = text.split()
                n = random.choice([0, 1, 2, 3])
                if n == 0:
                    text_list, change_logs = synonym_replacement(text_list, 2)
                    text = ' '.join(text_list)
                    for k, v in change_logs.items():
                        selected_text = selected_text.replace(k, v)
                elif n == 1:
                    text_list, change_logs = random_char_repeat(text_list)
                    text = ' '.join(text_list)
                    for k, v in change_logs.items():
                        selected_text = selected_text.replace(k, v)
                elif n == 2:
                    text_list, change_logs = random_char_deletion(text_list)
                    text = ' '.join(text_list)
                    for k, v in change_logs.items():
                        selected_text = selected_text.replace(k, v)
                else:
                    text = ' '.join(text_list)
            # find overlap
            text = ' '.join(text.split())
            selected_text = ' '.join(selected_text.split())
            # find the intersection between text and selected text
            idx_start, idx_end = None, None
            for index in (i for i, c in enumerate(text) if c == selected_text[0]):
                if text[index:index + len(selected_text)] == selected_text:
                    idx_start = index
                    idx_end = index + len(selected_text)
                    break

            intersection = [0] * len(text)
            if idx_start is not None and idx_end is not None:
                self.exception_mask.append(True)
                for char_idx in range(idx_start, idx_end):
                    intersection[char_idx] = 1
            else:
                self.exception_count += 1
                self.exceptions = {'text': text, 'selected_text': selected_text, 'sentiment': row.sentiment}
                self.exception_mask.append(False)
                continue

            # tokenize with offsets
            enc = self._tokenizer.encode(text, add_special_tokens=False)
            input_ids_orig, offsets = enc.ids, enc.offsets

            # compute targets
            target_idx = []
            for i, (o1, o2) in enumerate(offsets):
                if sum(intersection[o1: o2]) > 0:
                    target_idx.append(i)

            start_tokens = target_idx[0]
            end_tokens = target_idx[-1]

            # 101 and 102 are the standard BERT [CLS] and [SEP] token ids
            input_ids = [101] + [self._sentiment_ids[row.sentiment]] + [102] + input_ids_orig + [102]
            token_type_ids = [0, 0, 0] + [1] * (len(input_ids_orig) + 1)
            attention_mask = [1] * (len(input_ids_orig) + 4)
            start_tokens += 3
            end_tokens += 3
            np_start_tokens = np.zeros((len(input_ids)), dtype='int')
            np_start_tokens[start_tokens] = 1
            np_end_tokens = np.zeros((len(input_ids)), dtype='int')
            np_end_tokens[end_tokens] = 1
            start_tokens = np_start_tokens.tolist()
            end_tokens = np_end_tokens.tolist()
            yield ({'ids': input_ids, 'att': attention_mask, 'tti': token_type_ids},
                   {'sts': start_tokens, 'ets': end_tokens})
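For this last layout (three special tokens before the text), mapping a predicted token span back to characters has to undo the +3 shift. A sketch using the character offsets already available from enc.offsets above (the function name and argument order are assumptions):

    import numpy as np

    def recover_span(text, offsets, start_probs, end_probs):
        # Undo the +3 shift applied when the targets were built, then clamp to valid token indices.
        a = max(int(np.argmax(start_probs)) - 3, 0)
        b = min(int(np.argmax(end_probs)) - 3, len(offsets) - 1)
        if a > b:  # degenerate prediction: return the full text
            return text
        return text[offsets[a][0]:offsets[b][1]]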