Example #1
def tokenize_sentence(X_text_list_train, X_text_list_test, MAX_SENTENCE_LEN):
    x_encoder = StaticTokenizerEncoder(
        sample=X_text_list_train,
        append_eos=False,
        tokenize=lambda x: x,
    )
    x_encoded_train = [x_encoder.encode(text) for text in X_text_list_train]
    x_padded_train = torch.LongTensor(
        pad_sequence(x_encoded_train, MAX_SENTENCE_LEN + 1))

    x_encoded_test = [x_encoder.encode(text) for text in X_text_list_test]
    x_padded_test = torch.LongTensor(
        pad_sequence(x_encoded_test, MAX_SENTENCE_LEN + 1))

    if x_padded_train.shape[1] > x_padded_test.shape[1]:
        x_padded_test = torch.cat(
            (
                x_padded_test,
                torch.zeros(
                    x_padded_test.shape[0],
                    x_padded_train.shape[1] - x_padded_test.shape[1],
                ),
            ),
            dim=1,
        ).type(torch.long)

    return x_encoder, x_padded_train, x_padded_test
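
A minimal call of this helper might look like the sketch below (toy token lists; it assumes torch, StaticTokenizerEncoder and the pad_sequence helper used by tokenize_sentence are importable from the same module):

train_tokens = [["the", "cat", "sat"], ["a", "dog", "ran", "fast"]]
test_tokens = [["the", "dog", "sat"]]

encoder, x_train, x_test = tokenize_sentence(train_tokens, test_tokens, MAX_SENTENCE_LEN=8)
print(x_train.shape, x_test.shape)  # train and test are padded to the same width
print(encoder.vocab_size)           # vocabulary built from the train split only
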
Example #2
    def __init__(self,
                 json,
                 text_encoder=None,
                 label_encoder=None,
                 vocab=None,
                 mode='train'):
        '''
        Initialization
        Arguments:
        json: dict parsed from the JSON data file.
            Structure of the JSON:
            e.g.:
                 json: {'data' : [{'id': filename,
                                   'title': title of page,
                                   'toc': [list of items in table of contents section of wikipage],
                                   'intro':introduction of wiki page,
                                   'label':'positive'/'negative' flag}]
                        }
            Labels are required only when mode == 'train'
        text_encoder: encoder object that encodes tokens to their unique integer ids
        label_encoder: encoder object that encodes labels to their unique integer ids
        vocab: external vocabulary used to initialize the text encoder. If vocab is None, it is generated from the tokens of the provided dataset
        mode: 'train' or 'inference'; when mode == 'inference', the dataset object skips the labels
        '''
        self.data = json
        assert 'data' in self.data

        # Define the mode in which the dataset object is to be used
        self.mode = mode

        # Define text encoder and vocabulary
        if text_encoder:
            self._text_encoder = text_encoder
            self._vocab = self._text_encoder.vocab
        elif vocab:
            self._vocab = vocab
            self._text_encoder = StaticTokenizerEncoder(self._vocab,
                                                        append_eos=False,
                                                        tokenize=self.split)
        else:
            self._vocab = self.create_vocab()
            self._text_encoder = StaticTokenizerEncoder(self._vocab,
                                                        append_eos=False,
                                                        tokenize=self.split)

        self._vocab_size = self._text_encoder.vocab_size

        # Define label encoder
        if self.mode == 'train':
            if label_encoder:
                self._label_encoder = label_encoder
            else:
                self._label_encoder = LabelEncoder(
                    [sample['label'] for sample in self.data['data']])

            self._label_size = self._label_encoder.vocab_size

        else:
            self._label_encoder = None
            self._label_size = None
Example #3
def tokenize_pos_tags(X_tags_train, X_tags_test):
    x_postag_encoder = StaticTokenizerEncoder(
        sample=X_tags_train,
        append_eos=False,
        tokenize=lambda x: x,
    )
    x_postag_encoded_train = [
        x_postag_encoder.encode(text) for text in X_tags_train
    ]
    x_postag_padded_train = torch.LongTensor(
        pad_sequence(x_postag_encoded_train, MAX_SENTENCE_LEN + 1))
    # x_postag_ohe_train = torch.nn.functional.one_hot(x_postag_padded_train)

    x_postag_encoded_test = [
        x_postag_encoder.encode(text) for text in X_tags_test
    ]
    x_postag_padded_test = torch.LongTensor(
        pad_sequence(x_postag_encoded_test, MAX_SENTENCE_LEN + 1))

    if x_postag_padded_train.shape[1] > x_postag_padded_test.shape[1]:
        x_postag_padded_test = torch.cat(
            (
                x_postag_padded_test,
                torch.zeros(
                    x_postag_padded_test.shape[0],
                    x_postag_padded_train.shape[1] -
                    x_postag_padded_test.shape[1],
                ),
            ),
            dim=1,
        ).type(torch.long)

    # x_postag_ohe_test = torch.nn.functional.one_hot(x_postag_padded_test)
    return x_postag_encoder, x_postag_padded_train, x_postag_padded_test
Example #4
    def train(
        self, pair_dataset, append_eos=True, append_sos=True, min_occurrences=300
    ):
        """train a tokenizer"""

        # create a generator over the incorrect sentences only
        dataset_example_gen = (
            ex["incorrect"]
            for ex in itr.islice(pair_dataset, self._tokenizer_max_seq)
        )

        self.tokenizer = StaticTokenizerEncoder(
            dataset_example_gen,
            min_occurrences=min_occurrences,
            append_eos=append_eos,
            append_sos=append_sos,
            tokenize=uni_bi_grams_vocab_gen,
            detokenize=self._detokenize,  # equivalent to lambda x: "".join(x), i.e. concatenate all tokens
        )

        self.tokenizer.tokenize = bigrams_tokenize  # ngram_tokenizer(self.ngrams)

        # after training set the variables
        self.vocab_size = self.tokenizer.vocab_size
        self.padding_index = self.tokenizer.padding_index  # =0
Example #5
def tokenize_sentence(X_text_list_train, X_text_list_test, max_sent_len=800):
    """
    Fits a tokenizer on the train data list and encodes both the train and test lists
    :param X_text_list_train:
    :param X_text_list_test:
    :param max_sent_len: Max sentence len to pad to, defaults to 800
    :return: x_encoder, x_padded_train, x_padded_test
    """
    x_encoder = StaticTokenizerEncoder(
        sample=X_text_list_train, append_eos=False, tokenize=lambda x: x,
    )
    x_encoded_train = [x_encoder.encode(text) for text in X_text_list_train]
    x_padded_train = torch.LongTensor(
        pad_sequence(x_encoded_train, max_sent_len + 1)
    )

    x_encoded_test = [x_encoder.encode(text) for text in X_text_list_test]
    x_padded_test = torch.LongTensor(pad_sequence(x_encoded_test, max_sent_len + 1))

    if x_padded_train.shape[1] > x_padded_test.shape[1]:
        x_padded_test = torch.cat(
            (
                x_padded_test,
                torch.zeros(
                    x_padded_test.shape[0],
                    x_padded_train.shape[1] - x_padded_test.shape[1],
                ),
            ),
            dim=1,
        ).type(torch.long)

    return x_encoder, x_padded_train, x_padded_test
Example #6
    def generate_encodings(self, data, labels):
        encoder = StaticTokenizerEncoder(data,
                                         tokenize=lambda s: s.split(),
                                         min_occurrences=3)
        encoded_data = [encoder.encode(document) for document in data]
        encoded_data = [pad_tensor(x, length=10000) for x in encoded_data]
        data = {'labels': labels, 'inputs': encoded_data}
        return pd.DataFrame(data=data)
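
A standalone sketch of the same pattern on toy documents (the class that owns generate_encodings is not shown on this page, so the example below rebuilds the encoder directly):

import pandas as pd
from torchnlp.encoders.text import StaticTokenizerEncoder, pad_tensor

docs = ["spam offer spam", "hello friend", "spam offer now", "hello again friend"]
labels = [1, 0, 1, 0]

encoder = StaticTokenizerEncoder(docs, tokenize=lambda s: s.split(), min_occurrences=1)
encoded = [pad_tensor(encoder.encode(d), length=8) for d in docs]  # fixed-width rows
frame = pd.DataFrame({'labels': labels, 'inputs': encoded})
print(frame.shape)
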
Example #7
def create_tokenizer():
    """
    Create and save Pytorch-NLP tokenizer.

    Args:
        root (string): Directory of TIMIT.
    """
    transcripts = pd.read_csv('TRAIN.csv')['transcript']
    tokenizer = StaticTokenizerEncoder(transcripts,
                                       append_sos=True,
                                       append_eos=True,
                                       tokenize=data_utils.encode_fn,
                                       detokenize=data_utils.decode_fn)
    torch.save(tokenizer, 'tokenizer.pth')
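
A hedged sketch of the save/load round trip (it assumes 'TRAIN.csv' and the data_utils module used above are available; torch.load unpickles the whole encoder, so encode_fn/decode_fn must be importable at load time):

import pandas as pd
import torch

create_tokenizer()
tokenizer = torch.load('tokenizer.pth')  # on recent PyTorch you may need weights_only=False
sample = pd.read_csv('TRAIN.csv')['transcript'][0]
ids = tokenizer.encode(sample)           # includes sos/eos ids because of append_sos/append_eos
print(tokenizer.decode(ids))
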
Example #8
def load(batch_size, augmentation, split, shuffle=True):
    """
    Args:
        split (string): Which subset of the data to take. One of 'train' or 'test'.
        batch_size (integer): Batch size.
        augmentation (bool): Whether to apply data augmentation. Only applies to the training set.

    Return:
        loader (DataLoader): A DataLoader that generates batches of (image, label sequence).
        tokenizer (Pytorch-NLP’s StaticTokenizerEncoder): A tokenizer to encode/decode label sequences.
    """
    assert split in ['train', 'test']

    train_dataset = load_json('train') + load_json('extra')
    train_dataset = preprocess_label(train_dataset)

    tokenizer = StaticTokenizerEncoder(
        [x['anno']['label'] for x in train_dataset],
        tokenize=lambda s: s.split(),
        append_eos=True,
        reserved_tokens=['<pad>', '<unk>', '</s>'])
    print(tokenizer.vocab)

    if split == 'train':
        dataset = train_dataset
    else:
        dataset = load_json('test')
        dataset = preprocess_label(dataset)

    print("Compute bounding boxes ...")
    dataset = compute_bbox(dataset)

    dataset = SVHN(dataset, augmentation=(augmentation and split == 'train'))
    print("Dataset size:", len(dataset))
    loader = DataLoader(
        dataset,
        batch_size,
        shuffle=shuffle,
        collate_fn=lambda batch: dataset.generateBatch(batch, tokenizer),
        num_workers=4,
        pin_memory=True)
    return loader, tokenizer
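
A quick check of the reserved-token layout this loader relies on (illustrative digit-string labels, not real SVHN annotations):

from torchnlp.encoders.text import StaticTokenizerEncoder

enc = StaticTokenizerEncoder(["1 9", "2 4 0"],
                             tokenize=lambda s: s.split(),
                             append_eos=True,
                             reserved_tokens=['<pad>', '<unk>', '</s>'])
print(enc.vocab[:3])      # the reserved tokens keep their fixed positions
print(enc.encode("9 1"))  # label ids followed by the end-of-sequence id
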
Example #9
def load_data(dict_fn,
              data_fn,
              batch_size,
              start_sign,
              end_sign,
              checkpoint_dir,
              max_length,
              max_train_data_size=0):
    """
    数据加载方法,主要将分词好的数据进行整理,过程中保存字典文件,方便后续其他功能
    使用,方法返回处理好的dataset,steps_per_epoch,checkpoint_prefix
    Args:
        dict_fn: 将训练数据的字典保存,用于以后使用,路径
        data_fn: 分词好的训练数据路径
        batch_size: batch大小
        start_sign: 开始标记
        end_sign: 结束标记
        checkpoint_dir: 检查点保存路径
        max_length: 最大句子长度
        max_train_data_size: 最大训练数据大小
    Returns:
        dataset: PyTorch的DataLoader
        steps_per_epoch: 每轮的步数
        checkpoint_prefix: 保存检查点的前缀
    """
    print("训练数据读取中...")
    (input_lang,
     target_lang), diag_weight = read_tokenized_data(data_fn, start_sign,
                                                     end_sign,
                                                     max_train_data_size)
    diag_weight = torch.tensor(diag_weight, dtype=torch.float32)
    # Merge input and target to build a single shared vocabulary
    lang = np.hstack((input_lang, target_lang))
    print("Reading done, formatting training data...")
    tokenizer = StaticTokenizerEncoder(sample=lang,
                                       tokenize=lambda x: x.split())
    # Convert the text sequences to token ids, then pad them
    input_data = [
        pad_tensor(tensor=tokenizer.encode(example)[:max_length],
                   length=max_length,
                   padding_index=0) for example in input_lang
    ]
    target_data = [
        pad_tensor(tensor=tokenizer.encode(example)[:max_length],
                   length=max_length,
                   padding_index=0) for example in target_lang
    ]
    input_tensor = stack_and_pad_tensors(input_data)[0]
    target_tensor = stack_and_pad_tensors(target_data)[0]

    print("格式化完成,正在整理训练数据并保存字典")
    word_index = {}
    vocab_list = tokenizer.vocab
    for i in range(tokenizer.vocab_size):
        word_index[vocab_list[i]] = i
        word_index[i] = vocab_list[i]

    with open(dict_fn, 'w', encoding='utf-8') as file:
        file.write(json.dumps(word_index, indent=4, ensure_ascii=False))
    print("数据字典保存完成!")

    dataset = PairDataset(input_tensor, target_tensor, diag_weight)
    loader = DataLoader(dataset=dataset,
                        batch_size=batch_size,
                        shuffle=True,
                        num_workers=2)
    steps_per_epoch = len(input_tensor) // batch_size

    return loader, steps_per_epoch
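
The encode / truncate / pad / stack pattern above, illustrated on made-up sentences (the real data comes from the tokenized file at data_fn):

from torchnlp.encoders.text import StaticTokenizerEncoder, pad_tensor, stack_and_pad_tensors

pairs = ["hi there", "how are you doing today"]
tok = StaticTokenizerEncoder(sample=pairs, tokenize=lambda x: x.split())
rows = [pad_tensor(tok.encode(s)[:4], length=4, padding_index=0) for s in pairs]
batch = stack_and_pad_tensors(rows)[0]  # tensor of shape (2, 4)
print(batch)
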
Example #10
class BiGramTokenizerEncoder(TokenizerEncoder):
    def __init__(self):

        self.vocab_size = None
        self.padding_index = None
        self.ngrams = 2

        # determine how many sequences we take to build the vocabulary
        self._tokenizer_max_seq = 3 * 10 ** 5
        self.tokenizer_name = "bigram_corrector_tokenizer"

    def train(
        self, pair_dataset, append_eos=True, append_sos=True, min_occurrences=300
    ):
        """train a tokenizer"""

        # create a generator over the incorrect sentences only
        dataset_example_gen = (
            ex["incorrect"]
            for ex in itr.islice(pair_dataset, self._tokenizer_max_seq)
        )

        self.tokenizer = StaticTokenizerEncoder(
            dataset_example_gen,
            min_occurrences=min_occurrences,
            append_eos=append_eos,
            append_sos=append_sos,
            tokenize=uni_bi_grams_vocab_gen,
            detokenize=self._detokenize,  # equivalent to lambda x: "".join(x), i.e. concatenate all tokens
        )

        self.tokenizer.tokenize = bigrams_tokenize  # ngram_tokenizer(self.ngrams)

        # after training set the variables
        self.vocab_size = self.tokenizer.vocab_size
        self.padding_index = self.tokenizer.padding_index  # =0

    def _detokenize(self, tokens):
        return "".join(tokens)



    def encode(self, text):
        pass

    def encode_batch(self, samples):
        """
        Encodes list of strings

        Args:
        -----------
        samples: list of strings
        """

        # delegates to PyTorch-NLP's batch_encode, which returns (padded tokens, lengths)
        tokens, lengths = self.tokenizer.batch_encode(samples)

        return tokens, lengths

    def decode(self, text):
        pass
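
The project-specific helpers uni_bi_grams_vocab_gen and bigrams_tokenize are not shown on this page; the sketch below uses a simple character-bigram tokenizer as an illustrative stand-in for the same idea:

from torchnlp.encoders.text import StaticTokenizerEncoder

def char_bigrams(text):
    # "cat" -> ["ca", "at"]
    return [text[i:i + 2] for i in range(len(text) - 1)]

enc = StaticTokenizerEncoder(["hello world", "hell of a word"],
                             tokenize=char_bigrams,
                             detokenize=lambda tokens: "".join(tokens))
print(enc.vocab_size, enc.padding_index)
print(enc.encode("hello"))
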
Example #11
class WikiDataset(Dataset):
    '''
    A custom dataset object that encodes a tokenized text and its labels according to the corresponding encoders 
    '''
    def __init__(self,
                 json,
                 text_encoder=None,
                 label_encoder=None,
                 vocab=None,
                 mode='train'):
        '''
        Initialization
        Arguments:
        json: dict parsed from the JSON data file.
            Structure of the JSON:
            e.g.:
                 json: {'data' : [{'id': filename,
                                   'title': title of page,
                                   'toc': [list of items in table of contents section of wikipage],
                                   'intro':introduction of wiki page,
                                   'label':'positive'/'negative' flag}]
                        }
            Labels are required only when mode == 'train'
        text_encoder: encoder object that encodes tokens to their unique integer ids
        label_encoder: encoder object that encodes labels to their unique integer ids
        vocab: external vocabulary used to initialize the text encoder. If vocab is None, it is generated from the tokens of the provided dataset
        mode: 'train' or 'inference'; when mode == 'inference', the dataset object skips the labels
        '''
        self.data = json
        assert 'data' in self.data

        # Define the mode in which the dataset object is to be used
        self.mode = mode

        # Define text encoder and vocabulary
        if text_encoder:
            self._text_encoder = text_encoder
            self._vocab = self._text_encoder.vocab
        elif vocab:
            self._vocab = vocab
            self._text_encoder = StaticTokenizerEncoder(self._vocab,
                                                        append_eos=False,
                                                        tokenize=self.split)
        else:
            self._vocab = self.create_vocab()
            self._text_encoder = StaticTokenizerEncoder(self._vocab,
                                                        append_eos=False,
                                                        tokenize=self.split)

        self._vocab_size = self._text_encoder.vocab_size

        # Define label encoder
        if self.mode == 'train':
            if label_encoder:
                self._label_encoder = label_encoder
            else:
                self._label_encoder = LabelEncoder(
                    [sample['label'] for sample in self.data['data']])

            self._label_size = self._label_encoder.vocab_size

        else:
            self._label_encoder = None
            self._label_size = None

    def __len__(self):
        '''
        Size of dataset
        '''
        return len(self.data['data'])

    def __getitem__(self, idx):
        '''
        Extract item corresponding to idx'th index in data
        '''
        item = self.data['data'][idx]

        intro_enc = self._text_encoder.encode(item['intro'])

        toc = item['toc']
        if toc == []:
            toc_enc = self._text_encoder.encode('.')
        else:
            toc = ' '.join(toc)
            toc_enc = self._text_encoder.encode(toc)

        title_enc = self._text_encoder.encode(item['title'])

        if self.mode == 'train':
            return title_enc, toc_enc, intro_enc, self._label_encoder.encode(
                item['label']).view(-1)
        else:
            return title_enc, toc_enc, intro_enc

    @property
    def vocab_size(self):
        return self._vocab_size

    @property
    def label_size(self):
        return self._label_size

    @property
    def text_encoder(self):
        return self._text_encoder

    @property
    def label_encoder(self):
        return self._label_encoder

    @property
    def vocab(self):
        return self._vocab

    def create_vocab(self, remove_less_freq_words=True, threshold=1):
        '''
        Creates vocabulary from the dataset tokens

        Returns:
        List of unique tokens in dataset
        '''
        temp_vocab = []
        for sample in self.data['data']:
            temp_vocab.extend(sample['title'].split())
            temp_vocab.extend(' '.join(sample['toc']).split())
            temp_vocab.extend(sample['intro'].split())

        vocab = []

        if remove_less_freq_words:

            count_dict = collections.Counter(temp_vocab)

            for word in count_dict.keys():
                if count_dict[word] > threshold:
                    vocab.append(word)

        else:
            vocab = sorted(list(set(temp_vocab)))

        return vocab

    def split(self, x):
        '''
        Splits the text into tokens 
        '''
        return x.split()

    def collate_fn(self, batch, padding=True):
        """
        Collate function needs to be passed to the pytorch dataloader

        Returns:
        (title,title_lengths): tuple containing padded sequence tensor for title and sequence lengths 
        (toc,toc_lengths): tuple containing padded sequence tensor for table of contents and sequence lengths 
        (intro,intro_lengths): tuple containing padded sequence tensor for introduction and sequence lengths 
        labels: tensor containing labels for the batch
        """
        if self.mode == 'train':
            title, toc, intro, labels = zip(*batch)
            labels = torch.cat(labels)
        else:
            title, toc, intro = zip(*batch)

        if isinstance(intro, collections.abc.Sequence):  # collections.Sequence was removed in Python 3.10

            if padding:
                title, title_lengths = stack_and_pad_tensors(title)
                toc, toc_lengths = stack_and_pad_tensors(toc)
                intro, intro_lengths = stack_and_pad_tensors(intro)

            if self.mode == 'train':
                return (title,
                        title_lengths), (toc,
                                         toc_lengths), (intro,
                                                        intro_lengths), labels
            else:
                return (title, title_lengths), (toc,
                                                toc_lengths), (intro,
                                                               intro_lengths)
        else:
            return batch

    @classmethod
    def fromJsonFile(cls,
                     json_file,
                     text_encoder=None,
                     label_encoder=None,
                     vocab=None,
                     mode='train'):
        '''
        Read data from json file

        Arguments:
        json_file: string specifying location to json_file
        '''
        with open(json_file, 'r') as f:
            json_data = json.load(f)

        return cls(json_data, text_encoder, label_encoder, vocab, mode)
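
A hedged usage sketch for the class above (the file name 'wiki_pages.json' is hypothetical; the file must follow the structure documented in __init__):

from torch.utils.data import DataLoader

dataset = WikiDataset.fromJsonFile('wiki_pages.json', mode='train')
loader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=dataset.collate_fn)
(title, title_len), (toc, toc_len), (intro, intro_len), labels = next(iter(loader))
print(intro.shape, labels.shape)
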
Example #12
def encoder(input_):
    return StaticTokenizerEncoder([input_])
Example #13
def get_tokenizer(list_training_sentences):
    tokenizer = StaticTokenizerEncoder(sample=list_training_sentences, min_occurrences=2,
                                       append_sos=False, append_eos=True)
    return tokenizer
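
A quick check of the resulting encoder (toy sentences; with min_occurrences=2, words seen only once would be mapped to the unknown token):

sentences = ["to be or not to be", "to see or not to see"]
tok = get_tokenizer(sentences)
print(tok.vocab_size)
print(tok.encode("to be or not"))  # ends with the end-of-sequence id because append_eos=True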