Example #1
def main(args):
    logger = TensorBoardLogger(save_dir="./experiment_logs")

    # load dataset and tokenize
    train_dataset = (load_dataset(
        "sentiment140", split="train").shuffle().select(range(45000)))
    test_dataset = load_dataset("sentiment140", split="test").shuffle()
    tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt",
                                       lowercase=True)

    train_dataset = train_dataset.map(lambda e: preprocess(e, tokenizer),
                                      num_proc=4)
    test_dataset = test_dataset.map(lambda e: preprocess(e, tokenizer),
                                    num_proc=4)

    train_dataset.set_format("torch", columns=["text", "sentiment"])
    test_dataset.set_format("torch", columns=["text", "sentiment"])

    train_dataset, val_dataset = train_dataset.train_test_split(
        train_size=args.train_frac).values()

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=20,
                                  collate_fn=collate_fn)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                num_workers=20,
                                collate_fn=collate_fn)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=args.batch_size,
                                 num_workers=20,
                                 collate_fn=collate_fn)

    trainer = pl.Trainer.from_argparse_args(
        args,
        logger=logger,
        # log_every_n_steps=10,
    )

    if args.action.lower() == "train":
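        # fresh training run: build a SentimentLSTM sized to the tokenizer's vocabulary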
        print("VOCAB SIZE")
        print(tokenizer.get_vocab_size())
        model = SentimentLSTM(
            embedding_dim=args.embedding_dim,
            hidden_dim=args.hidden_dim,
            output_dim=NUM_CLASSES,
            vocab_size=tokenizer.get_vocab_size(),
        )
        trainer.fit(model, train_dataloader, val_dataloader)

    elif args.action.lower() == "eval":
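        # evaluation run: restore a trained checkpoint and score it on the held-out test set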
        model = SentimentLSTM.load_from_checkpoint(args.model_path)
        model.eval()
        trainer.test(model, test_dataloaders=test_dataloader)
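
The preprocess and collate_fn helpers used above come from the surrounding project and are not shown here. A minimal sketch of what they might look like, assuming the sentiment140 columns text and sentiment and simple right-padding (the names and details below are assumptions, not the original code):

import torch

def preprocess(example, tokenizer):
    # hypothetical helper: replace the raw text with its WordPiece ids
    example["text"] = tokenizer.encode(example["text"]).ids
    return example

def collate_fn(batch):
    # hypothetical helper: pad variable-length id lists into one batch tensor
    texts = [torch.as_tensor(item["text"]) for item in batch]
    labels = torch.as_tensor([int(item["sentiment"]) for item in batch])
    return torch.nn.utils.rnn.pad_sequence(texts, batch_first=True), labels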
Example #2
class Tokenizer:
    def __init__(self, lang):
        """
        A Tokenizer class to load and train a custom tokenizer
        Using the Hugging Face tokenization library for the same
        """
        self.tokenizer_dir = r"data/{}".format(lang)
        if not os.path.exists(self.tokenizer_dir):
            os.mkdir(self.tokenizer_dir)
        self.vocab = self.tokenizer_dir + "/vocab.txt"
        if os.path.exists(self.vocab):
            print("Initialized tokenizer using cached vocab file {}".format(self.vocab))
            self.tokenizer = BertWordPieceTokenizer(vocab_file=self.vocab)
        else:
            self.tokenizer = BertWordPieceTokenizer()

        self.tokenizer.enable_padding(max_length=MAX_LENGTH)
        self.tokenizer.enable_truncation(max_length=MAX_LENGTH)

    def train_tokenizer(self, sentences):
        """
        Train a tokenizer with a list of sentences
        """

        if not os.path.exists(self.vocab):
            print("Training tokenizer for {}".format(self.tokenizer_dir))
            # the Hugging Face trainer expects file paths, so write the sentences to a temporary file
            with open(self.tokenizer_dir + "/data.txt", "w+", encoding="utf-8") as f:
                for sentence in sentences:
                    f.write(sentence + "\n")
            self.tokenizer.train([self.tokenizer_dir + "/data.txt"])
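            # note: older tokenizers releases let save() take a directory and write vocab.txt there;
            # newer releases use save_model(directory) for that and reserve save() for a JSON path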
            self.tokenizer.save(self.tokenizer_dir)
            print("Trained a tokenizer with vocab size {}".format(self.tokenizer.get_vocab_size()))

            # Removing the temp file
            os.remove(self.tokenizer_dir + "/data.txt")

    def encode(self, decoded):
        return self.tokenizer.encode(decoded)

    def decode(self, encoded):
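        # decode_batch expects a list of id sequences, so this decodes a batch rather than a single encoding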
        return self.tokenizer.decode_batch(encoded)
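
A brief usage sketch for this class, assuming a tokenizers release whose API matches the save() and enable_padding(max_length=...) calls above; the MAX_LENGTH value, language code, and toy sentences are assumptions:

MAX_LENGTH = 128  # assumed; the class reads this module-level constant

tok = Tokenizer("en")  # expects a local data/ directory to exist
tok.train_tokenizer(["first training sentence", "second training sentence"])
print(tok.tokenizer.get_vocab_size())
print(tok.encode("first training sentence").ids)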
Example #3
def load_model():
    tokenizer = BertWordPieceTokenizer(
        'bert-word-piece-custom-wikitext-vocab-10k-vocab.txt',
        lowercase=True,
        strip_accents=True)
    vocab_size = tokenizer.get_vocab_size()
    pad_id = 0
    CLS_label_id = 2
    num_class_heads = 2
    lst_num_cat_in_classes = [6, 47]
    seq_len = 100
    batch_size = 256
    num_workers = 3

    model = TwoClassHeadClassificationTransformer(
        vocab_size=vocab_size, pad_id=pad_id, CLS_label_id=CLS_label_id,
        num_class_heads=num_class_heads, 
        lst_num_cat_in_classes=lst_num_cat_in_classes, num_pos=seq_len
    )
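    # the checkpoint below appears to contain the full pickled model, so it replaces the instance constructed above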
    model = torch.load('classification_model_best.pt', map_location='cpu')
    model = model.to('cpu')
    model = model.eval()

    return model
Example #4
tokenizer = BertWordPieceTokenizer(
    r'C:\Users\David\Documents\Machine_learning\NLP\CardioExplorer\vocab.txt',
    lowercase=True)

pretrain = True
sentence_block_length = 32
max_sentence_blocks = 48
hidden_size = 256
batch_size = 4
shuffle = True
drop_last = True

sentence_block_vector = torch.normal(mean=0.0, std=1.0, size=[hidden_size])

sentence_config = BertConfig()
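# configure the sentence-level encoder (6 layers, positions limited to sentence_block_length tokens)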
sentence_config.vocab_size = tokenizer.get_vocab_size()
sentence_config.num_hidden_layers = 6
sentence_config.hidden_size = 256
sentence_config.num_attention_heads = 4
sentence_config.max_position_embeddings = sentence_block_length  # sentence_block_length

document_config = BertConfig()
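# configure the document-level encoder (3 layers, positions limited to max_sentence_blocks sentence blocks)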
document_config.vocab_size = tokenizer.get_vocab_size()
document_config.num_hidden_layers = 3
document_config.hidden_size = 256
document_config.num_attention_heads = 4
document_config.max_position_embeddings = max_sentence_blocks

dataset = Dataset(file_path,
                  tokenizer,
                  sentence_block_length,
Example #5
import os
import csv
from tokenizers import BertWordPieceTokenizer

# Files with commands.
data_path = "/home/tkornuta/data/local-leonardo-sierra5k"
processed_path = os.path.join(data_path, "processed")
command_templates = os.path.join(processed_path, "command_templates.csv")
command = os.path.join(processed_path, "command.csv")

# Initialize a new tokenizer
tokenizer = BertWordPieceTokenizer()

# Then train it!
tokenizer.train([command_templates, command], vocab_size=100)
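# with only 100 entries the entire vocabulary can be printed below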
print("Vocabulary size: ", tokenizer.get_vocab_size())
for k, v in tokenizer.get_vocab().items():
    print(k, ": ", v)

# Samples from 5k - human labels.
# data_00050000_00052798.gif,"Disjoint the given stacks to form a new stack with blue, red blocks.","Make a new stack with blue, red blocks."
# data_00150000_00150539.gif,Place all the blocks individually on the surface.,Disjoint the given stack of blocks.
# data_00110000_00110725.gif,"Separate the given stack to form yellow, red blocks stack.",Remove 2nd and 4th blocks from the given stack.
# data_00120000_00120478.gif,Remove 1st and 2nd block from the given stack and form stack with blue on top of yellow block.,Do not touch green and red block and form another stack with blue and yellow block

# Now, let's use it:
#input = "I can feel the magic, can you?"
#input = "Disjoint the given stacks to form a new stack with blue, red blocks."
#input = "Make a new stack with blue, red blocks."
input = "Remove 1st and 2nd block from the given stack and form stack with blue on top of yellow block.,Do not touch green and red block and form another stack with blue and yellow block"
print(input)
Example #6
torch.cuda.manual_seed(SEED)


def load_pickle(filepath):
    with open(filepath, 'rb') as fp:
        return pickle.load(fp)


tokenizer = BertWordPieceTokenizer(
    '../data/bert-word-piece-custom-wikitext-vocab-10k-vocab.txt',
    lowercase=True,
    strip_accents=True)

data = load_pickle('../data/tokenized_questions_classes_subclasses_dict.pkl')

vocab_size = tokenizer.get_vocab_size()
pad_id = 0
CLS_label_id = 2
num_class_heads = 2
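# two classification heads: 6 coarse classes and 47 subclasses, matching the loaded classes/subclasses dict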
lst_num_cat_in_classes = [6, 47]
seq_len = 100
batch_size = 256
num_workers = 3

model = TwoClassHeadClassificationTransformer(
    vocab_size=vocab_size,
    pad_id=pad_id,
    CLS_label_id=CLS_label_id,
    num_class_heads=num_class_heads,
    lst_num_cat_in_classes=lst_num_cat_in_classes,
    num_pos=seq_len)
Example #7
class Reader(object):
    def __init__(self,
                 bert_model: str,
                 tokenizer: BaseTokenizer = None,
                 cls: str = "[CLS]",
                 sep: str = "[SEP]",
                 threshold=6):

        self.tokenizer: BaseTokenizer = tokenizer
        self.cls = cls
        self.sep = sep
        if self.tokenizer is None:
            vocab_path: str = "tokenization/" + bert_model + ".txt"
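            # lowercase the input unless the model name marks a cased vocabulary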
            self.tokenizer = BertWordPieceTokenizer(vocab_path,
                                                    lowercase="-cased"
                                                    not in bert_model)

        self.threshold = threshold
        self.subword_alphabet: Optional[Alphabet] = None
        self.label_alphabet: Optional[Alphabet] = None

        self.train: Optional[List[SentInst]] = None
        self.dev: Optional[List[SentInst]] = None
        self.test: Optional[List[SentInst]] = None

    def _read_file(self, filename: str, mode: str = 'train') -> List[SentInst]:
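        # expected file format per sentence: a line of space-separated tokens,
        # a line of entities ("start,end LABEL" items joined by "|", possibly empty),
        # and a blank separator line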
        sent_list = []
        max_len = 0
        num_thresh = 0
        with open(filename, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line == "":  # last few blank lines
                    break

                raw_tokens = line.split(' ')
                tokens = raw_tokens
                chars = [list(t) for t in raw_tokens]

                entities = next(f).strip()
                if entities == "":  # no entities
                    sent_inst = SentInst(tokens, chars, [])
                else:
                    entity_list = []
                    entities = entities.split("|")
                    for item in entities:
                        pointers, label = item.split()
                        pointers = pointers.split(",")
                        if int(pointers[1]) > len(tokens):
                            pdb.set_trace()
                        span_len = int(pointers[1]) - int(pointers[0])
                        if span_len < 0:
                            print("Warning! span_len < 0")
                            continue
                        if span_len > max_len:
                            max_len = span_len
                        if span_len > self.threshold:
                            num_thresh += 1

                        new_entity = (int(pointers[0]), int(pointers[1]),
                                      label)
                        # may be duplicate entities in some datasets
                        if (mode == 'train' and new_entity
                                not in entity_list) or (mode != 'train'):
                            entity_list.append(new_entity)

                    # assert len(entity_list) == len(set(entity_list)) # check duplicate
                    sent_inst = SentInst(tokens, chars, entity_list)
                assert next(f).strip() == ""  # separating line

                sent_list.append(sent_inst)
        print("Max length: {}".format(max_len))
        print("Threshold {}: {}".format(self.threshold, num_thresh))
        return sent_list

    def _gen_dic(self) -> None:
        label_set = set()

        for sent_list in [self.train, self.dev, self.test]:
            num_mention = 0
            for sentInst in sent_list:
                for entity in sentInst.entities:
                    label_set.add(entity[2])
                num_mention += len(sentInst.entities)
            print("# mentions: {}".format(num_mention))

        vocab = [
            self.tokenizer.id_to_token(idx)
            for idx in range(self.tokenizer.get_vocab_size())
        ]
        self.subword_alphabet = Alphabet(vocab, 0)
        self.label_alphabet = Alphabet(label_set, 0)

    @staticmethod
    def _pad_batches(input_ids_batches: List[List[List[int]]],
                     first_subtokens_batches: List[List[List[int]]]) \
            -> Tuple[List[List[List[int]]],
                     List[List[List[int]]],
                     List[List[List[bool]]]]:
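        # pad every batch to its own max sub-token count and max sentence length,
        # producing padded id batches, sub-token input masks, and word-level masks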

        padded_input_ids_batches = []
        input_mask_batches = []
        mask_batches = []

        all_batches = list(zip(input_ids_batches, first_subtokens_batches))
        for input_ids_batch, first_subtokens_batch in all_batches:

            batch_len = len(input_ids_batch)
            max_subtokens_num = max(
                [len(input_ids) for input_ids in input_ids_batch])
            max_sent_len = max([
                len(first_subtokens)
                for first_subtokens in first_subtokens_batch
            ])

            padded_input_ids_batch = []
            input_mask_batch = []
            mask_batch = []

            for i in range(batch_len):

                subtokens_num = len(input_ids_batch[i])
                sent_len = len(first_subtokens_batch[i])

                padded_subtoken_vec = input_ids_batch[i].copy()
                padded_subtoken_vec.extend([0] *
                                           (max_subtokens_num - subtokens_num))
                input_mask = [1] * subtokens_num + [0] * (max_subtokens_num -
                                                          subtokens_num)
                mask = [True] * sent_len + [False] * (max_sent_len - sent_len)

                padded_input_ids_batch.append(padded_subtoken_vec)
                input_mask_batch.append(input_mask)
                mask_batch.append(mask)

            padded_input_ids_batches.append(padded_input_ids_batch)
            input_mask_batches.append(input_mask_batch)
            mask_batches.append(mask_batch)

        return padded_input_ids_batches, input_mask_batches, mask_batches

    def get_batches(self, sentences: List[SentInst], batch_size: int) -> Tuple:
        subtoken_dic_dic = defaultdict(lambda: defaultdict(list))
        first_subtoken_dic_dic = defaultdict(lambda: defaultdict(list))
        last_subtoken_dic_dic = defaultdict(lambda: defaultdict(list))
        label_dic_dic = defaultdict(lambda: defaultdict(list))

        this_input_ids_batches = []
        this_first_subtokens_batches = []
        this_last_subtokens_batches = []
        this_label_batches = []

        for sentInst in sentences:
            subtoken_vec = []
            first_subtoken_vec = []
            last_subtoken_vec = []
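            # build [CLS] + word-piece ids + [SEP], recording where each word's pieces begin and end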
            subtoken_vec.append(self.tokenizer.token_to_id(self.cls))
            for t in sentInst.tokens:
                encoding = self.tokenizer.encode(t)
                ids = [
                    v for v, mask in zip(encoding.ids,
                                         encoding.special_tokens_mask)
                    if mask == 0
                ]
                first_subtoken_vec.append(len(subtoken_vec))
                subtoken_vec.extend(ids)
                last_subtoken_vec.append(len(subtoken_vec))
            subtoken_vec.append(self.tokenizer.token_to_id(self.sep))

            label_list = [(u[0], u[1], self.label_alphabet.get_index(u[2]))
                          for u in sentInst.entities]

            subtoken_dic_dic[len(
                sentInst.tokens)][len(subtoken_vec)].append(subtoken_vec)
            first_subtoken_dic_dic[len(
                sentInst.tokens)][len(subtoken_vec)].append(first_subtoken_vec)
            last_subtoken_dic_dic[len(
                sentInst.tokens)][len(subtoken_vec)].append(last_subtoken_vec)
            label_dic_dic[len(
                sentInst.tokens)][len(subtoken_vec)].append(label_list)

        input_ids_batches = []
        first_subtokens_batches = []
        last_subtokens_batches = []
        label_batches = []
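        # drain the buckets longest-first so each batch groups similarly sized sentences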
        for length1 in sorted(subtoken_dic_dic.keys(), reverse=True):
            for length2 in sorted(subtoken_dic_dic[length1].keys(),
                                  reverse=True):
                input_ids_batches.extend(subtoken_dic_dic[length1][length2])
                first_subtokens_batches.extend(
                    first_subtoken_dic_dic[length1][length2])
                last_subtokens_batches.extend(
                    last_subtoken_dic_dic[length1][length2])
                label_batches.extend(label_dic_dic[length1][length2])

        for i in range(0, len(input_ids_batches), batch_size):
            this_input_ids_batches.append(input_ids_batches[i:i + batch_size])
        for i in range(0, len(first_subtokens_batches), batch_size):
            this_first_subtokens_batches.append(
                first_subtokens_batches[i:i + batch_size])
        for i in range(0, len(last_subtokens_batches), batch_size):
            this_last_subtokens_batches.append(
                last_subtokens_batches[i:i + batch_size])
        for i in range(0, len(label_batches), batch_size):
            this_label_batches.append(label_batches[i:i + batch_size])

        this_input_ids_batches, this_input_mask_batches, this_mask_batches \
            = self._pad_batches(this_input_ids_batches, this_first_subtokens_batches)

        return (this_input_ids_batches, this_input_mask_batches,
                this_first_subtokens_batches, this_last_subtokens_batches,
                this_label_batches, this_mask_batches)

    def to_batch(self, batch_size: int) -> Tuple:
        ret_list = []
        for sent_list in [self.train, self.dev, self.test]:
            ret_list.append(self.get_batches(sent_list, batch_size))
        return tuple(ret_list)

    def read_all_data(self, file_path: str, train_file: str, dev_file: str,
                      test_file: str) -> None:
        self.train = self._read_file(file_path + train_file)
        self.dev = self._read_file(file_path + dev_file, mode='dev')
        self.test = self._read_file(file_path + test_file, mode='test')
        self._gen_dic()

    def debug_single_sample(self, subtoken: List[int],
                            label_list: List[Tuple[int, int, int]]) -> None:
        print(" ".join(
            [self.subword_alphabet.get_instance(t) for t in subtoken]))
        for label in label_list:
            print(label[0], label[1],
                  self.label_alphabet.get_instance(label[2]))
Example #8
import torch
from tokenizers import BertWordPieceTokenizer

from amadeus_model import Amadeus

tokenizer = BertWordPieceTokenizer('data/bert-base-uncased-vocab.txt',
                                   lowercase=True)

model = Amadeus(num_tokens=tokenizer.get_vocab_size(),
                enc_seq_len=4096,
                dec_seq_len=1024)
model.load_state_dict(
    torch.load('models/amadeus-performer-2020-11-03-16.54.13.pt'))
model.eval(fix_proj_matrices=True)

in_seq = torch.randint(0, tokenizer.get_vocab_size(), (1, model.in_seq_len))
out_seq = torch.randint(0, tokenizer.get_vocab_size(), (1, model.out_seq_len))

traced_script_model = torch.jit.trace(model, (in_seq, out_seq),
                                      check_trace=False)
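# save the traced TorchScript module to disk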
traced_script_model.save('traced.pt')
Example #9
def numerize(vocab_path, input_path, bin_path):
    tokenizer = BertWordPieceTokenizer(vocab_path,
                                       unk_token=UNK_TOKEN,
                                       sep_token=SEP_TOKEN,
                                       cls_token=CLS_TOKEN,
                                       pad_token=PAD_TOKEN,
                                       mask_token=MASK_TOKEN,
                                       lowercase=False,
                                       strip_accents=False)
    sentences = []
    with open(input_path, 'r') as f:
        batch_stream = []
        for i, line in enumerate(f):
            batch_stream.append(line)
            if i % 1000 == 0:
                res = tokenizer.encode_batch(batch_stream)
                batch_stream = []
                # flatten the list, skipping the leading [CLS] id of each sentence
                for s in res:
                    sentences.extend(s.ids[1:])
            if i % 100000 == 0:
                print(f'processed {i} lines')
        # encode whatever is left over after the last flushed batch
        if batch_stream:
            for s in tokenizer.encode_batch(batch_stream):
                sentences.extend(s.ids[1:])

    print('convert the data to numpy')

    # convert data to numpy format in uint16
    if tokenizer.get_vocab_size() < 1 << 16:
        sentences = np.uint16(sentences)
    else:
        assert tokenizer.get_vocab_size() < 1 << 31
        sentences = np.int32(sentences)

    # save special tokens for later processing
    sep_index = tokenizer.token_to_id(SEP_TOKEN)
    cls_index = tokenizer.token_to_id(CLS_TOKEN)
    unk_index = tokenizer.token_to_id(UNK_TOKEN)
    mask_index = tokenizer.token_to_id(MASK_TOKEN)
    pad_index = tokenizer.token_to_id(PAD_TOKEN)

    # sanity check
    assert sep_index == SEP_INDEX
    assert cls_index == CLS_INDEX
    assert unk_index == UNK_INDEX
    assert pad_index == PAD_INDEX
    assert mask_index == MASK_INDEX

    print('collect statistics')
    # collect some statistics of the dataset
    n_unks = (sentences == unk_index).sum()
    n_toks = len(sentences)
    p_unks = n_unks * 100. / n_toks
    n_seqs = (sentences == sep_index).sum()
    print(
        f'| {n_seqs} sentences - {n_toks} tokens - {p_unks:.2f}% unknown words'
    )

    # print some statistics
    data = {
        'sentences': sentences,
        'sep_index': sep_index,
        'cls_index': cls_index,
        'unk_index': unk_index,
        'pad_index': pad_index,
        'mask_index': mask_index
    }

    torch.save(data, bin_path, pickle_protocol=4)
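
The special-token names and index constants asserted in numerize() are defined elsewhere in that project. Assuming the vocab was trained with the default special-token order (as in Example #11 below), they would plausibly be:

# assumed definitions, matching the default special-token order [PAD], [UNK], [CLS], [SEP], [MASK]
PAD_TOKEN, UNK_TOKEN, CLS_TOKEN, SEP_TOKEN, MASK_TOKEN = '[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'
PAD_INDEX, UNK_INDEX, CLS_INDEX, SEP_INDEX, MASK_INDEX = 0, 1, 2, 3, 4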
Example #10
import torch
from tokenizers import BertWordPieceTokenizer, Encoding

from amadeus_model import Amadeus

tokenizer = BertWordPieceTokenizer('data/bert-base-uncased-vocab.txt',
                                   lowercase=True)

model = Amadeus(num_tokens=tokenizer.get_vocab_size(),
                enc_seq_len=1024,
                dec_seq_len=512)
checkpoint = torch.load(
    'checkpoints/amadeus-performer-2020-11-25-00.20.57-300.pt')
model.eval(True)
# model.load_state_dict(torch.load('models/amadeus-performer-2020-11-06-12.47.52.pt'))
model.load_state_dict(checkpoint['model_state_dict'])
model.cuda()

run = True

sentences = []

while run:
    try:
        sentence = input('> ')
        if sentence in ['quit', 'exit']:
            run = False
            continue
        sentences.append(tokenizer.encode(sentence))
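        # keep only the three most recent encodings as context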
        if len(sentences) > 3:
            sentences = sentences[-3:]
Example #11
parser.add_argument('--txtfolder',
                    type=str,
                    help='the folder that contains the .txt files')
args = parser.parse_args()

paths = [str(x) for x in Path(str(args.txtfolder)).glob("**/*.txt")]

# Initialize a WordPiece tokenizer for the language model
tokenizer = BertWordPieceTokenizer()

# trainer = BpeTrainer(vocab_size=VOCAB_SIZE, show_progress=True, initial_alphabet=ByteLevel.alphabet())
# tokenizer.train(trainer, paths)
# Customize training:
# tokenizer._tokenizer.post_processor = BertProcessing(("[CLS]", tokenizer.token_to_id("[CLS]")),
#                                                      ("[SEP]", tokenizer.token_to_id("[SEP]")))
tokenizer.train(files=paths,
                vocab_size=VOCAB_SIZE,
                min_frequency=2,
                special_tokens=[
                    "[PAD]",
                    "[UNK]",
                    "[CLS]",
                    "[SEP]",
                    "[MASK]",
                ])
print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))
tokenizer.save_model('./lm_model')
print('tokenizer saved; the WordPiece vocab was written to lm_model/vocab.txt')