def __init__(self, num_classes=3):
    """We have some of the best constructors in the world"""

    super(TransformerClassifier, self).__init__()

    # the tokenizer is only needed here to size the embedding table
    tokenizer = CharBPETokenizer(
        '../Tokenize/thyme-tokenizer-vocab.json',
        '../Tokenize/thyme-tokenizer-merges.txt')
    vocab_size = tokenizer.get_vocab_size()

    # token embeddings plus positional encoding
    self.embedding = nn.Embedding(
        num_embeddings=vocab_size,
        embedding_dim=cfg.getint('model', 'emb_dim'))
    self.position = PositionalEncoding(
        embedding_dim=cfg.getint('model', 'emb_dim'))

    # stack of transformer encoder layers
    encoder_layer = nn.TransformerEncoderLayer(
        d_model=cfg.getint('model', 'emb_dim'),
        nhead=cfg.getint('model', 'num_heads'),
        dim_feedforward=cfg.getint('model', 'feedforw_dim'))
    self.trans_encoder = nn.TransformerEncoder(
        encoder_layer=encoder_layer,
        num_layers=cfg.getint('model', 'num_layers'))

    self.dropout = nn.Dropout(cfg.getfloat('model', 'dropout'))
    self.linear = nn.Linear(
        in_features=cfg.getint('model', 'emb_dim'),
        out_features=num_classes)

    self.init_weights()
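# All of these constructors read their hyper-parameters from a [model] section
# via a module-level `cfg` ConfigParser. The sketch below is illustrative only:
# the option names come from the constructor above, but the file name and
# values are assumptions, not taken from the repo.
#
#   # model.cfg (hypothetical)
#   # [model]
#   # emb_dim = 512
#   # num_heads = 8
#   # feedforw_dim = 2048
#   # num_layers = 4
#   # dropout = 0.1
#
# import configparser
# cfg = configparser.ConfigParser()
# cfg.read('model.cfg')                       # assumed path
# model = TransformerClassifier(num_classes=3)  # pulls the [model] options above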
def __init__(self, num_class=4):
    """Constructor"""

    super(BagOfEmbeddings, self).__init__()

    # the tokenizer is only needed here to size the embedding table
    tokenizer = CharBPETokenizer(
        '../Tokenize/thyme-tokenizer-vocab.json',
        '../Tokenize/thyme-tokenizer-merges.txt')
    vocab_size = tokenizer.get_vocab_size()

    # token embeddings feeding a two-layer MLP classifier
    self.embed = nn.Embedding(
        num_embeddings=vocab_size,
        embedding_dim=cfg.getint('model', 'emb_dim'))
    self.hidden1 = nn.Linear(
        in_features=cfg.getint('model', 'emb_dim'),
        out_features=cfg.getint('model', 'hidden_size'))
    self.relu = nn.ReLU()
    self.hidden2 = nn.Linear(
        in_features=cfg.getint('model', 'hidden_size'),
        out_features=cfg.getint('model', 'hidden_size'))
    self.dropout = nn.Dropout(cfg.getfloat('model', 'dropout'))
    self.classif = nn.Linear(
        in_features=cfg.getint('model', 'hidden_size'),
        out_features=num_class)
def __init__(self, num_class=4):
    """Constructor"""

    super(LstmClassifier, self).__init__()

    # the tokenizer is only needed here to size the embedding table
    tokenizer = CharBPETokenizer(
        '../Tokenize/thyme-tokenizer-vocab.json',
        '../Tokenize/thyme-tokenizer-merges.txt')
    vocab_size = tokenizer.get_vocab_size()

    # token embeddings -> LSTM encoder -> dropout -> linear classifier
    self.embed = nn.Embedding(
        num_embeddings=vocab_size,
        embedding_dim=cfg.getint('model', 'emb_dim'))
    self.lstm = nn.LSTM(
        input_size=cfg.getint('model', 'emb_dim'),
        hidden_size=cfg.getint('model', 'hidden_size'))
    self.dropout = nn.Dropout(p=cfg.getfloat('model', 'dropout'))
    self.linear = nn.Linear(
        in_features=cfg.getint('model', 'hidden_size'),
        out_features=num_class)
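# Minimal sketch (not the repo's actual forward method) of how the
# LstmClassifier layers above could be wired together; shapes assume
# nn.LSTM's default batch_first=False layout.
#
# import torch
#
# def forward_sketch(model, token_ids):
#     # token_ids: LongTensor of shape (seq_len, batch)
#     emb = model.embed(token_ids)            # (seq_len, batch, emb_dim)
#     output, (h_n, c_n) = model.lstm(emb)    # h_n: (1, batch, hidden_size)
#     dropped = model.dropout(h_n[-1])        # final hidden state of last layer
#     return model.linear(dropped)            # (batch, num_class) logits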
class SubwordEncoder:
    """Subword tokenization"""

    def __init__(self, path='subword/'):
        """
        Args:
            path: str, a path to vocab file.
        """
        # Load vocab
        self.subword_tokenizer = CharBPETokenizer(
            vocab_file=path + "/bpe-vocab.json",
            merges_file=path + "/bpe-merges.txt")
        self.encode = self._encode_subwords
        self.id_to_token = self._id_to_subword()
        self.token_to_id = self._subword_to_id()

    def get_vocab_size(self):
        return self.subword_tokenizer.get_vocab_size()

    def _encode_subwords(self, sentence, with_eos):
        """
        Args:
            sentence: str, texts to be encoded.
            with_eos: end with <EOS> token.

        Returns:
            tokens: list, encoded sequence.
        """
        tokens = self.subword_tokenizer.encode(sentence).ids
        if with_eos:
            tokens += [2]  # 2 is the id of <EOS> token
        return tokens

    def _id_to_subword(self):
        # build a dict mapping every id in the vocab to its subword string
        id2subword = {}
        for i in range(self.get_vocab_size()):
            id2subword[i] = self.subword_tokenizer.id_to_token(i)
        return id2subword

    def _subword_to_id(self):
        # inverse mapping: subword string -> id
        subword2id = {}
        for i in range(self.get_vocab_size()):
            subword2id[self.subword_tokenizer.id_to_token(i)] = i
        return subword2id
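# Usage sketch for SubwordEncoder, assuming bpe-vocab.json and bpe-merges.txt
# already exist under subword/ (the paths hard-coded in __init__); the input
# sentence is an illustrative example, not repo data.
#
# encoder = SubwordEncoder(path='subword/')
# ids = encoder.encode("patient denies chest pain", with_eos=True)  # ends with id 2
# tokens = [encoder.id_to_token[i] for i in ids]   # dicts built once in the constructor
# print(encoder.get_vocab_size(), ids, tokens)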
def __init__(self, num_class=3):
    """Constructor"""

    super(BagOfEmbeddings, self).__init__()

    # the tokenizer is only needed here to size the embedding table
    tokenizer = CharBPETokenizer(
        '../Tokenize/thyme-tokenizer-vocab.json',
        '../Tokenize/thyme-tokenizer-merges.txt')
    vocab_size = tokenizer.get_vocab_size()

    # token embeddings plus positional encodings taken from pretrained BERT
    self.embed = nn.Embedding(
        num_embeddings=vocab_size,
        embedding_dim=cfg.getint('model', 'emb_dim'))
    self.posit = positions.BertPositionalEncoding.from_pretrained(
        'bert-base-uncased')
    self.dropout = nn.Dropout(cfg.getfloat('model', 'dropout'))
    self.classif = nn.Linear(
        in_features=cfg.getint('model', 'emb_dim'),
        out_features=num_class)
class BPETokenizer:

    def __init__(self, text_list, vocab_size, lazy=False):
        if not lazy:
            # train a byte-pair-encoding tokenizer from scratch
            self.tokenizer = CharBPETokenizer()
            self.tokenizer.train(
                text_list,
                vocab_size=vocab_size,
                special_tokens=[PAD, BOS, EOS, "<unk>"])
            self.tokenizer.add_special_tokens([PAD, BOS, EOS])
        else:
            # defer construction; used by the load() classmethod
            self.tokenizer = None

    def tokens_to_ids(self, tokens):
        return [self.tokenizer.token_to_id(t) for t in tokens]

    def ids_to_tokens(self, ids):
        return [self.tokenizer.id_to_token(i) for i in ids]

    def encode(self, text):
        encodes = self.tokenizer.encode(text)
        return encodes.ids

    def decode(self, ids, skip_special=True):
        return self.tokenizer.decode(ids, skip_special_tokens=skip_special)

    def save(self, path, file_name):
        self.tokenizer.save(path, file_name)

    @classmethod
    def load(cls, vocab, merges):
        # rebuild a tokenizer from previously saved vocab/merges files
        tkz = cls(None, None, lazy=True)
        tkz.tokenizer = CharBPETokenizer(vocab, merges)
        tkz.tokenizer.add_special_tokens([PAD, BOS, EOS])
        return tkz

    def __len__(self):
        return self.tokenizer.get_vocab_size()
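# Usage sketch for BPETokenizer. PAD, BOS, and EOS are string constants defined
# elsewhere in this module; the values and file name below are assumptions for
# illustration. CharBPETokenizer.train() expects a list of text file paths, so
# text_list is a list of files rather than raw strings.
#
# PAD, BOS, EOS = "<pad>", "<s>", "</s>"      # assumed values
#
# tok = BPETokenizer(["corpus.txt"], vocab_size=8000)
# ids = tok.encode("some raw text")
# print(tok.ids_to_tokens(ids))
# print(tok.decode(ids))                      # skips special tokens by default
# print(len(tok))                             # vocab size via __len__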