def __init__(self, embed_file, dicts, dropout=0.5, gpu=True, embed_size=100):
    super(BaseModel, self).__init__()
    self.gpu = gpu
    self.embed_size = embed_size
    # self.embed_drop = nn.Dropout(p=dropout)

    # make embedding layer
    if embed_file:
        print("loading pretrained embeddings...")
        W = torch.Tensor(extract_wvs.load_embeddings(embed_file))
        print("Size of embedding matrix")
        print(W.size())
        self.embed = nn.Embedding(W.size()[0], W.size()[1])
        self.embed.weight.data = W.clone()
    else:
        # add 2 to include UNK and PAD
        vocab_size = len(dicts[0])
        print("Vocab size: " + str(vocab_size))
        self.embed = nn.Embedding(vocab_size + 2, embed_size)
def __init__(self, Y, embed_file, dicts, lmbda=0, dropout=0.5, gpu=True, embed_size=100):
    super(BaseModel, self).__init__()
    torch.manual_seed(1337)
    self.gpu = gpu
    self.Y = Y
    self.embed_size = embed_size
    self.embed_drop = nn.Dropout(p=dropout)
    self.lmbda = lmbda

    # make embedding layer
    if embed_file:
        print("loading pretrained embeddings...")
        el, ind = extract_wvs.load_embeddings(embed_file)
        # assert that the index of the pretrained embeddings file aligns
        # *exactly* with the one used to embed the text
        assert ind == dicts['w2ind']
        W = torch.Tensor(el)
        self.embed = nn.Embedding(W.size()[0], W.size()[1], padding_idx=0)
        self.embed.weight.data = W.clone()
    else:
        # add 2 to include UNK and PAD
        vocab_size = len(dicts['ind2w'])
        self.embed = nn.Embedding(vocab_size + 2, embed_size, padding_idx=0)
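Worth noting in this variant: padding_idx=0 pins the PAD row of the embedding table to a zero vector that never receives gradient updates. Below is a minimal, self-contained sketch of the same pattern, using toy sizes and invented inputs rather than the real vocabulary or pretrained file:

import torch
import torch.nn as nn

torch.manual_seed(1337)

vocab_size = 8   # toy vocabulary; +2 leaves room for UNK and PAD
embed_size = 4
embed = nn.Embedding(vocab_size + 2, embed_size, padding_idx=0)
embed_drop = nn.Dropout(p=0.5)

# two toy documents, right-padded with 0 to a common length
docs = torch.LongTensor([[3, 5, 7, 0, 0],
                         [2, 4, 6, 8, 1]])
x = embed_drop(embed(docs))
print(x.size())           # torch.Size([2, 5, 4])
print(embed(docs)[0, 3])  # all zeros: the PAD row stays pinned at zero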
def __init__(self, Y, embed_file, dicts, lmbda=0, dropout=0.5, gpu=True, embed_size=100):
    super(BaseModel, self).__init__()
    self.gpu = gpu
    self.Y = Y
    self.embed_size = embed_size
    self.embed_drop = nn.Dropout(p=dropout)
    self.lmbda = lmbda

    # make embedding layer
    if embed_file:
        print("loading pretrained embeddings...")
        W = torch.Tensor(extract_wvs.load_embeddings(embed_file))
        self.embed = nn.Embedding(W.size()[0], W.size()[1])
        self.embed.weight.data = W.clone()
    else:
        # add 2 to include UNK and PAD
        vocab_size = len(dicts[0])
        self.embed = nn.Embedding(vocab_size + 2, embed_size)
def __init__(self, embed_file, dicts, embed_size=100):
    super(BaseModel, self).__init__()
    self.embed_size = embed_size

    # make embedding layer
    if embed_file:
        print("loading pretrained embeddings...")
        W = torch.Tensor(extract_wvs.load_embeddings(embed_file))
        print("Size of embedding matrix")
        print(W.size())
        self.embed = nn.Embedding(W.size()[0], W.size()[1])
        self.embed.weight.data = W.clone()
        # redundant: requires_grad already defaults to True, so the
        # pretrained embeddings are fine-tuned either way
        self.embed.weight.requires_grad = True
    else:
        # add 2 to include UNK and PAD
        vocab_size = len(dicts[0])
        self.embed = nn.Embedding(vocab_size + 2, embed_size)
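The requires_grad flag only matters in the other direction: setting it to False freezes the pretrained embeddings. A hedged sketch of that pattern; the Sequential model here is a stand-in for illustration, not part of the original code:

import torch.nn as nn
import torch.optim as optim

# stand-in model: embedding table plus a linear classifier head
model = nn.Sequential(nn.Embedding(5002, 100), nn.Linear(100, 50))
model[0].weight.requires_grad = False  # freeze the embedding table

# pass the optimizer only the parameters that still require gradients
trainable = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(trainable, lr=1e-3)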
# Set parameters:
maxlen = 200
embedding_dims = 200
nb_filter = 500
filter_length = 4
batch_size = 8
nb_epoch = 10
nb_labels = 50

train_data_path = "../mimicdata/mimic3/train_50.csv"
dev_data_path = "../mimicdata/mimic3/dev_50.csv"
test_data_path = "../mimicdata/mimic3/test_50.csv"
vocab = "../mimicdata/mimic3/vocab.csv"
embed_file = "../mimicdata/mimic3/processed_full.embed"

dicts = datasets.load_lookups(train_data_path, vocab, Y=nb_labels)
vocab_size = len(dicts[0])
embed_weight = extract_wvs.load_embeddings(embed_file)

# Load data
print('Loading data...')

def slim_data_generator(data_path):
    # loop forever so the training loop can keep drawing batches across epochs
    while 1:
        for batch_idx, tup in enumerate(
                datasets.data_generator(data_path, dicts,
                                        batch_size=batch_size,
                                        num_labels=nb_labels)):
            X, y, _, code_set, descs = tup
            X = sequence.pad_sequences(X, maxlen=maxlen)
            yield X, y
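The script stops before showing what consumes slim_data_generator. Below is one plausible wiring under the parameters above: a hypothetical Keras 2 convolutional model (Conv1D / fit_generator API), assuming embed_weight is a NumPy matrix with vocab_size + 2 rows aligned to the vocabulary indices; train_steps is likewise a made-up batches-per-epoch count, not a value from the original script.

import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

model = Sequential()
model.add(Embedding(vocab_size + 2, embedding_dims,
                    weights=[np.asarray(embed_weight)],
                    input_length=maxlen))
model.add(Conv1D(nb_filter, filter_length, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(nb_labels, activation='sigmoid'))  # multi-label: one sigmoid per code
model.compile(loss='binary_crossentropy', optimizer='adam')

train_steps = 1000  # hypothetical; use ceil(num_train_examples / batch_size)
model.fit_generator(slim_data_generator(train_data_path),
                    steps_per_epoch=train_steps, epochs=nb_epoch)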