Example #1
    def __init__(self,
                 embed_file,
                 dicts,
                 dropout=0.5,
                 gpu=True,
                 embed_size=100):
        super(BaseModel, self).__init__()
        self.gpu = gpu
        self.embed_size = embed_size
        self.embed_drop = nn.Dropout(p=dropout)

        #make embedding layer
        if embed_file:
            print("loading pretrained embeddings...")
            W = torch.Tensor(extract_wvs.load_embeddings(embed_file))
            print("Size of embedding matrix")
            print(W.size())
            self.embed = nn.Embedding(W.size()[0], W.size()[1])
            self.embed.weight.data = W.clone()

        else:
            #add 2 to include UNK and PAD
            vocab_size = len(dicts[0])
            print("Vocab size: " + str(vocab_size))
            self.embed = nn.Embedding(vocab_size + 2, embed_size)
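
A minimal sketch of the pretrained-embedding branch above. The weight matrix is a random stand-in for the output of extract_wvs.load_embeddings (a project-specific helper), so only the nn.Embedding handling is shown:

import torch
import torch.nn as nn

W = torch.randn(5000, 100)               # stand-in for the loaded embedding matrix
embed = nn.Embedding(W.size(0), W.size(1))
embed.weight.data = W.clone()            # copy the pretrained vectors in, as above

# Equivalent one-liner in recent PyTorch; freeze=False keeps the vectors trainable
embed_alt = nn.Embedding.from_pretrained(W, freeze=False)

tokens = torch.tensor([[1, 2, 3, 4]])    # (batch, seq_len) of word indices
print(embed(tokens).shape)               # torch.Size([1, 4, 100])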
Example #2
    def __init__(self,
                 Y,
                 embed_file,
                 dicts,
                 lmbda=0,
                 dropout=0.5,
                 gpu=True,
                 embed_size=100):
        super(BaseModel, self).__init__()
        torch.manual_seed(1337)
        self.gpu = gpu
        self.Y = Y
        self.embed_size = embed_size
        self.embed_drop = nn.Dropout(p=dropout)
        self.lmbda = lmbda

        #make embedding layer
        if embed_file:
            print("loading pretrained embeddings...")
            el, ind = extract_wvs.load_embeddings(embed_file)
            # assert that the index of the pretrained embeddings file aligns
            # *exactly* with the index used to embed the text
            assert ind == dicts['w2ind']

            W = torch.Tensor(el)

            self.embed = nn.Embedding(W.size()[0], W.size()[1], padding_idx=0)
            self.embed.weight.data = W.clone()
        else:
            #add 2 to include UNK and PAD
            vocab_size = len(dicts['ind2w'])
            self.embed = nn.Embedding(vocab_size + 2,
                                      embed_size,
                                      padding_idx=0)
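
The padding_idx=0 argument pins index 0 to the PAD token: that row is initialized to zeros and never receives gradient updates. A small standalone check of this behaviour:

import torch
import torch.nn as nn

embed = nn.Embedding(10, 4, padding_idx=0)
print(embed.weight[0])                   # all zeros at the padding row

loss = embed(torch.tensor([[0, 3, 5]])).sum()
loss.backward()
print(embed.weight.grad[0])              # stays zero: PAD is never updated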
Example #3
    def __init__(self,
                 Y,
                 embed_file,
                 dicts,
                 lmbda=0,
                 dropout=0.5,
                 gpu=True,
                 embed_size=100):
        super(BaseModel, self).__init__()
        self.gpu = gpu
        self.Y = Y
        self.embed_size = embed_size
        self.embed_drop = nn.Dropout(p=dropout)
        self.lmbda = lmbda

        #make embedding layer
        if embed_file:
            print("loading pretrained embeddings...")
            W = torch.Tensor(extract_wvs.load_embeddings(embed_file))

            self.embed = nn.Embedding(W.size()[0], W.size()[1])
            self.embed.weight.data = W.clone()
        else:
            #add 2 to include UNK and PAD
            vocab_size = len(dicts[0])
            self.embed = nn.Embedding(vocab_size + 2, embed_size)
Example #4
    def __init__(self, embed_file, dicts, embed_size=100):
        super(BaseModel, self).__init__()
        self.embed_size = embed_size

        #make embedding layer
        if embed_file:
            print("loading pretrained embeddings...")
            W = torch.Tensor(extract_wvs.load_embeddings(embed_file))
            print("Size of embedding matrix")
            print(W.size())
            self.embed = nn.Embedding(W.size()[0], W.size()[1])
            self.embed.weight.data = W.clone()
            self.embed.weight.requires_grad = True  # redundant: Embedding weights track gradients by default

        else:
            vocab_size = len(dicts[0])
            self.embed = nn.Embedding(
                vocab_size + 2, embed_size)  #add 2 to include UNK and PAD
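
As the comment suggests, the requires_grad assignment is normally redundant: nn.Embedding stores its weight as an nn.Parameter, which tracks gradients by default. A quick check, plus the freeze/unfreeze case where the line would matter:

import torch.nn as nn

embed = nn.Embedding(12, 8)
print(embed.weight.requires_grad)        # True by default

embed.weight.requires_grad = False       # freeze pretrained vectors
embed.weight.requires_grad = True        # ...later unfreeze for fine-tuning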
Example #5
# Set parameters:
maxlen = 200
embedding_dims = 200
nb_filter = 500
filter_length = 4
batch_size = 8
nb_epoch = 10
nb_labels = 50
train_data_path = "../mimicdata/mimic3/train_50.csv"
dev_data_path = "../mimicdata/mimic3/dev_50.csv"
test_data_path = "../mimicdata/mimic3/test_50.csv"
vocab = "../mimicdata/mimic3/vocab.csv"
embed_file = "../mimicdata/mimic3/processed_full.embed"
dicts = datasets.load_lookups(train_data_path, vocab, Y=nb_labels)
vocab_size = len(dicts[0])
embed_weight = extract_wvs.load_embeddings(embed_file)

# Load data
print('Loading data...')


def slim_data_generator(data_path):
        while True:
        for batch_idx, tup in enumerate(
                datasets.data_generator(data_path,
                                        dicts,
                                        batch_size=batch_size,
                                        num_labels=nb_labels)):
            X, y, _, code_set, descs = tup
            X = sequence.pad_sequences(X, maxlen=maxlen)
            yield X, y
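
A sketch of how the generator might be consumed. datasets.data_generator and the CSV paths are project-specific, so only the generic Keras side is shown here; sequence.pad_sequences left-pads (or truncates) each list of word indices to maxlen:

from keras.preprocessing import sequence

X = sequence.pad_sequences([[4, 7, 9], [12, 5]], maxlen=4)
print(X)
# [[ 0  4  7  9]
#  [ 0  0 12  5]]

# With the project data available, one padded batch could be drawn directly:
# gen = slim_data_generator(train_data_path)
# X_batch, y_batch = next(gen)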