Example #1
def batch_generator(input_data, y, batch_size, max_sent_length, n_channels=1):
    """
    Yields embedded sentences in matrices of shape (max_sent_length, embedding_size, n_channels).
    """
    X = list(map(lambda s: numpy.stack(s[:max_sent_length]), input_data))
    steps_per_epoch = math.ceil(len(X) / batch_size)
    i = 0

    # Loops indefinitely, as specified in https://keras.io/models/sequential/
    while True:
        sl = slice(i * batch_size, (i + 1) * batch_size)

        mats = X[sl]
        # max_sent_length = numpy.max([m.shape[0] for m in mats])
        y_batch = y[sl]

        # pad all the matrices (sentences) one by one.
        e = get_embedding_dim()
        for k, m in enumerate(mats):
            mats[k] = numpy.vstack(
                (mats[k], numpy.zeros(((max_sent_length - m.shape[0]), e))))
            if n_channels > 0:
                mats[k] = mats[k].reshape(max_sent_length, e, n_channels)

        # Now stack them all into a tensor of shape (batch_size, max_sent_length, embedding_size, n_channels)
        batch = numpy.array(mats)

        # Wrap the batch index with modulo so it cycles through the epoch indefinitely.
        i = (i + 1) % steps_per_epoch

        # One-hot encode the labels (51 classes) for Keras.
        y_keras = to_categorical(y_batch, num_classes=51)

        yield (batch, y_keras)
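A minimal usage sketch (an assumption, not part of the original snippet): `train_X` is a list of per-sentence word-vector sequences and `train_y` the matching labels (both hypothetical names), and `model` is a compiled Keras model expecting inputs of shape (max_sent_length, embedding_size, 1).

# Hypothetical wiring of the generator into Keras training (assumed names).
gen = batch_generator(train_X, train_y, batch_size=32, max_sent_length=100)
steps = math.ceil(len(train_X) / 32)
model.fit_generator(gen, steps_per_epoch=steps, epochs=10)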
Example #2
def VDCNN(embed_type, maxlen=250, filter_sizes=(2, 3, 4, 5)):
    embed_size = utils.get_embedding_dim(embed_type)
    inp = Input(shape=(maxlen, embed_size))
    x = inp

    conv_ops = []
    for filter_size in filter_sizes:
        conv = Conv1D(256, filter_size, activation='relu')(x)
        pool = MaxPool1D(5)(conv)
        conv_ops.append(pool)

    concat = Concatenate(axis=1)(conv_ops)
    # concat = Dropout(0.1)(concat)
    concat = BatchNormalization()(concat)

    conv_2_main = Conv1D(256, 5, activation='relu', padding='same')(concat)
    conv_2_main = BatchNormalization()(conv_2_main)
    conv_2_main = Conv1D(256, 5, activation='relu',
                         padding='same')(conv_2_main)
    conv_2_main = BatchNormalization()(conv_2_main)
    conv_2 = Add()([concat, conv_2_main])
    conv_2 = MaxPool1D(pool_size=2, strides=2)(conv_2)
    # conv_2 = BatchNormalization()(conv_2)
    # conv_2 = Dropout(0.1)(conv_2)

    conv_3_main = Conv1D(256, 5, activation='relu', padding='same')(conv_2)
    conv_3_main = BatchNormalization()(conv_3_main)
    conv_3_main = Conv1D(256, 5, activation='relu',
                         padding='same')(conv_3_main)
    conv_3_main = BatchNormalization()(conv_3_main)
    conv_3 = Add()([conv_2, conv_3_main])
    conv_3 = MaxPool1D(pool_size=2, strides=2)(conv_3)
    # conv_3 = BatchNormalization()(conv_3)
    # conv_3 = Dropout(0.1)(conv_3)

    flat = Flatten()(conv_3)

    op = Dense(256, activation="relu")(flat)
    op = Dropout(0.5)(op)
    op = BatchNormalization()(op)
    op = Dense(128, activation="relu")(op)
    op = Dropout(0.5)(op)
    op = BatchNormalization()(op)
    op = Dense(3, activation="softmax")(op)

    model = Model(inputs=inp, outputs=op)

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=[
                      'sparse_categorical_accuracy',
                      km.sparse_categorical_f1_score()
                  ])
    return model, 'VDCNN_{}.hdf5'.format(embed_type)
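A minimal sketch of how the returned model and checkpoint filename might be used, assuming pre-embedded inputs `X_train` of shape (n_samples, maxlen, embed_size) and integer class labels `y_train` (hypothetical names); ModelCheckpoint is the standard Keras callback.

from keras.callbacks import ModelCheckpoint

model, ckpt_name = VDCNN('fasttext')
checkpoint = ModelCheckpoint(ckpt_name, save_best_only=True)
# y_train holds integer class ids (0..2) to match sparse_categorical_crossentropy.
model.fit(X_train, y_train, batch_size=32, epochs=10,
          validation_split=0.1, callbacks=[checkpoint])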
Example #3
def get_model(embed_type):
    def count_parameters(model):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)

    EMBEDDING_DIM = utils.get_embedding_dim(embed_type)
    N_FILTERS = 512
    FILTER_SIZES = [3, 4, 5, 6]
    OUTPUT_DIM = 3
    DROPOUT = 0.5

    model = CNN1d(EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)

    print(model)

    print(f'The model has {count_parameters(model):,} trainable parameters')
    return model, 'cnn_{}.pt'.format(embed_type)
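A minimal sketch of plugging the returned PyTorch model into a training setup; the optimizer and loss below are assumptions for illustration, not taken from the original code.

model, checkpoint_name = get_model('fasttext')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()
# ... training loop over batches of embedded sentences ...
torch.save(model.state_dict(), './checkpoints/' + checkpoint_name)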
Example #4
def LSTMCNN(embed_type, maxlen=250, filter_sizes=(2, 3, 4, 5)):
    embed_size = utils.get_embedding_dim(embed_type)
    inp = Input(shape=(maxlen, embed_size))
    x = inp

    x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)

    conv_ops = []
    for filter_size in filter_sizes:
        conv = Conv1D(256, filter_size, activation='relu')(x)
        pool = MaxPool1D(5)(conv)
        conv_ops.append(pool)

    concat = Concatenate(axis=1)(conv_ops)
    concat = Dropout(0.5)(concat)
    concat = BatchNormalization()(concat)

    conv_2 = Conv1D(256, 5, activation='relu')(concat)
    conv_2 = MaxPool1D(5)(conv_2)
    conv_2 = BatchNormalization()(conv_2)
    conv_2 = Dropout(0.5)(conv_2)

    conv_3 = Conv1D(256, 5, activation='relu')(conv_2)
    conv_3 = MaxPool1D(5)(conv_3)
    conv_3 = BatchNormalization()(conv_3)
    conv_3 = Dropout(0.1)(conv_3)

    flat = Flatten()(conv_3)

    op = Dense(256, activation="relu")(flat)
    op = Dropout(0.5)(op)
    op = BatchNormalization()(op)
    op = Dense(128, activation="relu")(op)
    op = Dropout(0.5)(op)
    op = BatchNormalization()(op)
    op = Dense(3, activation="softmax")(op)

    model = Model(inputs=inp, outputs=op)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=[
                      'sparse_categorical_accuracy',
                      km.sparse_categorical_f1_score()
                  ])
    return model, 'LSTMCNN_{}.hdf5'.format(embed_type)
Example #5
def get_model(embed_type):
    def count_parameters(model):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)

    embedding_dim = utils.get_embedding_dim(embed_type)
    hidden_dim = 512
    output_dim = 3
    n_layer = 2
    bidirectional = True
    dropout = 0.5

    model = RNN(embedding_dim,
                hidden_dim,
                output_dim,
                n_layer,
                bidirectional,
                dropout)

    print(model)
    print(f'The model has {count_parameters(model):,} trainable parameters')
    return model, 'rnn_{}.pt'.format(embed_type)
Example #6
    def __init__(self,
                 max_seq_len=100,
                 embedding_dim=20,
                 embeddings_path=None,
                 optimizer='adam',
                 batch_size=32,
                 epochs=10,
                 vocab=None,
                 vocab_size=None,
                 class_count=None,
                 **kwargs):

        self.vocab_size = vocab_size
        self.vocab = vocab
        self.class_count = class_count
        self.embedding_dim = embedding_dim
        self.max_seq_len = max_seq_len
        self.embeddings_path = embeddings_path
        if embeddings_path is not None:
            self.embedding_dim = get_embedding_dim(embeddings_path)
        self.optimizer = optimizer
        self.epochs = epochs
        self.batch_size = batch_size
        self.patience = 3
        self.history = None
        self.model = None
        self.pos_tag = None

        self.params = {
            'epochs': self.epochs,
            'max_seq_len': self.max_seq_len,
            'embedding_dim': self.embedding_dim,
            'embeddings_path': self.embeddings_path,
            'optimizer': self.optimizer,
            'batch_size': self.batch_size,
            'vocab': self.vocab,
            'vocab_size': self.vocab_size,
            'class_count': self.class_count
        }
Example #7
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--model',
        type=str,
        default='rnn',
        help=
        "Available models are: 'rnn', 'cnn', 'bilstm', 'fasttext', and 'distilbert'\nDefault is 'rnn'"
    )
    parser.add_argument('--train_data_path',
                        type=str,
                        default="./data/train_clean.csv",
                        help="Path to the training data")
    parser.add_argument('--test_data_path',
                        type=str,
                        default="./data/dev_clean.csv",
                        help="Path to the test data")
    parser.add_argument('--seed', type=int, default=1234)
    parser.add_argument('--vectors',
                        type=str,
                        default='fasttext.simple.300d',
                        help="""
                                Pretrained vectors:
                                Visit 
                                https://github.com/pytorch/text/blob/9ce7986ddeb5b47d9767a5299954195a1a5f9043/torchtext/vocab.py#L146
                                for more 
                                """)
    parser.add_argument('--max_vocab_size', type=int, default=750)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--bidirectional', type=bool, default=True)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--hidden_dim', type=int, default=64)
    parser.add_argument('--output_dim', type=int, default=1)
    parser.add_argument('--n_layers', type=int, default=2)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--n_epochs', type=int, default=5)
    parser.add_argument('--n_filters', type=int, default=100)
    parser.add_argument('--filter_sizes', type=list, default=[3, 4, 5])

    args = parser.parse_args()

    torch.manual_seed(args.seed)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    ##########  BILSTM ##########

    if args.model == "bilstm":
        print('\nBiLSTM')
        TEXT = Field(tokenize='spacy')
        LABEL = LabelField(dtype=torch.float)
        data_fields = [("text", TEXT), ("label", LABEL)]

        train_data = TabularDataset(args.train_data_path,
                                    format='csv',
                                    fields=data_fields,
                                    skip_header=True,
                                    csv_reader_params={'delimiter': ","})

        test_data = TabularDataset(args.test_data_path,
                                   format='csv',
                                   fields=data_fields,
                                   skip_header=True,
                                   csv_reader_params={'delimiter': ","})

        train_data, val_data = train_data.split(split_ratio=0.8,
                                                random_state=random.seed(
                                                    args.seed))

        TEXT.build_vocab(train_data,
                         max_size=args.max_vocab_size,
                         vectors=args.vectors,
                         unk_init=torch.Tensor.normal_)
        LABEL.build_vocab(train_data)

        train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
            (train_data, val_data, test_data),
            batch_size=args.batch_size,
            sort_key=lambda x: len(x.text),
            device=device)

        input_dim = len(TEXT.vocab)
        embedding_dim = get_embedding_dim(args.vectors)
        pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
        unk_idx = TEXT.vocab.stoi[TEXT.unk_token]

        model = BiLSTM(input_dim, embedding_dim, args.hidden_dim,
                       args.output_dim, args.n_layers, args.bidirectional,
                       args.dropout, pad_idx)

        pretrained_embeddings = TEXT.vocab.vectors

        model.embedding.weight.data.copy_(pretrained_embeddings)
        model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
        model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

        optimizer = optim.Adam(model.parameters(), lr=args.lr)
        criterion = nn.BCEWithLogitsLoss()

        model.to(device)
        criterion.to(device)

        best_valid_loss = float('inf')

        print("\nTraining...")
        print("===========")
        for epoch in range(1, args.n_epochs + 1):

            start_time = time.time()

            train_loss, train_acc = train(model, train_iterator, optimizer,
                                          criterion)
            valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

            end_time = time.time()

            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model.state_dict(),
                           './checkpoints/{}-model.pt'.format(args.model))

            print(
                f'[Epoch: {epoch:02}] | Epoch Time: {epoch_mins}m {epoch_secs}s'
            )
            print(
                f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%'
            )
            print(
                f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%'
            )

        model.load_state_dict(
            torch.load('./checkpoints/{}-model.pt'.format(args.model)))

        test_loss, test_acc = evaluate(model, test_iterator, criterion)

        print('\nEvaluating...')
        print("=============")
        print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%'
              )  # Test Loss: 0.139, Test Acc: 95.27%

    ##########  VANILLA RNN ##########

    else:
        print('\nVanilla RNN')
        TEXT = Field(tokenize='spacy')
        LABEL = LabelField(dtype=torch.float)
        data_fields = [("text", TEXT), ("label", LABEL)]

        train_data = TabularDataset(args.train_data_path,
                                    format='csv',
                                    fields=data_fields,
                                    skip_header=True,
                                    csv_reader_params={'delimiter': ","})

        test_data = TabularDataset(args.test_data_path,
                                   format='csv',
                                   fields=data_fields,
                                   skip_header=True,
                                   csv_reader_params={'delimiter': ","})

        train_data, val_data = train_data.split(split_ratio=0.8,
                                                random_state=random.seed(
                                                    args.seed))

        TEXT.build_vocab(train_data,
                         max_size=args.max_vocab_size,
                         vectors=args.vectors)
        LABEL.build_vocab(train_data)

        train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
            (train_data, val_data, test_data),
            batch_size=args.batch_size,
            sort_key=lambda x: len(x.text),
            device=device)

        input_dim = len(TEXT.vocab)
        embedding_dim = get_embedding_dim(args.vectors)

        model = RNN(input_dim, embedding_dim, args.hidden_dim, args.output_dim)

        pretrained_embeddings = TEXT.vocab.vectors

        model.embedding.weight.data.copy_(pretrained_embeddings)

        optimizer = optim.Adam(model.parameters(), lr=args.lr)
        criterion = nn.BCEWithLogitsLoss()

        model.to(device)
        criterion.to(device)

        best_valid_loss = float('inf')

        print("\nTraining...")
        print("===========")
        for epoch in range(1, args.n_epochs + 1):

            start_time = time.time()

            train_loss, train_acc = train(model, train_iterator, optimizer,
                                          criterion)
            valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

            end_time = time.time()

            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model.state_dict(),
                           './checkpoints/{}-model.pt'.format(args.model))

            print(
                f'[Epoch: {epoch:02}] | Epoch Time: {epoch_mins}m {epoch_secs}s'
            )
            print(
                f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%'
            )
            print(
                f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%'
            )

        model.load_state_dict(
            torch.load('./checkpoints/{}-model.pt'.format(args.model)))

        test_loss, test_acc = evaluate(model, test_iterator, criterion)

        print('\nEvaluating...')
        print("=============")
        print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%'
              )  # Test Loss: 0.138, Test Acc: 95.05%
Example #8
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(lambda: max_idf)
        for w, i in tfidf.vocabulary_.items():
            self.word2weight[w] = tfidf.idf_[i]

    def transform(self, X):
        return np.array([
            np.mean([
                self.fasttext_model.get_word_vector(w) * self.word2weight[w]
                for w in words
            ] or [np.zeros(self.dim)],
                    axis=0) for words in X
        ])


if __name__ == '__main__':
    descriptions_df = pnd.read_csv(utils.get_drugs_indication_path(),
                                   encoding=params.UTF_8)
    fasttext_model = FastText.load_model(MODEL_PATH)
    tfidf_emb = TfidfEmbeddingVectorizer(fasttext_model,
                                         utils.get_embedding_dim())
    tfidf_emb.fit(descriptions_df.descriptions)
    embeddings = tfidf_emb.transform(descriptions_df.descriptions)

    embeddings_dict = {}
    for i in range(descriptions_df.shape[0]):
        embeddings_dict[descriptions_df.loc[i, 'drug_names']] = embeddings[i]

    with open(utils.get_drug_embedding_path(), 'wb') as f:
        pickle.dump(embeddings_dict, f)