def get_model(embedding_dimension, essay_length):
    """
    Returns compiled model.
    """
    vocabulary_size = len(tokenizer.word_index) + 1
    embedding_matrix = load_embedding_matrix(GLOVE_DIR, embedding_dimension)

    model = Sequential()
    model.add(
        Embedding(vocabulary_size,
                  embedding_dimension,
                  weights=[embedding_matrix],
                  input_length=essay_length,
                  trainable=False,
                  mask_zero=False))
    model.add(Flatten())
    model.add(Dense(500))
    model.add(Dropout(0.4))
    model.add(Dense(500))
    model.add(Dropout(0.4))
    model.add(
        Dense(1,
              activation='sigmoid',
              activity_regularizer=keras.regularizers.l2(0.0)))
    model.compile(loss='mean_squared_error', optimizer='adam')

    return model
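
A minimal usage sketch for the dense variant above, assuming the module-level tokenizer has already been fit on the essay corpus and that padded_essays / normalized_scores are hypothetical arrays of shape (n_essays, essay_length) and (n_essays,), with scores scaled to [0, 1] to match the sigmoid output:

# Hypothetical driver code; padded_essays and normalized_scores are assumed
# to be prepared elsewhere with the same tokenizer used inside get_model.
EMBEDDING_DIMENSION = 300
ESSAY_LENGTH = 500

model = get_model(EMBEDDING_DIMENSION, ESSAY_LENGTH)
model.fit(padded_essays, normalized_scores,
          batch_size=64, epochs=10, validation_split=0.1)
predicted_scores = model.predict(padded_essays)
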
def get_model(embedding_dimension, essay_length):
    """
    Returns compiled model.
    """
    vocabulary_size = len(tokenizer.word_index) + 1
    embedding_matrix = load_embedding_matrix(GLOVE_DIR, embedding_dimension)

    model = Sequential()
    model.add(
        Embedding(vocabulary_size,
                  embedding_dimension,
                  weights=[embedding_matrix],
                  input_length=essay_length,
                  trainable=False,
                  mask_zero=False))
    model.add(Conv1D(filters=50, kernel_size=5, padding='same'))
    model.add(
        LSTM(300, dropout=0.4, recurrent_dropout=0.4, return_sequences=True))
    model.add(Lambda(lambda x: K.mean(x, axis=1)))
    model.add(Dropout(0.4))
    model.add(
        Dense(1,
              activation='sigmoid',
              activity_regularizer=keras.regularizers.l2(0.0)))
    model.compile(loss='mean_squared_error', optimizer='adam')

    return model
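
The Lambda(lambda x: K.mean(x, axis=1)) layer averages the LSTM outputs over the time axis; since mask_zero=False, padded positions are included in that average. A sketch of an interchangeable formulation using a built-in layer (an assumption about intent, not part of the original):

# For unmasked inputs, GlobalAveragePooling1D behaves the same as the
# Lambda(lambda x: K.mean(x, axis=1)) layer above.
from keras.layers import GlobalAveragePooling1D
model.add(GlobalAveragePooling1D())   # in place of the Lambda layer
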
Example #3
def train(args):
    if not os.path.exists('models/'):
        os.mkdir('models/')

    # numeric_data maps zpid to a tuple of (zip, beds, baths, price)
    numeric_data, text_data, prices = preprocessing.load_tabular_data()

    word_index, tokenizer = util.tokenize_texts(text_data)
    embedding_matrix = util.load_embedding_matrix(word_index)
    additional_num_data = np.load('tabular_data/add_num_data.npy')

    # Fall back to default hyperparameters when the CLI arguments are omitted.
    if args.trainable_layers is None:
        trainable_convnet_layers = 10
    else:
        trainable_convnet_layers = int(args.trainable_layers)
    if args.reg_weight is None:
        reg_weight = 0.01
    else:
        reg_weight = float(args.reg_weight)

    if args.folder is not None:
        model, config = load_model(args.folder)
        model_folder = 'models/' + args.folder + '/'
    else:
        config = Config(word_index, embedding_matrix, tokenizer,
                        imagenet_weights=True,
                        trainable_convnet_layers=trainable_convnet_layers,
                        n_classes=50, lr=0.0001, reg_weight=reg_weight,
                        img_only=args.img_only,
                        numeric_input_size=additional_num_data.shape[1] + 2 - 1,
                        numeric_only=args.numeric_only,
                        distance_weight=0.01)
        model = build_model(config)
        # Output folder: use the supplied name, otherwise the next numbered
        # folder, or no folder at all for a test run.
        if args.name is not None:
            if os.path.exists('models/' + args.name):
                print('A folder with that name already exists.')
                exit()
            os.mkdir('models/' + args.name)
            model_folder = 'models/' + args.name + '/'
        else:
            if not args.test:
                model_subfolders = os.listdir('models/')
                model_folder = 'models/' + str(len(model_subfolders)) + '/'
            else:
                model_folder = ''

    numeric_data = util.preprocess_numeric_data(numeric_data, additional_num_data)
    #bins = util.get_bins(prices, num=config.n_classes)
    bins = util.get_bins(prices, config.n_classes)
    binned_prices = util.buckets(prices, bins)
    np.savetxt('binned_prices.csv', binned_prices, delimiter=',')
    class_weights = 1.0 / (1.0 * np.bincount(binned_prices) / len(binned_prices))
    train_model(model, config, numeric_data, text_data, bins, model_folder, tokenizer, args.overfit, class_weights)
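
train() reads its options from an argparse-style namespace. A hedged sketch of the corresponding command-line interface, inferred only from the attributes accessed above (flag names and defaults are assumptions, not the original project's CLI):

import argparse

# CLI inferred from the attributes train() reads; the original flags may differ.
parser = argparse.ArgumentParser()
parser.add_argument('--trainable_layers', default=None)
parser.add_argument('--reg_weight', default=None)
parser.add_argument('--folder', default=None)
parser.add_argument('--name', default=None)
parser.add_argument('--img_only', action='store_true')
parser.add_argument('--numeric_only', action='store_true')
parser.add_argument('--test', action='store_true')
parser.add_argument('--overfit', action='store_true')

if __name__ == '__main__':
    train(parser.parse_args())
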
Example #4
def main():
    df = pd.read_csv(input_path)
    df_sample = df.sample(n=1000, random_state=46)
    df_sample.text = df_sample.text.progress_apply(clean_text)

    vocab_text = df_sample.text
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(vocab_text)
    vocab_size = len(tokenizer.word_index) + 1
    print("vocab_size = ", vocab_size)

    embedding_matrix = load_embedding_matrix(glove_100d_path, tokenizer,
                                             vocab_size, EMBED_SIZE)

    # splitting docs into sentences
    df_upd = df_sample.copy()
    df_upd.text = df_upd.text.progress_apply(tokenize_sent)

    padded_doc = df_upd.text.progress_apply(sent_tokenize_pad)

    X = pad_sequences(padded_doc, maxlen=MAX_SENTENCE_LEN)
    y = list(df_upd.apply(lambda x: 1 if x["type"] == label else 0, axis=1))

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=36)

    print("training the model")
    han_model = get_attention_model(vocab_size=vocab_size,
                                    embedding_matrix=embedding_matrix,
                                    embed_size=EMBED_SIZE)
    hist = han_model.fit(X_train,
                         y_train,
                         epochs=7,
                         batch_size=32,
                         validation_split=0.2)
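
A follow-up step the snippet stops short of: evaluating the trained model on the held-out split. A sketch, assuming get_attention_model compiles the model with an accuracy metric:

# Evaluate on the 20% test split created above
# (assumes metrics=['accuracy'] inside get_attention_model).
loss, accuracy = han_model.evaluate(X_test, y_test, batch_size=32)
print(f"test loss = {loss:.4f}, test accuracy = {accuracy:.4f}")
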
Example #5
def train():
    # Load datasets
    train_dataset = SST2Dataset("./SST-2/train.tsv")
    val_dataset = SST2Dataset("./SST-2/dev.tsv", train_dataset.vocab,
                              train_dataset.reverse_vocab)

    # Create data loaders for creating and iterating over batches
    print(TRAINING_BATCH_SIZE)
    train_loader = DataLoader(train_dataset,
                              batch_size=TRAINING_BATCH_SIZE,
                              collate_fn=collate_fn,
                              shuffle=True)
    val_loader = DataLoader(val_dataset,
                            batch_size=VAL_BATCH_SIZE,
                            collate_fn=collate_fn)

    # Print out some random examples from the data
    print("Data examples:")
    random_indices = torch.randperm(len(train_dataset))[:8].tolist()
    for index in random_indices:
        sequence_indices = train_dataset.sentences[index]
        label = train_dataset.labels[index]
        sentiment = "Positive" if label == 1 else "Negative"
        sequence = train_dataset.indices_to_tokens(sequence_indices)
        print(f"Sentiment: {sentiment}. Sentence: {sequence}")
    print()

    embedding_matrix = load_embedding_matrix(train_dataset.vocab)

    model = RNNBinaryClassificationModel(embedding_matrix)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    for epoch in range(NUM_EPOCHS):
        # Total loss across train data
        train_loss = 0.
        # Total number of correctly predicted training labels
        train_correct = 0
        # Total number of training sequences processed
        train_seqs = 0

        tqdm_train_loader = tqdm(train_loader)
        print(f"Epoch {epoch + 1}/{NUM_EPOCHS}")

        model.train()
        for batch_idx, batch in enumerate(tqdm_train_loader):
            sentences_batch, labels_batch = batch

            # Make predictions
            logits = model(sentences_batch)

            # Compute loss and number of correct predictions
            loss = model.loss(logits, labels_batch)
            correct = model.accuracy(logits, labels_batch).item() * len(logits)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accumulate metrics and update status
            train_loss += loss.item()
            train_correct += correct
            train_seqs += len(sentences_batch)
            tqdm_train_loader.set_description_str(
                f"[Loss]: {train_loss / (batch_idx + 1):.4f} [Acc]: {train_correct / train_seqs:.4f}"
            )
        print()

        avg_train_loss = train_loss / len(tqdm_train_loader)
        train_accuracy = train_correct / train_seqs
        print(
            f"[Training Loss]: {avg_train_loss:.4f} [Training Accuracy]: {train_accuracy:.4f}"
        )

        print("Validating")
        # Total loss across validation data
        val_loss = 0.
        # Total number of correctly predicted validation labels
        val_correct = 0
        # Total number of validation sequences processed
        val_seqs = 0

        tqdm_val_loader = tqdm(val_loader)

        model.eval()
        for batch_idx, batch in enumerate(tqdm_val_loader):
            sentences_batch, labels_batch = batch

            with torch.no_grad():
                # Make predictions
                logits = model(sentences_batch)

                # Compute loss and number of correct predictions, then
                # accumulate metrics and update the progress bar
                val_loss += model.loss(logits, labels_batch).item()
                val_correct += model.accuracy(
                    logits, labels_batch).item() * len(logits)
                val_seqs += len(sentences_batch)
                tqdm_val_loader.set_description_str(
                    f"[Loss]: {val_loss / (batch_idx + 1):.4f} [Acc]: {val_correct / val_seqs:.4f}"
                )
        print()

        avg_val_loss = val_loss / len(tqdm_val_loader)
        val_accuracy = val_correct / val_seqs
        print(
            f"[Validation Loss]: {avg_val_loss:.4f} [Validation Accuracy]: {val_accuracy:.4f}"
        )
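
train() never persists the fitted weights. A minimal sketch of saving and restoring a checkpoint, assuming it runs at the end of train() where model and embedding_matrix are still in scope (the file path is an assumption):

# Hypothetical checkpointing appended to the end of train().
torch.save(model.state_dict(), "sst2_rnn.pt")

# Restore later for inference.
restored = RNNBinaryClassificationModel(embedding_matrix)
restored.load_state_dict(torch.load("sst2_rnn.pt"))
restored.eval()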