def get_model(embedding_dimension, essay_length):
    """Returns a compiled feed-forward (Flatten + Dense) model."""
    vocabulary_size = len(tokenizer.word_index) + 1
    embedding_matrix = load_embedding_matrix(GLOVE_DIR, embedding_dimension)

    model = Sequential()
    # Frozen GloVe embeddings: pretrained weights are loaded and not updated during training.
    model.add(Embedding(vocabulary_size,
                        embedding_dimension,
                        weights=[embedding_matrix],
                        input_length=essay_length,
                        trainable=False,
                        mask_zero=False))
    model.add(Flatten())
    model.add(Dense(500))
    model.add(Dropout(0.4))
    model.add(Dense(500))
    model.add(Dropout(0.4))
    # Single sigmoid output regressed with MSE; the L2 activity regularizer
    # weight of 0.0 makes that regularizer a no-op.
    model.add(Dense(1,
                    activation='sigmoid',
                    activity_regularizer=keras.regularizers.l2(0.0)))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model
def get_model(embedding_dimension, essay_length):
    """Returns a compiled Conv1D + LSTM model."""
    vocabulary_size = len(tokenizer.word_index) + 1
    embedding_matrix = load_embedding_matrix(GLOVE_DIR, embedding_dimension)

    model = Sequential()
    # Frozen GloVe embeddings, as in the dense variant above.
    model.add(Embedding(vocabulary_size,
                        embedding_dimension,
                        weights=[embedding_matrix],
                        input_length=essay_length,
                        trainable=False,
                        mask_zero=False))
    model.add(Conv1D(filters=50, kernel_size=5, padding='same'))
    model.add(LSTM(300, dropout=0.4, recurrent_dropout=0.4, return_sequences=True))
    # Mean-over-time pooling of the LSTM output sequence.
    model.add(Lambda(lambda x: K.mean(x, axis=1)))
    model.add(Dropout(0.4))
    model.add(Dense(1,
                    activation='sigmoid',
                    activity_regularizer=keras.regularizers.l2(0.0)))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model
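# ----------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original training code).
# Assumes `essays` is a list of raw essay strings, `scores` is an array of
# targets already scaled to [0, 1] (the sigmoid output trained with MSE implies
# normalized targets), and the module-level `tokenizer` has already been fit on
# the essays. The default essay_length and the fit() parameters are assumptions.
def train_get_model_sketch(essays, scores, embedding_dimension=300, essay_length=500):
    from keras.preprocessing.sequence import pad_sequences

    # Convert texts to index sequences and pad/truncate to a fixed length.
    sequences = tokenizer.texts_to_sequences(essays)
    X = pad_sequences(sequences, maxlen=essay_length, padding='post')

    model = get_model(embedding_dimension, essay_length)
    model.fit(X, scores, epochs=10, batch_size=64, validation_split=0.1)
    return model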
def train(args):
    if not os.path.exists('models/'):
        os.mkdir('models/')

    # numeric_data is a map of zpid to tuple of (zip, beds, baths, price)
    numeric_data, text_data, prices = preprocessing.load_tabular_data()
    word_index, tokenizer = util.tokenize_texts(text_data)
    embedding_matrix = util.load_embedding_matrix(word_index)
    additional_num_data = np.load('tabular_data/add_num_data.npy')

    if args.trainable_layers is None:
        trainable_convnet_layers = 10
    else:
        trainable_convnet_layers = int(args.trainable_layers)

    if args.reg_weight is None:
        reg_weight = 0.01
    else:
        reg_weight = float(args.reg_weight)

    if args.folder is not None:
        # Resume from an existing model folder.
        model, config = load_model(args.folder)
        model_folder = 'models/' + args.folder + '/'
    else:
        config = Config(word_index,
                        embedding_matrix,
                        tokenizer,
                        imagenet_weights=True,
                        trainable_convnet_layers=trainable_convnet_layers,
                        n_classes=50,
                        lr=0.0001,
                        reg_weight=reg_weight,
                        img_only=args.img_only,
                        numeric_input_size=additional_num_data.shape[1] + 2 - 1,
                        numeric_only=args.numeric_only,
                        distance_weight=0.01)
        model = build_model(config)

    if args.name is not None:
        if os.path.exists('models/' + args.name):
            print('A folder with that name already exists.')
            exit()
        os.mkdir('models/' + args.name)
        model_folder = 'models/' + args.name + '/'
    else:
        if not args.test:
            model_subfolders = os.listdir('models/')
            model_folder = 'models/' + str(len(model_subfolders)) + '/'
        else:
            model_folder = ''

    numeric_data = util.preprocess_numeric_data(numeric_data, additional_num_data)

    # Discretize prices into n_classes buckets for classification.
    # bins = util.get_bins(prices, num=config.n_classes)
    bins = util.get_bins(prices, config.n_classes)
    binned_prices = util.buckets(prices, bins)
    np.savetxt('binned_prices.csv', binned_prices, delimiter=',')

    # Inverse-frequency class weights to counter class imbalance.
    class_weights = 1.0 / (1.0 * np.bincount(binned_prices) / len(binned_prices))

    train_model(model, config, numeric_data, text_data, bins, model_folder,
                tokenizer, args.overfit, class_weights)
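# ----------------------------------------------------------------------------
# util.get_bins and util.buckets are defined elsewhere in the repository and are
# not reproduced in this section. The sketch below shows one plausible
# equal-frequency binning scheme (percentile edges + np.digitize) purely for
# illustration; it is not necessarily how util implements them.
def get_bins_sketch(prices, n_classes):
    # Bin edges at evenly spaced percentiles, so each class holds roughly the
    # same number of listings.
    return np.percentile(prices, np.linspace(0, 100, n_classes + 1))


def buckets_sketch(prices, bins):
    # Map each price to a class index in [0, n_classes - 1]; the clip keeps the
    # maximum price inside the top bucket.
    return np.clip(np.digitize(prices, bins) - 1, 0, len(bins) - 2)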
def main():
    df = pd.read_csv(input_path)
    df_sample = df.sample(n=1000, random_state=46)
    df_sample.text = df_sample.text.progress_apply(clean_text)

    # Fit the tokenizer on the cleaned sample to build the vocabulary.
    vocab_text = df_sample.text
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(vocab_text)
    vocab_size = len(tokenizer.word_index) + 1
    print("vocab_size = ", vocab_size)

    embedding_matrix = load_embedding_matrix(glove_100d_path, tokenizer,
                                             vocab_size, EMBED_SIZE)

    # splitting docs into sentences
    df_upd = df_sample.copy()
    df_upd.text = df_upd.text.progress_apply(tokenize_sent)
    padded_doc = df_upd.text.progress_apply(sent_tokenize_pad)
    X = pad_sequences(padded_doc, maxlen=MAX_SENTENCE_LEN)

    # Binary target: 1 if the row's "type" matches the positive label, else 0.
    y = list(df_upd.apply(lambda x: 1 if x["type"] == label else 0, axis=1))

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=36)

    print("training the model")
    han_model = get_attention_model(vocab_size=vocab_size,
                                    embedding_matrix=embedding_matrix,
                                    embed_size=EMBED_SIZE)
    hist = han_model.fit(X_train, y_train,
                         epochs=7,
                         batch_size=32,
                         validation_split=0.2)
def train():
    # Load datasets
    train_dataset = SST2Dataset("./SST-2/train.tsv")
    val_dataset = SST2Dataset("./SST-2/dev.tsv", train_dataset.vocab,
                              train_dataset.reverse_vocab)

    # Create data loaders for creating and iterating over batches
    print(TRAINING_BATCH_SIZE)
    train_loader = DataLoader(train_dataset, batch_size=TRAINING_BATCH_SIZE,
                              collate_fn=collate_fn, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=VAL_BATCH_SIZE,
                            collate_fn=collate_fn)

    # Print out some random examples from the data
    print("Data examples:")
    random_indices = torch.randperm(len(train_dataset))[:8].tolist()
    for index in random_indices:
        sequence_indices, label = train_dataset.sentences[index], train_dataset.labels[index]
        sentiment = "Positive" if label == 1 else "Negative"
        sequence = train_dataset.indices_to_tokens(sequence_indices)
        print(f"Sentiment: {sentiment}. Sentence: {sequence}")
    print()

    embedding_matrix = load_embedding_matrix(train_dataset.vocab)
    model = RNNBinaryClassificationModel(embedding_matrix)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    for epoch in range(NUM_EPOCHS):
        # Total loss across train data
        train_loss = 0.
        # Total number of correctly predicted training labels
        train_correct = 0
        # Total number of training sequences processed
        train_seqs = 0

        tqdm_train_loader = tqdm(train_loader)
        print(f"Epoch {epoch + 1}/{NUM_EPOCHS}")

        model.train()
        for batch_idx, batch in enumerate(tqdm_train_loader):
            sentences_batch, labels_batch = batch

            # Make predictions
            logits = model(sentences_batch)

            # Compute loss and number of correct predictions
            loss = model.loss(logits, labels_batch)
            correct = model.accuracy(logits, labels_batch).item() * len(logits)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accumulate metrics and update status
            train_loss += loss.item()
            train_correct += correct
            train_seqs += len(sentences_batch)
            tqdm_train_loader.set_description_str(
                f"[Loss]: {train_loss / (batch_idx + 1):.4f} [Acc]: {train_correct / train_seqs:.4f}")
        print()

        avg_train_loss = train_loss / len(tqdm_train_loader)
        train_accuracy = train_correct / train_seqs
        print(f"[Training Loss]: {avg_train_loss:.4f} [Training Accuracy]: {train_accuracy:.4f}")

        print("Validating")
        # Total loss across validation data
        val_loss = 0.
        # Total number of correctly predicted validation labels
        val_correct = 0
        # Total number of validation sequences processed
        val_seqs = 0

        tqdm_val_loader = tqdm(val_loader)

        model.eval()
        for batch_idx, batch in enumerate(tqdm_val_loader):
            sentences_batch, labels_batch = batch

            with torch.no_grad():
                # Make predictions
                logits = model(sentences_batch)

                # Compute loss and number of correct predictions and accumulate metrics and update status
                val_loss += model.loss(logits, labels_batch).item()
                val_correct += model.accuracy(logits, labels_batch).item() * len(logits)
                val_seqs += len(sentences_batch)
                tqdm_val_loader.set_description_str(
                    f"[Loss]: {val_loss / (batch_idx + 1):.4f} [Acc]: {val_correct / val_seqs:.4f}")
        print()

        avg_val_loss = val_loss / len(tqdm_val_loader)
        val_accuracy = val_correct / val_seqs
        print(f"[Validation Loss]: {avg_val_loss:.4f} [Validation Accuracy]: {val_accuracy:.4f}")
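# ----------------------------------------------------------------------------
# Every snippet above calls some variant of load_embedding_matrix, whose exact
# signature differs between projects and is not shown in this section. The
# sketch below illustrates the common pattern (read GloVe vectors from a text
# file and fill a vocab_size x embedding_dim matrix indexed by the word -> index
# vocabulary); it is an assumption, not any one project's implementation.
def load_embedding_matrix_sketch(glove_path, word_index, embedding_dim):
    import numpy as np

    # Parse the GloVe file: each line is "word v1 v2 ... vN".
    embeddings = {}
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            embeddings[parts[0]] = np.asarray(parts[1:], dtype='float32')

    # Row 0 stays all zeros for the padding index; out-of-vocabulary words also
    # keep zero vectors.
    matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, idx in word_index.items():
        vector = embeddings.get(word)
        if vector is not None:
            matrix[idx] = vector
    return matrix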