import os
import time

import numpy as np
from sklearn import metrics

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard

# Project-local helpers used below and defined elsewhere in this repo:
# read_glove_vectors, read_input_csv, read_input_data, train_data,
# model_selector (and the model_selector2/3 variants), and the data-loading
# module imported as `ld`.


def train_pair(args, train_csv, test_csv):
    print('Reading word vectors.')
    embeddings_index = read_glove_vectors(args.embedding_file_path)
    print('Found {} word vectors.'.format(len(embeddings_index)))

    print('Processing input data')
    x_train, y_train, x_test, y_test, word_index = read_input_csv(
        train_csv, test_csv, args.nb_words, args.max_sequence_len)
    print('train tensor {}.'.format(x_train.shape))

    print('Preparing embedding matrix.')
    # Initialize the embedding matrix with zero vectors; row 0 stays all-zero
    # and serves as the padding row.
    nb_words = min(args.nb_words, len(word_index))
    embedding_matrix = np.zeros((nb_words + 1, args.embedding_dim))
    for word, i in word_index.items():
        if i > nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    args.nb_words = nb_words
    # args.len_labels_index = len(labels_index)
    args.len_labels_index = 2  # fixed to two classes for sentiment detection

    model = model_selector(args, embedding_matrix)

    checkpoint_filepath = os.path.join(args.model_dir, "weights.best.hdf5")
    checkpoint = ModelCheckpoint(checkpoint_filepath, monitor='val_loss',
                                 verbose=1, save_best_only=True)
    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
    tsb = TensorBoard(log_dir='./log', histogram_freq=0, write_graph=True,
                      write_images=False)
    callbacks_list = [checkpoint, earlystop, tsb]

    # Save the model architecture alongside the checkpointed weights.
    model_json = model.to_json()
    with open(os.path.join(args.model_dir, "model.json"), "w") as json_file:
        json_file.write(model_json)

    model.fit(x_train, y_train, validation_split=0.1, nb_epoch=args.num_epochs,
              batch_size=args.batch_size, callbacks=callbacks_list)

    classes = earlystop.model.predict_classes(x_test, batch_size=args.batch_size)
    # np_utils.accuracy compares class indices, so collapse the one-hot test
    # labels to class indices first.
    acc = np_utils.accuracy(classes, np_utils.categorical_probas_to_classes(y_test))
    print('Test accuracy: {}.'.format(acc))
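# For reference, a minimal sketch of what read_glove_vectors is assumed to do:
# parse a GloVe-style text file ("word v1 v2 ... vd" per line) into a dict
# mapping each word to its vector. The real helper lives elsewhere in this
# repo; the name and return type here are assumptions based on how it is used.
def _read_glove_vectors_sketch(path):
    embeddings_index = {}
    with open(path) as f:
        for line in f:
            values = line.rstrip().split(' ')
            word = values[0]
            # The remaining fields are the embedding coefficients.
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')
    return embeddings_index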
def train(args):
    print('Reading word vectors.')
    embeddings_index = read_glove_vectors(args.embedding_file_path)
    print('Found {} word vectors.'.format(len(embeddings_index)))

    print('Processing input data')
    # texts - list of text samples; labels - list of numeric label ids
    texts, labels = read_input_data(args.data_dir)
    print('Found {} texts.'.format(len(texts)))

    # Vectorize the text samples into a 2D integer tensor.
    tokenizer = Tokenizer(nb_words=args.nb_words)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print('Found {} unique tokens.'.format(len(word_index)))

    data = pad_sequences(sequences, maxlen=args.max_sequence_len)
    x_train, y_train = train_data(data, labels)
    x_train = np.array(x_train).astype('int32')

    # Transform labels into categorical (one-hot) variables.
    labels = to_categorical(np.asarray(labels))
    y_train = to_categorical(np.asarray(y_train))
    print('Shape of total data tensor:', data.shape)
    print('Shape of total label tensor:', labels.shape)

    # Shuffle the training set; the full (shuffled) dataset is used as the
    # validation set below.
    indices = np.arange(x_train.shape[0])
    np.random.shuffle(indices)
    x_train = x_train[indices]
    y_train = y_train[indices]

    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    x_val = data[indices]
    y_val = labels[indices]

    print('Preparing embedding matrix.')
    # Initialize the embedding matrix with zero vectors.
    nb_words = min(args.nb_words, len(word_index))
    embedding_matrix = np.zeros((nb_words + 1, args.embedding_dim))
    for word, i in word_index.items():
        if i > nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    args.nb_words = nb_words
    args.len_labels_index = 3

    model = model_selector(args, embedding_matrix)

    checkpoint_filepath = os.path.join(args.model_dir, "new.en.msd.weights.best.hdf5")
    checkpoint = ModelCheckpoint(checkpoint_filepath, monitor='val_acc',
                                 verbose=1, save_best_only=True)
    callbacks_list = [checkpoint]

    model_json = model.to_json()
    with open(os.path.join(args.model_dir, "new.en.msd.model.json"), "w") as json_file:
        json_file.write(model_json)

    model.fit(x_train, y_train, validation_data=(x_val, y_val),
              nb_epoch=args.num_epochs, batch_size=args.batch_size,
              callbacks=callbacks_list, verbose=1)

    # Write per-sample class probabilities for the full dataset.
    proba = model.predict_proba(data, batch_size=300)
    np.savetxt('new_en_msd', proba, delimiter='\t', fmt='%.6f')
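# The embedding-matrix construction above is repeated verbatim in every train
# function in this file; a sketch of the same logic factored into one helper
# (hypothetical name, not part of this repo):
def _build_embedding_matrix_sketch(word_index, embeddings_index, nb_words, embedding_dim):
    # Row 0 stays all-zero and doubles as the padding vector.
    nb_words = min(nb_words, len(word_index))
    matrix = np.zeros((nb_words + 1, embedding_dim))
    for word, i in word_index.items():
        if i > nb_words:
            continue
        vector = embeddings_index.get(word)
        if vector is not None:
            # Words missing from the pretrained vocabulary keep the zero vector.
            matrix[i] = vector
    return matrix, nb_words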
def train(args):
    print('Reading word vectors.')
    #embeddings_index = read_glove_vectors(args.embedding_file_path)
    embeddings_index = read_glove_vectors(
        "/home/quan/Desktop/CNN-Sentence-Classifier/app/GoogleNews-vectors-negative300.txt")
    #embeddings_index = read_glove_vectors("/home/duong/Desktop/CNN-Sentence-Classifier/app/glove.txt")
    print('Found {} word vectors in embedding1.'.format(len(embeddings_index)))
    #print('Found {} word vectors in embedding2.'.format(len(embeddings_index2)))

    print('Processing input data')
    # Alternative dataset lists, kept for reference:
    # input_name = ["input_CR_prccd.txt", "input_Sub_prccd.txt", "input_MPQA_prccd.txt", "inputPCQM_prccd.txt",
    #               "input_flood_phi_prccd.txt", "input_flood_colorado_prccd.txt", "input_flood_qeen_prccd.txt",
    #               "input_flood_manila_prccd.txt", "input_fire_australia_prccd.txt", "input_earthquake_chile_prccd.txt"]
    # label_name = ["label_CR.txt", "label_input_Sub.txt", "label_MPQA.txt", "labelPCQM.txt", "label_flood_phi.txt",
    #               "label_flood_colorado.txt", "label_flood_qeen.txt", "label_flood_manila.txt",
    #               "label_fire_australia.txt", "label_earthquake_chile.txt"]
    input_name = ["input_Nepal2.txt"]
    label_name = ["label_Nepal2.txt"]

    with open("30JanMulti_Nepal_Train2_W2V_nonstatic.txt", 'w') as result_CV:
        for idx in range(len(input_name)):
            # texts - list of text samples
            # labels_index - dictionary mapping label name to numeric id
            # labels - list of label ids
            texts, labels_index, labels = read_input_data(
                args.data_dir, input_name[idx], label_name[idx])
            print('Found {} texts.'.format(len(texts)))

            # Vectorize the text samples into a 2D integer tensor.
            tokenizer = Tokenizer(nb_words=args.nb_words)
            tokenizer.fit_on_texts(texts)
            sequences = tokenizer.texts_to_sequences(texts)
            word_index = tokenizer.word_index
            print('Found {} unique tokens.'.format(len(word_index)))

            data = pad_sequences(sequences, maxlen=args.max_sequence_len)

            # Transform labels into categorical (one-hot) variables.
            labels = to_categorical(np.asarray(labels))
            print('Shape of data tensor:', data.shape)
            print('Shape of label tensor:', labels.shape)

            # Shuffle the data once before slicing it into folds.
            indices = np.arange(data.shape[0])
            np.random.shuffle(indices)
            data = data[indices]
            labels = labels[indices]

            print('Preparing embedding matrix.')
            # Initialize the embedding matrix with zero vectors for embedding1.
            nb_words = min(args.nb_words, len(word_index))
            embedding_matrix = np.zeros((nb_words + 1, args.embedding_dim))
            for word, i in word_index.items():
                if i > nb_words:
                    continue
                embedding_vector = embeddings_index.get(word)
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector

            args.nb_words = nb_words
            args.len_labels_index = len(labels_index)

            '''Remember to uncomment the model matching the model.fit call below.'''
            model = model_selector(args, embedding_matrix)
            #model = model_selector2(args, embedding_matrix, embedding_matrix2)
            #model = model_selector3(args, embedding_matrix)

            # 10-fold cross-validation over the shuffled data; a sketch of the
            # equivalent split with sklearn follows this function.
            cv_scores = []
            ROC_scores = []
            fold = 10
            for i in range(fold):
                print("\n\n")
                print("-------------FOLD :", (i + 1))
                # Integer window size; remainder samples are dropped.
                window_data = data.shape[0] // fold

                # Generate training/validation batches from fold indices.
                x_train1 = data[:i * window_data]
                x_train2 = data[(i + 1) * window_data:]
                y_train1 = labels[:i * window_data]
                y_train2 = labels[(i + 1) * window_data:]
                if i == 0:
                    x_train = x_train2
                    y_train = y_train2
                else:
                    x_train = np.concatenate((x_train1, x_train2), axis=0)
                    y_train = np.concatenate((y_train1, y_train2), axis=0)
                x_val = data[i * window_data:(i + 1) * window_data]
                y_val = labels[i * window_data:(i + 1) * window_data]

                # Clear the previous fold's model and create a fresh one.
                model = None
                model = model_selector(args, embedding_matrix)

                # Optional checkpointing/early stopping (disabled here):
                # checkpoint_filepath = os.path.join(args.model_dir, "weights.best.hdf5")
                # earlystopper = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
                # checkpointer = ModelCheckpoint(checkpoint_filepath, monitor='val_loss',
                #                                verbose=1, save_best_only=True)
                # callbacks_list = [earlystopper, checkpointer]
                # model_json = model.to_json()
                # with open(os.path.join(args.model_dir, "model.json"), "w") as json_file:
                #     json_file.write(model_json)

                # model.fit(x_train, y_train, epochs=30, batch_size=32, verbose=0)
                model.fit(x_train, y_train, validation_data=(x_val, y_val),
                          nb_epoch=args.num_epochs, batch_size=args.batch_size)

                y_prob = model.predict(x_val)
                roc = metrics.roc_auc_score(y_val, y_prob)
                print("ROC Prediction (binary classification):", roc)

                scores = model.evaluate(x_val, y_val, verbose=0)
                print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
                cv_scores.append(scores[1] * 100)
                ROC_scores.append(roc * 100)

            # Report mean and standard deviation across the folds.
            print(input_name[idx])
            print("ACC: %.2f%% (+/- %.2f%%)" % (np.mean(cv_scores), np.std(cv_scores)))
            print("ROC: %.2f%% (+/- %.2f%%)" % (np.mean(ROC_scores), np.std(ROC_scores)))
            result_CV.write(input_name[idx] +
                            " ACC: %.2f%% (+/- %.2f%%)" % (np.mean(cv_scores), np.std(cv_scores)) +
                            " ROC: %.2f%% (+/- %.2f%%)" % (np.mean(ROC_scores), np.std(ROC_scores)) + '\n')
            result_CV.write(time.asctime(time.localtime(time.time())) + '\n')
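# The manual fold slicing above drops remainder samples when data.shape[0] is
# not divisible by `fold`. A sketch of the same 10-fold split using
# sklearn.model_selection.KFold, which distributes the remainder automatically
# (KFold is a swap-in for illustration, not what this repo uses):
from sklearn.model_selection import KFold

def _kfold_split_sketch(data, labels, fold=10):
    # Yields (x_train, y_train, x_val, y_val) per fold; data is assumed to be
    # shuffled already, as in the function above.
    for train_idx, val_idx in KFold(n_splits=fold).split(data):
        yield data[train_idx], labels[train_idx], data[val_idx], labels[val_idx]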
def train(args):
    print('Reading word vectors.')
    #embeddings_index = read_glove_vectors(args.embedding_file_path)
    embeddings_index = read_glove_vectors(
        "/home/duong/Desktop/CNN-Sentence-Classifier/app/GoogleNews-vectors-negative300.txt")
    #embeddings_index2 = read_glove_vectors("/home/duong/Desktop/CNN-Sentence-Classifier/app/glove2.txt")
    print('Found {} word vectors in embedding1.'.format(len(embeddings_index)))
    #print('Found {} word vectors in embedding2.'.format(len(embeddings_index2)))

    print('Processing input data')
    # texts - list of text samples
    # labels_index - dictionary mapping label name to numeric id
    # labels - list of label ids
    texts, labels_index, labels = read_input_data(args.data_dir)
    print('Found {} texts.'.format(len(texts)))

    # Vectorize the text samples into a 2D integer tensor.
    tokenizer = Tokenizer(nb_words=args.nb_words)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print('Found {} unique tokens.'.format(len(word_index)))

    data = pad_sequences(sequences, maxlen=args.max_sequence_len)

    # Transform labels into categorical (one-hot) variables.
    labels = to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    # Split the input data into training and validation sets.
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]
    nb_validation_samples = int(args.validation_split * data.shape[0])
    x_train = data[:-nb_validation_samples]
    y_train = labels[:-nb_validation_samples]
    x_val = data[-nb_validation_samples:]
    y_val = labels[-nb_validation_samples:]

    print('Preparing embedding matrix.')
    # Initialize the embedding matrix with zero vectors for embedding1.
    nb_words = min(args.nb_words, len(word_index))
    embedding_matrix = np.zeros((nb_words + 1, args.embedding_dim))
    for word, i in word_index.items():
        if i > nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    args.nb_words = nb_words
    args.len_labels_index = len(labels_index)

    '''Remember to uncomment the model matching the model.fit call below.'''
    model = model_selector(args, embedding_matrix)
    #model = model_selector2(args, embedding_matrix, embedding_matrix2)
    #model = model_selector3(args, embedding_matrix)

    # checkpoint_filepath must be defined here: both the checkpointer and the
    # weight reload below depend on it.
    checkpoint_filepath = os.path.join(args.model_dir, "weights.best.hdf5")
    earlystopper = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
    checkpointer = ModelCheckpoint(checkpoint_filepath, monitor='val_loss',
                                   verbose=1, save_best_only=True)
    callbacks_list = [earlystopper, checkpointer]

    model_json = model.to_json()
    with open(os.path.join(args.model_dir, "model.json"), "w") as json_file:
        json_file.write(model_json)

    model.fit(x_train, y_train, validation_data=(x_val, y_val),
              nb_epoch=args.num_epochs, batch_size=args.batch_size,
              callbacks=callbacks_list)
    # Two-input / three-input variants for model_selector2/3:
    #model.fit([x_train, x_train], y_train, validation_data=([x_val, x_val], y_val),
    #          nb_epoch=args.num_epochs, batch_size=args.batch_size, callbacks=callbacks_list)
    #model.fit([x_train, x_train, x_train], y_train, validation_data=([x_val, x_val, x_val], y_val),
    #          nb_epoch=args.num_epochs, batch_size=args.batch_size, callbacks=callbacks_list)

    print("Test model ...")
    print("Loading ...", checkpoint_filepath)
    # Reload the best checkpointed weights before evaluating.
    model.load_weights(checkpoint_filepath)
    y_prob = model.predict(x_val)
    roc = metrics.roc_auc_score(y_val, y_prob)
    print("ROC Prediction (binary classification):", roc)
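# The function above saves the architecture to model.json and the best weights
# to weights.best.hdf5; a minimal sketch of reloading both for later inference,
# using standard Keras calls and the same paths as above:
from keras.models import model_from_json

def _reload_model_sketch(model_dir):
    with open(os.path.join(model_dir, "model.json")) as json_file:
        model = model_from_json(json_file.read())
    model.load_weights(os.path.join(model_dir, "weights.best.hdf5"))
    return model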
def run(args):
    # Data paths
    path = args.data_dir
    #path = "data/"
    X_train_path = os.path.join(path, 'X.train')
    Y_train_path = os.path.join(path, 'Y.train')
    X_online_test = os.path.join(path, 'X.test')
    all_text_path = os.path.join(path, 'all_text')
    id_test = os.path.join(path, 'id.test')

    # Fix the random seed for reproducibility.
    seed = 13
    np.random.seed(seed)

    #print("Reading all text...")
    all_texts = open(all_text_path).readlines()

    #print("Tokenizing...")
    tokenizer = Tokenizer(num_words=args.nb_words)
    #print("Fitting...")
    tokenizer.fit_on_texts(all_texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    print("Loading training data...")
    # 'shulf' is the loader's (misspelled) shuffle flag, kept as-is to match
    # the helper's signature.
    X_train, y_train = ld.load_data(shulf=True, X_train=X_train_path,
                                    Y_train=Y_train_path, tokenizer=tokenizer,
                                    max_len=args.max_sequence_len)

    # Select the model.
    model = model_selector(args, word_index)
    print(model.summary())

    # Callback list
    callbacks = []
    #filepath = "weights-improvement-{epoch:02d}-{acc:.2f}.hdf5"
    #check_point = keras.callbacks.ModelCheckpoint(filepath, monitor='val_loss', verbose=0,
    #                                              save_best_only=True, save_weights_only=False,
    #                                              mode='auto', period=1)
    if args.early_stop == 1:
        early_stop = keras.callbacks.EarlyStopping(monitor='val_acc', min_delta=0,
                                                   patience=0, verbose=0, mode='auto')
        callbacks.append(early_stop)

    print("Training...")
    r = model.fit(X_train, y_train, epochs=args.num_epochs,
                  batch_size=args.batch_size, callbacks=callbacks,
                  validation_split=args.validation_split)

    # Find the best validation accuracy across epochs.
    best_val_acc = max(r.history['val_acc'])
    if best_val_acc < 0.69:
        print("Low Val_Acc.")

    # Load the online test set (X.test).
    X_validate, y_validate = ld.load_data(X_train=X_online_test, Y_train=None,
                                          tokenizer=tokenizer,
                                          max_len=args.max_sequence_len)
    # Predict, then convert categorical probabilities to numeric class labels.
    y_validate = model.predict(X_validate, verbose=0)
    y_validate = np.argmax(y_validate, axis=1)

    # Compare with the baseline predictions, if given.
    count_diff = 0
    count_same = 0
    if args.baseline:
        with open(args.baseline) as f:
            lines = f.readlines()
        for i in range(len(lines)):
            if y_validate[i] == int(lines[i].split('\t')[1]):
                count_same += 1
            else:
                count_diff += 1
        print("Same:%d" % count_same)

    # Load test ids and generate the result rows.
    ids = np.loadtxt(id_test, dtype=bytes).astype(str)
    assert len(ids) == len(y_validate)
    result = [[ids[i] + "\t" + str(y_validate[i]) + "\t" + "NULL" + "\t" + "NULL"]
              for i in range(len(y_validate))]

    # Timestamped output name encoding the run's hyperparameters.
    ts = time.strftime("%Y%m%d-%H%M%S", time.localtime())
    output = ts + ("_acc_%.2f_%s_%s_%d_%d_%d_%d_%d_%d" %
                   (best_val_acc * 100, args.model_name, path.replace("/", "-"),
                    args.nb_words, args.max_sequence_len, args.embedding_dim,
                    args.batch_size, count_same, args.use_word_embedding))
    np.savetxt("result_" + output + ".txt", result, fmt='%s')

    # Done
    print("Done!")
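# A minimal sketch of the `args` namespace these functions expect, built with
# argparse. The flag names mirror the attributes accessed above; the defaults
# are illustrative assumptions, not values taken from this repo.
import argparse

def _build_args_sketch():
    parser = argparse.ArgumentParser(description='CNN sentence classifier')
    parser.add_argument('--data_dir', default='data/')
    parser.add_argument('--model_dir', default='model/')
    parser.add_argument('--embedding_file_path', default='glove.txt')
    parser.add_argument('--nb_words', type=int, default=20000)
    parser.add_argument('--max_sequence_len', type=int, default=1000)
    parser.add_argument('--embedding_dim', type=int, default=300)
    parser.add_argument('--num_epochs', type=int, default=10)
    parser.add_argument('--batch_size', type=int, default=50)
    parser.add_argument('--validation_split', type=float, default=0.1)
    parser.add_argument('--early_stop', type=int, default=0)
    parser.add_argument('--baseline', default=None)
    parser.add_argument('--model_name', default='cnn')
    parser.add_argument('--use_word_embedding', type=int, default=1)
    return parser.parse_args()

# Example usage under those assumptions: train(_build_args_sketch())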