def commons_testing(model_loaded, test, test_lab, identifier_string):
    """Evaluate a previously trained model on a held-out test set.

    Relies on word2id, max_doc_length, num_time_steps, TIMEDISTRIBUTED,
    trainingdata and num_batch from the enclosing scope.
    """
    # Tokenise and map to ids, falling back to the UNK id for unseen words.
    test = [nltk.word_tokenize(i.lower()) for i in test]
    testTextsSeq = np.array([[word2id.get(w, word2id["UNK"]) for w in sent] for sent in test])
    test_seq = sequence.pad_sequences(testTextsSeq, maxlen=max_doc_length, dtype='int32',
                                      padding='post', truncating='post', value=0.0)

    if TIMEDISTRIBUTED:
        test_lab = tile_reshape(test_lab, num_time_steps)
    else:
        test_lab = to_categorical(test_lab, 2)

    preds = model_loaded.predict(test_seq)

    # Write per-document predictions (with the first 100 tokens of each text) to CSV.
    pd.DataFrame(list(zip(np.argmax(test_lab, axis=1),
                          np.argmax(preds, axis=1),
                          [" ".join(i[:100]) for i in test])),
                 columns=['label', 'prediction', 'text']).to_csv(identifier_string + '_prediction.csv')

    accuracy = accuracy_score(np.argmax(test_lab, axis=1), np.argmax(preds, axis=1))
    print("accuracy:", accuracy)
    f1 = f1_score(np.argmax(test_lab, axis=1), np.argmax(preds, axis=1), average="weighted")
    print("F1=", f1)

    np.savetxt("LSTM" + trainingdata + "_" + identifier_string + "_labels.txt", test_lab)
    np.savetxt("LSTM" + trainingdata + "_" + identifier_string + "_preds.txt", preds)

    tn, fp, fn, tp = confusion_matrix(np.argmax(test_lab, axis=1), np.argmax(preds, axis=1)).ravel()
    print("tn, fp, fn, tp")
    print(tn, fp, fn, tp)

    test_score = model_loaded.evaluate(test_seq, test_lab, batch_size=num_batch, verbose=0)
    print("Test loss:", test_score[0])
    print("Test accuracy:", test_score[1])

    if TIMEDISTRIBUTED:
        test_preds = model_loaded.predict(test_seq)
        retrieve_lstmvis_files(model_loaded, test_seq, test_lab, test_preds, identifier_string)
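# NOTE: tile_reshape() is called above (and again below) but is not defined in
# this section. The following is only a minimal sketch of what it presumably
# does, assuming each document-level label is repeated across all LSTM time
# steps; the exact implementation and output shape are assumptions, not the
# original helper.
def tile_reshape(labels, num_time_steps):
    # Repeat each document-level label over every time step so the targets
    # line up with a per-time-step (TimeDistributed) sigmoid output.
    labels = np.asarray(labels).reshape(-1, 1)      # (num_samples, 1)
    tiled = np.tile(labels, (1, num_time_steps))    # (num_samples, num_time_steps)
    # Depending on the Keras version, a trailing axis may be needed instead:
    # return tiled.reshape(-1, num_time_steps, 1)
    return tiled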
def train_and_test(datapath="/home/ktj250/thesis/data/", emb_model_path="/home/ktj250/thesis/",
                   TIMEDISTRIBUTED=False, trainingdata="liar", num_cells=32, num_epochs=10,
                   dropout=0.4, r_dropout=0.4, num_batch=64, learning_rate=0.0001):
    K.clear_session()
    #colab_directory_path = "/gdrive/My Drive/Thesis/"
    #TIMEDISTRIBUTED = False
    use_pretrained_embeddings = True
    FAKE = 1
    #trainingdata = sys.argv[1]  # "liar", "kaggle", "FNC", "BS"
    print("trainingdata=", trainingdata)

    if trainingdata == "liar":
        train, dev, test, train_lab, dev_lab, test_lab = load_liar_data(datapath)
    elif trainingdata == "kaggle":
        train, test, train_lab, test_lab = load_kaggle_data(datapath)
    elif trainingdata == "FNC":
        train, test, train_lab, test_lab = load_FNC_data(datapath)
    elif trainingdata == "BS":
        train, test, train_lab, test_lab = load_BS_data(datapath)

    train = [nltk.word_tokenize(i.lower()) for i in train]
    test = [nltk.word_tokenize(i.lower()) for i in test]
    if trainingdata == "liar":
        dev = [nltk.word_tokenize(i.lower()) for i in dev]
    else:
        # No separate dev set: hold out the last third of the training data for validation.
        split = int(abs((len(train_lab) / 3) * 2))
        dev = train[split:]
        dev_lab = train_lab[split:]
        train = train[:split]
        train_lab = train_lab[:split]
    print(len(train), len(dev))

    # Build the vocabulary from the training tokens only.
    all_train_tokens = []
    for i in train:
        for word in i:
            all_train_tokens.append(word)
    vocab = set(all_train_tokens)
    word2id = {word: i + 1 for i, word in enumerate(vocab)}  # ids start at 1 so 0 can be used for padding
    word2id["UNK"] = len(word2id) + 1
    id2word = {v: k for k, v in word2id.items()}

    # trainTextsSeq: input sequence for each document (num_samples x max_doc_length after padding)
    trainTextsSeq = np.array([[word2id[w] for w in sent] for sent in train])
    testTextsSeq = np.array([[word2id.get(w, word2id["UNK"]) for w in sent] for sent in test])
    #if trainingdata == "liar":
    devTextsSeq = np.array([[word2id.get(w, word2id["UNK"]) for w in sent] for sent in dev])

    # PARAMETERS
    # vocab_size: number of tokens in vocabulary (+1 for the padding id 0)
    vocab_size = len(word2id) + 1
    # max_doc_length: length of documents after padding/truncation
    max_doc_length = 100  # LIAR 100 (like Wang), Kaggle 3391, FakeNewsCorpus 2669
    # num_samples: number of training data samples
    num_samples = len(train_lab)
    # num_time_steps: number of LSTM time steps, equal to max_doc_length
    num_time_steps = max_doc_length
    embedding_size = 300  # GoogleNews vectors are 300-dimensional
    # Pad/truncate every document to max_doc_length.
    seq = sequence.pad_sequences(trainTextsSeq, maxlen=max_doc_length, dtype='int32',
                                 padding='post', truncating='post', value=0.0)
    print("train seq shape", seq.shape)
    test_seq = sequence.pad_sequences(testTextsSeq, maxlen=max_doc_length, dtype='int32',
                                      padding='post', truncating='post', value=0.0)
    #if trainingdata == "liar":
    dev_seq = sequence.pad_sequences(devTextsSeq, maxlen=max_doc_length, dtype='int32',
                                     padding='post', truncating='post', value=0.0)

    if TIMEDISTRIBUTED:
        train_lab = tile_reshape(train_lab, num_time_steps)
        test_lab = tile_reshape(test_lab, num_time_steps)
        print(train_lab.shape)
        #if trainingdata == "liar":
        dev_lab = tile_reshape(dev_lab, num_time_steps)
    else:
        train_lab = to_categorical(train_lab, 2)
        test_lab = to_categorical(test_lab, 2)
        print(train_lab.shape)
        #if trainingdata == "liar":
        dev_lab = to_categorical(dev_lab, 2)

    print("Parameters:: num_cells: " + str(num_cells) + " num_samples: " + str(num_samples) +
          " embedding_size: " + str(embedding_size) + " epochs: " + str(num_epochs) +
          " batch_size: " + str(num_batch))

    if use_pretrained_embeddings:
        # https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
        # Load Google's pre-trained Word2Vec model.
        model = gensim.models.KeyedVectors.load_word2vec_format(
            emb_model_path + 'GoogleNews-vectors-negative300.bin', binary=True)
        embedding_matrix = np.zeros((len(word2id) + 1, 300))
        for word, i in word2id.items():
            try:
                # Direct indexing works on KeyedVectors (the .wv accessor was removed in gensim 4).
                embedding_vector = model[word]
            except KeyError:
                # Fall back to the vector for "UNK" for out-of-vocabulary words.
                embedding_vector = model["UNK"]
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

    myInput = Input(shape=(max_doc_length,), name='input')
    print(myInput.shape)
    if use_pretrained_embeddings:
        x = Embedding(input_dim=vocab_size, output_dim=embedding_size, weights=[embedding_matrix],
                      input_length=max_doc_length, trainable=True)(myInput)
    else:
        x = Embedding(input_dim=vocab_size, output_dim=embedding_size,
                      input_length=max_doc_length)(myInput)
    print(x.shape)

    if TIMEDISTRIBUTED:
        # One sigmoid prediction per time step, with non-negative weights.
        lstm_out = LSTM(num_cells, dropout=dropout, recurrent_dropout=r_dropout,
                        return_sequences=True, kernel_constraint=NonNeg())(x)
        predictions = TimeDistributed(Dense(1, activation='sigmoid', kernel_constraint=NonNeg()))(lstm_out)
    else:
        # One softmax prediction per document.
        lstm_out = Bidirectional(LSTM(num_cells, dropout=dropout, recurrent_dropout=r_dropout))(x)
        predictions = Dense(2, activation='softmax')(lstm_out)

    model = Model(inputs=myInput, outputs=predictions)
    opt = Adam(lr=learning_rate)
    if TIMEDISTRIBUTED:
        model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
    else:
        model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

    print("fitting model..")
    #if trainingdata == "liar":
    history = model.fit({'input': seq}, train_lab, epochs=num_epochs, verbose=2,
                        batch_size=num_batch, validation_data=(dev_seq, dev_lab))
    #else:
    #    history = model.fit({'input': seq}, train_lab, epochs=num_epochs, verbose=2, batch_size=num_batch)

    print("Testing...")
    test_score = model.evaluate(test_seq, test_lab, batch_size=num_batch, verbose=0)
    #if trainingdata == "liar":
    dev_score = model.evaluate(dev_seq, dev_lab, batch_size=num_batch, verbose=0)
    print("Test loss:", test_score[0])
    print("Test accuracy:", test_score[1])
    #if trainingdata == "liar":
    print("Valid loss:", dev_score[0])
    print("Valid accuracy:", dev_score[1])

    f1 = None  # only computed for the document-level (non-TimeDistributed) model
    if not TIMEDISTRIBUTED:
        preds = model.predict(test_seq)
        f1 = f1_score(np.argmax(test_lab, axis=1), np.argmax(preds, axis=1))
        tn, fp, fn, tp = confusion_matrix(np.argmax(test_lab, axis=1), np.argmax(preds, axis=1)).ravel()
        print("tn, fp, fn, tp")
        print(tn, fp, fn, tp)

    model.summary()
    #if trainingdata == "liar":
    #    return dev_score[1], history
    #else:
    return test_score[1], dev_score[1], history, f1
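# Illustrative driver call only; the values shown are simply the defaults from
# the signature above, and the data/embedding paths are the ones already
# hard-coded there.
test_acc, dev_acc, history, f1 = train_and_test(
    datapath="/home/ktj250/thesis/data/",
    emb_model_path="/home/ktj250/thesis/",
    TIMEDISTRIBUTED=False,
    trainingdata="liar",
    num_cells=32,
    num_epochs=10,
    num_batch=64,
    learning_rate=0.0001,
)
print("test acc:", test_acc, "dev acc:", dev_acc, "F1:", f1)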
def pre_modelling_stuff(TIMEDISTRIBUTED=False, trainingdata="liar",
                        datapath="/home/ktj250/thesis/data/", emb_model_path="/home/ktj250/thesis/"):
    #directory_path = "/gdrive/My Drive/Thesis/"
    #TIMEDISTRIBUTED = False
    use_pretrained_embeddings = True
    FAKE = 1
    #trainingdata = sys.argv[1]  # "liar", "kaggle", "FNC", "BS"
    print("trainingdata=", trainingdata)

    if trainingdata == "liar":
        train, dev, test, train_lab, dev_lab, test_lab = load_liar_data(datapath)
    elif trainingdata == "kaggle":
        train, test, train_lab, test_lab = load_kaggle_data(datapath)
    elif trainingdata == "FNC":
        train, test, train_lab, test_lab = load_FNC_data(datapath)
    elif trainingdata == "BS":
        train, test, train_lab, test_lab = load_BS_data(datapath)

    train = [nltk.word_tokenize(i.lower()) for i in train]
    test = [nltk.word_tokenize(i.lower()) for i in test]
    if trainingdata == "liar":
        dev = [nltk.word_tokenize(i.lower()) for i in dev]
    else:
        # No separate dev set: hold out the last third of the training data for validation.
        split = int(abs((len(train_lab) / 3) * 2))
        dev = train[split:]
        dev_lab = train_lab[split:]
        train = train[:split]
        train_lab = train_lab[:split]
    print(len(train), len(dev))

    # Build the vocabulary from the training tokens only.
    all_train_tokens = []
    for i in train:
        for word in i:
            all_train_tokens.append(word)
    vocab = set(all_train_tokens)
    word2id = {word: i + 1 for i, word in enumerate(vocab)}  # ids start at 1 so 0 can be used for padding
    word2id["UNK"] = len(word2id) + 1
    id2word = {v: k for k, v in word2id.items()}

    # trainTextsSeq: input sequence for each document (num_samples x max_doc_length after padding)
    trainTextsSeq = np.array([[word2id[w] for w in sent] for sent in train])
    testTextsSeq = np.array([[word2id.get(w, word2id["UNK"]) for w in sent] for sent in test])
    #if trainingdata == "liar":
    devTextsSeq = np.array([[word2id.get(w, word2id["UNK"]) for w in sent] for sent in dev])

    # PARAMETERS
    # vocab_size: number of tokens in vocabulary (+1 for the padding id 0)
    vocab_size = len(word2id) + 1
    # max_doc_length: length of documents after padding/truncation
    max_doc_length = 100  # LIAR 100 (like Wang), Kaggle 3391, FakeNewsCorpus 2669
    # num_samples: number of training data samples
    num_samples = len(train_lab)
    # num_time_steps: number of LSTM time steps, equal to max_doc_length
    num_time_steps = max_doc_length
    embedding_size = 300  # GoogleNews vectors are 300-dimensional

    # Pad/truncate every document to max_doc_length.
    seq = sequence.pad_sequences(trainTextsSeq, maxlen=max_doc_length, dtype='int32',
                                 padding='post', truncating='post', value=0.0)
    print("train seq shape", seq.shape)
    test_seq = sequence.pad_sequences(testTextsSeq, maxlen=max_doc_length, dtype='int32',
                                      padding='post', truncating='post', value=0.0)
    #if trainingdata == "liar":
    dev_seq = sequence.pad_sequences(devTextsSeq, maxlen=max_doc_length, dtype='int32',
                                     padding='post', truncating='post', value=0.0)

    if TIMEDISTRIBUTED:
        train_lab = tile_reshape(train_lab, num_time_steps)
        test_lab = tile_reshape(test_lab, num_time_steps)
        print(train_lab.shape)
        #if trainingdata == "liar":
        dev_lab = tile_reshape(dev_lab, num_time_steps)
    else:
        train_lab = to_categorical(train_lab, 2)
        test_lab = to_categorical(test_lab, 2)
        print(train_lab.shape)
        #if trainingdata == "liar":
        dev_lab = to_categorical(dev_lab, 2)

    if use_pretrained_embeddings:
        # https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
        # Load Google's pre-trained Word2Vec model.
        model = gensim.models.KeyedVectors.load_word2vec_format(
            emb_model_path + 'GoogleNews-vectors-negative300.bin', binary=True)
        embedding_matrix = np.zeros((len(word2id) + 1, 300))
        for word, i in word2id.items():
            try:
                # Direct indexing works on KeyedVectors (the .wv accessor was removed in gensim 4).
                embedding_vector = model[word]
            except KeyError:
                # Fall back to the vector for "UNK" for out-of-vocabulary words.
                embedding_vector = model["UNK"]
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

    #if trainingdata == "liar":
    return embedding_matrix, seq, test_seq, dev_seq, train_lab, test_lab, dev_lab, vocab_size
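# Illustrative only: one possible way to call pre_modelling_stuff() and inspect
# the arrays it returns; the unpacking order follows the return statement above.
(embedding_matrix, seq, test_seq, dev_seq,
 train_lab, test_lab, dev_lab, vocab_size) = pre_modelling_stuff(
    TIMEDISTRIBUTED=False, trainingdata="liar",
    datapath="/home/ktj250/thesis/data/", emb_model_path="/home/ktj250/thesis/")
print("vocab size:", vocab_size)
print("train/test/dev seq shapes:", seq.shape, test_seq.shape, dev_seq.shape)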