def Update_Word_Embedding(language):
    # Rebuild the tokenizer and embedding matrix for the selected language
    # (0 = Persian, 1 = English) and save them next to the preprocessed data.
    if language == 0:
        question_pre = load_data(pre_path_files + 'Persian/question_pre')
        answering_pre = load_data(pre_path_files + 'Persian/answering_pre')
        tokenizer, embedding_matrix = word_embed_meta_data(
            question_pre + answering_pre, siamese_config['EMBEDDING_DIM'])
        save_data(pre_path_files + 'Persian/tokenizer', tokenizer)
        save_data(pre_path_files + 'Persian/embedding_matrix', embedding_matrix)
    elif language == 1:
        question_pre = load_data(pre_path_files + 'English/question_pre')
        answering_pre = load_data(pre_path_files + 'English/answering_pre')
        tokenizer, embedding_matrix = word_embed_meta_data(
            question_pre + answering_pre, siamese_config['EMBEDDING_DIM'])
        save_data(pre_path_files + 'English/tokenizer', tokenizer)
        save_data(pre_path_files + 'English/embedding_matrix', embedding_matrix)
def Word_Embedding(language='Persian'):
    # Train (or reuse) a Word2Vec model for the given language, then build and
    # persist the tokenizer and embedding matrix.
    if not os.path.exists(pre_path_files + language + '/Word2Vec'):
        os.makedirs(pre_path_files + language + '/Word2Vec')
        file = 'model-' + datetime.datetime.now().strftime("%Y-%m-%d-%I-%M") + '.model'
    else:
        # Reuse the first model file already saved in the Word2Vec directory.
        file = os.path.basename(glob.glob(pre_path_files + language + '/Word2Vec/*.model')[0])
    question_pre = load_data(pre_path_files + language + '/question_pre')
    question_pre = [k for k, v in itertools.groupby(question_pre)]  # drop consecutive duplicates
    answering_pre = load_data(pre_path_files + language + '/answering_pre')
    answering_pre = [k for k, v in itertools.groupby(answering_pre)]
    tokenizer, embedding_matrix = word_embed_meta_data(
        question_pre + answering_pre,
        siamese_config['EMBEDDING_DIM'],
        pre_path_files + language + '/Word2Vec/' + file)
    save_data(pre_path_files + language + '/Word2Vec/tokenizer', tokenizer)
    save_data(pre_path_files + language + '/Word2Vec/embedding_matrix', embedding_matrix)
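# word_embed_meta_data is a project helper that is not shown here; the sketch
# below is only an assumption of what it likely does (fit a Keras tokenizer,
# train a gensim Word2Vec model, and fill an embedding matrix). The function
# name, the tensorflow/gensim imports, and the gensim-4 `vector_size` argument
# (older gensim uses `size`) are not from the original code.
import numpy as np
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer


def word_embed_meta_data_sketch(documents, embedding_dim):
    # Fit a word-level tokenizer on the raw documents.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(documents)

    # Train a Word2Vec model on the whitespace-tokenised documents.
    tokenized = [doc.lower().split() for doc in documents]
    w2v = Word2Vec(sentences=tokenized, vector_size=embedding_dim, min_count=1)

    # Build an embedding matrix aligned with the tokenizer's word index
    # (row 0 is reserved for padding).
    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
    for word, index in tokenizer.word_index.items():
        if word in w2v.wv:
            embedding_matrix[index] = w2v.wv[word]
    return tokenizer, embedding_matrix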
def fn():
    # Load the labelled sentence pairs, preprocess them, and build the word
    # embedding metadata used by the Siamese model.
    df = pd.read_csv('train_data.csv')
    sentences1 = list(df['sentences1'])
    sentences2 = list(df['sentences2'])
    is_similar = list(df['is_similar'])
    sentences1 = pre.preprocessing(sentences1)
    sentences2 = pre.preprocessing(sentences2)
    tokenizer, embedding_matrix = word_embed_meta_data(
        sentences1 + sentences2, siamese_config['EMBEDDING_DIM'])
    embedding_meta_data = {
        'tokenizer': tokenizer,
        'embedding_matrix': embedding_matrix
    }
    return tokenizer, embedding_matrix, embedding_meta_data
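# Minimal usage sketch (not from the original file): once fn() has built the
# tokenizer, a new sentence pair can be turned into padded index sequences for
# the Siamese network. The example sentences are placeholders, and the
# tensorflow.keras import is an assumption about the project's Keras flavour.
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer, embedding_matrix, embedding_meta_data = fn()
q1, q2 = "how do I reset my password", "how can I change my password"
seq1 = pad_sequences(tokenizer.texts_to_sequences([q1]),
                     maxlen=siamese_config['MAX_SEQUENCE_LENGTH'])
seq2 = pad_sequences(tokenizer.texts_to_sequences([q2]),
                     maxlen=siamese_config['MAX_SEQUENCE_LENGTH'])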
def get_doc2vec_vectors_train_valid_split(trainingData):
    # Split the dataset into training and validation sets.
    train_x, valid_x, train_y, valid_y = get_train_test_split_of_dataframe(trainingData, False)

    # Label-encode the target variable (fit on train, reuse the same encoder
    # for validation).
    encoder = preprocessing.LabelEncoder()
    train_y = encoder.fit_transform(train_y)
    valid_y = encoder.transform(valid_y)

    sentences1 = list(train_x['Q1'])
    sentences2 = list(train_x['Q2'])
    is_similar = list(train_y)
    sentences1_validate = list(valid_x['Q1'])
    sentences2_validate = list(valid_x['Q2'])
    is_similar_validate = list(valid_y)

    tokenizer, embedding_matrix = word_embed_meta_data(
        sentences1 + sentences2, siamese_config['EMBEDDING_DIM'])
    embedding_meta_data = {
        'tokenizer': tokenizer,
        'embedding_matrix': embedding_matrix
    }

    ## creating sentence pairs
    sentences_pairs = [(x1, x2) for x1, x2 in zip(sentences1, sentences2)]
    del sentences1
    del sentences2
    sentences_pairs_validate = [
        (x1, x2) for x1, x2 in zip(sentences1_validate, sentences2_validate)
    ]
    del sentences1_validate
    del sentences2_validate

    # Pre-tokenise and pad the validation pairs for later evaluation.
    test_data_x1, test_data_x2, leaks_test = create_test_data(
        tokenizer, sentences_pairs_validate, siamese_config['MAX_SEQUENCE_LENGTH'])
    test_data_x = [test_data_x1, test_data_x2, leaks_test]

    return sentences_pairs, is_similar, test_data_x, is_similar_validate, embedding_meta_data
import pandas as pd

df = pd.read_csv('intelligent_component/sample_data.csv')
sentences1 = list(df['sentences1'])
sentences2 = list(df['sentences2'])
is_similar = list(df['is_similar'])
del df

####################################
######## Word Embedding ############
####################################

# creating word embedding meta data for word embedding
tokenizer, embedding_matrix = word_embed_meta_data(
    sentences1 + sentences2, siamese_config['EMBEDDING_DIM'])

embedding_meta_data = {
    'tokenizer': tokenizer,
    'embedding_matrix': embedding_matrix
}

## creating sentence pairs
sentences_pair = [(x1, x2) for x1, x2 in zip(sentences1, sentences2)]
del sentences1
del sentences2

##########################
######## Training ########
##########################
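# Hedged training sketch (the original code under this banner is not shown):
# one plausible continuation that reuses create_train_dev_set and a compiled
# two-input Keras model, mirroring the training routine that appears later in
# this section. `model` and the epoch/batch settings are assumptions.
train_data_x1, train_data_x2, train_labels, leaks_train, \
    val_data_x1, val_data_x2, val_labels, leaks_val = create_train_dev_set(
        tokenizer, sentences_pair, is_similar,
        siamese_config['MAX_SEQUENCE_LENGTH'], siamese_config['VALIDATION_SPLIT'])

model.fit([train_data_x1, train_data_x2], train_labels,
          validation_data=([val_data_x1, val_data_x2], val_labels),
          epochs=200, batch_size=64, shuffle=True)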
def Execute_Model(self):
    # Hyper-parameters for the Siamese bidirectional-LSTM duplicate-question model.
    EMBEDDING_DIM = 50
    MAX_SEQUENCE_LENGTH = 10
    RATE_DROP_LSTM = 0.17
    RATE_DROP_DENSE = 0.25
    NUMBER_LSTM = 50
    NUMBER_DENSE_UNITS = 50
    ACTIVATION_FUNCTION = 'relu'
    VALIDATION_SPLIT = 0.1

    sentences1 = list(self.df['question1'].astype(str))
    sentences2 = list(self.df['question2'].astype(str))
    is_similar = list(self.df['is_duplicate'])

    tokenizer, embedding_matrix = word_embed_meta_data(
        sentences1 + sentences2, EMBEDDING_DIM)
    embedding_meta_data = {
        'tokenizer': tokenizer,
        'embedding_matrix': embedding_matrix
    }
    sentences_pair = [(x1, x2) for x1, x2 in zip(sentences1, sentences2)]

    # Shared embedding layer, frozen to the pre-trained matrix (its dimension
    # must match the EMBEDDING_DIM used to build the matrix above).
    nb_words = len(tokenizer.word_index) + 1
    embedding_layer = layers.Embedding(nb_words,
                                       EMBEDDING_DIM,
                                       weights=[embedding_matrix],
                                       input_length=MAX_SEQUENCE_LENGTH,
                                       trainable=False)
    # One bidirectional LSTM shared by both branches of the Siamese network.
    lstm_layer = layers.Bidirectional(
        layers.LSTM(NUMBER_LSTM, dropout=RATE_DROP_LSTM,
                    recurrent_dropout=RATE_DROP_LSTM))

    sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    left_output = lstm_layer(embedded_sequences_1)

    sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences_2 = embedding_layer(sequence_2_input)
    right_output = lstm_layer(embedded_sequences_2)

    # Merge the two branches and classify the pair as duplicate / not duplicate.
    merged = layers.concatenate([left_output, right_output], axis=-1)
    merged = BatchNormalization()(merged)
    merged = layers.Dropout(0.1)(merged)
    merged = layers.Dense(128, activation='relu')(merged)
    merged = BatchNormalization()(merged)
    merged = layers.Dropout(0.1)(merged)
    predictions = layers.Dense(1, activation='sigmoid')(merged)

    model = Model([sequence_1_input, sequence_2_input], predictions)
    model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])
    model.summary()

    train_data_x1, train_data_x2, train_labels, leaks_train, \
        val_data_x1, val_data_x2, val_labels, leaks_val = create_train_dev_set(
            tokenizer, sentences_pair, is_similar, MAX_SEQUENCE_LENGTH, VALIDATION_SPLIT)

    callbacks = [
        keras.callbacks.TensorBoard(
            log_dir=r'E:\workdirectory\Code Name Val Halen\DS Sup\DL\Chapter 15\logs',
            histogram_freq=1)
    ]
    self.history = model.fit([train_data_x1, train_data_x2], train_labels,
                             validation_data=([val_data_x1, val_data_x2], val_labels),
                             epochs=200, batch_size=64, shuffle=True,
                             callbacks=callbacks)
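    # Hedged continuation sketch (not part of the original method): score the
    # held-out split produced by create_train_dev_set above and report a simple
    # thresholded accuracy. Everything here reuses names defined in this method.
    val_preds = model.predict([val_data_x1, val_data_x2]).ravel()
    correct = sum(int(p > 0.5) == int(y) for p, y in zip(val_preds, val_labels))
    print('validation accuracy:', correct / len(val_labels))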
from operator import itemgetter

import pandas as pd

# 3 negative examples
#model = load_model('./checkpoints/1560122132/lstm_50_50_0.10_0.25.h5')
# model without arguments
#model = load_model('./checkpoints/1559659364/lstm_50_50_0.17_0.25.h5')

df = pd.read_csv('../data/final_hate_dataset_test.csv', sep='\t')
hs_sentences = list(df['HS'])
cn_sentences = list(df['CN'])
cntype = list(df['CNtype'])
# remove the cn type
# cntype = ['none' for x in cntype]

top_n = 3
tokenizer, embedding_matrix = word_embed_meta_data(
    hs_sentences + cn_sentences, siamese_config['EMBEDDING_DIM'])

guess = 0
for i in range(len(hs_sentences)):
    resp = cn_sentences[i]
    hs = hs_sentences[i]
    # Pair the current hate-speech sentence with every candidate counter-narrative.
    all_pairs = [(hs, cn) for cn in cn_sentences]
    test_data_x1, test_data_x2, leaks_test, cntypes = create_test_data(
        tokenizer, all_pairs, cntype, siamese_config['MAX_SEQUENCE_LENGTH'])
    preds = list(model.predict([test_data_x1, test_data_x2, leaks_test, cntypes],
                               verbose=1).ravel())
    results = [(x, y, z) for (x, y), z in zip(all_pairs, preds)]
    results.sort(key=itemgetter(2), reverse=True)
    top = results[:top_n]
    responses = [x[1] for x in top]
    if resp in responses:
        # Count a hit when the gold counter-narrative is ranked in the top-n.
        guess += 1
        print('Guess')
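# Hedged follow-up (not in the original snippet): turn the hit counter from the
# loop above into a top-n retrieval accuracy over the whole test set.
print('top-%d accuracy: %.3f' % (top_n, guess / len(hs_sentences)))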
CONFIG.number_lstm_units = siamese_config['NUMBER_LSTM']
CONFIG.rate_drop_lstm = siamese_config['RATE_DROP_LSTM']
CONFIG.number_dense_units = siamese_config['NUMBER_DENSE_UNITS']
CONFIG.activation_function = siamese_config['ACTIVATION_FUNCTION']
CONFIG.rate_drop_dense = siamese_config['RATE_DROP_DENSE']
CONFIG.validation_split_ratio = siamese_config['VALIDATION_SPLIT']
CONFIG.max_nb_words = siamese_config['MAX_NB_WORDS']
CONFIG.filter_sizes = siamese_config['FILTER_SIZES']
CONFIG.num_filters = siamese_config['NUM_FILTERS']

####################################
######## Word Embedding ############
####################################

# creating word embedding meta data for word embedding
tokenizer, embedding_matrix, embedding_index = word_embed_meta_data(
    question["words"], CONFIG.embedding_dim, CONFIG.max_nb_words, 'data/word_embed.txt')

embedding_meta_data = {
    'tokenizer': tokenizer,
    'embedding_matrix': embedding_matrix,
    'embedding_index': embedding_index
}

## creating sentence pairs
sentences_pair = [(x1, x2) for x1, x2 in zip(sentences1, sentences2)]
char_pair = [(x1, x2) for x1, x2 in zip(char1, char2)]
del sentences1
del sentences2

## get tfidf dictionary
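# Hedged sketch of the "tfidf dictionary" step hinted at above (the original
# code is not shown): one common approach maps every vocabulary word to its IDF
# weight with scikit-learn. TfidfVectorizer, the corpus construction, and the
# assumption that sentences_pair holds plain text strings are all illustrative.
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [q1 + ' ' + q2 for q1, q2 in sentences_pair]
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
# word -> idf weight dictionary (use get_feature_names() on scikit-learn < 1.0)
tfidf_dict = dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_))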