def load_data(self):
    print(config.get_current_time("load raw data"))
    train = pd.read_csv(config.TRAIN_DIR)
    test = pd.read_csv(config.TEST_DIR)

    ## split into train and validation sets
    train_data, val_data = train_test_split(train, test_size=0.08,
                                            random_state=2018)
    print("Train data: {}, Valid data: {}, Test data: {}.".format(
        train.shape, val_data.shape, test.shape))

    ## fill in the missing values
    train_X = train_data["question_text"].fillna("_##_").values
    val_X = val_data["question_text"].fillna("_##_").values
    test_X = test["question_text"].fillna("_##_").values

    ## tokenize the sentences
    tokenizer = Tokenizer(num_words=self.MAX_NB_WORDS)
    tokenizer.fit_on_texts(list(train_X))
    self.word_index = tokenizer.word_index
    train_X = tokenizer.texts_to_sequences(train_X)
    val_X = tokenizer.texts_to_sequences(val_X)
    test_X = tokenizer.texts_to_sequences(test_X)

    ## pad the sentences
    train_X = pad_sequences(train_X, maxlen=self.data_len)
    val_X = pad_sequences(val_X, maxlen=self.data_len)
    test_X = pad_sequences(test_X, maxlen=self.data_len)

    ## get the target values
    train_y = train_data['target'].values
    val_y = val_data['target'].values

    # shuffle the data
    np.random.seed(2018)
    trn_idx = np.random.permutation(len(train_X))
    val_idx = np.random.permutation(len(val_X))
    train_X = train_X[trn_idx]
    val_X = val_X[val_idx]
    train_y = train_y[trn_idx]
    val_y = val_y[val_idx]

    train_y = to_categorical(train_y, num_classes=2)
    val_y = to_categorical(val_y, num_classes=2)

    print(config.get_current_time("return data"))
    return train_X, train_y, val_X, val_y, test_X
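# A minimal, self-contained sketch (illustrative names and data, not the
# repo's files) of the tokenize -> pad -> one-hot pipeline that load_data()
# applies above:
#
#   from keras.preprocessing.text import Tokenizer
#   from keras.preprocessing.sequence import pad_sequences
#   from keras.utils import to_categorical
#   import numpy as np
#
#   texts = ["is this question sincere", "is this question not sincere"]
#   labels = np.array([0, 1])
#   tokenizer = Tokenizer(num_words=1000)
#   tokenizer.fit_on_texts(texts)
#   seqs = tokenizer.texts_to_sequences(texts)  # lists of word indices
#   X = pad_sequences(seqs, maxlen=10)          # left-pad/truncate to length 10
#   y = to_categorical(labels, num_classes=2)   # one-hot targets
#   print(X.shape, y.shape)                     # (2, 10) (2, 2)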
def log_time_helper(mode, is_starting=True):
    w = "starting" if is_starting else "ending"
    current_time = get_current_time()
    my_logger.info("[+] Mode = {}; {} at {}".format(mode, w, current_time[0]))
    return current_time
def load_topic_info(self):
    '''
    just get the topic ids
    :return:
    '''
    print(config.get_current_time(), "loading topic info")
    with open(config.TOPIC_INFO_DIR, 'r') as f:
        for index, line in enumerate(f.readlines()):
            topic_id = line.strip('\n').split('\t')[0]
            self.topic_dict[topic_id] = index
            self.topic_dict_inv[index] = topic_id
def load_wiki_news_em_matrix(self):
    '''
    Build the wiki-news embedding matrix; words missing from the embedding
    file get random vectors drawn from the observed value range.
    :return:
    '''
    print(config.get_current_time("load_wiki_news_em_matrix"))
    embeddings_index = dict()
    embedding_max_value = 0
    embedding_min_value = 1
    with open(config.WIKI_NEWS_DIR, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip().split(' ')
            if len(line) != 301:  # skip header / malformed rows (token + 300 dims)
                continue
            coefs = np.asarray(line[1:], dtype='float32')
            embedding_max_value = max(embedding_max_value, np.max(coefs))
            embedding_min_value = min(embedding_min_value, np.min(coefs))
            embeddings_index[line[0]] = coefs
    print(config.get_current_time(
        'Found %s word vectors.' % len(embeddings_index)))

    self.WIKI_NEWS_EM = np.zeros((len(self.word_index) + 1, self.EMBEDDING_DIM))
    for word, i in self.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            self.WIKI_NEWS_EM[i] = embedding_vector
        else:
            # words not found in the embedding file get random vectors
            # within the observed value range
            self.WIKI_NEWS_EM[i] = np.random.uniform(
                low=embedding_min_value,
                high=embedding_max_value,
                size=self.EMBEDDING_DIM)
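# Self-contained sketch (toy data only) of the fill pattern used above:
# known words copy their pre-trained vector, OOV words draw uniform noise
# from the observed value range:
#
#   import numpy as np
#   EMBEDDING_DIM = 300
#   word_index = {"hello": 1, "world": 2}                        # toy index
#   embeddings_index = {"hello": np.ones(EMBEDDING_DIM, "float32")}
#   lo, hi = -0.05, 0.05                                         # observed range
#   matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
#   for word, i in word_index.items():
#       vec = embeddings_index.get(word)
#       matrix[i] = vec if vec is not None else np.random.uniform(lo, hi, EMBEDDING_DIM)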
def get_quesids(self):
    '''
    Load the evaluation-set question ids.
    :return:
    '''
    question_ids = []
    print(config.get_current_time(), 'loading question eval ids')
    with open(config.QUESTION_EVAL_SET_DIR, 'r') as f:
        for line in f.readlines():
            splitted = line.strip('\n').split('\t')
            question_ids.append(splitted[0])
    self.load_topic_info()
    return question_ids
def load_google_news_em_matrix(self):
    print(config.get_current_time("load_google_news_em_matrix"))
    self.GOOGLE_NEWS_EM = np.zeros((len(self.word_index) + 1, self.EMBEDDING_DIM))
    model = gensim.models.KeyedVectors.load_word2vec_format(
        config.GOOGLE_NEWS_DIR, binary=True)
    for word, i in self.word_index.items():
        try:
            embedding_vector = model[word]
        except KeyError:  # word not in the pre-trained vocabulary
            embedding_vector = None
        if embedding_vector is not None:
            self.GOOGLE_NEWS_EM[i] = embedding_vector
        else:
            # fall back to uniform noise within the (hard-coded) value
            # range of the GoogleNews vectors
            self.GOOGLE_NEWS_EM[i] = np.random.uniform(
                low=-0.0018054, high=0.047287, size=self.EMBEDDING_DIM)
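# Hedged sketch of the gensim lookup above; with KeyedVectors a membership
# test avoids the try/except entirely (the .bin path is a placeholder):
#
#   import gensim
#   model = gensim.models.KeyedVectors.load_word2vec_format(
#       "GoogleNews-vectors-negative300.bin", binary=True)
#   vec = model["news"] if "news" in model else None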
def load_charembedding_matrix(self):
    embeddings_index = dict()
    embedding_max_value = 0
    embedding_min_value = 1
    with open(config.CHAR_EMBEDDING_DIR, 'r') as f:
        for line in f:
            line = line.strip().split(' ')
            if len(line) != 257:  # skip malformed rows (char + 256 dims)
                continue
            coefs = np.asarray(line[1:], dtype='float32')
            embedding_max_value = max(embedding_max_value, np.max(coefs))
            embedding_min_value = min(embedding_min_value, np.min(coefs))
            embeddings_index[line[0]] = coefs
    print(config.get_current_time(),
          'Found %s char vectors.' % len(embeddings_index))

    self.embedchar_matrix = np.zeros((len(self.char_index) + 1, 256))
    for char, i in self.char_index.items():
        embedding_vector = embeddings_index.get(char)
        if embedding_vector is not None:
            self.embedchar_matrix[i] = embedding_vector
        else:
            # chars not found in the embedding file get random vectors
            # within the observed value range
            self.embedchar_matrix[i] = np.random.uniform(
                low=embedding_min_value,
                high=embedding_max_value,
                size=256)
def load_train_data(self):
    '''
    title_char + title_word + dsp_char + dsp_word
    :return: titlechar_array, titleword_array, dspchar_array, dspword_array,
             label_sparse
    '''
    title_char_list = []
    title_word_list = []
    dsp_char_list = []
    dsp_word_list = []
    question_ids = []

    print(config.get_current_time(), 'loading question train set file')
    with open(config.QUESTION_TRAIN_SET_DIR, 'r') as f:
        for index, line in enumerate(f.readlines()):
            if index > 500:
                break
            splitted = line.strip('\n').split('\t')
            if len(splitted) == 5:
                title_char_list.append(splitted[1].replace(',', ' '))
                title_word_list.append(splitted[2].replace(',', ' '))
                dsp_char_list.append(splitted[3].replace(',', ' '))
                dsp_word_list.append(splitted[4].replace(',', ' '))
                self.max_titlechar_len = max(len(splitted[1].split(',')),
                                             self.max_titlechar_len)
                self.max_titleword_len = max(len(splitted[2].split(',')),
                                             self.max_titleword_len)
                self.max_dspchar_len = max(len(splitted[3].split(',')),
                                           self.max_dspchar_len)
                self.max_dspword_len = max(len(splitted[4].split(',')),
                                           self.max_dspword_len)
                question_ids.append(splitted[0])
            else:
                # lines without all five fields are skipped
                continue

    pickle.dump(self.tw_len, open(self.savedir + '/tw_len.pkl', 'wb'))
    pickle.dump(self.tc_len, open(self.savedir + '/tc_len.pkl', 'wb'))
    pickle.dump(self.dsppad_length,
                open(self.savedir + '/dsp_pad_length.pkl', 'wb'))

    # ------ title word --------
    print(config.get_current_time(), 'tokenizer title word working')
    tokenizer_word = Tokenizer(num_words=self.MAX_NB_WORDS)
    tokenizer_word.fit_on_texts(title_word_list + dsp_word_list)
    sequences_titleword = tokenizer_word.texts_to_sequences(title_word_list)
    self.word_index = tokenizer_word.word_index
    print(config.get_current_time(),
          'Found %s unique word tokens.' % len(self.word_index))
    titleword_array = pad_sequences(sequences_titleword, maxlen=self.tw_len)
    pickle.dump(tokenizer_word,
                open(self.savedir + '/tokenizer_word.pkl', 'wb'))
    print('tokenizer is saved as %s/tokenizer_word.pkl' % self.savedir)

    # ------ title char --------
    print(config.get_current_time(), 'tokenizer title char working')
    tokenizer_char = Tokenizer(num_words=self.MAX_NB_WORDS)
    tokenizer_char.fit_on_texts(title_char_list + dsp_char_list)
    sequences_titlechar = tokenizer_char.texts_to_sequences(title_char_list)
    self.char_index = tokenizer_char.word_index
    print(config.get_current_time(),
          'Found %s unique char tokens.' % len(self.char_index))
    titlechar_array = pad_sequences(sequences_titlechar, maxlen=self.tc_len)
    pickle.dump(tokenizer_char,
                open(self.savedir + '/tokenizer_char.pkl', 'wb'))
    print('tokenizer is saved as %s/tokenizer_char.pkl' % self.savedir)

    # ------ dsp char --------
    print(config.get_current_time(), 'tokenizer dsp char working')
    sequences_dspchar = tokenizer_char.texts_to_sequences(dsp_char_list)
    dspchar_array = pad_sequences(sequences_dspchar, maxlen=self.dsppad_length)

    # ------ dsp word --------
    print(config.get_current_time(), 'tokenizer dsp word working')
    sequences_dspword = tokenizer_word.texts_to_sequences(dsp_word_list)
    dspword_array = pad_sequences(sequences_dspword, maxlen=self.dsppad_length)

    self.load_topic_info()

    question_to_label = {}
    print(config.get_current_time(), 'loading train labels')
    with open(config.QUESTION_TOPIC_TRAIN_DIR, 'r') as f:
        for line in f.readlines():
            splitted = line.strip('\n').split('\t')
            if len(splitted) != 2:
                print('error: malformed label line!')
            question_to_label[splitted[0]] = [
                self.topic_dict[i] for i in splitted[1].split(',')
            ]

    print(config.get_current_time(), 'aligning train data and labels')
    row_ = []
    col_ = []
    count_1 = 0
    for row, quesid in enumerate(question_ids):
        cols = question_to_label.get(quesid)
        if cols is None:
            print('error: question id %s has no labels!' % quesid)
        count_1 += len(cols)
        for k in cols:
            row_.append(row)
        col_.extend(cols)
    data_ = [1 for i in row_]
    label_sparse = csr_matrix((data_, (row_, col_)),
                              shape=(len(question_ids), 1999))

    return titlechar_array, titleword_array, dspchar_array, dspword_array, label_sparse
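# Self-contained sketch of the sparse multi-label target built above: one row
# per question, a 1 in every topic column that question carries (toy ids,
# same 1999-topic shape):
#
#   from scipy.sparse import csr_matrix
#   question_to_label = {"q1": [0, 2], "q2": [1]}   # toy mapping
#   question_ids = ["q1", "q2"]
#   rows, cols = [], []
#   for r, qid in enumerate(question_ids):
#       for c in question_to_label[qid]:
#           rows.append(r)
#           cols.append(c)
#   y = csr_matrix(([1] * len(rows), (rows, cols)),
#                  shape=(len(question_ids), 1999))
#   print(y.toarray())   # row q1 -> 1s in columns 0 and 2, row q2 -> column 1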
def load_pred_data_4part(self):
    '''
    Load the evaluation set in the same four-part layout
    (title_char, title_word, dsp_char, dsp_word).
    :return:
    '''
    title_char_list = []
    title_word_list = []
    dsp_char_list = []
    dsp_word_list = []
    question_ids = []

    self.tw_len = pickle.load(open(self.savedir + '/tw_len.pkl', 'rb'))
    self.tc_len = pickle.load(open(self.savedir + '/tc_len.pkl', 'rb'))
    self.dsppad_length = pickle.load(
        open(self.savedir + '/dsp_pad_length.pkl', 'rb'))
    print('length is loaded!')

    print(config.get_current_time(), 'loading question eval set file')
    with open(config.QUESTION_EVAL_SET_DIR, 'r') as f:
        for line in f.readlines():
            splitted = line.strip('\n').split('\t')
            if len(splitted) == 1:
                print('error: eval line has no text fields!')
                exit()
            elif len(splitted) == 2:
                title_char_list.append(splitted[1].replace(',', ' '))
                title_word_list.append(" ")
                dsp_char_list.append(" ")
                dsp_word_list.append(" ")
            elif len(splitted) == 3:
                title_char_list.append(splitted[1].replace(',', ' '))
                title_word_list.append(splitted[2].replace(',', ' '))
                dsp_char_list.append(" ")
                dsp_word_list.append(" ")
            elif len(splitted) == 4:
                title_char_list.append(splitted[1].replace(',', ' '))
                title_word_list.append(splitted[2].replace(',', ' '))
                dsp_char_list.append(splitted[3].replace(',', ' '))
                dsp_word_list.append(" ")
            elif len(splitted) == 5:
                title_char_list.append(splitted[1].replace(',', ' '))
                title_word_list.append(splitted[2].replace(',', ' '))
                dsp_char_list.append(splitted[3].replace(',', ' '))
                dsp_word_list.append(splitted[4].replace(',', ' '))
            question_ids.append(splitted[0])

    tokenizer_word = pickle.load(
        open(self.savedir + '/tokenizer_word.pkl', 'rb'))
    tokenizer_char = pickle.load(
        open(self.savedir + '/tokenizer_char.pkl', 'rb'))
    print('tokenizers loaded!')

    print(config.get_current_time(), 'tokenizer working title char')
    titlechar_sequences_char = tokenizer_char.texts_to_sequences(title_char_list)
    self.char_index = tokenizer_char.word_index
    titlechar_array = pad_sequences(titlechar_sequences_char, maxlen=self.tc_len)

    print(config.get_current_time(), 'tokenizer working title word')
    titleword_sequences_word = tokenizer_word.texts_to_sequences(title_word_list)
    self.word_index = tokenizer_word.word_index
    titleword_array = pad_sequences(titleword_sequences_word, maxlen=self.tw_len)

    print(config.get_current_time(), 'tokenizer working dsp char')
    dspchar_sequences_char = tokenizer_char.texts_to_sequences(dsp_char_list)
    dspchar_array = pad_sequences(dspchar_sequences_char, maxlen=self.dsppad_length)

    print(config.get_current_time(), 'tokenizer working dsp word')
    dspword_sequences_word = tokenizer_word.texts_to_sequences(dsp_word_list)
    dspword_array = pad_sequences(dspword_sequences_word, maxlen=self.dsppad_length)

    self.load_topic_info()
    return titlechar_array, titleword_array, dspchar_array, dspword_array, question_ids
with open("final_423.csv", 'w') as f: for i in range(predlabels.shape[0]): # f.write(ques_ids[i] + "," + ','.join([topic_dict_inv[k] for k in predlabels[i]]) + '\n') f.write(ques_ids[i] + "," + ','.join(tmpfunc(predlabels[i])) + '\n') if __name__ == '__main__': if len(sys.argv) < 2: print('error, give me mode ') exit() mode = sys.argv[1] print(config.get_current_time(), 'current mode:', mode) if mode == "train": save_root_dir = './model_exp' #your own path, to save models,tokenizers... dl = data_loader(save_root_dir) datatuple = dl.load_train_data() dl.load_charembedding_matrix() dl.load_wordembedding_matrix() mymodel = MultiModel(w_embed_matrix=dl.embedword_matrix, c_embed_matrix=dl.embedchar_matrix, word_index=dl.word_index, char_index=dl.char_index, titlechar_length=dl.tc_len,
def bulid_model(self):
    '''
    Build four RCNN-style branches (title/description, two pre-trained
    embeddings each), merge them with attention, and compile the model.
    :return:
    '''
    print(config.get_current_time("building model ------"))

    # ----------- title local w2v ----------
    with tf.device('/gpu:%d' % 0):
        tl_embedding_layer = Embedding(
            len(self.word_index) + 1,
            self.EMBEDDING_DIM,
            weights=[self.GLOVE_EM],
            input_length=self.data_len,
            trainable=True,
            embeddings_initializer=initializers.RandomUniform(
                minval=-0.2, maxval=0.2, seed=None))
        tl_sequence_input = Input(shape=(self.data_len,),
                                  name="title_local_w2v_input")
        tl_embedded_sequences = tl_embedding_layer(tl_sequence_input)
    with tf.device('/gpu:%d' % 0):
        tl_z_pos = LSTM(512, implementation=2, return_sequences=True,
                        go_backwards=False)(tl_embedded_sequences)
        tl_z_neg = LSTM(512, implementation=2, return_sequences=True,
                        go_backwards=True)(tl_embedded_sequences)
        tl_z_concat = concatenate([tl_z_pos, tl_embedded_sequences, tl_z_neg],
                                  axis=-1)
        tl_z = Dense(512, activation='tanh')(tl_z_concat)
        tl_pool_rnn = Lambda(lambda x: K.max(x, axis=1),
                             output_shape=(512,))(tl_z)

    # ---------- title ai w2v ----------
    with tf.device('/gpu:%d' % 0):
        ta_embedding_layer = Embedding(
            len(self.word_index) + 1,
            self.EMBEDDING_DIM,
            weights=[self.GOOGLE_NEWS_EM],
            input_length=self.data_len,
            trainable=True,
            embeddings_initializer=initializers.RandomUniform(
                minval=-0.2, maxval=0.2, seed=None))
        ta_sequence_input = Input(shape=(self.data_len,),
                                  name="title_ai_w2v_input")
        ta_embedded_sequences = ta_embedding_layer(ta_sequence_input)
    with tf.device('/gpu:%d' % 0):
        ta_z_pos = LSTM(512, implementation=2, return_sequences=True,
                        go_backwards=False)(ta_embedded_sequences)
        ta_z_neg = LSTM(512, implementation=2, return_sequences=True,
                        go_backwards=True)(ta_embedded_sequences)
        ta_z_concat = concatenate([ta_z_pos, ta_embedded_sequences, ta_z_neg],
                                  axis=-1)
        ta_z = Dense(512, activation='tanh')(ta_z_concat)
        ta_pool_rnn = Lambda(lambda x: K.max(x, axis=1),
                             output_shape=(512,))(ta_z)

    # ----------- des local w2v ----------
    with tf.device('/gpu:%d' % 0):
        dl_embedding_layer = Embedding(
            len(self.word_index) + 1,
            self.EMBEDDING_DIM,
            weights=[self.PARAGRAM_EM],
            input_length=self.data_len,
            trainable=True,
            embeddings_initializer=initializers.RandomUniform(
                minval=-0.2, maxval=0.2, seed=None))
        dl_sequence_input = Input(shape=(self.data_len,),
                                  name="des_local_w2v_input")
        dl_embedded_sequences = dl_embedding_layer(dl_sequence_input)
    with tf.device('/gpu:%d' % 0):
        dl_z_pos = LSTM(512, implementation=2, return_sequences=True,
                        go_backwards=False)(dl_embedded_sequences)
        dl_z_neg = LSTM(512, implementation=2, return_sequences=True,
                        go_backwards=True)(dl_embedded_sequences)
        dl_z_concat = concatenate([dl_z_pos, dl_embedded_sequences, dl_z_neg],
                                  axis=-1)
        dl_z = Dense(512, activation='tanh')(dl_z_concat)
        dl_pool_rnn = Lambda(lambda x: K.max(x, axis=1),
                             output_shape=(512,))(dl_z)

    # ---------- des ai w2v ----------
    with tf.device('/gpu:%d' % 0):
        da_embedding_layer = Embedding(
            len(self.word_index) + 1,
            self.EMBEDDING_DIM,
            weights=[self.WIKI_NEWS_EM],
            input_length=self.data_len,
            trainable=True,
            embeddings_initializer=initializers.RandomUniform(
                minval=-0.2, maxval=0.2, seed=None))
        da_sequence_input = Input(shape=(self.data_len,),
                                  name="des_ai_w2v_input")
        da_embedded_sequences = da_embedding_layer(da_sequence_input)
    with tf.device('/gpu:%d' % 0):
        da_z_pos = LSTM(512, implementation=2, return_sequences=True,
                        go_backwards=False)(da_embedded_sequences)
        da_z_neg = LSTM(512, implementation=2, return_sequences=True,
                        go_backwards=True)(da_embedded_sequences)
        da_z_concat = concatenate([da_z_pos, da_embedded_sequences, da_z_neg],
                                  axis=-1)
        da_z = Dense(512, activation='tanh')(da_z_concat)
        da_pool_rnn = Lambda(lambda x: K.max(x, axis=1),
                             output_shape=(512,))(da_z)

    # ---------- attention over the four pooled branches ----------
    concat_t_d = concatenate([tl_pool_rnn, ta_pool_rnn,
                              dl_pool_rnn, da_pool_rnn], axis=-1)
    concat_t_d = Reshape((2, 512 * 2))(concat_t_d)  # 4 x 512 -> 2 x 1024
    attention = Dense(1, activation='tanh')(concat_t_d)
    attention = Flatten()(attention)
    attention = Activation('softmax')(attention)
    attention = RepeatVector(512 * 2)(attention)
    attention = Permute([2, 1])(attention)
    sent_representation = multiply([concat_t_d, attention])
    sent_representation = Lambda(lambda xin: K.sum(xin, axis=-2),
                                 output_shape=(512 * 2,))(sent_representation)

    # ---------- merge the 4 branches into the classifier ----------
    model_final_ = Dense(2, activation='relu')(sent_representation)
    model_final_ = Dropout(0.5)(model_final_)
    model_final = Dense(2, activation='softmax')(model_final_)

    self.model = Model(inputs=[tl_sequence_input, ta_sequence_input,
                               dl_sequence_input, da_sequence_input],
                       outputs=model_final)
    adam = optimizers.Adam(lr=0.00001)
    self.model.compile(loss='categorical_crossentropy',
                       optimizer=adam,
                       metrics=[f1])
    print(self.model.summary())
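# Minimal sketch (illustrative sizes, not the repo's hyperparameters) of one
# RCNN-style branch from bulid_model(): forward and backward LSTM outputs are
# concatenated with the embeddings, projected with tanh, then max-pooled over
# time. Note that a plain LSTM with go_backwards=True returns its outputs in
# reversed time order; the Bidirectional wrapper would re-reverse them before
# the concat.
#
#   from keras.layers import Input, Embedding, LSTM, Dense, Lambda, concatenate
#   from keras.models import Model
#   import keras.backend as K
#
#   inp = Input(shape=(30,))
#   emb = Embedding(5000, 300)(inp)
#   fwd = LSTM(64, return_sequences=True)(emb)
#   bwd = LSTM(64, return_sequences=True, go_backwards=True)(emb)
#   z = Dense(64, activation='tanh')(concatenate([fwd, emb, bwd], axis=-1))
#   pooled = Lambda(lambda x: K.max(x, axis=1), output_shape=(64,))(z)
#   model = Model(inputs=inp, outputs=Dense(2, activation='softmax')(pooled))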