def train():
    space = "words"

    train_df = DataSet.load_train()
    xtr_df, xval_df = train_test_split(train_df, test_size=0.25)
    test_df = DataSet.load_test()

    ### Generate data generators
    train_dg = DataGenerator(data_df=xtr_df, space=space, bucket_num=5, batch_size=5000,
                             is_prefix_pad=True, is_shuffle=True, is_test=False)
    val_dg = DataGenerator(data_df=xval_df, space=space, bucket_num=5, batch_size=5000,
                           is_prefix_pad=True, is_shuffle=False, is_test=False)
    test_dg = DataGenerator(data_df=test_df, space=space, bucket_num=5, batch_size=5000,
                            is_prefix_pad=True, is_shuffle=False, is_test=True)

    ### Must call prepare before using
    train_dg.prepare()
    val_dg.prepare()
    test_dg.prepare()

    ### Load word embeddings; train_dg, val_dg or test_dg can all be used
    item_embed = train_dg.get_item_embed_tensor(space)

    ### Initialize network
    siamese_cnn = Siamese_CNN(item_embed, is_freeze=True)

    ### Initialize model using network
    siamese_model = Model(siamese_cnn)
    criteria = nn.BCEWithLogitsLoss()
    optimizer_ft = optim.Adam(filter(lambda p: p.requires_grad, siamese_cnn.parameters()), lr=8e-4)
    exp_lr_scheduler = lr_scheduler.ExponentialLR(optimizer_ft, gamma=0.9)

    ### Train
    siamese_model.train(train_dg, val_dg, criteria, optimizer_ft, exp_lr_scheduler, 25)

    preds = siamese_model.predict(test_dg).numpy()
    preds = pd.DataFrame({"y_pre": preds})
    preds.to_csv("submission.csv", index=False)
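### Because training uses nn.BCEWithLogitsLoss, the network emits raw logits; to turn them
### into the probabilities written to submission.csv, a sigmoid has to be applied somewhere
### (assumed here to happen inside Model.predict). A minimal sketch of that step only:
import torch

def logits_to_probs(logits):
    """Map raw logits (any real value) to probabilities in (0, 1)."""
    return torch.sigmoid(logits)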
def get_item_embeddings(self):
    term_embed = DataSet.load_term_embed(self.data_space)
    embed_size = term_embed.shape[1]
    pad_embed = np.array([0] * embed_size).reshape(1, -1)
    all_embeding = np.vstack([pad_embed, term_embed])
    all_index = [_PAD_] + term_embed.index.values.tolist()
    all_embeding_df = pd.DataFrame(data=all_embeding, index=all_index)
    sort_word = [self.idx2item[i] for i in range(len(self.idx2item))]
    return all_embeding_df.loc[sort_word].values
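### The (vocab_size, embed_size) matrix returned above is typically wrapped in a frozen
### nn.Embedding before being handed to the network. A minimal sketch, assuming the
### matrix is a float numpy array; the helper name `to_embedding_layer` is illustrative,
### not part of this repo:
import torch
import torch.nn as nn

def to_embedding_layer(embed_matrix, freeze=True):
    """Wrap a (vocab_size, embed_size) numpy matrix in a torch embedding layer."""
    weight = torch.from_numpy(embed_matrix).float()
    # from_pretrained copies the weights; freeze=True keeps them out of the optimizer
    return nn.Embedding.from_pretrained(weight, freeze=freeze)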
def cv_main():
    kf = KFold(n_splits=folder, shuffle=True, random_state=19920618)
    all_train_df = DataSet.load_train()
    test_df = DataSet.load_test()

    test_dg = DataGenerator(data_df=test_df, space=space, bucket_num=5, batch_size=256,
                            is_prefix_pad=False, is_shuffle=False, is_test=True)
    print("prepare test data generator")
    test_dg.prepare()
    item_embed = test_dg.get_item_embed_tensor(space)

    train_eval = np.zeros(len(all_train_df))
    test_eval = np.zeros((len(test_df), folder))

    for i, (train_index, val_index) in enumerate(kf.split(all_train_df)):
        print()
        train_name = version + "_cv_%s" % (i)
        xtr_df = all_train_df.iloc[train_index]
        xval_df = all_train_df.iloc[val_index]

        train_dg = DataGenerator(data_df=xtr_df, space=space, bucket_num=5, batch_size=batch_size,
                                 is_prefix_pad=False, is_shuffle=True, is_test=False)
        val_dg = DataGenerator(data_df=xval_df, space=space, bucket_num=5, batch_size=256,
                               is_prefix_pad=False, is_shuffle=False, is_test=False)
        print("prepare train data generator, cv_%s" % i)
        train_dg.prepare()
        print("prepare val data generator, cv_%s" % i)
        val_dg.prepare()

        siamese_lstm = Siamese_LSTM(pre_trained_embedding=item_embed,
                                    is_freeze=is_freeze,
                                    hidden_size=hidden_size,
                                    number_layers=num_layers,
                                    lstm_dropout_p=lstm_drop_p,
                                    bidirectional=bidirectional,
                                    linear_hid_size=linear_hidden_size,
                                    linear_hid_drop_p=linear_hid_drop_p,
                                    input_drop_p=lstm_input_drop_p)
        siamese_lstm.init_weights()  ##TODO Whether to initialize customised weights as in Keras

        siamese_model = Model(train_name, siamese_lstm)
        criteria = nn.BCEWithLogitsLoss()
        optimizer_ft = optim.Adam(filter(lambda p: p.requires_grad, siamese_lstm.parameters()), lr=LR)
        exp_lr_scheduler = lr_scheduler.ExponentialLR(optimizer_ft, gamma=Gamma)

        ### Train
        siamese_model.train(train_dg=train_dg, valid_dg=val_dg, criterion=criteria,
                            optimizer=optimizer_ft, scheduler=exp_lr_scheduler,
                            num_epochs=num_epochs, early_stop_rounds=early_stop)
        siamese_model.save_plot_()

        val_pred = siamese_model.predict(val_dg).numpy()
        train_eval[val_index] = val_pred
        test_preds = siamese_model.predict(test_dg).numpy()
        test_eval[:, i] = test_preds

    train_pred_df = pd.DataFrame({version + "_train_pred_cv": train_eval})
    train_pred_df.to_csv(version + "_train_pred_cv.csv", index=False)

    test_pred_df = pd.DataFrame(test_eval,
                                columns=[version + "_test_pred_cv_%s" % (i) for i in range(folder)])
    test_pred_df["y_pre"] = test_pred_df.mean(axis=1)
    test_pred_df.to_csv(version + "_test_pred_cv.csv", index=False)
    test_pred_df[["y_pre"]].to_csv(version + "_submission_cv.csv", index=False)
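### With out-of-fold predictions collected in train_eval, the CV score can be checked
### before submitting. A minimal sketch, assuming all_train_df carries a binary "label"
### column (as used by the data generator) and that predictions are probabilities:
from sklearn.metrics import log_loss

def report_oof_score(all_train_df, train_eval):
    """Print the out-of-fold log loss of the stacked CV predictions."""
    oof_loss = log_loss(all_train_df["label"].values, train_eval)
    print("out-of-fold log loss: %.5f" % oof_loss)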
def train_main():
    train_name = version + "_sm"
    ##--------------parameters-------------------##
    train_df = DataSet.load_train()
    xtr_df, xval_df = train_test_split(train_df, test_size=0.20)
    test_df = DataSet.load_test()

    ### Generate data generators
    train_dg = DataGenerator(data_df=xtr_df, space=space, bucket_num=5, batch_size=batch_size,
                             is_prefix_pad=False, is_shuffle=True, is_test=False)
    val_dg = DataGenerator(data_df=xval_df, space=space, bucket_num=5, batch_size=512,
                           is_prefix_pad=False, is_shuffle=False, is_test=False)
    test_dg = DataGenerator(data_df=test_df, space=space, bucket_num=5, batch_size=512,
                            is_prefix_pad=False, is_shuffle=False, is_test=True)

    ### Must call prepare before using
    train_dg.prepare()
    val_dg.prepare()
    test_dg.prepare()

    ### Load word embeddings; train_dg, val_dg or test_dg can all be used
    item_embed = train_dg.get_item_embed_tensor(space)

    ### Initialize network
    siamese_lstm = Siamese_LSTM(pre_trained_embedding=item_embed,
                                is_freeze=is_freeze,
                                hidden_size=hidden_size,
                                number_layers=num_layers,
                                lstm_dropout_p=lstm_drop_p,
                                bidirectional=bidirectional,
                                linear_hid_size=linear_hidden_size,
                                linear_hid_drop_p=linear_hid_drop_p,
                                input_drop_p=lstm_input_drop_p)
    siamese_lstm.init_weights()  ##TODO Whether to initialize customised weights as in Keras

    ### Initialize model using network
    siamese_model = Model(train_name, siamese_lstm)
    criteria = nn.BCEWithLogitsLoss()
    optimizer_ft = optim.Adam(filter(lambda p: p.requires_grad, siamese_lstm.parameters()), lr=LR)
    exp_lr_scheduler = lr_scheduler.ExponentialLR(optimizer_ft, gamma=Gamma)

    ### Train
    siamese_model.train(train_dg=train_dg, valid_dg=val_dg, criterion=criteria,
                        optimizer=optimizer_ft, scheduler=exp_lr_scheduler,
                        num_epochs=num_epochs, early_stop_rounds=early_stop)
    siamese_model.save_plot_()

    preds = siamese_model.predict(test_dg).numpy()
    preds = pd.DataFrame({"y_pre": preds})
    preds.to_csv(version + "_submission_sm.csv", index=False)
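### Model.train is driven by `early_stop_rounds`; its internals are not shown in this
### section. The sketch below only illustrates the usual pattern such a loop follows
### (stop when validation loss has not improved for `early_stop_rounds` epochs).
### All names inside are illustrative, not the repo's API.
def early_stopping_loop(num_epochs, early_stop_rounds, run_epoch, evaluate):
    """run_epoch() trains one epoch; evaluate() returns the validation loss."""
    best_loss = float("inf")
    rounds_without_improvement = 0
    for epoch in range(num_epochs):
        run_epoch()
        val_loss = evaluate()
        if val_loss < best_loss:
            best_loss = val_loss
            rounds_without_improvement = 0
        else:
            rounds_without_improvement += 1
            if rounds_without_improvement >= early_stop_rounds:
                print("early stopping at epoch %d, best val loss %.5f" % (epoch, best_loss))
                break
    return best_loss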
def _generate_inter_files(self):
    print("loading question_df...")
    question_df = DataSet.load_all_questions()
    corpus = question_df[self.data_space]

    print("generating item2idx...")
    sen_list = corpus.values.tolist()
    self.item2idx = {_PAD_: 0}
    for sen in sen_list:
        for word in sen.split():
            if word not in self.item2idx:
                self.item2idx[word] = len(self.item2idx)

    print("generating idx2item...")
    self.idx2item = {v: k for k, v in self.item2idx.items()}

    print("load %s data..." % (self.train_test))
    if self.train_test == "train":
        self.data_set = DataSet.load_train()
    else:
        self.data_set = DataSet.load_test()

    if self.data_space == "words":
        q1 = self.data_set["word_len_q1"]
        q2 = self.data_set["word_len_q2"]
    else:
        q1 = self.data_set["char_len_q1"]
        q2 = self.data_set["char_len_q2"]

    print("bucketing...")
    q_pair = list(zip(q1, q2))
    bucket = GreedyBucket()
    fit_res = bucket.fit(q_pair)
    self.buckets, self.bounds = bucket.get_split_results(fit_res, self.bucket_num)
    # print("len of self.bounds", len(self.bounds))

    print("generating id vectors...")
    data_set_id_vectors = []
    for ind in range(self.data_set.shape[0]):
        cur_row = self.data_set.iloc[ind]

        cur_q1 = cur_row["q1"]
        cur_q1_items = question_df.loc[cur_q1][self.data_space].split()
        cur_q1_inds = [self.item2idx[x] for x in cur_q1_items]

        cur_q2 = cur_row["q2"]
        cur_q2_items = question_df.loc[cur_q2][self.data_space].split()
        cur_q2_inds = [self.item2idx[x] for x in cur_q2_items]

        cur_bound = self.bounds[ind]
        q1_pad_len = cur_bound - len(cur_q1_inds)
        q2_pad_len = cur_bound - len(cur_q2_inds)
        if self.pad_prefix:
            cur_q1_padded = [0] * q1_pad_len + cur_q1_inds
            cur_q2_padded = [0] * q2_pad_len + cur_q2_inds
        else:
            cur_q1_padded = cur_q1_inds + [0] * q1_pad_len
            cur_q2_padded = cur_q2_inds + [0] * q2_pad_len
        cur_pair_padded = cur_q1_padded + cur_q2_padded
        data_set_id_vectors.append(cur_pair_padded)
    data_set_id_vectors = np.array(data_set_id_vectors)

    print("generating bucket_idx_vectors...")
    self.bucket_idx_vectors = {}
    for b, id_list in self.buckets.items():
        tmp = {}
        if self.train_test == "train":
            tmplabels = self.data_set["label"].iloc[id_list].values
            tmp["label"] = tmplabels
        tmpdata = np.array(data_set_id_vectors[id_list].tolist())
        tmp["data"] = tmpdata
        self.bucket_idx_vectors[b] = tmp
    print("finish generating inter files.")

    print("begin caching..")
    all_cached = {}
    all_cached["item2idx"] = self.item2idx
    all_cached["idx2item"] = self.idx2item
    all_cached["buckets"] = self.buckets
    all_cached["bounds"] = self.bounds
    all_cached["bucket_idx_vectors"] = self.bucket_idx_vectors
    try:
        os.makedirs("./temp")
    except:
        pass
    pickle.dump(all_cached, open(self._temp_file, "wb"))
    print("finish caching")
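### The per-question padding above follows the standard pattern; a small standalone helper
### makes the prefix/suffix distinction explicit. This is only an illustrative sketch
### (`pad_ids` is not a function in this repo):
def pad_ids(id_vec, bound, pad_prefix, pad_id=0):
    """Pad a list of token ids up to `bound`, either in front or at the back."""
    pad = [pad_id] * (bound - len(id_vec))
    return pad + id_vec if pad_prefix else id_vec + pad

# e.g. pad_ids([4, 7, 9], 5, pad_prefix=True)  -> [0, 0, 4, 7, 9]
#      pad_ids([4, 7, 9], 5, pad_prefix=False) -> [4, 7, 9, 0, 0]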
def __init__(self, data_df, space, bucket_num, batch_size, is_prefix_pad, is_shuffle, is_test):
    assert space in ["words", "chars"]
    self.data_df = data_df
    self.space = space
    self.bucket_num = bucket_num
    self.batch_size = batch_size
    self.is_prefix_pad = is_prefix_pad
    self.is_shuffle = is_shuffle
    self.is_test = is_test

    if os.path.exists(self._temp_file):
        print("detect cached intermediate files...loading...")
        if DataGenerator.item2idx is None:
            all_cached = pickle.load(open(self._temp_file, "rb"))
            DataGenerator.item2idx = all_cached["item2idx"]
            DataGenerator.idx2item = all_cached["idx2item"]
            DataGenerator.item_embed = all_cached["item_embed"]
            DataGenerator.q2idvec = all_cached["q2idvec"]
            print("finish")
    else:
        print("Generating intermediate files...")
        DataGenerator.item2idx = {}
        DataGenerator.idx2item = {}
        DataGenerator.item_embed = {}
        DataGenerator.q2idvec = {}

        spaces = ["words", "chars"]
        question_df = DataSet.load_all_questions()
        all_qids = DataSet.load_all_unique_ids_train_test()
        for space in spaces:
            print("for", space)
            corpus = question_df[space]
            w2i, i2w = self._get_item2id_id2item(corpus)
            DataGenerator.item2idx[space] = w2i
            DataGenerator.idx2item[space] = i2w
            ## Finish mapping table

            term_embed = DataSet.load_term_embed(space)
            embed_size = term_embed.shape[1]
            pad_embed = np.array([0] * embed_size).reshape(1, -1)
            all_embeding = np.vstack([pad_embed, term_embed])
            all_index = [_PAD_] + term_embed.index.values.tolist()
            all_embeding_df = pd.DataFrame(data=all_embeding, index=all_index)
            sort_word = [i2w[i] for i in range(len(i2w))]
            DataGenerator.item_embed[space] = all_embeding_df.loc[sort_word].values
            ## Finish item embedding

            tmp_q2idvec = {}
            for qid in all_qids:
                items = question_df.loc[qid][space].split()
                idvec = np.array([w2i[w] for w in items])
                tmp_q2idvec[qid] = idvec
            DataGenerator.q2idvec[space] = tmp_q2idvec
            ## Finish map from question to id vector
        print("finish generating inter files.")

        print("begin caching..")
        all_cached = {}
        all_cached["item2idx"] = DataGenerator.item2idx
        all_cached["idx2item"] = DataGenerator.idx2item
        all_cached["item_embed"] = DataGenerator.item_embed
        all_cached["q2idvec"] = DataGenerator.q2idvec
        try:
            os.makedirs("./temp")
        except:
            pass
        pickle.dump(all_cached, open(self._temp_file, "wb"))
        print("finish caching")
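### The branch above is a plain "load the pickle if it exists, otherwise build and cache it"
### pattern shared via class-level attributes. A generic helper expressing the same idea
### (illustrative only, not part of this repo):
import os
import pickle

def load_or_build(cache_path, build_fn):
    """Return the cached object at cache_path, building and caching it on a miss."""
    if os.path.exists(cache_path):
        with open(cache_path, "rb") as f:
            return pickle.load(f)
    obj = build_fn()
    os.makedirs(os.path.dirname(cache_path) or ".", exist_ok=True)
    with open(cache_path, "wb") as f:
        pickle.dump(obj, f)
    return obj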
if __name__ == "__main__":
    # train = DataSet.load_train()
    # a = zip(train["word_len_q1"], train["word_len_q2"])[:1000]
    # bucket = GreedyBucket()
    # fitres = bucket.fit(a)
    # bucket, bounds = bucket.get_split_results(fitres, 5)
    # print(bucket, bounds)
    dl = DataGenerator()
    train = DataSet.load_train()
    from sklearn.model_selection import train_test_split
    xtr, xte = train_test_split(train, test_size=0.33)
    dl.prepare(xtr[:10], 1, "words", True, False)
    tr_g = dl.get_data_generator(True, 20)
    for d, i, l in tr_g:
        print(d)
        print(i)
        print(l)
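### GreedyBucket's implementation is not shown in this section; the sketch below only
### illustrates one common way to do length-based bucketing (group pairs by their longer
### side and pad each group to its own maximum) so short pairs are not padded to the global
### maximum. Names and the return format are illustrative, not the repo's API.
def simple_length_buckets(len_pairs, bucket_num):
    """Return {bucket_id: [row indices]} and a per-bucket padding bound."""
    order = sorted(range(len(len_pairs)), key=lambda i: max(len_pairs[i]))
    chunk = (len(order) + bucket_num - 1) // bucket_num
    buckets, bounds = {}, {}
    for b in range(bucket_num):
        ids = order[b * chunk:(b + 1) * chunk]
        if not ids:
            continue
        buckets[b] = ids
        bounds[b] = max(max(len_pairs[i]) for i in ids)
    return buckets, bounds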
def train():
    space = "words"
    is_freeze = True
    hidden_size = 100
    num_layers = 2
    lstm_dropout_p = 0.6  ##TODO 0.4->0.5->0.6
    lstm_input_dropout = 0.6
    bidirectional = True
    linear_hidden_size = 200
    linear_hid_drop_p = 0.3
    train_name = "v0.2"

    train_df = DataSet.load_train()
    xtr_df, xval_df = train_test_split(train_df, test_size=0.25)
    test_df = DataSet.load_test()

    ### Generate data generators
    train_dg = DataGenerator(data_df=xtr_df, space=space, bucket_num=5, batch_size=512,
                             is_prefix_pad=True, is_shuffle=True, is_test=False)
    val_dg = DataGenerator(data_df=xval_df, space=space, bucket_num=5, batch_size=512,
                           is_prefix_pad=True, is_shuffle=False, is_test=False)
    test_dg = DataGenerator(data_df=test_df, space=space, bucket_num=5, batch_size=512,
                            is_prefix_pad=True, is_shuffle=False, is_test=True)

    ### Must call prepare before using
    train_dg.prepare()
    val_dg.prepare()
    test_dg.prepare()

    ### Load word embeddings; train_dg, val_dg or test_dg can all be used
    item_embed = train_dg.get_item_embed_tensor(space)

    ### Initialize network
    siamese_lstm = Siamese_LSTM(pre_trained_embedding=item_embed,
                                is_freeze=is_freeze,
                                hidden_size=hidden_size,
                                number_layers=num_layers,
                                lstm_dropout_p=lstm_dropout_p,
                                bidirectional=bidirectional,
                                linear_hid_size=linear_hidden_size,
                                linear_hid_drop_p=linear_hid_drop_p,
                                input_drop_p=lstm_input_dropout)

    ### Initialize model using network
    siamese_model = Model(siamese_lstm)
    criteria = nn.BCEWithLogitsLoss()
    optimizer_ft = optim.Adam(filter(lambda p: p.requires_grad, siamese_lstm.parameters()),
                              lr=0.001)  ##TODO 0.001
    exp_lr_scheduler = lr_scheduler.ExponentialLR(optimizer_ft, gamma=0.99)  ##TODO 0.99

    ### Train
    siamese_model.train(train_name, train_dg, val_dg, criteria, optimizer_ft,
                        exp_lr_scheduler, 150)  ##TODO 150

    preds = siamese_model.predict(test_dg).numpy()
    preds = pd.DataFrame({"y_pre": preds})
    preds.to_csv("submission.csv", index=False)
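### Siamese_LSTM is defined elsewhere in the repo; the sketch below only outlines the usual
### shape of such a network given the constructor arguments above (shared embedding and LSTM
### encoder, concatenated sentence vectors, MLP head producing one logit for
### BCEWithLogitsLoss). It is an assumption-laden illustration, not the repo's class.
import torch
import torch.nn as nn

class SiameseLSTMSketch(nn.Module):
    def __init__(self, pre_trained_embedding, is_freeze, hidden_size, number_layers,
                 lstm_dropout_p, bidirectional, linear_hid_size, linear_hid_drop_p,
                 input_drop_p):
        super(SiameseLSTMSketch, self).__init__()
        self.embed = nn.Embedding.from_pretrained(pre_trained_embedding, freeze=is_freeze)
        self.input_drop = nn.Dropout(input_drop_p)
        self.encoder = nn.LSTM(pre_trained_embedding.size(1), hidden_size,
                               num_layers=number_layers, batch_first=True,
                               dropout=lstm_dropout_p, bidirectional=bidirectional)
        directions = 2 if bidirectional else 1
        self.head = nn.Sequential(
            nn.Linear(2 * hidden_size * directions, linear_hid_size),
            nn.ReLU(),
            nn.Dropout(linear_hid_drop_p),
            nn.Linear(linear_hid_size, 1),
        )

    def encode(self, ids):
        # ids: (batch, seq_len) padded token ids; with prefix padding the last
        # time step carries the full sentence context
        out, _ = self.encoder(self.input_drop(self.embed(ids)))
        return out[:, -1, :]

    def forward(self, q1_ids, q2_ids):
        v1, v2 = self.encode(q1_ids), self.encode(q2_ids)
        return self.head(torch.cat([v1, v2], dim=1)).squeeze(1)  # raw logits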