def get_val(self, shuffle=True, iterable=True, max_sequence_length=0):
    """Return the dev ('dev') split as model-ready inputs.

    Args:
        shuffle: whether the returned BucketIterator shuffles batches.
        iterable: if True return a BucketIterator; otherwise return the
            fully padded arrays (plus masks when ``self.bert_enabled``).
        max_sequence_length: pad/truncate length; 0 means use
            ``self.max_sequence_length``.

    Returns:
        Either a BucketIterator over ``(x, y)`` or ``(x, y)`` /
        ``([x, x_mask], y)`` arrays depending on ``iterable`` and
        ``self.bert_enabled``.
    """
    x = self.datas['dev']['X']
    x = [self.embedding.text_to_sequence(sent) for sent in x]
    y = to_categorical(np.asarray(self.datas['dev']['y']))
    if max_sequence_length == 0:
        max_sequence_length = self.max_sequence_length
    if iterable:
        # Fix: honor the caller's `shuffle` flag (was hard-coded to True,
        # silently ignoring the parameter).
        return BucketIterator((x, y),
                              batch_size=self.batch_size,
                              shuffle=shuffle,
                              max_sequence_length=max_sequence_length,
                              backend=self.language)
    if self.bert_enabled:
        # Fix: use the (possibly caller-overridden) local
        # max_sequence_length instead of always self.max_sequence_length.
        x, x_mask = to_array(x, maxlen=max_sequence_length, use_mask=True)
        return [x, x_mask], y
    x = to_array(x, maxlen=max_sequence_length, use_mask=False)
    return x, y
def get_test(self, overlap_feature=False, iterable=True):
    """Build test-set question/answer inputs for a matching model.

    Each test row is encoded into [question_ids, answer_ids] and,
    when `overlap_feature` is set, a third word-overlap feature.
    Returns a non-shuffled BucketIterator when `iterable`, otherwise
    padded arrays (duplicated per column for non-pointwise matching).
    """
    def encode(row):
        # Per-row encoding: token-id sequences for question and answer,
        # optionally followed by the overlap-index feature.
        fields = [
            self.embedding.text_to_sequence(row["question"]),
            self.embedding.text_to_sequence(row["answer"]),
        ]
        if overlap_feature:
            fields.append(
                self.embedding.overlap_index(row['question'], row['answer']))
        return fields

    samples = self.datas['test'].apply(encode, axis=1)
    # Transpose row-wise samples into column-wise sequences.
    columns = list(zip(*samples))
    if iterable:
        return BucketIterator(columns,
                              batch_size=self.batch_size,
                              shuffle=False)
    if self.match_type == 'pointwise':
        return [to_array(col, self.max_sequence_length) for col in columns]
    # Non-pointwise (pairwise) consumers expect each padded column twice.
    return [[to_array(col, self.max_sequence_length),
             to_array(col, self.max_sequence_length)] for col in columns]
def get_test_2(self, shuffle=True, iterable=True, max_sequence_length=0, overlap_feature=False): x_data = [] #sample on the whole data, only support pointwise match type: x=[q,pos_a],y q = self.datas["test"]["question"] a = self.datas["test"]["answer"] y = self.datas["test"]["flag"] if max_sequence_length == 0: max_sequence_length = self.max_sequence_length q = [self.embedding.text_to_sequence(sent) for sent in q] # q = to_array(q,maxlen = self.max_sequence_length, use_mask = False) a = [self.embedding.text_to_sequence(sent) for sent in a] # a = to_array(a,maxlen = self.max_sequence_length, use_mask = False) y = to_categorical(np.asarray(y)) if self.bert_enabled: q, q_mask = to_array(q, maxlen=self.max_sequence_length, use_mask=True) a, a_mask = to_array(a, maxlen=self.max_sequence_length, use_mask=True) x_data = [q, q_mask, a, a_mask] if self.match_type == 'pairwise': x_data = x_data + [a, a_mask] y = [l for l in zip(*[q, a, a])] else: q = to_array(q, maxlen=self.max_sequence_length, use_mask=False) a = to_array(a, maxlen=self.max_sequence_length, use_mask=False) x_data = [q, a] if self.match_type == 'pairwise': x_data = x_data + [a] y = [l for l in zip(*[q, a, a])] if overlap_feature: overlap = [ self.overlap_index(q_seq, a_seq) for q_seq, a_seq in zip(*x_data) ] x_data = x_data + overlap if iterable: x = [l for l in zip(*x_data)] data = (x, y) return BucketIterator(data, batch_size=self.batch_size, batch_num=int(self.num_samples / self.batch_size), shuffle=True) else: return x_data, y
def transformKeras(self, data):
    """Normalize each column of `data` into a Keras-consumable array.

    Columns that are plain Python lists of int sequences are padded to
    ``self.max_sequence_length`` via ``to_array``; everything else
    (ndarrays, flat int lists, non-int payloads) is wrapped with
    ``np.asarray`` unchanged.
    """
    def _to_model_input(col):
        # Exact type() checks (not isinstance) deliberately mirror the
        # original behavior, e.g. numpy scalars do not count as int.
        needs_padding = (type(col[0]) != int
                         and type(col) != np.ndarray
                         and type(col[0][0]) == int)
        if needs_padding:
            return to_array(col, self.max_sequence_length, use_mask=False)
        return np.asarray(col)

    return [_to_model_input(col) for col in data]
def get_test(self, shuffle=True, iterable=True, max_sequence_length=0):
    """Return the test split as model-ready inputs.

    Args:
        shuffle: whether the returned BucketIterator shuffles batches.
        iterable: if True return a BucketIterator; otherwise return the
            fully padded arrays (plus masks when ``self.bert_enabled``).
        max_sequence_length: pad/truncate length; 0 means use
            ``self.max_sequence_length``.

    Returns:
        Either a BucketIterator over ``(x, y)`` or ``(x, y)`` /
        ``([x, x_mask], y)`` arrays depending on ``iterable`` and
        ``self.bert_enabled``.
    """
    x = self.datas['test']['X']
    x = [self.embedding.text_to_sequence(sent) for sent in x]
    y = to_categorical(np.asarray(self.datas['test']['y']))
    if max_sequence_length == 0:
        max_sequence_length = self.max_sequence_length
    if iterable:
        # Fix: honor the caller's `shuffle` flag (was hard-coded to True,
        # silently ignoring the parameter).
        return BucketIterator((x, y),
                              batch_size=self.batch_size,
                              shuffle=shuffle,
                              max_sequence_length=max_sequence_length)
    if self.bert_enabled:
        # Fix: use the (possibly caller-overridden) local
        # max_sequence_length instead of always self.max_sequence_length.
        x, x_mask = to_array(x, maxlen=max_sequence_length, use_mask=True)
        return [x, x_mask], y
    x = to_array(x, maxlen=max_sequence_length, use_mask=False)
    return x, y
def transformTF(self, data):
    """Pad list-of-sequence columns for TF input; pass arrays through.

    A column is left untouched when it is already a numpy array or a
    flat list of ints; otherwise it is padded to
    ``self.max_sequence_length`` via ``to_array``.
    """
    converted = []
    for column in data:
        # De Morgan of the original filter: skip padding for columns
        # that are flat int lists or already ndarrays.
        already_numeric = type(column[0]) == int or type(column) == np.ndarray
        if already_numeric:
            converted.append(column)
        else:
            converted.append(to_array(column, self.max_sequence_length))
    return converted
# Pointwise-matching training fragment: builds the model from `params`,
# compiles it with a loss/metric pair chosen by `params.onehot`, then
# trains one epoch at a time, evaluating on the test set after each epoch.
qdnn = models.setup(params)
model = qdnn.getModel()
# model.compile(loss = rank_hinge_loss({'margin':0.2}),
#             optimizer = units.getOptimizer(name=params.optimizer,lr=params.lr),
#             metrics=['accuracy'])
# test_data.append(test_data[0])
# NOTE(review): `parameter` is not defined anywhere in this fragment —
# possibly a typo for `params`; confirm against the full file.
print(parameter)
evaluations=[]
if params.match_type == 'pointwise':
    if params.onehot:
        # One-hot targets train with a 10x larger learning rate here —
        # presumably an empirically tuned choice; verify.
        params.lr = 10 *params.lr
    # Pad every test column to the reader's maximum sequence length.
    test_data = [to_array(i,reader.max_sequence_length) for i in test_data]
    # One-hot labels -> hinge loss + accuracy; scalar labels -> MSE.
    loss_type,metric_type = ("categorical_hinge","acc") if params.onehot else ("mean_squared_error","mean_squared_error")
    model.compile(loss =loss_type,
                  optimizer = units.getOptimizer(name=params.optimizer,lr=params.lr),
                  metrics=[metric_type])
    for i in range(params.epochs):
        # Optional unbalanced sampling mode, guarded so older configs
        # without the `unbalance` attribute still work.
        if "unbalance" in params.__dict__ and params.unbalance:
            model.fit_generator(reader.getPointWiseSamples4Keras(onehot = params.onehot,unbalance=params.unbalance),epochs = 1,steps_per_epoch=int(len(reader.datas["train"])/reader.batch_size),verbose = True)
        else:
            model.fit_generator(reader.getPointWiseSamples4Keras(onehot = params.onehot),epochs = 1,steps_per_epoch=len(reader.datas["train"]["question"].unique())/reader.batch_size,verbose = True)
        # Per-epoch evaluation on the held-out test set.
        y_pred = model.predict(x = test_data)
        # For one-hot outputs, take the positive-class probability after
        # a per-question softmax; otherwise use raw predictions.
        score =batch_softmax_with_first_item(y_pred)[:,1] if params.onehot else y_pred
        metric = reader.evaluate(score, mode = "test")
        evaluations.append(metric)
        print(metric)
def run(params):
    """Train and evaluate a model described by ``params``.

    Supports two dataset types: 'qa' (generator-based pairwise/pointwise
    matching, evaluated per epoch with MAP/MRR/P@1) and 'classification'
    (plain fit/evaluate). BERT-style networks force a 512-token limit and
    add attention-mask inputs.

    NOTE(review): this function mixes the module-level ``reader`` and
    ``params.reader`` — they are presumably the same object; confirm.
    NOTE(review): if ``params.dataset_type`` is neither 'qa' nor
    'classification', ``df`` is never assigned and the final logging line
    raises NameError.
    """
    if "bert" in params.network_type.lower() :
        # BERT encoders fix the sequence length at 512.
        params.max_sequence_length = 512
        reader.max_sequence_length = 512
    evaluation=[]
    # params=dataset.classification.process_embedding(reader,params)
    qdnn = models.setup(params)
    model = qdnn.getModel()
    model.summary()
    # Resolve loss/metric: prefer a function defined in loss.pairwise_loss,
    # otherwise pass the string through to Keras.
    if hasattr(loss.pairwise_loss, params.loss):
        loss_func = getattr(loss.pairwise_loss, params.loss)
    else:
        loss_func = params.loss
    optimizer = units.getOptimizer(name=params.optimizer,lr=params.lr)
    test_data = params.reader.get_test(iterable = False)
    test_data = [to_array(i,reader.max_sequence_length) for i in test_data]
    if hasattr(loss.pairwise_loss, params.metric_type):
        metric_func = getattr(loss.pairwise_loss, params.metric_type)
    else:
        metric_func = params.metric_type
    model.compile(loss = loss_func,
                  optimizer = optimizer,
                  metrics=[metric_func])
    # pairwise:
    #     loss = identity_loss
    #     metric = precision_batch
    # pointwise:
    #     loss = categorical_hinge or mean_squared_error
    #     metric = acc or mean_squared_error
    # classification:
    #     loss = mean_squared_error
    #     matrix = acc
    if params.dataset_type == 'qa':
        # from models.match import keras as models
        for i in range(params.epochs):
            # One epoch of generator training, then rank-metric evaluation.
            model.fit_generator(reader.batch_gen(reader.get_train(iterable = True)),epochs = 1,steps_per_epoch=int(len(reader.datas["train"])/reader.batch_size),verbose = True)
            y_pred = model.predict(x = test_data)
            score = batch_softmax_with_first_item(y_pred)[:,1] if params.onehot else y_pred
            metric = reader.evaluate(score, mode = "test")
            evaluation.append(metric)
            print(metric)
            logger.info(metric)
        df=pd.DataFrame(evaluation,columns=["map","mrr","p1"])
    elif params.dataset_type == 'classification':
        # from models import representation as models
        # model.summary()
        train_data = params.reader.get_train(iterable = False)
        test_data = params.reader.get_test(iterable = False)
        val_data =params.reader.get_val(iterable = False)
        # (train_x, train_y),(test_x, test_y),(val_x, val_y) = reader.get_processed_data()
        train_x, train_y = train_data
        test_x, test_y = test_data
        val_x, val_y = val_data
        if "bert" in params.network_type.lower() :
            # BERT path: every split gets (ids, attention mask) inputs.
            train_x, train_x_mask = to_array(train_x,reader.max_sequence_length,use_mask=True)
            test_x,test_x_mask = to_array(test_x,reader.max_sequence_length,use_mask=True)
            val_x,val_x_mask = to_array(val_x,reader.max_sequence_length,use_mask=True)
            #pretrain_x, pretrain_y = dataset.get_sentiment_dic_training_data(reader,params)
            #model.fit(x=pretrain_x, y = pretrain_y, batch_size = params.batch_size, epochs= 3,validation_data= (test_x, test_y))
            history = model.fit(x=[train_x,train_x_mask], y = train_y, batch_size = params.batch_size, epochs= params.epochs,validation_data= ([test_x,test_x_mask], test_y))
            # NOTE(review): validation is done on the *test* split during
            # fit and the final score on the *val* split — the original
            # comment ("change the order to val and test") suggests these
            # were meant to be swapped; confirm before relying on scores.
            metric = model.evaluate(x = [val_x,val_x_mask], y = val_y)
        else:
            train_x = to_array(train_x,reader.max_sequence_length,use_mask=False)
            test_x = to_array(test_x,reader.max_sequence_length,use_mask=False)
            val_x = to_array(val_x,reader.max_sequence_length,use_mask=False)
            #pretrain_x, pretrain_y = dataset.get_sentiment_dic_training_data(reader,params)
            #model.fit(x=pretrain_x, y = pretrain_y, batch_size = params.batch_size, epochs= 3,validation_data= (test_x, test_y))
            history = model.fit(x=train_x, y = train_y, batch_size = params.batch_size, epochs= params.epochs,validation_data= (test_x, test_y))
            # NOTE(review): same test/val ordering concern as the BERT path.
            metric = model.evaluate(x = val_x, y = val_y)
        evaluation.append(metric)
        logger.info(metric)
        print(metric)
        # NOTE(review): classification metrics are labeled with the QA
        # column names ("map","mrr","p1") — presumably loss/accuracy in
        # reality; verify downstream consumers.
        df=pd.DataFrame(evaluation,columns=["map","mrr","p1"])
    logger.info("\n".join([params.to_string(),"score: "+str(df.max().to_dict())]))
    K.clear_session()
def batch_gen(self, data_generator):
    """Adapt raw training batches into (inputs, targets) for Keras.

    Pointwise + unbalanced sampling: re-samples the whole train split
    through a BucketIterator and yields (q, a) inputs with flag-derived
    labels (finite, one pass). Pointwise otherwise and pairwise both wrap
    ``data_generator`` in ``while True`` so Keras' fit_generator can draw
    indefinitely.

    NOTE(review): the unbalanced-sampling branch is a single pass while
    the other branches loop forever — confirm this asymmetry is intended.
    """
    if self.match_type == 'pointwise':
        # self.unbalanced_sampling = False
        if self.unbalanced_sampling:
            # print('system goes here!!')
            # Encode every train row as (question_ids, answer_ids, flag).
            process = lambda row: [
                self.embedding.text_to_sequence(row["question"]),
                self.embedding.text_to_sequence(row["answer"]),
                row['flag']
            ]
            samples = self.datas["train"].apply(process, axis=1)
            for batch in BucketIterator(
                    [i for i in zip(*samples.values)],
                    batch_size=self.batch_size,
                    shuffle=True,
                    max_sequence_length=self.max_sequence_length):
                if self.onehot:
                    # One-hot targets: flag -> [0,1] (positive) / [1,0].
                    if self.bert_enabled:
                        q, q_mask = to_array(batch[0], self.max_sequence_length, use_mask=True)
                        a, a_mask = to_array(batch[1], self.max_sequence_length, use_mask=True)
                        yield [q, q_mask, a, a_mask], np.array([[0, 1] if i else [1, 0] for i in batch[2]])
                    else:
                        yield batch[:2], np.array([[0, 1] if i else [1, 0] for i in batch[2]])
                else:
                    # Scalar targets: raw flags.
                    if self.bert_enabled:
                        q, q_mask = to_array(batch[0], self.max_sequence_length, use_mask=True)
                        a, a_mask = to_array(batch[1], self.max_sequence_length, use_mask=True)
                        yield [q, q_mask, a, a_mask], np.array(batch[2])
                    else:
                        yield batch[:2], np.array(batch[2])
        else:
            # Balanced pointwise: each (q, pos, neg) triple is flattened
            # into two pointwise samples — (q, pos) labeled 1/[0,1] and
            # (q, neg) labeled 0/[1,0].
            while True:
                for batch in data_generator:
                    q, a, neg = batch
                    if self.onehot:
                        data = [[
                            np.concatenate([q, q], 0).astype(int),
                            np.concatenate([a, neg], 0).astype(int)
                        ], np.array([[0, 1]] * len(q) + [[1, 0]] * len(q))]
                    else:
                        data = [[
                            np.concatenate([q, q], 0).astype(int),
                            np.concatenate([a, neg], 0).astype(int)
                        ], [1] * len(q) + [0] * len(q)]
                    yield data
    if self.match_type == 'pairwise':
        # Pairwise: targets are the inputs themselves (triplet-style
        # losses compute scores from the model outputs directly).
        while True:
            for batch in data_generator:
                if self.bert_enabled:
                    q, q_mask = to_array(batch[0], self.max_sequence_length, use_mask=True)
                    a, a_mask = to_array(batch[1], self.max_sequence_length, use_mask=True)
                    neg_a, neg_a_mask = to_array(batch[2], self.max_sequence_length, use_mask=True)
                    yield [q, q_mask, a, a_mask, neg_a, neg_a_mask], [q, a, neg_a]
                else:
                    yield batch, batch
def get_train_2(self, shuffle=True, iterable=True, max_sequence_length=0, overlap_feature=False, sampling_per_question=False, need_balanced=False, always=False, balance_temperature=1):
    """Build the training split either per-question or over the whole data.

    With ``sampling_per_question``: for every (question, positive answer)
    pair a random negative answer from the same question group is drawn;
    pointwise mode flattens this into labeled (q, a) samples, pairwise
    mode keeps (q, pos, neg) triples with the triples themselves as ``y``.
    Without it: the whole split is encoded pointwise (asserted).

    Side effect: sets ``self.num_samples``.

    NOTE(review): the ``shuffle`` parameter is unused (the iterator
    hard-codes shuffle=True), and the local ``max_sequence_length``
    override is computed but ``to_array`` always uses
    ``self.max_sequence_length`` — confirm intent.
    """
    x_data = []
    num_samples = 0
    if sampling_per_question:
        #sampling on a per-question basis
        q = []
        pos_a = []
        neg_a = []
        a = []
        overlap_pos = []
        overlap_neg = []
        y = []
        for question, group in self.datas["train"].groupby("question"):
            seq_q = self.embedding.text_to_sequence(question)
            pos_answers = group[group["flag"] == 1]["answer"]
            neg_answers = group[group["flag"] == 0][
                "answer"]  #.reset_index()
            # Questions lacking either a positive or a negative answer
            # cannot form a training pair and are skipped.
            if len(pos_answers) == 0 or len(neg_answers) == 0:
                continue
            for pos in pos_answers:
                seq_pos_a = self.embedding.text_to_sequence(pos)
                # One uniformly random negative per positive answer.
                neg_index = np.random.choice(neg_answers.index)
                neg = neg_answers.loc[neg_index, ]
                seq_neg_a = self.embedding.text_to_sequence(neg)
                if self.match_type == 'pointwise':
                    # Flatten the triple into two labeled samples.
                    q = q + [seq_q, seq_q]
                    a = a + [seq_pos_a, seq_neg_a]
                    y = y + [1, 0]
                    num_samples = num_samples + 2
                else:
                    q.append(seq_q)
                    num_samples = num_samples + 1
                    pos_a.append(seq_pos_a)
                    neg_a.append(seq_neg_a)
                    if overlap_feature:
                        overlap_pos.append(self.overlap_index(
                            seq_q, seq_pos_a))
                        overlap_neg.append(self.overlap_index(
                            seq_q, seq_neg_a))
        if self.bert_enabled:
            # BERT path: pad and produce attention masks for each column.
            q, q_mask = to_array(q, maxlen=self.max_sequence_length, use_mask=True)
            if self.match_type == 'pairwise':
                pos_a, pos_a_mask = to_array(
                    pos_a, maxlen=self.max_sequence_length, use_mask=True)
                neg_a, neg_a_mask = to_array(
                    neg_a, maxlen=self.max_sequence_length, use_mask=True)
                x_data = [q, q_mask, pos_a, pos_a_mask, neg_a, neg_a_mask]
                # Pairwise targets are the (q, pos, neg) triples themselves.
                y = [l for l in zip(*[q, pos_a, neg_a])]
            else:
                y = to_categorical(np.asarray(y))
                a, a_mask = to_array(a, maxlen=self.max_sequence_length, use_mask=True)
                x_data = [q, q_mask, a, a_mask]
        else:
            q = to_array(q, maxlen=self.max_sequence_length, use_mask=False)
            if self.match_type == 'pairwise':
                pos_a = to_array(pos_a, maxlen=self.max_sequence_length, use_mask=False)
                neg_a = to_array(neg_a, maxlen=self.max_sequence_length, use_mask=False)
                x_data = [q, pos_a, neg_a]
                y = [l for l in zip(*[q, pos_a, neg_a])]
            else:
                y = to_categorical(np.asarray(y))
                a = to_array(a, maxlen=self.max_sequence_length, use_mask=False)
                x_data = [q, a]
        if overlap_feature:
            x_data = x_data + [overlap_pos, overlap_neg]
    else:
        num_samples = int(len(self.datas["train"]))
        #sample on the whole data, only support pointwise match type: x=[q,pos_a],y
        assert self.match_type == 'pointwise'
        q = self.datas["train"]["question"]
        a = self.datas["train"]["answer"]
        y = self.datas["train"]["flag"]
        q = [self.embedding.text_to_sequence(sent) for sent in q]
        # q = to_array(q,maxlen = self.max_sequence_length, use_mask = False)
        a = [self.embedding.text_to_sequence(sent) for sent in a]
        # a = to_array(a,maxlen = self.max_sequence_length, use_mask = False)
        y = to_categorical(np.asarray(y))
        if max_sequence_length == 0:
            max_sequence_length = self.max_sequence_length
        if self.bert_enabled:
            q, q_mask = to_array(q, maxlen=self.max_sequence_length, use_mask=True)
            a, a_mask = to_array(a, maxlen=self.max_sequence_length, use_mask=True)
            x_data = [q, q_mask, a, a_mask]
        else:
            q = to_array(q, maxlen=self.max_sequence_length, use_mask=False)
            a = to_array(a, maxlen=self.max_sequence_length, use_mask=False)
            x_data = [q, a]
        if overlap_feature:
            # NOTE(review): with bert_enabled, zip(*x_data) yields 4-tuples
            # that cannot unpack into (q_seq, a_seq); also the per-question
            # branch above wraps overlap columns in a list while this
            # splices them in directly — verify both.
            overlap = [
                self.overlap_index(q_seq, a_seq)
                for q_seq, a_seq in zip(*x_data)
            ]
            x_data = x_data + overlap
    self.num_samples = num_samples
    if iterable:
        # Transpose column-wise arrays into per-sample tuples.
        x = [l for l in zip(*x_data)]
        data = (x, y)
        return BucketIterator(
            data,
            batch_size=self.batch_size,
            shuffle=True,
            need_balanced=need_balanced,
            always=always,
            balance_temperature=balance_temperature).__iter__()
        # return BucketIterator(data,batch_size=self.batch_size, batch_num = int(self.num_samples/self.batch_size),shuffle=True,need_balanced=need_balanced,always=always).__iter__()
    else:
        return x_data, y