Ejemplo n.º 1
0
 def get_val(self, shuffle=True, iterable=True, max_sequence_length=0):
     """Return the dev split as a BucketIterator or as padded arrays.

     Parameters
     ----------
     shuffle : bool
         Whether the returned iterator shuffles batches.
     iterable : bool
         If True return a BucketIterator; otherwise return arrays directly.
     max_sequence_length : int
         Pad/truncate length; 0 means "use the reader-wide default".

     Returns
     -------
     BucketIterator when ``iterable`` is True; otherwise
     ``([x, x_mask], y)`` when BERT is enabled, else ``(x, y)``.
     """
     x = self.datas['dev']['X']
     x = [self.embedding.text_to_sequence(sent) for sent in x]
     y = to_categorical(np.asarray(self.datas['dev']['y']))
     data = (x, y)
     if max_sequence_length == 0:
         # 0 is the sentinel for "use the reader default".
         max_sequence_length = self.max_sequence_length
     if iterable:
         # Fix: honour the caller's `shuffle` flag (was hard-coded True).
         return BucketIterator(data,
                               batch_size=self.batch_size,
                               shuffle=shuffle,
                               max_sequence_length=max_sequence_length,
                               backend=self.language)
     if self.bert_enabled:
         # Fix: use the resolved max_sequence_length instead of always
         # falling back to self.max_sequence_length.
         x, x_mask = to_array(x,
                              maxlen=max_sequence_length,
                              use_mask=True)
         return [x, x_mask], y
     x = to_array(x,
                  maxlen=max_sequence_length,
                  use_mask=False)
     return x, y
Ejemplo n.º 2
0
    def get_test(self, overlap_feature=False, iterable=True):
        """Return the test split, bucketed or fully materialized.

        Each row is encoded as [question_seq, answer_seq] plus, when
        ``overlap_feature`` is set, a word-overlap index feature.
        """

        def encode(row):
            # Turn one DataFrame row into its sequence representation.
            fields = [
                self.embedding.text_to_sequence(row["question"]),
                self.embedding.text_to_sequence(row["answer"]),
            ]
            if overlap_feature:
                fields.append(
                    self.embedding.overlap_index(row['question'],
                                                 row['answer']))
            return fields

        samples = self.datas['test'].apply(encode, axis=1)
        if iterable:
            # Evaluation order must be stable, hence shuffle=False.
            return BucketIterator([i for i in zip(*samples)],
                                  batch_size=self.batch_size,
                                  shuffle=False)
        if self.match_type == 'pointwise':
            # One padded array per column (question, answer, ...).
            return [
                to_array(column, self.max_sequence_length)
                for column in zip(*samples)
            ]
        # Non-pointwise models expect each column duplicated.
        return [[
            to_array(column, self.max_sequence_length),
            to_array(column, self.max_sequence_length)
        ] for column in zip(*samples)]
Ejemplo n.º 3
0
    def get_test_2(self,
                   shuffle=True,
                   iterable=True,
                   max_sequence_length=0,
                   overlap_feature=False):
        """Build model-ready test data from the QA test split.

        Encodes every (question, answer) pair of ``self.datas['test']`` into
        integer sequences plus one-hot labels, then returns either a
        BucketIterator over the data (``iterable=True``) or the raw column
        arrays ``(x_data, y)``.
        """

        x_data = []
        #sample on the whole data, only support pointwise match type: x=[q,pos_a],y

        q = self.datas["test"]["question"]
        a = self.datas["test"]["answer"]
        y = self.datas["test"]["flag"]

        if max_sequence_length == 0:
            # 0 acts as a sentinel meaning "use the reader-wide default".
            max_sequence_length = self.max_sequence_length

        q = [self.embedding.text_to_sequence(sent) for sent in q]
        #        q = to_array(q,maxlen = self.max_sequence_length, use_mask = False)
        a = [self.embedding.text_to_sequence(sent) for sent in a]
        #        a = to_array(a,maxlen = self.max_sequence_length, use_mask = False)
        y = to_categorical(np.asarray(y))

        if self.bert_enabled:
            # NOTE(review): the to_array calls below use
            # self.max_sequence_length rather than the locally resolved
            # max_sequence_length — confirm whether the override is meant
            # to apply here as well.
            q, q_mask = to_array(q,
                                 maxlen=self.max_sequence_length,
                                 use_mask=True)
            a, a_mask = to_array(a,
                                 maxlen=self.max_sequence_length,
                                 use_mask=True)
            x_data = [q, q_mask, a, a_mask]
            if self.match_type == 'pairwise':
                # Pairwise nets take a duplicated answer slot; targets
                # become per-row (q, a, a) tuples (identity-style loss).
                x_data = x_data + [a, a_mask]
                y = [l for l in zip(*[q, a, a])]

        else:
            q = to_array(q, maxlen=self.max_sequence_length, use_mask=False)
            a = to_array(a, maxlen=self.max_sequence_length, use_mask=False)
            x_data = [q, a]
            if self.match_type == 'pairwise':
                x_data = x_data + [a]
                y = [l for l in zip(*[q, a, a])]
        if overlap_feature:
            # NOTE(review): zip(*x_data) yields len(x_data)-tuples, so the
            # 2-name unpacking only works when x_data == [q, a]; with BERT
            # masks or pairwise columns this would raise — verify.
            overlap = [
                self.overlap_index(q_seq, a_seq)
                for q_seq, a_seq in zip(*x_data)
            ]
            # NOTE(review): this appends each overlap row as a separate
            # column; `x_data + [overlap]` may have been intended.
            x_data = x_data + overlap

        if iterable:
            # Transpose column-wise x_data into per-sample tuples.
            x = [l for l in zip(*x_data)]
            data = (x, y)
            # NOTE(review): the `shuffle` argument is ignored here; the
            # iterator always shuffles — confirm intended.
            return BucketIterator(data,
                                  batch_size=self.batch_size,
                                  batch_num=int(self.num_samples /
                                                self.batch_size),
                                  shuffle=True)
        else:
            return x_data, y
Ejemplo n.º 4
0
 def transformKeras(self, data):
     """Normalize a list of data columns for Keras consumption.

     A column that is a ragged list of integer sequences (list of lists
     of ints) is padded with ``to_array``; anything else (numpy arrays,
     flat label lists, empty columns) is passed through ``np.asarray``.

     Fix: guard against empty columns — the original indexed ``i[0]``
     unconditionally and raised IndexError on an empty list.
     """
     list_of_data = []
     for col in data:
         # Pad only genuine ragged int-sequence columns; the length
         # guards make empty outer/inner lists fall through safely.
         if len(col) > 0 and type(col) != np.ndarray \
                 and type(col[0]) != int \
                 and len(col[0]) > 0 and type(col[0][0]) == int:
             list_of_data.append(
                 to_array(col, self.max_sequence_length, use_mask=False))
         else:
             list_of_data.append(np.asarray(col))

     return list_of_data
Ejemplo n.º 5
0
 def get_test(self, shuffle=True, iterable=True, max_sequence_length=0):
     """Return the test split as a BucketIterator or as padded arrays.

     Parameters
     ----------
     shuffle : bool
         Whether the returned iterator shuffles batches.
     iterable : bool
         If True return a BucketIterator; otherwise return arrays directly.
     max_sequence_length : int
         Pad/truncate length; 0 means "use the reader-wide default".

     Returns
     -------
     BucketIterator when ``iterable`` is True; otherwise
     ``([x, x_mask], y)`` when BERT is enabled, else ``(x, y)``.
     """
     x = self.datas['test']['X']
     x = [self.embedding.text_to_sequence(sent) for sent in x]
     y = to_categorical(np.asarray(self.datas['test']['y']))
     data = (x, y)
     if max_sequence_length == 0:
         # 0 is the sentinel for "use the reader default".
         max_sequence_length = self.max_sequence_length
     if iterable:
         # Fix: honour the caller's `shuffle` flag (was hard-coded True).
         return BucketIterator(data,
                               batch_size=self.batch_size,
                               shuffle=shuffle,
                               max_sequence_length=max_sequence_length)
     if self.bert_enabled:
         # Fix: use the resolved max_sequence_length instead of always
         # falling back to self.max_sequence_length.
         x, x_mask = to_array(x,
                              maxlen=max_sequence_length,
                              use_mask=True)
         return [x, x_mask], y
     x = to_array(x,
                  maxlen=max_sequence_length,
                  use_mask=False)
     return x, y
Ejemplo n.º 6
0
 def transformTF(self, data):
     """Pad ragged int-sequence columns for TF; pass others through.

     Columns that are lists of integer sequences are padded via
     ``to_array``; numpy arrays and flat int lists are returned as-is.
     """
     transformed = []
     for col in data:
         # Same short-circuit order as before: the int check runs first.
         is_ragged = type(col[0]) != int and type(col) != np.ndarray
         transformed.append(
             to_array(col, self.max_sequence_length) if is_ragged else col)
     return transformed
Ejemplo n.º 7
0
 # NOTE(review): fragment of a larger training routine — the enclosing
 # function and the names `params`, `reader`, `test_data`, `parameter`,
 # `units`, `models`, `evaluations` come from outside this view.
 qdnn = models.setup(params)
 model = qdnn.getModel()
 
     
 #    model.compile(loss = rank_hinge_loss({'margin':0.2}),
 #                optimizer = units.getOptimizer(name=params.optimizer,lr=params.lr),
 #                metrics=['accuracy'])
     
     
 #    test_data.append(test_data[0])
 print(parameter)
 evaluations=[]
 if params.match_type == 'pointwise':
     if params.onehot:
         # Presumably an empirically tuned boost for one-hot training —
         # TODO confirm.
         params.lr = 10 *params.lr
     # Pad every test column to the reader's fixed sequence length.
     test_data = [to_array(i,reader.max_sequence_length) for i in test_data]
     # One-hot targets use hinge loss/accuracy; scalar targets use MSE.
     loss_type,metric_type = ("categorical_hinge","acc") if params.onehot else ("mean_squared_error","mean_squared_error")
     model.compile(loss =loss_type, #""
             optimizer = units.getOptimizer(name=params.optimizer,lr=params.lr),
             metrics=[metric_type])
     # Train one epoch at a time so ranking metrics can be computed on
     # the test split after every epoch.
     for i in range(params.epochs):
         if "unbalance" in  params.__dict__ and params.unbalance:
             model.fit_generator(reader.getPointWiseSamples4Keras(onehot = params.onehot,unbalance=params.unbalance),epochs = 1,steps_per_epoch=int(len(reader.datas["train"])/reader.batch_size),verbose = True)        
         else:
             model.fit_generator(reader.getPointWiseSamples4Keras(onehot = params.onehot),epochs = 1,steps_per_epoch=len(reader.datas["train"]["question"].unique())/reader.batch_size,verbose = True)        
         y_pred = model.predict(x = test_data) 
         # For one-hot outputs, rank by the positive-class probability.
         score =batch_softmax_with_first_item(y_pred)[:,1]  if params.onehot else y_pred
         
         metric = reader.evaluate(score, mode = "test")
         evaluations.append(metric)
         print(metric)
Ejemplo n.º 8
0
def run(params):
    """Train and evaluate the model described by ``params``.

    Builds the network via ``models.setup``, resolves loss/metric names
    (looked up in ``loss.pairwise_loss`` first, otherwise passed to Keras
    verbatim), then dispatches on ``params.dataset_type``:

    - 'qa': epoch-by-epoch fit_generator with ranking metrics
      (map/mrr/p1) computed on the test split after every epoch.
    - 'classification': single ``model.fit`` on train/test with a final
      evaluation on the validation split.

    Relies on module-level globals (``reader``, ``models``, ``units``,
    ``loss``, ``logger``, ``K``) defined elsewhere in the file.
    """
    if "bert" in params.network_type.lower() :
        # BERT backbones use a fixed 512-token sequence length.
        params.max_sequence_length = 512
        reader.max_sequence_length = 512
    evaluation=[]
#    params=dataset.classification.process_embedding(reader,params)    
    qdnn = models.setup(params)
    model = qdnn.getModel()
    model.summary()
    # Loss may name a helper in loss.pairwise_loss; otherwise the string
    # itself is handed to Keras.
    if hasattr(loss.pairwise_loss, params.loss): 
            
        loss_func = getattr(loss.pairwise_loss, params.loss)
    else:
        loss_func = params.loss
    optimizer = units.getOptimizer(name=params.optimizer,lr=params.lr)
    
    test_data = params.reader.get_test(iterable = False)
    # Pad every test column to the reader's fixed sequence length.
    test_data = [to_array(i,reader.max_sequence_length) for i in test_data]
    # Metric resolved the same way as the loss.
    if hasattr(loss.pairwise_loss, params.metric_type):
        metric_func = getattr(loss.pairwise_loss, params.metric_type)
    else:
        metric_func = params.metric_type
    
    model.compile(loss = loss_func, #""
                      optimizer = optimizer,
                      metrics=[metric_func])
    # pairwise:
    # loss = identity_loss
    # metric = precision_batch

    # pointwise:
    # loss = categorical_hinge or mean_squared_error
    # metric = acc or mean_squared_error
    
    # classification:
    # loss = mean_squared_error
    # matrix = acc
      
    if params.dataset_type == 'qa':
#        from models.match import keras as models   
        # One manual epoch per iteration so ranking metrics can be
        # recomputed on the test split after each epoch.
        for i in range(params.epochs):
            model.fit_generator(reader.batch_gen(reader.get_train(iterable = True)),epochs = 1,steps_per_epoch=int(len(reader.datas["train"])/reader.batch_size),verbose = True)        
            y_pred = model.predict(x = test_data) 
            # For one-hot outputs, rank by the positive-class probability.
            score = batch_softmax_with_first_item(y_pred)[:,1]  if params.onehot else y_pred
                
            metric = reader.evaluate(score, mode = "test")
            evaluation.append(metric)
            print(metric)
            logger.info(metric)
        df=pd.DataFrame(evaluation,columns=["map","mrr","p1"]) 

            
    elif params.dataset_type == 'classification':
#        from models import representation as models   
        
    #    model.summary()    
        train_data = params.reader.get_train(iterable = False)
        test_data = params.reader.get_test(iterable = False)
        val_data =params.reader.get_val(iterable = False)
    #    (train_x, train_y),(test_x, test_y),(val_x, val_y) = reader.get_processed_data()
        train_x, train_y = train_data
        test_x, test_y = test_data
        val_x, val_y = val_data
        if "bert" in params.network_type.lower() :
            # BERT inputs come with attention masks.
            train_x, train_x_mask = to_array(train_x,reader.max_sequence_length,use_mask=True) 
            test_x,test_x_mask =  to_array(test_x,reader.max_sequence_length,use_mask=True)
            val_x,val_x_mask =  to_array(val_x,reader.max_sequence_length,use_mask=True)
                #pretrain_x, pretrain_y = dataset.get_sentiment_dic_training_data(reader,params)
            #model.fit(x=pretrain_x, y = pretrain_y, batch_size = params.batch_size, epochs= 3,validation_data= (test_x, test_y))
        
            history = model.fit(x=[train_x,train_x_mask], y = train_y, batch_size = params.batch_size, epochs= params.epochs,validation_data= ([test_x,test_x_mask], test_y))
        
            metric = model.evaluate(x = [val_x,val_x_mask], y = val_y)   # !!!!!! change the order to val and test myzip(
        else:
            train_x = to_array(train_x,reader.max_sequence_length,use_mask=False) 
            test_x =  to_array(test_x,reader.max_sequence_length,use_mask=False)
            val_x =  to_array(val_x,reader.max_sequence_length,use_mask=False)
            #pretrain_x, pretrain_y = dataset.get_sentiment_dic_training_data(reader,params)
            #model.fit(x=pretrain_x, y = pretrain_y, batch_size = params.batch_size, epochs= 3,validation_data= (test_x, test_y))
        
            history = model.fit(x=train_x, y = train_y, batch_size = params.batch_size, epochs= params.epochs,validation_data= (test_x, test_y))
        
            metric = model.evaluate(x = val_x, y = val_y)   # !!!!!! change the order to val and test
            
        evaluation.append(metric)
        logger.info(metric)
        print(metric)

        # NOTE(review): classification metrics are labelled map/mrr/p1
        # even though evaluate() returns loss/accuracy — confirm intent.
        df=pd.DataFrame(evaluation,columns=["map","mrr","p1"])  
        
    # NOTE(review): if dataset_type is neither 'qa' nor 'classification',
    # `df` is never assigned and the next line raises NameError — verify.
    logger.info("\n".join([params.to_string(),"score: "+str(df.max().to_dict())]))

    K.clear_session()
Ejemplo n.º 9
0
    def batch_gen(self, data_generator):
        """Adapt bucketed batches into (inputs, targets) pairs for Keras.

        Pointwise matching:
          - unbalanced sampling: re-bucketizes the raw train rows itself
            and yields (q, a) inputs with binary (optionally one-hot)
            labels, adding attention masks when BERT is enabled;
          - otherwise: consumes (q, a, neg) triples from
            ``data_generator`` and doubles them into labelled pairs
            (q/a -> positive, q/neg -> negative), looping forever.

        Pairwise matching: forwards (q, a, neg) triples, expanding BERT
        inputs with masks; targets mirror the inputs (identity-style
        pairwise loss).
        """
        if self.match_type == 'pointwise':
            #            self.unbalanced_sampling = False
            if self.unbalanced_sampling:
                #                print('system goes here!!')
                # Encode each train row as [question_seq, answer_seq, flag].
                process = lambda row: [
                    self.embedding.text_to_sequence(row["question"]),
                    self.embedding.text_to_sequence(row["answer"]), row['flag']
                ]
                samples = self.datas["train"].apply(process, axis=1)
                for batch in BucketIterator(
                    [i for i in zip(*samples.values)],
                        batch_size=self.batch_size,
                        shuffle=True,
                        max_sequence_length=self.max_sequence_length):
                    if self.onehot:
                        if self.bert_enabled:

                            q, q_mask = to_array(batch[0],
                                                 self.max_sequence_length,
                                                 use_mask=True)
                            a, a_mask = to_array(batch[1],
                                                 self.max_sequence_length,
                                                 use_mask=True)
                            # One-hot encode the binary flag: 1 -> [0, 1].
                            yield [q, q_mask, a,
                                   a_mask], np.array([[0, 1] if i else [1, 0]
                                                      for i in batch[2]])
                        else:
                            yield batch[:2], np.array([[0, 1] if i else [1, 0]
                                                       for i in batch[2]])
                    else:

                        if self.bert_enabled:
                            q, q_mask = to_array(batch[0],
                                                 self.max_sequence_length,
                                                 use_mask=True)
                            a, a_mask = to_array(batch[1],
                                                 self.max_sequence_length,
                                                 use_mask=True)
                            yield [q, q_mask, a, a_mask], np.array(batch[2])
                        else:
                            yield batch[:2], np.array(batch[2])
            else:
                # Infinite generator, as fit_generator expects.
                while True:
                    for batch in data_generator:
                        q, a, neg = batch
                        if self.onehot:
                            # Duplicate q against (a, neg): first half of
                            # the batch is positive, second half negative.
                            data = [[
                                np.concatenate([q, q], 0).astype(int),
                                np.concatenate([a, neg], 0).astype(int)
                            ],
                                    np.array([[0, 1]] * len(q) +
                                             [[1, 0]] * len(q))]
                        else:
                            data = [[
                                np.concatenate([q, q], 0).astype(int),
                                np.concatenate([a, neg], 0).astype(int)
                            ], [1] * len(q) + [0] * len(q)]
                        yield data

        if self.match_type == 'pairwise':
            while True:
                for batch in data_generator:
                    if self.bert_enabled:

                        q, q_mask = to_array(batch[0],
                                             self.max_sequence_length,
                                             use_mask=True)
                        a, a_mask = to_array(batch[1],
                                             self.max_sequence_length,
                                             use_mask=True)
                        neg_a, neg_a_mask = to_array(batch[2],
                                                     self.max_sequence_length,
                                                     use_mask=True)

                        # Targets mirror inputs — identity-style loss.
                        yield [q, q_mask, a, a_mask, neg_a,
                               neg_a_mask], [q, a, neg_a]
                    else:
                        yield batch, batch
Ejemplo n.º 10
0
    def get_train_2(self,
                    shuffle=True,
                    iterable=True,
                    max_sequence_length=0,
                    overlap_feature=False,
                    sampling_per_question=False,
                    need_balanced=False,
                    always=False,
                    balance_temperature=1):
        """Build training data, per-question sampled or from the whole split.

        With ``sampling_per_question`` each positive answer of a question
        is paired with a randomly drawn negative from the same question
        group (pairwise triples, or pos/neg pointwise pairs). Otherwise
        the whole train split is used verbatim (pointwise only).

        Side effect: sets ``self.num_samples`` to the number of produced
        training samples.

        Returns a BucketIterator (as an iterator) when ``iterable`` is
        True, else ``(x_data, y)`` column arrays.
        """

        x_data = []
        num_samples = 0
        if sampling_per_question:
            #sampling on a per-question basis
            q = []
            pos_a = []
            neg_a = []
            a = []
            overlap_pos = []
            overlap_neg = []
            y = []
            for question, group in self.datas["train"].groupby("question"):
                seq_q = self.embedding.text_to_sequence(question)
                pos_answers = group[group["flag"] == 1]["answer"]
                neg_answers = group[group["flag"] == 0][
                    "answer"]  #.reset_index()
                # Questions lacking either a positive or a negative answer
                # cannot form a training pair/triple.
                if len(pos_answers) == 0 or len(neg_answers) == 0:
                    continue

                for pos in pos_answers:

                    seq_pos_a = self.embedding.text_to_sequence(pos)
                    # One random negative per positive answer.
                    neg_index = np.random.choice(neg_answers.index)
                    neg = neg_answers.loc[neg_index, ]
                    seq_neg_a = self.embedding.text_to_sequence(neg)
                    if self.match_type == 'pointwise':
                        # Pointwise: emit two labelled samples per pair.
                        q = q + [seq_q, seq_q]
                        a = a + [seq_pos_a, seq_neg_a]
                        y = y + [1, 0]
                        num_samples = num_samples + 2
                    else:
                        q.append(seq_q)
                        num_samples = num_samples + 1

                    pos_a.append(seq_pos_a)
                    neg_a.append(seq_neg_a)
                    if overlap_feature:
                        overlap_pos.append(self.overlap_index(
                            seq_q, seq_pos_a))
                        overlap_neg.append(self.overlap_index(
                            seq_q, seq_neg_a))

            if self.bert_enabled:
                q, q_mask = to_array(q,
                                     maxlen=self.max_sequence_length,
                                     use_mask=True)
                if self.match_type == 'pairwise':
                    pos_a, pos_a_mask = to_array(
                        pos_a, maxlen=self.max_sequence_length, use_mask=True)
                    neg_a, neg_a_mask = to_array(
                        neg_a, maxlen=self.max_sequence_length, use_mask=True)
                    x_data = [q, q_mask, pos_a, pos_a_mask, neg_a, neg_a_mask]
                    # Pairwise targets are per-row (q, pos, neg) tuples
                    # (identity-style loss).
                    y = [l for l in zip(*[q, pos_a, neg_a])]
                else:
                    y = to_categorical(np.asarray(y))
                    a, a_mask = to_array(a,
                                         maxlen=self.max_sequence_length,
                                         use_mask=True)
                    x_data = [q, q_mask, a, a_mask]
            else:
                q = to_array(q,
                             maxlen=self.max_sequence_length,
                             use_mask=False)
                if self.match_type == 'pairwise':
                    pos_a = to_array(pos_a,
                                     maxlen=self.max_sequence_length,
                                     use_mask=False)
                    neg_a = to_array(neg_a,
                                     maxlen=self.max_sequence_length,
                                     use_mask=False)
                    x_data = [q, pos_a, neg_a]
                    y = [l for l in zip(*[q, pos_a, neg_a])]

                else:
                    y = to_categorical(np.asarray(y))
                    a = to_array(a,
                                 maxlen=self.max_sequence_length,
                                 use_mask=False)
                    x_data = [q, a]
            if overlap_feature:
                x_data = x_data + [overlap_pos, overlap_neg]

        else:
            num_samples = int(len(self.datas["train"]))
            #sample on the whole data, only support pointwise match type: x=[q,pos_a],y
            assert self.match_type == 'pointwise'

            q = self.datas["train"]["question"]
            a = self.datas["train"]["answer"]
            y = self.datas["train"]["flag"]
            q = [self.embedding.text_to_sequence(sent) for sent in q]
            #        q = to_array(q,maxlen = self.max_sequence_length, use_mask = False)
            a = [self.embedding.text_to_sequence(sent) for sent in a]
            #        a = to_array(a,maxlen = self.max_sequence_length, use_mask = False)
            y = to_categorical(np.asarray(y))

            # NOTE(review): max_sequence_length is resolved here but the
            # to_array calls below still use self.max_sequence_length —
            # confirm the override should apply.
            if max_sequence_length == 0:
                max_sequence_length = self.max_sequence_length
            if self.bert_enabled:
                q, q_mask = to_array(q,
                                     maxlen=self.max_sequence_length,
                                     use_mask=True)
                a, a_mask = to_array(a,
                                     maxlen=self.max_sequence_length,
                                     use_mask=True)
                x_data = [q, q_mask, a, a_mask]
            else:
                q = to_array(q,
                             maxlen=self.max_sequence_length,
                             use_mask=False)
                a = to_array(a,
                             maxlen=self.max_sequence_length,
                             use_mask=False)
                x_data = [q, a]

            if overlap_feature:
                # NOTE(review): 2-name unpacking of zip(*x_data) only works
                # when x_data == [q, a]; with BERT masks this raises.
                # Also `x_data + overlap` appends rows, not a column —
                # `x_data + [overlap]` may have been intended. Verify.
                overlap = [
                    self.overlap_index(q_seq, a_seq)
                    for q_seq, a_seq in zip(*x_data)
                ]
                x_data = x_data + overlap

        self.num_samples = num_samples

        if iterable:
            # Transpose column-wise x_data into per-sample tuples.
            x = [l for l in zip(*x_data)]
            data = (x, y)
            # NOTE(review): the `shuffle` argument is ignored; the
            # iterator always shuffles — confirm intended.
            return BucketIterator(
                data,
                batch_size=self.batch_size,
                shuffle=True,
                need_balanced=need_balanced,
                always=always,
                balance_temperature=balance_temperature).__iter__()
#            return BucketIterator(data,batch_size=self.batch_size, batch_num = int(self.num_samples/self.batch_size),shuffle=True,need_balanced=need_balanced,always=always).__iter__()
        else:
            return x_data, y