def predict(self,dataset,summaries,groups,test_groups):
        """Return the linear-regression rewards for the given summaries.

        When test_groups is None (cross-topic setting), the topic ids are
        recovered from the group labels that mention this dataset.
        """
        if test_groups is None:
            # cross-topic: collect the topic part of every matching group label
            topic_ids = sorted({label.split('-')[1]
                                for label in groups if dataset in label})
            feats = readFeatures(self.feature_types, dataset, summaries,
                                 groups, topic_ids)
        else:
            feats = readFeatures(self.feature_types, dataset, summaries,
                                 groups, test_groups)
        return self.lin_reg.predict(feats)
    def train(self,dataset,summaries,groups,train_groups,targets):
        """Fit the linear regression model on the extracted features."""
        if train_groups is None:
            # cross-topic: stack the feature rows of every dataset in turn
            features = None
            for ds_name in dataset:
                # NOTE: topic ids are kept in group order, duplicates included
                topic_ids = [label.split('-')[1]
                             for label in groups if ds_name in label]
                block = readFeatures(self.feature_types, ds_name, summaries,
                                     groups, topic_ids)
                features = (np.copy(block) if features is None
                            else np.append(features, block, axis=0))
        else:
            features = readFeatures(self.feature_types, dataset, summaries,
                                    groups, train_groups)

        self.lin_reg.fit(features, targets)
    def predict(self, dataset, summaries, groups, test_groups):
        """Score summaries with the best torch weights found during training."""
        if test_groups is None:
            # cross-topic: recover topic ids from this dataset's group labels
            topic_ids = sorted({label.split('-')[1]
                                for label in groups if dataset in label})
            feats = readFeatures(self.feature_types, dataset, summaries,
                                 groups, topic_ids)
        else:
            feats = readFeatures(self.feature_types, dataset, summaries,
                                 groups, test_groups)

        w = self.best_weights['combination.weight'].data.numpy()
        b = self.best_weights['combination.bias'].data.numpy()
        if 'entropy' in self.loss_type:
            # preference model: score via the difference of the two classes
            scores = np.dot(feats, w[1] - w[0]) + b[1] - b[0]
        else:
            scores = np.dot(feats, w.reshape(-1, 1)) + b
        return normaliseList([row[0] for row in scores])
    def predict(self, dataset, summaries, groups, test_groups):
        """Score summaries as a dot product with the learnt weight vector.

        When test_groups is None (cross-topic setting), topic ids are
        derived from the group labels that mention this dataset.
        Returns the normalised list of linear scores.
        """
        if test_groups is not None:
            features = readFeatures(self.feature_types, dataset, summaries,
                                    groups, test_groups)
        else:
            topics = sorted(
                set([gg.split('-')[1] for gg in groups if dataset in gg]))
            features = readFeatures(self.feature_types, dataset, summaries,
                                    groups, topics)
        # (removed a dead no-op triple-quoted block that held a commented-out
        # random-feature experiment)

        aa = np.dot(features, np.array(self.best_weights))
        return normaliseList(aa)
    def train(self,
              dataset,
              summaries,
              groups,
              train_groups,
              targets,
              lrate,
              edge='no',
              sorted_idxs=None):
        """Train the torch reward model (preference or regression).

        Args:
            dataset: dataset name (in-topic) or iterable of names (cross-topic).
            summaries: candidate summaries, parallel to `groups`.
            groups: per-summary '<dataset>-<topic>' labels.
            train_groups: training group names, or None for cross-topic.
            targets: numpy array of reference rewards, parallel to features.
            lrate: learning rate handed to the per-loss train steps.
            edge: 'no' for random pair sampling, otherwise an edge-sampling
                spec. Fixed: the default used to be False, which crashed on
                `edge.lower()` for every non-mse loss.
            sorted_idxs: precomputed sorted indices for edge sampling.
        """
        if train_groups is not None:  ### in topic
            features = readFeatures(self.feature_types, dataset, summaries,
                                    groups, train_groups)
            assert features.shape[0] % len(train_groups) == 0
        else:  ### cross-topic
            features = None
            all_topics = []
            for dd in dataset:
                topics = sorted(
                    set([gg.split('-')[1] for gg in groups if dd in gg]))
                all_topics.extend(topics)
                ff = readFeatures(self.feature_types, dd, summaries, groups,
                                  topics)
                if features is None:
                    features = np.copy(ff)
                else:
                    features = np.append(features, ff, axis=0)

        assert features.shape[0] == targets.shape[0]

        ### initialise model
        if 'entropy' in self.loss_type:
            self.model = PrefModel(features.shape[1])
        else:
            self.model = RegModel(features.shape[1])

        ### select 15% data as dev (dev_groups holds INDICES into avai_groups)
        dev_groups = []
        if train_groups is not None:
            avai_groups = train_groups
        else:
            avai_groups = all_topics
        while len(dev_groups) < 0.15 * len(avai_groups):
            gg = random.randint(0, len(avai_groups) - 1)
            if gg not in dev_groups:
                dev_groups.append(gg)
        train_features, train_targets, dev_features, dev_targets = self.getTrainDevData(
            features, len(avai_groups), dev_groups, targets)

        ### start training
        dev_results = []
        weights = []
        for epoch in range(int(self.epoch_num)):
            feature_list = []
            pref_list = []
            for batch in range(self.batch_size):
                # mse trains directly on (feature, target) batches below
                if 'mse' in self.loss_type: break
                # guard against edge=False/None/'' before calling .lower()
                if (not edge) or 'no' in edge.lower():
                    delta_feature, pref = self.randomPairSampler(
                        features, len(avai_groups), dev_groups, targets)
                else:
                    delta_feature, pref = self.edgePairSampler(
                        features, len(avai_groups), dev_groups, targets,
                        sorted_idxs, edge)
                feature_list.append(delta_feature)
                pref_list.append(pref)
            if 'entropy' in self.loss_type:
                self.entropy_train(np.array(feature_list), np.array(pref_list),
                                   lrate)
            elif 'hinge' in self.loss_type:
                self.hinge_train(np.array(feature_list), np.array(pref_list),
                                 lrate)
            elif 'relative' in self.loss_type:
                self.rel_train(np.array(feature_list), np.array(pref_list),
                               lrate)
            else:
                assert 'mse' in self.loss_type
                # mini-batch sweep over the whole training split
                pointer = 0
                while pointer < len(train_features):
                    ff = train_features[
                        pointer:min(pointer +
                                    self.batch_size, len(train_features))]
                    tt = train_targets[
                        pointer:min(pointer +
                                    self.batch_size, len(train_features))]
                    self.mse_train(np.array(ff), np.array(tt), lrate)
                    pointer += self.batch_size

            devr = self.evaluateOnDev(dev_groups, dev_features, dev_targets)
            dev_results.append(devr)
            print('epoch {}, loss {}'.format(epoch, devr))
            weights.append(copy.deepcopy(self.model.state_dict()))

        # keep the weights of the epoch with the lowest dev loss
        self.best_weights = weights[dev_results.index(min(dev_results))]
# Example #6
# 0
def cv(datasets,
       features,
       summaries,
       targets,
       topic_lists,
       gg,
       sentences_of_topics,
       sorted_idxs_list,
       learner_type='linear-pref',
       cv_fold_num=10,
       round=1e6,
       epoch=10,
       edge_sampling='no',
       validation_size=0.1,
       cnn_args={}):
    """Leave-one-dataset-out cross-validation of a reward learner.

    For each dataset in `datasets`, a rewarder is trained on the remaining
    datasets and evaluated topic-by-topic on the held-out one.

    Args:
        datasets: list of dataset names; one fold per dataset.
        features: feature-type names (also the cnn's extra input features).
        summaries: all candidate summaries, parallel to `gg`/`targets`.
        targets: numpy array of reference rewards, parallel to `summaries`.
        topic_lists: per-dataset list of topic names (parallel to `datasets`).
        gg: per-summary group label of the form '<dataset>-<topic>'.
        sentences_of_topics: sentence lookup for tokenising summaries (cnn).
        sorted_idxs_list: per-summary sorted-index info for pair sampling.
        learner_type: 'cnn', '*reg*' or '*pref*' learner selector.
        cv_fold_num: unused here (the folds are defined by `datasets`).
        round: number of preference pairs to sample (shadows builtin `round`).
        epoch: training epochs.
        edge_sampling: 'no' for random pair sampling, otherwise edge sampling.
        validation_size: fraction of training topics used as dev set (cnn).
        cnn_args: cnn hyper-parameters.

    Returns:
        dict mapping each held-out dataset name to its learnt rewards.
    """
    print('\n---Data reading finished. Now cross-validation starts---\n')
    print('Features used: {}'.format(features))
    cv_cnt = 0

    rewards_dic = {}

    if 'cnn' in learner_type:
        token2idx = read_duc_token2idx()
        summaries_tokens = actions_to_idx(summaries, gg, sentences_of_topics,
                                          token2idx)
        longest_summary = max([len(summary) for summary in summaries_tokens])
        summaries_tokens = np.array([
            summary + [0] * (longest_summary - len(summary))
            for summary in summaries_tokens
        ])
        if cnn_args['feature_count'] > 0:
            # fixed: accumulate into feature_matrix — the old code tested
            # `features is None` (never true, `features` is the type list)
            # and tried np.append(..., axis=0) onto an empty Python list
            feature_matrix = None
            for dd in datasets:
                topics = sorted(
                    set([label.split('-')[1] for label in gg if dd in label]))
                ff = readFeatures(features, dd, summaries, gg, topics)
                if feature_matrix is None:
                    feature_matrix = np.copy(ff)
                else:
                    feature_matrix = np.append(feature_matrix, ff, axis=0)

    ### cross validation: hold out one dataset per fold
    for ii in range(len(datasets)):
        all_result_dic = OrderedDict()
        test_ds = datasets[ii]
        train_ds = np.array(datasets)[[
            i for i in range(len(datasets)) if i != ii
        ]]

        test = np.array([test_ds in g for g in gg])
        train = np.array([not tt for tt in test])

        cv_cnt += 1  # fixed: the fold counter was never incremented
        print('\n=====CV Fold {}, TRAIN {}, TEST {}====='.format(
            cv_cnt, train_ds, test_ds))

        if 'cnn' in learner_type:
            # fixed: `topic_list` was undefined — the parameter is topic_lists
            validation_groups = [
                '{}-{}'.format(datasets[i], topic)
                for i in range(len(datasets)) if i != ii
                for topic in topic_lists[i][0:int(0.5 * validation_size *
                                                  len(topic_lists[i]))]
            ]
            validation = np.array([g in validation_groups for g in gg])
            train = np.array(
                [not tt and not vv for tt, vv in zip(test, validation)])

            embedding_dim = 300
            filter_sizes = cnn_args['filter_sizes']
            filter_map_size = cnn_args['filter_count']
            max_out_of_filter_maps = 1
            max_out_of_all = cnn_args['final_max_pool']
            feature_count = cnn_args['feature_count']
            criteria = cnn_args['criteria']
            p = 0.5  # dropout probability
            pretrained_embedding = cnn_args[
                'pretrained_embedding'] if 'pretrained_embedding' in cnn_args else ""

            if feature_count > 0:
                feature_train = feature_matrix[train]
                feature_val = feature_matrix[validation]
            else:
                feature_train = None
                feature_val = None

            rewarder = cnn.CNNRewarder(
                len(token2idx) + 1, embedding_dim, filter_sizes,
                filter_map_size, max_out_of_filter_maps, max_out_of_all,
                feature_count, p, pretrained_embedding)

            # fixed: the per-summary group labels live in `gg` here (this
            # signature has no `groups` argument, unlike the sibling cv())
            gg_arr = np.asarray(gg)
            if criteria == 'mse':
                trainset = Dataset(summaries_tokens[train], targets[train],
                                   feature_train)
                validationset = Dataset(summaries_tokens[validation],
                                        targets[validation], feature_val)
            elif criteria == 'margin' or criteria == 'margin_rel' or criteria == 'cross_entropy':
                trainset = PairDataset(int(round), summaries_tokens[train],
                                       targets[train], gg_arr[train],
                                       feature_train, sorted_idxs_list[train],
                                       cnn_args['sampling'])
                validationset = PairDataset(int(round * validation_size),
                                            summaries_tokens[validation],
                                            targets[validation],
                                            gg_arr[validation], feature_val,
                                            sorted_idxs_list[validation],
                                            cnn_args['sampling'])
            elif criteria == 'warp':
                n = cnn_args['warp_samples']
                trainset = TupleDataset(n, round, summaries_tokens[train],
                                        targets[train], gg_arr[train],
                                        feature_train)
                validationset = TupleDataset(n, int(round * validation_size),
                                             summaries_tokens[validation],
                                             targets[validation],
                                             gg_arr[validation], feature_val)
            cnn.train(rewarder,
                      trainset,
                      validationset,
                      epoches=epoch,
                      batch_size=200,
                      criteria=criteria)

        elif 'reg' in learner_type:
            rewarder = LinearRegRewarder(features)
            rewarder.train(train_ds, summaries, gg, None, targets[train])
        elif 'pref' in learner_type:
            rewarder = PrefRewarder(features, round,
                                    learner_type.split('-')[0])
            rewarder.train(train_ds, summaries, gg, None, targets[train],
                           epoch)

        ### test on the held-out dataset
        if learner_type == 'cnn':
            test_features = torch.from_numpy(
                feature_matrix[test]) if feature_count > 0 else None
            learnt_rewards = cnn.predict(rewarder,
                                         torch.from_numpy(
                                             summaries_tokens[test]),
                                         features=test_features,
                                         use_best_model=True,
                                         batch_size=200,
                                         to_numpy=True)
        else:
            learnt_rewards = rewarder.predict(test_ds, summaries, gg, None)
        rewards_dic[test_ds] = learnt_rewards
        topics = topic_lists[ii]
        for it, tt in enumerate(topics):
            # NOTE(review): getTopicReward's expected arguments are opaque
            # from here — confirm it accepts the fold's topic list
            rr = getTopicReward(learnt_rewards, topics, tt)
            test_result = evaluateReward(
                list(rr), list(targets[[tt in label for label in gg]]), True)
            addResult(all_result_dic, test_result)
            print('---Test Results, {} TOPIC {}---'.format(test_ds, tt))
            for metric in test_result:
                print('{} : {}'.format(metric, test_result[metric]))
        print('\n===AVERAGE REWARD QUALITY FOR {}==='.format(test_ds))
        print('features {}, learner {}, sample pair num {}, edge {}, epoch {}'.
              format(features, learner_type, round, edge_sampling, epoch))
        print('cnn args {}'.format(cnn_args))
        for metric in all_result_dic:
            print('{}-mean : {}'.format(metric,
                                        np.mean(all_result_dic[metric])))
            print('{}-std: {}'.format(metric, np.std(all_result_dic[metric])))

    return rewards_dic
# Example #7
# 0
def cv(dataset,
       features,
       summaries,
       targets,
       groups,
       gg,
       sentences_of_topics,
       sorted_idxs_list,
       learner_type='linear-pref',
       cv_fold_num=2,
       round=1e6,
       epoch=10,
       edge_sampling='no',
       validation_size=0.1,
       cnn_args={}):
    ### store all results
    all_test_reward_dic = OrderedDict()
    pointer = 0
    print('\n---Data reading finished. Now cross-validation starts---\n')
    print('features {}, learner {}, sample pair num {}, edge {}, epoch {}'.
          format(features, learner_type, round, edge_sampling, epoch))
    print('cnn args {}'.format(cnn_args))
    cv_cnt = 0

    rewards_dic = {}

    if 'cnn' in learner_type:
        token2idx = read_duc_token2idx()
        summaries_tokens = actions_to_idx(summaries, groups,
                                          sentences_of_topics, token2idx)
        longest_summary = max([len(summary) for summary in summaries_tokens])
        summaries_tokens = np.array([
            summary + [0] * (longest_summary - len(summary))
            for summary in summaries_tokens
        ])
        if cnn_args['feature_count'] > 0:
            feature_matrix = readFeatures(features, dataset, summaries, groups,
                                          gg)

    ### cross validation
    for ii in range(cv_fold_num):
        test_groups = gg[int(pointer
                             ):min(int(pointer +
                                       len(gg) / float(cv_fold_num)), len(gg))]
        if 'cnn' in learner_type:
            train_groups = sorted(list(set(groups) - set(test_groups)))
            validation_groups = train_groups[0:int(validation_size *
                                                   len(train_groups))]
            train_groups = sorted(
                list(set(train_groups) - set(validation_groups)))
            validation = np.array([ele in validation_groups for ele in groups])
        else:
            train_groups = sorted(list(set(groups) - set(test_groups)))
        train = np.array([ele in train_groups for ele in groups])

        cv_cnt += 1
        pointer = (ii + 1) * len(gg) / float(cv_fold_num)
        print('\n=====CV Fold {}====='.format(cv_cnt))

        if 'cnn' in learner_type:
            embedding_dim = 300
            filter_sizes = cnn_args['filter_sizes']
            filter_map_size = cnn_args['filter_count']
            max_out_of_filter_maps = 1
            max_out_of_all = cnn_args['final_max_pool']
            feature_count = cnn_args['feature_count']
            criteria = cnn_args['criteria']
            pretrained_embedding = cnn_args[
                'pretrained_embedding'] if 'pretrained_embedding' in cnn_args else ""
            p = 0.5
            ae_split = cnn_args['ae_split'] if 'ae_split' in cnn_args else 5

            if feature_count > 0:
                feature_train = feature_matrix[train]
                feature_val = feature_matrix[validation]
            else:
                feature_train = None
                feature_val = None

            rewarder = cnn.CNNRewarder(
                len(token2idx) + 1, embedding_dim, filter_sizes,
                filter_map_size, max_out_of_filter_maps, max_out_of_all,
                feature_count, p, pretrained_embedding)

            if 'mse' in criteria:
                trainset = Dataset(summaries_tokens[train], targets[train],
                                   feature_train)
                validationset = Dataset(summaries_tokens[validation],
                                        targets[validation], feature_val)
            elif 'margin' in criteria or criteria == 'cross_entropy':
                trainset = PairDataset(int(round), summaries_tokens[train],
                                       targets[train], groups[train],
                                       feature_train, sorted_idxs_list[train],
                                       cnn_args['sampling'])
                validationset = PairDataset(int(round * validation_size),
                                            summaries_tokens[validation],
                                            targets[validation],
                                            groups[validation], feature_val,
                                            sorted_idxs_list[validation],
                                            cnn_args['sampling'])
            elif criteria == 'warp':
                n = cnn_args['warp_samples']
                trainset = TupleDataset(n, round, summaries_tokens[train],
                                        targets[train], groups[train],
                                        feature_train)
                validationset = TupleDataset(n, int(round * validation_size),
                                             summaries_tokens[validation],
                                             targets[validation],
                                             groups[validation], feature_val)
            cnn.train(rewarder,
                      trainset,
                      validationset,
                      epoches=epoch,
                      batch_size=200,
                      criteria=criteria,
                      split=ae_split)

        elif 'reg' in learner_type:
            rewarder = LinearRegRewarder(features)
            rewarder.train(dataset, summaries, groups, train_groups,
                           targets[train])
        elif 'pref' in learner_type:
            rewarder = PrefRewarder(features, round,
                                    learner_type.split('-')[0])
            rewarder.train(dataset, summaries, groups, train_groups,
                           targets[train], epoch, edge_sampling,
                           sorted_idxs_list[train])

        ### test
        weights_added = False
        for tg in test_groups:
            test = np.array([ele == tg for ele in groups])
            if learner_type == 'cnn':
                test_features = torch.from_numpy(
                    feature_matrix[test]) if feature_count > 0 else None
                learnt_rewards = cnn.predict(rewarder,
                                             torch.from_numpy(
                                                 summaries_tokens[test]),
                                             features=test_features,
                                             use_best_model=True,
                                             batch_size=200,
                                             to_numpy=True)
            else:
                learnt_rewards = rewarder.predict(dataset, summaries, groups,
                                                  [tg])
            rewards_dic[tg] = learnt_rewards
            test_result = evaluateReward(list(learnt_rewards),
                                         list(targets[test]), True)
            if not weights_added:
                weights_added = True
                if learner_type == 'cnn':
                    test_result[
                        'weights'] = rewarder.combination.weight.data.numpy(
                        ).copy()
                elif 'reg' in learner_type:
                    test_result['weights'] = rewarder.lin_reg.coef_.copy()
                elif 'pref' in learner_type:
                    test_result['weights'] = rewarder.rank_learner.coef_.copy()
            #rmse,temp = plotAgreement(targets[test],learnt_rewards,plot=False,bin_num=bin_num)
            #test_result['rmse-bin{}'.format(bin_num)] = rmse
            #test_result['temperature-bin{}'.format(bin_num)] = temp
            addResult(all_test_reward_dic, test_result)
            print('---Test Results, TOPIC {}---'.format(tg))
            for metric in test_result:
                print('{} : {}'.format(metric, test_result[metric]))

    print('\n====={}, AVERAGE PERFORMANCE OVER {}-FOLD CV====='.format(
        dataset, cv_cnt))
    print('features {}, learner {}, sample pair num {}, edge {}, epoch {}'.
          format(features, learner_type, round, edge_sampling, epoch))
    print('cnn args {}'.format(cnn_args))
    print('---Test Results---')
    for metric in all_test_reward_dic:
        if metric == 'weights':
            print('{} mean : {}'.format(
                metric, np.mean(all_test_reward_dic[metric], 0)))
            print('{} std : {}'.format(metric,
                                       np.std(all_test_reward_dic[metric], 0)))
        else:
            print('{} mean : {}'.format(metric,
                                        np.mean(all_test_reward_dic[metric])))
            print('{} std : {}'.format(metric,
                                       np.std(all_test_reward_dic[metric])))

    return rewards_dic
    reader = CorpusReader(PROCESSED_PATH)
    data = reader.get_data(dataset)

    ### store all results
    all_test_reward_dic = OrderedDict()
    topic_cnt = 0

    feature_type = ['infersent_max']

    ### read data
    for topic, docs, models in data:
        topic_cnt += 1
        summs, ref_values_dic = readSummaries(dataset, topic, 'rouge',
                                              sample_num)
        ref_rewards = aggregateScores(ref_values_dic)
        groups = [topic] * len(summs)

        features = readFeatures(feature_type, dataset, np.array(summs), groups,
                                [topic])
        features = features.reshape(1, -1)[0]
        rr = evaluateReward(list(features), ref_rewards, True)
        print('\n\n===TOPIC {}: {}==='.format(topic_cnt, topic))
        addResult(all_test_reward_dic, rr)
        for metric in rr:
            print('{}:\t{}'.format(metric, rr[metric]))

    print('\n\n===TYPE {} AVERAGE OVER {} TOPICS==='.format(
        feature_type[0], topic_cnt))
    for metric in all_test_reward_dic:
        print('{}:\t{}'.format(metric, rr[metric]))
# Example #9
# 0
def correlation(features, sizes):
    """Print pairwise Pearson correlations between feature columns.

    Reads all DUC2001 summaries, builds the feature matrix plus the rouge
    target column, and reports every column pair whose pcc is below -0.8.

    Args:
        features: list of feature-type names (parallel to `sizes`).
        sizes: number of columns each feature type expands into.
    """
    # one column name per feature dimension, plus the target column
    names = []
    for f, s in zip(features, sizes):
        if s > 1:
            for i in range(s):
                names.append(f + str(i))
        else:
            names.append(f)
    names.append('rouge_reward')

    dataset = 'DUC2001'  ## DUC2001, DUC2002, DUC2004
    sample_num = 9999

    ### read documents and ref. summaries
    reader = CorpusReader(PROCESSED_PATH)
    data = reader.get_data(dataset)

    topic_cnt = 0

    summaries = []
    groups = []
    targets = []

    ### read data
    for topic, docs, models in data:

        print('read DATA {}, TOPIC {}'.format(dataset, topic))
        summs, ref_values_dic = readSummaries(dataset, topic, 'rouge',
                                              sample_num)
        # fixed: report the summaries read for THIS topic, not the running
        # total accumulated so far
        print('num of summaries read: {}'.format(len(summs)))
        ref_rewards = aggregateScores(ref_values_dic)
        summaries.extend(summs)
        groups.extend([topic] * len(summs))
        targets.extend(ref_rewards)
        topic_cnt += 1

    # feature matrix with the reference reward appended as the last column
    allFeatures = readFeatures(features, dataset, np.array(summaries), groups,
                               set(groups))
    allFeatures = np.c_[allFeatures, np.array(targets)]
    correlations = {}
    threshold_correlation = {}
    for col1, col2 in itertools.combinations(range(len(names)), 2):
        pcc = pearsonr(allFeatures[:, col1], allFeatures[:, col2])[0]
        correlations[names[col1] + ' ' + names[col2] + ': pcc = '] = pcc
        # store both orderings for ease of reading
        correlations[names[col2] + ' ' + names[col1] + ': pcc = '] = pcc
        if pcc < -0.8:
            threshold_correlation[names[col1] + ' ' + names[col2] +
                                  ': pcc = '] = pcc
            threshold_correlation[names[col2] + ' ' + names[col1] +
                                  ': pcc = '] = pcc
    #for key in sorted(correlations.keys()):
    #    print(key+str(correlations[key]))
    # fixed: the headline now matches the actual filter (strongly
    # NEGATIVE correlations); it used to claim "pcc >.9"
    print("Pairs with pcc < -0.8")
    for key in sorted(threshold_correlation.keys()):
        print(key + str(threshold_correlation[key]))
    def train(self,
              dataset,
              summaries,
              groups,
              train_groups,
              targets,
              epoch=20,
              edge='no',
              sorted_idxs=None):
        """Train the pairwise rank learner on sampled preference pairs.

        Samples `self.round` preference pairs, refits the rank learner
        every `round/epoch` pairs, and keeps the weight vector with the
        best dev result.

        Args:
            dataset: dataset name (in-topic) or iterable of names
                (cross-topic).
            summaries: candidate summaries, parallel to `groups`.
            groups: per-summary '<dataset>-<topic>' labels.
            train_groups: training group names, or None for cross-topic.
            targets: numpy array of reference rewards, parallel to features.
            epoch: number of periodic refits over the sampling budget.
            edge: 'no' for random pair sampling, otherwise edge sampling.
            sorted_idxs: precomputed sorted indices for edge sampling.
        """
        if train_groups is not None:
            features = readFeatures(self.feature_types, dataset, summaries,
                                    groups, train_groups)
            assert features.shape[0] % len(train_groups) == 0
        else:
            # cross-topic: stack the features of every dataset's topics
            features = None
            all_topics = []
            for dd in dataset:
                topics = sorted(
                    set([gg.split('-')[1] for gg in groups if dd in gg]))
                all_topics.extend(topics)
                ff = readFeatures(self.feature_types, dd, summaries, groups,
                                  topics)
                if features is None:
                    features = np.copy(ff)
                else:
                    features = np.append(features, ff, axis=0)

        assert features.shape[0] == targets.shape[0]
        # (removed a dead no-op triple-quoted block that held a commented-out
        # random-feature experiment)

        ### select 15% data as dev (dev_groups holds INDICES into avai_groups)
        dev_groups = []
        if train_groups is not None:
            avai_groups = train_groups
        else:
            avai_groups = all_topics
        while len(dev_groups) < 0.15 * len(avai_groups):
            gg = random.randint(0, len(avai_groups) - 1)
            if gg not in dev_groups:
                dev_groups.append(gg)
        self.getDevResult(features, len(avai_groups), dev_groups, targets,
                          True)

        cnt = 0
        feature_list = []
        pref_list = []
        dev_results = []
        weights = []
        # refit the learner every `step` sampled pairs
        step = self.round / epoch
        while cnt < self.round:
            if 'no' in edge.lower():
                delta_feature, pref = self.randomPairSampler(
                    features, len(avai_groups), dev_groups, targets)
            else:
                delta_feature, pref = self.edgePairSampler(
                    features, len(avai_groups), dev_groups, targets,
                    sorted_idxs, edge)
            feature_list.append(delta_feature)
            pref_list.append(pref)
            cnt += 1
            if (cnt) % step == 0:
                self.rank_learner.fit(np.array(feature_list),
                                      np.array(pref_list))
                rr = self.getDevResult(features, len(avai_groups), dev_groups,
                                       targets, False)
                dev_results.append(rr)
                print('pair {}, ndcg at 10% {}'.format(cnt, rr))
                weights.append(self.rank_learner.coef_[0])

        if len(weights) == 0:
            # the periodic fit never fired (e.g. round not divisible by
            # epoch): fit once on everything sampled
            self.rank_learner.fit(np.array(feature_list), np.array(pref_list))
            weights.append(self.rank_learner.coef_[0])

        if dev_results:
            # keep the weights with the best dev ndcg
            self.best_weights = weights[dev_results.index(max(dev_results))]
        else:
            # fixed: with no dev results, max([]) used to raise ValueError —
            # fall back to the single fallback-fit weight vector
            self.best_weights = weights[-1]