def rescale(config, online_preds_fp):
        """
        Rescale a vector of online predictions and save the result
        :param config: configuration object; only `config.get('DEFAULT', 'feature_pt')` is read
        :param online_preds_fp: path of the prediction vector, loaded as floats
        :return: None; the rescaled vector is written to `online_preds_fp + '.rescale'`
        """
        def load_test_feature(name):
            # the feature directory is looked up for every feature file
            root = config.get('DEFAULT', 'feature_pt')
            return Feature.load('%s/%s.test.smat' % (root, name)).toarray()

        online_preds = DataUtil.load_vector(online_preds_fp, 'float')
        test_features_mc = load_test_feature('graph_edge_max_clique_size')
        test_features_cc = load_test_feature('graph_edge_cc_size')

        for index in range(len(online_preds)):
            # only the first column of each feature row is inspected
            clique_size = test_features_mc[index][0]
            # pick the (te, tr) pair of the bucket this sample falls into
            # NOTE(review): te / tr look like per-bucket test / train rates -- confirm
            #               against PostProcessor.adj
            if clique_size == 3.:
                te, tr = 0.40883512, 0.623191
            elif clique_size > 3.:
                te, tr = 0.96503024, 0.972554
            elif test_features_cc[index][0] < 3.:
                te, tr = 0.05739666, 0.233473
            else:
                te, tr = 0.04503431, 0.149471
            online_preds[index] = PostProcessor.adj(online_preds[index], te=te, tr=tr)

        DataUtil.save_vector(online_preds_fp + '.rescale', online_preds)
    def __generate_data(indexs, labels, features, positive_rate):
        """
        Build a data subset from the raw data set
        :param indexs: candidate indexs into the raw data set
        :param labels: labels of the whole raw data set, addressable by index
        :param features: feature matrix of the whole raw data set
        :param positive_rate: rate handed to `Feature.balance_index` together with the indexs and labels
        :return: sampled feature matrix, sampled labels, indexs actually used
        """
        # indexs chosen by the balancing step
        chosen_indexs = Feature.balance_index(indexs, labels, positive_rate)
        # labels of the chosen samples, in the order of `chosen_indexs`
        chosen_labels = [labels[chosen] for chosen in chosen_indexs]
        # feature rows of the chosen samples
        chosen_features = Feature.sample_row(features, chosen_indexs)

        return chosen_features, chosen_labels, chosen_indexs
Example #3
0
    def extract(self, data_set_name, part_num=1, part_id=0):
        """
        Extract the feature from original data set and write it to a feature file
        :param data_set_name: name of data set; `<source_pt>/<data_set_name>.csv` is read
        :param part_num: number of partitions of data
        :param part_id: partition ID which will be extracted
        :return: None; the path of the written file is kept in `self.data_feature_fp`
        """
        # load data set from disk, missing values become empty strings
        source_pt = self.config.get('DEFAULT', 'source_pt')
        data = pd.read_csv('%s/%s.csv' % (source_pt, data_set_name)).fillna(value="")

        # row range [begin_id, end_id) covered by this partition
        row_num = len(data)
        begin_id = int(1. * row_num / part_num * part_id)
        end_id = int(1. * row_num / part_num * (part_id + 1))

        # set feature file path, partitioned runs get a `.<part_num>_<part_id>` suffix
        feature_pt = self.config.get('DEFAULT', 'feature_pt')
        if 1 == part_num:
            self.data_feature_fp = '%s/%s.%s.smat' % (
                feature_pt, self.feature_name, data_set_name)
        else:
            self.data_feature_fp = '%s/%s.%s.smat.%03d_%03d' % (
                feature_pt, self.feature_name, data_set_name, part_num,
                part_id)

        # `with` closes the file even when extracting or saving a row raises
        with open(self.data_feature_fp, 'w') as feature_file:
            # header line: number of rows and number of features
            feature_file.write('%d %d\n' %
                               (end_id - begin_id, int(self.get_feature_num())))
            # extract feature row by row
            for _, row in data[begin_id:end_id].iterrows():
                Feature.save_feature(self.extract_row(row), feature_file)

        LogUtil.log(
            'INFO', 'save features (%s, %s, %d, %d) done' %
            (self.feature_name, data_set_name, part_num, part_id))
 def run_online(self):
     """
     Predict the online data set with the stored single-execution model
     :return: None; predictions are saved to `<pred_pt>/se_online.<name>.pred`
     """
     cfg = self.config

     # load feature matrix
     feature_pt = cfg.get('DIRECTORY', 'feature_pt')
     feature_names = cfg.get('FEATURE', 'feature_selected').split()
     rawset_name = cfg.get('MODEL', 'online_rawset_name')
     will_save = cfg.get('FEATURE', 'will_save')
     online_features = Feature.load_all(feature_pt, feature_names,
                                        rawset_name, will_save)

     # restore the model from `<model_pt>/se.<model_name>.model`
     model = Model.new(cfg.get('MODEL', 'model_name'), cfg)
     model_pt = cfg.get('DIRECTORY', 'model_pt')
     model_fp = model_pt + '/se.%s.model' % cfg.get('MODEL', 'model_name')
     model.load(model_fp)

     # predict and save
     online_preds = model.predict(online_features)
     pred_pt = cfg.get('DIRECTORY', 'pred_pt')
     # NOTE(review): features are loaded with `online_rawset_name` while the
     #               output name uses `online_test_rawset_name` -- confirm this
     #               is intended
     pred_name = cfg.get('MODEL', 'online_test_rawset_name')
     online_preds_fp = '%s/se_online.%s.pred' % (pred_pt, pred_name)
     DataUtil.save_vector(online_preds_fp, online_preds, 'w')
    def run_offline(self):
        """
        Run cross validation on the offline data set
        For every fold the train / valid / test subsets are built from index
        files, a model is fitted, the fold scores are appended to the score
        file and the valid / test predictions are merged into two vectors
        covering the whole data set. Both vectors are saved and evaluated
        against all labels at the end.
        :return: None
        """
        config = self.config
        LogUtil.log('INFO', 'cv_tag(%s)' % self.cv_tag)
        rawset_name = config.get('MODEL', 'offline_rawset_name')
        # load feature matrix
        offline_features = Feature.load_all(
            config.get('DIRECTORY', 'feature_pt'),
            config.get('FEATURE', 'feature_selected').split(),
            rawset_name,
            config.get('FEATURE', 'will_save'))
        # load labels
        offline_labels = DataUtil.load_vector(
            '%s/%s.label' % (config.get('DIRECTORY', 'label_pt'), rawset_name),
            True)
        sample_num = offline_features.shape[0]
        # generate index file, only when no existing cv_tag was supplied
        if '' == self.cv_tag:
            self.cv_tag = self.out_tag
            self.__generate_index(sample_num)

        # settings which do not change between folds
        index_pt = config.get('DIRECTORY', 'index_pt')
        model_name = config.get('MODEL', 'model_name')
        evaluator_name = config.get('MODEL', 'evaluator_name')
        score_fp = '%s/%s.score' % (config.get('DIRECTORY', 'score_pt'), 'cv')

        # cross validation
        # samples which are never selected by any fold keep the prediction 0.
        offline_valid_preds_all = [0.] * sample_num
        offline_test_preds_all = [0.] * sample_num
        for fold_id in range(self.cv_num):
            LogUtil.log('INFO', 'cross validation fold_id(%d) begin' % fold_id)

            # generate training, validation and test data set of this fold
            features, labels, balanced_indexs = {}, {}, {}
            for part in ('train', 'valid', 'test'):
                pos_rate = float(config.get('MODEL', '%s_pos_rate' % part))
                indexs_fp = '%s/cv_tag%s_n%d_f%d_%s.%s.index' % (
                    index_pt, self.cv_tag, self.cv_num, fold_id, part,
                    rawset_name)
                indexs = DataUtil.load_vector(indexs_fp, 'int')
                features[part], labels[part], balanced_indexs[part] = \
                    CrossValidation.__generate_data(indexs,
                                                    offline_labels,
                                                    offline_features,
                                                    pos_rate)
                LogUtil.log('INFO', 'offline %s data generation done' % part)

            model = Model.new(model_name, config)
            model_fp = config.get('DIRECTORY', 'model_pt') + \
                '/cv_n%d_f%d.%s.model' % (self.cv_num, fold_id, model_name)
            # NOTE(review): save() is called before fit(); presumably it only
            #               registers the output path -- confirm against Model.save
            model.save(model_fp)
            offline_train_preds, offline_valid_preds, offline_test_preds = model.fit(
                features['train'], labels['train'],
                features['valid'], labels['valid'],
                features['test'], labels['test'])
            offline_train_score = Evaluator.evaluate(
                evaluator_name, labels['train'], offline_train_preds)
            offline_valid_score = Evaluator.evaluate(
                evaluator_name, labels['valid'], offline_valid_preds)
            offline_test_score = Evaluator.evaluate(
                evaluator_name, labels['test'], offline_test_preds)
            # `with` closes the score file even when the write fails
            with open(score_fp, 'a') as score_file:
                score_file.write('fold:%d\ttrain:%s\tvalid:%s\ttest:%s\n' %
                                 (fold_id, offline_train_score,
                                  offline_valid_score, offline_test_score))
            # merge prediction results back to their positions in the raw data set
            for position, raw_index in enumerate(balanced_indexs['valid']):
                offline_valid_preds_all[raw_index] = offline_valid_preds[position]
            for position, raw_index in enumerate(balanced_indexs['test']):
                offline_test_preds_all[raw_index] = offline_test_preds[position]
            LogUtil.log('INFO', 'cross test fold_id(%d) done' % fold_id)

        # save prediction results
        pred_pt = config.get('DIRECTORY', 'pred_pt')
        offline_valid_preds_all_fp = '%s/cv_n%d_valid.%s.pred' % (
            pred_pt, self.cv_num, rawset_name)
        DataUtil.save_vector(offline_valid_preds_all_fp,
                             offline_valid_preds_all, 'w')
        offline_test_preds_all_fp = '%s/cv_n%d_test.%s.pred' % (
            pred_pt, self.cv_num, rawset_name)
        DataUtil.save_vector(offline_test_preds_all_fp, offline_test_preds_all,
                             'w')
        # evaluate the merged predictions against all labels
        offline_valid_score = Evaluator.evaluate(
            evaluator_name, offline_labels, offline_valid_preds_all)
        offline_test_score = Evaluator.evaluate(
            evaluator_name, offline_labels, offline_test_preds_all)
        with open(score_fp, 'a') as score_file:
            score_file.write('cross_validation\tvalid:%s\ttest:%s\n' %
                             (offline_valid_score, offline_test_score))
    def run_offline(self):
        """
        Run a single train / valid / test execution on the offline data set
        The three subsets are built from index files, a model is fitted, the
        scores are appended to the score file and the valid / test
        predictions are saved.
        :return: None
        """
        config = self.config
        rawset_name = config.get('MODEL', 'offline_rawset_name')
        # load feature matrix
        offline_features = Feature.load_all(
            config.get('DIRECTORY', 'feature_pt'),
            config.get('FEATURE', 'feature_selected').split(),
            rawset_name,
            config.get('FEATURE', 'will_save'))
        # load labels
        offline_labels = DataUtil.load_vector(
            '%s/%s.label' % (config.get('DIRECTORY', 'label_pt'), rawset_name),
            True)
        # generate index file, only when no existing se_tag was supplied
        if '' == self.se_tag:
            self.se_tag = self.out_tag
            self.__generate_index(offline_features.shape[0])
        index_pt = config.get('DIRECTORY', 'index_pt')

        # generate training, validation and test data set
        features, labels = {}, {}
        for part in ('train', 'valid', 'test'):
            pos_rate = float(config.get('MODEL', '%s_pos_rate' % part))
            indexs_fp = '%s/se_tag%s_%s.%s.index' % (
                index_pt, self.se_tag, part, rawset_name)
            indexs = DataUtil.load_vector(indexs_fp, 'int')
            # the balanced indexs are not needed here, predictions are saved as they are
            features[part], labels[part], _ = \
                SingleExec.__generate_data(indexs,
                                           offline_labels,
                                           offline_features,
                                           pos_rate)
            LogUtil.log('INFO', 'offline %s data generation done' % part)

        model_name = config.get('MODEL', 'model_name')
        model = Model.new(model_name, config)
        model_fp = config.get('DIRECTORY', 'model_pt') + '/se.%s.model' % model_name
        # NOTE(review): save() is called before fit(); presumably it only
        #               registers the output path -- confirm against Model.save
        model.save(model_fp)
        offline_train_preds, offline_valid_preds, offline_test_preds = model.fit(
            features['train'], labels['train'],
            features['valid'], labels['valid'],
            features['test'], labels['test'])

        evaluator_name = config.get('MODEL', 'evaluator_name')
        offline_train_score = Evaluator.evaluate(
            evaluator_name, labels['train'], offline_train_preds)
        offline_valid_score = Evaluator.evaluate(
            evaluator_name, labels['valid'], offline_valid_preds)
        offline_test_score = Evaluator.evaluate(
            evaluator_name, labels['test'], offline_test_preds)
        score_fp = '%s/%s.score' % (config.get('DIRECTORY', 'score_pt'), 'cv')
        # `with` closes the score file even when the write fails
        with open(score_fp, 'a') as score_file:
            score_file.write(
                'single_exec\ttrain:%s\tvalid:%s\ttest:%s\n' %
                (offline_train_score, offline_valid_score, offline_test_score))

        # save prediction results
        pred_pt = config.get('DIRECTORY', 'pred_pt')
        offline_valid_preds_fp = '%s/se_valid.%s.pred' % (pred_pt, rawset_name)
        DataUtil.save_vector(offline_valid_preds_fp, offline_valid_preds, 'w')
        offline_test_preds_fp = '%s/se_test.%s.pred' % (pred_pt, rawset_name)
        DataUtil.save_vector(offline_test_preds_fp, offline_test_preds, 'w')