Example #1
    def process_subject(self, subject_id):
        """ Process the evaluation for a single subject. This includes
        randomly choosing properties used as input for the recommendation
        evaluation, filtering for classifying properties, and calling the
        actual evaluation.

        :param subject_id: the id of the subject to be evaluated
        :type subject_id: int
        """

        self.stats['subjects'] += 1
        """" get all properties occuring on current subject """
        properties_objects = self.recommender.get_subject_properties_objects(
            subject_id, self.long_tail_limit)
        """ intersected array contains only items if classifying properties are defined """
        contained_classifying_properties = (set(
            properties_objects.keys()).intersection(
                set(self.classifying_property_ids)))
        classifying_properties_objects = {
            key: properties_objects[key]
            for key in contained_classifying_properties
        }
        """ gather first <initial_no_properties> property-object elements (in given order) and
        merge dictionary with classifying properties to make sure to not exclude any
        classifying properties  """
        initial_properties_objects = OrderedDict(
            list(properties_objects.items())[:self.initial_no_properties])
        input_properties_objects = initial_properties_objects.copy()
        input_properties_objects.update(classifying_properties_objects)
        """ at least initial_no_properties + 1 required """
        if len(properties_objects) > len(input_properties_objects):
            self.stats['processed_subjects'] += 1
            """ choose three random (ordered by randomly generated column idx) """
            """ if it is a classified algorithm, add classifying properties """
            removed_properties_set = list(
                set(properties_objects.keys()).difference(
                    set(input_properties_objects.keys())))
            removed_properties = [
                x for x in properties_objects if x in removed_properties_set
            ]

            self.stats['details'][
                subject_id] = self.perform_evaluation_on_subject(
                    input_properties_objects, removed_properties)
            self.stats['details'][subject_id]['removed_properties'] = len(
                removed_properties)
            self.stats['details'][subject_id][
                'initial_no_of_properties'] = self.initial_no_properties
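
A minimal standalone sketch of the core pattern above: truncate an OrderedDict
to its first N items, then merge a set of mandatory keys back in so none of
them are dropped. The function name build_input_set and its arguments are
hypothetical, not part of the class above.

from collections import OrderedDict

def build_input_set(properties, mandatory_keys, initial_n):
    # Take the first initial_n items in their given order ...
    selected = OrderedDict(list(properties.items())[:initial_n])
    # ... then merge the mandatory entries back in so none are lost:
    selected.update({k: properties[k] for k in mandatory_keys if k in properties})
    return selected

props = OrderedDict([('p1', 'a'), ('p2', 'b'), ('p3', 'c'), ('p4', 'd')])
print(build_input_set(props, {'p4'}, 2))
# OrderedDict([('p1', 'a'), ('p2', 'b'), ('p4', 'd')])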
Example #2
    def output_clean_csv(self, csv_in_fd, out_fd_dict):

        # For reading the original Propublica export:
        reader = csv.DictReader(csv_in_fd)
        # Get the Propublica column headers:
        col_headers = reader.fieldnames.copy()
        # The html, entities, message, and targets columns are
        # written to separate files, not to the metadata csv:
        col_headers.remove('html')
        col_headers.remove('entities')
        col_headers.remove('message')
        col_headers.remove('targets')

        metadata_out_writer = csv.DictWriter(out_fd_dict['metadata_out_fd'],
                                             col_headers)
        metadata_out_writer.writeheader()

        entities_writer = csv.DictWriter(out_fd_dict['entities_out_fd'],
                                         ['id', 'entity', 'entity_type'])
        entities_writer.writeheader()

        text_writer = csv.DictWriter(out_fd_dict['text_out_fd'],
                                     ['id', 'message'])
        text_writer.writeheader()

        targets_writer = csv.DictWriter(out_fd_dict['targets_out_fd'],
                                        ['id', 'target', 'segment'])
        targets_writer.writeheader()

        rows_processed = 0
        total_rows_processed = 0

        html_rm_pat = re.compile(r'<[^>]+>')

        # Read from the export csv file:
        for row_dict in reader:
            if self.num_rows is not None:
                if self.num_rows <= 0:
                    break
                else:
                    self.num_rows -= 1

            row_id = row_dict['id']

            # The entities column is a JSON array.
            # Parse it, and generate one row per
            # entity for the entities table:
            id_only_dict = OrderedDict({'id': row_id})
            try:
                entities = json.loads(row_dict['entities'])
                for entity_dict in entities:
                    # Join the id with each entity dict:
                    entity_row = id_only_dict.copy()
                    entity_row.update(entity_dict)
                    entities_writer.writerow(entity_row)

            except json.decoder.JSONDecodeError:
                entities_writer.writerow({
                    'id': row_id,
                    'entity': 'not specified',
                    'entity_type': 'not_specified'
                })

            # Write the plain text column's output to the text csv file:
            txt_output = {'id': row_id, 'message': row_dict['message']}
            text_writer.writerow(txt_output)

            # Targeting is also a JSON array, and goes to
            # a different table:
            id_only_dict = OrderedDict({'id': row_id})
            try:
                targets = json.loads(row_dict['targets'])
                if len(targets) == 0:
                    targets_writer.writerow({
                        'id': row_id,
                        'target': 'none',
                        'segment': 'none'
                    })
                else:
                    for target_dict in targets:
                        all_targets = id_only_dict.copy()
                        # Join the id with each target dict
                        all_targets.update(target_dict)
                        targets_writer.writerow(all_targets)

            except json.decoder.JSONDecodeError:
                targets_writer.writerow({
                    'id': row_id,
                    'target': 'not specified',
                    'segment': 'not_specified'
                })

            # Don't want to save the HTML, entities, msg text, or targets
            # in the metadata csv:
            del row_dict['html']
            del row_dict['message']
            del row_dict['entities']
            del row_dict['targets']

            # The targeting and targetings columns are
            # HTML: get plain text:
            if len(row_dict['targeting']) > 0:
                row_dict['targeting'] = html_rm_pat.sub(
                    '', row_dict['targeting'])
            if len(row_dict['targetings']) > 0:
                row_dict['targetings'] = html_rm_pat.sub(
                    '', row_dict['targetings'])

            metadata_out_writer.writerow(row_dict)

            # Another row processed:
            rows_processed += 1
            if rows_processed >= self.REPORT_EVERY:
                total_rows_processed += rows_processed
                print(f"Processed {total_rows_processed} rows")
                rows_processed = 0

        # Announce final row count:
        total_rows_processed += rows_processed
        print(f"Processed total of {total_rows_processed} rows.")
Example #3
def result_summary(res_dict,
                   args_dict,
                   TNR_target=0.05,
                   skip_pattern=None,
                   include_pattern='.*',
                   pvalue_record=None):
    from utils.meters import simple_auc
    from collections import OrderedDict
    # If logging is not configured, set it up for external callers:
    if not logging.getLogger('').handlers:
        setup_logging()
    in_dist = args_dict['dataset']
    alphas = args_dict['alphas']
    logging.info(f'Report for {args_dict["model"]} - {in_dist}')
    logging.info(f'Tag: {args_dict["tag"]}')
    result_dict = OrderedDict(
        model=args_dict["model"],
        in_dist=args_dict['dataset'],
        LDA=args_dict.get('LDA'),
        joint=args_dict['measure_joint_distribution'],
        tag=args_dict['tag'],
        channels_select=args_dict.get('channel_selection_fn'))
    # Read in-dist results to calibrate the alpha value for the target TNR:
    rows = []
    accuracies = {'model': {}}
    for reduction_name, reduction_metrics in res_dict[in_dist].items():
        accuracies[reduction_name] = {}
        if reduction_name.endswith('_acc'):
            acc = reduction_metrics.mean.cpu().numpy()
            std = reduction_metrics.std.cpu().numpy()
            acc_name = reduction_name.replace('_acc', '')
            if acc_name == 'model':
                reduction_name = 'model'
            if acc_name.endswith('rescaled-smx'):
                reduction_name = acc_name[:-13]
                acc_name = 'model_rescaled_smx'
            elif acc_name.endswith('-pval'):
                reduction_name = acc_name[:-5]
                acc_name = 'pval'

            accuracies[reduction_name][f'{acc_name}_t1'] = acc[0]
            accuracies[reduction_name][f'{acc_name}_t5'] = acc[1]
            accuracies[reduction_name][f'{acc_name}_std_t1'] = std[0]

    for reduction_name, reduction_metrics in res_dict[in_dist].items():
        if (skip_pattern and bool(re.match(skip_pattern, reduction_name))) or (
                include_pattern
                and not bool(re.match(include_pattern, reduction_name))):
            continue
        result_dict['reduction'] = reduction_name
        result_dict.update(**accuracies['model'])
        result_dict.update(**accuracies[reduction_name])
        logging.info(reduction_name)
        if type(reduction_metrics) != dict:
            # report simple metric
            logging.info(
                f'\t{reduction_metrics.mean}\t({reduction_metrics.std})')
            continue
        # report reduction specific metrics
        for metric_name, meter_object in reduction_metrics.items():
            metric_stats = MeterDict()
            if not metric_name.endswith('_roc'):
                logging.info(
                    f'\t{metric_name}: {meter_object.mean.numpy():0.3}')
                continue
            FPR = meter_object.mean.numpy()
            # Clamp so that FPR[calibrated_alpha_id + 1] below stays in range:
            calibrated_alpha_id = min((FPR < TNR_target).sum() - 1,
                                      len(FPR) - 2)

            if calibrated_alpha_id == -1:
                # all pvalues are larger than alpha
                fpr_under_target_alpha = meter_object.mean[0]
                interp_alpha = FPR[0]
                calibrated_alpha_id = 0
            else:
                fpr_under_target_alpha = FPR[calibrated_alpha_id]
                # Actual rejection threshold to use for the target rate:
                interp_alpha = np.interp(TNR_target, FPR.squeeze(), alphas)

            result_dict.update(
                dict(metric_name=metric_name,
                     FPR_strict=fpr_under_target_alpha,
                     FPR_over=FPR[calibrated_alpha_id + 1],
                     chosen_alpha=interp_alpha))
            logging.info(
                f'\t{metric_name} - in-dist rejected: '
                # f'alpha-{indist_pvalues_roc[alphas.index(TNR_target)]:0.3f} ({TNR_target:0.3f}), '
                f'under-{fpr_under_target_alpha:0.3f} ({alphas[calibrated_alpha_id]:0.3f}), '
                f'interp-{TNR_target:0.3f} ({interp_alpha:0.3f}), '
                f'over-{FPR[calibrated_alpha_id + 1]:0.3f} ({alphas[calibrated_alpha_id + 1]})'
            )

            if pvalue_record and reduction_name in pvalue_record[in_dist]:
                if metric_name.startswith(
                        'class_cond'
                ) and 'predicted_id' in pvalue_record[in_dist]:
                    predicted_ids = pvalue_record[in_dist]['predicted_id']
                    in_cc_pval_pred = pvalue_record[in_dist][reduction_name][
                        th.arange(predicted_ids.shape[0]), predicted_ids]
                else:
                    in_cc_pval_pred = pvalue_record[in_dist][
                        reduction_name].max(1)[0]

            # Use a distinct loop variable so the outer reduction_metrics
            # is not shadowed:
            for target_dataset_name, target_metrics in res_dict.items():
                if target_dataset_name != in_dist and metric_name in target_metrics[
                        reduction_name]:
                    interp_rejected = np.interp(
                        interp_alpha, alphas,
                        target_metrics[reduction_name][metric_name].mean.numpy())
                    TPR = target_metrics[reduction_name][
                        metric_name].mean.numpy()
                    raw_rejected = TPR[alphas.index(TNR_target)]
                    auroc = simple_auc(TPR, FPR)
                    logging.info(
                        f'\t\t{target_dataset_name}:\traw-{raw_rejected:0.3f}\tinterp-{interp_rejected:0.3f}\tAUROC:{auroc:0.3f}'
                    )
                    if pvalue_record and reduction_name in pvalue_record[
                            target_dataset_name]:
                        if metric_name.startswith(
                                'class_cond'
                        ) and 'predicted_id' in pvalue_record[
                                target_dataset_name]:
                            predicted_ids = pvalue_record[target_dataset_name][
                                'predicted_id']
                            out_cc_pval_pred = pvalue_record[
                                target_dataset_name][reduction_name][
                                    th.arange(predicted_ids.shape[0]),
                                    predicted_ids]
                        else:
                            out_cc_pval_pred = pvalue_record[
                                target_dataset_name][reduction_name].max(1)[0]

                        m = metric(in_cc_pval_pred.numpy(),
                                   out_cc_pval_pred.numpy())
                        logging.info(f'\t\t\tbenchmark metrics: {m}')
                        result_dict.update(**m)

                    result_dict.update(
                        dict(out_dist=target_dataset_name,
                             TPR95_raw=raw_rejected,
                             TPR95_interp=interp_rejected,
                             AUROC=auroc))
                    rows.append(result_dict.copy())

                    if in_dist.startswith(
                            'cifar') and target_dataset_name.startswith(
                                'cifar'):
                        continue
                    metric_stats.update(
                        dict(TPR95_raw=th.tensor([raw_rejected]),
                             TPR95_interp=th.tensor([interp_rejected]),
                             AUROC=th.tensor([auroc])))

            if target_dataset_name != in_dist and metric_name in target_metrics[
                    reduction_name]:
                result_dict['out_dist'] = 'avg'
                logging.info(
                    f'\tmetric avg stats: {[k + " " + str(float(v)) for k, v in metric_stats.get_mean_dict().items()]}'
                )
                result_dict.update(**metric_stats.get_mean_dict())
                rows.append(result_dict.copy())
    return rows
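
The calibration logic above interpolates a rejection threshold from an
FPR-vs-alpha curve, evaluates the out-of-distribution detection rate at that
threshold, and sweeps alpha to get an AUROC. A numpy-only sketch under made-up
toy curves, with simple_auc approximated by np.trapz:

import numpy as np

alphas = np.linspace(0.0, 1.0, 11)   # candidate rejection thresholds
FPR = alphas ** 2                    # toy in-dist false-positive rate per alpha
TPR = np.sqrt(alphas)                # toy out-dist detection rate per alpha

TNR_target = 0.05
# Threshold whose in-dist FPR interpolates to the target rate:
interp_alpha = np.interp(TNR_target, FPR, alphas)
# Out-dist detection rate at the calibrated threshold:
interp_rejected = np.interp(interp_alpha, alphas, TPR)
# Area under the ROC curve traced out by sweeping alpha:
auroc = np.trapz(TPR, FPR)

print(f'alpha={interp_alpha:0.3f}  TPR@alpha={interp_rejected:0.3f}  '
      f'AUROC={auroc:0.3f}')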
Example #4
File: utilities.py  Project: paepcke/birds
    def find_class_paths(cls, data_root):
        '''
        Given a root directory, return an OrderedDict
        mapping class names to lists of directories
        that contain at least one sample of that class.

        Both the class names (i.e. keys) and the
        lists of directories (i.e. values) will be
        naturally sorted, and the paths absolute.

        Directory names that begin with a period ('.')
        are excluded.

        The assumption is that the names of the last
        path element of the returned directories are
        class names. This assumption is often made in
        torchvision packages.

        :param data_root: root directory for the search
        :type data_root: str
        :return: dict mapping target classes to lists
            of directories that contain samples of that
            class. The directories will be Path objs
        :rtype: OrderedDict{str : [Path]}
        '''

        # If path is relative, compute abs
        # path relative to current dir:
        if not os.path.isabs(data_root):
            data_root = os.path.join(os.path.dirname(__file__), data_root)
        class_paths = set([])
        for root, _dirs, files in os.walk(data_root):

            if len(files) == 0:
                # Found only directories:
                continue

            # For convenience, turn the file paths
            # into Path objects:
            file_Paths = [Path(name) for name in files]
            root_Path = Path(root)

            # Pick out files with an image extension; skip
            # directories whose name begins with a period:
            full_paths = []
            for file_path in file_Paths:
                if file_path.suffix in cls.IMG_EXTENSIONS \
                   and not root_Path.stem.startswith('.'):
                    full_paths.append(
                        Path.joinpath(root_Path, file_path).parent)

            # Using union in this loop guarantees
            # uniqueness of the gathered class names:

            class_paths = class_paths.union(set(full_paths))

        # Order the paths so that all machines
        # have the same sample-id assignments later:

        class_paths = natsort.natsorted(list(class_paths))

        # Get dict {class-name : [paths-to-samples-of-that-class]}
        class_path_dict = OrderedDict()

        for class_path in class_paths:
            try:
                # Append to the existing entry for this class:
                class_path_dict[class_path.stem].append(class_path)
            except KeyError:
                # First directory seen for this class:
                class_path_dict[class_path.stem] = [class_path]

        # Now ensure that the list of directories
        # (i.e. the values) for each class-name's entry
        # is also sorted. Replacing values in place is
        # safe, because no keys are added or removed:

        for class_name, dirs in class_path_dict.items():
            class_path_dict[class_name] = natsort.natsorted(dirs)

        return class_path_dict
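
The same walk-and-group pattern in a compact standalone form, assuming a tree
with one directory of images per class (classes_by_name and its extensions
parameter are hypothetical names; setdefault replaces the try/except above):

import os
from collections import OrderedDict
from pathlib import Path

import natsort

def classes_by_name(data_root, extensions=('.jpg', '.png')):
    # Collect every directory that directly holds at least one image:
    dirs = set()
    for root, _subdirs, files in os.walk(data_root):
        if any(Path(f).suffix in extensions for f in files):
            dirs.add(Path(root))
    # Natural sort keeps sample-id assignment stable across machines:
    by_name = OrderedDict()
    for d in natsort.natsorted(dirs):
        by_name.setdefault(d.stem, []).append(d)
    return by_name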