 def iterate_graph_cache_datasets(
         self) -> typing.Generator[typing.Tuple, None, None]:
     for graph_dataset in dataset_helper.get_all_cached_graph_datasets():
         dataset_name = filename_utils.get_dataset_from_filename(
             graph_dataset)
         with self.subTest(graph_dataset=graph_dataset,
                           dataset_name=dataset_name):
             yield graph_dataset, dataset_name
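
A minimal sketch of how this generator might be consumed; the test class and
the assertion are hypothetical, only the generator itself comes from the
snippet above:

import typing
import unittest

class GraphDatasetTest(unittest.TestCase):  # hypothetical test class

    def iterate_graph_cache_datasets(
            self) -> typing.Generator[typing.Tuple, None, None]:
        ...  # body as in the snippet above

    def test_dataset_name_resolution(self):
        # Each yielded pair is already wrapped in its own subTest, so a
        # failing dataset is reported individually without aborting the loop.
        for graph_dataset, dataset_name in self.iterate_graph_cache_datasets():
            self.assertTrue(dataset_name)
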
def process_dataset(cache_file, label_lookup_file, args):
    dataset = filename_utils.get_dataset_from_filename(cache_file)

    cache_filename = filename_utils.get_filename_only(cache_file,
                                                      with_extension=False)

    threshold, topn = filename_utils.get_topn_threshold_from_lookupfilename(
        label_lookup_file)

    result_file = cache_file.replace(
        dataset,
        'relabeled_threshold_{}_topn_{}_{}'.format(threshold, topn, dataset))
    if not args.force and os.path.exists(result_file):
        return

    with open(label_lookup_file, 'rb') as f:
        label_lookup = pickle.load(f)

    X, Y = dataset_helper.get_dataset_cached(cache_file)
    X = graph_helper.get_graphs_only(X)

    # Get labels to be renamed (those occurring at most args.max_occurrence times)
    node_labels = list(chain.from_iterable([x.nodes() for x in X]))
    unique_labels = set(node_labels)
    counter = collections.Counter(node_labels)

    node_labels_to_be_renamed = {
        label for label, occurrences in counter.items()
        if occurrences <= args.max_occurrence
    }

    # Stage 1: restrict the lookup to the infrequent labels
    lookup_ = {
        label: new_label
        for label, new_label in label_lookup.items()
        if label in node_labels_to_be_renamed
    }

    # Stage 2: also map every label whose target is one of the used new
    # labels, so that frequent labels sharing a target get merged as well
    new_labels = set(lookup_.values())
    lookup__ = collections.defaultdict(list)

    for label, new_label in label_lookup.items():
        if new_label in new_labels:
            lookup__[label].append(new_label)

    lookup_ = dict(lookup_, **lookup__)

    LOGGER.info(
        '{:80} topn={:4} threshold={:4}\n\t\t#relabeled labels: {}\n\t\t#unique labels: {}\n\t\t#nodes: {}'
        .format(cache_filename, topn, threshold, len(lookup_),
                len(unique_labels), len(node_labels)))

    relabel_trans = transformers.RelabelGraphsTransformer(lookup_)

    X = relabel_trans.transform(X)

    with open(result_file, 'wb') as f:
        pickle.dump((X, Y), f)
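
The two-stage lookup construction above is easiest to see on toy data. This
self-contained sketch uses invented labels (not part of the original code):

import collections

label_lookup = {'cat': 'animal', 'dog': 'animal', 'car': 'vehicle'}
node_labels_to_be_renamed = {'cat'}  # labels occurring <= max_occurrence times

# Stage 1: keep only the infrequent labels
lookup_ = {k: v for k, v in label_lookup.items()
           if k in node_labels_to_be_renamed}  # {'cat': 'animal'}

# Stage 2: add every label that shares a target with an infrequent one
new_labels = set(lookup_.values())
lookup__ = collections.defaultdict(list)
for label, new_label in label_lookup.items():
    if new_label in new_labels:
        lookup__[label].append(new_label)

print(dict(lookup_, **lookup__))  # {'cat': ['animal'], 'dog': ['animal']}

Note how the merge overwrites the stage-1 string values with stage-2 lists;
the RelabelGraphsTransformer presumably accepts both forms.
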
Example #3
def get_dataset_names_with_concept_map(limit_datasets: typing.Iterable = None):
    all_graph_datasets = [
        filename_utils.get_dataset_from_filename(x)
        for x in get_all_cached_graph_datasets(graph_type=TYPE_CONCEPT_MAP)
    ]
    return [
        x
        for x in get_all_available_dataset_names(limit_datasets=limit_datasets)
        if x in all_graph_datasets
    ]
Example #4
    def graph_dataset_filter(cache_file):
        filename = filename_utils.get_filename_only(cache_file)
        is_graph_dataset = filename.startswith('dataset_graph')
        is_not_relabeled = '_relabeled' not in filename
        is_not_gram_or_phi = 'gram' not in filename and 'phi' not in filename
        is_in_dataset = (not dataset_name or dataset_name ==
                         filename_utils.get_dataset_from_filename(filename))
        is_right_graph_type = (not graph_type or graph_type ==
                               graph_helper.get_graph_type_from_filename(cache_file))

        return np.all([
            is_graph_dataset, is_not_relabeled, is_not_gram_or_phi,
            is_in_dataset, is_right_graph_type
        ])
Example #5
def get_filtered_tasks(task_type=None,
                       dataset=None,
                       graph_type_filter: str = None,
                       task_name_filter: str = None,
                       tasks=None) -> typing.List[ExperimentTask]:
    task_type = _ensure_is_container(task_type)
    dataset = _ensure_is_container(dataset)

    if tasks is None:
        tasks = get_all_tasks()

    def matches(t: ExperimentTask) -> bool:
        return ((not task_type or t.type in task_type) and
                (not dataset or
                 filename_utils.get_dataset_from_filename(t.name) in dataset) and
                (not task_name_filter or task_name_filter in t.name) and
                (not graph_type_filter or graph_type_filter ==
                 graph_helper.get_graph_type_from_filename(t.name)))

    return [t for t in tasks if matches(t)]
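
An illustrative call, assuming the constants module from the other snippets
and a hypothetical dataset name 'ng20':

graph_tasks = get_filtered_tasks(
    task_type='graph',
    dataset='ng20',  # hypothetical dataset name
    graph_type_filter=constants.TYPE_CONCEPT_MAP)
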
def file_should_be_processed(file: str, include_filter: str, exclude_filter: str, limit_dataset: list):
    """Returns True if the file is included AND not excluded AND in the limited datasets.

    Args:
        file (str): the file to be processed
        include_filter (str): string that has to be in `file` (can be None)
        exclude_filter (str): string that must not be in `file` (can be None)
        limit_dataset (list(str)): the allowed datasets; the file's dataset is
            derived from its filename (can be None)

    Returns:
        bool: whether the file should be processed
    """
    dataset = filename_utils.get_dataset_from_filename(file)

    is_in_limited_datasets = (not limit_dataset or dataset in limit_dataset)
    is_included = (not include_filter or include_filter in file)
    is_excluded = (exclude_filter and exclude_filter in file)
    return is_in_limited_datasets and is_included and not is_excluded
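
For example (the filename is invented; the dataset is resolved from it by
filename_utils.get_dataset_from_filename):

file_should_be_processed(
    'data/cache/dataset_graph_cooccurrence_1_all_ng20.npy',  # hypothetical path
    include_filter='cooccurrence',
    exclude_filter='gram',
    limit_dataset=['ng20'])
# -> True: 'cooccurrence' is contained, 'gram' is not, and the dataset matches
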
def get_combined_text_graph_dataset(graph_cache_file, use_ana=False) -> typing.Tuple[typing.List[typing.Tuple], typing.List]:
    dataset_name = filename_utils.get_dataset_from_filename(graph_cache_file)

    X_text, Y_text = dataset_helper.get_dataset(dataset_name + ('-ana' if use_ana else ''))
    X_graph, Y_graph = dataset_helper.get_dataset_cached(graph_cache_file)

    # Same length and no per-graph IDs: align texts and graphs by position
    if len(X_graph) == len(X_text) and (not isinstance(X_graph[0], tuple) or not isinstance(X_graph[0][1], str)):
        return list(zip(X_graph, X_text, [None] * len(X_graph))), Y_graph

    # Map each class to its text documents; the per-graph ID indexes into this list
    class_2_id = collections.defaultdict(list)
    for x, y in zip(X_text, Y_text):
        class_2_id[y].append(x)

    X_combined, Y_combined = [], Y_graph
    for (x_graph, y_id), y_graph in zip(X_graph, Y_graph):
        y_id = int(y_id)
        X_combined.append((x_graph, class_2_id[y_graph][y_id], y_id))

    return X_combined, Y_combined
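
A toy illustration of the ID-based branch with invented data, where each graph
entry is a (graph, id) tuple whose id indexes into the documents of its class:

X_text = ['doc a0', 'doc b0', 'doc a1']
Y_text = ['a', 'b', 'a']
X_graph = [('g0', '1'), ('g1', '0')]  # (graph, per-class id)
Y_graph = ['a', 'b']

# class_2_id becomes {'a': ['doc a0', 'doc a1'], 'b': ['doc b0']}, so
# ('g0', '1') with class 'a' is combined with class_2_id['a'][1] == 'doc a1',
# yielding X_combined == [('g0', 'doc a1', 1), ('g1', 'doc b0', 0)]
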
def get_task_graphs(graph_cache_file: str) -> typing.List[ExperimentTask]:
    def process() -> tuple:
        X, Y = dataset_helper.get_dataset_cached(graph_cache_file)
        X = graph_helper.get_graphs_only(X)
        estimator, params = task_helper.get_graph_estimator_and_params(X, Y)
        return ClassificationData(X, Y, estimator, params)

    tasks = [ExperimentTask('graph', get_filename_only(graph_cache_file), process)]

    dataset = get_dataset_from_filename(graph_cache_file)

    def process_relabeled():
        X, Y = dataset_helper.get_dataset_cached(graph_cache_file)
        X = graph_helper.get_graphs_only(X)
        estimator, params = task_helper.get_graph_estimator_and_params(X, Y)
        params['graph_preprocessing'] = [transformers.RelabelGraphsTransformer()]
        params['graph_preprocessing__dataset'] = [dataset]
        params['graph_preprocessing__threshold'] = [0.99]
        params['graph_preprocessing__topn'] = [10]
        return ClassificationData(X, Y, estimator, params)

    tasks.append(ExperimentTask('graph_relabeled', get_filename_only(graph_cache_file), process_relabeled))
    return tasks
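
Illustrative usage (the cache filename is hypothetical); each returned task
defers the expensive dataset loading to its process callable:

for task in get_task_graphs('data/cache/dataset_graph_ng20.npy'):
    print(task.type, task.name)  # the 'graph' and 'graph_relabeled' variants
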
def get_tasks() -> typing.List[ExperimentTask]:
    graph_cache_files = dataset_helper.get_all_cached_graph_datasets()
    gram_cache_files = dataset_helper.get_all_gram_datasets()
    datasets = dataset_helper.get_all_available_dataset_names()

    cmap_cache_files = dataset_helper.get_all_cached_graph_datasets(graph_type=constants.TYPE_CONCEPT_MAP)
    cmap_datasets = [get_dataset_from_filename(x) for x in cmap_cache_files]
    datasets = [x for x in datasets if x in cmap_datasets]

    graph_task_fns = [
        get_task_graph_content_only,
        get_task_graph_structure_only,
        get_task_combined,
        get_task_graphs,
        get_task_graph_node_weights
    ]

    tasks = []

    tasks += task_helper.get_tasks(graph_task_fns, graph_cache_files)
    tasks += task_helper.get_tasks([get_task_dummy, get_task_text], datasets)
    tasks += task_helper.get_tasks([get_gram_task], gram_cache_files)
    tasks += task_helper.get_tasks([get_task_extra_graphs], graph_helper.get_all_graph_benchmark_dataset_names())
    return tasks
def get_results(folder=None,
                results_directory=constants.RESULTS_FOLDER,
                log_progress=tqdm.tqdm_notebook,
                exclude_filter=None,
                include_filter=None,
                remove_split_cols=True,
                remove_rank_cols=True,
                remove_fit_time_cols=True,
                filter_out_experiment=None,
                ignore_experiments=True,
                only_load_dataset=None,
                fetch_predictions=False):
    '''
    Retrieves results from result folder.

    Note: This function _seriously_ has to be refactored!

    Args:
        folder: specify the results folder. If not specified, defaults to the most recent results folder
        results_directory: the base folder
        log_progress: function to log the progress. Takes an iterable and yields its items
        exclude_filter: which files to exclude
        include_filter: which files to include
        remove_split_cols: whether to drop the per-split CV result columns
        remove_rank_cols: whether to drop the rank columns of the CV results
        remove_fit_time_cols: whether to drop the fit-time columns
        filter_out_experiment: if given, only results of this experiment are loaded
        ignore_experiments: whether to skip experiment result files (unless
            filter_out_experiment is given)
        only_load_dataset: only load results for these datasets
        fetch_predictions: whether to also retrieve the predictions and calculate the results on them

    Returns:
        pd.DataFrame: the results
    '''
    result_folders = get_result_folders(results_directory)

    folder = 'data/results/{}'.format(folder) if folder else result_folders[-1]

    result_files = get_result_filenames_from_folder(folder)

    if filter_out_experiment:
        result_files = [
            x for x in result_files
            if _get_experiment_name_from_filename(x) == filter_out_experiment
        ]

    if ignore_experiments and not filter_out_experiment:
        result_files = [x for x in result_files if 'experiment_' not in x]

    if only_load_dataset is not None:
        result_files = [
            x for x in result_files
            if filename_utils.get_dataset_from_filename(x) in only_load_dataset
        ]

    data_ = []
    result_files_iter = log_progress(result_files) if log_progress else result_files
    for result_file in result_files_iter:
        if include_filter and include_filter not in result_file: continue
        if exclude_filter and exclude_filter in result_file: continue

        if '_nested_' in result_file:
            LOGGER.warning(
                'Encountered nested CV result file. Currently not implemented. File: {}'
                .format(result_file))
            continue

        dataset_name = filename_utils.get_dataset_from_filename(result_file)

        with open(result_file, 'rb') as f:
            result_data = pickle.load(f)

        remove_transformer_classes(result_data)

        result_file = filename_utils.get_filename_only(result_file)
        result = result_data if 'params' in result_data else result_data['results']
        assert 'params' in result

        result = clean_result_keys(result)
        for idx, el in enumerate(result['params']):
            result['params'][idx] = clean_result_keys(el)

        prediction_file = '{}/predictions/{}'.format(
            folder, filename_utils.get_filename_only(result_file))
        predictions_exist = os.path.exists(prediction_file)

        num_results = len(result['params'])
        result['prediction_file_exists'] = [predictions_exist] * num_results

        if fetch_predictions and not predictions_exist:
            LOGGER.warning(
                'fetch_predictions=True but could not find prediction: {}'.
                format(prediction_file))

        # Fetch predictions and check whether the git commits are the same.
        # Also, calculate the prediction scores
        if fetch_predictions and predictions_exist:
            with open(prediction_file, 'rb') as f:
                r = pickle.load(f)
            result_git_commit = result_data['meta_data']['git_commit']
            git_commit = r['meta_data']['git_commit']
            if git_commit != result_git_commit:
                LOGGER.warning(
                    'Unmatching git commit for prediction/result file! Prediction: {}, Result: {}'
                    .format(git_commit, result_git_commit))
            else:
                prediction = r['results']
                Y_real, Y_pred, X_test = (prediction['Y_real'],
                                          prediction['Y_pred'],
                                          prediction['X_test'])
                scores = calculate_scores(Y_real, Y_pred)
                for name, val in scores.items():
                    result['prediction_score_{}'.format(
                        name)] = [val] * num_results
                result['prediction_file'] = [prediction_file] * num_results

        def is_graph_dataset():
            graph_file_types = [
                constants.TYPE_CONCEPT_MAP, constants.TYPE_COOCCURRENCE,
                'graph_extra'
            ]
            return any('_{}_'.format(x) in result_file for x in graph_file_types)

        result['combined'] = np.any([
            'graph_combined__dataset_' in result_file,
            'graph_text_combined__dataset_' in result_file
        ])

        # GRAPH
        if is_graph_dataset():
            is_cooccurrence_dataset = constants.TYPE_COOCCURRENCE in result_file
            result['type'] = (constants.TYPE_COOCCURRENCE if
                              is_cooccurrence_dataset else constants.TYPE_CONCEPT_MAP)

            result['lemmatized'] = '_lemmatized_' in result_file
            result['kernel'] = get_kernel_from_filename(result_file)

            # Co-Occurrence
            if is_cooccurrence_dataset:
                parts = re.findall(r'cooccurrence_(.+?)_(.+?)_',
                                   result_file)[0]
                assert len(parts) == 2
                result['window_size'], result['words'] = parts
            # Concept Maps
            else:
                result['words'] = 'concepts'
        # DUMMY
        elif 'dummy' in result_file:
            result['type'] = 'dummy'
            result['words'] = 'dummy'
        # TEXT
        else:
            result['type'] = 'text'
            result['words'] = ['all'] * num_results

        if 'time_checkpoints' in result_data:
            timestamps = result_data['time_checkpoints']
            timestamps = sorted(timestamps.items(), key=lambda x: x[1])

            start = timestamps[0][1]
            end = timestamps[-1][1]

            result['timestamps'] = [timestamps] * num_results
            result['time'] = [end - start] * num_results

        result['filename'] = result_file
        result['dataset'] = dataset_name

        # Add meta data
        info = {}
        if 'results' in result_data:
            info = {
                'info__' + k: v
                for k, v in result_data.get('meta_data', result_data).items()
                if k != 'results'
            }
        result = dict(result,
                      **{k: [v] * num_results
                         for k, v in info.items()})

        data_.append(result)

    # DataFrame.append is gone in modern pandas; concatenate in one go instead
    df_all = pd.concat([pd.DataFrame(d) for d in data_]) if data_ else None

    if df_all is None or not len(df_all):
        LOGGER.warning('Did not retrieve results! Aborting')
        return None

    # Remove columns (note: re.search, not re.match, so that columns like
    # mean_fit_time match the _time$ suffix pattern)
    df_all = df_all[[
        x for x in df_all.columns.tolist()
        if (not remove_split_cols or not re.match(r'^split\d', x)) and (
            not remove_fit_time_cols or not re.search(r'_time$', x)) and (
                not remove_rank_cols or not re.match(r'rank_', x))
    ]]

    # Change the column order
    prio_columns = ['dataset', 'type', 'combined']
    low_prio_columns = ['params', 'filename'] + [
        c for c in df_all.columns
        if c.startswith('std_') or c.startswith('mean_')
    ]
    columns = df_all.columns.tolist()
    for c in prio_columns + low_prio_columns:
        columns.remove(c)

    return df_all.reset_index(drop=True)[prio_columns + columns +
                                         low_prio_columns]
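
A hedged usage sketch; the folder and dataset names are illustrative:

df = get_results(
    folder='2018-01-01',         # hypothetical folder under data/results/
    only_load_dataset=['ng20'],  # hypothetical dataset name
    fetch_predictions=False)
if df is not None:
    print(df[['dataset', 'type', 'combined']].head())
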
Example #12
def process_graph_cache_file(graph_cache_file, args):
    graph_cache_filename = graph_cache_file.split('/')[-1].rsplit('.')[0]
    dataset = filename_utils.get_dataset_from_filename(graph_cache_file)

    if '.phi.' in graph_cache_filename or not filter_utils.file_should_be_processed(
            graph_cache_filename, args.include_filter, args.exclude_filter,
            args.limit_dataset):
        return

    LOGGER.info('{:15} starting ({})'.format(dataset, graph_cache_filename))

    fast_wl_trans = FastWLGraphKernelTransformer(
        h=args.wl_h, use_early_stopping=False, truncate_to_highest_label=False)

    try:
        phi_graph_cache_file = graph_cache_file.replace('.npy', '.phi.npy')
        X_graphs, Y = dataset_helper.get_dataset_cached(graph_cache_file)
        X_graphs = graph_helper.get_graphs_only(X_graphs)

        # Kernel: WL
        if args.use_wl:
            used_phi_graph_cache_file = phi_graph_cache_file
            splitted_phi_graph_cache_file = phi_graph_cache_file.replace(
                '.phi', '.splitted.phi')
            phi_same_label_graph_cache_file = phi_graph_cache_file.replace(
                dataset, '{}_same-label'.format(dataset)).replace(
                    '.phi', '.splitted.phi')

            # Stop here if all files have already been created
            if not args.force and np.all([
                    os.path.exists(x) for x in [
                        splitted_phi_graph_cache_file,
                        used_phi_graph_cache_file,
                        phi_same_label_graph_cache_file,
                    ]
            ]):
                return

            X_, Y_ = np.array(np.copy(X_graphs)), np.array(np.copy(Y))
            if args.wl_sort_classes:
                X_, Y_ = sort(X_, Y_, by=Y_)

            num_vertices = len(graph_helper.get_all_node_labels(X_))
            fast_wl_trans.set_params(phi_dim=num_vertices)

            X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
                np.copy(X_),
                np.copy(Y_),
                stratify=Y_,
                test_size=args.wl_test_size)

            X_train, Y_train = sort(X_train, Y_train, by=Y_train)
            X_test, Y_test = sort(X_test, Y_test, by=Y_test)

            # Split (train/test) version
            if args.force or not os.path.exists(splitted_phi_graph_cache_file):
                t = sklearn.base.clone(fast_wl_trans).set_params(
                    same_label=True)
                phi_train = t.fit_transform(np.copy(X_train))
                phi_test = t.transform(np.copy(X_test))

                with open(splitted_phi_graph_cache_file, 'wb') as f:
                    pickle.dump((phi_train, phi_test, X_train, X_test, Y_train,
                                 Y_test), f)

            # Split (train/test) version, same label
            if args.force or not os.path.exists(
                    phi_same_label_graph_cache_file):
                t = sklearn.base.clone(fast_wl_trans)
                phi_train = t.fit_transform(X_train)
                phi_test = t.transform(X_test)

                with open(phi_same_label_graph_cache_file, 'wb') as f:
                    pickle.dump((phi_train, phi_test, X_train, X_test, Y_train,
                                 Y_test), f)

            # Whole dataset
            if args.force or not os.path.exists(used_phi_graph_cache_file):
                t = sklearn.base.clone(fast_wl_trans)
                with open(used_phi_graph_cache_file, 'wb') as f:
                    pickle.dump((t.fit_transform(X_), Y_), f)

        # Kernel: spgk
        if args.use_spgk:
            for depth in args.spgk_depth:
                spgk_graph_cache_file = graph_cache_file.replace(
                    '.npy', '.spgk-{}.gram.npy'.format(depth))

                if args.force or not os.path.exists(spgk_graph_cache_file):
                    K = spgk.transform(X_graphs, depth=depth)

                    with open(spgk_graph_cache_file, 'wb') as f:
                        pickle.dump((K, Y), f)
    except Exception as e:
        LOGGER.exception(e)

    LOGGER.info('{:15} finished ({})'.format(dataset, graph_cache_filename))
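
A hedged driver sketch, assuming an argparse namespace that carries the flags
referenced in the function body (the values are illustrative):

import argparse

args = argparse.Namespace(
    include_filter=None, exclude_filter=None, limit_dataset=None,
    force=False, use_wl=True, wl_h=4, wl_sort_classes=True, wl_test_size=0.2,
    use_spgk=False, spgk_depth=[1])

for graph_cache_file in dataset_helper.get_all_cached_graph_datasets():
    process_graph_cache_file(graph_cache_file, args)
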
Example #13
 def test(self):
     for test_item in FILENAMES:
         self.assertEqual(
             filename_utils.get_dataset_from_filename(
                 test_item.filename, test_item.ignore_subtype),
             test_item.expected_dataset)
Example #14
def get_all_gram_datasets(dataset_name=None, cache_path=CACHE_PATH):
    gram_files = glob('{}/*gram*.npy'.format(cache_path))
    return [
        x for x in gram_files if not dataset_name
        or filename_utils.get_dataset_from_filename(x) == dataset_name
    ]
Example #16
def start_tasks(args, all_tasks: typing.List[ExperimentTask],
                classification_options: ClassificationOptions,
                experiment_config: dict):
    filtered_task_types = (experiment_config['params_per_type'].keys()
                           if experiment_config else None)

    if experiment_config and experiment_config.get('limit_dataset') is not None:
        limit_dataset = experiment_config['limit_dataset']
    else:
        limit_dataset = args.limit_dataset

    limit_graph_type = (experiment_config.get('limit_graph_type', None)
                        if experiment_config else None)

    def should_process_task(task: ExperimentTask):
        # Dataset filter
        is_filtered_by_dataset = limit_dataset and filename_utils.get_dataset_from_filename(
            task.name) not in limit_dataset

        # Task type filters
        is_filtered_by_include_filter = (args.task_type_include_filter
                                         and task.type
                                         not in args.task_type_include_filter)
        is_filtered_by_exclude_filter = (args.task_type_exclude_filter
                                         and task.type
                                         in args.task_type_exclude_filter)

        is_filtered_by_name_filter = (args.task_name_filter and
                                      args.task_name_filter not in task.name)
        is_filtered_by_param_options = (filtered_task_types and task.type
                                        not in filtered_task_types)
        is_filtered_by_graph_type = (
            limit_graph_type and graph_helper.get_graph_type_from_filename(
                task.name) not in [None] + limit_graph_type)

        # Do not process tasks that have already been calculated (unless args.force == True)
        created_files = [
            '{}/{}'.format(
                args.results_folder,
                filename_utils.get_result_filename_for_task(
                    task, experiment_config, cfo=classification_options))
        ]
        is_filtered_by_file_exists = (not args.force and np.any(
            [os.path.exists(file) for file in created_files]))

        should_process = not np.any([
            is_filtered_by_graph_type, is_filtered_by_dataset,
            is_filtered_by_include_filter, is_filtered_by_name_filter,
            is_filtered_by_file_exists, is_filtered_by_exclude_filter,
            is_filtered_by_param_options
        ])

        return should_process

    def print_tasks(tasks: typing.List[ExperimentTask]):
        for task in tasks:
            print('\t{t.type:26} {dataset:18} {t.name}'.format(
                t=task,
                dataset=filename_utils.get_dataset_from_filename(task.name)))
        print('\n')

    # Filter out tasks
    tasks = sorted(
        [task for task in all_tasks if should_process_task(task)],
        key=lambda x: filename_utils.get_dataset_from_filename(x.name))

    if args.dry_run:
        print('All tasks:')
        print_tasks(all_tasks)

    print('Filtered tasks:')
    print_tasks(tasks)

    print('# tasks per type (filtered/unfiltered)')
    task_type_counter_unfiltered = collections.Counter(
        [t.type for t in all_tasks])
    task_type_counter_filtered = collections.Counter([t.type for t in tasks])
    for task_type, unfiltered_count in task_type_counter_unfiltered.items():
        print('\t{:25} {:2}/{:2}'.format(
            task_type, task_type_counter_filtered.get(task_type, 0),
            unfiltered_count))
    print('\n')

    if args.dry_run:
        print('Only doing a dry-run. Exiting.')
        return

    num_tasks = len(tasks)
    for task_idx, t in enumerate(tasks):

        def print_task(msg=''):
            LOGGER.info(
                'Task {idx:>2}/{num_tasks}: {t.type:30} - {t.name:40} - {msg}'.
                format(idx=task_idx + 1, num_tasks=num_tasks, t=t, msg=msg))

        start_time = time()
        print_task('Started')
        try:
            task_runner.run_classification_task(t, classification_options,
                                                experiment_config)
            gc.collect()
        except Exception as e:
            print_task('Error: {}'.format(e))
            LOGGER.exception(e)
        elapsed_seconds = time() - start_time
        print_task('Finished (time={})'.format(
            time_utils.seconds_to_human_readable(elapsed_seconds)))
        gc.collect()

    LOGGER.info('Finished!')