Example #1
def exp_process_function(
    thread_device_id: int,
    thread_exp_id: int,
    thread_exp_args: Dict,
    res_q: mp.Queue,
):
    logging_utils.setup_unique_log_file(
        root_folder_path=thread_exp_args["prune_out_folder_path"],
        file_name_format=FILE_NAME_FORMAT_SUB_LOG,
    )
    logger = logging_utils.get_logger(LOGGER_NAME)
    logger.info("""
>>> process <<<
thread_device_id: {thread_device_id}
thread_exp_id: {thread_exp_id}
thread_exp_args: {thread_exp_args}
""".format(
        thread_device_id=thread_device_id,
        thread_exp_id=thread_exp_id,
        thread_exp_args=thread_exp_args,
    ))
    with torch.cuda.device(thread_device_id):
        res = run_single_experiment(**thread_exp_args)
        res_q.put(res, block=True)
    res_q.close()
    clear_mem(logger)
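
A minimal sketch of how this worker is launched elsewhere in these examples (see Example #10): each experiment runs in its own spawned process and reports back through a process-safe queue. The argument dict below is a placeholder, exp_process_function is assumed to be importable from the module shown above, and mp is assumed to be torch.multiprocessing.

import torch.multiprocessing as mp

if __name__ == "__main__":
    mp.set_start_method("spawn")         # required when CUDA is used in child processes
    result_queue: mp.Queue = mp.Queue()  # process-safe channel for the worker's result

    exp_args = {"prune_out_folder_path": "out/exp_0"}  # placeholder; see Example #10 for the full dict
    proc = mp.Process(
        target=exp_process_function,
        kwargs=dict(
            thread_device_id=0,
            thread_exp_id=0,
            thread_exp_args=exp_args,
            res_q=result_queue,
        ),
    )
    proc.start()
    result = result_queue.get(block=True)  # blocks until the worker calls res_q.put(...)
    proc.join()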
Example #2
def main() -> None:
    args = get_args()

    # Logging.
    # Assuming that an existing log file means that the corresponding results file will be taken.
    datetime_string: Text = logging_utils.setup_unique_log_file(
        root_folder_path=args.out_folder,
        file_name_format=FILE_NAME_FORMAT_MAIN_LOG,
    )
    logger = logging_utils.get_logger(LOGGER_NAME)
    logger.info(args)

    exp_config: exp_config_utils.ExpConfig = exp_config_utils.get_config_from_file(
        config_file_loc=args.exp_config)
    model_config: Optional[model_config_utils.ModelConfig] = None
    if args.model_config:
        model_config = model_config_utils.get_config_from_file(
            config_file_loc=args.model_config)

    # Set mp 'spawn' method for torch.
    mp.set_start_method("spawn")

    # Run everything.
    run_experiments(
        exp_config=exp_config,
        model_checkpoint_path=args.model_checkpoint,
        out_folder_path=args.out_folder,
        model_config=model_config,
        datetime_string=datetime_string,
    )
Example #3
def get_layer_craig_subset(layer: Union[nn.Linear, nn.Conv2d],
                           original_num_nodes: int,
                           prune_percent_per_layer: float,
                           similarity_metric: Union[Text, Dict] = "",
                           prune_type: Text = "craig",
                           **kwargs) -> Tuple[List[int], List[float]]:
    logger = logging_utils.get_logger(LOGGER_NAME)

    assert 0 <= prune_percent_per_layer <= 1, (
        "prune_percent_per_layer ({}) must be within [0,1]".format(
            prune_percent_per_layer))

    assert prune_type in (
        "craig",
        "random",
    ), "prune_type must be 'craig' or 'random'"

    assert (prune_type == "random") or (
        similarity_metric
    ), "similarity_metric must be set for prune_type '{}'".format(prune_type)

    target_num_nodes: int = math.ceil(
        (1 - prune_percent_per_layer) * original_num_nodes)

    subset_nodes: List
    subset_weights: List

    if prune_type == "random":
        subset_nodes = random.sample(list(range(original_num_nodes)),
                                     target_num_nodes)
        subset_weights = [1 for _ in subset_nodes]
    else:  # Assumes similarity_metric is set correctly.
        similarity_matrix: Any
        if isinstance(similarity_metric, dict):
            similarity_matrix = getattr(SimilarityMetrics,
                                        similarity_metric["name"])(
                                            layer=layer, **similarity_metric)
        else:
            similarity_matrix = getattr(SimilarityMetrics,
                                        similarity_metric)(layer=layer)

        (
            subset_nodes,
            subset_weights,
            craig_time,
        ) = craig.get_craig_subset_and_weights(
            similarity_matrix=similarity_matrix, target_size=target_num_nodes)
        logger.info("craig runtime (s): {}".format(craig_time))

    return subset_nodes, subset_weights
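
A usage sketch, assuming get_layer_craig_subset is importable alongside the code above and using made-up layer sizes. The "random" path needs no similarity metric; the "craig" path additionally requires a similarity_metric name (or dict) understood by SimilarityMetrics.

import torch.nn as nn

layer = nn.Linear(in_features=64, out_features=100)
subset_nodes, subset_weights = get_layer_craig_subset(
    layer=layer,
    original_num_nodes=layer.out_features,
    prune_percent_per_layer=0.3,  # keep ceil((1 - 0.3) * 100) = 70 nodes
    prune_type="random",
)
assert len(subset_nodes) == len(subset_weights) == 70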
Example #4
def main() -> None:
    args = get_args()

    experiment_folder_path: Text = args.out_folder
    if not os.path.exists(experiment_folder_path):
        os.makedirs(experiment_folder_path)

    logging_utils.setup_logging(
        os.path.join(
            experiment_folder_path,
            "log-{}.txt".format(datetime.now().strftime("%Y_%m_%d-%H_%M_%S")),
        ))

    logger = logging_utils.get_logger(__name__)
    logger.info(args)

    model_config_or_checkpoint: Union[model_config_utils.ModelConfig, Text]
    if args.model_config:
        model_config_or_checkpoint = model_config_utils.get_config_from_file(
            args.model_config)
    elif args.checkpoint:
        model_config_or_checkpoint = args.checkpoint
    else:
        err_msg = "Either --model_config or --checkpoint must be provided."
        logger.error(err_msg)
        raise ValueError(err_msg)

    train_config: train_config_utils.TrainConfig = train_config_utils.get_config_from_file(
        args.train_config)

    with torch.cuda.device(args.cuda_device_id):
        train_model_with_configs(
            model_config_or_checkpoint=model_config_or_checkpoint,
            train_config=train_config,
            experiment_folder_path=experiment_folder_path,
            resume_training=args.resume_training,
            save_interval=args.save_interval,
            save_best_checkpoint=args.save_best_checkpoint,
            use_gpu=not args.no_cuda,
        )
Example #5
def main() -> None:
    args = get_args()

    config: prune_config_utils.PruneConfig = prune_config_utils.get_config_from_file(
        args.config)
    pruned_output_folder: Text = (args.out_folder if args.out_folder else
                                  config.pruned_model_out_folder)

    # Logging
    datetime_string: Text = datetime.now().strftime("%Y_%m_%d-%H_%M_%S")
    logging_utils.setup_logging(log_file_loc=os.path.join(
        pruned_output_folder,
        FILE_NAME_FORMAT_LOG.format(datetime_string),
    ))
    logger = logging_utils.get_logger(LOGGER_NAME)
    logger.info(args)

    prune_network(
        prune_config=config,
        pruned_output_folder=pruned_output_folder,
        model_checkpoint_path=args.model,
    )
Example #6
import json
import traceback

import boto3

from utils.aws_utils import setup_s3_client, put_template_into_s3
from utils.pipeline_utils import put_job_failure, put_job_success, continue_job_later, \
    PipelineUserParameters, PipelineStackConfig, load_pipeline_artifacts, \
    parse_override_params, get_file_from_artifact, generate_output_artifact
from utils.stack_utils import stack_exists, get_stack_status, \
    stack_delete, change_set_exists, execute_change_set, get_change_set_status, delete_change_set, create_change_set, \
    update_stack, create_stack, get_stack_output

from utils.logging_utils import get_logger

logger = get_logger()


def start_stack_create_or_update(cf,
                                 job_id,
                                 stack_name,
                                 template_url,
                                 config: PipelineStackConfig,
                                 update=False,
                                 role_arn=None):
    if update:
        status = get_stack_status(cf, stack_name)
        if status not in [
                'CREATE_COMPLETE', 'ROLLBACK_COMPLETE', 'UPDATE_COMPLETE',
                'UPDATE_ROLLBACK_COMPLETE'
        ]:
Example #7
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import with_statement
from __future__ import absolute_import

import tarfile
import argparse
import json
import shutil

from utils.logging_utils import get_logger
from utils.sha_utils import get_digest_sha256

logger = get_logger(__name__)


def arg_parse():
    parser = argparse.ArgumentParser(
        description='Make resulting update docker image')
    parser.add_argument('--difftar',
                        help='diff tar previously produced',
                        type=str,
                        required=True)
    parser.add_argument('--oldimg',
                        help='old img tar',
                        type=str,
                        required=True)
    parser.add_argument('--output',
                        help='new tar to load to docker',
                        type=str,
Example #8
import json
import tempfile
import zipfile
import os
import boto3

from utils.aws_utils import file_to_dict
from utils.logging_utils import get_logger

code_pipeline = boto3.client('codepipeline')
logger = get_logger()


class PipelineUserParameters:
    def __init__(self, job_data, lambda_ctx):
        """Decodes the JSON user parameters and validates the required properties passed into Lambda function

        :param job_data: The job data structure containing the UserParameters string which should be a valid JSON structure
        :param lambda_ctx: Lambda context

        Possible ActionMode:
            - CREATE_UPDATE
            - DELETE_ONLY
            - REPLACE_ON_FAILURE
            - CHANGE_SET_REPLACE
            - CHANGE_SET_EXECUTE
        """
        logger.debug("getting user parameters")
        user_parameters = None
        self.TemplateFile = None
        self.TemplateArtifact = None
Example #9
def run_experiments(
    exp_config: exp_config_utils.ExpConfig,
    model_checkpoint_path: Text,
    out_folder_path: Text,
    model_config: Optional[model_config_utils.ModelConfig],
    datetime_string: Text,
) -> None:
    """
    For each experiment:
    - Prune network, save pruned model (pruner).
    - Get train and test accuracy for pruned model on MNIST (eval_model).
    - Fine tune model.
    - Get train and test accuracy for pruned+finetuned model on MNIST (eval_model).

    Accumulate size, train accuracy, and test accuracy for each experiment.
    Print these out in a csv (or formatted that way).
    """

    # Logging.
    logger = logging_utils.get_logger(name=LOGGER_NAME)

    # Track total run time.
    full_start_time = time.time()

    # Original model.
    original_model_name: Text = os.path.basename(model_checkpoint_path)
    original_size, original_train_acc, original_test_acc = evaluate_model(
        model_path=model_checkpoint_path,
        dataset_name=exp_config.evaluation_dataset_name,
        batch_size=exp_config.evaluation_dataset_batch_size,
        model_size_type="numel",
    )
    original_model_results: List = [
        original_size,
        original_train_acc,
        original_test_acc,
    ]
    logger.info(
        PRINT_FORMAT.format("original", original_size, original_train_acc,
                            original_test_acc))

    clear_mem(logger)

    # Experiment results container.
    # experiment_vals: List[List] = []
    experiment_vals: Dict[int, List] = {}

    try:
        prune_type: Text = exp_config.prune_type
        if prune_type == "craig":
            run_craig_experiments(
                experiment_vals=experiment_vals,
                exp_config=exp_config,
                original_model_name=original_model_name,
                original_model_path=model_checkpoint_path,
                original_model_config=model_config,
                original_model_results=original_model_results,
                out_folder_path=out_folder_path,
                datetime_string=datetime_string,
            )
        # TODO: Make Mussay compatible again.
        # elif prune_type == "mussay":
        #     run_mussay_experiments(
        #         experiment_vals=experiment_vals,
        #         prune_out_folder_root_path=prune_out_folder_root_path,
        #         original_model_name=original_model_name,
        #         original_model_path=original_model_path,
        #         original_model_config_path=original_model_config_path,
        #         original_model_train_config_path=original_model_train_config_path,
        #         original_model_results=original_model_results,
        #         evaluation_epochs_list=evaluation_epochs_list,
        #     )
        else:
            raise ValueError("prune_type not supported: {}".format(prune_type))
    finally:
        # Write results to csv.
        logger.info("writing final results...")
        out_csv_path: Text = write_results_to_csv(
            experiment_vals=experiment_vals,
            out_folder_path=out_folder_path,
            file_name_format=FILE_NAME_FORMAT_MAIN_RESULTS,
            datetime_string=datetime_string,
        )
        logger.info("results written to: {}".format(out_csv_path))

    logger.info("Total run time: {}".format(
        str(timedelta(seconds=time.time() - full_start_time))))
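
For reference, each entry accumulated in experiment_vals (and flushed by write_results_to_csv) follows the row layout assembled in Examples #10 and #11; all values below are made up.

experiment_vals = {}
original_model_results = [266610, 0.98, 0.97]  # size, train acc, test acc (illustrative)
res_vals = ["", 20, 133305, 0.97, 0.96]        # spacer, epoch, size, train acc, test acc (illustrative)
experiment_vals[3] = (
    [3, "checkpoint-best.pt", "fc_1-0_5--fc_2-0_5", ""]  # exp_id, original model name, exp name, spacer
    + original_model_results.copy()
    + res_vals
)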
Example #10
def run_craig_experiments(
    experiment_vals: Dict[int, List],
    exp_config: exp_config_utils.ExpConfig,
    original_model_name: Text,
    original_model_path: Text,
    original_model_config: Optional[model_config_utils.ModelConfig],
    original_model_results: List,
    out_folder_path: Text,
    datetime_string: Text,
) -> None:
    # Logging.
    logger = logging_utils.get_logger(name=LOGGER_NAME)

    clear_mem(logger)

    # Set up multiprocessing and cuda.
    num_cuda_devices: int = torch.cuda.device_count()
    mem_per_cuda_device: List[int] = [
        torch.cuda.get_device_properties(device_id).total_memory  # In bytes.
        for device_id in range(num_cuda_devices)
    ]
    cuda_device_names: List[Text] = [
        torch.cuda.get_device_properties(device_id).name
        for device_id in range(num_cuda_devices)
    ]
    max_model_size: int = (-1 if exp_config.cuda_model_max_mb == -1 else
                           1000 * 1000 * exp_config.cuda_model_max_mb)
    max_procs_per_device: List[int] = [
        int(  # Take the floor.
            1 if (max_model_size == -1) else
            ((mem * exp_config.cuda_max_percent_mem_usage) / max_model_size))
        for mem in mem_per_cuda_device
    ]
    max_process_count: int = sum(max_procs_per_device)
    logger.info("Found the following cuda devices: {}".format([
        "(name='{cdn}', mem={cdm}, max_exp={cde})".format(cdn=cdn,
                                                          cdm=cdm,
                                                          cde=cde)
        for cdn, cdm, cde in zip(cuda_device_names, mem_per_cuda_device,
                                 max_procs_per_device)
    ]))
    # TODO: Give each config an id. Allow each process to save its results to a list.

    # Set up root configs.
    prune_config_root: prune_config_utils.PruneConfig = prune_config_utils.PruneConfig(
        {
            "prune_type": "craig",
            "prune_params": {},
            "original_model_path": original_model_path,
        })
    if "model_input_shape" in exp_config._raw_dict:
        prune_config_root.model_input_shape = exp_config.model_input_shape
    if "data_transform_name" in exp_config._raw_dict:
        prune_config_root.data_transform_name = exp_config.data_transform_name
    finetuning_train_config: train_config_utils.TrainConfig = exp_config.finetuning_train_config

    # Create experiment parameters.
    # prune_layer_params: OrderedDict = OrderedDict(
    prune_layer_params: Dict = exp_config.prune_params[
        prune_config_utils.KEY_LAYER_PARAMS]
    prune_param_values: List = []
    layer_name_map: List[Text] = []
    param_name_map: List[Text] = []
    for layer_name, layer_params in prune_layer_params.items():
        for param_name, param_list in layer_params.items():
            prune_param_values.append(param_list)
            layer_name_map.append(layer_name)
            param_name_map.append(param_name)
    exp_value_permutations: List[List] = list(
        itertools.product(*prune_param_values))

    # Create list of experiment function arguments.
    exp_function_arguments: List[Dict] = []
    exp_names: List[Text] = []
    for exp_id, param_permutation in enumerate(exp_value_permutations):
        # Start with an exp_id of 0.

        # Build layer params from this param_permutation.
        exp_layer_params: Dict = {}
        for exp_param_ind, exp_param in enumerate(param_permutation):
            exp_param_dict = exp_layer_params.setdefault(
                layer_name_map[exp_param_ind], {})
            exp_param_dict[param_name_map[exp_param_ind]] = exp_param
        prune_config_root.prune_params = {
            prune_config_utils.KEY_LAYER_PARAMS: exp_layer_params
        }

        # Create experiment name.
        exp_name_temp_list = []
        for e_layer_name, e_layer in exp_layer_params.items():
            e_params = [
                get_exp_str_from_param(e_p) for e_p in e_layer.values()
            ]
            exp_name_temp_list.append("{}-{}".format(e_layer_name,
                                                     "_".join(e_params)))
        exp_name = "--".join(exp_name_temp_list)

        # Name the output folder after the experiment name.
        prune_out_folder_path: Text = os.path.join(out_folder_path, exp_name)

        exp_names.append(exp_name)
        exp_function_arguments.append(
            dict(
                exp_id=exp_id,
                prune_config=prune_config_utils.PruneConfig(
                    prune_config_root._raw_dict.copy()),
                prune_out_folder_path=prune_out_folder_path,
                finetuning_train_config=finetuning_train_config,
                original_model_config=original_model_config,
                evaluation_epochs_list=exp_config.evaluation_epochs,
            ))

    logger.info("All experiment configs: {}".format(exp_function_arguments))
    num_experiments_total: int = len(exp_function_arguments)
    num_experiments_complete: int = 0
    next_exp_id: int = 0
    processes_per_device: List[Dict] = [{} for i in range(num_cuda_devices)]

    # Results queue
    exp_results_q: mp.Queue = mp.Queue()

    def exp_thread_function(thread_device_id: int, thread_exp_id: int,
                            thread_exp_args: Dict):
        # NOTE: Using an mp.Queue because it is process-safe; this is not the most elegant solution.
        thread_q: mp.Queue = mp.Queue()
        proc = mp.Process(
            target=exp_process_function,
            kwargs=dict(
                thread_device_id=thread_device_id,
                thread_exp_id=thread_exp_id,
                thread_exp_args=thread_exp_args,
                res_q=thread_q,
            ),
        )
        try:
            proc.start()
            proc.join()
        except Exception as e:
            logger.error(e, exc_info=True)
        finally:
            # Adding this empty result so the thread can exit if something breaks.
            thread_q.put([])
        exp_results_q.put(
            (thread_device_id, thread_exp_id, thread_q.get(block=True)))
        thread_q.close()

    # First, attempt to start new processes.
    for device_id, max_procs in enumerate(max_procs_per_device):
        if next_exp_id >= num_experiments_total:
            break
        for pid in range(max_procs):
            if next_exp_id >= num_experiments_total:
                break

            thread = threading.Thread(
                target=exp_thread_function,
                kwargs=dict(
                    thread_device_id=device_id,
                    thread_exp_id=next_exp_id,
                    thread_exp_args=exp_function_arguments[next_exp_id],
                ),
            )
            thread.start()
            processes_per_device[device_id][next_exp_id] = thread
            next_exp_id += 1

    # Now, continually check for free devices.
    while num_experiments_complete < num_experiments_total:
        # Since there are still experiments to complete, just wait for results.
        next_result = exp_results_q.get(block=True)
        res_device_id, res_exp_id, res_vals = next_result
        logger.info("Got result for {} : {}".format(exp_names[res_exp_id],
                                                    next_result))

        # If a result is available, then an experiment is done.
        # Join on the thread, then remove from list.
        processes_per_device[res_device_id][res_exp_id].join()
        del processes_per_device[res_device_id][res_exp_id]

        # Increment the completion count.
        num_experiments_complete += 1

        if not res_vals:
            logger.warning(
                "Results were empty so exp may have failed. Not saving.")
        else:
            # Add results to total results.
            experiment_vals[res_exp_id] = ([
                res_exp_id,
                original_model_name,
                exp_names[res_exp_id],
                "",
            ] + original_model_results.copy() + res_vals)

            # Incrementally save experiment_vals.
            write_results_to_csv(
                experiment_vals=experiment_vals,
                out_folder_path=out_folder_path,
                file_name_format=FILE_NAME_FORMAT_MAIN_RESULTS,
                datetime_string=datetime_string,
            )

        logger.info("Jobs complete: {}/{} ({:.2%})".format(
            num_experiments_complete,
            num_experiments_total,
            num_experiments_complete / num_experiments_total,
        ))

        # Add a new process to this device, if needed.
        if next_exp_id < num_experiments_total:
            num_seconds_to_sleep: float = 5
            logger.info(
                "Sleeping for {} seconds to allow VRAM to clear...".format(
                    num_seconds_to_sleep))
            time.sleep(num_seconds_to_sleep)
            logger.info("Waking up from sleep.")

            logger.info("Starting next exp on device {} : ({}) {}".format(
                res_device_id, next_exp_id, exp_names[next_exp_id]))
            thread = threading.Thread(
                target=exp_thread_function,
                kwargs=dict(
                    thread_device_id=res_device_id,
                    thread_exp_id=next_exp_id,
                    thread_exp_args=exp_function_arguments[next_exp_id],
                ),
            )
            thread.start()
            processes_per_device[res_device_id][next_exp_id] = thread
            next_exp_id += 1

        # Print running procs for reference.
        logger.info("Current process count per device: {}".format(
            [len(procs) for procs in processes_per_device]))
        logger.info("Current experiment IDs per device: {}".format(
            [list(procs.keys()) for procs in processes_per_device]))

    logger.info("All {}/{} jobs completed".format(num_experiments_complete,
                                                  num_experiments_total))
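
As a concrete worked example of the per-device capacity calculation above, with hypothetical numbers for one 16 GB device:

cuda_model_max_mb = 2000                 # stands in for exp_config.cuda_model_max_mb
cuda_max_percent_mem_usage = 0.8         # stands in for exp_config.cuda_max_percent_mem_usage
mem_per_cuda_device = [16_000_000_000]   # one device, total memory in bytes

max_model_size = -1 if cuda_model_max_mb == -1 else 1000 * 1000 * cuda_model_max_mb  # 2e9 bytes
max_procs_per_device = [
    int(1 if max_model_size == -1 else (mem * cuda_max_percent_mem_usage) / max_model_size)
    for mem in mem_per_cuda_device
]
print(max_procs_per_device)  # [6] -> floor(16e9 * 0.8 / 2e9) = 6 concurrent experiments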
Example #11
def run_single_experiment(
    exp_id: int,
    prune_config: prune_config_utils.PruneConfig,
    prune_out_folder_path: Text,
    finetuning_train_config: train_config_utils.TrainConfig,
    original_model_config: Optional[model_config_utils.ModelConfig],
    evaluation_epochs_list: Sequence[Union[Text, int]],
) -> List:
    # Logging.
    logger = logging_utils.get_logger(name=LOGGER_NAME)
    logger.info("Starting experiment with exp_id: {}".format(exp_id))

    # Set up prune folder.
    if not os.path.exists(prune_out_folder_path):
        os.makedirs(prune_out_folder_path)

    # Copy original model config.
    if original_model_config:
        general_config_utils.write_config_to_file(
            original_model_config,
            os.path.join(prune_out_folder_path,
                         "config-model-original_model.json"),
        )

    # Prune.
    logger.info("pruning...")
    pruner.prune_network(prune_config=prune_config,
                         pruned_output_folder=prune_out_folder_path)
    pruned_model_path: Text = os.path.join(prune_out_folder_path,
                                           pruner.FILE_NAME_MODEL)
    # pruned_model_path: Text = os.path.join(
    #     prune_out_folder_path, pruner.FILE_NAME_STATE_DICT
    # )

    # Finetune.
    logger.info("finetuning...")
    finetuning_folder_path: Text = os.path.join(prune_out_folder_path,
                                                "finetuning")
    stat_counters: Dict[
        Text, train_utils.StatCounter] = train_algo_1.train_model_with_configs(
            model_config_or_checkpoint=pruned_model_path,
            train_config=finetuning_train_config,
            experiment_folder_path=finetuning_folder_path,
            save_interval=0,  # Set to zero to never save per epoch, to save space.
            save_best_checkpoint=True,
            use_gpu=True,
        )

    # Save results from stat_counters: train/test accuracy, and size.
    eval_results: List = []
    evaluation_epochs_list = [(finetuning_train_config.num_epochs if
                               (epoch == -1) else epoch)
                              for epoch in evaluation_epochs_list]
    model_size_epochs: train_utils.StatCounter = stat_counters[
        "model_size_epochs"]
    train_acc_epochs: train_utils.StatCounter = stat_counters[
        "train_acc_epochs"]
    test_acc_epochs: train_utils.StatCounter = stat_counters["test_acc_epochs"]
    for epoch in evaluation_epochs_list:
        model_size: int
        train_acc: float
        test_acc: float
        if epoch == "best":
            test_acc_ind = max(
                range(len(test_acc_epochs._counter)),
                key=lambda x: test_acc_epochs._counter[x],
            )
            test_acc = test_acc_epochs._counter[test_acc_ind]
            train_acc = train_acc_epochs._counter[test_acc_ind]
            model_size = model_size_epochs._counter[test_acc_ind]
        elif isinstance(epoch, int):
            test_acc = test_acc_epochs._counter[epoch]
            train_acc = train_acc_epochs._counter[epoch]
            model_size = model_size_epochs._counter[epoch]
        else:
            raise TypeError(
                "Found unsupported type in evaluation_epochs_list: {}".format(
                    epoch))

        eval_results.extend([
            "",
            epoch,
            model_size,
            train_acc,
            test_acc,
        ])
        logger.info(
            PRINT_FORMAT.format(
                "Epoch {}".format(epoch),
                model_size,
                train_acc,
                test_acc,
            ))

    return eval_results
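
A small illustration of how evaluation_epochs_list is normalized and how the "best" epoch is selected above; the accuracy values are made up.

num_epochs = 20
evaluation_epochs_list = [1, -1, "best"]
evaluation_epochs_list = [num_epochs if epoch == -1 else epoch
                          for epoch in evaluation_epochs_list]
print(evaluation_epochs_list)  # [1, 20, 'best']

test_acc_by_epoch = [0.90, 0.95, 0.93]  # made-up accuracies; index 0 is before finetuning
best_ind = max(range(len(test_acc_by_epoch)), key=lambda i: test_acc_by_epoch[i])
print(best_ind, test_acc_by_epoch[best_ind])  # 1 0.95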
Example #12
def train_model_with_configs(
    model_config_or_checkpoint: Union[model_config_utils.ModelConfig, Text],
    train_config: train_config_utils.TrainConfig,
    experiment_folder_path: Text,
    resume_training: bool = False,
    save_interval: int = 1,
    save_best_checkpoint: bool = True,
    use_gpu: bool = True,
    # cuda_device_id: int = 0,
) -> Dict[Text, train_utils.StatCounter]:
    logger = logging_utils.get_logger(__name__)
    log_interval: int = 100

    assert save_interval >= 0, "save_interval must be >= 0"
    save_checkpoint_per_epoch: bool = (save_interval != 0)

    torch_device = torch.device("cuda" if use_gpu else "cpu")
    if "random_seed" in train_config._raw_dict:
        random.seed(train_config.random_seed)
        np.random.seed(train_config.random_seed)
        torch.manual_seed(train_config.random_seed)
        torch.cuda.manual_seed(train_config.random_seed)
    # Using this for reproducibility
    torch.backends.cudnn.deterministic = True

    random_info_str: Text = """Random info:
random.setstate({random})
np.random.set_state({nprandom})
torch.manual_seed({torch})
torch.cuda.manual_seed({torchcuda})
torch.backends.cudnn.deterministic = {torchcudnn}
    """.format(
        random=random.getstate(),
        nprandom=np.random.get_state(),
        torch=torch.initial_seed(),
        torchcuda=torch.cuda.initial_seed(),
        torchcudnn=torch.backends.cudnn.deterministic,
    )
    logger.info(random_info_str)

    # Set up some experiment directories.
    checkpoints_folder_path: Text = os.path.join(experiment_folder_path,
                                                 FOLDER_NAME_CHECKPOINTS)
    if not os.path.exists(checkpoints_folder_path):
        os.makedirs(checkpoints_folder_path)
    stats_folder_path: Text = os.path.join(experiment_folder_path, "stats")

    # Set up counters.
    train_loss_batches: train_utils.StatCounter = train_utils.StatCounter(
        default_save_params=dict(
            folder_path=stats_folder_path,
            file_prefix="train_loss_batches",
            xlabel="batch",
            ylabel="loss",
            title_prefix="train_loss_batches",
        ))
    train_loss_epochs: train_utils.StatCounter = train_utils.StatCounter(
        default_save_params=dict(
            folder_path=stats_folder_path,
            file_prefix="train_loss_epochs",
            xlabel="epoch",
            ylabel="loss",
            title_prefix="train_loss_epochs",
        ))
    train_acc_batches: train_utils.StatCounter = train_utils.StatCounter(
        default_save_params=dict(
            folder_path=stats_folder_path,
            file_prefix="train_accuracy_batches",
            xlabel="batch",
            ylabel="accuracy",
            title_prefix="train_accuracy_batches",
        ))
    train_acc_epochs: train_utils.StatCounter = train_utils.StatCounter(
        default_save_params=dict(
            folder_path=stats_folder_path,
            file_prefix="train_accuracy_epochs",
            xlabel="epoch",
            ylabel="accuracy",
            title_prefix="train_accuracy_epochs",
        ))
    test_loss_epochs: train_utils.StatCounter = train_utils.StatCounter(
        default_save_params=dict(
            folder_path=stats_folder_path,
            file_prefix="test_loss_epochs",
            xlabel="epoch",
            ylabel="loss",
            title_prefix="test_loss_epochs",
        ))
    test_acc_epochs: train_utils.StatCounter = train_utils.StatCounter(
        default_save_params=dict(
            folder_path=stats_folder_path,
            file_prefix="test_accuracy_epochs",
            xlabel="epoch",
            ylabel="accuracy",
            title_prefix="test_accuracy_epochs",
        ))
    model_size_epochs: train_utils.StatCounter = train_utils.StatCounter(
        default_save_params=dict(
            folder_path=stats_folder_path,
            file_prefix="model_size_epochs",
            xlabel="epoch",
            ylabel="number of model parameters",
            title_prefix="model_size_epochs",
        ))
    stat_counters: Dict[Text, train_utils.StatCounter] = {
        "train_loss_batches": train_loss_batches,
        "train_loss_epochs": train_loss_epochs,
        "train_acc_batches": train_acc_batches,
        "train_acc_epochs": train_acc_epochs,
        "test_loss_epochs": test_loss_epochs,
        "test_acc_epochs": test_acc_epochs,
        "model_size_epochs": model_size_epochs,
    }

    # Get data.
    data_transform = train_utils.DATASET_TRANSFORMS[train_config.dataset_name]
    train_loader = torch.utils.data.DataLoader(
        train_utils.DATASET_FUNCTIONS[train_config.dataset_name](
            train_utils.DATA_FOLDER_PATH,
            train=True,
            download=True,
            transform=data_transform,
        ),
        batch_size=train_config.batch_size_train,
        shuffle=True,
    )
    test_loader = torch.utils.data.DataLoader(
        train_utils.DATASET_FUNCTIONS[train_config.dataset_name](
            train_utils.DATA_FOLDER_PATH,
            train=False,
            download=True,
            transform=data_transform,
        ),
        batch_size=train_config.batch_size_test,
        shuffle=True,
    )

    # Load model.
    model_config: Optional[model_config_utils.ModelConfig] = None
    optimizer_state_dict: Optional[Any] = None
    scheduler_state_dict: Optional[Any] = None
    resume_epoch: Optional[int] = None
    model: torch.nn.Module
    if isinstance(model_config_or_checkpoint, model_config_utils.ModelConfig):
        model_config = model_config_or_checkpoint
        model_py_module = importlib.import_module("models.{}".format(
            model_config.model_architecture))
        Model = model_py_module.Model  # type: ignore
        model = Model(**model_config.model_params)
    elif isinstance(model_config_or_checkpoint, Text):
        model_checkpoint_path: Text = model_config_or_checkpoint
        loaded = torch.load(model_checkpoint_path, map_location=torch_device)
        if isinstance(loaded, torch.nn.Module):
            # Model.
            model = loaded
        else:
            # State dict.
            model_config = model_config_utils.ModelConfig(
                loaded["model_config"])
            model_py_module = importlib.import_module("models.{}".format(
                model_config.model_architecture))
            Model = model_py_module.Model  # type: ignore
            model = Model(**model_config.model_params)
            model.load_state_dict(loaded["model_state_dict"])
            if resume_training:
                optimizer_state_dict = loaded.get("optimizer_state_dict", None)
                scheduler_state_dict = loaded.get("scheduler_state_dict", None)
                resume_epoch = loaded.get("epoch", None)

    else:
        err_msg: Text = "Model config or path to model checkpoint must be provided."
        logger.error(err_msg)
        raise TypeError(err_msg)
    model = model.to(device=torch_device)

    # Just using basic Stochastic Gradient Descent.
    # TODO: Add weight decay? May not be necessary for this task.
    optimizer = torch.optim.SGD(
        params=model.parameters(),
        lr=train_config.learning_rate,
        momentum=train_config.momentum,
        weight_decay=train_config.weight_decay,
    )
    if optimizer_state_dict:
        optimizer.load_state_dict(optimizer_state_dict)
    # optimizer = torch.optim.Adadelta(model.parameters(), lr=learning_rate)

    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer,
        step_size=train_config.lr_step_size,
        gamma=train_config.gamma)
    if scheduler_state_dict:
        scheduler.load_state_dict(scheduler_state_dict)

    # Set up first epoch, if need to resume.
    first_epoch: int = 1
    if resume_epoch:
        first_epoch = resume_epoch

    try:
        # First, get initial train and test scores.
        initial_train_acc, initial_train_loss = eval_model.evaluate_model(
            model=model, dataloader=train_loader, torch_device=torch_device)
        train_acc_batches.add(initial_train_acc)
        train_acc_epochs.add(initial_train_acc)
        train_loss_batches.add(initial_train_loss)
        train_loss_epochs.add(initial_train_loss)
        initial_test_acc, initial_test_loss = eval_model.evaluate_model(
            model=model, dataloader=test_loader, torch_device=torch_device)
        test_acc_epochs.add(initial_test_acc)
        test_loss_epochs.add(initial_test_loss)
        model_size_epochs.add(
            eval_model.get_number_of_model_parameters(model=model))

        # Save initial model checkpoint.
        if save_checkpoint_per_epoch:
            save_model_and_state_dict_checkpoint(
                model=model,
                checkpoints_folder_path=checkpoints_folder_path,
                epoch=0,
                model_config=model_config,
                optimizer=optimizer,
                scheduler=scheduler,
            )

        clear_mem(logger)

        # Track best test accuracy.
        best_test_acc: float = initial_test_acc

        # Train.
        for epoch in range(first_epoch, train_config.num_epochs + 1):
            train(
                logger,
                log_interval,
                model,
                train_loader,
                epoch,
                optimizer,
                scheduler,
                torch_device,
                train_loss_batches,
                train_loss_epochs,
                train_acc_batches,
                train_acc_epochs,
            )

            test_acc, test_loss = eval_model.evaluate_model(
                model=model, dataloader=test_loader, torch_device=torch_device)
            test_acc_epochs.add(test_acc)
            test_loss_epochs.add(test_loss)
            model_size_epochs.add(
                eval_model.get_number_of_model_parameters(model=model))

            scheduler.step()

            # Save best model checkpoint, if needed.
            if test_acc > best_test_acc:
                best_test_acc = test_acc
                if save_best_checkpoint:
                    save_model_and_state_dict_checkpoint(
                        model=model,
                        checkpoints_folder_path=checkpoints_folder_path,
                        epoch=epoch,
                        checkpoint_name=BEST_CHECKPOINT_EPOCH_TEXT,
                        model_config=model_config,
                        optimizer=optimizer,
                        scheduler=scheduler,
                    )

            # Save incremental checkpoint, if needed.
            if save_checkpoint_per_epoch and (
                (epoch == 1) or (epoch == train_config.num_epochs) or
                ((epoch % save_interval) == 0)):
                save_model_and_state_dict_checkpoint(
                    model=model,
                    checkpoints_folder_path=checkpoints_folder_path,
                    epoch=epoch,
                    model_config=model_config,
                    optimizer=optimizer,
                    scheduler=scheduler,
                )

            # Incrementally save losses per epoch.
            for stat_counter in stat_counters.values():
                stat_counter.save_default()

            clear_mem(logger)

    except Exception as exception:
        logger.error(exception, exc_info=True)
    finally:
        # Save losses.
        for stat_counter in stat_counters.values():
            stat_counter.save_default()

        return stat_counters
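
A usage sketch mirroring Examples #4 and #11; the file paths are placeholders and the config helpers are assumed to come from this repository.

train_config = train_config_utils.get_config_from_file("configs/train.json")  # placeholder path
stat_counters = train_model_with_configs(
    model_config_or_checkpoint="checkpoints/checkpoint-best.pt",  # or a ModelConfig instance
    train_config=train_config,
    experiment_folder_path="experiments/run_01",
    save_interval=0,              # zero disables per-epoch checkpoints
    save_best_checkpoint=True,
    use_gpu=torch.cuda.is_available(),
)
final_test_acc = stat_counters["test_acc_epochs"]._counter[-1]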
Example #13
def prune_network(prune_config: prune_config_utils.PruneConfig,
                  pruned_output_folder: Text,
                  model_checkpoint_path: Optional[Text] = None,
                  **kwargs) -> None:
    """
    Can provide a model_checkpoint_path to override any model checkpoint path
    specified in prune_config. 
    """
    logger = logging_utils.get_logger(LOGGER_NAME)

    # Create output folder, if it does not exist.
    if not os.path.exists(pruned_output_folder):
        os.makedirs(pruned_output_folder)

    # Save original prune config.
    general_config_utils.write_config_to_file(
        prune_config, os.path.join(pruned_output_folder,
                                   FILE_NAME_PRUNE_CONFIG))

    # Load model.
    model_path: Text
    if model_checkpoint_path:
        model_path = model_checkpoint_path
        prune_config.original_model_path = model_checkpoint_path
    else:
        model_path = prune_config.original_model_path
    logger.info("Loading model checkpoint from: {}".format(model_path))
    load_location = torch.device("cpu")  # Could be None to use torch.load's default mapping.
    model = torch.load(model_path, map_location=load_location)

    with torch.no_grad():
        # Perform pruning.
        model.eval()
        logger.info("Starting pruning for prune_type: {}".format(
            prune_config.prune_type))
        if prune_config.prune_type == "craig":
            prune_network_with_craig(model=model,
                                     prune_config=prune_config,
                                     **kwargs)
        elif prune_config.prune_type == "mussay":
            torch_device: torch.device = torch.device("cpu")
            prune_network_with_mussay(model=model,
                                      prune_config=prune_config,
                                      torch_device=torch_device,
                                      **kwargs)
        else:
            raise ValueError("prune_type not supported: {}".format(
                prune_config.prune_type))

    # Save pruned model.
    out_model_path: Text = os.path.join(pruned_output_folder, FILE_NAME_MODEL)
    torch.save(model, out_model_path)
    logger.info("Pruning complete")
    logger.info(model)

    # Save new model config.
    model_architecture = model.ARCHITECTURE_NAME
    out_model_config: Dict
    if model_architecture == "vgg":
        out_model_config = {
            "model_architecture": "vgg",
            "model_params": {
                "vgg_version":
                model.vgg_version,
                "num_classes":
                model.num_classes,
                "pretrained_imagenet":
                getattr(model, "pretrained_imagenet", False),
            },
        }
    elif model_architecture == "fc_classifier":
        fc_layers = [
            layer for layer in model.sequential_module
            if isinstance(layer, nn.Linear)
        ]
        out_model_config = {
            "model_architecture": "fc_classifier",
            "model_params": {
                "input_shape": [28, 28],
                "layers": [l.out_features for l in fc_layers[:-1]],
                "output_dim": 10,
            },
        }
    elif model_architecture == "fc_2":
        fc_layers = [
            layer for layer in model.sequential_module
            if isinstance(layer, nn.Linear)
        ]
        out_model_config = {
            "model_architecture": "fc_2",
            "model_params": {
                "input_shape": [28, 28],
                "layer_1_dim": fc_layers[0].out_features,
                "layer_2_dim": fc_layers[1].out_features,
                "output_dim": 10,
            },
        }
    else:
        # Not supported.
        logger.info("Model architecture config not supported: {}".format(
            model_architecture))
        return

    out_model_config_path: Text = os.path.join(pruned_output_folder,
                                               FILE_NAME_MODEL_CONFIG)
    with open(out_model_config_path, "w") as out_model_config_file:
        json.dump(out_model_config, out_model_config_file)
    logger.info("Wrote model config to: {}".format(out_model_config_path))
Example #14
def prune_network_with_craig(model: nn.Module,
                             prune_config: prune_config_utils.PruneConfig,
                             **kwargs) -> None:
    """This currently assumes that all fully connected layers are directly in
    one sequence, and that there are no non-FC layers after the last FC layer
    of that sequence."""
    logger = logging_utils.get_logger(LOGGER_NAME)

    model = model.to(torch.device("cpu"))

    # Get params for each layer.
    layer_params: Dict = prune_config.prune_params[
        prune_config_utils.KEY_LAYER_PARAMS]

    # Get list of model layers/parameters.
    model_layers: List[nn.Module] = model.ordered_unpacking
    num_layers: int = len(model_layers)
    output_layer_index: int = num_layers - 1
    model_data_shapes: List = [[] for _ in model_layers]

    # Use model input shape to get data output shape for each layer.
    def layer_shape_hook(layer_ind):
        def inner(self, input, output):
            # Discard the batch size.
            model_data_shapes[layer_ind] = output.data.shape[1:]

        return inner

    model_hooks = []
    for layer_ind, layer in enumerate(model_layers):
        model_hooks.append(
            layer.register_forward_hook(layer_shape_hook(layer_ind)))
    run_single_data_point(
        model=model,
        model_input_shape=prune_config.model_input_shape,
        data_transform_name=prune_config.data_transform_name,
    )
    for mhook in model_hooks:
        mhook.remove()

    curr_layer_i: int = 0
    while curr_layer_i < output_layer_index:
        # Iterate through layers, prune as necessary.
        curr_layer: nn.Module = model_layers[curr_layer_i]
        curr_layer_type: Type[nn.Module] = type(curr_layer)
        curr_layer_params: Optional[Dict]
        curr_layer_prune_func: Optional[Callable[..., Tuple[List[int],
                                                            List[float]]]]

        curr_layer_prune_func = CRAIG_LAYER_FUNCTION_MAP.get(
            curr_layer_type, None)
        curr_layer_params = layer_params.get(
            LAYER_NAME_MAP.get(
                curr_layer_type,
                None),  # First try to get the current layer params
            layer_params.get(
                prune_config_utils.
                KEY_LAYER_ALL,  # Otherwise try to get an "all" overriding param
                None,
            ),
        )

        if (not curr_layer_prune_func) or (not curr_layer_params):
            # If either the prune function or prune params was not found, skip.
            curr_layer_i += 1
            continue

        # Prune the current layer.
        subset_nodes: List[int]
        subset_weights: List[float]
        subset_nodes, subset_weights = curr_layer_prune_func(
            layer=curr_layer, **(curr_layer_params))
        subset_len: int = len(subset_nodes)

        next_layer_i: int = curr_layer_i + 1
        while next_layer_i < num_layers:
            # Find the next prunable layer and update the weights accordingly.
            next_layer: nn.Module = model_layers[next_layer_i]
            next_layer_type: Type[nn.Module] = type(next_layer)

            if next_layer_type not in CRAIG_LAYER_FUNCTION_MAP:
                # If this layer is not prunable, skip.
                next_layer_i += 1
                continue

            if isinstance(next_layer, nn.Conv2d):
                # Change conv in channels to match the pruned subset.
                next_layer.weight = nn.Parameter(
                    next_layer.weight[:, subset_nodes])
                next_layer.in_channels = subset_len
                next_layer._in_channels = (
                    subset_len  # Not sure if this is necessary.
                )
            elif isinstance(next_layer, nn.Linear):
                # Assuming a pre-Linear flatten op, need to find the weights
                # that correspond to the channels that were kept in the pruning
                # of the previous layer.
                num_weights_per_channel: int

                if isinstance(curr_layer, nn.Conv2d):
                    # If the initially pruned layer was a conv, then re-iterate
                    # from curr_layer to next_layer, searching for the last
                    # conv/pooling/relu/etc before a flatten-esque operation.
                    for temp_i in range(curr_layer_i, next_layer_i):
                        if len(model_data_shapes[temp_i]) != 3:
                            break
                        num_weights_per_channel = int(
                            np.prod(model_data_shapes[temp_i][1:]))
                else:
                    # Otherwise, the initially pruned layer must have been a
                    # linear layer. In that case, we are currently assuming
                    # that only other linear/relu/flatten/etc layers lie in
                    # between. So, we can simply use the number of original
                    # channels/features, which should be =1.
                    num_weights_per_channel = int(
                        np.prod(model_data_shapes[curr_layer_i][1:]))

                weights_to_keep: List[int] = []
                for si in subset_nodes:
                    weights_to_keep.extend(
                        list(
                            range(
                                num_weights_per_channel * si,
                                num_weights_per_channel * (si + 1),
                            )))
                next_layer.weight = nn.Parameter(
                    next_layer.weight[:, weights_to_keep])

                next_layer.in_features = len(weights_to_keep)
            else:
                logger.warning(
                    "No pruning adjustment made to layer {} of type {}".format(
                        next_layer_i, next_layer_type))

            # Adjustments were attempted, now continue to the next layer for
            # pruning.
            break

        # Now that we have found the next prunable layer, we can jump to it.
        curr_layer_i = next_layer_i
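
A standalone illustration of the forward-hook pattern used above: record each layer's output shape (minus the batch dimension) during a single dummy forward pass. The toy model and input shape are assumptions.

import torch
import torch.nn as nn

layers = [nn.Conv2d(1, 8, 3), nn.ReLU(), nn.Flatten(), nn.Linear(8 * 26 * 26, 10)]
model = nn.Sequential(*layers)
shapes = [None for _ in layers]

def shape_hook(index):
    def inner(module, inputs, output):
        shapes[index] = output.shape[1:]  # drop the batch dimension
    return inner

handles = [layer.register_forward_hook(shape_hook(i)) for i, layer in enumerate(layers)]
model(torch.zeros(1, 1, 28, 28))  # dummy input
for handle in handles:
    handle.remove()
print(shapes)  # [torch.Size([8, 26, 26]), torch.Size([8, 26, 26]), torch.Size([5408]), torch.Size([10])]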
Example #15
"""
CS224N 2018-19: Homework 3
parser_model.py: Feed-Forward Neural Network for Dependency Parsing
Sahil Chopra <*****@*****.**>
"""
import pickle
import os
import time

import torch
import torch.nn as nn
import torch.nn.functional as F

import logging
from utils import logging_utils
logger = logging_utils.get_logger(module=__name__, loglevel=logging.INFO)


class ParserModel(nn.Module):
    """ Feedforward neural network with an embedding layer and single hidden layer.
    The ParserModel will predict which transition should be applied to a
    given partial parse configuration.

    PyTorch Notes:
        - Note that "ParserModel" is a subclass of the "nn.Module" class. In PyTorch all neural networks
            are a subclass of this "nn.Module".
        - The "__init__" method is where you define all the layers and their respective parameters
            (embedding layers, linear layers, dropout layers, etc.).
        - "__init__" gets automatically called when you create a new instance of your class, e.g.
            when you write "m = ParserModel()".
        - Other methods of ParserModel can access variables that have "self." prefix. Thus,
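
A minimal illustration of the notes above (not the actual ParserModel): layers are declared in __init__ so nn.Module can register their parameters, and composed in forward. The sizes are placeholders.

import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyClassifier(nn.Module):
    def __init__(self, n_features=36, embed_size=50, hidden_size=200, n_classes=3):
        super().__init__()
        self.hidden = nn.Linear(n_features * embed_size, hidden_size)
        self.dropout = nn.Dropout(p=0.5)
        self.out = nn.Linear(hidden_size, n_classes)

    def forward(self, x):
        h = F.relu(self.hidden(x))
        return self.out(self.dropout(h))

model = TinyClassifier()
logits = model(torch.zeros(4, 36 * 50))  # a batch of 4 dummy feature vectors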
Example #16
import pickle
import math
import time

from torch import nn, optim
import torch
from tqdm import tqdm
import sys

from parser_model import ParserModel
from utils.parser_utils import minibatches, load_and_preprocess_data, AverageMeter

import logging

from utils import logging_utils
logger = logging_utils.get_logger(loglevel=logging.DEBUG)

# -----------------
# Primary Functions
# -----------------
def train(parser, train_data, dev_data, output_path, batch_size=1024, n_epochs=10, lr=0.0005):
    """ Train the neural dependency parser.

    @param parser (Parser): Neural Dependency Parser
    @param train_data ():
    @param dev_data ():
    @param output_path (str): Path to which model weights and results are written.
    @param batch_size (int): Number of examples in a single batch
    @param n_epochs (int): Number of training epochs
    @param lr (float): Learning rate
    """