Exemple #1
0
def start_worker(
    experiment_id,
    experiment_named_id,
    named_combination,
    cmd,
    human_cmd,
    experiment_path,
    running_processes,
):
    """Run a single experiment command as a subprocess and collect its result.

    The subprocess' stdout and stderr are written to ``stdlog.out`` and
    ``stdlog.err`` inside ``experiment_path``.

    Args:
        experiment_id: key under which the run is tracked in
            ``running_processes``.
        experiment_named_id: identifier copied into the result dict.
        named_combination: mapping that forms the base of the result dict.
        cmd: argument list passed to ``subprocess.run``. If it contains
            ``"--device"``, the following element is taken as the device.
        human_cmd: representation of the command used for the debug log only.
        experiment_path: directory that receives the log files and is passed
            to ``get_experiment_result_detailed``.
        running_processes: mutable mapping of ``experiment_id`` to an
            ``(is_running, device)`` tuple; updated in place.

    Returns:
        dict: ``named_combination`` extended by the keys ``rc`` (return code
        of the subprocess), ``experiment_id``, ``details`` and
        ``experiment_named_id``.
    """
    if "--device" in cmd:
        device_index_in_list = cmd.index("--device")
        device = cmd[device_index_in_list + 1]
    else:
        # no explicit device was requested
        device = -1

    running_processes[experiment_id] = (True, device)
    try:
        logger = get_logger()

        logger.debug("starting single setup: {}".format(human_cmd))
        with open(os.path.join(experiment_path, "stdlog.out"), "w") as file_stdout, open(
            os.path.join(experiment_path, "stdlog.err"), "w"
        ) as file_stderr:
            completed_process = subprocess.run(
                cmd, stdout=file_stdout, stderr=file_stderr
            )

        experiment_details = get_experiment_result_detailed(experiment_path)
    finally:
        # Always mark the run as finished, even if launching the subprocess or
        # reading its results failed; otherwise the entry would stay flagged
        # as running forever. The exception itself still propagates.
        running_processes[experiment_id] = (False, device)

    return {
        **named_combination,
        "rc": completed_process.returncode,
        "experiment_id": experiment_id,
        "details": experiment_details,
        "experiment_named_id": experiment_named_id,
    }
Exemple #2
0
 def __init__(self, sorted_expected_label_values, polarity_associations,
              snem_name):
     """Store the label configuration and look up the polarity label indexes.

     Args:
         sorted_expected_label_values: sequence of label values; must
             support ``.index`` and contain both polarity label values.
         polarity_associations: mapping with at least the keys
             ``'positive'`` and ``'negative'``.
         snem_name: stored unchanged as ``self.snem_name``.
     """
     self.logger = get_logger()
     self.polarity_associations = polarity_associations

     # label values that represent the two polarities
     positive_value = polarity_associations['positive']
     negative_value = polarity_associations['negative']
     self.pos_label_value, self.neg_label_value = positive_value, negative_value

     # positions of those values within the sorted label list
     self.sorted_expected_label_values = sorted_expected_label_values
     position_of = sorted_expected_label_values.index
     self.pos_label_index = position_of(positive_value)
     self.neg_label_index = position_of(negative_value)

     self.snem_name = snem_name
Exemple #3
0
    def __init__(self, dataset: Dataset, random_seed=None):
        """Oversample the dataset's example indexes by their polarity label.

        The positions of the examples are resampled with a
        ``RandomOverSampler`` using each example's ``"polarity"`` entry as the
        class label. The resulting index list is shuffled and stored in
        ``self.sampled_indexes``.

        Args:
            dataset: iterable of examples that can be indexed with
                ``"polarity"``.
            random_seed: forwarded as ``random_state`` to the oversampler.
        """
        labels = [example["polarity"] for example in dataset]
        positions = list(range(len(labels)))

        # the oversampler expects a 2d feature matrix: one column of indexes
        index_column = np.asarray(positions).reshape((len(positions), 1))
        label_vector = np.asarray(labels).ravel()

        oversampler = RandomOverSampler(random_state=random_seed)
        resampled_index, resampled_label = oversampler.fit_resample(
            index_column, label_vector
        )
        self.sampled_indexes = resampled_index.ravel().tolist()
        sampled_labels = resampled_label.tolist()

        assert len(self.sampled_indexes) == len(sampled_labels)

        # only the indexes are shuffled; the labels are kept for the log below
        random.shuffle(self.sampled_indexes)

        get_logger().info(
            f"oversampled to {len(self.sampled_indexes)} samples. label distribution: {Counter(sampled_labels)}"
        )
Exemple #4
0
 def __init__(self,
              mode: str,
              max_seq_length: int,
              max_hop_distance: int = 10):
     """Set up the parser's NLP pipeline, tag associations and shelve cache.

     Args:
         mode: stored as ``self.mode`` and inserted into the class' cache
             file path template to obtain the cache location.
         max_seq_length: stored as ``self.max_seq_length``.
         max_hop_distance: stored as ``self.max_hop_distance``.
     """
     self.logger = get_logger()
     self.nlp = self.__get_spacy()

     # forward and backward tag lookup tables
     tag_to_index, index_to_tag = self.__create_association()
     self.tag2ind = tag_to_index
     self.ind2tag = index_to_tag
     self.num_tags = len(list(tag_to_index.keys()))

     self.max_seq_length = max_seq_length
     self.max_hop_distance = max_hop_distance
     self.mode = mode

     # the cache file is specific to the given mode
     cache_filepath = DependencyParser.__CACHE_FILEPATH_TEMPLATE.format(
         mode)
     self.cache = shelve.open(cache_filepath)
     cache_message = f"loaded cache with {len(self.cache)} entries from {cache_filepath}"
     self.logger.info(cache_message)
Exemple #5
0
 def __init__(self, patience=2, delta=0.01):
     """
     Initialise the early-stopping bookkeeping.

     Args:
         patience (int): How long to wait after last time validation loss improved.
                         Default: 2
         delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                         Default: 0.01
     """
     self.logger = get_logger()

     # configuration
     self.patience, self.delta = patience, delta

     # mutable state, reset to "nothing observed yet"
     self.counter = 0
     self.best_score = None
     self.early_stop = False
     self.flag_has_score_increased_since_last_check = False
Exemple #6
0
    def __init__(self, name, basepath_datasets, human, non_human):
        """Read the human and non-human created example files and shuffle them.

        Args:
            name: stored as ``self.name``.
            basepath_datasets: stored as ``self.basepath_datasets``.
            human: file names of the human created data; each is resolved
                with ``self.get_filepath_by_name``.
            non_human: file names of the non-human created data; resolved the
                same way.

        Note:
            Seeds the global ``random`` module with the fixed seed 1337 before
            the files are read and the example lists are shuffled.
        """
        self.basepath_datasets = basepath_datasets
        self.name = name

        self.human_created_filenames = human
        self.non_human_created_filenames = non_human
        self.human_created_filepaths = list(
            map(self.get_filepath_by_name, self.human_created_filenames))
        self.non_human_created_filepaths = list(
            map(self.get_filepath_by_name, self.non_human_created_filenames))
        self.data_types = ["human", "nonhum"]

        self.sets_info = None

        # fixed seed so that the shuffling below is reproducible
        self.random_seed = 1337
        random.seed(self.random_seed)
        self.logger = get_logger()

        self.examples_human = self.files_to_dictlst(
            self.human_created_filepaths)
        self.examples_nonhum = self.files_to_dictlst(
            self.non_human_created_filepaths)

        shuffle_message = "shuffling example lists with seed {}".format(
            self.random_seed)
        self.logger.info(shuffle_message)
        for example_list in (self.examples_human, self.examples_nonhum):
            random.shuffle(example_list)

        # one summary line per data source
        summaries = (
            ("{} examples read created by humans (from: {})",
             self.examples_human, self.human_created_filepaths),
            ("{} examples read not created by humans (from: {})",
             self.examples_nonhum, self.non_human_created_filepaths),
        )
        for template, examples, source_paths in summaries:
            self.logger.info(template.format(len(examples), source_paths))
Exemple #7
0
import os

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix

from fxlogger import get_logger

# Module-level logger, obtained once at import time and used by the
# confusion-matrix helpers below.
logger = get_logger()


def create_save_plotted_confusion_matrix(conf_matrix, expected_labels,
                                         basepath):
    """Plot the (non-normalized) confusion matrix and save it as an image.

    The current matplotlib figure is written to ``stats.png`` inside
    ``basepath``.

    Args:
        conf_matrix: confusion matrix forwarded to ``plot_confusion_matrix``.
        expected_labels: class labels forwarded to ``plot_confusion_matrix``.
        basepath: directory in which ``stats.png`` is created.
    """
    # The two returned values are not needed here; the call is made for the
    # figure it draws.
    _axes, _title = plot_confusion_matrix(
        conf_matrix, expected_labels, normalize=False)

    target_file = os.path.join(basepath, 'stats.png')
    plt.savefig(target_file, bbox_inches='tight')

    message = "created confusion matrices in path: {}".format(target_file)
    logger.debug(message)


def plot_confusion_matrix(cm,
                          classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix. Normalization can be applied by setting `normalize=True`.
    based on https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py
    """
    if not title:
Exemple #8
0
 def __init__(self, global_context_max_seqs_per_doc):
     """Create the logger and zero all truncation counters.

     Args:
         global_context_max_seqs_per_doc: stored as
             ``self.max_seqs_per_doc``.
     """
     self.logger = get_logger()
     self.max_seqs_per_doc = global_context_max_seqs_per_doc

     # statistics, all starting at zero
     self.count_truncated = 0
     self.count_all_sequences_where_we_count_truncation = 0
     self.count_truncated_long_docs = 0
Exemple #9
0
    def __init__(self, options):
        """Prepare the experiment grid and the dataset for a controller run.

        Stores ``options``, resolves the CUDA devices to use, selects the
        argument value lists for ``options.combi_mode`` / ``options.combi_id``,
        expands them into their cartesian product, filters that product
        through ``self._apply_conditions`` and finally creates the dataset
        preparer for ``options.dataset``.

        Args:
            options: options object. Attributes read here are
                ``cuda_devices``, ``num_workers``, ``experiments_path``,
                ``combi_mode``, ``combi_id`` and ``dataset``. ``num_workers``
                is overwritten with the number of CUDA devices if it is
                negative and CUDA devices could be resolved.

        Raises:
            ValueError: if no combinations are defined for the requested
                ``combi_mode`` / ``combi_id``.
            AssertionError: if the selected combinations do not have exactly
                as many entries as ``args_names_ordered``.
            Exception: if ``options.dataset`` is not a known dataset name.
        """
        self.logger = get_logger()
        self.opt = options

        if self.opt.cuda_devices:
            # to run on SCC
            if self.opt.cuda_devices == "SGE_GPU":
                # the special value means: read the device list from the
                # environment variable of the same name
                self.cuda_devices = os.environ.get("SGE_GPU")
            else:
                self.cuda_devices = self.opt.cuda_devices

            if self.cuda_devices:
                self.logger.info("cuda devices:" + self.cuda_devices)
                self.cuda_devices = self.cuda_devices.split(",")
                self.logger.info(
                    f"was assigned {len(self.cuda_devices)} cuda devices: {self.cuda_devices}"
                )
                if self.opt.num_workers < 0:
                    self.logger.info(
                        f"num_workers < 0: using cuda device count. setting num_workers={len(self.cuda_devices)}"
                    )
                    self.opt.num_workers = len(self.cuda_devices)

        else:
            # do not use CUDA
            self.cuda_devices = None

        self.use_cross_validation = 0  # if 0: do not use cross validation
        self.snem = "f1_macro"
        self.experiment_base_path = self.opt.experiments_path

        # The order of this list defines the order of the values within each
        # combination tuple created below.
        args_names_ordered = [
            "model_name",
            "optimizer",
            "initializer",
            "learning_rate",
            "batch_size",
            "balancing",
            "num_epoch",
            "lsr",
            "use_tp_placeholders",
            "spc_lm_representation",
            "spc_input_order",
            "aen_lm_representation",
            "spc_lm_representation_distilbert",
            "finetune_glove",
            "eval_only_after_last_epoch",
            "devmode",
            "local_context_focus",
            "SRD",
            "pretrained_model_name",
            "state_dict",
            "use_global_context",
            "global_context_seqs_per_doc",
            "focus_mode",
        ]

        combinations = None
        if self.opt.combi_mode == "default":
            if self.opt.combi_id == 0:
                combinations = combinations_default_0
        elif self.opt.combi_mode == "combinations_g":
            if self.opt.combi_id == 0:
                combinations = combinations_g_0

        if not combinations:
            raise ValueError(
                "combination(mode={}, id={}) not defined".format(
                    self.opt.combi_mode, self.opt.combi_id
                )
            )

        # key: name of a parameter that is only applied if its conditions are met
        # value: list of tuples, each consisting of a parameter name and the value that parameter
        # needs to have in order for the condition to be satisfied
        # Note that all tuples in this list are OR connected, so if at least one is satisfied, the conditions are met.
        # If we need AND connected conditions, my idea is to add an outer list, resulting in a list of lists (of
        # tuples) where all lists are AND connected.
        # If a condition is not satisfied, the corresponding parameter will still be passed.
        # NOTE(review): the original sentence above was cut off; see _apply_conditions for how such
        # parameters are actually handled.
        # NOTE(review): "use_early_stopping" has no entry in args_names_ordered, and
        # "spc_lm_representation_distilbert" is keyed on "distilbert" whereas other entries use
        # "spc_distilbert" - confirm that both are intended.
        conditions = {
            "spc_lm_representation_distilbert": [("model_name", "distilbert")],
            "spc_lm_representation": [
                ("model_name", "spc_bert"),
                ("model_name", "spc_roberta"),
            ],
            "spc_input_order": [
                ("model_name", "spc_bert"),
                ("model_name", "spc_roberta"),
                ("model_name", "spc_distilbert"),
            ],
            "aen_lm_representation": [
                ("model_name", "aen_bert"),
                ("model_name", "aen_roberta"),
                ("model_name", "aen_distilbert"),
            ],
            "use_early_stopping": [("num_epoch", "10")],
            "finetune_glove": [("model_name", "aen_glove")],
            "local_context_focus": [("model_name", "lcf_bert")],
            "SRD": [("model_name", "lcf_bert")],
            "pretrained_model_name": [
                ("model_name", "lcf_bert"),
                ("model_name", "aen_bert"),
                ("model_name", "spc_bert"),
            ],
        }

        assert len(args_names_ordered) == len(combinations.keys())

        self.experiment_base_id = (
            self.opt.dataset + "_" + datetime.today().strftime("%Y%m%d-%H%M%S")
        )
        self.basecmd = ["python", "train.py"]
        self.basepath = "controller_data"
        self.basepath_data = os.path.join(self.basepath, "datasets")

        # Collect the value list of every argument (in the fixed order) and
        # compute the expected size of the cartesian product alongside.
        combination_count = 1
        _combination_values = []
        for arg_name in args_names_ordered:
            arg_values = list(combinations[arg_name])
            combination_count = combination_count * len(arg_values)
            _combination_values.append(arg_values)

        combinations = list(product(*_combination_values))
        assert len(combinations) == combination_count

        self.logger.info(
            "{} arguments, totaling in {} combinations".format(
                len(args_names_ordered), combination_count
            )
        )

        # apply conditions
        self.logger.info("applying conditions...")
        self.named_combinations, count_duplicates = self._apply_conditions(
            combinations, args_names_ordered, conditions
        )
        self.logger.info(
            "applied conditions. removed {} combinations. {} -> {}".format(
                count_duplicates, combination_count, len(self.named_combinations)
            )
        )
        self.combination_count = len(self.named_combinations)

        if self.use_cross_validation > 0:
            self.logger.info(
                "using {}-fold cross validation".format(self.use_cross_validation)
            )
            self.dataset_preparer = DatasetPreparer.poltsanews_crossval8010_allhuman(
                self.basepath_data
            )
        else:
            self.logger.info("not using cross validation")

            # Every supported dataset name is also the name of the
            # DatasetPreparer factory that creates it, so the factory can be
            # looked up by name instead of spelling out one branch per dataset.
            known_datasets = (
                "poltsanews_rel801010_allhuman",
                "semeval14restaurants",
                "semeval14laptops",
                "acl14twitter",
                "sentinews",
                "newstsc",
                "newstsc2",
                "newstsc3",
                "newstsc4",
                "newstsc5",
                "newstscg",
            )
            if self.opt.dataset not in known_datasets:
                raise Exception("unknown dataset: {}".format(self.opt.dataset))

            create_preparer = getattr(DatasetPreparer, self.opt.dataset)
            (
                self.dataset_preparer,
                self.datasetname,
                self.task_format,
            ) = create_preparer(self.basepath_data)