Example #1
    def __init__(self,
                 message: str,
                 decimal_places: int = 3,
                 terminal_break: bool = False,
                 skip_output: bool = False):
        """Creates a new instance of ``Timer``.

        Args:
            message (str): The message to print at the end of the ``with`` block. Notice that the measured time, as
                the string ``" in X.XXXs"``, is automatically appended to the provided message.
            decimal_places (int, optional): The number of decimal places to print for the time, which is measured and
                printed in seconds.
            terminal_break (bool, optional): Indicates whether to add an additional line break to the printed message.
            skip_output (bool, optional): If ``True``, then no output is printed to the screen.
        """

        # sanitize args
        insanity.sanitize_type("decimal_places", decimal_places, int)
        insanity.sanitize_range("decimal_places", decimal_places, minimum=0)

        # create the message to print at the end of the with-block
        self._message = str(message).strip() + (" in {:.%df}s" % decimal_places)
        if terminal_break:
            self._message += "\n"

        self._start = None  # the time when the clock is started
        self._total = 0  # the total time measured

        # indicates whether to print the time at the end of a ``with`` block
        self._skip_output = bool(skip_output)
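
A minimal usage sketch, assuming ``Timer`` implements ``__enter__``/``__exit__`` so that it can be used as a context manager which starts the clock on entry and prints the message on exit:

import time

with Timer("loaded the dataset", decimal_places=2):
    time.sleep(0.25)  # placeholder for the work being timed
# expected output (roughly): "loaded the dataset in 0.25s"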
Example #2
 def max_branching_factor(self, max_branching_factor: int) -> None:
     insanity.sanitize_type("max_branching_factor", max_branching_factor,
                            int)
     insanity.sanitize_range("max_branching_factor",
                             max_branching_factor,
                             minimum=1)
     self._max_branching_factor = max_branching_factor
Example #3
 def stop_prob(self, stop_prob: numbers.Real) -> None:
     insanity.sanitize_type("stop_prob", stop_prob, numbers.Real)
     insanity.sanitize_range("stop_prob",
                             stop_prob,
                             minimum=0,
                             maximum=1,
                             max_inclusive=False)
     self._stop_prob = float(stop_prob)
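
Here, ``max_inclusive=False`` makes the upper bound exclusive, so ``stop_prob`` has to lie in the half-open interval [0, 1). A sketch of the expected behavior, assuming that ``insanity``'s sanitization functions report violations by raising (e.g., ``ValueError`` for out-of-range values):

import insanity

insanity.sanitize_range("stop_prob", 0.99, minimum=0, maximum=1, max_inclusive=False)  # passes silently
insanity.sanitize_range("stop_prob", 1.0, minimum=0, maximum=1, max_inclusive=False)   # expected to raise ValueError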
Example #4
def position(index: int) -> typing.Callable[[property], property]:
    """A decorator that allows for specifying the position of a property of a configuration class among all parsed
    positional args.
    
    Notice that ``argmagic`` ignores this decorator if positional args are not used or if the annotated property
    defines an optional configuration value.
    
    Args:
        index (int): The index of the annotated configuration in the sequence of positional args.
    """
    insanity.sanitize_type("index", index, int)
    insanity.sanitize_range("index", index, minimum=0)

    def _position(func: property) -> property:
        if not isinstance(func, property):
            raise TypeError(
                "The decorator @position may be applied to properties only!")
        func.fget.__dict__[argmagic.POSITION] = index
        return func

    return _position
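
A hypothetical usage sketch: since ``_position`` insists on receiving a ``property`` object, ``@position`` has to sit on top of ``@property`` (the class and property names below are illustrative only):

class MyConfig(object):

    @position(0)
    @property
    def input_path(self) -> str:
        """str: The path of the input file, parsed as the first positional arg."""
        return self._input_path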
Example #5
 def num_training_samples(self, num_training_samples: int) -> None:
     insanity.sanitize_type("num_training_samples", num_training_samples, int)
     insanity.sanitize_range("num_training_samples", num_training_samples, minimum=1)
     self._num_training_samples = num_training_samples
Example #6
 def num_datasets(self, num_datasets: int) -> None:
     insanity.sanitize_type("num_datasets", num_datasets, int)
     insanity.sanitize_range("num_samples", num_datasets, minimum=1)
     self._num_datasets = num_datasets
Example #7
    def generate_datasets(
            self,
            num_datasets: int,
            num_training_samples: int,
            output_dir: str
    ) -> None:
        """Generates datasets from the data that was provided to this instance of ``DatasetGenerator`, and writes them
        to disk.
        
        Args:
            num_datasets (int): The total number of datasets to create.
            num_training_samples (int): The number of training samples to create for each dataset.
            output_dir (str): The path of the output directory.
        """
        # sanitize args
        insanity.sanitize_type("num_datasets", num_datasets, int)
        insanity.sanitize_range("num_datasets", num_datasets, minimum=1)
        insanity.sanitize_type("num_training_samples", num_training_samples, int)
        insanity.sanitize_range("num_training_samples", num_training_samples, minimum=1)

        # create patterns for the names of the directories that are created for the single datasets and for
        # the base names of training samples
        output_dir_pattern = "{:0" + str(len(str(num_datasets - 1))) + "d}"
        sample_filename_pattern = "{:0" + str(len(str(num_training_samples - 1))) + "d}"
        
        for dataset_idx in range(num_datasets):
            
            print("generating dataset #{}...".format(dataset_idx))

            # assemble needed paths
            ds_output_dir = os.path.join(output_dir, output_dir_pattern.format(dataset_idx))
            train_dir = os.path.join(ds_output_dir, "train")
            dev_dir = os.path.join(ds_output_dir, "dev")
            test_dir = os.path.join(ds_output_dir, "test")

            # create folder structure for storing the current dataset
            if not os.path.isdir(ds_output_dir):
                os.mkdir(ds_output_dir)
            if not os.path.isdir(train_dir):
                os.mkdir(train_dir)
            if not os.path.isdir(dev_dir):
                os.mkdir(dev_dir)
            if not os.path.isdir(test_dir):
                os.mkdir(test_dir)
        
            # split countries into train/dev/test
            train, dev, test = self._split_countries()

            # write selected dev+test countries to disk
            with open(os.path.join(ds_output_dir, "countries.dev.txt"), "w") as f:
                for c in dev:
                    f.write("{}\n".format(c))
            with open(os.path.join(ds_output_dir, "countries.test.txt"), "w") as f:
                for c in test:
                    f.write("{}\n".format(c))
            
            # create training samples + write them to disk
            for sample_idx in range(num_training_samples):
                print("generating training sample #{}...".format(sample_idx))
                sample = self._generate_sample(train)
                kg_writer.KgWriter.write(sample, train_dir, sample_filename_pattern.format(sample_idx))
            
            # create evaluation sample + write it to disk
            print("generating dev sample... ")
            dev_sample = self._generate_sample(train, inf_countries=dev, minimal=True)
            kg_writer.KgWriter.write(dev_sample, dev_dir, "dev")

            # create test sample + write it to disk
            print("generating test sample...")
            test_sample = self._generate_sample(train, inf_countries=test, minimal=True)
            kg_writer.KgWriter.write(test_sample, test_dir, "test")
            
            # print statistics about test sample
            num_spec = len([t for t in test_sample.triples if not t.inferred])
            num_inf = len([t for t in test_sample.triples if t.inferred])
            print("number triples in test sample: {} ({} spec / {} inf)".format(num_spec + num_inf, num_spec, num_inf))

            print("OK\n")
Example #8
 def max_tree_size(self, max_tree_size: int) -> None:
     insanity.sanitize_type("max_tree_size", max_tree_size, int)
     insanity.sanitize_range("max_tree_size", max_tree_size, minimum=1)
     self._max_tree_size = max_tree_size
Example #9
 def max_tree_depth(self, max_tree_depth: int) -> None:
     insanity.sanitize_type("max_tree_depth", max_tree_depth, int)
     insanity.sanitize_range("max_tree_depth", max_tree_depth, minimum=1)
     self._max_tree_depth = max_tree_depth
Example #10
    @classmethod
    def read(cls, input_dir: str, basename: str, index: int = None) -> knowledge_graph.KnowledgeGraph:
        """Loads a knowledge graph from the specified location.
        
        Args:
            input_dir (str): The directory that contains all of the files.
            basename (str): The base name, i.e., the prefix, included in all files' names.
            index (int, optional): If this is provided, then ``input_dir`` and ``basename`` are assumed to specify a
                sequence of knowledge graphs, and ``index`` specifies the element of this sequence to retrieve.
        
        Returns:
            :class:`knowledge_graph.KnowledgeGraph`: A knowledge graph that has been populated according to the read
                information.
        
        Raises:
            ValueError: If ``input_dir`` does not refer to an existing directory or if any of the needed files is
                missing.
        """
        # //////// Sanitize Args ---------------------------------------------------------------------------------------
        
        # ensure that the inputs are strings
        input_dir = str(input_dir)
        basename = str(basename)
        if index is not None:
            insanity.sanitize_type("index", index, int)
            insanity.sanitize_range("index", index, minimum=0)
        
        # assemble all needed paths
        # the used postfixes have the following meanings:
        # * _vocab -> definition of a class/relation/literal
        # * _spec  -> user-defined data
        # * _inf   -> inferred data
        individual_spec = os.path.join(input_dir, basename + io.INDIVIDUALS_SPEC_EXT)
        classes_vocab = os.path.join(input_dir, basename + io.CLASSES_VOCAB_EXT)
        classes_spec = os.path.join(input_dir, basename + io.CLASSES_SPEC_EXT)
        classes_inf = os.path.join(input_dir, basename + io.CLASSES_INF_EXT)
        classes_pred = os.path.join(input_dir, basename + io.CLASSES_PRED_EXT)
        relations_vocab = os.path.join(input_dir, basename + io.RELATIONS_VOCAB_EXT)
        relations_spec = os.path.join(input_dir, basename + io.RELATIONS_SPEC_EXT)
        relations_inf = os.path.join(input_dir, basename + io.RELATIONS_INF_EXT)
        relations_pred = os.path.join(input_dir, basename + io.RELATIONS_PRED_EXT)
        literals_vocab = os.path.join(input_dir, basename + io.LITERALS_VOCAB_EXT)
        literals_spec = os.path.join(input_dir, basename + io.LITERALS_SPEC_EXT)
        literals_inf = os.path.join(input_dir, basename + io.LITERALS_INF_EXT)
        literals_pred = os.path.join(input_dir, basename + io.LITERALS_PRED_EXT)
        if index is not None:
            classes_spec += "." + str(index)
            classes_inf += "." + str(index)
            classes_pred += "." + str(index)
            relations_spec += "." + str(index)
            relations_inf += "." + str(index)
            relations_pred += "." + str(index)
            literals_spec += "." + str(index)
            literals_inf += "." + str(index)
            literals_pred += "." + str(index)
        
        # check whether the input directory exists
        if not os.path.isdir(input_dir):
            raise ValueError("The provided <input_dir> does not exist: '{}'!".format(input_dir))
        
        # check whether all of the needed files exist:
        if not os.path.isfile(individual_spec):
            raise ValueError("Missing file: '{}'!".format(individual_spec))
        if not os.path.isfile(classes_vocab):
            raise ValueError("Missing file: '{}'!".format(classes_vocab))
        if not os.path.isfile(classes_spec):
            raise ValueError("Missing file: '{}'!".format(classes_spec))
        if not os.path.isfile(classes_inf):
            raise ValueError("Missing file: '{}'!".format(classes_inf))
        if not os.path.isfile(classes_pred):
            raise ValueError("Missing file: '{}'!".format(classes_pred))
        if not os.path.isfile(relations_vocab):
            raise ValueError("Missing file: '{}'!".format(relations_vocab))
        if not os.path.isfile(relations_spec):
            raise ValueError("Missing file: '{}'!".format(relations_spec))
        if not os.path.isfile(relations_inf):
            raise ValueError("Missing file: '{}'!".format(relations_inf))
        if not os.path.isfile(relations_pred):
            raise ValueError("Missing file: '{}'!".format(relations_pred))
        if not os.path.isfile(literals_vocab):
            raise ValueError("Missing file: '{}'!".format(literals_vocab))
        if not os.path.isfile(literals_spec):
            raise ValueError("Missing file: '{}'!".format(literals_spec))
        if not os.path.isfile(literals_inf):
            raise ValueError("Missing file: '{}'!".format(literals_inf))
        if not os.path.isfile(literals_pred):
            raise ValueError("Missing file: '{}'!".format(literals_pred))
    
        # //////// Read Vocabulary ---------------------------------------------------------------------------------
        
        # create new empty knowledge graph
        kg = knowledge_graph.KnowledgeGraph()
        
        # read classes
        with open(classes_vocab, "r") as f:
            for index, line in enumerate(f):
                if line == "":
                    continue
                m = re.match(cls.VOCAB_REGEX, line)
                assert int(m.group("index")) == index
                kg.classes.add(ctf.ClassTypeFactory.create_class(m.group("name")))
        
        # read relations
        with open(relations_vocab, "r") as f:
            for index, line in enumerate(f):
                if line == "":
                    continue
                m = re.match(cls.VOCAB_REGEX, line)
                assert int(m.group("index")) == index
                kg.relations.add(rtf.RelationTypeFactory.create_relation(m.group("name")))
        
        # read literals
        with open(literals_vocab, "r") as f:
            for index, line in enumerate(f):
                if line == "":
                    continue
                m = re.match(cls.VOCAB_REGEX, line)
                assert int(m.group("index")) == index
                kg.literals.add(ltf.LiteralTypeFactory.create_literal(m.group("name")))
        
        # //////// Read Individuals --------------------------------------------------------------------------------
        
        with open(individual_spec, "r") as f:
            for index, line in enumerate(f):
                if line == "":
                    continue
                m = re.match(cls.VOCAB_REGEX, line)
                assert int(m.group("index")) == index
                kg.individuals.add(individual_factory.IndividualFactory.create_individual(m.group("name")))

        # //////// Read Class Memberships --------------------------------------------------------------------------

        # read specified memberships
        with open(classes_spec, "r") as f:
            for individual_index, line in enumerate(f):  # run through all individuals
                if line == "":
                    continue
                current_ind = kg.individuals[individual_index]
                for class_index, mem in enumerate(map(int, re.findall(cls.MEMBERSHIPS_REGEX, line))):
                    if mem != 0:  # only consider specified memberships
                        current_ind.classes.add(
                                class_membership.ClassMembership(
                                        kg.classes[class_index],
                                        mem == 1
                                )
                        )
        
        # read inferred memberships
        with open(classes_inf, "r") as f:
            for individual_index, line in enumerate(f):  # run through all individuals
                if line.strip() == "":
                    continue
                current_ind = kg.individuals[individual_index]
                for class_index, mem in enumerate(map(int, re.findall(cls.MEMBERSHIPS_REGEX, line))):
                    if mem != 0:  # only consider inferred memberships
                        current_ind.classes.add(
                            class_membership.ClassMembership(
                                kg.classes[class_index],
                                mem == 1,
                                inferred=True
                            )
                        )
        
        # read memberships that are prediction targets
        with open(classes_pred, "r") as f:
            for individual_index, line in enumerate(f):  # run through all individuals
                if line.strip() == "":
                    continue
                current_ind = kg.individuals[individual_index]
                for class_index, mem in enumerate(map(int, re.findall(cls.MEMBERSHIPS_REGEX, line))):
                    if mem != 0:  # only consider prediction targets
                        current_ind.classes.add(
                                class_membership.ClassMembership(
                                        kg.classes[class_index],
                                        mem == 1,
                                        prediction=True
                                )
                        )

        # //////// Read Literals -----------------------------------------------------------------------------------
        
        # read specified literals
        with open(literals_spec, "r") as f:
            for line in f:
                if line == "":
                    continue
                
                # parse read line
                m = re.match(cls.TRIPLE_REGEX, line)
                
                # fetch respective individual, literal, and value
                current_ind = kg.individuals[int(m.group("subject"))]
                current_lit = kg.literals[int(m.group("predicate"))]
                current_value = m.group("object")
                
                # add literal to individual
                current_ind.literals.add(
                        literal_value.LiteralValue(
                                current_lit,
                                current_value
                        )
                )
        
        # read inferred literals
        with open(literals_inf, "r") as f:
            for line in f:
                if line == "":
                    continue
                
                # parse read line
                m = re.match(cls.TRIPLE_REGEX, line)
        
                # fetch respective individual, literal, and value
                current_ind = kg.individuals[int(m.group("subject"))]
                current_lit = kg.literals[int(m.group("predicate"))]
                current_value = m.group("object")
        
                # add literal to individual
                current_ind.literals.add(
                        literal_value.LiteralValue(
                                current_lit,
                                current_value,
                                inferred=True
                        )
                )
        
        # read literals that are prediction targets
        with open(literals_pred, "r") as f:
            for line in f:
                if line == "":
                    continue
        
                # parse read line
                m = re.match(cls.TRIPLE_REGEX, line)
        
                # fetch respective individual, literal, and value
                current_ind = kg.individuals[int(m.group("subject"))]
                current_lit = kg.literals[int(m.group("predicate"))]
                current_value = m.group("object")
        
                # add literal to individual
                current_ind.literals.add(
                        literal_value.LiteralValue(
                                current_lit,
                                current_value,
                                prediction=True
                        )
                )
        
        # //////// Read Triples ------------------------------------------------------------------------------------
        
        # read specified triples
        with open(relations_spec, "r") as f:
            for line in f:
                if line == "":
                    continue
                
                # parse read line
                m = re.match(cls.TYPED_TRIPLE_REGEX, line)
    
                # fetch the respective subject, relation, and object of the triple
                positive = m.group("type") == "+"
                sub = kg.individuals[int(m.group("subject"))]
                pred = kg.relations[int(m.group("predicate"))]
                obj = kg.individuals[int(m.group("object"))]
                
                # add triple
                kg.triples.add(triple.Triple(sub, pred, obj, positive))
        
        # read inferred triples
        with open(relations_inf, "r") as f:
            for line in f:
                if line == "":
                    continue
                    
                # parse read line
                m = re.match(cls.TYPED_TRIPLE_REGEX, line)
        
                # fetch the respective subject, relation, and object of the triple
                positive = m.group("type") == "+"
                sub = kg.individuals[int(m.group("subject"))]
                pred = kg.relations[int(m.group("predicate"))]
                obj = kg.individuals[int(m.group("object"))]
        
                # add triple
                kg.triples.add(triple.Triple(sub, pred, obj, positive, inferred=True))
        
        # read triples that are prediction targets
        with open(relations_pred, "r") as f:
            for line in f:
                if line == "":
                    continue
        
                # parse read line
                m = re.match(cls.TYPED_TRIPLE_REGEX, line)
        
                # fetch the respective subject, relation, and object of the triple
                positive = m.group("type") == "+"
                sub = kg.individuals[int(m.group("subject"))]
                pred = kg.relations[int(m.group("predicate"))]
                obj = kg.individuals[int(m.group("object"))]
        
                # add triple
                kg.triples.add(triple.Triple(sub, pred, obj, positive, prediction=True))
        
        return kg
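
A hypothetical usage sketch; the enclosing class is assumed to be a reader counterpart of ``kg_writer.KgWriter`` from Example #7, called ``kg_reader.KgReader`` here for illustration only:

# load the dev sample written by DatasetGenerator.generate_datasets above
kg = kg_reader.KgReader.read("out/0/dev", "dev")

# pass ``index`` when the spec/inf/pred files carry an additional ".N" suffix,
# i.e., when they describe a sequence of knowledge graphs over one shared vocabulary
kg_0 = kg_reader.KgReader.read("out/0/train", "000", index=0)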
Example #11
    def __init__(
            self,
            model: encoder.Encoder,
            word_emb: nn.Embedding,
            pos_emb: nn.Embedding,
            mask_index: int,
            prediction_rate: numbers.Real = 0.15,
            mask_rate: numbers.Real = 0.8,
            random_rate: numbers.Real = 0.1
    ):
        """Creates a new instance of ``BERTLoss`.

        Args:
            model (encoder.Encoder): The encoder model being pretrained.
            word_emb (nn.Embedding): The used word embeddings.
            pos_emb (nn.Embedding): The used positional embeddings.
            mask_index (int): The index of the mask token.
            prediction_rate (numbers.Real, optional): The percentage of tokens in each training sequence that
                predictions are computed for, which is set to ``0.15`` by default.
            mask_rate (numbers.Real, optional): Among all tokens that predictions are computed for, the percentage of
                tokens that are replaced with the mask token, as specified by ``mask_index``. This is set to ``0.8`` by
                default.
            random_rate (numbers.Real, optional): Among all tokens that predictions are computed for, the percentage of
                tokens that are randomly replaced with other tokens. This is set to ``0.1`` by default.
        """
        super().__init__()
        
        # sanitize args
        insanity.sanitize_type("model", model, encoder.Encoder)
        insanity.sanitize_type("word_emb", word_emb, nn.Embedding)
        insanity.sanitize_type("pos_emb", word_emb, nn.Embedding)
        if pos_emb.embedding_dim != word_emb.embedding_dim:
            raise ValueError("<pos_emb> is not compatible with <word_emb>!")
        insanity.sanitize_type("mask_index", mask_index, int)
        if mask_index < 0 or mask_index >= word_emb.num_embeddings:
            raise ValueError("The <mask_index> does not exist in <word_emb>!")
        insanity.sanitize_type("prediction_rate", prediction_rate, numbers.Real)
        prediction_rate = float(prediction_rate)
        insanity.sanitize_range("prediction_rate", prediction_rate, minimum=0, maximum=1)
        insanity.sanitize_type("mask_rate", mask_rate, numbers.Real)
        mask_rate = float(mask_rate)
        insanity.sanitize_range("mask_rate", mask_rate, minimum=0, maximum=1)
        insanity.sanitize_type("random_rate", random_rate, numbers.Real)
        random_rate = float(random_rate)
        insanity.sanitize_range("random_rate", random_rate, minimum=0, maximum=1)
        if mask_rate + random_rate > 1:
            raise ValueError("<mask_rate> + <random_rate> has to be at most 1!")
        
        # store args
        self._mask_index = mask_index
        self._mask_rate = mask_rate
        self._model = model
        self._pad_index = model.pad_index
        self._pos_emb = pos_emb
        self._prediction_rate = prediction_rate
        self._random_rate = random_rate
        self._word_emb = word_emb
        
        # create an output layer, which is trained together with the model, for predicting masked tokens;
        # it yields raw logits, since nn.CrossEntropyLoss applies log-softmax internally (an additional
        # nn.Softmax before the loss would squash the gradients)
        self._output_layer = nn.Linear(self._word_emb.embedding_dim, self._word_emb.num_embeddings)
        
        # create the used loss function
        self._loss = nn.CrossEntropyLoss()
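
A hypothetical construction sketch; ``encoder.Encoder`` is project-specific, so its instantiation is elided, and the embedding sizes below are made up for illustration:

import torch.nn as nn

word_emb = nn.Embedding(num_embeddings=30000, embedding_dim=256)
pos_emb = nn.Embedding(num_embeddings=512, embedding_dim=256)  # embedding_dim has to match word_emb
model = ...  # an encoder.Encoder instance (constructor args not shown in this example)
loss_fn = BERTLoss(model, word_emb, pos_emb, mask_index=1)
# the defaults implement BERT's 15% prediction rate with the 80/10/10
# mask/random/keep split (mask_rate=0.8, random_rate=0.1)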
Example #12
 def position(self, position: typing.Union[int, None]):
     insanity.sanitize_type("position", position, int, none_allowed=True)
     if position is not None:
         insanity.sanitize_range("position", position, minimum=0)
     self._position = position