def __init__(self, message: str, decimal_places: int = 3, terminal_break: bool = False, skip_output: bool = False):
    """Creates a new instance of ``Timer``.

    Args:
        message (str): The message to print at the end of the ``with`` block. Notice that the measured time,
            as string ``" in X.XXXs"``, is appended to the provided message automatically.
        decimal_places (int, optional): The number of decimal places to print for the time, which is measured
            and printed in seconds.
        terminal_break (bool, optional): Indicates whether to add an additional line break to the printed
            message.
        skip_output (bool, optional): If ``True``, then no output is printed to the screen.
    """
    # sanitize args
    insanity.sanitize_type("decimal_places", decimal_places, int)
    insanity.sanitize_range("decimal_places", decimal_places, minimum=0)

    # create the message to print at the end of the with-block
    self._message = str(message).strip() + (" in {:.%df}s" % decimal_places)
    if terminal_break:
        self._message += "\n"

    self._start = None  # the time when the clock is started
    self._total = 0  # the total time measured
    self._skip_output = bool(skip_output)  # indicates whether to print the time at the end of a ``with`` block
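
# A minimal usage sketch for ``Timer``: the import path is hypothetical, and the
# context-manager behavior is inferred from the docstring above, which describes
# printing at the end of a ``with`` block.
from timer import Timer  # hypothetical module path

with Timer("processed data", decimal_places=2):
    total = sum(x * x for x in range(1_000_000))
# prints something like: "processed data in 0.06s"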
def max_branching_factor(self, max_branching_factor: int) -> None:
    insanity.sanitize_type("max_branching_factor", max_branching_factor, int)
    insanity.sanitize_range("max_branching_factor", max_branching_factor, minimum=1)
    self._max_branching_factor = max_branching_factor
def stop_prob(self, stop_prob: numbers.Real) -> None:
    insanity.sanitize_type("stop_prob", stop_prob, numbers.Real)
    insanity.sanitize_range("stop_prob", stop_prob, minimum=0, maximum=1, max_inclusive=False)
    self._stop_prob = float(stop_prob)
def position(index: int) -> typing.Callable[[property], property]:
    """A decorator that allows for specifying the position of a property of a configuration class among all
    parsed positional args.

    Notice that ``argmagic`` ignores this decorator if positional args are not used or if the annotated
    property defines an optional configuration value.

    Args:
        index (int): The index of the annotated configuration in the sequence of positional args.
    """
    insanity.sanitize_type("index", index, int)
    insanity.sanitize_range("index", index, minimum=0)

    def _position(func: property) -> property:
        if not isinstance(func, property):
            raise TypeError("The decorator @position may be applied to properties only!")
        func.fget.__dict__[argmagic.POSITION] = index
        return func

    return _position
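
# A minimal sketch of how ``@position`` might be used. The configuration class
# below is hypothetical; only the decorator itself is taken from the source.
# ``@position`` has to be the outermost decorator, since it expects a ``property``.
class TrainConfig(object):
    """A hypothetical configuration class with one positional arg."""

    def __init__(self):
        self._data_dir = None

    @position(0)
    @property
    def data_dir(self) -> str:
        """str: The path of the data directory, parsed as positional arg 0."""
        return self._data_dir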
def num_training_samples(self, num_training_samples: int) -> None:
    insanity.sanitize_type("num_training_samples", num_training_samples, int)
    insanity.sanitize_range("num_training_samples", num_training_samples, minimum=1)
    self._num_training_samples = num_training_samples
def num_datasets(self, num_datasets: int) -> None:
    insanity.sanitize_type("num_datasets", num_datasets, int)
    insanity.sanitize_range("num_datasets", num_datasets, minimum=1)
    self._num_datasets = num_datasets
def generate_datasets(self, num_datasets: int, num_training_samples: int, output_dir: str) -> None:
    """Generates datasets from the data that was provided to this instance of ``DatasetGenerator``, and writes
    them to disk.

    Args:
        num_datasets (int): The total number of datasets to create.
        num_training_samples (int): The number of training samples to create for each dataset.
        output_dir (str): The path of the output directory.
    """
    # sanitize args
    insanity.sanitize_type("num_datasets", num_datasets, int)
    insanity.sanitize_range("num_datasets", num_datasets, minimum=1)
    insanity.sanitize_type("num_training_samples", num_training_samples, int)
    insanity.sanitize_range("num_training_samples", num_training_samples, minimum=1)

    # create patterns for the names of the directories that are created for the single datasets and for
    # the base names of training samples
    output_dir_pattern = "{:0" + str(len(str(num_datasets - 1))) + "d}"
    sample_filename_pattern = "{:0" + str(len(str(num_training_samples - 1))) + "d}"

    for dataset_idx in range(num_datasets):
        print("generating dataset #{}...".format(dataset_idx))

        # assemble needed paths
        ds_output_dir = os.path.join(output_dir, output_dir_pattern.format(dataset_idx))
        train_dir = os.path.join(ds_output_dir, "train")
        dev_dir = os.path.join(ds_output_dir, "dev")
        test_dir = os.path.join(ds_output_dir, "test")

        # create folder structure for storing the current dataset
        if not os.path.isdir(ds_output_dir):
            os.mkdir(ds_output_dir)
        if not os.path.isdir(train_dir):
            os.mkdir(train_dir)
        if not os.path.isdir(dev_dir):
            os.mkdir(dev_dir)
        if not os.path.isdir(test_dir):
            os.mkdir(test_dir)

        # split countries into train/dev/test
        train, dev, test = self._split_countries()

        # write selected dev+test countries to disk
        with open(os.path.join(ds_output_dir, "countries.dev.txt"), "w") as f:
            for c in dev:
                f.write("{}\n".format(c))
        with open(os.path.join(ds_output_dir, "countries.test.txt"), "w") as f:
            for c in test:
                f.write("{}\n".format(c))

        # create training samples + write them to disk
        for sample_idx in range(num_training_samples):
            print("generating training sample #{}...".format(sample_idx))
            sample = self._generate_sample(train)
            kg_writer.KgWriter.write(sample, train_dir, sample_filename_pattern.format(sample_idx))

        # create evaluation sample + write it to disk
        print("generating dev sample...")
        dev_sample = self._generate_sample(train, inf_countries=dev, minimal=True)
        kg_writer.KgWriter.write(dev_sample, dev_dir, "dev")

        # create test sample + write it to disk
        print("generating test sample...")
        test_sample = self._generate_sample(train, inf_countries=test, minimal=True)
        kg_writer.KgWriter.write(test_sample, test_dir, "test")

        # print statistics about the test sample
        num_spec = len([t for t in test_sample.triples if not t.inferred])
        num_inf = len([t for t in test_sample.triples if t.inferred])
        print(
                "number of triples in test sample: {} ({} spec / {} inf)".format(
                        num_spec + num_inf, num_spec, num_inf
                )
        )

        print("OK\n")
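
# A minimal invocation sketch for ``generate_datasets``. The construction of
# ``DatasetGenerator`` is elided, since its constructor is not shown in the
# source; the argument values are illustrative only.
generator = DatasetGenerator(...)  # constructor args not shown in the source

generator.generate_datasets(
        num_datasets=5,           # creates dataset directories 0/ through 4/
        num_training_samples=20,  # creates samples 00 through 19 in each train/ dir
        output_dir="./data"
)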
def max_tree_size(self, max_tree_size: int) -> None:
    insanity.sanitize_type("max_tree_size", max_tree_size, int)
    insanity.sanitize_range("max_tree_size", max_tree_size, minimum=1)
    self._max_tree_size = max_tree_size
def max_tree_depth(self, max_tree_depth: int) -> None:
    insanity.sanitize_type("max_tree_depth", max_tree_depth, int)
    insanity.sanitize_range("max_tree_depth", max_tree_depth, minimum=1)
    self._max_tree_depth = max_tree_depth
def read(cls, input_dir: str, basename: str, index: typing.Optional[int] = None) -> knowledge_graph.KnowledgeGraph:
    """Loads a knowledge graph from the specified location.

    Args:
        input_dir (str): The directory that contains all of the files.
        basename (str): The base name, i.e., the prefix, included in all files' names.
        index (int, optional): If this is provided, then ``input_dir`` and ``basename`` are assumed to specify
            a sequence of knowledge graphs, and ``index`` specifies the element of this sequence to retrieve.

    Returns:
        :class:`knowledge_graph.KnowledgeGraph`: A knowledge graph that has been populated according to the
            read information.

    Raises:
        ValueError: If ``input_dir`` does not refer to an existing directory or if any of the needed files is
            missing.
    """
    # //////// Sanitize Args ----------------------------------------------------------------------------------

    # ensure that the inputs are strings
    input_dir = str(input_dir)
    basename = str(basename)

    if index is not None:
        insanity.sanitize_type("index", index, int)
        insanity.sanitize_range("index", index, minimum=0)

    # assemble all needed paths
    # the used postfixes have the following meanings:
    # * _vocab -> definition of a class/relation/literal
    # * _spec  -> user-defined data
    # * _inf   -> inferred data
    individual_spec = os.path.join(input_dir, basename + io.INDIVIDUALS_SPEC_EXT)
    classes_vocab = os.path.join(input_dir, basename + io.CLASSES_VOCAB_EXT)
    classes_spec = os.path.join(input_dir, basename + io.CLASSES_SPEC_EXT)
    classes_inf = os.path.join(input_dir, basename + io.CLASSES_INF_EXT)
    classes_pred = os.path.join(input_dir, basename + io.CLASSES_PRED_EXT)
    relations_vocab = os.path.join(input_dir, basename + io.RELATIONS_VOCAB_EXT)
    relations_spec = os.path.join(input_dir, basename + io.RELATIONS_SPEC_EXT)
    relations_inf = os.path.join(input_dir, basename + io.RELATIONS_INF_EXT)
    relations_pred = os.path.join(input_dir, basename + io.RELATIONS_PRED_EXT)
    literals_vocab = os.path.join(input_dir, basename + io.LITERALS_VOCAB_EXT)
    literals_spec = os.path.join(input_dir, basename + io.LITERALS_SPEC_EXT)
    literals_inf = os.path.join(input_dir, basename + io.LITERALS_INF_EXT)
    literals_pred = os.path.join(input_dir, basename + io.LITERALS_PRED_EXT)
    if index is not None:
        classes_spec += "." + str(index)
        classes_inf += "." + str(index)
        classes_pred += "." + str(index)
        relations_spec += "." + str(index)
        relations_inf += "." + str(index)
        relations_pred += "." + str(index)
        literals_spec += "." + str(index)
        literals_inf += "." + str(index)
        literals_pred += "." + str(index)

    # check whether the input directory exists
    if not os.path.isdir(input_dir):
        raise ValueError("The provided <input_dir> does not exist: '{}'!".format(input_dir))

    # check whether all of the needed files exist
    for path in [
            individual_spec,
            classes_vocab, classes_spec, classes_inf, classes_pred,
            relations_vocab, relations_spec, relations_inf, relations_pred,
            literals_vocab, literals_spec, literals_inf, literals_pred
    ]:
        if not os.path.isfile(path):
            raise ValueError("Missing file: '{}'!".format(path))

    # //////// Read Vocabulary --------------------------------------------------------------------------------

    # create new empty knowledge graph
    kg = knowledge_graph.KnowledgeGraph()

    # read classes
    with open(classes_vocab, "r") as f:
        for line_idx, line in enumerate(f):
            if line == "":
                continue
            m = re.match(cls.VOCAB_REGEX, line)
            assert int(m.group("index")) == line_idx
            kg.classes.add(ctf.ClassTypeFactory.create_class(m.group("name")))

    # read relations
    with open(relations_vocab, "r") as f:
        for line_idx, line in enumerate(f):
            if line == "":
                continue
            m = re.match(cls.VOCAB_REGEX, line)
            assert int(m.group("index")) == line_idx
            kg.relations.add(rtf.RelationTypeFactory.create_relation(m.group("name")))

    # read literals
    with open(literals_vocab, "r") as f:
        for line_idx, line in enumerate(f):
            if line == "":
                continue
            m = re.match(cls.VOCAB_REGEX, line)
            assert int(m.group("index")) == line_idx
            kg.literals.add(ltf.LiteralTypeFactory.create_literal(m.group("name")))

    # //////// Read Individuals -------------------------------------------------------------------------------

    with open(individual_spec, "r") as f:
        for line_idx, line in enumerate(f):
            if line == "":
                continue
            m = re.match(cls.VOCAB_REGEX, line)
            assert int(m.group("index")) == line_idx
            kg.individuals.add(individual_factory.IndividualFactory.create_individual(m.group("name")))

    # //////// Read Class Memberships -------------------------------------------------------------------------

    # read specified memberships
    with open(classes_spec, "r") as f:
        for individual_index, line in enumerate(f):  # run through all individuals
            if line == "":
                continue
            current_ind = kg.individuals[individual_index]
            for class_index, mem in enumerate(map(int, re.findall(cls.MEMBERSHIPS_REGEX, line))):
                if mem != 0:  # only consider memberships that are actually specified
                    current_ind.classes.add(
                            class_membership.ClassMembership(kg.classes[class_index], mem == 1)
                    )

    # read inferred memberships
    with open(classes_inf, "r") as f:
        for individual_index, line in enumerate(f):  # run through all individuals
            if line == "":
                continue
            current_ind = kg.individuals[individual_index]
            for class_index, mem in enumerate(map(int, re.findall(cls.MEMBERSHIPS_REGEX, line))):
                if mem != 0:  # only consider memberships that are actually present
                    current_ind.classes.add(
                            class_membership.ClassMembership(kg.classes[class_index], mem == 1, inferred=True)
                    )

    # read memberships that are prediction targets
    with open(classes_pred, "r") as f:
        for individual_index, line in enumerate(f):  # run through all individuals
            if line == "":
                continue
            current_ind = kg.individuals[individual_index]
            for class_index, mem in enumerate(map(int, re.findall(cls.MEMBERSHIPS_REGEX, line))):
                if mem != 0:  # only consider memberships that are actually present
                    current_ind.classes.add(
                            class_membership.ClassMembership(kg.classes[class_index], mem == 1, prediction=True)
                    )

    # //////// Read Literals ----------------------------------------------------------------------------------

    # read specified literals
    with open(literals_spec, "r") as f:
        for line in f:
            if line == "":
                continue

            # parse read line
            m = re.match(cls.TRIPLE_REGEX, line)

            # fetch respective individual, literal, and value
            current_ind = kg.individuals[int(m.group("subject"))]
            current_lit = kg.literals[int(m.group("predicate"))]
            current_value = m.group("object")

            # add literal to individual
            current_ind.literals.add(literal_value.LiteralValue(current_lit, current_value))

    # read inferred literals
    with open(literals_inf, "r") as f:
        for line in f:
            if line == "":
                continue

            # parse read line
            m = re.match(cls.TRIPLE_REGEX, line)

            # fetch respective individual, literal, and value
            current_ind = kg.individuals[int(m.group("subject"))]
            current_lit = kg.literals[int(m.group("predicate"))]
            current_value = m.group("object")

            # add literal to individual
            current_ind.literals.add(literal_value.LiteralValue(current_lit, current_value, inferred=True))

    # read literals that are prediction targets
    with open(literals_pred, "r") as f:
        for line in f:
            if line == "":
                continue

            # parse read line
            m = re.match(cls.TRIPLE_REGEX, line)

            # fetch respective individual, literal, and value
            current_ind = kg.individuals[int(m.group("subject"))]
            current_lit = kg.literals[int(m.group("predicate"))]
            current_value = m.group("object")

            # add literal to individual
            current_ind.literals.add(literal_value.LiteralValue(current_lit, current_value, prediction=True))

    # //////// Read Triples -----------------------------------------------------------------------------------

    # read specified triples
    with open(relations_spec, "r") as f:
        for line in f:
            if line == "":
                continue

            # parse read line
            m = re.match(cls.TYPED_TRIPLE_REGEX, line)

            # fetch the respective subject, relation, and object
            positive = m.group("type") == "+"
            sub = kg.individuals[int(m.group("subject"))]
            pred = kg.relations[int(m.group("predicate"))]
            obj = kg.individuals[int(m.group("object"))]

            # add triple
            kg.triples.add(triple.Triple(sub, pred, obj, positive))

    # read inferred triples
    with open(relations_inf, "r") as f:
        for line in f:
            if line == "":
                continue

            # parse read line
            m = re.match(cls.TYPED_TRIPLE_REGEX, line)

            # fetch the respective subject, relation, and object
            positive = m.group("type") == "+"
            sub = kg.individuals[int(m.group("subject"))]
            pred = kg.relations[int(m.group("predicate"))]
            obj = kg.individuals[int(m.group("object"))]

            # add triple
            kg.triples.add(triple.Triple(sub, pred, obj, positive, inferred=True))

    # read triples that are prediction targets
    with open(relations_pred, "r") as f:
        for line in f:
            if line == "":
                continue

            # parse read line
            m = re.match(cls.TYPED_TRIPLE_REGEX, line)

            # fetch the respective subject, relation, and object
            positive = m.group("type") == "+"
            sub = kg.individuals[int(m.group("subject"))]
            pred = kg.relations[int(m.group("predicate"))]
            obj = kg.individuals[int(m.group("object"))]

            # add triple
            kg.triples.add(triple.Triple(sub, pred, obj, positive, prediction=True))

    return kg
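
# A minimal usage sketch for ``read``, assuming it is a classmethod of a
# hypothetical reader class named ``KgReader`` (the enclosing class is not
# shown in the source); the paths mirror those created by ``generate_datasets``.
kg = KgReader.read("./data/0/train", "00")
print(len(kg.triples))  # the populated graph exposes .triples, per the code above

# with ``index``, the same basename is treated as a sequence of graphs
kg_3 = KgReader.read("./data/0/train", "sample", index=3)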
def __init__(
        self,
        model: encoder.Encoder,
        word_emb: nn.Embedding,
        pos_emb: nn.Embedding,
        mask_index: int,
        prediction_rate: numbers.Real = 0.15,
        mask_rate: numbers.Real = 0.8,
        random_rate: numbers.Real = 0.1
):
    """Creates a new instance of ``BERTLoss``.

    Args:
        model (encoder.Encoder): The encoder model being pretrained.
        word_emb (nn.Embedding): The used word embeddings.
        pos_emb (nn.Embedding): The used positional embeddings.
        mask_index (int): The index of the mask token.
        prediction_rate (numbers.Real, optional): The percentage of tokens in each training sequence that
            predictions are computed for, which is set to ``0.15``, by default.
        mask_rate (numbers.Real, optional): Among all tokens that predictions are computed for, the percentage
            of tokens that are replaced with the mask token, as specified by ``mask_index``. This is set to
            ``0.8``, by default.
        random_rate (numbers.Real, optional): Among all tokens that predictions are computed for, the
            percentage of tokens that are randomly replaced with other tokens. This is set to ``0.1``, by
            default.
    """
    super().__init__()

    # sanitize args
    insanity.sanitize_type("model", model, encoder.Encoder)
    insanity.sanitize_type("word_emb", word_emb, nn.Embedding)
    insanity.sanitize_type("pos_emb", pos_emb, nn.Embedding)
    if pos_emb.embedding_dim != word_emb.embedding_dim:
        raise ValueError("<pos_emb> is not compatible with <word_emb>!")
    insanity.sanitize_type("mask_index", mask_index, int)
    if mask_index < 0 or mask_index >= word_emb.num_embeddings:
        raise ValueError("The <mask_index> does not exist in <word_emb>!")
    insanity.sanitize_type("prediction_rate", prediction_rate, numbers.Real)
    prediction_rate = float(prediction_rate)
    insanity.sanitize_range("prediction_rate", prediction_rate, minimum=0, maximum=1)
    insanity.sanitize_type("mask_rate", mask_rate, numbers.Real)
    mask_rate = float(mask_rate)
    insanity.sanitize_range("mask_rate", mask_rate, minimum=0, maximum=1)
    insanity.sanitize_type("random_rate", random_rate, numbers.Real)
    random_rate = float(random_rate)
    insanity.sanitize_range("random_rate", random_rate, minimum=0, maximum=1)
    if mask_rate + random_rate > 1:
        raise ValueError("<mask_rate> + <random_rate> has to be at most 1!")

    # store args
    self._mask_index = mask_index
    self._mask_rate = mask_rate
    self._model = model
    self._pad_index = model.pad_index
    self._pos_emb = pos_emb
    self._prediction_rate = prediction_rate
    self._random_rate = random_rate
    self._word_emb = word_emb

    # create an output layer, which is trained together with the model, for predicting masked tokens
    # (the layer yields raw logits, since ``nn.CrossEntropyLoss`` applies log-softmax internally)
    self._output_layer = nn.Linear(self._word_emb.embedding_dim, self._word_emb.num_embeddings)

    # create the used loss function
    self._loss = nn.CrossEntropyLoss()
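
# A construction sketch for ``BERTLoss``, assuming the hypothetical ``encoder``
# module referenced in the signature above; all sizes are illustrative only.
import torch.nn as nn

vocab_size, max_seq_len, emb_dim = 30000, 512, 256
word_emb = nn.Embedding(vocab_size, emb_dim)
pos_emb = nn.Embedding(max_seq_len, emb_dim)  # must match word_emb.embedding_dim
model = encoder.Encoder(...)  # constructor args not shown in the source

bert_loss = BERTLoss(
        model,
        word_emb,
        pos_emb,
        mask_index=0,          # index of the mask token in word_emb
        prediction_rate=0.15,  # 15% of tokens become prediction targets
        mask_rate=0.8,         # 80% of targets are replaced with the mask token
        random_rate=0.1        # 10% of targets are replaced with random tokens
)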
def position(self, position: typing.Union[int, None]):
    insanity.sanitize_type("position", position, int, none_allowed=True)
    if position is not None:
        insanity.sanitize_range("position", position, minimum=0)
    self._position = position