def add_string(self, string, encoded_from=None, train=None):
    """Add a string to the mapping and return its id(s).

    Arguments:
        string: The string to add.
        encoded_from: Source form used to compute the lemma rule; only
            consulted when ``self.is_encoded`` is True or None (undecided).
        train: Train mapping flag. When truthy, the vocabulary is frozen:
            unknown strings and characters are mapped to '<unk>' instead
            of being added, so ids stay consistent with the train mapping.

    Returns:
        If characters are included, a tuple (string id, character-sequence id).
        Otherwise only the string id.
    """
    # Special labels are stored verbatim and never encoded as lemma rules.
    special_labels = ("<pad>", "<unk>", "<none>", "<root>", "<anchor>")

    # While is_encoded is still undecided, remember the raw string so the
    # encoding decision can be made later from the collected originals.
    if self.is_encoded is None:
        self.strings_original.add(string)

    # Encode the string with a lemma rule: prefer the absolute rule when it
    # is already known to the mapping, otherwise fall back to the relative
    # rule. Skipped entirely when encoding is disabled (is_encoded is False).
    if self.is_encoded is None or self.is_encoded:
        if string not in special_labels:
            encoded_string = Lemmatizer.gen_absolute_lemma_rule(encoded_from, string)
            if encoded_string in self.strings_map:
                string = encoded_string
            else:
                string = Lemmatizer.gen_lemma_rule(encoded_from, string)

    # Word-level information.
    if string not in self.strings_map:
        if train:
            # Frozen vocabulary: unseen strings collapse to the unknown token.
            string = '<unk>'
        else:
            self.strings_map[string] = len(self.strings)
            self.strings.append(string)

    if not self._include_characters:
        return self.strings_map[string]

    # Character-level information: build the character-id sequence the first
    # time this string is seen.
    if string not in self.charseqs_map:
        self.charseqs_map[string] = len(self.charseqs)
        self.charseqs.append([])
        for c in string:
            if c not in self.alphabet_map:
                if train:
                    # Frozen alphabet: unseen characters collapse to '<unk>'.
                    c = '<unk>'
                else:
                    self.alphabet_map[c] = len(self.alphabet)
                    self.alphabet.append(c)
            self.charseqs[-1].append(self.alphabet_map[c])

    return (self.strings_map[string], self.charseqs_map[string])