コード例 #1
0
    def add_string(self, string, encoded_from=None, train=None):
        """Map a string to its id, registering it first when it is new.

        Arguments:
            string: string to look up or add.
            encoded_from: value passed, together with `string`, to the
                Lemmatizer rule generators; only used when is_encoded is
                None (undecided) or true.
            train: Train mapping. When truthy, nothing new is added to the
                word map or to the alphabet; an unknown word or character
                is replaced by '<unk>' instead.
        Returns:
            If characters are included, a tuple
            (string id, character sequence id). Otherwise only string id.
        Raises:
            KeyError: if the '<unk>' fallback is taken but '<unk>' is
                missing from the word map or the alphabet map.
        """
        # Special labels are stored verbatim and never turned into rules.
        special_labels = ("<pad>", "<unk>", "<none>", "<root>", "<anchor>")
        undecided = self.is_encoded is None

        # Keep the raw string around while the encoding is still undecided.
        if undecided:
            self.strings_original.add(string)

        # Encode string with lemma rule: prefer the absolute rule when it is
        # already a known string, otherwise use the regular rule.
        if (undecided or self.is_encoded) and string not in special_labels:
            absolute_rule = Lemmatizer.gen_absolute_lemma_rule(encoded_from, string)
            if absolute_rule in self.strings_map:
                string = absolute_rule
            else:
                string = Lemmatizer.gen_lemma_rule(encoded_from, string)

        # Word-level information
        if string not in self.strings_map:
            if train:
                string = "<unk>"
            else:
                self.strings_map[string] = len(self.strings)
                self.strings.append(string)

        string_id = self.strings_map[string]
        if not self._include_characters:
            return string_id

        # Character-level information (keyed by the possibly replaced string)
        if string not in self.charseqs_map:
            self.charseqs_map[string] = len(self.charseqs)
            char_ids = []
            self.charseqs.append(char_ids)
            for c in string:
                if c not in self.alphabet_map:
                    if train:
                        c = "<unk>"
                    else:
                        self.alphabet_map[c] = len(self.alphabet)
                        self.alphabet.append(c)
                char_ids.append(self.alphabet_map[c])

        return string_id, self.charseqs_map[string]