コード例 #1
0
def build_dataset(data, word_dict, char_dict, tag_dict):
    """Convert raw records into dictionaries of integer indices.

    For every record in ``data`` the result holds one dict with three keys:
    ``"words"`` (one index per word), ``"chars"`` (one list of character
    indices per word) and ``"tags"`` (one index per tag).

    Character indices are taken from each word as given; the word index is
    taken after ``word_convert(word, keep_number=False, lowercase=True)``.
    Words and characters absent from their dictionary map to that
    dictionary's ``UNK`` entry. Tags are looked up in ``tag_dict`` directly,
    with no fallback.

    :param data: iterable of records exposing ``"words"`` and ``"tags"``
    :param word_dict: mapping from converted word to index
    :param char_dict: mapping from character to index
    :param tag_dict: mapping from tag to index
    :return: list of ``{"words": ..., "chars": ..., "tags": ...}`` dicts
    """

    def lookup(table, key):
        # Fall back to the UNK entry for keys the table does not contain.
        return table[key] if key in table else table[UNK]

    def encode(raw_word):
        # Characters are indexed before the word itself is normalised.
        char_ids = [lookup(char_dict, ch) for ch in raw_word]
        normalized = word_convert(raw_word, keep_number=False, lowercase=True)
        return char_ids, lookup(word_dict, normalized)

    dataset = []
    for record in data:
        encoded = [encode(raw_word) for raw_word in record["words"]]
        tag_ids = [tag_dict[tag] for tag in record["tags"]]
        dataset.append({
            "words": [word_id for _, word_id in encoded],
            "chars": [char_ids for char_ids, _ in encoded],
            "tags": tag_ids,
        })
    return dataset
コード例 #2
0
def raw_dataset_iter(filename, encoding="utf-8"):
    """Yield ``(words, tags)`` pairs, one per sentence, from a tab-separated file.

    Every data line must split on tabs into exactly three fields: the first
    is discarded, the second is the word and the third is its tag. A sentence
    ends at an empty line (after stripping whitespace) or at a line starting
    with ``--------------``. Consecutive boundaries never produce an empty
    sentence.

    :param filename: path of the file to read
    :param encoding: text encoding handed to ``codecs.open``
    :return: generator of ``(words, tags)`` tuples, where each word has been
        passed through ``word_convert(word, language="french")``
    :raises ValueError: if a data line does not split into three fields
    """
    with codecs.open(filename, mode="r", encoding=encoding) as f:
        words, tags = [], []
        for line in f:
            line = line.strip()
            if not line or line.startswith("--------------"):
                # Sentence boundary: emit whatever has been collected so far.
                if words:
                    yield words, tags
                    words, tags = [], []
            else:
                _, word, tag = line.split("\t")
                words.append(word_convert(word, language="french"))
                tags.append(tag)
        # A file may end without a trailing blank/separator line; flush the
        # pending sentence so it is not silently dropped.
        if words:
            yield words, tags
コード例 #3
0
def raw_dataset_iter(filename, task_name, keep_number, lowercase):
    """Yield ``(words, tags)`` pairs, one per sentence, from a space-separated file.

    Every data line must split on a single space into exactly two fields: the
    word and its tag. A sentence ends at an empty line (after stripping
    whitespace) or at a line starting with ``-DOCSTART-``. Consecutive
    boundaries never produce an empty sentence. The file is read as UTF-8.

    :param filename: path of the file to read
    :param task_name: not used by this function; kept so existing callers
        keep working
    :param keep_number: forwarded to ``word_convert``
    :param lowercase: forwarded to ``word_convert``
    :return: generator of ``(words, tags)`` tuples, where each word has been
        passed through ``word_convert``
    :raises ValueError: if a data line does not split into two fields
    """
    with codecs.open(filename, mode="r", encoding="utf-8") as f:
        words, tags = [], []
        for line in f:
            line = line.strip()
            if not line or line.startswith("-DOCSTART-"):
                # Sentence boundary: emit whatever has been collected so far.
                if words:
                    yield words, tags
                    words, tags = [], []
            else:
                word, tag = line.split(" ")
                words.append(word_convert(word,
                                          keep_number=keep_number,
                                          lowercase=lowercase))
                tags.append(tag)
        # A file may end without a trailing blank line; flush the pending
        # sentence so it is not silently dropped.
        if words:
            yield words, tags
コード例 #4
0
 def words_to_indices(self, words):
     """
     Turn input words into batched word/char indices for inference.

     Character indices are looked up on the words as given; word indices are
     looked up after ``word_convert`` has been applied with the configured
     language. Anything missing from ``self.char_dict`` / ``self.word_dict``
     maps to that dictionary's ``UNK`` entry.
     :param words: input words
     :return: result of ``process_batch_data`` called on a batch that holds
         only this one sentence
     """
     # Per-word character indices, computed on the unconverted words.
     chars_idx = []
     for raw_word in words:
         char_ids = []
         for ch in raw_word:
             key = ch if ch in self.char_dict else UNK
             char_ids.append(self.char_dict[key])
         chars_idx.append(char_ids)

     # Normalise the words before looking them up in the word dictionary.
     converted = []
     for raw_word in words:
         converted.append(word_convert(raw_word, language=self.cfg["language"]))

     words_idx = []
     for token in converted:
         key = token if token in self.word_dict else UNK
         words_idx.append(self.word_dict[key])

     # Wrap both in a list so the sentence forms a batch of size one.
     return process_batch_data([words_idx], [chars_idx])