def words_to_indices(self, words):
    """Convert raw input words into a batched word/char index structure.

    Out-of-vocabulary words and characters fall back to the UNK index.

    :param words: sequence of raw input words
    :return: batch built by ``self.batcher.make_each_batch``
    """
    unk_char = self.char_dict[UNK]
    unk_word = self.word_dict[UNK]
    # Character ids are taken from the raw (unconverted) surface forms.
    chars_idx = [
        [self.char_dict.get(ch, unk_char) for ch in token] for token in words
    ]
    # Word ids use the normalized forms produced by word_convert.
    normalized = [word_convert(token) for token in words]
    words_idx = [self.word_dict.get(token, unk_word) for token in normalized]
    return self.batcher.make_each_batch([words_idx], [chars_idx])
def _make_ids(_words):
    """Build parallel char-id and word-id lists for one token sequence.

    NOTE: relies on ``self`` (char/word vocabularies) from the enclosing
    scope; unknown entries map to the UNK index.
    """
    _char_ids, _word_ids = [], []
    unk_char = self.char_dict[UNK]
    unk_word = self.word_dict[UNK]
    for token in _words:
        # Characters come from the raw token; the word id from its
        # normalized form.
        _char_ids.append([self.char_dict.get(ch, unk_char) for ch in token])
        norm = word_convert(token, keep_number=False, lowercase=True)
        _word_ids.append(self.word_dict.get(norm, unk_word))
    return _char_ids, _word_ids
def load_dataset(self, filename, keep_number=False, lowercase=True):
    """Load a JSON dataset and normalize the words of every record.

    :param filename: path of the JSON file read via ``load_json``
    :param keep_number: forwarded to ``word_convert``
    :param lowercase: forwarded to ``word_convert``
    :return: list of dicts with keys "sent_id", "words", "tags"
    """
    return [
        {
            "sent_id": record["sent_id"],
            "words": [
                word_convert(w, keep_number=keep_number, lowercase=lowercase)
                for w in record["words"]
            ],
            # Span annotations are passed through unchanged under "tags".
            "tags": record["spans"],
        }
        for record in load_json(filename)
    ]
def raw_dataset_iter(filename, encoding="utf-8"):
    """Yield (words, tags) sentence pairs from a tab-separated file.

    A blank line or a line starting with ``--------------`` marks the
    end of a sentence.  Each data line has three tab-separated fields:
    an index (ignored), the word, and the tag.

    :param filename: path of the raw dataset file
    :param encoding: text encoding used to open the file
    """
    with codecs.open(filename, mode="r", encoding=encoding) as f:
        words, tags = [], []
        for line in f:
            line = line.strip()
            # Separator or empty line -> one whole sentence has been read.
            if len(line) == 0 or line.startswith("--------------"):
                if len(words) != 0:
                    yield words, tags
                    words, tags = [], []
            else:
                _, word, tag = line.split("\t")
                words.append(word_convert(word))
                tags.append(tag)
        # Bug fix: emit the final sentence when the file lacks a trailing
        # separator/blank line; previously it was silently dropped.
        if len(words) != 0:
            yield words, tags
def build_dataset(data, word_dict, char_dict, tag_dict):
    """Convert normalized records into index-based training examples.

    :param data: iterable of records with "words" and "tags" keys
    :param word_dict: word -> id mapping (must contain UNK)
    :param char_dict: char -> id mapping (must contain UNK)
    :param tag_dict: tag -> id mapping
    :return: list of dicts with keys "words", "chars", "tags"
    """
    unk_word = word_dict[UNK]
    unk_char = char_dict[UNK]
    dataset = []
    for record in data:
        # Char ids come from the raw surface forms ...
        chars_list = [
            [char_dict.get(ch, unk_char) for ch in token]
            for token in record["words"]
        ]
        # ... while word ids use the normalized forms.
        words = [
            word_dict.get(
                word_convert(token, keep_number=False, lowercase=True),
                unk_word,
            )
            for token in record["words"]
        ]
        tags = [(tag_dict[t], i, j) for (t, i, j) in record["tags"]]
        dataset.append({"words": words, "chars": chars_list, "tags": tags})
    return dataset
def raw_dataset_iter(filename, keep_number, lowercase):
    """Yield (words, tags) sentences from a CoNLL-style file.

    Blank lines delimit sentences and ``-DOCSTART-`` lines are skipped.
    On data lines the first whitespace-separated column is the word and
    the last column is the tag.

    :param filename: path of the raw dataset file
    :param keep_number: forwarded to ``word_convert``
    :param lowercase: forwarded to ``word_convert``
    """
    with codecs.open(filename, mode="r", encoding="utf-8") as f:
        words, tags = [], []
        for line in f:
            line = line.strip()
            if line.startswith("-DOCSTART-"):
                continue
            if len(line) == 0:
                # A blank line ends the current sentence.
                if len(words) != 0:
                    yield words, tags
                    words, tags = [], []
            else:
                columns = line.split()
                word = word_convert(columns[0], keep_number=keep_number,
                                    lowercase=lowercase)
                words.append(word)
                tags.append(columns[-1])
        # Bug fix: yield the last sentence when the file does not end
        # with a blank line; previously it was silently dropped.
        if len(words) != 0:
            yield words, tags
def raw_dataset_iter(dataset, task_name, keep_number, lowercase):
    """Yield (words, tags) pairs from an in-memory CoNLL-style dataset.

    Each sentence is a sequence of per-token records whose first four
    fields are (word, pos, chunk, ner); ``task_name`` selects which of
    those fields becomes the tag ("ner", "chunk", or POS otherwise).

    :param dataset: iterable of sentences (each a list of token records)
    :param task_name: "ner", "chunk", or anything else for POS tagging
    :param keep_number: forwarded to ``word_convert``
    :param lowercase: forwarded to ``word_convert``
    """
    for sentence in dataset:
        words, tags = [], []
        for token_fields in sentence:
            word, pos, chunk, ner = token_fields[0:4]
            # Pick the tag column for the requested task (POS is the
            # default fallback).
            tag = {"ner": ner, "chunk": chunk}.get(task_name, pos)
            words.append(word_convert(word, keep_number=keep_number,
                                      lowercase=lowercase))
            tags.append(tag)
        yield words, tags
def words_to_indices(self, words):
    """Convert input words into batched word/char indices for inference.

    Unknown words and characters map to the UNK index.

    :param words: sequence of raw input words
    :return: batch produced by ``process_batch_data``
    """
    unk_char = self.char_dict[UNK]
    unk_word = self.word_dict[UNK]
    # Character ids are built from the raw surface forms.
    chars_idx = [
        [self.char_dict.get(ch, unk_char) for ch in token] for token in words
    ]
    # Word ids are looked up on the normalized forms.
    converted = [word_convert(token) for token in words]
    words_idx = [self.word_dict.get(token, unk_word) for token in converted]
    return process_batch_data([words_idx], [chars_idx])
def raw_dataset_iter(filename, task_name, keep_number, lowercase):
    """Yield (words, tags) sentences from a space-separated CoNLL file.

    A blank line or a ``-DOCSTART-`` line ends the current sentence.
    Data lines hold exactly four single-space-separated fields
    (word, pos, chunk, ner); ``task_name`` picks the tag column
    ("ner", "chunk", or POS otherwise).

    :param filename: path of the raw dataset file
    :param task_name: "ner", "chunk", or anything else for POS tagging
    :param keep_number: forwarded to ``word_convert``
    :param lowercase: forwarded to ``word_convert``
    """
    with codecs.open(filename, mode="r", encoding="utf-8") as f:
        words, tags = [], []
        for line in f:
            line = line.strip()
            # Blank line or document marker -> whole sentence read.
            if len(line) == 0 or line.startswith("-DOCSTART-"):
                if len(words) != 0:
                    yield words, tags
                    words, tags = [], []
            else:
                word, pos, chunk, ner = line.split(" ")
                if task_name == "ner":
                    tag = ner
                elif task_name == "chunk":
                    tag = chunk
                else:
                    tag = pos
                word = word_convert(word, keep_number=keep_number,
                                    lowercase=lowercase)
                words.append(word)
                tags.append(tag)
        # Bug fix: emit the final sentence when the file has no trailing
        # blank line; previously it was silently dropped.
        if len(words) != 0:
            yield words, tags