def load(self, folder):
    fns = {
        'dev': '{}_dev.csv'.format(self.lg1_lg2),
        'test': '{}_test500.csv'.format(self.lg1_lg2),
        'train': '{}_train500_10.csv'.format(self.lg1_lg2)
    }
    target_lg = self.lg1_lg2.split('_')[0]
    data_bundle = DataBundle()
    for name, fn in fns.items():
        path = os.path.join(folder, fn)
        ds = DataSet()
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    parts = line.split('\t')
                    if self.lower:
                        ins = Instance(word=parts[1].lower(), definition=parts[-1].lower())
                    else:
                        ins = Instance(word=parts[1], definition=parts[-1])
                    ds.append(ins)
        data_bundle.set_dataset(ds, name=name)
    target_words = {}
    with open(os.path.join(folder, '{}.txt'.format(target_lg)), encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                if self.lower:
                    line = line.lower()
                target_words[line] = 1
    target_words = list(target_words.keys())
    setattr(data_bundle, 'target_words', target_words)
    return data_bundle
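# For reference: the loader above assumes tab-separated rows in which column 1
# is the word and the last column is the definition. The exact row layout is
# not shown in this code, so the sample line below (with an id in column 0) is
# a hypothetical illustration only.
line = "0\tbread\ta food made of flour, water, and yeast"
parts = line.split('\t')
word, definition = parts[1], parts[-1]  # -> "bread", "a food made of ..."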
def load(self, folder):
    fns = {
        'dev': '{}_dev.csv',
        # 'test': '{}_test500.csv'.format(self.lg1_lg2),
        'train': '{}_train500_10.csv'
    }
    data_bundle = DataBundle()
    words = {}
    for lg in ['en', 'es', 'fr']:
        for name, fn in fns.items():
            path = os.path.join(folder, fn.format(lg))
            ds = read_dataset(path, self.lower, 0)
            data_bundle.set_dataset(ds, name=f'{lg}_{name}')
        target_words = {}
        with open(os.path.join(folder, '{}.txt'.format(lg)), encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    if self.lower:
                        line = line.lower()
                    target_words[line] = 1
        target_words = list(target_words.keys())
        words[lg] = target_words
    setattr(data_bundle, 'target_words_dict', words)
    for bi in ['en_fr', 'fr_en', 'en_es', 'es_en']:
        path = os.path.join(folder, '{}_test500.csv'.format(bi))
        ds = read_dataset(path, self.lower, 1)
        data_bundle.set_dataset(ds, '{}_test'.format(bi))
    return data_bundle
def test_demo(self):
    # related to issue https://github.com/fastnlp/fastNLP/issues/324#issue-705081091
    from fastNLP import DataSet, Instance
    from fastNLP.io import DataBundle
    data_bundle = DataBundle()
    ds = DataSet()
    ds.append(Instance(raw_words="截流 进入 最后 冲刺 ( 附 图片 1 张 )"))
    data_bundle.set_dataset(ds, name='train')
    data_bundle = CWSPipe().process(data_bundle)
    self.assertFalse('<' in data_bundle.get_vocab('chars'))
def load(self, folder):
    # First read the two monolingual files.
    lg1, lg2 = self.lg1_lg2.split('_')
    fns = {
        'dev': '{}_dev.csv',
        # 'test': '{}_test500.csv'.format(self.lg1_lg2),
        'train': '{}_train500_10.csv'
    }
    data_bundle = DataBundle()
    words = {}
    for lg in [lg1, lg2]:
        for name, fn in fns.items():
            path = os.path.join(folder, fn.format(lg))
            ds = read_dataset(path, self.lower, 0)
            data_bundle.set_dataset(ds, name=f'{lg}_{name}')
        target_words = {}
        with open(os.path.join(folder, '{}.txt'.format(lg)), encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    if self.lower:
                        line = line.lower()
                    target_words[line] = 1
        target_words = list(target_words.keys())
        words[lg] = target_words
    setattr(data_bundle, 'target_words_dict', words)
    # Read the bilingual test data.
    bi1 = f'{lg1}_{lg2}'
    bi2 = f'{lg2}_{lg1}'
    for bi in [bi1, bi2]:
        path = os.path.join(folder, '{}_test500.csv'.format(bi))
        ds = read_dataset(path, self.lower, 1)
        data_bundle.set_dataset(ds, '{}_test'.format(bi))
    return data_bundle
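# `read_dataset` is used by the two loaders above but is not defined in this
# section. The following is a hypothetical reconstruction based on the inline
# parsing in the first loader (word=parts[1], definition=parts[-1]); the name
# `bilingual` and the handling of the third argument are assumptions, not the
# real helper's signature.
from fastNLP import DataSet, Instance

def read_dataset(path, lower, bilingual=0):
    ds = DataSet()
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                parts = line.split('\t')
                word, definition = parts[1], parts[-1]
                if lower:
                    word, definition = word.lower(), definition.lower()
                ds.append(Instance(word=word, definition=definition))
    return ds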
def load(self, paths):
    """
    The output DataSet contains the following fields:

        tokens:  ["The", "bread", ...]
        pos:     ["DET", "NOUN", ...]
        dep:     [["dep", 2, 1], ["nsubj", 4, 2], ...]
        aspects: [{"term": ["bread"], "polarity": "positive", "from": 1, "to": 2}]

    In dep, ["dep", 2, 1] means the head of the current word (token 1) is
    token 2 (0 is the root; here token 2 is "bread") and "dep" is the
    dependency relation.

    :param paths:
    :return:
    """
    data_bundle = DataBundle()
    folder_name = os.path.basename(paths)
    fns = [
        f'{folder_name}_Test_biaffine_depparsed.json',
        f'{folder_name}_Train_biaffine_depparsed.json'
    ]
    if not os.path.exists(os.path.join(paths, fns[0])):
        fns = [
            'Test_biaffine_depparsed.json',
            'Train_biaffine_depparsed.json'
        ]
    for split, name in zip(['test', 'train'], fns):
        fp = os.path.join(paths, name)
        with open(fp, 'r', encoding='utf-8') as f:
            data = json.load(f)
        ds = DataSet()
        for d in data:
            tokens = d['token']
            pos = d['pos']
            dep = d['dependencies']
            aspects = d['aspects']
            ins = Instance(tokens=tokens, pos=pos, dep=dep, aspects=aspects)
            ds.append(ins)
        data_bundle.set_dataset(ds, name=split)
    return data_bundle
def load(self, folder):
    data_bundle = DataBundle()
    for name in ['desc.json', 'dev.json', 'seen.json', 'train.json', 'unseen.json']:
        path = os.path.join(folder, name)
        dataset = DataSet()
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for d in data:
                word = d['word'].lower()
                definition = d['definitions'].lower()
                ins = Instance(word=word, definition=definition)
                dataset.append(ins)
        data_bundle.set_dataset(dataset, name=name.split('.')[0])
    words = []
    with open(os.path.join(folder, 'target_words.txt'), 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                words.append(line)
    setattr(data_bundle, 'target_words', words)
    return data_bundle
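# For illustration: the loader above expects each JSON file to hold a list of
# objects with "word" and "definitions" string fields. The key names are taken
# from the code; the concrete entry below is a made-up example, not real data.
example = [
    {"word": "bread", "definitions": "a food made of flour, water, and yeast"}
]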
def load(self, paths):
    """
    The output DataSet contains the following fields:

        tokens:  ["The", "bread", ...]
        pos:     ["DET", "NOUN", ...]
        dep:     [["dep", 2, 1], ["nsubj", 4, 2], ...]
        aspects: [{"term": ["bread"], "polarity": "positive", "from": 1, "to": 2}]

    In dep, ["dep", 2, 1] means the head of the current word (token 1) is
    token 2 (0 is the root; here token 2 is "bread") and "dep" is the
    dependency relation.

    :param paths:
    :return:
    """
    data_bundle = DataBundle()
    folder_name = os.path.basename(paths)
    fns = [
        f"{folder_name}_Test.json",
        f"{folder_name}_Train.json",
    ]
    if not os.path.exists(os.path.join(paths, fns[0])):
        fns = ["Test.json", "Train.json"]
    for split, name in zip(["test", "train"], fns):
        fp = os.path.join(paths, name)
        with open(fp, "r", encoding="utf-8") as f:
            data = json.load(f)
        ds = DataSet()
        for d in data:
            tokens = d["token"]
            pos = d["pos"]
            dep = d["dependencies"]
            aspects = d["aspects"]
            ins = Instance(tokens=tokens, pos=pos, dep=dep, aspects=aspects)
            ds.append(ins)
        data_bundle.set_dataset(ds, name=split)
    return data_bundle
def process(self, data_bundle: DataBundle) -> DataBundle:
    new_bundle = DataBundle()
    aspect_dict = {}
    mask_id = self.tokenizer.convert_tokens_to_ids([self.mask])[0]
    if isinstance(self.tokenizer, BertTokenizer):
        cls = "[CLS]"
        sep = "[SEP]"
    else:
        cls = self.tokenizer.cls_token
        sep = self.tokenizer.sep_token
    for name, ds in data_bundle.iter_datasets():
        new_ds = DataSet()
        for ins in ds:
            tokens = ins["tokens"]
            if not isinstance(self.tokenizer, XLNetTokenizer):
                # BERT-style: [CLS] tokens [SEP]; aspect offsets shift by 1.
                tokens.insert(0, cls)
                tokens.append(sep)
                shift = 1
            else:
                # XLNet-style: tokens <sep> <cls>; offsets are unchanged.
                tokens.append(sep)
                tokens.append(cls)
                shift = 0
            starts = []
            ends = []
            for aspect in ins["aspects"]:
                starts.append(aspect["from"] + shift)
                ends.append(aspect["to"] + shift)
            for aspect in ins["aspects"]:
                target = aspect["polarity"]
                start = aspect["from"] + shift
                end = aspect["to"] + shift
                # Word-level mask marking the aspect span.
                aspect_mask = [0] * len(tokens)
                for i in range(start, end):
                    aspect_mask[i] = 1
                pieces = []
                piece_masks = []
                # Wrap the aspect term in "[[ ... ]]" for the raw-word view.
                raw_words = tokens[shift:-1]
                raw_words.insert(start - 1, "[[")
                raw_words.insert(end, "]]")
                # Expand the word-level mask to word pieces.
                for mask, token in zip(aspect_mask, tokens):
                    bpes = self.tokenizer.convert_tokens_to_ids(
                        self.tokenizer.tokenize(token))
                    pieces.extend(bpes)
                    piece_masks.extend([mask] * len(bpes))
                new_ins = Instance(
                    tokens=pieces,
                    target=target,
                    aspect_mask=piece_masks,
                    raw_words=" ".join(raw_words),
                )
                new_ds.append(new_ins)
        new_bundle.set_dataset(new_ds, name)
    target_vocab = Vocabulary(padding=None, unknown=None)
    target_vocab.add_word_lst(["neutral", "positive", "negative", "smooth"])
    target_vocab.index_dataset(*new_bundle.datasets.values(), field_name="target")
    new_bundle.set_target("target")
    new_bundle.set_input("tokens", "aspect_mask", "raw_words")
    new_bundle.apply_field(lambda x: len(x), field_name="tokens", new_field_name="seq_len")
    if hasattr(self.tokenizer, "pad_token_id"):
        new_bundle.set_pad_val("tokens", self.tokenizer.pad_token_id)
    else:
        new_bundle.set_pad_val("tokens", self.tokenizer.pad_index)
    new_bundle.set_vocab(target_vocab, "target")
    return new_bundle
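# Minimal sketch of the mask-expansion step in `process` above: a word-level
# aspect mask is expanded so that every word piece inherits its word's mask
# value. The toy piece table is a stand-in for the real tokenizer; the actual
# pipe uses self.tokenizer.tokenize / convert_tokens_to_ids.
tokens = ["[CLS]", "The", "bread", "was", "great", "[SEP]"]
aspect_mask = [0, 0, 1, 0, 0, 0]           # "bread" is the aspect term
toy_pieces = {"great": ["gre", "##at"]}    # pretend "great" splits into 2 pieces

pieces, piece_masks = [], []
for mask, token in zip(aspect_mask, tokens):
    bpes = toy_pieces.get(token, [token])  # stand-in for tokenizer.tokenize(token)
    pieces.extend(bpes)
    piece_masks.extend([mask] * len(bpes))

assert piece_masks == [0, 0, 1, 0, 0, 0, 0]  # "gre"/"##at" both inherit mask 0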