def test():
    from fastNLP import DataSetIter, DataSet
    from fastNLP.core.utils import _move_dict_value_to_device, _get_model_device
    from my_bert_match import addWords, addWordPiece, processItem, processNum, addSeqlen

    # 0 for not match, 1 for match
    testset = DataSet({"raw_words": ["5::five"]})
    testset.apply(addWords, new_field_name="p_words")
    testset.apply(addWordPiece, new_field_name="t_words")
    testset.apply(processItem, new_field_name="word_pieces")
    testset.apply(processNum, new_field_name="word_nums")
    testset.apply(addSeqlen, new_field_name="seq_len")
    testset.field_arrays["word_pieces"].is_input = True
    testset.field_arrays["seq_len"].is_input = True
    testset.field_arrays["word_nums"].is_input = True
    # print(testset)

    from fastNLP.io import ModelLoader
    loader = ModelLoader()
    if torch.cuda.is_available():
        model = loader.load_pytorch_model("../models/bert_model_max_triple.pkl")
    else:
        model = torch.load("../models/bert_model_max_triple.pkl", map_location="cpu")
    model.eval()

    test_batch = DataSetIter(batch_size=1, dataset=testset, sampler=None)
    outputs = []
    for batch_x, batch_y in test_batch:
        _move_dict_value_to_device(batch_x, batch_y, device=_get_model_device(model))
        outputs.append(
            model.forward(batch_x["word_pieces"], batch_x["word_nums"],
                          batch_x["seq_len"])['pred'])
    outputs = torch.cat(outputs)
    outputs = torch.nn.functional.softmax(outputs, dim=1)
    return outputs
def is_phrase_match_BERT(phrase1, phrase2):
    """
    Determine if two phrases match.

    :param phrase1: phrase1
    :param phrase2: phrase2
    """
    from fastNLP import DataSetIter, DataSet
    from fastNLP.core.utils import _move_dict_value_to_device
    from my_bert_match import addWords, addWordPiece, processItem, processNum, addSeqlen

    # 0 for not match, 1 for match
    testset = DataSet({"raw_words": [f"{phrase1}::{phrase2}"]})
    testset.apply(addWords, new_field_name="p_words")
    testset.apply(addWordPiece, new_field_name="t_words")
    testset.apply(processItem, new_field_name="word_pieces")
    testset.apply(processNum, new_field_name="word_nums")
    testset.apply(addSeqlen, new_field_name="seq_len")
    testset.field_arrays["word_pieces"].is_input = True
    testset.field_arrays["seq_len"].is_input = True
    testset.field_arrays["word_nums"].is_input = True
    # print(testset)

    with torch.no_grad():
        bert_model.eval()
        test_batch = DataSetIter(batch_size=1, dataset=testset, sampler=None)
        outputs = []
        for batch_x, batch_y in test_batch:
            _move_dict_value_to_device(batch_x, batch_y, device=device)
            outputs.append(
                bert_model.forward(batch_x["word_pieces"], batch_x["word_nums"],
                                   batch_x["seq_len"])['pred'])
        outputs = torch.cat(outputs)
        outputs = torch.nn.functional.softmax(outputs, dim=1)
    return ["Not Match", "Related", "Match"][outputs.argmax().item()]
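A minimal usage sketch for the function above; the phrase pair is made up, and the module-level bert_model, device, and torch objects it relies on are assumed to have been loaded elsewhere:

# Hypothetical call; the inputs are illustrative only.
label = is_phrase_match_BERT("renal failure", "kidney failure")
print(label)  # one of "Not Match", "Related", "Match"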
def create_dataset(data, sample_size):
    data_set = DataSet()
    data_set.add_field('raw_sentence', data.data[:sample_size])
    data_set.add_field('target', data.target[:sample_size])
    data_set.apply(lambda x: sentence_to_words(x['raw_sentence']),
                   new_field_name='word_seq')
    return data_set
def test_apply_cannot_modify_instance(self):
    ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})

    def modify_inplace(instance):
        instance['words'] = 1

    with self.assertRaises(TypeError):
        ds.apply(modify_inplace)
def test_apply_tqdm(self):
    import time
    ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})

    def do_nothing(ins):
        time.sleep(0.01)

    ds.apply(do_nothing, use_tqdm=True)
    ds.apply_field(do_nothing, field_name='x', use_tqdm=True)
def test_BucketSampler(self):
    sampler = BucketSampler(num_buckets=3, batch_size=16,
                            seq_len_field_name="seq_len")
    data_set = DataSet({
        "x": [[0] * random.randint(1, 10)] * 10,
        "y": [[5, 6]] * 10
    })
    data_set.apply(lambda ins: len(ins["x"]), new_field_name="seq_len")
    indices = sampler(data_set)
    self.assertEqual(len(indices), 10)
def produceCandidateTripleSlow(raw_phrase, Candidate_phrases, model,
                               Candidate_hpos_sub, threshold):
    """
    Use BERT to decide which entry in Candidate_phrases is semantically closest to
    raw_phrase; max-score based; intended for processing a single phrase.
    """
    from fastNLP.core.utils import _move_dict_value_to_device
    from fastNLP.core.utils import _get_model_device
    from fastNLP import DataSet
    from fastNLP import DataSetIter
    from my_bert_match import addWordPiece, addSeqlen, addWords, processItem, processNum

    p_Candidate_phrases = [
        raw_phrase + "::" + item for item in Candidate_phrases
    ]
    Candidate_dataset = DataSet({"raw_words": p_Candidate_phrases})
    Candidate_dataset.apply(addWords, new_field_name="p_words")
    Candidate_dataset.apply(addWordPiece, new_field_name="t_words")
    Candidate_dataset.apply(processItem, new_field_name="word_pieces")
    Candidate_dataset.apply(processNum, new_field_name="word_nums")
    Candidate_dataset.apply(addSeqlen, new_field_name="seq_len")
    Candidate_dataset.field_arrays["word_pieces"].is_input = True
    Candidate_dataset.field_arrays["seq_len"].is_input = True
    Candidate_dataset.field_arrays["word_nums"].is_input = True

    test_batch = DataSetIter(batch_size=10, dataset=Candidate_dataset, sampler=None)
    outputs = []
    for batch_x, batch_y in test_batch:
        _move_dict_value_to_device(batch_x, batch_y,
                                   device=_get_model_device(model))
        outputs.append(
            model.forward(batch_x["word_pieces"], batch_x["word_nums"],
                          batch_x["seq_len"])['pred'])
    outputs = torch.cat(outputs)
    outputs = torch.nn.functional.softmax(outputs, dim=1).cpu().detach().numpy()
    results_2 = np.array([item[2] for item in outputs])
    results_1 = np.array([item[1] for item in outputs])
    # If an exact match is already found here, return it directly
    if max(results_2) >= threshold:
        return Candidate_hpos_sub[int(np.argmax(results_2))], max(results_2), "2"
    if max(results_1) >= threshold:
        return Candidate_hpos_sub[int(np.argmax(results_1))], max(results_1), "1"
    return "None", None, "0"
def get_data():
    s = ''
    dataset = DataSet()
    for line in open('../handout/tangshi.txt'):
        if line == '\n':
            dataset.append(Instance(raw_sentence=s, label='0'))
            # print(s)
            s = ''
        else:
            s += line.replace('\n', '')
    dataset.apply(add_end, new_field_name='raw_sentence')
    dataset.apply(split_sent, new_field_name='words')
    return dataset
def preprocess(input):
    data = input.data
    target = input.target
    dataset = DataSet()
    for i in range(len(data)):
        data_tmp = data[i]
        for c in string.whitespace:
            data_tmp = data_tmp.replace(c, ' ')
        for c in string.punctuation:
            data_tmp = data_tmp.replace(c, '')
        data_tmp = data_tmp.lower().split()
        # print(data_tmp)
        dataset.append(Instance(sentence=data_tmp, target=int(target[i])))
    dataset.apply(lambda x: len(x['sentence']), new_field_name='seq_len')
    return dataset
def import_data(path):
    dataset = DataSet()
    with open(path, encoding='utf-8') as f:
        content = f.readlines()[1::2]
        for c in content:
            c = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!?、~@#¥%……&*()《》]+", "", c)  # !,。?
            # remove any remaining whitespace characters
            c = re.sub('[' + string.whitespace + ']', '', c).strip()
            dataset.append(Instance(raw_sentence=c))
    dataset.drop(lambda x: len(list(x['raw_sentence'])) == 0)

    def split_sent(ins):
        return list(ins['raw_sentence'])

    dataset.apply(split_sent, new_field_name='words', is_input=True)
    test_data, train_data = dataset.split(0.8)
    print(len(test_data), len(train_data))
    return test_data, train_data
def get_data():
    global max_length
    dataset = DataSet()
    for i in range(58):
        f = open("../../../chinese-poetry/json/poet.tang." + str(i * 1000) + ".json",
                 encoding='utf-8')
        setting = json.load(f)
        # print(setting)
        for line in setting:
            if (len(line['paragraphs']) == 4
                    and len(line['paragraphs'][0]) == 12
                    and len(line['paragraphs'][1]) == 12
                    and len(line['paragraphs'][2]) == 12
                    and len(line['paragraphs'][3]) == 12):
                s = ''
                for sentence in line['paragraphs']:
                    s += Converter('zh-hans').convert(sentence)
                s += '$'
                dataset.append(Instance(raw_sentence=s))
        f.close()
        print('Has processed ' + str((i + 1) * 1000) + ' poems')
    dataset.apply(split_sent, new_field_name='words')
    train_data, test_data = dataset.split(0.2)
    return train_data, test_data
def preprocess(data_in):
    data = data_in.data
    target = data_in.target
    dataset = DataSet()
    for i in range(len(data)):
        data_tmp = re.sub('\d+', ' ', data[i])
        for c in string.whitespace:
            data_tmp = data_tmp.replace(c, ' ')
        for c in string.punctuation:
            data_tmp = data_tmp.replace(c, '')
        data_tmp = data_tmp.lower().split()
        dataset.append(
            Instance(raw_sentence=data[i], target=int(target[i]), sentence=data_tmp))
    dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
    dataset.apply(lambda x: len(x['sentence']), new_field_name='seq_len')
    return dataset
def __init__(self, path=".data/yelp", dataset="yelp", batch_size=32): if dataset == "yelp": dataset = DataSet() for db_set in ['train']: text_file = os.path.join(path, 'sentiment.' + db_set + '.text') label_file = os.path.join(path, 'sentiment.' + db_set + '.labels') with io.open(text_file, 'r', encoding="utf-8") as tf, io.open( label_file, 'r', encoding="utf-8") as lf: for text in tf: label = lf.readline() dataset.append(Instance(text=text, label=label)) dataset.apply(lambda x: x['text'].lower(), new_field_name='text') dataset.apply( lambda x: ['<start>'] + x['text'].split() + ['<eos>'], new_field_name='words') dataset.drop(lambda x: len(x['words']) > 1 + 15 + 1) dataset.apply(lambda x: x['words'] + ['<pad>'] * (17 - len(x['words'])), new_field_name='words') dataset.apply(lambda x: int(x['label']), new_field_name='label_seq', is_target=True) _train_data, _test_data = dataset.split(0.3) _vocab = Vocabulary(min_freq=2) _train_data.apply( lambda x: [_vocab.add(word) for word in x['words']]) _vocab.build_vocab() _train_data.apply( lambda x: [_vocab.to_index(word) for word in x['words']], new_field_name='word_seq', is_input=True) _test_data.apply( lambda x: [_vocab.to_index(word) for word in x['words']], new_field_name='word_seq', is_input=True) self.train_data = _train_data self.test_data = _test_data self.vocab = _vocab self.batch_size = batch_size self.train_iter = iter( Batch(dataset=self.train_data, batch_size=self.batch_size, sampler=SequentialSampler()))
def handle_data(n_class):
    train_data = get_text_classification_datasets(n_class)
    dataset = DataSet()
    vocab = Vocabulary(min_freq=0, unknown='<unk>', padding='<pad>')
    for i in range(len(train_data.data)):
        ans = remove_punc(train_data.data[i])
        dataset.append(Instance(content=ans, target=int(train_data.target[i])))
    dataset.apply(lambda x: x['content'].lower().split(),
                  new_field_name='words',
                  is_input=True)
    for txt in dataset:
        vocab.add_word_lst(txt['words'])
    vocab.build_vocab()
    # index the sentences with Vocabulary.to_index(word)
    dataset.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                  new_field_name='index')
    dataset.set_input("index")
    dataset.set_target("target")
    tra, dev = dataset.split(0.2)
    return tra, dev, len(vocab)
def read_json(path, is_low=True):
    # Read the json file into a DataSet and add the seq_len, post and mask fields
    with open(path, "r", encoding="utf8") as f:
        raw_data = json.load(f)
    data = []
    for d in raw_data:
        tok = d["token"] if not is_low else [w.lower() for w in d["token"]]
        pos = d["pos"]
        head = d["head"]
        deprel = d["deprel"]
        for aspect in d["aspects"]:
            asp = [aspect["term"], ["urgrn"]][len(aspect["term"]) == 0]  # for Restaurants16
            fidx = aspect["from"]
            tidx = aspect["to"]
            pol = aspect["polarity"]
            data.append([tok, pos, head, deprel, asp, fidx, tidx, pol])
    fields = ["tok", "pos", "head", "deprel", "asp", "fidx", "tidx", "pol"]
    dataset = DataSet(dict(zip(fields, zip(*data))))

    def get_post(instance):
        fidx = instance["fidx"]
        tidx = instance["tidx"]
        seq_len = instance["seq_len"]
        return ([i - fidx for i in range(fidx)] +
                [0 for _ in range(fidx, tidx)] +
                [i - tidx + 1 for i in range(tidx, seq_len)])

    def get_mask(instance):
        if not instance["fidx"] == instance["tidx"]:
            return [[0, 1][i in range(instance["fidx"], instance["tidx"])]
                    for i in range(instance["seq_len"])]
        else:
            return [1 for _ in range(instance["seq_len"])]

    dataset.apply(lambda line: len(line["tok"]), new_field_name="seq_len")
    dataset.apply(get_post, new_field_name="post")
    dataset.apply(get_mask, new_field_name="mask")
    return dataset
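To make the derived fields concrete, a small worked example with assumed values (a 6-token instance whose aspect span covers tokens 2 and 3, i.e. fidx=2, tidx=4); the arithmetic mirrors get_post and get_mask above:

# Illustrative values only.
fidx, tidx, seq_len = 2, 4, 6
post = ([i - fidx for i in range(fidx)]
        + [0 for _ in range(fidx, tidx)]
        + [i - tidx + 1 for i in range(tidx, seq_len)])
mask = [1 if fidx <= i < tidx else 0 for i in range(seq_len)]
print(post)  # [-2, -1, 0, 0, 1, 2]  -> signed distance to the aspect span
print(mask)  # [0, 0, 1, 1, 0, 0]    -> 1 on aspect tokens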
def get_fastnlp_dataset():
    text_train, text_test = get_text_classification_datasets()
    train_data = DataSet()
    test_data = DataSet()
    for i in range(len(text_train.data)):
        train_data.append(
            Instance(text=split_sent(text_train.data[i]),
                     target=int(text_train.target[i])))
    for i in range(len(text_test.data)):
        test_data.append(
            Instance(text=split_sent(text_test.data[i]),
                     target=int(text_test.target[i])))

    # build the vocabulary
    vocab = Vocabulary(min_freq=5, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['text']])
    vocab.build_vocab()

    # map sentences to indices with the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['text']],
                     new_field_name='word_seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['text']],
                    new_field_name='word_seq')

    # mark the input and target fields
    train_data.set_input("word_seq")
    test_data.set_input("word_seq")
    train_data.set_target("target")
    test_data.set_target("target")
    return train_data, test_data, vocab
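A hedged sketch of how the returned sets could then be batched, using the DataSetIter that other snippets in this collection rely on; the batch size is an arbitrary choice and not from the original code:

from fastNLP import DataSetIter

train_data, test_data, vocab = get_fastnlp_dataset()
train_batches = DataSetIter(dataset=train_data, batch_size=32, sampler=None)
for batch_x, batch_y in train_batches:
    pass  # batch_x["word_seq"] holds padded index batches, batch_y["target"] the labels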
def create_dataset():
    # categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', 'rec.motorcycles']
    # categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', 'rec.motorcycles', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale']
    categories = ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
                  'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
                  'comp.windows.x', 'misc.forsale', 'rec.autos',
                  'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey',
                  'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
                  'soc.religion.christian', 'talk.politics.guns',
                  'talk.politics.mideast', 'talk.politics.misc',
                  'talk.religion.misc']
    newsgroups_train = fetch_20newsgroups(subset='train', categories=categories,
                                          data_home='../../..')
    newsgroups_test = fetch_20newsgroups(subset='test', categories=categories,
                                         data_home='../../..')

    dataset = DataSet()
    for i in range(len(newsgroups_train.data)):
        if len(newsgroups_train.data[i]) <= 2000:
            dataset.append(Instance(raw_sentence=newsgroups_train.data[i],
                                    target=int(newsgroups_train.target[i])))
    for i in range(len(newsgroups_test.data)):
        if len(newsgroups_test.data[i]) <= 2000:
            dataset.append(Instance(raw_sentence=newsgroups_test.data[i],
                                    target=int(newsgroups_test.target[i])))

    dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    dataset.apply(lambda x: x['sentence'].split(), new_field_name='words')
    dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')

    vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')
    vocab.index_dataset(dataset, field_name='words', new_field_name='words')

    dataset.set_input('words', 'seq_len')
    dataset.set_target('target')

    train_dev_data, test_data = dataset.split(0.1)
    train_data, dev_data = train_dev_data.split(0.1)
    return vocab, train_data, dev_data, test_data
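A hedged training sketch on top of create_dataset, reusing the CNNText/Trainer combination that appears later in this collection; the embedding size, dropout, and batch size are illustrative choices, not values from the original project:

from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric
from fastNLP.models import CNNText

vocab, train_data, dev_data, test_data = create_dataset()
model = CNNText((len(vocab), 128), num_classes=20, padding=2, dropout=0.1)
trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,
                  loss=CrossEntropyLoss(), metrics=AccuracyMetric(),
                  batch_size=16)
trainer.train()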
def get_text_classification_datasets(num=10):
    categories = target_name[:num]
    train = fetch_20newsgroups(subset='train', categories=categories,
                               data_home='../../..')
    test = fetch_20newsgroups(subset='test', categories=categories,
                              data_home='../../..')
    train_data, train_target = [delete_char(doc) for doc in train.data], train.target.tolist()
    test_data, test_target = [delete_char(doc) for doc in test.data], test.target.tolist()

    # transform to DataSet()
    dataset_train, dataset_test = DataSet(), DataSet()
    max_len = 0
    for i in range(len(train_data)):
        dataset_train.append(Instance(doc_words=train_data[i], target=train_target[i]))
        if max_len < len(train_data[i]):
            max_len = len(train_data[i])
    for i in range(len(test_data)):
        dataset_test.append(Instance(doc_words=test_data[i], target=test_target[i]))
        if max_len < len(test_data[i]):
            max_len = len(test_data[i])

    # preprocess
    # drop documents with 10 or fewer words
    doc_len = lambda x: len(x['doc_words']) <= 10
    dataset_train.drop(doc_len)

    # build vocabulary
    vocab = Vocabulary(max_size=10000, min_freq=15, unknown='<unk>')
    dataset_train.apply(lambda x: [vocab.add(word) for word in x['doc_words']])
    vocab.build_vocab()

    # index
    indexF = lambda x: [vocab.to_index(word) for word in x['doc_words']]
    dataset_train.apply(indexF, new_field_name='words')
    dataset_test.apply(indexF, new_field_name='words')

    dataset_train_list = dataset_train.split(0.1)
    return dataset_train_list[0], dataset_train_list[1], dataset_test, len(vocab), max_len
def test_apply(self):
    ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
    ds.apply(lambda ins: ins["x"][::-1], new_field_name="rx")
    self.assertTrue("rx" in ds.field_arrays)
    self.assertEqual(ds.field_arrays["rx"].content[0], [4, 3, 2, 1])

    ds.apply(lambda ins: len(ins["y"]), new_field_name="y")
    self.assertEqual(ds.field_arrays["y"].content[0], 2)

    res = ds.apply(lambda ins: len(ins["x"]))
    self.assertTrue(isinstance(res, list) and len(res) > 0)
    self.assertEqual(res[0], 4)

    ds.apply(lambda ins: (len(ins["x"]), "hahaha"), new_field_name="k",
             ignore_type=True)
def get_data(dataset):
    n = len(dataset.data)
    data_set = DataSet()
    for i in range(n):
        data_set.append(
            Instance(raw_sentence=dataset.data[i], target=int(dataset.target[i])))
    data_set.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    data_set.apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '',
                                    x['sentence']),
                   new_field_name='sentence')
    data_set.apply(lambda x: re.sub('[%s]' % re.escape(string.whitespace), ' ',
                                    x['sentence']),
                   new_field_name='sentence')
    data_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    return data_set
def construct_dataset(dataset):
    dataset_ = DataSet()
    for sentence, target in zip(dataset.data, dataset.target):
        instance = Instance()
        instance['raw_sentence'] = sentence
        instance['target'] = int(target)
        dataset_.append(instance)
    dataset_.apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '',
                                    x['raw_sentence']),
                   new_field_name='sentence')  # strip punctuation
    dataset_.apply(lambda x: re.sub('[%s]' % re.escape(string.whitespace), ' ',
                                    x['sentence']),
                   new_field_name='sentence')  # replace spaces, newlines and other whitespace with spaces
    dataset_.apply(lambda x: x['sentence'].lower(),
                   new_field_name='sentence')  # lowercase
    dataset_.apply_field(lambda x: x.split(), field_name='sentence',
                         new_field_name='input')
    return dataset_
def get_dataset(raw_data):
    data_dict = {"input": raw_data.data, "target": raw_data.target}
    dataset = DataSet(data=data_dict)
    # ignore string.punctuation
    dataset.apply(lambda x: x['input'].translate(
        str.maketrans("", "", string.punctuation)),
        new_field_name='input')
    # string.whitespace -> space
    dataset.apply(
        lambda x: re.sub('[' + string.whitespace + ']', ' ', x['input']),
        new_field_name='input')
    # lower case & split by space
    dataset.apply(lambda x: x['input'].lower().split(' '),
                  new_field_name='input')
    # target: int
    dataset.set_input('input')
    dataset.set_target('target')
    return dataset
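A brief usage sketch for get_dataset, assuming scikit-learn's fetch_20newsgroups as the raw source (consistent with the other 20-newsgroups snippets here); the vocabulary step afterwards is illustrative:

from sklearn.datasets import fetch_20newsgroups

raw_train = fetch_20newsgroups(subset='train')
train_set = get_dataset(raw_train)
# Index the cleaned word lists in place (min_freq is an arbitrary choice).
vocab = Vocabulary(min_freq=2).from_dataset(train_set, field_name='input')
vocab.index_dataset(train_set, field_name='input', new_field_name='input')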
                                    kernel_sizes=kernel_sizes,
                                    padding=padding)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(sum(kernel_nums), num_classes)

    def forward(self, words, seq_len=None):
        x = self.embed(words)    # [N, L] -> [N, L, C]
        x = self.conv_pool(x)    # [N, L, C] -> [N, C]
        x = self.dropout(x)
        x = self.fc(x)           # [N, C] -> [N, N_class]
        return {C.OUTPUT: x}

    def predict(self, words, seq_len=None):
        output = self(words, seq_len)
        _, predict = output[C.OUTPUT].max(dim=1)
        return {C.OUTPUT: predict}


# demo version
trainData.apply(lambda x: x['data'].lower(), new_field_name='sentence')
trainData.apply(lambda x: x['sentence'].split(),
                new_field_name='words',
                is_input=True)
vocab = Vocabulary(min_freq=2)
vocab = vocab.from_dataset(trainData, field_name='words')
# change words to indices
vocab.index_dataset(trainData, field_name='words', new_field_name='words')
trainData.set_target('target')

model = CNNText((len(vocab), 128), num_classes=20, padding=2, dropout=0.1)
train_data, dev_data = trainData.split(0.2)
trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,
                  loss=CrossEntropyLoss(), metrics=AccuracyMetric(),
                  batch_size=16)
trainer.train()
def __init__(self, path='.data/sst/trees', data_type='sst', batch_size=32,
             split_ratio=0.1, seq_len=15, min_freq=2):
    data_set = DataSet()
    if data_type == 'yelp':
        path = '.data/yelp'
        for db_set in ['train']:
            text_file = os.path.join(path, 'sentiment.' + db_set + '.text')
            label_file = os.path.join(path, 'sentiment.' + db_set + '.labels')
            with io.open(text_file, 'r', encoding="utf-8") as tf, io.open(
                    label_file, 'r', encoding="utf-8") as lf:
                for text in tf:
                    label = lf.readline()
                    data_set.append(Instance(text=text, label=label))
        data_set.apply(
            lambda x: ['<start>'] + x['text'].lower().split() + ['<eos>'],
            new_field_name='words')
        data_set.drop(lambda x: len(x['words']) > seq_len + 2)
    elif data_type == 'sst':
        path = '.data/sst/trees'
        text = data.Field(init_token='<start>', eos_token='<eos>', lower=True,
                          tokenize='spacy', fix_length=16)
        label = data.Field(sequential=False, unk_token='<unk>')
        filter = lambda ex: len(ex.text) <= seq_len and ex.label != 'neutral'
        sst_train = datasets.SST(os.path.join(path, 'train.txt'), text, label,
                                 filter_pred=filter)
        sst_dev = datasets.SST(os.path.join(path, 'dev.txt'), text, label,
                               filter_pred=filter)
        sst_test = datasets.SST(os.path.join(path, 'test.txt'), text, label,
                                filter_pred=filter)
        for ex in sst_train.examples + sst_dev.examples + sst_test.examples:
            data_set.append(
                Instance(words=ex.text,
                         label={'negative': 0, 'positive': 1}[ex.label]))
        data_set.apply(
            lambda x: ['<start>'] + [w.lower() for w in x['words']] + ['<eos>'],
            new_field_name='words')
    elif data_type == 'test':
        with io.open('fasttrial1.pos', 'r', encoding="utf-8") as f:
            for text in f:
                data_set.append(Instance(text=text, label=1))
        with io.open('fasttrial1.neg', 'r', encoding="utf-8") as f:
            for text in f:
                data_set.append(Instance(text=text, label=0))
        data_set.apply(
            lambda x: ['<start>'] + x['text'].lower().split() + ['<eos>'],
            new_field_name='words')
        data_set.drop(lambda x: len(x['words']) > seq_len + 2)

    data_set.apply(lambda x: x['words'] + ['<pad>'] * (seq_len + 2 - len(x['words'])),
                   new_field_name='words')

    _train_data, _ = data_set.split(split_ratio)
    _vocab = Vocabulary(min_freq=min_freq)
    _train_data.apply(lambda x: [_vocab.add(word) for word in x['words']])
    _vocab.build_vocab()

    data_set.apply(lambda x: [_vocab.to_index(w) for w in x['words']],
                   new_field_name='word_seq',
                   is_input=True)
    data_set.apply(lambda x: x['word_seq'][1:] + [0],
                   new_field_name='dec_target',
                   is_target=True)
    data_set.apply(lambda x: int(x['label']),
                   new_field_name='label_seq',
                   is_target=True)

    _train_data, _test_data = data_set.split(split_ratio)
    self.train_data = _train_data
    self.test_data = _test_data
    self.vocab = _vocab
    self.batch_size = batch_size
    self.train_iter = iter(
        Batch(dataset=self.train_data,
              batch_size=self.batch_size,
              sampler=SequentialSampler()))
return {"pred": pred} # Prepare the dataset and testset fitlog.commit(__file__) fitlog.add_hyper_in_file(__file__) table = str.maketrans('', '', string.punctuation) newsgroups_train = fetch_20newsgroups(subset='train') dataset = DataSet() for i in range(newsgroups_train.target.shape[0]): dataset.append( Instance(raw_sentence=newsgroups_train.data[i].replace('\n', ' '), target=int(newsgroups_train.target[i]))) dataset.apply(lambda x: x['raw_sentence'].lower().translate(table), new_field_name='sentence') dataset.apply_field(lambda x: x.split(), field_name='sentence', new_field_name='words') dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len') newsgroups_test = fetch_20newsgroups(subset='test') testset = DataSet() for i in range(newsgroups_test.target.shape[0]): testset.append( Instance(raw_sentence=newsgroups_test.data[i].replace('\n', ' '), target=int(newsgroups_test.target[i]))) testset.apply(lambda x: x['raw_sentence'].lower().translate(table), new_field_name='sentence')
def produceCandidateTriple(Candidate_hpos_sub_total, model, hpo_tree, threshold):
    """
    Use BERT to decide which entry in Candidate_phrases is semantically closest to
    raw_phrase; max-score based.

    :param Candidate_hpos_sub_total: nested list of extracted phrases and their candidate HPO terms
    :param model:
    :param hpo_tree:
    :param threshold: output threshold for this model
    :return:
    """
    from fastNLP.core.utils import _move_dict_value_to_device
    from fastNLP.core.utils import _get_model_device
    from fastNLP import DataSet
    from fastNLP import DataSetIter
    from my_bert_match import addWordPiece, addSeqlen, addWords, processItem, processNum

    p_Candidate_phrases = []
    phrase_nums_per_hpo = []
    Candidate_hpos = []
    for raw_phrase, Candidate_phrase, Candidate_hpos_sub in Candidate_hpos_sub_total:
        p_Candidate_phrases.extend(
            [raw_phrase + "::" + item for item in Candidate_phrase])
        phrase_nums_per_hpo.append(len(Candidate_phrase))
        Candidate_hpos.append(Candidate_hpos_sub)

    Candidate_dataset = DataSet({"raw_words": p_Candidate_phrases})
    Candidate_dataset.apply(addWords, new_field_name="p_words")
    Candidate_dataset.apply(addWordPiece, new_field_name="t_words")
    Candidate_dataset.apply(processItem, new_field_name="word_pieces")
    Candidate_dataset.apply(processNum, new_field_name="word_nums")
    Candidate_dataset.apply(addSeqlen, new_field_name="seq_len")
    Candidate_dataset.field_arrays["word_pieces"].is_input = True
    Candidate_dataset.field_arrays["seq_len"].is_input = True
    Candidate_dataset.field_arrays["word_nums"].is_input = True

    test_batch = DataSetIter(batch_size=128, dataset=Candidate_dataset, sampler=None)
    outputs = []
    for batch_x, batch_y in test_batch:
        _move_dict_value_to_device(batch_x, batch_y,
                                   device=_get_model_device(model))
        outputs.append(
            model.forward(batch_x["word_pieces"], batch_x["word_nums"],
                          batch_x["seq_len"])['pred'])
    outputs = torch.cat(outputs)
    outputs = torch.nn.functional.softmax(outputs, dim=1).cpu().detach().numpy()
    # print(outputs.size)
    results_2 = np.array([item[2] for item in outputs])
    results_1 = np.array([item[1] for item in outputs])

    # Group the scores back by phrase
    count = 0
    index = 0
    ans = []
    for group_num in phrase_nums_per_hpo:
        g_results_2 = results_2[index:index + group_num]
        g_results_1 = results_1[index:index + group_num]
        Candidate_hpos_sub = Candidate_hpos[count]
        index += group_num
        count += 1
        # If an exact match is already found here, return it directly
        if max(g_results_2) >= threshold:
            ans.append([
                Candidate_hpos_sub[int(np.argmax(g_results_2))],
                max(g_results_2), "2"
            ])
            continue
        if max(g_results_1) >= threshold:
            ans.append([
                Candidate_hpos_sub[int(np.argmax(g_results_1))],
                max(g_results_1), "1"
            ])
            continue
        ans.append(["None", None, "0"])
    return ans
def preprocess(batch=16):
    raw_data1 = []
    raw_data2 = []
    for i in range(len(traindata.data)):
        raw_data1.append(
            Instance(sentence=traindata.data[i], label=int(traindata.target[i])))
    trainset = DataSet(raw_data1)
    trainset.apply(lambda x: pre(x['sentence']), new_field_name='words')
    for i in range(len(testdata.data)):
        raw_data2.append(
            Instance(sentence=testdata.data[i], label=int(testdata.target[i])))
    testset = DataSet(raw_data2)
    testset.apply(lambda x: pre(x['sentence']), new_field_name='words')

    global vocab
    vocab = Vocabulary(min_freq=1).from_dataset(trainset, testset, field_name='words')
    vocab.index_dataset(trainset, testset, field_name='words', new_field_name='words')

    trainset.set_input('words')
    testset.set_input('words')
    trainset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)
    testset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)
    trainset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    testset.apply(lambda x: len(x['words']), new_field_name='seq_len')

    global vocabsize
    vocabsize = len(vocab)

    sampler = BucketSampler(batch_size=batch, seq_len_field_name='seq_len')
    train_batch = Batch(batch_size=batch, dataset=trainset, sampler=sampler)
    test_batch = Batch(batch_size=batch, dataset=testset, sampler=sampler)
    return train_batch, test_batch, vocabsize
def preprocess():
    train_set = DataSet()
    for i in range(len(raw_train['data'])):
        di = transfer(raw_train['data'][i])
        train_set.append(
            Instance(sentence=di, target=int(raw_train['target'][i])))
    train_set.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')
    train_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    train_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    test_set = DataSet()
    for i in range(len(raw_test['data'])):
        di = transfer(raw_test['data'][i])
        test_set.append(
            Instance(sentence=di, target=int(raw_test['target'][i])))
    test_set.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')
    test_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    test_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    word_dict = Vocabulary(min_freq=2)
    train_set.apply(lambda x: [word_dict.add(word) for word in x['words']])
    test_set.apply(lambda x: [word_dict.add(word) for word in x['words']])
    word_dict.build_vocab()
    word_dict.index_dataset(train_set, field_name='words', new_field_name='words')
    word_dict.index_dataset(test_set, field_name='words', new_field_name='words')
    return train_set, test_set, word_dict
def get_dataset(data_path):
    print('Getting dataset...')
    poetry = []
    with open(data_path, 'r', encoding='utf-8') as f:
        poem = ''
        for line in f:
            if len(line) <= 1:
                ins = Instance(text=poem)
                if len(poem) > 10:
                    poetry.append(ins)
                poem = ''
            else:
                poem += line.strip('\n')
    # print(poetry[0])
    data = DataSet(data=poetry)
    print("Original data:", data[0])

    vocabulary = Vocabulary(min_freq=2, unknown='<oov>', padding='<pad>')
    vocabulary.add_word('<eos>')
    vocabulary.add_word('<START>')
    data.apply(lambda x: [vocabulary.add(char) for char in x['text']])
    vocabulary.build_vocab()
    print('pad:', vocabulary.to_index('<pad>'))
    print('Vocab size:', len(vocabulary))

    data.apply(lambda x: [vocabulary.to_index(char) for char in x['text']],
               new_field_name='text')
    data.apply(lambda x: [vocabulary.to_index('<START>')] + x['text'] +
               [vocabulary.to_index('<eos>')],
               new_field_name='text')
    data.apply(lambda x: x['text'][0:min(config.sequence_length, len(x['text']))],
               new_field_name='text')
    data.apply(lambda x: [vocabulary.to_index('<pad>')] *
               (config.sequence_length - len(x['text'])) + x['text'],
               new_field_name='text')
    data.apply(lambda x: x['text'][0:-1], new_field_name='input')
    data.apply(lambda x: x['text'][1:], new_field_name='target')
    data.set_input('input')
    data.set_target('target')

    # length = config.sequence_length
    # for i, d in enumerate(data):
    #     if length != len(d['text']):
    #         print("wrong!")
    #         exit()

    train_data, dev_data = data.split(0.2)
    print('Train data size:', len(train_data))
    print('Dev data size:', len(dev_data))
    print("Train data:", train_data[20])
    # print("Dev data:", dev_data[0])
    return train_data, dev_data, vocabulary
def preprocess():
    train_set = DataSet()
    for i in range(len(raw_train.data)):
        train_set.append(
            Instance(sentence=raw_train.data[i], target=int(raw_train.target[i])))
    train_set.apply(lambda x: x['sentence'].translate(
        str.maketrans("", "", string.punctuation)).lower(),
        new_field_name='sentence')
    train_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    train_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    test_set = DataSet()
    for i in range(len(raw_test.data)):
        test_set.append(
            Instance(sentence=raw_test.data[i], target=int(raw_test.target[i])))
    test_set.apply(lambda x: x['sentence'].translate(
        str.maketrans("", "", string.punctuation)).lower(),
        new_field_name='sentence')
    test_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    test_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    vocab = Vocabulary(min_freq=10)
    train_set.apply(lambda x: [vocab.add(word) for word in x['words']])
    test_set.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    vocab.index_dataset(train_set, field_name='words', new_field_name='words')
    vocab.index_dataset(test_set, field_name='words', new_field_name='words')
    return train_set, test_set, vocab
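A hedged follow-up sketch: before these sets are fed to a model, the fields still need to be marked as input/target and batched; the BucketSampler/Batch combination below mirrors the neighbouring snippets and is not taken from the original code:

train_set, test_set, vocab = preprocess()
for ds in (train_set, test_set):
    ds.set_input('words', 'seq_len')
    ds.set_target('target')
# Bucket batches by sequence length (batch size is an illustrative choice).
sampler = BucketSampler(batch_size=16, seq_len_field_name='seq_len')
train_batch = Batch(batch_size=16, dataset=train_set, sampler=sampler)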