def readdata():
    global target_len
    min_count = 10
    # categories = ['comp.os.ms-windows.misc', 'rec.motorcycles', 'sci.space', 'talk.politics.misc', ]
    dataset_train = fetch_20newsgroups(subset='train', data_home='../../..')
    dataset_test = fetch_20newsgroups(subset='test', data_home='../../..')

    data = dataset_train.data
    target = dataset_train.target
    target_len = len(dataset_train.target_names)
    train_data = DataSet()
    padding = 0
    for i in range(len(data)):
        data_t = re.sub(r"\d+|\s+|/", " ", data[i])
        temp = [word.strip(string.punctuation).lower() for word in data_t.split()
                if word.strip(string.punctuation) != '']
        train_data.append(Instance(raw=data[i], label=int(target[i]), words=temp))
        if len(temp) > padding:
            padding = len(temp)
    train_data.apply(lambda x: x['raw'].lower(), new_field_name='raw')

    data = dataset_test.data
    target = dataset_test.target
    test_data = DataSet()
    padding = 0
    for i in range(len(data)):
        data_t = re.sub(r"\d+|\s+|/", " ", data[i])
        temp = [word.strip(string.punctuation).lower() for word in data_t.split()
                if word.strip(string.punctuation) != '']
        test_data.append(Instance(raw=data[i], label=int(target[i]), words=temp))
        if len(temp) > padding:
            padding = len(temp)
    test_data.apply(lambda x: x['raw'].lower(), new_field_name='raw')

    train_data.apply(lambda x: len(x['words']), new_field_name='len')
    test_data.apply(lambda x: len(x['words']), new_field_name='len')

    vocab = Vocabulary(min_freq=min_count)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='seq')

    train_data.rename_field('seq', Const.INPUT)
    train_data.rename_field('len', Const.INPUT_LEN)
    train_data.rename_field('label', Const.TARGET)
    test_data.rename_field('seq', Const.INPUT)
    test_data.rename_field('len', Const.INPUT_LEN)
    test_data.rename_field('label', Const.TARGET)

    test_data.set_input(Const.INPUT, Const.INPUT_LEN)
    test_data.set_target(Const.TARGET)
    train_data.set_input(Const.INPUT, Const.INPUT_LEN)
    train_data.set_target(Const.TARGET)

    test_data, dev_data = test_data.split(0.5)
    return train_data, dev_data, test_data, vocab
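# A minimal driver sketch for readdata() above (hypothetical, not part of the
# original file): the returned DataSets already carry Const.INPUT /
# Const.INPUT_LEN / Const.TARGET, so they can be handed straight to a Trainer.
# CNNText and the Trainer arguments mirror their use elsewhere in this collection.
from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric
from fastNLP.models import CNNText

train_data, dev_data, test_data, vocab = readdata()
model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=target_len)
trainer = Trainer(train_data=train_data, model=model,
                  loss=CrossEntropyLoss(),   # defaults to Const.OUTPUT / Const.TARGET
                  metrics=AccuracyMetric(),
                  n_epochs=3, batch_size=32, dev_data=dev_data)
trainer.train()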
def test_add_field_v2(self):
    ds = DataSet({"x": [3, 4]})
    ds.add_field('y', [['hello', 'world'], ['this', 'is', 'a', 'test']], is_input=True, is_target=True)
    # ds.apply(lambda x: [x['x']] * 3, is_input=True, is_target=True, new_field_name='y')
    print(ds)
def test_add_null(self):
    # TODO test failed because 'fastNLP\core\field.py:143: RuntimeError'
    ds = DataSet()
    with self.assertRaises(RuntimeError) as RE:
        ds.add_field('test', [])
def test_drop(self):
    ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6], [7, 8, 9, 0]] * 20})
    ds.drop(lambda ins: len(ins["y"]) < 3, inplace=True)
    self.assertEqual(len(ds), 20)
def test_get_target_name(self):
    ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
    self.assertEqual(ds.get_target_name(), [_ for _ in ds.field_arrays if ds.field_arrays[_].is_target])
def test_tutorial_1_data_preprocess(self):
    from fastNLP import DataSet
    data = {'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."],
            'words': [['this', 'is', 'the', 'first', 'instance', '.'],
                      ['Second', 'instance', '.'],
                      ['Third', 'instance', '.']],
            'seq_len': [6, 3, 3]}
    dataset = DataSet(data)
    # every value of the dict passed in should be a list of the same length

    from fastNLP import DataSet
    from fastNLP import Instance
    dataset = DataSet()
    instance = Instance(raw_words="This is the first instance",
                        words=['this', 'is', 'the', 'first', 'instance', '.'],
                        seq_len=6)
    dataset.append(instance)

    from fastNLP import DataSet
    from fastNLP import Instance
    dataset = DataSet([
        Instance(raw_words="This is the first instance",
                 words=['this', 'is', 'the', 'first', 'instance', '.'],
                 seq_len=6),
        Instance(raw_words="Second instance .",
                 words=['Second', 'instance', '.'],
                 seq_len=3)
    ])

    from fastNLP import DataSet
    dataset = DataSet({'a': range(-5, 5), 'c': [0] * 10})
    # leave dataset unchanged; return a new DataSet with the matching instances removed
    dropped_dataset = dataset.drop(lambda ins: ins['a'] < 0, inplace=False)
    # remove the matching instances from dataset in place
    dataset.drop(lambda ins: ins['a'] < 0)
    # delete the third instance
    dataset.delete_instance(2)
    # delete the field named 'a'
    dataset.delete_field('a')
    # check whether a field named 'a' exists
    print(dataset.has_field('a'))  # or: ('a' in dataset)
    # rename the field 'c' to 'b'
    dataset.rename_field('c', 'b')
    # length of the DataSet
    len(dataset)

    from fastNLP import DataSet
    data = {'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."]}
    dataset = DataSet(data)
    # split each sentence into words; see DataSet.apply()
    dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')
    # or use DataSet.apply_field()
    dataset.apply_field(lambda sent: sent.split(), field_name='raw_words', new_field_name='words')

    # besides lambdas, a named function can be passed in as well
    def get_words(instance):
        sentence = instance['raw_words']
        words = sentence.split()
        return words

    dataset.apply(get_words, new_field_name='words')
from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric
from fastNLP import DataSet, Vocabulary  # used below; missing from the original import list
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
from fastNLP.core.const import Const as C
from fastNLP.modules import encoder

trainSet = fetch_20newsgroups(subset='train')
testSet = fetch_20newsgroups(subset='test')
testData = {"data": testSet['data'], "target": testSet['target']}
trainData = {"data": trainSet['data'], "target": trainSet['target']}
trainData = DataSet(trainData)
testData = DataSet(testData)
trainData.apply(lambda x: x['data'].lower(), new_field_name='sentence')
trainData.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)

vocab = Vocabulary(min_freq=2)
vocab = vocab.from_dataset(trainData, field_name='words')
# change words to indices
vocab.index_dataset(trainData, field_name='words', new_field_name='words')
trainData.set_target('target')
train_data, dev_data = trainData.split(0.2)

# demo LSTM version
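# A minimal "demo LSTM version" sketch to go with the pipeline above. This is an
# assumption about what the model could look like, not the original model: it
# embeds the indexed 'words' field, runs a single-layer LSTM, and classifies the
# 20 newsgroups from the last hidden state. Field names follow the code above
# ('words' as input, 'target' as target), so the default loss/metric keys apply.
class LSTMText(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128, num_classes=20):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, words):
        # words: (batch, seq_len) indices produced by vocab.index_dataset above
        output, _ = self.lstm(self.embed(words))
        return {C.OUTPUT: self.fc(output[:, -1, :])}  # logits from the last step

model = LSTMText(len(vocab))
trainer = Trainer(train_data=train_data, model=model,
                  loss=CrossEntropyLoss(), metrics=AccuracyMetric(),
                  dev_data=dev_data, batch_size=32, n_epochs=3)
trainer.train()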
def preprocess(task, type, length=60):
    Task = []
    i = 0
    index = 0
    ind = []
    raw = []
    for File in filter(lambda s: s.find('.' + type + '.train') != -1, Filelist):
        raw1 = []
        j = 0
        with open(File, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                j += 1
                if line[-3] == '-':
                    label = -1
                    onehot = [1, 0]
                else:
                    label = 1
                    onehot = [0, 1]
                raw1.append(Instance(sentence=line[:-3], label=label, onehot=onehot))
                raw.append(Instance(sentence=line[:-3], label=label, onehot=onehot))
        raw1 = DataSet(raw1)
        raw1.apply(lambda x: pre(x['sentence'], length), new_field_name='words')
        if File in [task + type + '.train' for task in test]:
            index = i
        if j <= 30:
            ind.append(i)
        i += 1
        Task.append(raw1)

    raw3 = []
    for File in filter(lambda s: s == task + type + '.dev', Filelist):
        with open(File, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                if line[-3] == '-':
                    label = -1
                    onehot = [1, 0]
                else:
                    label = 1
                    onehot = [0, 1]
                raw3.append(Instance(sentence=line[:-3], label=label, onehot=onehot))
    raw3 = DataSet(raw3)
    raw3.apply(lambda x: pre(x['sentence'], length), new_field_name='words')

    raw2 = []
    for File in filter(lambda s: s == task + type + '.test', Filelist):
        raw2 = []
        with open(File, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                if line[-3] == '-':
                    label = -1
                    onehot = [1, 0]
                else:
                    label = 1
                    onehot = [0, 1]
                raw2.append(Instance(sentence=line[:-3], label=label, onehot=onehot))
    raw2 = DataSet(raw2)
    raw2.apply(lambda x: pre(x['sentence'], length), new_field_name='words')

    raw = DataSet(raw)
    raw.apply(lambda x: pre(x['sentence'], length), new_field_name='words')
    vocab = Vocabulary(min_freq=2).from_dataset(raw, raw3, raw2, field_name='words')
    vocab.index_dataset(raw2, raw3, field_name='words', new_field_name='words')
    return Task, vocab, ind, index, raw3, raw2
from fastNLP.embeddings import BertEmbedding
from fastNLP.models import BertForSequenceClassification
from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam
from fastNLP import DataSet, Instance  # used below; missing from the original import list

'''import other package'''
import os
import csv
import torch

'''load the csv'''
file2label = {}
with open('dataset/annotations_metadata.csv') as f:
    for line in f:
        items = line.split(',')
        file2label[items[0] + ".txt"] = (1 if items[4].split('\n')[0] == 'hate' else 0)

'''load the Hate-Speech data'''
train_dataset = DataSet()
dev_dataset = DataSet()
test_dataset = DataSet()
cnt = 0
length = len(os.listdir("dataset/all_files"))
for file in os.listdir("dataset/all_files"):
    with open("dataset/all_files/" + file) as f:
        cnt += 1
        raw_words = f.read()
        words = raw_words.split()
        seq_len = len(words)
        if cnt > length * 0.9:
            test_dataset.append(
                Instance(raw_words=raw_words, words=words,
                         # the original snippet is truncated mid-call; completing
                         # it with seq_len and the label from file2label is an
                         # assumption based on the fields built above
                         seq_len=seq_len, target=file2label[file]))
def load_dataset(args):
    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
    }
    num_labels_task = {
        "cola": 2,
        "mnli": 3,
        "mrpc": 2,
    }
    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    args.num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()
    tokenizer = BertTokenizer.from_pretrained(args.bert_model)

    train_examples = None
    args.num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        args.num_train_steps = int(
            len(train_examples) / args.train_batch_size / 1 * args.num_train_epochs)

    # training dataset
    train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer)
    train_data = DataSet({
        "x": [f.input_ids for f in train_features],
        "segment_info": [f.segment_ids for f in train_features],
        "mask": [f.input_mask for f in train_features],
        "target": [f.label_id for f in train_features]
    })
    train_data.set_input('x', 'segment_info', 'mask')
    train_data.set_target('target')

    # dev dataset
    eval_examples = processor.get_dev_examples(args.data_dir)
    eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer)
    dev_data = DataSet({
        "x": [f.input_ids for f in eval_features],
        "segment_info": [f.segment_ids for f in eval_features],
        "mask": [f.input_mask for f in eval_features],
        "target": [f.label_id for f in eval_features]
    })
    dev_data.set_input('x', 'segment_info', 'mask')
    dev_data.set_target('target')
    return train_data, dev_data
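# Hypothetical call site for load_dataset() (these argument names simply mirror
# the attributes the function reads; the real flag definitions live elsewhere):
import argparse

args = argparse.Namespace(task_name='mrpc', data_dir='glue_data/MRPC',
                          bert_model='bert-base-uncased', max_seq_length=128,
                          do_train=True, train_batch_size=32, num_train_epochs=3)
train_data, dev_data = load_dataset(args)
print(len(train_data), len(dev_data))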
class TextData():
    vocab_size = 0
    dataset_size = 0
    train_size = 0
    test_size = 0
    class_num = 4
    min_count = 10
    max_seq_len = 500
    seq_limit = 2000
    data_src = "20news"
    data_set = DataSet()
    train_set = DataSet()
    test_set = DataSet()
    dev_set = DataSet()
    vocab = None

    def __init__(self, data_src="20news", min_count=10, seq_limit=None):
        self.data_src = data_src
        self.min_count = min_count
        if seq_limit is not None:
            self.seq_limit = seq_limit

    def find_max_len(self, words):
        self.max_seq_len = max(len(words), self.max_seq_len)

    def seq_regularize(self, words):
        wlen = len(words)
        if wlen < self.max_seq_len:
            return [0] * (self.max_seq_len - wlen) + words
        else:
            return words[:self.max_seq_len]

    def fetch_20news(self, size=4):
        print("Loading 20newsgroups data and tokenize.")
        if size == 20:
            train, test = get_all_20news()
        else:
            train, test = get_text_classification_datasets()
        train_input, test_input = tokenize(train.data, test.data)
        train_target = train.target
        test_target = test.target
        self.class_num = len(train.target_names)
        assert (self.class_num == len(test.target_names))

        # Building Fastnlp dataset.
        print("Building Fastnlp dataset.")
        self.train_set = DataSet({"text": train_input, "class": train_target})
        self.test_set = DataSet({"text": test_input, "class": test_target})

        # Building Fastnlp vocabulary.
        print("Building Fastnlp vocabulary.")
        self.vocab = Vocabulary(min_freq=self.min_count)
        self.train_set.apply(lambda x: [self.vocab.add_word(word) for word in x['text']])
        self.vocab.build_vocab()
        self.vocab.build_reverse_vocab()
        self.vocab_size = len(self.vocab)

        # Building id-representation for train_set and test_set.
        print("Building id-presentation for train_set and test_set.")
        self.vocab.index_dataset(self.train_set, self.test_set, field_name='text', new_field_name='words')
        self.train_set.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
        self.test_set.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
        self.train_set.apply_field(self.find_max_len, field_name='words')
        print(self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len, self.seq_limit)
        self.train_set.apply_field(self.seq_regularize, field_name='words', new_field_name='words')
        self.test_set.apply_field(self.seq_regularize, field_name='words', new_field_name='words')
        # self.train_set.apply(lambda x: text2multi_hot(x['words'], self.vocab_size), new_field_name="input")
        # self.test_set.apply(lambda x: text2multi_hot(x['words'], self.vocab_size), new_field_name='input')

        # Building target-vector for train_set and test_set.
        print("Building target-vector for train_set and test_set.")
        self.train_set.apply(lambda x: int(x['class']), new_field_name="target", is_target=True)
        self.test_set.apply(lambda x: int(x['class']), new_field_name="target", is_target=True)
        # self.train_set.apply(lambda x: class2target(x['class'], self.class_num), new_field_name="target")
        # self.test_set.apply(lambda x: class2target(x['class'], self.class_num), new_field_name="target")

    def fetch_csv(self, path=None):
        print("Not implemented now...")
        pass

    def fetch_data(self, path=None):
        if self.data_src == "20news":
            # Loading 20newsgroups data and tokenize.
            self.fetch_20news()
        elif self.data_src == "20news_all":
            self.fetch_20news(size=20)
        else:
            print("No data src...")
        self.train_size = self.train_set.get_length()
        self.test_size = self.test_set.get_length()
        return self.train_size, self.test_size
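# A short usage sketch for TextData (assuming the module-level helpers it calls,
# get_text_classification_datasets()/tokenize(), are importable here):
text_data = TextData(data_src="20news", min_count=10, seq_limit=500)
train_size, test_size = text_data.fetch_data()
print(train_size, test_size, text_data.vocab_size, text_data.max_seq_len)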
data_bundle = get_data()
print(data_bundle)
train_word2bpes = data_bundle.train_word2bpes
target_word2bpes = data_bundle.target_word2bpes
print(
    f"In total {len(target_word2bpes)} target words, {len(train_word2bpes)} words."
)
pad_id = data_bundle.pad_id
lg_dict = getattr(data_bundle, 'lg_dict')
lg_shifts = getattr(data_bundle, 'lg_shift')
train_lg_shifts = getattr(data_bundle, 'train_lg_shift')

train_data = DataSet()
for name, ds in data_bundle.iter_datasets():
    if 'train' in name:
        for ins in ds:
            train_data.append(ins)
train_data.add_seq_len('input')
train_data.set_input('input', 'language_ids')
train_data.set_target('target')
train_data.set_pad_val('input', pad_id)
clip_max_length(train_data, data_bundle, max_sent_len=50)

model = JointBertReverseDict(pre_name, train_word2bpes, target_word2bpes,
import os
import pickle
from fastNLP import Vocabulary
from fastNLP import DataSet

root_path = '/remote-home/txsun/data/glue_data'

# RTE
print('processing RTE...')
dataset = 'RTE'
data_path = os.path.join(root_path, dataset)

## Train
print('reading train file...')
train_file = os.path.join(data_path, 'train.tsv')
train_ds = DataSet.read_csv(train_file, sep='\t')
train_ds.delete_field('index')
print(train_ds[0])
print(len(train_ds))

## Dev
print('reading dev file...')
dev_file = os.path.join(data_path, 'dev.tsv')
dev_ds = DataSet.read_csv(dev_file, sep='\t')
dev_ds.delete_field('index')
print(dev_ds[0])
print(len(dev_ds))

## Test
def main():
    parser = argparse.ArgumentParser()
    # fmt: off
    parser.add_argument("--data_path", required=True, type=str, help="all of datasets pkl paths")
    # fmt: on
    options, _ = parser.parse_known_args()

    train_set, test_set = DataSet(), DataSet()
    input_dir = os.path.join(options.data_path, "joint-sighan2008/bmes")
    options.output = os.path.join(options.data_path, "total_dataset.pkl")
    print(input_dir, options.output)

    for fn in os.listdir(input_dir):
        if fn not in ["test.txt", "train-all.txt"]:
            continue
        print(fn)
        abs_fn = os.path.join(input_dir, fn)
        ds = read_file(abs_fn)
        if "test.txt" == fn:
            test_set = ds
        else:
            train_set = ds
    print(
        "num samples of total train, test: {}, {}".format(len(train_set), len(test_set))
    )

    uni_vocab = Vocabulary(min_freq=None).from_dataset(
        train_set, test_set, field_name="ori_words"
    )
    # bi_vocab = Vocabulary(min_freq=3, max_size=50000).from_dataset(train_set, test_set, field_name="bi1")
    bi_vocab = Vocabulary(min_freq=3, max_size=None).from_dataset(
        train_set, field_name="bi1", no_create_entry_dataset=[test_set]
    )
    tag_vocab = Vocabulary(min_freq=None, padding="s", unknown=None).from_dataset(
        train_set, field_name="ori_tags"
    )
    task_vocab = Vocabulary(min_freq=None, padding=None, unknown=None).from_dataset(
        train_set, field_name="task"
    )

    def to_index(dataset):
        uni_vocab.index_dataset(dataset, field_name="ori_words", new_field_name="uni")
        tag_vocab.index_dataset(dataset, field_name="ori_tags", new_field_name="tags")
        task_vocab.index_dataset(dataset, field_name="task", new_field_name="task")
        dataset.apply_field(lambda x: x[1:], field_name="bi1", new_field_name="bi2")
        dataset.apply_field(lambda x: x[:-1], field_name="bi1", new_field_name="bi1")
        bi_vocab.index_dataset(dataset, field_name="bi1", new_field_name="bi1")
        bi_vocab.index_dataset(dataset, field_name="bi2", new_field_name="bi2")
        dataset.set_input("task", "uni", "bi1", "bi2", "seq_len")
        dataset.set_target("tags")
        return dataset

    train_set = to_index(train_set)
    test_set = to_index(test_set)

    output = {}
    output["train_set"] = train_set
    output["test_set"] = test_set
    output["uni_vocab"] = uni_vocab
    output["bi_vocab"] = bi_vocab
    output["tag_vocab"] = tag_vocab
    output["task_vocab"] = task_vocab
    print(tag_vocab.word2idx)
    print(task_vocab.word2idx)

    make_sure_path_exists(os.path.dirname(options.output))
    print("Saving dataset to {}".format(os.path.abspath(options.output)))
    with open(options.output, "wb") as outfile:
        dump(output, outfile)
    print(len(task_vocab), len(tag_vocab), len(uni_vocab), len(bi_vocab))

    dic = {}
    tokens = {}

    def process(words):
        name = words[0][1:-1]
        if name not in dic:
            dic[name] = set()
            tokens[name] = 0
        tokens[name] += len(words[1:])
        dic[name].update(words[1:])

    train_set.apply_field(process, "ori_words", None)
    for name in dic.keys():
        print(name, len(dic[name]), tokens[name])
    with open(os.path.join(os.path.dirname(options.output), "oovdict.pkl"), "wb") as f:
        dump(dic, f)

    def get_max_len(ds):
        global max_len
        max_len = 0

        def find_max_len(words):
            global max_len
            if max_len < len(words):
                max_len = len(words)

        ds.apply_field(find_max_len, "ori_words", None)
        return max_len

    print(
        "train max len: {}, test max len: {}".format(
            get_max_len(train_set), get_max_len(test_set)
        )
    )
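# Hypothetical entry point: the script is meant to be driven from the command line,
#   python <this_script>.py --data_path /path/to/data
# and writes total_dataset.pkl and oovdict.pkl under that directory.
if __name__ == "__main__":
    main()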
def equip_chinese_ner_with_lexicon(datasets, vocabs, embeddings, w_list,
                                   word_embedding_path=None,
                                   only_lexicon_in_train=False,
                                   word_char_mix_embedding_path=None,
                                   number_normalized=False,
                                   lattice_min_freq=1,
                                   only_train_min_freq=0):
    from fastNLP.core import Vocabulary

    def normalize_char(inp):
        result = []
        for c in inp:
            if c.isdigit():
                result.append('0')
            else:
                result.append(c)
        return result

    def normalize_bigram(inp):
        result = []
        for bi in inp:
            tmp = bi
            if tmp[0].isdigit():
                tmp = '0' + tmp[:1]
            if tmp[1].isdigit():
                tmp = tmp[0] + '0'
            result.append(tmp)
        return result

    if number_normalized == 3:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(datasets['train'], field_name='bigrams',
                                      no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if only_lexicon_in_train:
        print('Loading only the lexicon words that appear in train is supported')

    def get_skip_path(chars, w_trie):
        sentence = ''.join(chars)
        result = w_trie.get_lexicon(sentence)
        # print(result)
        return result

    from V0.utils_ import Trie
    from functools import partial
    from fastNLP.core import Vocabulary
    # from fastNLP.embeddings import StaticEmbedding
    from fastNLP_module import StaticEmbedding
    from fastNLP import DataSet

    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    if only_lexicon_in_train:
        lexicon_in_train = set()
        for s in datasets['train']['chars']:
            lexicon_in_s = w_trie.get_lexicon(s)
            for s, e, lexicon in lexicon_in_s:
                lexicon_in_train.add(''.join(lexicon))

        print('lexicon in train:{}'.format(len(lexicon_in_train)))
        print('i.e.: {}'.format(list(lexicon_in_train)[:10]))
        w_trie = Trie()
        for w in lexicon_in_train:
            w_trie.insert(w)

    import copy
    for k, v in datasets.items():
        v.apply_field(partial(get_skip_path, w_trie=w_trie), 'chars', 'lexicons')
        v.apply_field(copy.copy, 'chars', 'raw_chars')
        v.add_seq_len('lexicons', 'lex_num')
        v.apply_field(lambda x: list(map(lambda y: y[0], x)), 'lexicons', 'lex_s')
        v.apply_field(lambda x: list(map(lambda y: y[1], x)), 'lexicons', 'lex_e')

    if number_normalized == 1:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if number_normalized == 2:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(datasets['train'], field_name='bigrams',
                                      no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def concat(ins):
        chars = ins['chars']
        lexicons = ins['lexicons']
        result = chars + list(map(lambda x: x[2], lexicons))
        # print('lexicons:{}'.format(lexicons))
        # print('lex_only:{}'.format(list(filter(lambda x: x[2], lexicons))))
        # print('result:{}'.format(result))
        return result

    def get_pos_s(ins):
        lex_s = ins['lex_s']
        seq_len = ins['seq_len']
        pos_s = list(range(seq_len)) + lex_s
        return pos_s

    def get_pos_e(ins):
        lex_e = ins['lex_e']
        seq_len = ins['seq_len']
        pos_e = list(range(seq_len)) + lex_e
        return pos_e

    for k, v in datasets.items():
        v.apply(concat, new_field_name='lattice')
        v.set_input('lattice')
        v.apply(get_pos_s, new_field_name='pos_s')
        v.apply(get_pos_e, new_field_name='pos_e')
        v.set_input('pos_s', 'pos_e')

    # print(list(datasets['train'][:10]['lexicons']))
    # print(list(datasets['train'][:10]['lattice']))
    # print(list(datasets['train'][:10]['lex_s']))
    # print(list(datasets['train'][:10]['lex_e']))
    # print(list(datasets['train'][:10]['pos_s']))
    # print(list(datasets['train'][:10]['pos_e']))
    # exit(1208)

    word_vocab = Vocabulary()
    word_vocab.add_word_lst(w_list)
    vocabs['word'] = word_vocab

    lattice_vocab = Vocabulary()
    lattice_vocab.from_dataset(datasets['train'], field_name='lattice',
                               no_create_entry_dataset=[v for k, v in datasets.items() if k != 'train'])
    vocabs['lattice'] = lattice_vocab

    # for k, v in datasets.items():
    #     v.apply_field(lambda x: [list(map(lambda x: x[0], p)) for p in x], 'skips_l2r', 'skips_l2r_source')
    #     v.apply_field(lambda x: [list(map(lambda x: x[1], p)) for p in x], 'skips_l2r', 'skips_l2r_word')
    #
    # for k, v in datasets.items():
    #     v.apply_field(lambda x: [list(map(lambda x: x[0], p)) for p in x], 'skips_r2l', 'skips_r2l_source')
    #     v.apply_field(lambda x: [list(map(lambda x: x[1], p)) for p in x], 'skips_r2l', 'skips_r2l_word')
    #
    # for k, v in datasets.items():
    #     v.apply_field(lambda x: list(map(len, x)), 'skips_l2r_word', 'lexicon_count')
    #     v.apply_field(lambda x: list(map(lambda y: list(map(lambda z: word_vocab.to_index(z), y)), x)),
    #                   'skips_l2r_word', new_field_name='skips_l2r_word')
    #     v.apply_field(lambda x: list(map(len, x)), 'skips_r2l_word', 'lexicon_count_back')
    #     v.apply_field(lambda x: list(map(lambda y: list(map(lambda z: word_vocab.to_index(z), y)), x)),
    #                   'skips_r2l_word', new_field_name='skips_r2l_word')

    if word_embedding_path is not None:
        word_embedding = StaticEmbedding(word_vocab, word_embedding_path, word_dropout=0)
        embeddings['word'] = word_embedding

    if word_char_mix_embedding_path is not None:
        lattice_embedding = StaticEmbedding(lattice_vocab, word_char_mix_embedding_path,
                                            word_dropout=0.01,
                                            min_freq=lattice_min_freq,
                                            only_train_min_freq=only_train_min_freq)
        embeddings['lattice'] = lattice_embedding

    vocabs['char'].index_dataset(*(datasets.values()), field_name='chars', new_field_name='chars')
    vocabs['bigram'].index_dataset(*(datasets.values()), field_name='bigrams', new_field_name='bigrams')
    vocabs['label'].index_dataset(*(datasets.values()), field_name='target', new_field_name='target')
    vocabs['lattice'].index_dataset(*(datasets.values()), field_name='lattice', new_field_name='lattice')

    return datasets, vocabs, embeddings
def construct_dataset(dataset):
    dataset_ = DataSet()
    for sentence, target in zip(dataset.data, dataset.target):
        instance = Instance()
        instance['raw_sentence'] = sentence
        instance['target'] = int(target)
        dataset_.append(instance)
    dataset_.apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x['raw_sentence']),
                   new_field_name='sentence')  # strip punctuation
    dataset_.apply(lambda x: re.sub('[%s]' % re.escape(string.whitespace), ' ', x['sentence']),
                   new_field_name='sentence')  # replace spaces, newlines and other whitespace with a single space
    dataset_.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')  # lower-case
    dataset_.apply_field(lambda x: x.split(), field_name='sentence', new_field_name='input')
    return dataset_
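# Sketch of how construct_dataset() might be driven (an assumption mirroring the
# sklearn loaders used elsewhere in this collection):
from sklearn.datasets import fetch_20newsgroups

train_set = construct_dataset(fetch_20newsgroups(subset='train'))
test_set = construct_dataset(fetch_20newsgroups(subset='test'))
vocab = Vocabulary(min_freq=10).from_dataset(train_set, field_name='input')
vocab.index_dataset(train_set, test_set, field_name='input', new_field_name='input')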
def test_from_dataset(self):
    start_char = 65
    num_samples = 10

    # 0 dim
    dataset = DataSet()
    for i in range(num_samples):
        ins = Instance(char=chr(start_char + i))
        dataset.append(ins)
    vocab = Vocabulary()
    vocab.from_dataset(dataset, field_name='char')
    for i in range(num_samples):
        self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
    vocab.index_dataset(dataset, field_name='char')

    # 1 dim
    dataset = DataSet()
    for i in range(num_samples):
        ins = Instance(char=[chr(start_char + i)] * 6)
        dataset.append(ins)
    vocab = Vocabulary()
    vocab.from_dataset(dataset, field_name='char')
    for i in range(num_samples):
        self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
    vocab.index_dataset(dataset, field_name='char')

    # 2 dim
    dataset = DataSet()
    for i in range(num_samples):
        ins = Instance(char=[[chr(start_char + i) for _ in range(6)] for _ in range(6)])
        dataset.append(ins)
    vocab = Vocabulary()
    vocab.from_dataset(dataset, field_name='char')
    for i in range(num_samples):
        self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
    vocab.index_dataset(dataset, field_name='char')
from fastNLP import Vocabulary
from fastNLP import DataSet

# Problem: fastNLP already provides a split() function to divide a dataset into a
# training set and a test set, but the standard benchmark datasets available online
# usually come with the train/test split already made. Moreover, using split() to
# partition the data randomly introduces a further problem: because the split is
# random every time, the vocabulary differs from run to run, so when a saved model
# is loaded again for testing, the mismatched vocabulary causes wildly different
# results.
#
# Solution: add a vocabulary save function and a vocabulary load function to
# Vocabulary instead of building a fresh vocabulary every time, which also cuts
# the cost of later runs. After generating the vocabulary once with save_vocab(),
# later runs can load it directly with load_vocab().

if __name__ == '__main__':
    data_path = "data_for_tests/tutorial_sample_dataset.csv"
    train_data = DataSet.read_csv(data_path, headers=('raw_sentence', 'label'), sep='\t')
    print('len(train_data)', len(train_data))

    # lower-case every letter
    train_data.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')

    # convert label to int
    train_data.apply(lambda x: int(x['label']) - 1, new_field_name='label_seq', is_target=True)

    # split sentences on whitespace
    def split_sent(ins):
        return ins['raw_sentence'].split()
def get_batch_data():
    # Load train data
    X_train, Y_train, Sources_train, Targets_train = load_train_data()
    # calc total batch count
    num_batch = len(X_train) // Hyperparams.batch_size

    ds_train = DataSet()
    for i, x in enumerate(X_train):
        instance = Instance(word=Sources_train[i],
                            translated=Targets_train[i],
                            word_seq=x.tolist(),
                            translated_seq=Y_train[i].tolist())
        ds_train.append(instance)
    ds_train.set_input('word_seq', 'translated_seq')
    ds_train.set_target('translated_seq')

    # Load test data
    X_test, Y_test, Sources_test, Targets_test = load_test_data()
    # calc total batch count
    num_batch = len(X_test) // Hyperparams.batch_size

    ds_test = DataSet()
    for i, x in enumerate(X_test):
        instance = Instance(word=Sources_test[i],
                            translated=Targets_test[i],
                            word_seq=x.tolist(),
                            translated_seq=Y_test[i].tolist())
        ds_test.append(instance)
    ds_test.set_input('word_seq', 'translated_seq')
    ds_test.set_target('translated_seq')

    return ds_train, ds_test
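# Iterating the returned DataSets with fastNLP's Batch (a sketch; Hyperparams is
# this module's config object):
from fastNLP.core.batch import Batch
from fastNLP.core.sampler import RandomSampler

ds_train, ds_test = get_batch_data()
for batch_x, batch_y in Batch(dataset=ds_train, batch_size=Hyperparams.batch_size,
                              sampler=RandomSampler()):
    print(batch_x['word_seq'].shape, batch_y['translated_seq'].shape)
    break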
def test_tutorial(self):
    # read data from csv into a DataSet
    sample_path = "./data_for_tests/tutorial_sample_dataset.csv"
    dataset = DataSet.read_csv(sample_path, headers=('raw_sentence', 'label'), sep='\t')
    print(len(dataset))
    print(dataset[0])

    dataset.append(Instance(raw_sentence='fake data', label='0'))
    dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
    # convert label to int
    dataset.apply(lambda x: int(x['label']), new_field_name='label')

    # split sentences on whitespace
    def split_sent(ins):
        return ins['raw_sentence'].split()

    dataset.apply(split_sent, new_field_name='words')
    # add length information
    dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    # print(len(dataset))
    # print(dataset[0])

    # filter out instances with DataSet.drop(func)
    dataset.drop(lambda x: x['seq_len'] <= 3)
    print(len(dataset))

    # configure which fields of the DataSet get converted to tensors
    # set target: the golden labels used when computing the loss and evaluating the model
    dataset.set_target("label")
    # set input: used in the model's forward
    dataset.set_input("words")

    # split off a test set and a training set
    test_data, train_data = dataset.split(0.5)
    # print(len(test_data))
    # print(len(train_data))

    # build the vocabulary with Vocabulary.add(word)
    vocab = Vocabulary(min_freq=2)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()

    # index the sentences with Vocabulary.to_index(word)
    train_data.apply(
        lambda x: [vocab.to_index(word) for word in x['words']],
        new_field_name='words')
    test_data.apply(
        lambda x: [vocab.to_index(word) for word in x['words']],
        new_field_name='words')
    print(test_data[0])

    model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)

    from fastNLP import Trainer
    from copy import deepcopy

    # rename the corresponding DataSet fields to match the parameter names of the model's forward
    train_data.rename_field('words', 'word_seq')  # input field matches the forward parameter
    train_data.rename_field('label', 'label_seq')
    test_data.rename_field('words', 'word_seq')
    test_data.rename_field('label', 'label_seq')

    # instantiate a Trainer with the model and data, then train
    copy_model = deepcopy(model)
    overfit_trainer = Trainer(train_data=test_data, model=copy_model,
                              loss=CrossEntropyLoss(pred="output", target="label_seq"),
                              metrics=AccuracyMetric(pred="predict", target="label_seq"),
                              n_epochs=10, batch_size=4, dev_data=test_data,
                              save_path="./save")
    overfit_trainer.train()

    trainer = Trainer(train_data=train_data, model=model,
                      loss=CrossEntropyLoss(pred="output", target="label_seq"),
                      metrics=AccuracyMetric(pred="predict", target="label_seq"),
                      n_epochs=10, batch_size=4, dev_data=test_data,
                      save_path="./save")
    trainer.train()
    print('Train finished!')

    # evaluate with fastNLP's Tester
    tester = Tester(data=test_data, model=model,
                    metrics=AccuracyMetric(pred="predict", target="label_seq"),
                    batch_size=4)
    acc = tester.test()
    print(acc)
def test_len_(self):
    ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
    self.assertEqual(len(ds), 40)

    ds = DataSet()
    self.assertEqual(len(ds), 0)
def make_dataset(data):
    dataset = DataSet()
    mx = 0
    le = None
    for x, y in zip(data.data, data.target):
        xx = deal(x)
        ins = Instance(sentence=xx, label=int(y))
        if mx < len(xx.split()):
            mx = max(mx, len(xx.split()))
            le = xx
        dataset.append(ins)
    print(mx)
    dataset.apply_field(lambda x: x.split(), field_name='sentence', new_field_name='words')
    dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
    dataset.rename_field('words', Const.INPUT)
    dataset.rename_field('seq_len', Const.INPUT_LEN)
    dataset.rename_field('label', Const.TARGET)
    dataset.set_input(Const.INPUT, Const.INPUT_LEN)
    dataset.set_target(Const.TARGET)
    return dataset
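# Hypothetical driver for make_dataset(), following the 20newsgroups pattern used
# elsewhere in this collection (deal() is this module's text-cleaning helper):
from sklearn.datasets import fetch_20newsgroups

train_set = make_dataset(fetch_20newsgroups(subset='train'))
test_set = make_dataset(fetch_20newsgroups(subset='test'))
vocab = Vocabulary(min_freq=2).from_dataset(train_set, field_name=Const.INPUT)
vocab.index_dataset(train_set, test_set, field_name=Const.INPUT, new_field_name=Const.INPUT)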
def test_contains(self):
    ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
    self.assertTrue("x" in ds)
    self.assertTrue("y" in ds)
    self.assertFalse("z" in ds)
def testENAS(self):
    # read data from csv into a DataSet
    sample_path = "tutorials/sample_data/tutorial_sample_dataset.csv"
    dataset = DataSet.read_csv(sample_path, headers=('raw_sentence', 'label'), sep='\t')
    print(len(dataset))
    print(dataset[0])
    print(dataset[-3])

    dataset.append(Instance(raw_sentence='fake data', label='0'))
    # lower-case all the letters
    dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
    # convert label to int
    dataset.apply(lambda x: int(x['label']), new_field_name='label')

    # split sentences on whitespace
    def split_sent(ins):
        return ins['raw_sentence'].split()

    dataset.apply(split_sent, new_field_name='words')
    # add length information
    dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    print(len(dataset))
    print(dataset[0])

    # filter out instances with DataSet.drop(func)
    dataset.drop(lambda x: x['seq_len'] <= 3)
    print(len(dataset))

    # configure which fields of the DataSet get converted to tensors
    # set target: the golden labels used when computing the loss and evaluating the model
    dataset.set_target("label")
    # set input: used in the model's forward
    dataset.set_input("words", "seq_len")

    # split off a test set and a training set
    test_data, train_data = dataset.split(0.5)
    print(len(test_data))
    print(len(train_data))

    # build the vocabulary with Vocabulary.add(word)
    vocab = Vocabulary(min_freq=2)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()

    # index the sentences with Vocabulary.to_index(word)
    train_data.apply(
        lambda x: [vocab.to_index(word) for word in x['words']],
        new_field_name='words')
    test_data.apply(
        lambda x: [vocab.to_index(word) for word in x['words']],
        new_field_name='words')
    print(test_data[0])

    # these preprocessing tools can also be used for projects such as
    # reinforcement learning or GANs
    from fastNLP.core.batch import Batch
    from fastNLP.core.sampler import RandomSampler

    batch_iterator = Batch(dataset=train_data, batch_size=2, sampler=RandomSampler())
    for batch_x, batch_y in batch_iterator:
        print("batch_x has: ", batch_x)
        print("batch_y has: ", batch_y)
        break

    from fastNLP.models.enas_model import ENASModel
    from fastNLP.models.enas_controller import Controller
    model = ENASModel(embed_num=len(vocab), num_classes=5)
    controller = Controller()

    from fastNLP.models.enas_trainer import ENASTrainer
    from copy import deepcopy

    # rename the corresponding DataSet fields to match the parameter names of the model's forward
    train_data.rename_field('words', 'word_seq')  # input field matches the forward parameter
    train_data.rename_field('label', 'label_seq')
    test_data.rename_field('words', 'word_seq')
    test_data.rename_field('label', 'label_seq')

    loss = CrossEntropyLoss(pred="output", target="label_seq")
    metric = AccuracyMetric(pred="predict", target="label_seq")

    trainer = ENASTrainer(model=model, controller=controller,
                          train_data=train_data, dev_data=test_data,
                          loss=CrossEntropyLoss(pred="output", target="label_seq"),
                          metrics=AccuracyMetric(pred="predict", target="label_seq"),
                          check_code_level=-1, save_path=None, batch_size=32,
                          print_every=1, n_epochs=3, final_epochs=1)
    trainer.train()
    print('Train finished!')

    # call Tester to evaluate performance on test_data
    from fastNLP import Tester
    tester = Tester(data=test_data, model=model,
                    metrics=AccuracyMetric(pred="predict", target="label_seq"),
                    batch_size=4)
    acc = tester.test()
    print(acc)
def test_split(self):
    ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
    d1, d2 = ds.split(0.1)
def prepare_fake_dataset2(*args, size=100):
    # use the size parameter for 'y' as well; the hard-coded 100 would break for size != 100
    ys = np.random.randint(4, size=size, dtype=np.int64)
    data = {'y': ys}
    for arg in args:
        data[arg] = np.random.randn(size, 5)
    return DataSet(data=data)
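# prepare_fake_dataset2 in use (a sketch): each positional name becomes a float
# feature field of shape (size, 5), alongside the integer 'y' field.
ds = prepare_fake_dataset2('x1', 'x2', size=100)
ds.set_input('x1', 'x2')
ds.set_target('y')
print(len(ds))  # 100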
def test_get_all_fields(self):
    ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
    ans = ds.get_all_fields()
    self.assertEqual(ans["x"].content, [[1, 2, 3, 4]] * 10)
    self.assertEqual(ans["y"].content, [[5, 6]] * 10)
def from_raw_text_new(chars, vocabs, w_list, number_normalized=False):
    from fastNLP.core import DataSet
    from utils import get_bigrams

    bigrams = get_bigrams(chars)
    seq_len = len(chars)
    target = ['O'] * seq_len
    dataset = DataSet({
        'chars': [chars],
        'bigrams': [bigrams],
        'seq_len': [seq_len],
        'target': [target]
    })
    datasets = {'train': dataset}

    def normalize_char(inp):
        result = []
        for c in inp:
            if c.isdigit():
                result.append('0')
            else:
                result.append(c)
        return result

    def normalize_bigram(inp):
        result = []
        for bi in inp:
            tmp = bi
            if tmp[0].isdigit():
                tmp = '0' + tmp[:1]
            if tmp[1].isdigit():
                tmp = tmp[0] + '0'
            result.append(tmp)
        return result

    if number_normalized == 3:
        print('not supported, exiting!')
        exit()
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'], field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'], field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def get_skip_path(chars, w_trie):
        sentence = ''.join(chars)
        result = w_trie.get_lexicon(sentence)
        # print(result)
        return result

    from V0.utils_ import Trie
    from functools import partial
    from fastNLP.core import Vocabulary
    from fastNLP.embeddings import StaticEmbedding
    from fastNLP import DataSet

    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    import copy
    for k, v in datasets.items():
        v.apply_field(partial(get_skip_path, w_trie=w_trie), 'chars', 'lexicons')
        v.apply_field(copy.copy, 'chars', 'raw_chars')
        v.add_seq_len('lexicons', 'lex_num')
        v.apply_field(lambda x: list(map(lambda y: y[0], x)), 'lexicons', 'lex_s')
        v.apply_field(lambda x: list(map(lambda y: y[1], x)), 'lexicons', 'lex_e')

    if number_normalized == 1:
        print('not supported, exiting!')
        exit()
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'], field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if number_normalized == 2:
        print('not supported, exiting!')
        exit()
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'], field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'], field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def concat(ins):
        chars = ins['chars']
        lexicons = ins['lexicons']
        result = chars + list(map(lambda x: x[2], lexicons))
        # print('lexicons:{}'.format(lexicons))
        # print('lex_only:{}'.format(list(filter(lambda x: x[2], lexicons))))
        # print('result:{}'.format(result))
        return result

    def get_pos_s(ins):
        lex_s = ins['lex_s']
        seq_len = ins['seq_len']
        pos_s = list(range(seq_len)) + lex_s
        return pos_s

    def get_pos_e(ins):
        lex_e = ins['lex_e']
        seq_len = ins['seq_len']
        pos_e = list(range(seq_len)) + lex_e
        return pos_e

    for k, v in datasets.items():
        v.apply(concat, new_field_name='lattice')
        v.set_input('lattice')
        v.apply(get_pos_s, new_field_name='pos_s')
        v.apply(get_pos_e, new_field_name='pos_e')
        v.set_input('pos_s', 'pos_e')

    # print(list(datasets['train'][:10]['lexicons']))
    # print(list(datasets['train'][:10]['lattice']))
    # print(list(datasets['train'][:10]['lex_s']))
    # print(list(datasets['train'][:10]['lex_e']))
    # print(list(datasets['train'][:10]['pos_s']))
    # print(list(datasets['train'][:10]['pos_e']))
    # exit(1208)

    # for k, v in datasets.items():
    #     v.apply_field(lambda x: [list(map(lambda x: x[0], p)) for p in x], 'skips_l2r', 'skips_l2r_source')
    #     v.apply_field(lambda x: [list(map(lambda x: x[1], p)) for p in x], 'skips_l2r', 'skips_l2r_word')
    #
    # for k, v in datasets.items():
    #     v.apply_field(lambda x: [list(map(lambda x: x[0], p)) for p in x], 'skips_r2l', 'skips_r2l_source')
    #     v.apply_field(lambda x: [list(map(lambda x: x[1], p)) for p in x], 'skips_r2l', 'skips_r2l_word')
    #
    # for k, v in datasets.items():
    #     v.apply_field(lambda x: list(map(len, x)), 'skips_l2r_word', 'lexicon_count')
    #     v.apply_field(lambda x: list(map(lambda y: list(map(lambda z: word_vocab.to_index(z), y)), x)),
    #                   'skips_l2r_word', new_field_name='skips_l2r_word')
    #     v.apply_field(lambda x: list(map(len, x)), 'skips_r2l_word', 'lexicon_count_back')
    #     v.apply_field(lambda x: list(map(lambda y: list(map(lambda z: word_vocab.to_index(z), y)), x)),
    #                   'skips_r2l_word', new_field_name='skips_r2l_word')

    vocabs['char'].index_dataset(*(datasets.values()), field_name='chars', new_field_name='chars')
    vocabs['bigram'].index_dataset(*(datasets.values()), field_name='bigrams', new_field_name='bigrams')
    vocabs['label'].index_dataset(*(datasets.values()), field_name='target', new_field_name='target')
    vocabs['lattice'].index_dataset(*(datasets.values()), field_name='lattice', new_field_name='lattice')

    return datasets, vocabs
def test__repr__(self):
    ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
    for ins in ds:
        self.assertEqual(ins.__repr__(),
                         "{'x': [1, 2, 3, 4] type=list,\n'y': [5, 6] type=list}")
def get_fastnlp_dataset():
    # Hyper parameters
    output_dim = 10
    SEQUENCE_LENGTH = 28
    mnist_train_length = 60000
    validation_samples = 5000
    BATCH_SIZE = 1

    transform_train_test = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])

    indices = torch.randperm(mnist_train_length)
    train_indices = indices[:len(indices) - validation_samples]
    val_indices = indices[len(indices) - validation_samples:]

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=True, download=True, transform=transform_train_test),
        sampler=SubsetRandomSampler(train_indices),
        batch_size=BATCH_SIZE, shuffle=False)
    val_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=True, download=False, transform=transform_train_test),
        sampler=SubsetRandomSampler(val_indices),
        batch_size=BATCH_SIZE, shuffle=False)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=False, transform=transform_train_test),
        batch_size=BATCH_SIZE, shuffle=True)

    TRAIN_ITERS = int(math.ceil((mnist_train_length - validation_samples) / BATCH_SIZE))
    VAL_ITERS = int(math.ceil(validation_samples / BATCH_SIZE))
    TEST_ITERS = int(math.ceil(len(test_loader.dataset) / BATCH_SIZE))

    train_data = DataSet()
    val_data = DataSet()
    test_data = DataSet()
    for batch_idx, (x, y) in enumerate(train_loader):
        x = x[0][0].numpy()
        train_data.append(Instance(word_seq=x, target=int(y)))
    for batch_idx, (x, y) in enumerate(val_loader):
        x = x[0][0].numpy()
        val_data.append(Instance(word_seq=x, target=int(y)))
    for batch_idx, (x, y) in enumerate(test_loader):
        x = x[0][0].numpy()
        test_data.append(Instance(word_seq=x, target=int(y)))

    # set the input fields and the target fields
    train_data.set_input("word_seq")
    test_data.set_input("word_seq")
    val_data.set_input("word_seq")
    train_data.set_target("target")
    test_data.set_target("target")
    val_data.set_target("target")

    return train_data, val_data, test_data
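# Consumption sketch (hypothetical): the MNIST DataSets above iterate like any
# other fastNLP dataset, e.g. through Batch; each 'word_seq' is a 28x28 array.
from fastNLP.core.batch import Batch
from fastNLP.core.sampler import RandomSampler

train_data, val_data, test_data = get_fastnlp_dataset()
for batch_x, batch_y in Batch(dataset=train_data, batch_size=16, sampler=RandomSampler()):
    print(batch_x['word_seq'].shape, batch_y['target'].shape)
    break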