def load(self, path: str, bigram: bool = False) -> DataSet:
    """
    :param path: str, path to the data file
    :param bigram: whether to add the bigram feature
    :return: DataSet
    """
    dataset = DataSet()
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:  # skip empty lines
                continue
            parts = line.split()
            word_lens = map(len, parts)
            chars = list(''.join(parts))
            tags = self._word_len_to_target(word_lens)
            assert len(chars) == len(tags['target'])
            dataset.append(Instance(raw_chars=chars, **tags, seq_len=len(chars)))
    if len(dataset) == 0:
        raise RuntimeError(f"{path} has no valid data.")
    if bigram:
        dataset.apply_field(self._gen_bigram, field_name='raw_chars', new_field_name='bigrams')
    return dataset
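# --- Minimal sketch of the bigram step above (not the loader's own _gen_bigram) ---
# Assumes a simple character-bigram function that pads the final position with
# '<eos>'; the real _gen_bigram may use a different padding token.
from fastNLP import DataSet, Instance

def gen_bigram(chars):
    # pair each character with its successor; pad the last pair with '<eos>'
    return [a + b for a, b in zip(chars, chars[1:] + ['<eos>'])]

ds = DataSet()
ds.append(Instance(raw_chars=list('今天天气'), seq_len=4))
ds.apply_field(gen_bigram, field_name='raw_chars', new_field_name='bigrams')
print(ds[0]['bigrams'])  # ['今天', '天天', '天气', '气<eos>']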
def make_dataset(data):
    dataset = DataSet()
    mx = 0     # length (in tokens) of the longest sentence seen so far
    le = None  # the longest sentence itself, kept for inspection
    for x, y in zip(data.data, data.target):
        xx = deal(x)
        ins = Instance(sentence=xx, label=int(y))
        if mx < len(xx.split()):
            mx = len(xx.split())
            le = xx
        dataset.append(ins)
    print(mx)
    dataset.apply_field(lambda x: x.split(), field_name='sentence', new_field_name='words')
    dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
    dataset.rename_field('words', Const.INPUT)
    dataset.rename_field('seq_len', Const.INPUT_LEN)
    dataset.rename_field('label', Const.TARGET)
    dataset.set_input(Const.INPUT, Const.INPUT_LEN)
    dataset.set_target(Const.TARGET)
    return dataset
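# --- Note on the Const aliases used above (a hedged check, not part of make_dataset) ---
# In recent fastNLP releases Const.INPUT, Const.INPUT_LEN and Const.TARGET resolve to
# plain field names ('words', 'seq_len', 'target'); verify against your installed version.
from fastNLP import Const
print(Const.INPUT, Const.INPUT_LEN, Const.TARGET)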
def test_apply_tqdm(self):
    import time
    ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})

    def do_nothing(ins):
        time.sleep(0.01)

    ds.apply(do_nothing, use_tqdm=True)
    ds.apply_field(do_nothing, field_name='x', use_tqdm=True)
def test_copy_padder(self):
    from fastNLP.core.field import AutoPadder
    ds = DataSet()
    ds.add_field('idx', [1, 2, 3])
    ds['idx'].set_padder(None)  # workaround of problem 1
    ds.apply_field(lambda x: x, 'idx', 'idx')
    self.assertEqual(ds['idx'].padder, None)  # should be None, but AutoPadder

    ds = DataSet()
    ds.add_field('idx', [1, 2, 3])
    ds.apply_field(lambda x: x, 'idx', 'idx')
    self.assertTrue(isinstance(ds.get_field('idx').padder, AutoPadder))  # should be None, but AutoPadder
def prepare_nli_data(self):
    index = 'index'
    ds = DataSet({index: list(range(N_SAMPLES))})
    ds.apply_field(lambda x: self.gen_var_seq(MAX_LEN, VOCAB_SIZE),
                   field_name=index, new_field_name=C.INPUTS(0), is_input=True)
    ds.apply_field(lambda x: self.gen_var_seq(MAX_LEN, VOCAB_SIZE),
                   field_name=index, new_field_name=C.INPUTS(1), is_input=True)
    ds.apply_field(lambda x: randrange(NUM_CLS),
                   field_name=index, new_field_name=C.TARGET, is_target=True)
    ds.apply_field(len, C.INPUTS(0), C.INPUT_LENS(0), is_input=True, is_target=True)
    ds.apply_field(len, C.INPUTS(1), C.INPUT_LENS(1), is_input=True, is_target=True)
    ds.set_input(C.INPUTS(0), C.INPUTS(1))
    ds.set_target(C.TARGET)
    return ds
def construct_dataset(dataset):
    dataset_ = DataSet()
    for sentence, target in zip(dataset.data, dataset.target):
        instance = Instance()
        instance['raw_sentence'] = sentence
        instance['target'] = int(target)
        dataset_.append(instance)
    # strip punctuation
    dataset_.apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x['raw_sentence']),
                   new_field_name='sentence')
    # replace spaces, newlines and other whitespace characters with a single space
    dataset_.apply(lambda x: re.sub('[%s]' % re.escape(string.whitespace), ' ', x['sentence']),
                   new_field_name='sentence')
    # convert to lowercase
    dataset_.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')
    dataset_.apply_field(lambda x: x.split(), field_name='sentence', new_field_name='input')
    return dataset_
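# --- Standalone sketch of the cleaning chain used in construct_dataset above ---
# Pure re/string, no fastNLP required: drop punctuation, map every whitespace
# character to a space, lowercase, then split.
import re
import string

raw = "Hello,\tWorld!\nThis is a test."
no_punct = re.sub('[%s]' % re.escape(string.punctuation), '', raw)
one_line = re.sub('[%s]' % re.escape(string.whitespace), ' ', no_punct)
print(one_line.lower().split())  # ['hello', 'world', 'this', 'is', 'a', 'test']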
def read_file(filename, processing_word=get_processing_word(lowercase=False)):
    dataset = DataSet()
    niter = 0
    with codecs.open(filename, "r", "utf-16") as f:
        words, tags = [], []
        for line in f:
            line = line.strip()
            if len(line) == 0 or line.startswith("-DOCSTART-"):
                if len(words) != 0:
                    assert len(words) > 2
                    if niter == 1:
                        print(words, tags)
                    niter += 1
                    dataset.append(Instance(ori_words=words[:-1], ori_tags=tags[:-1]))
                    words, tags = [], []
            else:
                word, tag = line.split()
                word = processing_word(word)
                words.append(word)
                tags.append(tag.lower())

    dataset.apply_field(lambda x: [x[0]], field_name='ori_words', new_field_name='task')
    dataset.apply_field(lambda x: len(x), field_name='ori_tags', new_field_name='seq_len')
    dataset.apply_field(lambda x: expand(x), field_name='ori_words', new_field_name="bi1")
    return dataset
def prepare_pos_tagging_data(self):
    index = 'index'
    ds = DataSet({index: list(range(N_SAMPLES))})
    ds.apply_field(lambda x: self.gen_var_seq(MAX_LEN, VOCAB_SIZE),
                   field_name=index, new_field_name=C.INPUT, is_input=True)
    ds.apply_field(lambda x: self.gen_seq(len(x), NUM_CLS),
                   field_name=C.INPUT, new_field_name=C.TARGET, is_target=True)
    ds.apply_field(len, C.INPUT, C.INPUT_LEN, is_input=True, is_target=True)
    return ds
def get_data():
    dataset_train, dataset_test = get_text_classification_datasets()
    # print(dataset_train.data)
    dic_train = {
        "input": dataset_train.data,
        "target": dataset_train.target
    }
    dic_test = {
        "input": dataset_test.data,
        "target": dataset_test.target
    }
    dataset = DataSet(dic_train)
    test_data = DataSet(dic_test)

    dataset.apply_field(lambda x: re.sub(r'[{}]+'.format(string.punctuation), "", x.lower()),
                        field_name='input', new_field_name='input')
    dataset.apply_field(lambda x: re.sub(r'[{}]+'.format(string.whitespace), " ", x),
                        field_name='input', new_field_name='input')
    dataset.apply_field(lambda x: x.split(), field_name='input', new_field_name='words')

    test_data.apply_field(lambda x: re.sub(r'[{}]+'.format(string.punctuation), "", x.lower()),
                          field_name='input', new_field_name='input')
    test_data.apply_field(lambda x: re.sub(r'[{}]+'.format(string.whitespace), " ", x),
                          field_name='input', new_field_name='input')
    test_data.apply_field(lambda x: x.split(), field_name='input', new_field_name='words')

    # **************************
    dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
    test_data.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')

    dataset.rename_field('words', Const.INPUT)
    dataset.rename_field('seq_len', Const.INPUT_LEN)
    dataset.rename_field('target', Const.TARGET)
    test_data.rename_field('words', Const.INPUT)
    test_data.rename_field('seq_len', Const.INPUT_LEN)
    test_data.rename_field('target', Const.TARGET)

    # dataset.set_input(Const.INPUT, Const.INPUT_LEN)
    dataset.set_input(Const.INPUT)
    dataset.set_target(Const.TARGET)
    # test_data.set_input(Const.INPUT, Const.INPUT_LEN)
    test_data.set_input(Const.INPUT)
    test_data.set_target(Const.TARGET)
    # **************************

    # only use train for vocab or train+dev
    train_data, dev_data = dataset.split(0.1)
    # print(len(train_data), len(dev_data), len(test_data))
    # print(train_data[0])
    vocab = Vocabulary(min_freq=10).from_dataset(train_data, field_name=Const.INPUT)
    vocab.index_dataset(train_data, field_name=Const.INPUT, new_field_name=Const.INPUT)
    vocab.index_dataset(dev_data, field_name=Const.INPUT, new_field_name=Const.INPUT)
    vocab.index_dataset(test_data, field_name=Const.INPUT, new_field_name=Const.INPUT)
    # print(test_data[0])
    print(len(vocab))
    return vocab, train_data, dev_data, test_data
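# --- Compact sketch of the vocabulary step used in get_data above ---
# Build the Vocabulary from the training split only, then index every split with it
# (toy data; min_freq is omitted here so every token is kept).
from fastNLP import DataSet, Vocabulary, Const

toy = DataSet({Const.INPUT: [['a', 'b', 'a'], ['b', 'c']]})
vocab = Vocabulary().from_dataset(toy, field_name=Const.INPUT)
vocab.index_dataset(toy, field_name=Const.INPUT, new_field_name=Const.INPUT)
print(len(vocab), toy[0][Const.INPUT])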
# Prepare the dataset and testset
fitlog.commit(__file__)
fitlog.add_hyper_in_file(__file__)

table = str.maketrans('', '', string.punctuation)

newsgroups_train = fetch_20newsgroups(subset='train')
dataset = DataSet()
for i in range(newsgroups_train.target.shape[0]):
    dataset.append(Instance(raw_sentence=newsgroups_train.data[i].replace('\n', ' '),
                            target=int(newsgroups_train.target[i])))
dataset.apply(lambda x: x['raw_sentence'].lower().translate(table), new_field_name='sentence')
dataset.apply_field(lambda x: x.split(), field_name='sentence', new_field_name='words')
dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')

newsgroups_test = fetch_20newsgroups(subset='test')
testset = DataSet()
for i in range(newsgroups_test.target.shape[0]):
    testset.append(Instance(raw_sentence=newsgroups_test.data[i].replace('\n', ' '),
                            target=int(newsgroups_test.target[i])))
testset.apply(lambda x: x['raw_sentence'].lower().translate(table), new_field_name='sentence')
testset.apply_field(lambda x: x.split(), field_name='sentence', new_field_name='words')
def test_tutorial_1_data_preprocess(self):
    from fastNLP import DataSet
    data = {'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."],
            'words': [['this', 'is', 'the', 'first', 'instance', '.'],
                      ['Second', 'instance', '.'],
                      ['Third', 'instance', '.']],
            'seq_len': [6, 3, 3]}
    dataset = DataSet(data)  # each key of the dict must map to a list of the same length

    from fastNLP import DataSet
    from fastNLP import Instance
    dataset = DataSet()
    instance = Instance(raw_words="This is the first instance",
                        words=['this', 'is', 'the', 'first', 'instance', '.'],
                        seq_len=6)
    dataset.append(instance)

    from fastNLP import DataSet
    from fastNLP import Instance
    dataset = DataSet([
        Instance(raw_words="This is the first instance",
                 words=['this', 'is', 'the', 'first', 'instance', '.'],
                 seq_len=6),
        Instance(raw_words="Second instance .",
                 words=['Second', 'instance', '.'],
                 seq_len=3)
    ])

    from fastNLP import DataSet
    dataset = DataSet({'a': range(-5, 5), 'c': [0] * 10})
    # build a new DataSet with the matching instances removed, without changing dataset
    dropped_dataset = dataset.drop(lambda ins: ins['a'] < 0, inplace=False)
    # delete the matching instances from dataset in place
    dataset.drop(lambda ins: ins['a'] < 0)
    # delete the 3rd instance
    dataset.delete_instance(2)
    # delete the field named 'a'
    dataset.delete_field('a')
    # check whether a field named 'a' exists
    print(dataset.has_field('a'))  # or ('a' in dataset)
    # rename the field 'c' to 'b'
    dataset.rename_field('c', 'b')
    # length of the DataSet
    len(dataset)

    from fastNLP import DataSet
    data = {'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."]}
    dataset = DataSet(data)
    # split each sentence into words; see DataSet.apply() for details
    dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')
    # or use DataSet.apply_field()
    dataset.apply_field(lambda sent: sent.split(), field_name='raw_words', new_field_name='words')

    # besides anonymous functions, a named function can also be passed in
    def get_words(instance):
        sentence = instance['raw_words']
        words = sentence.split()
        return words

    dataset.apply(get_words, new_field_name='words')
class TextData():
    vocab_size = 0
    dataset_size = 0
    train_size = 0
    test_size = 0
    class_num = 4
    min_count = 10
    max_seq_len = 500
    seq_limit = 2000
    data_src = "20news"
    data_set = DataSet()
    train_set = DataSet()
    test_set = DataSet()
    dev_set = DataSet()
    vocab = None

    def __init__(self, data_src="20news", min_count=10, seq_limit=None):
        self.data_src = data_src
        self.min_count = min_count
        if seq_limit is not None:
            self.seq_limit = seq_limit

    def find_max_len(self, words):
        self.max_seq_len = max(len(words), self.max_seq_len)

    def seq_regularize(self, words):
        # left-pad with 0 up to max_seq_len, or truncate to max_seq_len
        wlen = len(words)
        if wlen < self.max_seq_len:
            return [0] * (self.max_seq_len - wlen) + words
        else:
            return words[:self.max_seq_len]

    def fetch_20news(self, size=4):
        print("Loading 20newsgroups data and tokenize.")
        if size == 20:
            train, test = get_all_20news()
        else:
            train, test = get_text_classification_datasets()
        train_input, test_input = tokenize(train.data, test.data)
        train_target = train.target
        test_target = test.target
        self.class_num = len(train.target_names)
        assert (self.class_num == len(test.target_names))

        # Building Fastnlp dataset.
        print("Building Fastnlp dataset.")
        self.train_set = DataSet({"text": train_input, "class": train_target})
        self.test_set = DataSet({"text": test_input, "class": test_target})

        # Building Fastnlp vocabulary.
        print("Building Fastnlp vocabulary.")
        self.vocab = Vocabulary(min_freq=self.min_count)
        self.train_set.apply(lambda x: [self.vocab.add_word(word) for word in x['text']])
        self.vocab.build_vocab()
        self.vocab.build_reverse_vocab()
        self.vocab_size = len(self.vocab)

        # Building id-representation for train_set and test_set.
        print("Building id-presentation for train_set and test_set.")
        self.vocab.index_dataset(self.train_set, self.test_set, field_name='text', new_field_name='words')
        self.train_set.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
        self.test_set.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
        self.train_set.apply_field(self.find_max_len, field_name='words')
        print(self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len, self.seq_limit)
        self.train_set.apply_field(self.seq_regularize, field_name='words', new_field_name='words')
        self.test_set.apply_field(self.seq_regularize, field_name='words', new_field_name='words')
        # self.train_set.apply(lambda x: text2multi_hot(x['words'], self.vocab_size), new_field_name="input")
        # self.test_set.apply(lambda x: text2multi_hot(x['words'], self.vocab_size), new_field_name='input')

        # Building target-vector for train_set and test_set.
        print("Building target-vector for train_set and test_set.")
        self.train_set.apply(lambda x: int(x['class']), new_field_name="target", is_target=True)
        self.test_set.apply(lambda x: int(x['class']), new_field_name="target", is_target=True)
        # self.train_set.apply(lambda x: class2target(x['class'], self.class_num), new_field_name="target")
        # self.test_set.apply(lambda x: class2target(x['class'], self.class_num), new_field_name="target")

    def fetch_csv(self, path=None):
        print("Not implemented now...")
        pass

    def fetch_data(self, path=None):
        if self.data_src == "20news":
            # Loading 20newsgroups data and tokenize.
            self.fetch_20news()
        elif self.data_src == "20news_all":
            self.fetch_20news(size=20)
        else:
            print("No data src...")
        self.train_size = self.train_set.get_length()
        self.test_size = self.test_set.get_length()
        return self.train_size, self.test_size
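# --- Minimal demo of the left-pad/truncate rule implemented by seq_regularize above ---
# Plain lists, no fastNLP needed; max_seq_len=5 is an arbitrary example value.
def seq_regularize(words, max_seq_len=5):
    if len(words) < max_seq_len:
        return [0] * (max_seq_len - len(words)) + words
    return words[:max_seq_len]

print(seq_regularize([3, 7, 9]))           # [0, 0, 3, 7, 9]
print(seq_regularize([1, 2, 3, 4, 5, 6]))  # [1, 2, 3, 4, 5]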
def main():
    parser = argparse.ArgumentParser()
    # fmt: off
    parser.add_argument("--data_path", required=True, type=str, help="all of datasets pkl paths")
    # fmt: on
    options, _ = parser.parse_known_args()

    train_set, test_set = DataSet(), DataSet()

    input_dir = os.path.join(options.data_path, "joint-sighan2008/bmes")
    options.output = os.path.join(options.data_path, "total_dataset.pkl")
    print(input_dir, options.output)

    for fn in os.listdir(input_dir):
        if fn not in ["test.txt", "train-all.txt"]:
            continue
        print(fn)
        abs_fn = os.path.join(input_dir, fn)
        ds = read_file(abs_fn)
        if "test.txt" == fn:
            test_set = ds
        else:
            train_set = ds

    print("num samples of total train, test: {}, {}".format(len(train_set), len(test_set)))

    uni_vocab = Vocabulary(min_freq=None).from_dataset(train_set, test_set, field_name="ori_words")
    # bi_vocab = Vocabulary(min_freq=3, max_size=50000).from_dataset(train_set, test_set, field_name="bi1")
    bi_vocab = Vocabulary(min_freq=3, max_size=None).from_dataset(
        train_set, field_name="bi1", no_create_entry_dataset=[test_set]
    )
    tag_vocab = Vocabulary(min_freq=None, padding="s", unknown=None).from_dataset(train_set, field_name="ori_tags")
    task_vocab = Vocabulary(min_freq=None, padding=None, unknown=None).from_dataset(train_set, field_name="task")

    def to_index(dataset):
        uni_vocab.index_dataset(dataset, field_name="ori_words", new_field_name="uni")
        tag_vocab.index_dataset(dataset, field_name="ori_tags", new_field_name="tags")
        task_vocab.index_dataset(dataset, field_name="task", new_field_name="task")
        dataset.apply_field(lambda x: x[1:], field_name="bi1", new_field_name="bi2")
        dataset.apply_field(lambda x: x[:-1], field_name="bi1", new_field_name="bi1")
        bi_vocab.index_dataset(dataset, field_name="bi1", new_field_name="bi1")
        bi_vocab.index_dataset(dataset, field_name="bi2", new_field_name="bi2")
        dataset.set_input("task", "uni", "bi1", "bi2", "seq_len")
        dataset.set_target("tags")
        return dataset

    train_set = to_index(train_set)
    test_set = to_index(test_set)

    output = {}
    output["train_set"] = train_set
    output["test_set"] = test_set
    output["uni_vocab"] = uni_vocab
    output["bi_vocab"] = bi_vocab
    output["tag_vocab"] = tag_vocab
    output["task_vocab"] = task_vocab

    print(tag_vocab.word2idx)
    print(task_vocab.word2idx)

    make_sure_path_exists(os.path.dirname(options.output))
    print("Saving dataset to {}".format(os.path.abspath(options.output)))
    with open(options.output, "wb") as outfile:
        dump(output, outfile)

    print(len(task_vocab), len(tag_vocab), len(uni_vocab), len(bi_vocab))

    dic = {}
    tokens = {}

    def process(words):
        # the first token marks the task; strip its first and last character to get the name
        name = words[0][1:-1]
        if name not in dic:
            dic[name] = set()
            tokens[name] = 0
        tokens[name] += len(words[1:])
        dic[name].update(words[1:])

    train_set.apply_field(process, "ori_words", None)
    for name in dic.keys():
        print(name, len(dic[name]), tokens[name])

    with open(os.path.join(os.path.dirname(options.output), "oovdict.pkl"), "wb") as f:
        dump(dic, f)

    def get_max_len(ds):
        global max_len
        max_len = 0

        def find_max_len(words):
            global max_len
            if max_len < len(words):
                max_len = len(words)

        ds.apply_field(find_max_len, "ori_words", None)
        return max_len

    print("train max len: {}, test max len: {}".format(get_max_len(train_set), get_max_len(test_set)))
class TextData():
    data_src = "all_data"
    class_num = 2
    min_count = 10
    max_seq_len = 500
    seq_limit = 2000
    train_set = DataSet()
    val_set = DataSet()
    test_set = DataSet()
    train_size = 0
    val_size = 0
    test_size = 0
    test_projectid = None
    vocab = None
    vocab_size = 0

    def __init__(self, data_src="all_data", min_count=10, seq_limit=None):
        self.data_src = data_src
        self.min_count = min_count
        if seq_limit is not None:
            self.seq_limit = seq_limit

    def find_max_len(self, words):
        self.max_seq_len = max(len(words), self.max_seq_len)

    def seq_regularize(self, words):
        # left-pad with 0 up to max_seq_len, or truncate to max_seq_len
        wlen = len(words)
        if wlen < self.max_seq_len:
            return [0] * (self.max_seq_len - wlen) + words
        else:
            return words[:self.max_seq_len]

    def fetch_csv(self, path, text_var="essay", target="is_exciting",
                  subset_num=None, us_rate=None, os_rate=None):
        """
        us_rate: under-sampling rate
        os_rate: over-sampling rate
        """
        print("Loading data from {} ...".format(path))
        df = pd.read_csv(path)
        # text_vars = ["title", "short_description", "need_statement", "essay"]
        text_vars = text_var  # only select the essay column

        target_var = "y"
        df[target_var] = 0.0
        df.loc[df[target] == "t", target_var] = 1.0
        df.loc[df[target] != "t", target_var] = 0.0

        train_df = df[df['split'] == 'train']
        val_df = df[df['split'] == 'val']
        test_df = df[df['split'] == 'test']

        train_num = len(train_df)
        val_num = len(val_df)
        test_num = len(test_df)
        print("nums:({},{},{})".format(train_num, val_num, test_num))

        if os_rate is not None:
            print("Over Sample mode")
            ros = RandomOverSampler(random_state=0)
        elif us_rate is not None:
            print("Under Sample mode")
            train_df_t = train_df[df[target] == "t"]
            train_df_f = train_df[df[target] == "f"]
            t_num = len(train_df_t)
            f_num = len(train_df_f)
            print("Raw train t:f = {}:{}".format(t_num, f_num))
            nf_num = int(t_num / us_rate)
            f_num = min(nf_num, f_num)
            balanced_train_t = train_df_t.sample(n=t_num)
            balanced_train_f = train_df_f.sample(n=f_num)
            train_df = pd.concat([balanced_train_t, balanced_train_f]).sample(frac=1)
            print("Balanced train: t:f = {}:{}".format(len(balanced_train_t), len(balanced_train_f)))
            # print("Train 1.0:", len(train_df[train_df[target_var] == 1.0]))

            val_df_t = val_df[df[target] == "t"]
            val_df_f = val_df[df[target] == "f"]
            t_num = len(val_df_t)
            f_num = len(val_df_f)
            print("Raw val t:f = {}:{}".format(t_num, f_num))
            nf_num = int(t_num / us_rate)
            f_num = min(nf_num, f_num)
            balanced_val_t = val_df_t.sample(n=t_num)
            balanced_val_f = val_df_f.sample(n=f_num)
            val_df = pd.concat([balanced_val_t, balanced_val_f]).sample(frac=1)
            print("Balanced val: t:f = {}:{}".format(len(balanced_val_t), len(balanced_val_f)))
        else:
            print("No sample mode")

        if subset_num is not None and subset_num > 0:
            print("Get sub set of size {}.".format(subset_num))
            train_df = train_df.sample(n=subset_num)
            val_df = val_df.sample(n=subset_num)
            train_num = len(train_df)
            val_num = len(val_df)
            test_num = len(test_df)
            print("subset nums:({},{},{})".format(train_num, val_num, test_num))

        train_target = train_df[target_var].values
        count = 0
        print(count)
        val_target = val_df[target_var].values
        test_target = test_df[target_var].values

        print("tokenize train set")
        train_input = tokenize(train_df[text_vars].values)
        print("tokenize val set")
        val_input = tokenize(val_df[text_vars].values)
        print("tokenize test set")
        test_input = tokenize(test_df[text_vars].values)

        assert (self.class_num == 2)
        self.test_projectid = test_df['projectid']

        # Building Fastnlp dataset.
print("Building Fastnlp dataset.") if os_rate is not None: print("Over Sampling...") train_input, train_target = ros.fit_sample( np.array(train_input)[:, np.newaxis], np.array(train_target)[:, np.newaxis]) train_input = train_input.squeeze().tolist() train_target = train_target.tolist() val_input, val_target = ros.fit_sample( np.array(val_input)[:, np.newaxis], np.array(val_target)[:, np.newaxis]) val_input = val_input.squeeze().tolist() val_target = val_target.tolist() self.train_set = DataSet({"text": train_input, "class": train_target}) self.val_set = DataSet({"text": val_input, "class": val_target}) self.test_set = DataSet({"text": test_input, "class": test_target}) # Building Fastnlp vocabulary... print("Building Fastnlp vocabulary.") self.vocab = Vocabulary(min_freq=self.min_count) self.train_set.apply( lambda x: [self.vocab.add_word(word) for word in x['text']]) self.vocab.build_vocab() self.vocab.build_reverse_vocab() self.vocab_size = len(self.vocab) # Building multi-hot-vector for train_set and test_set. print("Building id-presentation for train_set and test_set.") self.vocab.index_dataset(self.train_set, self.val_set, self.test_set, field_name='text', new_field_name='words') self.train_set.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len') self.val_set.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len') self.test_set.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len') self.train_set.apply_field(self.find_max_len, field_name='words') print(self.max_seq_len) self.max_seq_len = min(self.max_seq_len, self.seq_limit) self.train_set.apply_field(self.seq_regularize, field_name='words', new_field_name='words') self.val_set.apply_field(self.seq_regularize, field_name='words', new_field_name='words') self.test_set.apply_field(self.seq_regularize, field_name='words', new_field_name='words') # self.train_set.apply(lambda x : text2multi_hot(x['words'],self.vocab_size),new_field_name="input") # self.val_set.apply(lambda x : text2multi_hot(x['words'],self.vocab_size),new_field_name='input') # self.test_set.apply(lambda x : text2multi_hot(x['words'],self.vocab_size),new_field_name='input') # Building target-vector for train_set and test_set. print("Building target-vector for train_set and test_set.") self.train_set.apply(lambda x: int(x['class']), new_field_name="target", is_target=True) self.val_set.apply(lambda x: int(x['class']), new_field_name="target", is_target=True) self.test_set.apply(lambda x: int(x['class']), new_field_name="target", is_target=True) # self.train_set.apply(lambda x : class2target(x['class'],self.calss_num),new_field_name="target") # self.test_set.apply(lambda x : class2target(x['class'],self.calss_num),new_field_name="target") def fetch_data(self, path, text_var="essay", target_var="is_exciting", subset_num=None, us_rate=None, os_rate=None): if self.data_src == "all_data": # Loading 20newsgroups data and tokenize. self.fetch_csv(path, text_var, target_var, subset_num, us_rate, os_rate) else: print("No legal data src type:{} ...".format(self.data_src)) assert (0 == 1) self.train_size = self.train_set.get_length() self.val_size = self.val_set.get_length() self.test_size = self.test_set.get_length() return self.train_size, self.val_size, self.test_size
def preprocessing(data_train, data_test):
    data_train_dict = {'raw_text': data_train.data, 'label': data_train.target}
    data_test_dict = {'raw_text': data_test.data, 'label': data_test.target}
    dataset = DataSet(data_train_dict)
    test_set = DataSet(data_test_dict)

    # remove punctuation, squeeze whitespace into single spaces, strip and lowercase
    dataset.apply_field(lambda piece: re.sub('[' + string.whitespace + '\u200b]+', ' ',
                                             re.sub('[' + string.punctuation + ']', '', piece)).strip().lower(),
                        field_name='raw_text', new_field_name='raw_text')
    test_set.apply_field(lambda piece: re.sub('[' + string.whitespace + '\u200b]+', ' ',
                                              re.sub('[' + string.punctuation + ']', '', piece)).strip().lower(),
                         field_name='raw_text', new_field_name='raw_text')
    dataset.apply_field(lambda piece: piece.split(' '), field_name='raw_text', new_field_name='text')
    test_set.apply_field(lambda piece: piece.split(' '), field_name='raw_text', new_field_name='text')

    # Inspect the distribution of text lengths in the dataset to pick a suitable text_len
    # data_lens = []
    # for instance in dataset:
    #     data_lens.append(len(instance['text']))
    # for instance in test_set:
    #     data_lens.append(len(instance['text']))
    # print("max text_len %d, min text_len %d" % (max(data_lens), min(data_lens)))
    # print(len([i for i in data_lens if i < 400]))
    # plt.hist(data_lens, bins=200, facecolor="blue", edgecolor="black", alpha=0.7)
    # plt.xlabel("text_length")
    # plt.ylabel("number of texts")
    # plt.title("Distribution of text_length")
    # plt.show()

    # truncate every text to text_len tokens
    dataset.apply_field(lambda piece: piece[:text_len], field_name='text', new_field_name='text')
    test_set.apply_field(lambda piece: piece[:text_len], field_name='text', new_field_name='text')
    dataset.delete_field('raw_text')
    test_set.delete_field('raw_text')

    # Map all digits to the same form
    # for instance in dataset:
    #     for i, word in enumerate(instance['text']):
    #         if word.isdigit():
    #             instance['text'][i] = '1'
    # for instance in test_set:
    #     for i, word in enumerate(instance['text']):
    #         if word.isdigit():
    #             instance['text'][i] = '1'

    vocab = Vocabulary(min_freq=min_freqency, unknown='<unk>', padding='<pad>').from_dataset(dataset, field_name='text')
    print("vocabulary_length:", len(vocab))
    vocab.index_dataset(dataset, field_name='text', new_field_name='text')
    vocab.index_dataset(test_set, field_name='text', new_field_name='text')

    # Whether to use padding to make every text the same length
    train_set, dev_set = dataset.split(0.2)
    train_set.rename_field('text', Const.INPUT)
    train_set.rename_field('label', Const.TARGET)
    train_set.set_input(Const.INPUT)
    train_set.set_target(Const.TARGET)
    dev_set.rename_field('text', Const.INPUT)
    dev_set.rename_field('label', Const.TARGET)
    dev_set.set_input(Const.INPUT)
    dev_set.set_target(Const.TARGET)
    test_set.rename_field('text', Const.INPUT)
    test_set.rename_field('label', Const.TARGET)
    test_set.set_input(Const.INPUT)
    test_set.set_target(Const.TARGET)
    print("train_set length:", len(train_set))
    print("dev_set length:", len(dev_set))
    print("test_set length:", len(test_set))
    return train_set, dev_set, test_set, vocab
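# --- Small sketch of DataSet.split as used in preprocessing above ---
# With ratio=0.2 the first returned DataSet keeps roughly 80% of the instances and
# the second roughly 20% (check your fastNLP version's docs for the exact contract).
from fastNLP import DataSet

ds = DataSet({'x': list(range(10))})
bigger, smaller = ds.split(0.2)
print(len(bigger), len(smaller))  # expected: 8 2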