def get_fastnlp_dataset():
    text_train, text_test = get_text_classification_datasets()
    train_data = DataSet()
    test_data = DataSet()
    for i in range(len(text_train.data)):
        train_data.append(
            Instance(text=split_sent(text_train.data[i]),
                     target=int(text_train.target[i])))
    for i in range(len(text_test.data)):
        test_data.append(
            Instance(text=split_sent(text_test.data[i]),
                     target=int(text_test.target[i])))
    # Build the vocabulary from the training set
    vocab = Vocabulary(min_freq=5, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['text']])
    vocab.build_vocab()
    # Map each sentence to word indices according to the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['text']],
                     new_field_name='word_seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['text']],
                    new_field_name='word_seq')
    # Mark the input and target fields
    train_data.set_input("word_seq")
    test_data.set_input("word_seq")
    train_data.set_target("target")
    test_data.set_target("target")
    return train_data, test_data, vocab

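def _demo_get_fastnlp_dataset():
    # Hedged usage sketch (added for illustration; not part of the original
    # source). The function name is hypothetical; it only inspects the objects
    # returned by get_fastnlp_dataset() above.
    train_data, test_data, vocab = get_fastnlp_dataset()
    print("train/test sizes:", len(train_data), len(test_data))
    print("vocab size:", len(vocab))
    # First ten word indices of the first training instance
    print(train_data[0]['word_seq'][:10])
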
def preprocess():
    train_set = DataSet()
    for i in range(len(raw_train.data)):
        train_set.append(
            Instance(sentence=raw_train.data[i],
                     target=int(raw_train.target[i])))
    train_set.apply(lambda x: x['sentence'].translate(
        str.maketrans("", "", string.punctuation)).lower(),
        new_field_name='sentence')
    train_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    train_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    test_set = DataSet()
    for i in range(len(raw_test.data)):
        test_set.append(
            Instance(sentence=raw_test.data[i],
                     target=int(raw_test.target[i])))
    test_set.apply(lambda x: x['sentence'].translate(
        str.maketrans("", "", string.punctuation)).lower(),
        new_field_name='sentence')
    test_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    test_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    vocab = Vocabulary(min_freq=10)
    train_set.apply(lambda x: [vocab.add(word) for word in x['words']])
    test_set.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    vocab.index_dataset(train_set, field_name='words', new_field_name='words')
    vocab.index_dataset(test_set, field_name='words', new_field_name='words')
    return train_set, test_set, vocab

def create_dataset():
    # categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', 'rec.motorcycles']
    # categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', 'rec.motorcycles',
    #               'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
    #               'comp.windows.x', 'misc.forsale']
    categories = ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
                  'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
                  'comp.windows.x', 'misc.forsale', 'rec.autos',
                  'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey',
                  'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
                  'soc.religion.christian', 'talk.politics.guns',
                  'talk.politics.mideast', 'talk.politics.misc',
                  'talk.religion.misc']
    newsgroups_train = fetch_20newsgroups(subset='train', categories=categories,
                                          data_home='../../..')
    newsgroups_test = fetch_20newsgroups(subset='test', categories=categories,
                                         data_home='../../..')

    dataset = DataSet()
    for i in range(len(newsgroups_train.data)):
        if len(newsgroups_train.data[i]) <= 2000:
            dataset.append(Instance(raw_sentence=newsgroups_train.data[i],
                                    target=int(newsgroups_train.target[i])))
    for i in range(len(newsgroups_test.data)):
        if len(newsgroups_test.data[i]) <= 2000:
            dataset.append(Instance(raw_sentence=newsgroups_test.data[i],
                                    target=int(newsgroups_test.target[i])))

    dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    dataset.apply(lambda x: x['sentence'].split(), new_field_name='words')
    dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')

    vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')
    vocab.index_dataset(dataset, field_name='words', new_field_name='words')

    dataset.set_input('words', 'seq_len')
    dataset.set_target('target')

    train_dev_data, test_data = dataset.split(0.1)
    train_data, dev_data = train_dev_data.split(0.1)
    return vocab, train_data, dev_data, test_data

def load(self, folder):
    fns = {
        'dev': '{}_dev.csv'.format(self.lg1_lg2),
        'test': '{}_test500.csv'.format(self.lg1_lg2),
        'train': '{}_train500_10.csv'.format(self.lg1_lg2)
    }
    target_lg = self.lg1_lg2.split('_')[0]
    data_bundle = DataBundle()
    for name, fn in fns.items():
        path = os.path.join(folder, fn)
        ds = DataSet()
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    parts = line.split('\t')
                    if self.lower:
                        ins = Instance(word=parts[1].lower(),
                                       definition=parts[-1].lower())
                    else:
                        ins = Instance(word=parts[1], definition=parts[-1])
                    ds.append(ins)
        data_bundle.set_dataset(ds, name=name)

    target_words = {}
    with open(os.path.join(folder, '{}.txt'.format(target_lg)), encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                if self.lower:
                    line = line.lower()
                target_words[line] = 1
    target_words = list(target_words.keys())
    setattr(data_bundle, 'target_words', target_words)
    return data_bundle

def test_list_of_numpy_to_tensor(self):
    ds = DataSet([Instance(x=np.array([1, 2]), y=np.array([3, 4]))
                  for _ in range(2)] +
                 [Instance(x=np.array([1, 2, 3, 4]), y=np.array([3, 4, 5, 6]))
                  for _ in range(2)])
    ds.set_input("x")
    ds.set_target("y")
    iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
    for x, y in iter:
        print(x, y)

def test_init(self):
    fields = {"x": [1, 2, 3], "y": [4, 5, 6]}
    ins = Instance(x=[1, 2, 3], y=[4, 5, 6])
    self.assertTrue(isinstance(ins.fields, dict))
    self.assertEqual(ins.fields, fields)

    ins = Instance(**fields)
    self.assertEqual(ins.fields, fields)

def test_case_1(self):
    ds = DataSet([Instance(words=['hello', 'world']), Instance(words=['Jack'])])
    vocab = Vocabulary().from_dataset(ds, field_name='words')
    self.assertEqual(len(vocab), 5)
    embed = LSTMCharEmbedding(vocab, embed_size=60)
    x = torch.LongTensor([[2, 1, 0], [4, 3, 4]])
    y = embed(x)
    self.assertEqual(tuple(y.size()), (2, 3, 60))

def readdata():
    global target_len
    min_count = 10
    # categories = ['comp.os.ms-windows.misc', 'rec.motorcycles', 'sci.space', 'talk.politics.misc', ]
    dataset_train = fetch_20newsgroups(subset='train', data_home='../../..')
    dataset_test = fetch_20newsgroups(subset='test', data_home='../../..')

    data = dataset_train.data
    target = dataset_train.target
    target_len = len(dataset_train.target_names)
    train_data = DataSet()
    padding = 0
    for i in range(len(data)):
        data_t = re.sub(r"\d+|\s+|/", " ", data[i])
        temp = [word.strip(string.punctuation).lower()
                for word in data_t.split()
                if word.strip(string.punctuation) != '']
        train_data.append(Instance(raw=data[i], label=int(target[i]), words=temp))
        if len(temp) > padding:
            padding = len(temp)
    train_data.apply(lambda x: x['raw'].lower(), new_field_name='raw')

    data = dataset_test.data
    target = dataset_test.target
    test_data = DataSet()
    padding = 0
    for i in range(len(data)):
        data_t = re.sub(r"\d+|\s+|/", " ", data[i])
        temp = [word.strip(string.punctuation).lower()
                for word in data_t.split()
                if word.strip(string.punctuation) != '']
        test_data.append(Instance(raw=data[i], label=int(target[i]), words=temp))
        if len(temp) > padding:
            padding = len(temp)
    test_data.apply(lambda x: x['raw'].lower(), new_field_name='raw')

    train_data.apply(lambda x: len(x['words']), new_field_name='len')
    test_data.apply(lambda x: len(x['words']), new_field_name='len')

    vocab = Vocabulary(min_freq=10)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                     new_field_name='seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                    new_field_name='seq')

    train_data.rename_field('seq', Const.INPUT)
    train_data.rename_field('len', Const.INPUT_LEN)
    train_data.rename_field('label', Const.TARGET)
    test_data.rename_field('seq', Const.INPUT)
    test_data.rename_field('len', Const.INPUT_LEN)
    test_data.rename_field('label', Const.TARGET)

    test_data.set_input(Const.INPUT, Const.INPUT_LEN)
    test_data.set_target(Const.TARGET)
    train_data.set_input(Const.INPUT, Const.INPUT_LEN)
    train_data.set_target(Const.TARGET)

    test_data, dev_data = test_data.split(0.5)
    return train_data, dev_data, test_data, vocab

def test_list_of_list_to_tensor(self):
    ds = DataSet([Instance(x=[1, 2], y=[3, 4]) for _ in range(2)] +
                 [Instance(x=[1, 2, 3, 4], y=[3, 4, 5, 6]) for _ in range(2)])
    ds.set_input("x")
    ds.set_target("y")
    iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
    for x, y in iter:
        self.assertTrue(isinstance(x["x"], torch.Tensor))
        self.assertEqual(tuple(x["x"].shape), (4, 4))
        self.assertTrue(isinstance(y["y"], torch.Tensor))
        self.assertEqual(tuple(y["y"].shape), (4, 4))

def prepare_fake_dataset():
    mean = np.array([-3, -3])
    cov = np.array([[1, 0], [0, 1]])
    class_A = np.random.multivariate_normal(mean, cov, size=(1000,))

    mean = np.array([3, 3])
    cov = np.array([[1, 0], [0, 1]])
    class_B = np.random.multivariate_normal(mean, cov, size=(1000,))

    data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0])
                        for item in class_A] +
                       [Instance(x=[float(item[0]), float(item[1])], y=[1.0])
                        for item in class_B])
    return data_set

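def _demo_prepare_fake_dataset():
    # Hedged usage sketch (added for illustration; not part of the original
    # source). The two Gaussian blobs are centred at (-3, -3) and (3, 3), so
    # the resulting 2000-instance DataSet is easily linearly separable.
    ds = prepare_fake_dataset()
    print(len(ds))                  # 2000 instances
    print(ds[0]['x'], ds[0]['y'])   # a 2-d point from class A and its label
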
def test_case_2(self):
    # Tests that embeddings can be concatenated as long as they share the same index space.
    ds = DataSet([
        Instance(words=['hello', 'world']),
        Instance(words=['hello', 'Jack'])
    ])
    vocab1 = Vocabulary().from_dataset(ds, field_name='words')
    vocab2 = Vocabulary().from_dataset(ds, field_name='words')
    self.assertEqual(len(vocab1), 5)
    cnn_embed = CNNCharEmbedding(vocab1, embed_size=60)
    lstm_embed = LSTMCharEmbedding(vocab2, embed_size=70)
    embed = StackEmbedding([cnn_embed, lstm_embed])
    x = torch.LongTensor([[2, 1, 0], [4, 3, 4]])
    y = embed(x)
    self.assertEqual(tuple(y.size()), (2, 3, 130))

def read_dataset(path, lower, word_idx=1, def_idx=-1):
    ds = DataSet()
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                parts = line.split('\t')
                if lower:
                    ins = Instance(word=parts[word_idx].lower(),
                                   definition=parts[def_idx].lower())
                else:
                    ins = Instance(word=parts[word_idx],
                                   definition=parts[def_idx])
                ds.append(ins)
    return ds

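def _demo_read_dataset():
    # Hedged usage sketch (added for illustration; not part of the original
    # source). 'word_definitions.tsv' is a hypothetical tab-separated file
    # whose columns at word_idx/def_idx hold a word and its definition.
    ds = read_dataset('word_definitions.tsv', lower=True)
    print(len(ds))
    print(ds[0]['word'], '->', ds[0]['definition'])
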
def process_poems_large(file_name, sentence_len, vocab_size):
    sentences = []
    with open(file_name, "r", encoding='utf-8') as f:
        for line in f.readlines():
            try:
                line = line.strip()
                if line:
                    title, content = line.split(':')
                    # print(title)
                    # print(content)
                    # content = line.replace(' ', '').replace(',', '').replace('。', '')
                    content = content.replace(' ', '')  # keep punctuation
                    # Optionally keep only five-character poems:
                    # if len(content) < 6 or content[5] != ',':
                    #     continue
                    if len(content) < 20:
                        continue
                    if ':' in content or '_' in content or '(' in content or \
                            '(' in content or '《' in content or '[' in content:
                        continue
                    # Truncate to the maximum sentence length
                    if len(content) > sentence_len:
                        content = content[:sentence_len]
                    content = content + end_token
                    sentences.append(content)
            except ValueError as e:
                pass

    dataset = DataSet()
    # sentences = random.sample(sentences, 5000)
    for sentence in sentences:
        instance = Instance()
        instance['raw_sentence'] = sentence
        instance['target'] = sentence[1:] + sentence[-1]
        dataset.append(instance)
    dataset.set_input("raw_sentence")
    dataset.set_target("target")
    # for iter in dataset:
    #     print(iter['raw_sentence'])
    print("dataset_size:", len(dataset))

    train_data, dev_data = dataset.split(0.2)
    train_data.rename_field("raw_sentence", "sentence")
    dev_data.rename_field("raw_sentence", "sentence")

    # Build the vocabulary
    vocab = Vocabulary(max_size=vocab_size, min_freq=2, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['sentence']])
    vocab.build_vocab()

    # Index the sentences with the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['sentence']],
                     new_field_name='sentence')
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['target']],
                     new_field_name='target')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['sentence']],
                   new_field_name='sentence')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['target']],
                   new_field_name='target')
    print("vocabulary_size:", len(vocab))
    return train_data, dev_data, vocab

def get_joke_data(data_path):
    data_set = DataSet()
    sample_num = 0
    sample_len = []
    if os.path.exists(data_path):
        with open(data_path, 'r', encoding='utf-8') as fin:
            for lid, line in enumerate(fin):
                joke = json.loads(line)
                if joke['support'] > 0:
                    if len(joke['content']) == 0:
                        continue
                    else:
                        instance = Instance(raw_joke=joke['content'])
                        data_set.append(instance)
                        sample_num += 1
                        sample_len.append(len(joke['content']))
    else:
        print("the data path doesn't exist.")
    print("Got {} samples from file.".format(sample_num))

    import random
    for i in range(5):
        # randint is inclusive on both ends, so the upper bound is sample_num - 1
        id = random.randint(0, sample_num - 1)
        print("sample {}: {}".format(id, data_set[id]['raw_joke']))

    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    plt.hist(sample_len, bins=50, range=(0, 1000))
    plt.savefig("./examples.jpg")

    count = 0
    for i in sample_len:
        if i < 255:
            count += 1
    print(count, '/', len(sample_len))
    return data_set

def make_dataset(data):
    dataset = DataSet()
    mx = 0
    le = None
    for x, y in zip(data.data, data.target):
        xx = deal(x)
        ins = Instance(sentence=xx, label=int(y))
        if mx < len(xx.split()):
            mx = max(mx, len(xx.split()))
            le = xx
        dataset.append(ins)
    print(mx)
    dataset.apply_field(lambda x: x.split(), field_name='sentence', new_field_name='words')
    dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
    dataset.rename_field('words', Const.INPUT)
    dataset.rename_field('seq_len', Const.INPUT_LEN)
    dataset.rename_field('label', Const.TARGET)
    dataset.set_input(Const.INPUT, Const.INPUT_LEN)
    dataset.set_target(Const.TARGET)
    return dataset

def test_append(self):
    dd = DataSet()
    for _ in range(3):
        dd.append(Instance(x=[1, 2, 3, 4], y=[5, 6]))
    self.assertEqual(len(dd), 3)
    self.assertEqual(dd.field_arrays["x"].content, [[1, 2, 3, 4]] * 3)
    self.assertEqual(dd.field_arrays["y"].content, [[5, 6]] * 3)

def make_dataset(data):
    dataset = DataSet()
    tot = 0
    for x in data:
        seq = "[CLS] " + x["raw_text"]
        seq = tokenizer.encode(seq)
        """
        seq = ["[CLS]"] + word_tokenize(x["raw_text"])
        seq = tokenizer.convert_tokens_to_ids(seq)
        """
        if len(seq) > 512:
            seq = seq[:512]
            tot += 1
            # print(x["raw_text"])
            # print()
        label = int(x["label"])
        ins = Instance(origin=x["raw_text"], seq=seq, label=label, seq_len=len(seq))
        dataset.append(ins)
    dataset.set_input("seq", "seq_len")
    dataset.set_target("label")
    print(dataset[5])
    print("number:", len(dataset), tot)
    print()
    return dataset

def load(path):
    data = DataSet()
    _data = []
    with open(path, "r", encoding="utf-8") as fil:
        fil.readline()  # skip the first (header) line
        for line in fil:
            try:
                tradi, verna = line.strip().split("\t")
            except ValueError:
                continue
            tradi = chinese_tokenizer(tradi)
            verna = chinese_tokenizer(verna)
            vocab.add_word_lst(tradi)
            vocab.add_word_lst(verna)
            _data.append(Instance(traditional=tradi, vernacular=verna))
    random.shuffle(_data)
    for x in _data:
        data.append(x)
    data.set_input("vernacular")
    data.set_target("traditional")
    return data

def prepare_env():
    mean = np.array([-3, -3])
    cov = np.array([[1, 0], [0, 1]])
    class_A = np.random.multivariate_normal(mean, cov, size=(1000,))

    mean = np.array([3, 3])
    cov = np.array([[1, 0], [0, 1]])
    class_B = np.random.multivariate_normal(mean, cov, size=(1000,))

    data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0])
                        for item in class_A] +
                       [Instance(x=[float(item[0]), float(item[1])], y=[1.0])
                        for item in class_B])
    data_set.set_input("x")
    data_set.set_target("y")
    model = NaiveClassifier(2, 1)
    return data_set, model

def load(self, path: str, bigram: bool = False) -> DataSet:
    """
    :param path: str
    :param bigram: whether to add the bigram feature
    :return:
    """
    dataset = DataSet()
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:  # skip empty lines
                continue
            parts = line.split()
            word_lens = map(len, parts)
            chars = list(''.join(parts))
            tags = self._word_len_to_target(word_lens)
            assert len(chars) == len(tags['target'])
            dataset.append(
                Instance(raw_chars=chars, **tags, seq_len=len(chars)))
    if len(dataset) == 0:
        raise RuntimeError(f"{path} has no valid data.")
    if bigram:
        dataset.apply_field(self._gen_bigram, field_name='raw_chars',
                            new_field_name='bigrams')
    return dataset

def construct_dataset(sentences):
    dataset = DataSet()
    for sentence in sentences:
        instance = Instance()
        instance['raw_sentence'] = sentence
        dataset.append(instance)
    return dataset

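def _demo_construct_dataset():
    # Hedged usage sketch (added for illustration; not part of the original
    # source), using two toy sentences.
    ds = construct_dataset(["hello world", "a second raw sentence"])
    print(len(ds))                # 2
    print(ds[0]['raw_sentence'])  # "hello world"
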
def read_file(filename, processing_word=get_processing_word(lowercase=False)):
    dataset = DataSet()
    niter = 0
    with codecs.open(filename, "r", "utf-16") as f:
        words, tags = [], []
        for line in f:
            line = line.strip()
            if len(line) == 0 or line.startswith("-DOCSTART-"):
                if len(words) != 0:
                    assert len(words) > 2
                    if niter == 1:
                        print(words, tags)
                    niter += 1
                    dataset.append(
                        Instance(ori_words=words[:-1], ori_tags=tags[:-1]))
                    words, tags = [], []
            else:
                word, tag = line.split()
                word = processing_word(word)
                words.append(word)
                tags.append(tag.lower())

    dataset.apply_field(lambda x: [x[0]], field_name='ori_words', new_field_name='task')
    dataset.apply_field(lambda x: len(x), field_name='ori_tags', new_field_name='seq_len')
    dataset.apply_field(lambda x: expand(x), field_name='ori_words', new_field_name="bi1")
    return dataset

def read_instances_from_file(file, max_len=400, keep_case=False):
    '''Collect instances and construct vocab'''
    dataset = DataSet()
    trimmed_sent = 0
    with open(file) as f:
        lines = f.readlines()
        for l in lines:
            l = l.strip().split('\t')
            if len(l) < 2:
                continue
            label = int(l[0])
            sent = l[1]
            if not keep_case:
                sent = sent.lower()
            word_lst = sent.split()
            if len(word_lst) > max_len:
                word_lst = word_lst[:max_len]
                trimmed_sent += 1
            if word_lst:
                dataset.append(Instance(words=word_lst, label=label))
    logger.info('Got {} instances from file {}'.format(len(dataset), file))
    if trimmed_sent:
        logger.info('{} sentences are trimmed. Max sentence length: {}.'
                    .format(trimmed_sent, max_len))
    return dataset

def test_Index2WordProcessor(self):
    vocab = Vocabulary()
    vocab.add_word_lst(["a", "b", "c", "d", "e"])
    proc = Index2WordProcessor(vocab, "tag_id", "tag")
    data_set = DataSet(
        [Instance(tag_id=[np.random.randint(0, 7) for _ in range(32)])])
    data_set = proc(data_set)
    self.assertTrue("tag" in data_set)

def dataset(self):
    d = DataSet()
    for key in self.data:
        for ins in self.data[key]['dataset']['chars']:
            ins = Instance(chars=ins)
            d.append(ins)
    return d

def preprocess(batch=16):
    raw_data1 = []
    raw_data2 = []
    for i in range(len(traindata.data)):
        raw_data1.append(
            Instance(sentence=traindata.data[i], label=int(traindata.target[i])))
    trainset = DataSet(raw_data1)
    trainset.apply(lambda x: pre(x['sentence']), new_field_name='words')

    for i in range(len(testdata.data)):
        raw_data2.append(
            Instance(sentence=testdata.data[i], label=int(testdata.target[i])))
    testset = DataSet(raw_data2)
    testset.apply(lambda x: pre(x['sentence']), new_field_name='words')

    global vocab
    vocab = Vocabulary(min_freq=1).from_dataset(trainset, testset, field_name='words')
    vocab.index_dataset(trainset, testset, field_name='words', new_field_name='words')

    trainset.set_input('words')
    testset.set_input('words')
    trainset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)
    testset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)
    trainset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    testset.apply(lambda x: len(x['words']), new_field_name='seq_len')

    global vocabsize
    vocabsize = len(vocab)

    sampler = BucketSampler(batch_size=batch, seq_len_field_name='seq_len')
    train_batch = Batch(batch_size=batch, dataset=trainset, sampler=sampler)
    test_batch = Batch(batch_size=batch, dataset=testset, sampler=sampler)
    return train_batch, test_batch, vocabsize

def get_all_tang(data_path=None):
    data_set = DataSet()
    if data_path is None:
        all_tang = get_all_data()
        for sample in all_tang:
            instance = Instance(raw_sentence=sample)
            data_set.append(instance)
    else:
        if os.path.exists(data_path):
            with open(data_path, 'r', encoding='utf-8') as fin:
                for lidx, line in enumerate(fin):
                    line = line.strip()
                    if line != "" and len(line) > 1:
                        instance = Instance(raw_sentence=line)
                        data_set.append(instance)
        else:
            print("the data path doesn't exist.")
    return data_set

def get_dataset(data_path):
    print('Getting dataset...')
    poetry = []
    with open(data_path, 'r', encoding='utf-8') as f:
        poem = ''
        for line in f:
            if len(line) <= 1:
                ins = Instance(text=poem)
                if len(poem) > 10:
                    poetry.append(ins)
                poem = ''
            else:
                poem += line.strip('\n')
    # print(poetry[0])
    data = DataSet(data=poetry)
    print("Original data:", data[0])

    vocabulary = Vocabulary(min_freq=2, unknown='<oov>', padding='<pad>')
    vocabulary.add_word('<eos>')
    vocabulary.add_word('<START>')
    data.apply(lambda x: [vocabulary.add(char) for char in x['text']])
    vocabulary.build_vocab()
    print('pad:', vocabulary.to_index('<pad>'))
    print('Vocab size:', len(vocabulary))

    data.apply(lambda x: [vocabulary.to_index(char) for char in x['text']],
               new_field_name='text')
    data.apply(lambda x: [vocabulary.to_index('<START>')] + x['text'] +
               [vocabulary.to_index('<eos>')], new_field_name='text')
    data.apply(
        lambda x: x['text'][0:min(config.sequence_length, len(x['text']))],
        new_field_name='text')
    data.apply(lambda x: [vocabulary.to_index('<pad>')] *
               (config.sequence_length - len(x['text'])) + x['text'],
               new_field_name='text')
    data.apply(lambda x: x['text'][0:-1], new_field_name='input')
    data.apply(lambda x: x['text'][1:], new_field_name='target')
    data.set_input('input')
    data.set_target('target')
    # length = config.sequence_length
    # for i, d in enumerate(data):
    #     if length != len(d['text']):
    #         print("wrong!")
    #         exit()
    train_data, dev_data = data.split(0.2)
    print('Train data size:', len(train_data))
    print('Dev data size:', len(dev_data))
    print("Train data:", train_data[20])
    # print("Dev data:", dev_data[0])
    return train_data, dev_data, vocabulary

def process_data_1(embed_file, cws_train):
    embed, vocab = EmbedLoader.load_without_vocab(embed_file)
    time.sleep(1)  # lets the test check whether the result is later read from the cache
    with open(cws_train, 'r', encoding='utf-8') as f:
        d = DataSet()
        for line in f:
            line = line.strip()
            if len(line) > 0:
                d.append(Instance(raw=line))
    return embed, vocab, d

def test_init_v1(self):
    # Initialization from a one-dimensional list of Instances
    ds = DataSet([Instance(x=[1, 2, 3, 4], y=[5, 6])] * 40)
    self.assertTrue("x" in ds.field_arrays and "y" in ds.field_arrays)
    self.assertEqual(ds.field_arrays["x"].content, [[1, 2, 3, 4]] * 40)
    self.assertEqual(ds.field_arrays["y"].content, [[5, 6]] * 40)