def preprocess():
    train_set = DataSet()
    for i in range(len(raw_train.data)):
        train_set.append(
            Instance(sentence=raw_train.data[i], target=int(raw_train.target[i])))
    train_set.apply(lambda x: x['sentence'].translate(
        str.maketrans("", "", string.punctuation)).lower(),
        new_field_name='sentence')
    train_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    train_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    test_set = DataSet()
    for i in range(len(raw_test.data)):
        test_set.append(
            Instance(sentence=raw_test.data[i], target=int(raw_test.target[i])))
    test_set.apply(lambda x: x['sentence'].translate(
        str.maketrans("", "", string.punctuation)).lower(),
        new_field_name='sentence')
    test_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    test_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    vocab = Vocabulary(min_freq=10)
    train_set.apply(lambda x: [vocab.add(word) for word in x['words']])
    test_set.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    vocab.index_dataset(train_set, field_name='words', new_field_name='words')
    vocab.index_dataset(test_set, field_name='words', new_field_name='words')
    return train_set, test_set, vocab
def get_vocab(dataset):
    word_vocab = Vocabulary(unknown='<unk>', padding='<pad>')
    dataset.apply(lambda x: [word_vocab.add(word) for word in x['word']])
    word_vocab.build_vocab()
    print('word vocab', len(word_vocab))

    char_vocab = Vocabulary(unknown='<unk>', padding='<pad>')
    dataset.apply(lambda x: [char_vocab.add(char) for char in x['char']])
    char_vocab.build_vocab()
    print('char vocab', len(char_vocab))

    pos_vocab = Vocabulary(unknown='<unk>', padding='<pad>')
    dataset.apply(lambda x: [pos_vocab.add(pos) for pos in x['pos']])
    pos_vocab.build_vocab()
    print('pos vocab', len(pos_vocab))

    # spo_vocab = Vocabulary(unknown='<unk>', padding='<pad>')
    # dataset.apply(lambda x: spo_vocab.add(x['spo']))
    # spo_vocab.build_vocab()
    # print('spo vocab', len(spo_vocab))

    tag_vocab = Vocabulary(unknown='<unk>', padding='<pad>')
    dataset.apply(lambda x: [tag_vocab.add(tag) for tag in x['tag']])
    # tag_vocab.add_word('start')
    # tag_vocab.add_word('end')
    tag_vocab.build_vocab()
    print('tag_vocab', len(tag_vocab))

    # return word_vocab, char_vocab, pos_vocab, spo_vocab, tag_vocab
    return word_vocab, char_vocab, pos_vocab, tag_vocab
def get_data(path):
    f = open(path, 'r')
    shi = f.read()
    shi = shi.replace('\n', '').replace('\r', '')
    shi = shi[:5000 * 64]
    sl = 64
    l_doc = len(shi)

    vocab = Vocabulary(min_freq=1)
    for i in shi:
        vocab.add(i)
    vocab.build_vocab()
    vocab_size = len(vocab)

    num_s = int(l_doc / sl)
    train_s = int(num_s * 0.8)
    test_s = num_s - train_s
    array_shi = torch.zeros(l_doc)
    train_shi = torch.zeros(train_s, sl)
    test_shi = torch.zeros(test_s, sl)
    print(train_shi.size())
    print(test_shi.size())

    for i, j in enumerate(shi):
        array_shi[i] = vocab[j]
    array_shi = array_shi.view(-1, sl)
    train_shi[:, :] = array_shi[:train_s, :]
    test_shi[:, :] = array_shi[train_s:, :]
    return vocab, train_shi, test_shi
def data_process():
    with open('./data.txt', encoding='utf-8') as fp:
        out = fp.readlines()
    data = list(out)
    poem = []
    cnt = 0
    for temp in data:
        cnt += 1
        if cnt % 2 == 0:
            rec = re.sub(',', '', temp)
            poem.append(rec[:-1])

    poem_normalized = []
    for i in range(len(poem)):
        if len(poem[i]) < 80:
            poem[i] = ' ' * (80 - len(poem[i])) + poem[i]
            poem_normalized.append(poem[i])
        else:
            poem_normalized.append(poem[i][:80])

    vocab = Vocabulary(min_freq=2)
    for temp in poem_normalized:
        for x in temp:
            vocab.add(x)
    vocab.build_vocab()

    dataset = []
    for temp in poem_normalized:
        dataset.append([vocab.to_index(x) for x in temp])
    return vocab, np.array(dataset)
def prepareVocab(poemList):
    vocab = Vocabulary()
    for poem in poemList:
        for character in poem:
            vocab.add(character)
    vocab.build_vocab()
    return vocab
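# Illustrative usage sketch (added; not from the original source): builds a
# character-level Vocabulary with prepareVocab and maps one poem to indices.
# It assumes only the fastNLP Vocabulary API already used throughout this file.
def _prepare_vocab_usage_example():
    poems = ['床前明月光', '疑是地上霜']
    vocab = prepareVocab(poems)
    # every distinct character gets an index; unknown/padding defaults apply
    indices = [vocab.to_index(ch) for ch in poems[0]]
    print(len(vocab), indices)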
def pre_process(file_name):
    poem = []
    with open(file_name, 'r', encoding='utf-8') as f:
        for index, line in enumerate(f.readlines()):
            if index % 2 == 1:
                raw_line = line.strip()
                raw_line = re.sub(',', '', raw_line)
                raw_line = re.sub('。', '', raw_line)
                length = len(raw_line)
                if length < 100:
                    raw_line = raw_line + '~' * (100 - length)
                poem.append(raw_line[:100])

    word_dict = Vocabulary()
    for line in poem:
        for character in line:
            word_dict.add(character)
    word_dict.build_vocab()

    data = []
    for pi in poem:
        p = []
        for ch in pi:
            p.append(word_dict.to_index(ch))
        data.append(p)
    data = np.array(data)
    return word_dict, data
def load_conll_with_glove(
        data_dir,
        data_path='train.pos',
        glove_path="",
        # glove_path='/remote-home/ygxu/dataset/glove.empty.txt',
        load_glove=True,
        vocabs=None):
    path = os.path.join(data_dir, data_path)
    print(f"start loading dataset from {path}.")

    from dataset import MyConllLoader
    ds = MyConllLoader().load(path)
    print(ds)
    ds.rename_field('word_seq', 'sentence')
    ds.rename_field('label_seq', 'label')
    # ds = DataSet.read_pos(path, headers=('sentence', 'label'), sep='\t')
    # ds.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')
    # ds.apply(lambda x: x['sentence'].strip().split(), new_field_name='sentence')
    ds.apply(lambda x: len(x['sentence']) * [1.],
             new_field_name='word_seq_origin_len', is_input=True)

    if vocabs is None:
        vocab = Vocabulary(max_size=30000, min_freq=2, unknown='<unk>', padding='<pad>')
        ds.apply(lambda x: [vocab.add(word) for word in x['sentence']])
        vocab.build_vocab()
        vocab_label = Vocabulary(max_size=200, unknown=None, padding='<pad>')
        ds.apply(lambda x: [vocab_label.add(label) for label in x['label']])
        vocab_label.build_vocab()
    else:
        vocab, vocab_label = vocabs

    ds.apply(lambda x: [vocab.to_index(w) for w in x['sentence']],
             new_field_name='word_seq', is_input=True)
    ds.apply(lambda x: [vocab_label.to_index(w) for w in x['label']],
             new_field_name='truth', is_input=True, is_target=True)

    if not load_glove:
        print(f"successfully loaded dataset from {path}")
        return ds

    embedding, _ = EmbedLoader().load_embedding(300, glove_path, 'glove', vocab)
    print(f"successfully loaded dataset and embedding from {path}")
    return ds, embedding, (vocab, vocab_label)
def test_additional_update(self):
    vocab = Vocabulary()
    vocab.update(text)

    _ = vocab["well"]
    self.assertEqual(vocab.rebuild, False)

    vocab.add("hahaha")
    self.assertEqual(vocab.rebuild, True)

    _ = vocab["hahaha"]
    self.assertEqual(vocab.rebuild, False)
    self.assertTrue("hahaha" in vocab)
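# Illustrative sketch (added; not from the original tests): Vocabulary defers
# re-building its index until the next lookup, which is exactly the behaviour
# the test above asserts via the `rebuild` flag.
def _vocab_lazy_rebuild_example():
    vocab = Vocabulary()
    vocab.update(['well', 'done'])
    _ = vocab['well']        # first lookup triggers the initial build
    vocab.add('hahaha')      # adding after a build marks the vocab for rebuild
    print(vocab.rebuild)     # True
    _ = vocab['hahaha']      # the next lookup rebuilds the index
    print(vocab.rebuild)     # False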
def Get_Data_Vocab(path):
    s = ""
    with open(path, "r", encoding='UTF-8') as f:
        for line in f:
            s += line.rstrip('\r\n') + "#"
    sentences = s.split("#")
    dataset = construct_dataset(sentences)
    dataset.apply(cut_pad, new_field_name='words')  # keep every poem the same length

    # split into dev and train sets
    dev_data, train_data = dataset.split(0.8)

    # build the vocabulary with Vocabulary.add(word)
    vocab = Vocabulary(padding="<pad>", min_freq=2)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    print(vocab.idx2word)

    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')

    train_data.apply(lambda x: x['words'][:-1], new_field_name="input")
    train_data.apply(lambda x: x['words'][1:], new_field_name="target")
    dev_data.apply(lambda x: x['words'][:-1], new_field_name="input")
    dev_data.apply(lambda x: x['words'][1:], new_field_name="target")

    train_data.set_input("input")
    train_data.set_target("target")
    dev_data.set_input("input")
    dev_data.set_target("target")
    return vocab, train_data, dev_data
def construct_vec(test_data, train_data):
    try:
        f = open('idx2word.json', 'r', encoding='utf-8')
        idx2word = json.load(f)
        idx2word = {int(k): v for k, v in idx2word.items()}
        f = open('word2idx.json', 'r', encoding='utf-8')
        word2idx = json.load(f)
        word2idx = {k: int(v) for k, v in word2idx.items()}
    except OSError:
        vocab = Vocabulary(min_freq=2, unknown='<unk>', padding='<pad>')
        train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
        vocab.build_vocab()
        idx2word = vocab.idx2word
        word2idx = vocab.word2idx
        f = open('idx2word.json', 'w', encoding='utf-8')
        json.dump(idx2word, f)
        f = open('word2idx.json', 'w', encoding='utf-8')
        json.dump(word2idx, f)

    train_data.apply(lambda x: [w2i(word2idx, word) for word in x['words']],
                     new_field_name='word_seq', is_input=True)
    test_data.apply(lambda x: [w2i(word2idx, word) for word in x['words']],
                    new_field_name='word_seq', is_input=True)
    return test_data, train_data, idx2word, word2idx
def build_dataset(train_size, test_rate, categories):
    vocab = load('../data/vocab')
    train_set = load('../data/train_set')
    test_set = load('../data/test_set')
    if vocab is not None and train_set is not None and test_set is not None:
        return vocab, train_set, test_set

    train, test = get_20newsgroups_data(categories)
    train_set = create_dataset(train, train_size)
    test_set = create_dataset(test, int(train_size * test_rate))

    # vocabulary
    vocab = Vocabulary(min_freq=10)
    test_set.apply(lambda x: [vocab.add(word) for word in x['word_seq']])
    vocab.build_vocab()

    # word_seq to int
    train_set.apply(lambda x: [vocab.to_index(word) for word in x['word_seq']], new_field_name='word_seq')
    test_set.apply(lambda x: [vocab.to_index(word) for word in x['word_seq']], new_field_name='word_seq')

    # tag
    train_set.set_input('word_seq')
    train_set.set_target('target')
    test_set.set_input('word_seq')
    test_set.set_target('target')

    save('../data/vocab', vocab)
    save('../data/train_set', train_set)
    save('../data/test_set', test_set)
    return vocab, train_set, test_set
def build_worddict(dataset):
    """
    Build a dictionary associating words from a set of premises and
    hypotheses to unique integer indices.

    Args:
        dataset: A dictionary containing the premises and hypotheses for
            which a worddict must be built. The dictionary is assumed to
            have the same form as the dicts built by the 'read_data'
            function of this module.
        num_words: Integer indicating the maximum number of words to keep
            in the worddict. If specified, only the 'num_words' most
            frequent words will be kept. If set to None, all words are
            kept. Defaults to None.

    Returns:
        A dictionary associating words to integer indices.
    """
    """
    vocab = Vocabulary(num_words)
    for ins in dataset:
        for word in ins['premise']:
            vocab.add(word)
        for word in ins['hypothesis']:
            vocab.add(word)
    vocab.build_vocab()
    """
    vocab = Vocabulary(unknown='_OOV_', padding='_PAD_')
    dataset.apply(lambda x: [vocab.add(word) for word in x['premise']])
    dataset.apply(lambda x: [vocab.add(word) for word in x['hypothesis']])
    vocab.build_vocab()
    return vocab
def get_vocab(dataset):
    vocabulary = Vocabulary(unknown='<unk>', padding='<pad>')
    dataset.apply(lambda x: [vocabulary.add(char) for char in x['text']])
    vocabulary.build_vocab()
    print('vocab', len(vocabulary))
    return vocabulary
def get_fastnlp_dataset():
    text_train, text_test = get_text_classification_datasets()
    train_data = DataSet()
    test_data = DataSet()
    for i in range(len(text_train.data)):
        train_data.append(
            Instance(text=split_sent(text_train.data[i]),
                     target=int(text_train.target[i])))
    for i in range(len(text_test.data)):
        test_data.append(
            Instance(text=split_sent(text_test.data[i]),
                     target=int(text_test.target[i])))

    # build the vocabulary
    vocab = Vocabulary(min_freq=5, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['text']])
    vocab.build_vocab()

    # map sentences to indices with the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['text']], new_field_name='word_seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['text']], new_field_name='word_seq')

    # set the input and target fields
    train_data.set_input("word_seq")
    test_data.set_input("word_seq")
    train_data.set_target("target")
    test_data.set_target("target")

    return train_data, test_data, vocab
def process_poems_large(file_name, sentence_len, vocab_size):
    sentences = []
    with open(file_name, "r", encoding='utf-8') as f:
        for line in f.readlines():
            try:
                line = line.strip()
                if line:
                    title, content = line.split(':')
                    # print(title)
                    # print(content)
                    # content = line.replace(' ', '').replace(',', '').replace('。', '')
                    content = content.replace(' ', '')  # keep punctuation
                    # optionally keep only five-character lines:
                    # if len(content) < 6 or content[5] != ',':
                    #     continue
                    if len(content) < 20:
                        continue
                    if ':' in content or '_' in content or '(' in content or '(' in content \
                            or '《' in content or '[' in content:
                        continue
                    # truncate to sentence_len
                    if len(content) > sentence_len:
                        content = content[:sentence_len]
                    content = content + end_token
                    sentences.append(content)
            except ValueError as e:
                pass

    dataset = DataSet()
    # sentences = random.sample(sentences, 5000)
    for sentence in sentences:
        instance = Instance()
        instance['raw_sentence'] = sentence
        instance['target'] = sentence[1:] + sentence[-1]
        dataset.append(instance)
    dataset.set_input("raw_sentence")
    dataset.set_target("target")
    # for iter in dataset:
    #     print(iter['raw_sentence'])
    print("dataset_size:", len(dataset))

    train_data, dev_data = dataset.split(0.2)
    train_data.rename_field("raw_sentence", "sentence")
    dev_data.rename_field("raw_sentence", "sentence")

    # build the vocabulary
    vocab = Vocabulary(max_size=vocab_size, min_freq=2, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['sentence']])
    vocab.build_vocab()

    # index the sentences with the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['sentence']], new_field_name='sentence')
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['target']], new_field_name='target')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['sentence']], new_field_name='sentence')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['target']], new_field_name='target')
    print("vocabulary_size:", len(vocab))
    return train_data, dev_data, vocab
class JokeData(object):
    data_set = None
    train_data = None
    test_data = None
    vocab = None
    data_num = 0
    vocab_size = 0
    max_seq_len = 0

    def __init__(self, conf):
        print(conf.data_path)
        self.data_set = get_joke_data(conf.data_path)
        self.data_num = len(self.data_set)
        self.data_set.apply(self.split_sent, new_field_name='words')
        self.max_seq_len = min(self.max_seq_len, conf.max_seq_len)
        self.data_set.apply(lambda x: len(x['words']), new_field_name='seq_len')
        self.train_data, self.test_data = self.data_set.split(0.2)

    def split_chinese_sent(self, ins, remove_punc=False):
        line = ins['raw_joke'].strip()
        words = ['<START>']
        for c in line:
            if c in [',', '。', '?', '!']:
                if remove_punc:
                    continue
                else:
                    words.append(c)
            else:
                words.append(c)
        words.append('<EOS>')
        self.max_seq_len = max(self.max_seq_len, len(words))
        return words

    def split_sent(self, ins, remove_punc=False):
        words = ['<START>'] + ins['raw_joke'].split() + ['<EOS>']
        self.max_seq_len = max(self.max_seq_len, len(words))
        return words

    def pad_seq(self, ins):
        words = ins['words']
        if len(words) < self.max_seq_len:
            words = [0] * (self.max_seq_len - len(words)) + words
        else:
            words = words[:self.max_seq_len]
        return words

    def get_vocab(self):
        self.vocab = Vocabulary(min_freq=10)
        self.train_data.apply(lambda x: [self.vocab.add(word) for word in x['words']])
        self.vocab.build_vocab()
        self.vocab.build_reverse_vocab()
        self.vocab_size = len(self.vocab)

        self.train_data.apply(lambda x: [self.vocab.to_index(word) for word in x['words']],
                              new_field_name='words')
        self.train_data.apply(self.pad_seq, new_field_name='pad_words')
        self.test_data.apply(lambda x: [self.vocab.to_index(word) for word in x['words']],
                             new_field_name='words')
        self.test_data.apply(self.pad_seq, new_field_name='pad_words')
def get_dataset(data_path):
    print('Getting dataset...')
    poetry = []
    with open(data_path, 'r', encoding='utf-8') as f:
        poem = ''
        for line in f:
            if len(line) <= 1:
                ins = Instance(text=poem)
                if len(poem) > 10:
                    poetry.append(ins)
                poem = ''
            else:
                poem += line.strip('\n')
    # print(poetry[0])
    data = DataSet(data=poetry)
    print("Original data:", data[0])

    vocabulary = Vocabulary(min_freq=2, unknown='<oov>', padding='<pad>')
    vocabulary.add_word('<eos>')
    vocabulary.add_word('<START>')
    data.apply(lambda x: [vocabulary.add(char) for char in x['text']])
    vocabulary.build_vocab()
    print('pad:', vocabulary.to_index('<pad>'))
    print('Vocab size:', len(vocabulary))

    data.apply(lambda x: [vocabulary.to_index(char) for char in x['text']],
               new_field_name='text')
    data.apply(lambda x: [vocabulary.to_index('<START>')] + x['text'] + [vocabulary.to_index('<eos>')],
               new_field_name='text')
    data.apply(lambda x: x['text'][0:min(config.sequence_length, len(x['text']))],
               new_field_name='text')
    data.apply(lambda x: [vocabulary.to_index('<pad>')] * (config.sequence_length - len(x['text'])) + x['text'],
               new_field_name='text')
    data.apply(lambda x: x['text'][0:-1], new_field_name='input')
    data.apply(lambda x: x['text'][1:], new_field_name='target')
    data.set_input('input')
    data.set_target('target')

    # length = config.sequence_length
    # for i, d in enumerate(data):
    #     if length != len(d['text']):
    #         print("wrong!")
    #         exit()

    train_data, dev_data = data.split(0.2)
    print('Train data size:', len(train_data))
    print('Dev data size:', len(dev_data))
    print("Train data:", train_data[20])
    # print("Dev data:", dev_data[0])
    return train_data, dev_data, vocabulary
def readdata():
    global target_len
    min_count = 10
    # categories = ['comp.os.ms-windows.misc', 'rec.motorcycles', 'sci.space', 'talk.politics.misc']
    dataset_train = fetch_20newsgroups(subset='train', data_home='../../..')
    dataset_test = fetch_20newsgroups(subset='test', data_home='../../..')

    data = dataset_train.data
    target = dataset_train.target
    target_len = len(dataset_train.target_names)
    train_data = DataSet()
    padding = 0
    for i in range(len(data)):
        data_t = re.sub(r"\d+|\s+|/", " ", data[i])
        temp = [word.strip(string.punctuation).lower()
                for word in data_t.split() if word.strip(string.punctuation) != '']
        train_data.append(Instance(raw=data[i], label=int(target[i]), words=temp))
        if len(temp) > padding:
            padding = len(temp)
    train_data.apply(lambda x: x['raw'].lower(), new_field_name='raw')

    data = dataset_test.data
    target = dataset_test.target
    test_data = DataSet()
    padding = 0
    for i in range(len(data)):
        data_t = re.sub(r"\d+|\s+|/", " ", data[i])
        temp = [word.strip(string.punctuation).lower()
                for word in data_t.split() if word.strip(string.punctuation) != '']
        test_data.append(Instance(raw=data[i], label=int(target[i]), words=temp))
        if len(temp) > padding:
            padding = len(temp)
    test_data.apply(lambda x: x['raw'].lower(), new_field_name='raw')

    train_data.apply(lambda x: len(x['words']), new_field_name='len')
    test_data.apply(lambda x: len(x['words']), new_field_name='len')

    vocab = Vocabulary(min_freq=10)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='seq')

    train_data.rename_field('seq', Const.INPUT)
    train_data.rename_field('len', Const.INPUT_LEN)
    train_data.rename_field('label', Const.TARGET)
    test_data.rename_field('seq', Const.INPUT)
    test_data.rename_field('len', Const.INPUT_LEN)
    test_data.rename_field('label', Const.TARGET)

    test_data.set_input(Const.INPUT, Const.INPUT_LEN)
    test_data.set_target(Const.TARGET)
    train_data.set_input(Const.INPUT, Const.INPUT_LEN)
    train_data.set_target(Const.TARGET)

    test_data, dev_data = test_data.split(0.5)
    return train_data, dev_data, test_data, vocab
def test_fastnlp_1min_tutorial(self):
    # tutorials/fastnlp_1min_tutorial.ipynb
    data_path = "test/data_for_tests/tutorial_sample_dataset.csv"
    ds = DataSet.read_csv(data_path, headers=('raw_sentence', 'label'), sep='\t')
    print(ds[1])

    # lower-case all raw sentences
    ds.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')

    # convert the label to int
    ds.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)

    def split_sent(ins):
        return ins['raw_sentence'].split()

    ds.apply(split_sent, new_field_name='words', is_input=True)

    # split into training and dev sets
    train_data, dev_data = ds.split(0.3)
    print("Train size: ", len(train_data))
    print("Test size: ", len(dev_data))

    from fastNLP import Vocabulary
    vocab = Vocabulary(min_freq=2)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])

    # index the sentences with Vocabulary.to_index(word)
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                     new_field_name='words', is_input=True)
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                   new_field_name='words', is_input=True)

    from fastNLP.models import CNNText
    model = CNNText((len(vocab), 50), num_classes=5, padding=2, dropout=0.1)

    from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam
    trainer = Trainer(model=model,
                      train_data=train_data,
                      dev_data=dev_data,
                      loss=CrossEntropyLoss(),
                      optimizer=Adam(),
                      metrics=AccuracyMetric(target='target'))
    trainer.train()
    print('Train finished!')
def get_vocabulary(dataset):
    vocabulary = Vocabulary(min_freq=2, unknown='<oov>', padding='<pad>')
    # vocabulary.add_word('<eos>')
    # vocabulary.add_word('<start>')
    dataset.apply(lambda x: [vocabulary.add(word) for word in x['input']])
    vocabulary.build_vocab()
    print('pad:', vocabulary.to_index('<pad>'))
    print('Vocab size:', len(vocabulary))
    return vocabulary
def get_vocabulary(train_data, test_data):
    # build the vocabulary with Vocabulary.add(word)
    vocab = Vocabulary(min_freq=0, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['poem']])
    vocab.build_vocab()

    # index the sentences with Vocabulary.to_index(word)
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['poem']], new_field_name='words')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['poem']], new_field_name='words')
    return vocab, train_data, test_data
def build_vocab(dataset_list, key):
    """
    Build a vocab from the given datasets on a certain key.

    :param dataset_list: list of DataSet
    :param key: string, the field to build the vocab from
    :return vocab: Vocabulary, the vocab created
    """
    vocab = Vocabulary(min_freq=1)
    for dataset in dataset_list:
        dataset.apply(lambda x: [vocab.add(word) for word in x[key]])
    vocab.build_vocab()
    return vocab
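# Illustrative usage sketch for build_vocab (added; not from the original source).
# It assumes only the fastNLP DataSet / Instance / Vocabulary API already used
# throughout this file; the toy field name 'words' is an example choice.
def _build_vocab_usage_example():
    from fastNLP import DataSet, Instance
    toy_train = DataSet()
    toy_train.append(Instance(words=['the', 'cat', 'sat']))
    toy_dev = DataSet()
    toy_dev.append(Instance(words=['the', 'dog', 'ran']))
    # one vocabulary built over both datasets on the 'words' field
    vocab = build_vocab([toy_train, toy_dev], key='words')
    print(len(vocab), vocab.to_index('the'))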
def __init__(self, path=".data/yelp", dataset="yelp", batch_size=32):
    if dataset == "yelp":
        dataset = DataSet()
        for db_set in ['train']:
            text_file = os.path.join(path, 'sentiment.' + db_set + '.text')
            label_file = os.path.join(path, 'sentiment.' + db_set + '.labels')
            with io.open(text_file, 'r', encoding="utf-8") as tf, \
                    io.open(label_file, 'r', encoding="utf-8") as lf:
                for text in tf:
                    label = lf.readline()
                    dataset.append(Instance(text=text, label=label))

        dataset.apply(lambda x: x['text'].lower(), new_field_name='text')
        dataset.apply(lambda x: ['<start>'] + x['text'].split() + ['<eos>'],
                      new_field_name='words')
        dataset.drop(lambda x: len(x['words']) > 1 + 15 + 1)
        dataset.apply(lambda x: x['words'] + ['<pad>'] * (17 - len(x['words'])),
                      new_field_name='words')
        dataset.apply(lambda x: int(x['label']), new_field_name='label_seq', is_target=True)

        _train_data, _test_data = dataset.split(0.3)

        _vocab = Vocabulary(min_freq=2)
        _train_data.apply(lambda x: [_vocab.add(word) for word in x['words']])
        _vocab.build_vocab()

        _train_data.apply(lambda x: [_vocab.to_index(word) for word in x['words']],
                          new_field_name='word_seq', is_input=True)
        _test_data.apply(lambda x: [_vocab.to_index(word) for word in x['words']],
                         new_field_name='word_seq', is_input=True)

        self.train_data = _train_data
        self.test_data = _test_data
        self.vocab = _vocab
        self.batch_size = batch_size
        self.train_iter = iter(Batch(dataset=self.train_data,
                                     batch_size=self.batch_size,
                                     sampler=SequentialSampler()))
def pre_process():
    path = os.getcwd() + '/poem.txt'
    with open(path, encoding='utf-8') as file:
        sentence = file.readlines()
    data = list(sentence)
    poem = []
    line = 0
    for unit in data:
        line += 1
        if line % 2 == 0:
            tmp = re.sub(',', '', unit)
            sent = re.sub('。', '', tmp)
            poem.append(sent[:-1])

    padding_poem = []
    for sentence in poem:
        if len(sentence) < 80:
            sentence = ' ' * (80 - len(sentence)) + sentence
            padding_poem.append(sentence)
        else:
            padding_poem.append(sentence[:80])

    vocab = Vocabulary()
    for line in padding_poem:
        for character in line:
            vocab.add(character)
    vocab.build_vocab()

    train_data = []
    for poetry in padding_poem:
        p = []
        for char in poetry:
            p.append(vocab.to_index(char))
        train_data.append(p)
    train_data = np.array(train_data)
    return vocab, train_data
def preprocess():
    train_set = DataSet()
    for i in range(len(raw_train['data'])):
        di = transfer(raw_train['data'][i])
        train_set.append(
            Instance(sentence=di, target=int(raw_train['target'][i])))
    train_set.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')
    train_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    train_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    test_set = DataSet()
    for i in range(len(raw_test['data'])):
        di = transfer(raw_test['data'][i])
        test_set.append(
            Instance(sentence=di, target=int(raw_test['target'][i])))
    test_set.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')
    test_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    test_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    word_dict = Vocabulary(min_freq=2)
    train_set.apply(lambda x: [word_dict.add(word) for word in x['words']])
    test_set.apply(lambda x: [word_dict.add(word) for word in x['words']])
    word_dict.build_vocab()
    word_dict.index_dataset(train_set, field_name='words', new_field_name='words')
    word_dict.index_dataset(test_set, field_name='words', new_field_name='words')
    return train_set, test_set, word_dict
def process_poems(file_name, sentence_len, vocab_size):
    sentences = []
    with open(file_name, "r", encoding='utf-8') as f:
        for line in f.readlines():
            try:
                line = line.strip()
                if line:
                    # content = line.replace(' ', '').replace(',', '').replace('。', '')
                    content = line.replace(' ', '')  # keep punctuation
                    if len(content) < 10 or len(content) > sentence_len:
                        continue
                    # print(content)
                    content = content + end_token
                    sentences.append(content)
            except ValueError as e:
                pass

    dataset = DataSet()
    for sentence in sentences:
        instance = Instance()
        instance['raw_sentence'] = sentence
        instance['target'] = sentence[1:] + sentence[-1]
        dataset.append(instance)
    dataset.set_input("raw_sentence")
    dataset.set_target("target")
    # for iter in dataset:
    #     print(iter)
    print("dataset_size:", len(dataset))

    train_data, dev_data = dataset.split(0.2)
    train_data.rename_field("raw_sentence", "sentence")
    dev_data.rename_field("raw_sentence", "sentence")

    # build the vocabulary
    vocab = Vocabulary(max_size=vocab_size, min_freq=2, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['sentence']])
    vocab.build_vocab()
    print("vocabulary_size:", len(vocab))

    # index the sentences with the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['sentence']], new_field_name='sentence')
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['target']], new_field_name='target')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['sentence']], new_field_name='sentence')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['target']], new_field_name='target')
    return train_data, dev_data, vocab
def load_dataset_with_glove(data_dir,
                            data_path='mr.task.train',
                            glove_path="",
                            load_glove=True,
                            vocabs=None):
    path = os.path.join(data_dir, data_path)
    print(f"start loading dataset from {path}.")

    ds = DataSet.read_csv(path, headers=('label', 'sentence'), sep='\t')
    ds.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')
    ds.apply(lambda x: x['sentence'].strip().split(), new_field_name='sentence')
    ds.apply(lambda x: len(x['sentence']) * [1.], new_field_name='mask', is_input=True)
    ds.apply(lambda x: int(x['label']), new_field_name='label', is_target=True)

    if vocabs is None:
        vocab = Vocabulary(max_size=30000, min_freq=2, unknown='<unk>', padding='<pad>')
        ds.apply(lambda x: [vocab.add(word) for word in x['sentence']])
        vocab.build_vocab()
    else:
        vocab = vocabs

    ds.apply(lambda x: [vocab.to_index(w) for w in x['sentence']],
             new_field_name='data', is_input=True)

    if not load_glove:
        print(f"successfully loaded dataset from {path}")
        return ds

    embedding, _ = EmbedLoader().load_embedding(300, glove_path, 'glove', vocab)
    print(f"successfully loaded dataset and embedding from {path}")
    return ds, embedding, vocab
def get_text_classification_datasets(num=10):
    categories = target_name[:num]
    train = fetch_20newsgroups(subset='train', categories=categories, data_home='../../..')
    test = fetch_20newsgroups(subset='test', categories=categories, data_home='../../..')
    train_data, train_target = [delete_char(doc) for doc in train.data], train.target.tolist()
    test_data, test_target = [delete_char(doc) for doc in test.data], test.target.tolist()

    # transform to DataSet()
    dataset_train, dataset_test = DataSet(), DataSet()
    max_len = 0
    for i in range(len(train_data)):
        dataset_train.append(Instance(doc_words=train_data[i], target=train_target[i]))
        if max_len < len(train_data[i]):
            max_len = len(train_data[i])
    for i in range(len(test_data)):
        dataset_test.append(Instance(doc_words=test_data[i], target=test_target[i]))
        if max_len < len(test_data[i]):
            max_len = len(test_data[i])

    # preprocess
    # drop some doc
    doc_len = lambda x: len(x['doc_words']) <= 10
    dataset_train.drop(doc_len)

    # build vocabulary
    vocab = Vocabulary(max_size=10000, min_freq=15, unknown='<unk>')
    dataset_train.apply(lambda x: [vocab.add(word) for word in x['doc_words']])
    vocab.build_vocab()

    # index
    indexF = lambda x: [vocab.to_index(word) for word in x['doc_words']]
    dataset_train.apply(indexF, new_field_name='words')
    dataset_test.apply(indexF, new_field_name='words')

    dataset_train_list = dataset_train.split(0.1)
    return dataset_train_list[0], dataset_train_list[1], dataset_test, len(vocab), max_len
def process(self, paths, config, load_vocab_file=True):
    """
    :param paths: dict  path for each dataset
    :param load_vocab_file: bool  build vocab (False) or load vocab (True)
    :return: DataBundle
        datasets: dict  keys correspond to the paths dict
        vocabs: dict  key: vocab(if "train" in paths), domain(if domain=True), tag(if tag=True)
        embeddings: optional
    """
    vocab_size = config.vocab_size

    def _merge_abstracts(abstracts):
        merged = []
        for abstract in abstracts:
            merged.extend(abstract[:self.max_concat_len] + [SEP])
        if len(abstracts) == 0:
            assert merged == []
        return merged[:-1]

    def _pad_graph_inputs(graph_inputs):
        pad_text_wd = []
        max_len = config.max_graph_enc_steps
        for graph_input in graph_inputs:
            if len(graph_input) < max_len:
                pad_num = max_len - len(graph_input)
                graph_input.extend([PAD_TOKEN] * pad_num)
            else:
                graph_input = graph_input[:max_len]
            pad_text_wd.append(graph_input)
        if len(pad_text_wd) == 0:
            pad_text_wd.append([PAD_TOKEN] * max_len)
        return pad_text_wd

    def _get_nbr_input_len(input_wd):
        enc_len = [min(len(text), config.max_graph_enc_steps) for text in input_wd]
        if len(enc_len) == 0:
            enc_len = [0]
        return enc_len

    def _pad_article(text_wd):
        token_num = len(text_wd)
        max_len = config.max_enc_steps
        if config.neighbor_process == "sep":
            max_len += self.max_concat_len * self.max_concat_num
        if token_num < max_len:
            padding = [PAD_TOKEN] * (max_len - token_num)
            article = text_wd + padding
        else:
            article = text_wd[:max_len]
        return article

    def _split_list(input_list):
        return [text.split() for text in input_list]

    def sent_tokenize(abstract):
        abs_list = abstract.split(".")
        return [(abst + ".") for abst in abs_list[:-1]]

    def _article_token_mask(text_wd):
        max_enc_len = config.max_enc_steps
        if config.neighbor_process == "sep":
            max_enc_len += self.max_concat_len * self.max_concat_num
        token_num = len(text_wd)
        if token_num < max_enc_len:
            mask = [1] * token_num + [0] * (max_enc_len - token_num)
        else:
            mask = [1] * max_enc_len
        return mask

    def generate_article_input(text, abstracts):
        if config.neighbor_process == "sep":
            text_wd = text.split()[:config.max_enc_steps]
            text_wd.append(SEP)
            abstracts_wd = _merge_abstracts(abstracts)
            return text_wd + abstracts_wd
        else:
            return text.split()

    def generate_graph_inputs(graph_struct):
        graph_inputs_ = [graph_strut_dict[pid][config.graph_input_type] for pid in graph_struct]
        return _split_list(graph_inputs_[1:])

    def generate_graph_structs(paper_id):
        sub_graph_dict = {}
        sub_graph_set = []

        n_hop = config.n_hop
        max_neighbor_num = config.max_neighbor_num
        k_nbrs = _k_hop_neighbor(paper_id, n_hop, max_neighbor_num)
        for sub_g in k_nbrs:
            sub_graph_set += sub_g

        for node in sub_graph_set:
            sub_graph_dict[node] = []

        for sub_g in k_nbrs:
            for centre_node in sub_g:
                nbrs = graph_strut_dict[centre_node]['references']
                c_nbrs = list(set(nbrs).intersection(sub_graph_set))
                sub_graph_dict[centre_node].extend(c_nbrs)
                for c_nbr in c_nbrs:
                    sub_graph_dict[c_nbr].append(centre_node)
        # in python 3.6, the first in subgraph dict is source paper
        return sub_graph_dict

    def _k_hop_neighbor(paper_id, n_hop, max_neighbor):
        sub_graph = [[] for _ in range(n_hop + 1)]
        level = 0
        visited = set()
        q = deque()
        q.append([paper_id, level])
        curr_node_num = 0
        while len(q) != 0:
            paper_first = q.popleft()
            paper_id_first, level_first = paper_first
            if level_first > n_hop:
                return sub_graph
            sub_graph[level_first].append(paper_id_first)
            curr_node_num += 1
            if curr_node_num > max_neighbor:
                return sub_graph
            visited.add(paper_id_first)
            for pid in graph_strut_dict[paper_id_first]["references"]:
                if pid not in visited and pid in graph_strut_dict:
                    q.append([pid, level_first + 1])
                    visited.add(pid)
        return sub_graph

    def generate_dgl_graph(paper_id, graph_struct, nodes_num):
        g = dgl.DGLGraph()
        assert len(graph_struct) == nodes_num

        g.add_nodes(len(graph_struct))
        pid2idx = {}
        for index, key_node in enumerate(graph_struct):
            pid2idx[key_node] = index
        assert pid2idx[paper_id] == 0

        for index, key_node in enumerate(graph_struct):
            neighbor = [pid2idx[node] for node in graph_struct[key_node]]
            # add self loop
            neighbor.append(index)
            key_nodes = [index] * len(neighbor)
            g.add_edges(key_nodes, neighbor)
        return g

    train_ds = None
    dataInfo = self.load(paths)

    # pop nodes in train graph in inductive setting
    if config.mode == "test" and self.setting == "inductive":
        dataInfo.datasets.pop("train")

    graph_strut_dict = {}
    for key, ds in dataInfo.datasets.items():
        for ins in ds:
            graph_strut_dict[ins["paper_id"]] = ins

    logger.info(f"the input graph G_v has {len(graph_strut_dict)} nodes")

    for key, ds in dataInfo.datasets.items():
        # process summary
        ds.apply(lambda x: x['abstract'].split(), new_field_name='summary_wd')
        ds.apply(lambda x: sent_tokenize(x['abstract']), new_field_name='abstract_sentences')
        # generate graph
        ds.apply(lambda x: generate_graph_structs(x["paper_id"]), new_field_name="graph_struct")
        ds.apply(lambda x: generate_graph_inputs(x["graph_struct"]), new_field_name='graph_inputs_wd')
        ds.apply(lambda x: len(x["graph_inputs_wd"]) + 1, new_field_name="nodes_num")
        # pad input
        ds.apply(lambda x: generate_article_input(x['introduction'], x["graph_inputs_wd"]),
                 new_field_name='input_wd')
        ds.apply(lambda x: _article_token_mask(x["input_wd"]), new_field_name="enc_len_mask")
        ds.apply(lambda x: sum(x["enc_len_mask"]), new_field_name="enc_len")
        ds.apply(lambda x: _pad_article(x["input_wd"]), new_field_name="pad_input_wd")
        ds.apply(lambda x: _get_nbr_input_len(x["graph_inputs_wd"]), new_field_name="nbr_inputs_len")
        ds.apply(lambda x: _pad_graph_inputs(x["graph_inputs_wd"]), new_field_name="pad_graph_inputs_wd")
        if key == "train":
            train_ds = ds

    vocab_dict = {}
    if not load_vocab_file:
        logger.info("[INFO] Build new vocab from training dataset!")
        if train_ds is None:
            raise ValueError("Lack train file to build vocabulary!")

        vocabs = Vocabulary(max_size=config.vocab_size - 2, padding=PAD_TOKEN, unknown=UNKNOWN_TOKEN)
        vocabs.from_dataset(train_ds, field_name=["input_wd", "summary_wd"])
        vocabs.add_word(START_DECODING)
        vocabs.add_word(STOP_DECODING)
        vocab_dict["vocab"] = vocabs
        # save vocab
        with open(os.path.join(config.train_path, "vocab"), "w", encoding="utf8") as f:
            for w, idx in vocabs:
                f.write(str(w) + "\t" + str(idx) + "\n")
        logger.info("build new vocab ends.. please reRun the code with load_vocab = True")
        exit(0)
    else:
        logger.info("[INFO] Load existing vocab from %s!" % config.vocab_path)
        word_list = []
        cnt = 3  # pad and unk
        if config.neighbor_process == "sep":
            cnt += 1

        with open(config.vocab_path, 'r', encoding='utf8') as vocab_f:
            for line in vocab_f:
                pieces = line.split("\t")
                word_list.append(pieces[0])
                cnt += 1
                if cnt > vocab_size:
                    break

        vocabs = Vocabulary(max_size=vocab_size, padding=PAD_TOKEN, unknown=UNKNOWN_TOKEN)
        vocabs.add_word_lst(word_list)
        vocabs.add(START_DECODING)
        vocabs.add(STOP_DECODING)
        if config.neighbor_process == "sep":
            vocabs.add(SEP)
        vocabs.build_vocab()
        vocab_dict["vocab"] = vocabs

    logger.info(f"vocab size = {len(vocabs)}")
    assert len(vocabs) == config.vocab_size
    dataInfo.set_vocab(vocabs, "vocab")

    for key, dataset in dataInfo.datasets.items():
        # do not process the training set in test mode
        if config.mode == "test" and key == "train":
            continue
        data_dict = {
            "enc_input": [],
            "nbr_inputs": [],
            "graph": [],
            "dec_input": [],
            "target": [],
            "dec_len": [],
            "article_oovs": [],
            "enc_input_extend_vocab": [],
        }
        logger.info(f"start construct the input of the model for {key} set, please wait...")
        for instance in dataset:
            graph_inputs = instance["pad_graph_inputs_wd"]
            abstract_sentences = instance["summary_wd"]
            enc_input = instance["pad_input_wd"]
            enc_input, nbr_inputs, dec_input, target, dec_len, article_oovs, enc_input_extend_vocab = \
                getting_full_info(enc_input, graph_inputs, abstract_sentences,
                                  dataInfo.vocabs['vocab'], config)
            graph = generate_dgl_graph(instance["paper_id"], instance["graph_struct"], instance["nodes_num"])

            data_dict["graph"].append(graph)
            data_dict["enc_input"].append(enc_input)
            data_dict["nbr_inputs"].append(nbr_inputs)
            data_dict["dec_input"].append(dec_input)
            data_dict["target"].append(target)
            data_dict["dec_len"].append(dec_len)
            data_dict["article_oovs"].append(article_oovs)
            data_dict["enc_input_extend_vocab"].append(enc_input_extend_vocab)

        dataset.add_field("enc_input", data_dict["enc_input"])
        dataset.add_field("nbr_inputs", data_dict["nbr_inputs"])
        dataset.add_field("dec_input", data_dict["dec_input"])
        dataset.add_field("target", data_dict["target"])
        dataset.add_field("dec_len", data_dict["dec_len"])
        dataset.add_field("article_oovs", data_dict["article_oovs"])
        dataset.add_field("enc_input_extend_vocab", data_dict["enc_input_extend_vocab"])
        dataset.add_field("graph", data_dict["graph"])
        dataset.set_ignore_type('graph')  # without this line, there may be some errors
        dataset.set_input("graph")

        dataset.set_input("nbr_inputs_len", "nbr_inputs", "enc_len", "enc_input", "enc_len_mask",
                          "dec_input", "dec_len", "article_oovs", "nodes_num", "enc_input_extend_vocab")
        dataset.set_target("target", "article_oovs", "abstract_sentences")

        dataset.delete_field('graph_inputs_wd')
        dataset.delete_field('pad_graph_inputs_wd')
        dataset.delete_field('input_wd')
        dataset.delete_field('pad_input_wd')

    logger.info("------load dataset over---------")
    return dataInfo, vocabs
def test_fastnlp_10min_tutorial(self):
    # read data from csv into a DataSet
    sample_path = "test/data_for_tests/tutorial_sample_dataset.csv"
    dataset = CSVLoader(headers=['raw_sentence', 'label'], sep=' ')._load(sample_path)
    print(len(dataset))
    print(dataset[0])
    print(dataset[-3])

    dataset.append(Instance(raw_sentence='fake data', label='0'))

    # lower-case all raw sentences
    dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')

    # convert the label to int
    dataset.apply(lambda x: int(x['label']), new_field_name='label')

    # split sentences on whitespace
    def split_sent(ins):
        return ins['raw_sentence'].split()

    dataset.apply(split_sent, new_field_name='words')

    # add a length field
    dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    print(len(dataset))
    print(dataset[0])

    # filter out instances with DataSet.drop(func)
    dataset.drop(lambda x: x['seq_len'] <= 3, inplace=True)
    print(len(dataset))

    # decide which fields in the DataSet should be converted to tensors
    # set target: the gold labels used when computing the loss or evaluating the model
    dataset.set_target("label")
    # set input: the fields used in the model's forward pass
    dataset.set_input("words", "seq_len")

    # split into test and train sets
    test_data, train_data = dataset.split(0.5)
    print(len(test_data))
    print(len(train_data))

    # build the vocabulary with Vocabulary.add(word)
    vocab = Vocabulary(min_freq=2)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()

    # index the sentences with Vocabulary.to_index(word)
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
    print(test_data[0])

    # these preprocessing tools can also be used for projects such as reinforcement learning or GANs
    from fastNLP.core.batch import DataSetIter
    from fastNLP.core.sampler import RandomSampler

    batch_iterator = DataSetIter(dataset=train_data, batch_size=2, sampler=RandomSampler())
    for batch_x, batch_y in batch_iterator:
        print("batch_x has: ", batch_x)
        print("batch_y has: ", batch_y)
        break

    from fastNLP.models import CNNText
    model = CNNText((len(vocab), 50), num_classes=5, dropout=0.1)

    from fastNLP import Trainer
    from copy import deepcopy

    # rename DataSet fields to match the parameter names of the model's forward method
    train_data.rename_field('label', 'label_seq')
    test_data.rename_field('label', 'label_seq')

    loss = CrossEntropyLoss(target="label_seq")
    metric = AccuracyMetric(target="label_seq")

    # instantiate a Trainer with the model and data, then train
    # first overfit on test_data to make sure the model implementation is correct
    copy_model = deepcopy(model)
    overfit_trainer = Trainer(train_data=test_data,
                              model=copy_model,
                              loss=loss,
                              batch_size=32,
                              n_epochs=5,
                              dev_data=test_data,
                              metrics=metric,
                              save_path=None)
    overfit_trainer.train()

    # train on train_data and validate on test_data
    trainer = Trainer(model=model,
                      train_data=train_data,
                      dev_data=test_data,
                      loss=CrossEntropyLoss(target="label_seq"),
                      metrics=AccuracyMetric(target="label_seq"),
                      save_path=None,
                      batch_size=32,
                      n_epochs=5)
    trainer.train()
    print('Train finished!')

    # use a Tester to evaluate on test_data
    from fastNLP import Tester
    tester = Tester(data=test_data,
                    model=model,
                    metrics=AccuracyMetric(target="label_seq"),
                    batch_size=4)
    acc = tester.test()
    print(acc)