def data_process():
    with open('./data.txt', encoding='utf-8') as fp:
        data = fp.readlines()
    # Every second line holds the poem body; strip commas and the trailing newline.
    poem = []
    cnt = 0
    for temp in data:
        cnt += 1
        if cnt % 2 == 0:
            rec = re.sub(',', '', temp)
            poem.append(rec[:-1])
    # Normalize every poem to exactly 80 characters: left-pad short ones with
    # spaces, truncate longer ones.
    poem_normalized = []
    for line in poem:
        if len(line) < 80:
            poem_normalized.append(' ' * (80 - len(line)) + line)
        else:
            poem_normalized.append(line[:80])
    # Build a character vocabulary and map every poem to an index sequence.
    vocab = Vocabulary(min_freq=2)
    for temp in poem_normalized:
        for x in temp:
            vocab.add(x)
    vocab.build_vocab()
    dataset = [[vocab.to_index(x) for x in temp] for temp in poem_normalized]
    return vocab, np.array(dataset)
def build_dataset(train_size, test_rate, categories):
    # Reuse cached artifacts if they are all present.
    vocab = load('../data/vocab')
    train_set = load('../data/train_set')
    test_set = load('../data/test_set')
    if vocab is not None and train_set is not None and test_set is not None:
        return vocab, train_set, test_set

    train, test = get_20newsgroups_data(categories)
    train_set = create_dataset(train, train_size)
    test_set = create_dataset(test, int(train_size * test_rate))

    # vocabulary, built from the training split
    vocab = Vocabulary(min_freq=10)
    train_set.apply(lambda x: [vocab.add(word) for word in x['word_seq']])
    vocab.build_vocab()

    # word_seq to int
    train_set.apply(lambda x: [vocab.to_index(word) for word in x['word_seq']],
                    new_field_name='word_seq')
    test_set.apply(lambda x: [vocab.to_index(word) for word in x['word_seq']],
                   new_field_name='word_seq')

    # tag input and target fields
    train_set.set_input('word_seq')
    train_set.set_target('target')
    test_set.set_input('word_seq')
    test_set.set_target('target')

    save('../data/vocab', vocab)
    save('../data/train_set', train_set)
    save('../data/test_set', test_set)
    return vocab, train_set, test_set
def get_data(path):
    with open(path, 'r') as f:
        shi = f.read()
    shi = shi.replace('\n', '').replace('\r', '')
    shi = shi[:5000 * 64]

    sl = 64                      # sequence length per sample
    l_doc = len(shi)

    # Character-level vocabulary over the whole corpus.
    vocab = Vocabulary(min_freq=1)
    for ch in shi:
        vocab.add(ch)
    vocab.build_vocab()
    vocab_size = len(vocab)

    num_s = l_doc // sl          # number of full sequences
    train_s = int(num_s * 0.8)
    test_s = num_s - train_s

    # Drop trailing characters that do not fill a whole sequence, then
    # reshape the index array into (num_s, sl) rows.
    array_shi = torch.zeros(num_s * sl)
    train_shi = torch.zeros(train_s, sl)
    test_shi = torch.zeros(test_s, sl)
    print(train_shi.size())
    print(test_shi.size())

    for i, ch in enumerate(shi[:num_s * sl]):
        array_shi[i] = vocab[ch]
    array_shi = array_shi.view(-1, sl)
    train_shi[:, :] = array_shi[:train_s, :]
    test_shi[:, :] = array_shi[train_s:, :]
    return vocab, train_shi, test_shi
def pre_process(file_name):
    poem = []
    with open(file_name, 'r', encoding='utf-8') as f:
        for index, line in enumerate(f.readlines()):
            if index % 2 == 1:
                raw_line = line.strip()
                raw_line = re.sub(',', '', raw_line)
                raw_line = re.sub('。', '', raw_line)
                length = len(raw_line)
                if length < 100:
                    raw_line = raw_line + '~' * (100 - length)
                poem.append(raw_line[:100])
    word_dict = Vocabulary()
    for line in poem:
        for character in line:
            word_dict.add(character)
    word_dict.build_vocab()
    data = []
    for pi in poem:
        p = []
        for ch in pi:
            p.append(word_dict.to_index(ch))
        data.append(p)
    data = np.array(data)
    return word_dict, data
def Get_Data_Vocab(path):
    s = ""
    with open(path, "r", encoding='UTF-8') as f:
        for line in f:
            s += line.rstrip('\r\n') + "#"
    sentences = s.split("#")

    dataset = construct_dataset(sentences)
    dataset.apply(cut_pad, new_field_name='words')  # make every poem the same length

    # Split into dev and train sets.
    dev_data, train_data = dataset.split(0.8)

    # Build the vocabulary with Vocabulary.add(word).
    vocab = Vocabulary(padding="<pad>", min_freq=2)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    print(vocab.idx2word)

    # Index the words with the vocabulary.
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                     new_field_name='words')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                   new_field_name='words')

    # Language-model shift: input drops the last token, target drops the first.
    train_data.apply(lambda x: x['words'][:-1], new_field_name="input")
    train_data.apply(lambda x: x['words'][1:], new_field_name="target")
    dev_data.apply(lambda x: x['words'][:-1], new_field_name="input")
    dev_data.apply(lambda x: x['words'][1:], new_field_name="target")

    train_data.set_input("input")
    train_data.set_target("target")
    dev_data.set_input("input")
    dev_data.set_target("target")
    return vocab, train_data, dev_data
def _get_vocab(data_list):
    vocab = Vocabulary(unknown=unk_str, padding=pad_str)
    for l in data_list:
        vocab.add_word_lst(l)
    vocab.build_vocab()
    print('vocab', len(vocab))
    return vocab
def build_worddict(dataset):
    """
    Build a dictionary associating words from a set of premises and
    hypotheses to unique integer indices.

    Args:
        dataset: A DataSet containing the premises and hypotheses for which
            a worddict must be built. Each instance is assumed to have the
            'premise' and 'hypothesis' fields produced by the 'read_data'
            function of this module.

    Returns:
        A Vocabulary associating words to integer indices.
    """
    vocab = Vocabulary(unknown='_OOV_', padding='_PAD_')
    dataset.apply(lambda x: [vocab.add(word) for word in x['premise']])
    dataset.apply(lambda x: [vocab.add(word) for word in x['hypothesis']])
    vocab.build_vocab()
    return vocab
def construct_vec(test_data, train_data):
    try:
        # Reuse the cached index mappings if they exist.
        with open('idx2word.json', 'r', encoding='utf-8') as f:
            idx2word = json.load(f)
        idx2word = {int(k): v for k, v in idx2word.items()}
        with open('word2idx.json', 'r', encoding='utf-8') as f:
            word2idx = json.load(f)
        word2idx = {k: int(v) for k, v in word2idx.items()}
    except OSError:
        # Otherwise build the vocabulary from the training data and cache it.
        vocab = Vocabulary(min_freq=2, unknown='<unk>', padding='<pad>')
        train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
        vocab.build_vocab()
        idx2word = vocab.idx2word
        word2idx = vocab.word2idx
        with open('idx2word.json', 'w', encoding='utf-8') as f:
            json.dump(idx2word, f)
        with open('word2idx.json', 'w', encoding='utf-8') as f:
            json.dump(word2idx, f)

    train_data.apply(lambda x: [w2i(word2idx, word) for word in x['words']],
                     new_field_name='word_seq', is_input=True)
    test_data.apply(lambda x: [w2i(word2idx, word) for word in x['words']],
                    new_field_name='word_seq', is_input=True)
    return test_data, train_data, idx2word, word2idx
def get_vocab(dataset):
    vocabulary = Vocabulary(unknown='<unk>', padding='<pad>')
    dataset.apply(lambda x: [vocabulary.add(char) for char in x['text']])
    vocabulary.build_vocab()
    print('vocab', len(vocabulary))
    return vocabulary
def prepareVocab(poemList):
    vocab = Vocabulary()
    for poem in poemList:
        for character in poem:
            vocab.add(character)
    vocab.build_vocab()
    return vocab
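# A minimal usage sketch for prepareVocab (the sample poems are hypothetical;
# assumes fastNLP's Vocabulary, as in the rest of this file).
def _demo_prepare_vocab():
    poems = ['床前明月光', '疑是地上霜']
    vocab = prepareVocab(poems)
    # Round-trip between characters and indices.
    ids = [vocab.to_index(ch) for ch in poems[0]]
    chars = [vocab.to_word(i) for i in ids]
    print(len(vocab), ids, ''.join(chars))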
def word_to_id(glove_data, glove_matrix, vocab_dict_path, file_path):
    if not os.path.exists(glove_data) or not os.path.exists(glove_matrix):
        data, feature_words, user_num, item_num = feature_word(file_path)
        vocab = Vocabulary(max_size=len(feature_words) + 1, unknown='unk', padding='PAD')
        vocab.add_word_lst(feature_words)
        vocab.build_vocab()
        # Load pre-trained embeddings aligned with the vocabulary.
        matrix = EmbedLoader.load_with_vocab(vocab_dict_path, vocab)
        matrix = torch.tensor(matrix)
        # Replace every review's words with their vocabulary indices.
        for d in range(len(data)):
            review = [vocab.to_index(word) for word in data[d]['reviewText']]
            data[d]['reviewText'] = review
        with open(glove_data, 'wb') as f:
            pickle.dump(data, f)
        with open(glove_matrix, 'wb') as f:
            pickle.dump(matrix, f)
    with open(glove_data, 'rb') as f:
        glove_data = pickle.load(f)
    with open(glove_matrix, 'rb') as f:
        matrix = pickle.load(f)
    return glove_data, matrix, len(glove_data[0]['reviewText'])
def process_poems_large(file_name, sentence_len, vocab_size):
    sentences = []
    with open(file_name, "r", encoding='utf-8') as f:
        for line in f.readlines():
            try:
                line = line.strip()
                if line:
                    title, content = line.split(':')
                    content = content.replace(' ', '')  # keep punctuation
                    # Optionally keep only five-character poems:
                    # if len(content) < 6 or content[5] != ',':
                    #     continue
                    if len(content) < 20:
                        continue
                    if ':' in content or '_' in content or '(' in content or \
                            '（' in content or '《' in content or '[' in content:
                        continue
                    # Truncate to the maximum sentence length.
                    if len(content) > sentence_len:
                        content = content[:sentence_len]
                    content = content + end_token
                    sentences.append(content)
            except ValueError:
                pass

    dataset = DataSet()
    # sentences = random.sample(sentences, 5000)
    for sentence in sentences:
        instance = Instance()
        instance['raw_sentence'] = sentence
        # Target is the sentence shifted left by one character, with the last
        # character repeated.
        instance['target'] = sentence[1:] + sentence[-1]
        dataset.append(instance)
    dataset.set_input("raw_sentence")
    dataset.set_target("target")
    print("dataset_size:", len(dataset))

    train_data, dev_data = dataset.split(0.2)
    train_data.rename_field("raw_sentence", "sentence")
    dev_data.rename_field("raw_sentence", "sentence")

    # Build the vocabulary from the training split.
    vocab = Vocabulary(max_size=vocab_size, min_freq=2, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['sentence']])
    vocab.build_vocab()

    # Index the sentences with the vocabulary.
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['sentence']],
                     new_field_name='sentence')
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['target']],
                     new_field_name='target')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['sentence']],
                   new_field_name='sentence')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['target']],
                   new_field_name='target')
    print("vocabulary_size:", len(vocab))
    return train_data, dev_data, vocab
def preprocess():
    train_set = DataSet()
    for i in range(len(raw_train.data)):
        train_set.append(Instance(sentence=raw_train.data[i],
                                  target=int(raw_train.target[i])))
    train_set.apply(
        lambda x: x['sentence'].translate(str.maketrans("", "", string.punctuation)).lower(),
        new_field_name='sentence')
    train_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    train_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    test_set = DataSet()
    for i in range(len(raw_test.data)):
        test_set.append(Instance(sentence=raw_test.data[i],
                                 target=int(raw_test.target[i])))
    test_set.apply(
        lambda x: x['sentence'].translate(str.maketrans("", "", string.punctuation)).lower(),
        new_field_name='sentence')
    test_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    test_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    vocab = Vocabulary(min_freq=10)
    train_set.apply(lambda x: [vocab.add(word) for word in x['words']])
    test_set.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    vocab.index_dataset(train_set, field_name='words', new_field_name='words')
    vocab.index_dataset(test_set, field_name='words', new_field_name='words')
    return train_set, test_set, vocab
def get_fastnlp_dataset():
    text_train, text_test = get_text_classification_datasets()
    train_data = DataSet()
    test_data = DataSet()
    for i in range(len(text_train.data)):
        train_data.append(Instance(text=split_sent(text_train.data[i]),
                                   target=int(text_train.target[i])))
    for i in range(len(text_test.data)):
        test_data.append(Instance(text=split_sent(text_test.data[i]),
                                  target=int(text_test.target[i])))

    # Build the vocabulary from the training data.
    vocab = Vocabulary(min_freq=5, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['text']])
    vocab.build_vocab()

    # Map sentences to index sequences with the vocabulary.
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['text']],
                     new_field_name='word_seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['text']],
                    new_field_name='word_seq')

    # Mark the input and target fields.
    train_data.set_input("word_seq")
    test_data.set_input("word_seq")
    train_data.set_target("target")
    test_data.set_target("target")
    return train_data, test_data, vocab
class JokeData(object):
    data_set = None
    train_data = None
    test_data = None
    vocab = None
    data_num = 0
    vocab_size = 0
    max_seq_len = 0

    def __init__(self, conf):
        print(conf.data_path)
        self.data_set = get_joke_data(conf.data_path)
        self.data_num = len(self.data_set)
        self.data_set.apply(self.split_sent, new_field_name='words')
        self.max_seq_len = min(self.max_seq_len, conf.max_seq_len)
        self.data_set.apply(lambda x: len(x['words']), new_field_name='seq_len')
        self.train_data, self.test_data = self.data_set.split(0.2)

    def split_chinese_sent(self, ins, remove_punc=False):
        line = ins['raw_joke'].strip()
        words = ['<START>']
        for c in line:
            if c in [',', '。', '?', '!']:
                if remove_punc:
                    continue
                words.append(c)
            else:
                words.append(c)
        words.append('<EOS>')
        self.max_seq_len = max(self.max_seq_len, len(words))
        return words

    def split_sent(self, ins, remove_punc=False):
        words = ['<START>'] + ins['raw_joke'].split() + ['<EOS>']
        self.max_seq_len = max(self.max_seq_len, len(words))
        return words

    def pad_seq(self, ins):
        # Left-pad with index 0 up to max_seq_len, or truncate.
        words = ins['words']
        if len(words) < self.max_seq_len:
            words = [0] * (self.max_seq_len - len(words)) + words
        else:
            words = words[:self.max_seq_len]
        return words

    def get_vocab(self):
        self.vocab = Vocabulary(min_freq=10)
        self.train_data.apply(lambda x: [self.vocab.add(word) for word in x['words']])
        self.vocab.build_vocab()
        self.vocab.build_reverse_vocab()
        self.vocab_size = len(self.vocab)
        self.train_data.apply(lambda x: [self.vocab.to_index(word) for word in x['words']],
                              new_field_name='words')
        self.train_data.apply(self.pad_seq, new_field_name='pad_words')
        self.test_data.apply(lambda x: [self.vocab.to_index(word) for word in x['words']],
                             new_field_name='words')
        self.test_data.apply(self.pad_seq, new_field_name='pad_words')
def get_dataset(data_path):
    print('Getting dataset...')
    poetry = []
    with open(data_path, 'r', encoding='utf-8') as f:
        poem = ''
        for line in f:
            # A blank line separates two poems.
            if len(line) <= 1:
                ins = Instance(text=poem)
                if len(poem) > 10:
                    poetry.append(ins)
                poem = ''
            else:
                poem += line.strip('\n')
    data = DataSet(data=poetry)
    print("Original data:", data[0])

    vocabulary = Vocabulary(min_freq=2, unknown='<oov>', padding='<pad>')
    vocabulary.add_word('<eos>')
    vocabulary.add_word('<START>')
    data.apply(lambda x: [vocabulary.add(char) for char in x['text']])
    vocabulary.build_vocab()
    print('pad:', vocabulary.to_index('<pad>'))
    print('Vocab size:', len(vocabulary))

    # Index the characters, add <START>/<eos>, then truncate and left-pad
    # every poem to config.sequence_length tokens.
    data.apply(lambda x: [vocabulary.to_index(char) for char in x['text']],
               new_field_name='text')
    data.apply(lambda x: [vocabulary.to_index('<START>')] + x['text'] +
               [vocabulary.to_index('<eos>')],
               new_field_name='text')
    data.apply(lambda x: x['text'][0:min(config.sequence_length, len(x['text']))],
               new_field_name='text')
    data.apply(lambda x: [vocabulary.to_index('<pad>')] *
               (config.sequence_length - len(x['text'])) + x['text'],
               new_field_name='text')

    # Language-model shift: input drops the last token, target drops the first.
    data.apply(lambda x: x['text'][0:-1], new_field_name='input')
    data.apply(lambda x: x['text'][1:], new_field_name='target')
    data.set_input('input')
    data.set_target('target')

    train_data, dev_data = data.split(0.2)
    print('Train data size:', len(train_data))
    print('Dev data size:', len(dev_data))
    print("Train data:", train_data[20])
    return train_data, dev_data, vocabulary
def get_vocabulary(data, min_freq):
    # train data -> vocabulary
    # alphabet = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
    alphabet = "0123456789,"
    char_list = [c for c in alphabet]
    vocabulary = Vocabulary(padding='<pad>', unknown='<unk>')
    vocabulary.add_word_lst(char_list)
    vocabulary.build_vocab()
    print('vocab size', len(vocabulary),
          'pad', vocabulary.padding_idx,
          'unk', vocabulary.unknown_idx)
    return vocabulary
def get_vocabulary(data, min_freq):
    # train data -> vocabulary
    vocabulary = Vocabulary(min_freq=min_freq, padding='<pad>', unknown='<unk>')
    for filename in data:
        for value in data[filename]:
            for word_list in data[filename][value]['data']:
                vocabulary.add_word_lst(word_list)
    vocabulary.build_vocab()
    print('vocab size', len(vocabulary),
          'pad', vocabulary.padding_idx,
          'unk', vocabulary.unknown_idx)
    return vocabulary
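# A tiny sketch of the nested layout this get_vocabulary variant expects,
# inferred from its loops (the sample dict is hypothetical:
# {filename: {split: {'data': [word_list, ...]}}}).
def _demo_get_vocabulary_nested():
    data = {'doc1.txt': {'train': {'data': [['hello', 'world'], ['hello']]}}}
    vocab = get_vocabulary(data, min_freq=1)
    print(vocab.to_index('hello'), vocab.to_index('world'))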
def readdata():
    global target_len
    min_count = 10
    # categories = ['comp.os.ms-windows.misc', 'rec.motorcycles', 'sci.space', 'talk.politics.misc']
    dataset_train = fetch_20newsgroups(subset='train', data_home='../../..')
    dataset_test = fetch_20newsgroups(subset='test', data_home='../../..')

    # Tokenize the training split: drop digits and slashes, strip punctuation, lowercase.
    data = dataset_train.data
    target = dataset_train.target
    target_len = len(dataset_train.target_names)
    train_data = DataSet()
    padding = 0
    for i in range(len(data)):
        data_t = re.sub(r"\d+|\s+|/", " ", data[i])
        temp = [word.strip(string.punctuation).lower()
                for word in data_t.split()
                if word.strip(string.punctuation) != '']
        train_data.append(Instance(raw=data[i], label=int(target[i]), words=temp))
        if len(temp) > padding:
            padding = len(temp)
    train_data.apply(lambda x: x['raw'].lower(), new_field_name='raw')

    # Tokenize the test split the same way.
    data = dataset_test.data
    target = dataset_test.target
    test_data = DataSet()
    padding = 0
    for i in range(len(data)):
        data_t = re.sub(r"\d+|\s+|/", " ", data[i])
        temp = [word.strip(string.punctuation).lower()
                for word in data_t.split()
                if word.strip(string.punctuation) != '']
        test_data.append(Instance(raw=data[i], label=int(target[i]), words=temp))
        if len(temp) > padding:
            padding = len(temp)
    test_data.apply(lambda x: x['raw'].lower(), new_field_name='raw')

    train_data.apply(lambda x: len(x['words']), new_field_name='len')
    test_data.apply(lambda x: len(x['words']), new_field_name='len')

    # Vocabulary from the training split, then index both splits.
    vocab = Vocabulary(min_freq=10)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                     new_field_name='seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                    new_field_name='seq')

    train_data.rename_field('seq', Const.INPUT)
    train_data.rename_field('len', Const.INPUT_LEN)
    train_data.rename_field('label', Const.TARGET)
    test_data.rename_field('seq', Const.INPUT)
    test_data.rename_field('len', Const.INPUT_LEN)
    test_data.rename_field('label', Const.TARGET)
    test_data.set_input(Const.INPUT, Const.INPUT_LEN)
    test_data.set_target(Const.TARGET)
    train_data.set_input(Const.INPUT, Const.INPUT_LEN)
    train_data.set_target(Const.TARGET)

    test_data, dev_data = test_data.split(0.5)
    return train_data, dev_data, test_data, vocab
def get_vocabulary(dataset):
    vocabulary = Vocabulary(min_freq=2, unknown='<oov>', padding='<pad>')
    # vocabulary.add_word('<eos>')
    # vocabulary.add_word('<start>')
    dataset.apply(lambda x: [vocabulary.add(word) for word in x['input']])
    vocabulary.build_vocab()
    print('pad:', vocabulary.to_index('<pad>'))
    print('Vocab size:', len(vocabulary))
    return vocabulary
def load_dataset(
        data_dir='/remote-home/ygxu/workspace/Product_all',
        data_path='mr.task.train',
        # bert_dir='/home/ygxu/BERT/BERT_English_uncased_L-12_H-768_A_12',
        bert_dir='/remote-home/ygxu/workspace/BERT/BERT_English_uncased_L-24_H-1024_A_16',
):
    path = os.path.join(data_dir, data_path)
    ds = DataSet.read_csv(path, headers=('label', 'raw_sentence'), sep='\t')
    ds.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
    ds.apply(lambda x: int(x['label']), new_field_name='label_seq', is_target=True)

    def transfer_bert_to_fastnlp(ins):
        # Join the BERT word pieces back into a single string prefixed with [CLS].
        result = "[CLS] "
        bert_text = ins['bert_tokenize_list']
        for text in bert_text:
            result += text + " "
        return result.strip()

    # Build a Vocabulary directly from BERT's vocab.txt so indices match the
    # pre-trained model.
    with open(os.path.join(bert_dir, 'vocab.txt')) as f:
        lines = f.readlines()
    vocabs = []
    for line in lines:
        vocabs.append(line[:-1])
    vocab_bert = Vocabulary(unknown=None, padding=None)
    vocab_bert.add_word_lst(vocabs)
    vocab_bert.build_vocab()
    vocab_bert.unknown = '[UNK]'
    vocab_bert.padding = '[PAD]'

    from pytorch_pretrained_bert import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained(os.path.join(bert_dir, 'vocab.txt'))
    ds.apply(lambda x: tokenizer.tokenize(x['raw_sentence']),
             new_field_name='bert_tokenize_list')
    ds.apply(transfer_bert_to_fastnlp, new_field_name='bert_tokenize')
    ds.apply(lambda x: [vocab_bert.to_index(word) for word in x['bert_tokenize_list']],
             new_field_name='index_words', is_input=True)
    ds.rename_field('index_words', 'tokens')
    ds.apply(lambda x: [1.] * len(x['tokens']), new_field_name='masks', is_input=True)
    return ds
def load_conll_with_glove(
        data_dir,
        data_path='train.pos',
        glove_path="",
        # glove_path='/remote-home/ygxu/dataset/glove.empty.txt',
        load_glove=True,
        vocabs=None):
    path = os.path.join(data_dir, data_path)
    print(f"start load dataset from {path}.")

    from dataset import MyConllLoader
    ds = MyConllLoader().load(path)
    print(ds)
    ds.rename_field('word_seq', 'sentence')
    ds.rename_field('label_seq', 'label')
    ds.apply(lambda x: len(x['sentence']) * [1.],
             new_field_name='word_seq_origin_len', is_input=True)

    if vocabs is None:
        # Build word and label vocabularies from this dataset.
        vocab = Vocabulary(max_size=30000, min_freq=2, unknown='<unk>', padding='<pad>')
        ds.apply(lambda x: [vocab.add(word) for word in x['sentence']])
        vocab.build_vocab()
        vocab_label = Vocabulary(max_size=200, unknown=None, padding='<pad>')
        ds.apply(lambda x: [vocab_label.add(label) for label in x['label']])
        vocab_label.build_vocab()
    else:
        vocab, vocab_label = vocabs

    ds.apply(lambda x: [vocab.to_index(w) for w in x['sentence']],
             new_field_name='word_seq', is_input=True)
    ds.apply(lambda x: [vocab_label.to_index(w) for w in x['label']],
             new_field_name='truth', is_input=True, is_target=True)

    if not load_glove:
        print(f"successfully loaded dataset from {path}")
        return ds

    embedding, _ = EmbedLoader().load_embedding(300, glove_path, 'glove', vocab)
    print(f"successfully loaded dataset and embedding from {path}")
    return ds, embedding, (vocab, vocab_label)
def get_vocabulary(train_data, test_data):
    # Build the vocabulary with Vocabulary.add(word).
    vocab = Vocabulary(min_freq=0, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['poem']])
    vocab.build_vocab()

    # Index the sentences with Vocabulary.to_index(word).
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['poem']],
                     new_field_name='words')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['poem']],
                    new_field_name='words')
    return vocab, train_data, test_data
def build_vocab(dataset_list, key):
    """
    Build a vocab from the given datasets on a certain key.

    :param dataset_list: list of DataSet
    :param key: string, the field name to read words from
    :return vocab: Vocabulary, the vocab created
    """
    vocab = Vocabulary(min_freq=1)
    for dataset in dataset_list:
        dataset.apply(lambda x: [vocab.add(word) for word in x[key]])
    vocab.build_vocab()
    return vocab
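# A small usage sketch for build_vocab (the toy data is hypothetical; assumes
# fastNLP's DataSet constructor from a dict and Vocabulary.index_dataset, both
# used elsewhere in this file).
def _demo_build_vocab():
    train = DataSet({'words': [['a', 'b', 'c'], ['a', 'd']]})
    dev = DataSet({'words': [['b', 'e']]})
    vocab = build_vocab([train, dev], 'words')
    # Index both datasets in place with the shared vocabulary.
    vocab.index_dataset(train, dev, field_name='words', new_field_name='words')
    print(len(vocab), train[0]['words'])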
def __init__(self, path=".data/yelp", dataset="yelp", batch_size=32):
    if dataset == "yelp":
        dataset = DataSet()
        for db_set in ['train']:
            text_file = os.path.join(path, 'sentiment.' + db_set + '.text')
            label_file = os.path.join(path, 'sentiment.' + db_set + '.labels')
            with io.open(text_file, 'r', encoding="utf-8") as tf, \
                    io.open(label_file, 'r', encoding="utf-8") as lf:
                for text in tf:
                    label = lf.readline()
                    dataset.append(Instance(text=text, label=label))

        # Lowercase, add <start>/<eos>, drop overly long sentences, pad to 17 tokens.
        dataset.apply(lambda x: x['text'].lower(), new_field_name='text')
        dataset.apply(lambda x: ['<start>'] + x['text'].split() + ['<eos>'],
                      new_field_name='words')
        dataset.drop(lambda x: len(x['words']) > 1 + 15 + 1)
        dataset.apply(lambda x: x['words'] + ['<pad>'] * (17 - len(x['words'])),
                      new_field_name='words')
        dataset.apply(lambda x: int(x['label']), new_field_name='label_seq',
                      is_target=True)

        _train_data, _test_data = dataset.split(0.3)

        _vocab = Vocabulary(min_freq=2)
        _train_data.apply(lambda x: [_vocab.add(word) for word in x['words']])
        _vocab.build_vocab()
        _train_data.apply(lambda x: [_vocab.to_index(word) for word in x['words']],
                          new_field_name='word_seq', is_input=True)
        _test_data.apply(lambda x: [_vocab.to_index(word) for word in x['words']],
                         new_field_name='word_seq', is_input=True)

        self.train_data = _train_data
        self.test_data = _test_data
        self.vocab = _vocab
        self.batch_size = batch_size
        self.train_iter = iter(Batch(dataset=self.train_data,
                                     batch_size=self.batch_size,
                                     sampler=SequentialSampler()))
def get_vocab(dataset):
    word_vocab = Vocabulary(unknown='<unk>', padding='<pad>')
    dataset.apply(lambda x: [word_vocab.add(word) for word in x['word']])
    word_vocab.build_vocab()
    print('word vocab', len(word_vocab))

    char_vocab = Vocabulary(unknown='<unk>', padding='<pad>')
    dataset.apply(lambda x: [char_vocab.add(char) for char in x['char']])
    char_vocab.build_vocab()
    print('char vocab', len(char_vocab))

    pos_vocab = Vocabulary(unknown='<unk>', padding='<pad>')
    dataset.apply(lambda x: [pos_vocab.add(pos) for pos in x['pos']])
    pos_vocab.build_vocab()
    print('pos vocab', len(pos_vocab))

    # spo_vocab = Vocabulary(unknown='<unk>', padding='<pad>')
    # dataset.apply(lambda x: spo_vocab.add(x['spo']))
    # spo_vocab.build_vocab()
    # print('spo vocab', len(spo_vocab))

    tag_vocab = Vocabulary(unknown='<unk>', padding='<pad>')
    dataset.apply(lambda x: [tag_vocab.add(tag) for tag in x['tag']])
    # tag_vocab.add_word('start')
    # tag_vocab.add_word('end')
    tag_vocab.build_vocab()
    print('tag_vocab', len(tag_vocab))

    # return word_vocab, char_vocab, pos_vocab, spo_vocab, tag_vocab
    return word_vocab, char_vocab, pos_vocab, tag_vocab
def read_vocab(file_name):
    # Read the vocab file, one token per line.
    with open(file_name) as f:
        lines = f.readlines()
    vocabs = []
    for line in lines:
        vocabs.append(line.strip())

    # Instantiate a Vocabulary and add the token list to it.
    vocab = Vocabulary(unknown='<unk>', padding='<pad>')
    vocab.add_word_lst(vocabs)
    # Build the vocabulary.
    vocab.build_vocab()
    return vocab
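# A minimal sketch of using the vocabulary returned by read_vocab
# ('example_vocab.txt' is a hypothetical one-token-per-line file).
def _demo_read_vocab():
    vocab = read_vocab('example_vocab.txt')
    print('size:', len(vocab), 'pad:', vocab.padding_idx, 'unk:', vocab.unknown_idx)
    # Any token missing from the file maps to the <unk> index.
    assert vocab.to_index('token-not-in-file') == vocab.unknown_idx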
def process_poems(file_name, sentence_len, vocab_size):
    sentences = []
    with open(file_name, "r", encoding='utf-8') as f:
        for line in f.readlines():
            try:
                line = line.strip()
                if line:
                    # content = line.replace(' ', '').replace(',', '').replace('。', '')
                    content = line.replace(' ', '')  # keep punctuation
                    if len(content) < 10 or len(content) > sentence_len:
                        continue
                    content = content + end_token
                    sentences.append(content)
            except ValueError:
                pass

    dataset = DataSet()
    for sentence in sentences:
        instance = Instance()
        instance['raw_sentence'] = sentence
        # Target is the sentence shifted left by one character, with the last
        # character repeated.
        instance['target'] = sentence[1:] + sentence[-1]
        dataset.append(instance)
    dataset.set_input("raw_sentence")
    dataset.set_target("target")
    print("dataset_size:", len(dataset))

    train_data, dev_data = dataset.split(0.2)
    train_data.rename_field("raw_sentence", "sentence")
    dev_data.rename_field("raw_sentence", "sentence")

    # Build the vocabulary from the training split.
    vocab = Vocabulary(max_size=vocab_size, min_freq=2, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['sentence']])
    vocab.build_vocab()
    print("vocabulary_size:", len(vocab))

    # Index the sentences with the vocabulary.
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['sentence']],
                     new_field_name='sentence')
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['target']],
                     new_field_name='target')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['sentence']],
                   new_field_name='sentence')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['target']],
                   new_field_name='target')
    return train_data, dev_data, vocab
def load_dataset_with_glove(data_dir,
                            data_path='mr.task.train',
                            glove_path="",
                            load_glove=True,
                            vocabs=None):
    path = os.path.join(data_dir, data_path)
    print(f"start load dataset from {path}.")

    ds = DataSet.read_csv(path, headers=('label', 'sentence'), sep='\t')
    ds.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')
    ds.apply(lambda x: x['sentence'].strip().split(), new_field_name='sentence')
    ds.apply(lambda x: len(x['sentence']) * [1.], new_field_name='mask', is_input=True)
    ds.apply(lambda x: int(x['label']), new_field_name='label', is_target=True)

    if vocabs is None:
        vocab = Vocabulary(max_size=30000, min_freq=2, unknown='<unk>', padding='<pad>')
        ds.apply(lambda x: [vocab.add(word) for word in x['sentence']])
        vocab.build_vocab()
    else:
        vocab = vocabs

    ds.apply(lambda x: [vocab.to_index(w) for w in x['sentence']],
             new_field_name='data', is_input=True)

    if not load_glove:
        print(f"successfully loaded dataset from {path}")
        return ds

    embedding, _ = EmbedLoader().load_embedding(300, glove_path, 'glove', vocab)
    print(f"successfully loaded dataset and embedding from {path}")
    return ds, embedding, vocab
def handle_data(n_class):
    train_data = get_text_classification_datasets(n_class)
    dataset = DataSet()
    vocab = Vocabulary(min_freq=0, unknown='<unk>', padding='<pad>')
    for i in range(len(train_data.data)):
        ans = remove_punc(train_data.data[i])
        dataset.append(Instance(content=ans, target=int(train_data.target[i])))
    dataset.apply(lambda x: x['content'].lower().split(),
                  new_field_name='words', is_input=True)
    for txt in dataset:
        vocab.add_word_lst(txt['words'])
    vocab.build_vocab()

    # Index the sentences with Vocabulary.to_index(word).
    dataset.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                  new_field_name='index')
    dataset.set_input("index")
    dataset.set_target("target")
    tra, dev = dataset.split(0.2)
    return tra, dev, len(vocab)