def process_poems_large(file_name, sentence_len, vocab_size):
    sentences = []
    with open(file_name, "r", encoding='utf-8') as f:
        for line in f.readlines():
            try:
                line = line.strip()
                if line:
                    title, content = line.split(':')
                    # print(title)
                    # print(content)
                    # content = line.replace(' ', '').replace(',','').replace('。','')
                    content = content.replace(' ', '')  # keep punctuation
                    # optionally keep only five-character-per-line poems:
                    # if len(content) < 6 or content[5] != ',':
                    #     continue
                    if len(content) < 20:
                        continue
                    # skip poems containing half- or full-width brackets and other markers
                    if ':' in content or '_' in content or '(' in content or '（' in content \
                            or '《' in content or '[' in content:
                        continue
                    # truncate to sentence_len
                    if len(content) > sentence_len:
                        content = content[:sentence_len]
                    content = content + end_token
                    sentences.append(content)
            except ValueError:
                pass

    dataset = DataSet()
    # sentences = random.sample(sentences, 5000)
    for sentence in sentences:
        instance = Instance()
        instance['raw_sentence'] = sentence
        instance['target'] = sentence[1:] + sentence[-1]
        dataset.append(instance)
    dataset.set_input("raw_sentence")
    dataset.set_target("target")
    # for iter in dataset:
    #     print(iter['raw_sentence'])
    print("dataset_size:", len(dataset))

    train_data, dev_data = dataset.split(0.2)
    train_data.rename_field("raw_sentence", "sentence")
    dev_data.rename_field("raw_sentence", "sentence")

    # build the vocabulary
    vocab = Vocabulary(max_size=vocab_size, min_freq=2, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['sentence']])
    vocab.build_vocab()

    # index the sentences with the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['sentence']],
                     new_field_name='sentence')
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['target']],
                     new_field_name='target')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['sentence']],
                   new_field_name='sentence')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['target']],
                   new_field_name='target')
    print("vocabulary_size:", len(vocab))
    return train_data, dev_data, vocab
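# Hypothetical usage sketch for process_poems_large (not from the original
# project): the corpus path, the sentence_len/vocab_size values and the
# module-level end_token below are assumptions. The function expects one
# "title:content" poem per line.
end_token = 'E'  # assumed end-of-poem marker
train_data, dev_data, vocab = process_poems_large('data/tang_poems.txt',
                                                  sentence_len=100,
                                                  vocab_size=6000)
print(len(train_data), len(dev_data), len(vocab))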
def test_same_vector4(self):
    # check the lower option when min_freq is set
    word_lst = ["The", "the", "the", "The", "a", "A"]
    no_create_word_lst = ['of', 'Of', "Of", "of", 'With', 'with']
    all_words = word_lst[:-2] + no_create_word_lst[:-2]
    vocab = Vocabulary(min_freq=2).add_word_lst(word_lst)
    vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
    embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6B-100d', lower=True)
    words = torch.LongTensor([[vocab.to_index(word) for word in all_words]])
    words = embed(words)

    lowered_word_lst = [word.lower() for word in word_lst]
    lowered_no_create_word_lst = [word.lower() for word in no_create_word_lst]
    lowered_vocab = Vocabulary().add_word_lst(lowered_word_lst)
    lowered_vocab.add_word_lst(lowered_no_create_word_lst, no_create_entry=True)
    lowered_embed = StaticEmbedding(lowered_vocab, model_dir_or_name='en-glove-6B-100d', lower=False)
    lowered_words = torch.LongTensor(
        [[lowered_vocab.to_index(word.lower()) for word in all_words]])
    lowered_words = lowered_embed(lowered_words)

    for idx in range(len(all_words)):
        word_i, word_j = words[0, idx], lowered_words[0, idx]
        with self.subTest(idx=idx, word=all_words[idx]):
            assert torch.sum(word_i == word_j).eq(lowered_embed.embed_size)
def test_same_vector3(self):
    # check the lower option
    word_lst = ["The", "the"]
    no_create_word_lst = ['of', 'Of', 'With', 'with']
    vocab = Vocabulary().add_word_lst(word_lst)
    vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
    embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6B-100d', lower=True)
    words = torch.LongTensor(
        [[vocab.to_index(word) for word in word_lst + no_create_word_lst]])
    words = embed(words)

    lowered_word_lst = [word.lower() for word in word_lst]
    lowered_no_create_word_lst = [word.lower() for word in no_create_word_lst]
    lowered_vocab = Vocabulary().add_word_lst(lowered_word_lst)
    lowered_vocab.add_word_lst(lowered_no_create_word_lst, no_create_entry=True)
    lowered_embed = StaticEmbedding(lowered_vocab, model_dir_or_name='en-glove-6B-100d', lower=False)
    lowered_words = torch.LongTensor([[
        lowered_vocab.to_index(word)
        for word in lowered_word_lst + lowered_no_create_word_lst
    ]])
    lowered_words = lowered_embed(lowered_words)

    all_words = word_lst + no_create_word_lst
    for idx, (word_i, word_j) in enumerate(zip(words[0], lowered_words[0])):
        with self.subTest(idx=idx, word=all_words[idx]):
            assert torch.sum(word_i == word_j).eq(lowered_embed.embed_size)
def test_same_vector5(self):
    # check that words kept after applying min_freq get identical vectors
    word_lst = ["they", "the", "they", "the", 'he', 'he', "a", "A"]
    no_create_word_lst = ['of', "of", "she", "she", 'With', 'with']
    all_words = word_lst[:-2] + no_create_word_lst[:-2]
    vocab = Vocabulary().add_word_lst(word_lst)
    vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
    embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6B-100d',
                            lower=False, min_freq=2)
    words = torch.LongTensor([[vocab.to_index(word) for word in all_words]])
    words = embed(words)

    min_freq_vocab = Vocabulary(min_freq=2).add_word_lst(word_lst)
    min_freq_vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
    min_freq_embed = StaticEmbedding(min_freq_vocab, model_dir_or_name='en-glove-6B-100d', lower=False)
    min_freq_words = torch.LongTensor(
        [[min_freq_vocab.to_index(word.lower()) for word in all_words]])
    min_freq_words = min_freq_embed(min_freq_words)

    for idx in range(len(all_words)):
        word_i, word_j = words[0, idx], min_freq_words[0, idx]
        with self.subTest(idx=idx, word=all_words[idx]):
            assert torch.sum(word_i == word_j).eq(min_freq_embed.embed_size)
def get_fastnlp_dataset():
    text_train, text_test = get_text_classification_datasets()
    train_data = DataSet()
    test_data = DataSet()
    for i in range(len(text_train.data)):
        train_data.append(
            Instance(text=split_sent(text_train.data[i]),
                     target=int(text_train.target[i])))
    for i in range(len(text_test.data)):
        test_data.append(
            Instance(text=split_sent(text_test.data[i]),
                     target=int(text_test.target[i])))

    # build the vocabulary
    vocab = Vocabulary(min_freq=5, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['text']])
    vocab.build_vocab()

    # map sentences to indices with the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['text']],
                     new_field_name='word_seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['text']],
                    new_field_name='word_seq')

    # mark input and target fields
    train_data.set_input("word_seq")
    test_data.set_input("word_seq")
    train_data.set_target("target")
    test_data.set_target("target")
    return train_data, test_data, vocab
def build_dataset(train_size, test_rate, categories):
    vocab = load('../data/vocab')
    train_set = load('../data/train_set')
    test_set = load('../data/test_set')
    if vocab is not None and train_set is not None and test_set is not None:
        return vocab, train_set, test_set

    train, test = get_20newsgroups_data(categories)
    train_set = create_dataset(train, train_size)
    test_set = create_dataset(test, int(train_size * test_rate))

    # vocabulary
    vocab = Vocabulary(min_freq=10)
    test_set.apply(lambda x: [vocab.add(word) for word in x['word_seq']])
    vocab.build_vocab()

    # word_seq to int
    train_set.apply(lambda x: [vocab.to_index(word) for word in x['word_seq']],
                    new_field_name='word_seq')
    test_set.apply(lambda x: [vocab.to_index(word) for word in x['word_seq']],
                   new_field_name='word_seq')

    # tag
    train_set.set_input('word_seq')
    train_set.set_target('target')
    test_set.set_input('word_seq')
    test_set.set_target('target')

    save('../data/vocab', vocab)
    save('../data/train_set', train_set)
    save('../data/test_set', test_set)
    return vocab, train_set, test_set
def Get_Data_Vocab(path):
    s = ""
    with open(path, "r", encoding='UTF-8') as f:
        for line in f:
            s += line.rstrip('\r\n') + "#"
    sentences = s.split("#")

    dataset = construct_dataset(sentences)
    dataset.apply(cut_pad, new_field_name='words')  # keep every poem the same length

    # split into dev and train sets
    dev_data, train_data = dataset.split(0.8)

    # build the vocabulary with Vocabulary.add(word)
    vocab = Vocabulary(padding="<pad>", min_freq=2)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    print(vocab.idx2word)

    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                     new_field_name='words')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                   new_field_name='words')

    train_data.apply(lambda x: x['words'][:-1], new_field_name="input")
    train_data.apply(lambda x: x['words'][1:], new_field_name="target")
    dev_data.apply(lambda x: x['words'][:-1], new_field_name="input")
    dev_data.apply(lambda x: x['words'][1:], new_field_name="target")

    train_data.set_input("input")
    train_data.set_target("target")
    dev_data.set_input("input")
    dev_data.set_target("target")
    return vocab, train_data, dev_data
def test_search(self):
    """Semantic search. TypeError: expected dimension <= 2 array or matrix"""
    print('{} test_search {}'.format('-' * 15, '-' * 15))
    texts = ['温都尔站', '东乌广厦', '国电四郎', '阿尔善站', '朱日和基']

    # vectorize the texts
    vocab = Vocabulary()
    for text in texts:
        vocab.add_word_lst(list(text))
    print(len(vocab))
    embed = StaticEmbedding(
        vocab, model_dir_or_name='./data/cn_char_fastnlp_100d.txt')
    texts_to_id = [[vocab.to_index(word) for word in list(text)]
                   for text in texts]
    words = torch.LongTensor(texts_to_id)  # map the texts to indices
    features_vec = embed(words)
    print(features_vec.shape)

    # build the search index!
    cp = ci.MultiClusterIndex(features_vec.detach().numpy(), texts)

    search_texts = ['朱日和站', '温都尔站', '国电站']
    for text in search_texts:
        texts_to_id = [[vocab.to_index(word) for word in list(text)]]
        words = torch.LongTensor(texts_to_id)  # map the text to indices
        features_vec = embed(words)
        search_features_vec = features_vec.detach().numpy()
        search_result = cp.search(search_features_vec, k=2, k_clusters=2,
                                  return_distance=True)
        print('text:{}'.format(text))
        print('search_result:{}'.format(search_result))
class JokeData(object):
    data_set = None
    train_data = None
    test_data = None
    vocab = None
    data_num = 0
    vocab_size = 0
    max_seq_len = 0

    def __init__(self, conf):
        print(conf.data_path)
        self.data_set = get_joke_data(conf.data_path)
        self.data_num = len(self.data_set)
        self.data_set.apply(self.split_sent, new_field_name='words')
        self.max_seq_len = min(self.max_seq_len, conf.max_seq_len)
        self.data_set.apply(lambda x: len(x['words']), new_field_name='seq_len')
        self.train_data, self.test_data = self.data_set.split(0.2)

    def split_chinese_sent(self, ins, remove_punc=False):
        line = ins['raw_joke'].strip()
        words = ['<START>']
        for c in line:
            if c in [',', '。', '?', '!']:
                if remove_punc:
                    continue
                else:
                    words.append(c)
            else:
                words.append(c)
        words.append('<EOS>')
        self.max_seq_len = max(self.max_seq_len, len(words))
        return words

    def split_sent(self, ins, remove_punc=False):
        words = ['<START>'] + ins['raw_joke'].split() + ['<EOS>']
        self.max_seq_len = max(self.max_seq_len, len(words))
        return words

    def pad_seq(self, ins):
        words = ins['words']
        if len(words) < self.max_seq_len:
            words = [0] * (self.max_seq_len - len(words)) + words
        else:
            words = words[:self.max_seq_len]
        return words

    def get_vocab(self):
        self.vocab = Vocabulary(min_freq=10)
        self.train_data.apply(lambda x: [self.vocab.add(word) for word in x['words']])
        self.vocab.build_vocab()
        self.vocab.build_reverse_vocab()
        self.vocab_size = len(self.vocab)

        self.train_data.apply(lambda x: [self.vocab.to_index(word) for word in x['words']],
                              new_field_name='words')
        self.train_data.apply(self.pad_seq, new_field_name='pad_words')
        self.test_data.apply(lambda x: [self.vocab.to_index(word) for word in x['words']],
                             new_field_name='words')
        self.test_data.apply(self.pad_seq, new_field_name='pad_words')
def get_dataset(data_path):
    print('Getting dataset...')
    poetry = []
    with open(data_path, 'r', encoding='utf-8') as f:
        poem = ''
        for line in f:
            if len(line) <= 1:
                ins = Instance(text=poem)
                if len(poem) > 10:
                    poetry.append(ins)
                poem = ''
            else:
                poem += line.strip('\n')
    # print(poetry[0])
    data = DataSet(data=poetry)
    print("Original data:", data[0])

    vocabulary = Vocabulary(min_freq=2, unknown='<oov>', padding='<pad>')
    vocabulary.add_word('<eos>')
    vocabulary.add_word('<START>')
    data.apply(lambda x: [vocabulary.add(char) for char in x['text']])
    vocabulary.build_vocab()
    print('pad:', vocabulary.to_index('<pad>'))
    print('Vocab size:', len(vocabulary))

    data.apply(lambda x: [vocabulary.to_index(char) for char in x['text']],
               new_field_name='text')
    data.apply(lambda x: [vocabulary.to_index('<START>')] + x['text'] +
               [vocabulary.to_index('<eos>')],
               new_field_name='text')
    data.apply(
        lambda x: x['text'][0:min(config.sequence_length, len(x['text']))],
        new_field_name='text')
    data.apply(lambda x: [vocabulary.to_index('<pad>')] *
               (config.sequence_length - len(x['text'])) + x['text'],
               new_field_name='text')
    data.apply(lambda x: x['text'][0:-1], new_field_name='input')
    data.apply(lambda x: x['text'][1:], new_field_name='target')
    data.set_input('input')
    data.set_target('target')

    # length = config.sequence_length
    # for i, d in enumerate(data):
    #     if length != len(d['text']):
    #         print("wrong!")
    #         exit()

    train_data, dev_data = data.split(0.2)
    print('Train data size:', len(train_data))
    print('Dev data size:', len(dev_data))
    print("Train data:", train_data[20])
    # print("Dev data:", dev_data[0])
    return train_data, dev_data, vocabulary
def readdata():
    global target_len
    min_count = 10
    # categories = ['comp.os.ms-windows.misc', 'rec.motorcycles', 'sci.space', 'talk.politics.misc', ]
    dataset_train = fetch_20newsgroups(subset='train', data_home='../../..')
    dataset_test = fetch_20newsgroups(subset='test', data_home='../../..')

    data = dataset_train.data
    target = dataset_train.target
    target_len = len(dataset_train.target_names)
    train_data = DataSet()
    padding = 0
    for i in range(len(data)):
        data_t = re.sub(r"\d+|\s+|/", " ", data[i])
        temp = [word.strip(string.punctuation).lower()
                for word in data_t.split()
                if word.strip(string.punctuation) != '']
        train_data.append(Instance(raw=data[i], label=int(target[i]), words=temp))
        if len(temp) > padding:
            padding = len(temp)
    train_data.apply(lambda x: x['raw'].lower(), new_field_name='raw')

    data = dataset_test.data
    target = dataset_test.target
    test_data = DataSet()
    padding = 0
    for i in range(len(data)):
        data_t = re.sub(r"\d+|\s+|/", " ", data[i])
        temp = [word.strip(string.punctuation).lower()
                for word in data_t.split()
                if word.strip(string.punctuation) != '']
        test_data.append(Instance(raw=data[i], label=int(target[i]), words=temp))
        if len(temp) > padding:
            padding = len(temp)
    test_data.apply(lambda x: x['raw'].lower(), new_field_name='raw')

    train_data.apply(lambda x: len(x['words']), new_field_name='len')
    test_data.apply(lambda x: len(x['words']), new_field_name='len')

    vocab = Vocabulary(min_freq=10)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                     new_field_name='seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                    new_field_name='seq')

    train_data.rename_field('seq', Const.INPUT)
    train_data.rename_field('len', Const.INPUT_LEN)
    train_data.rename_field('label', Const.TARGET)
    test_data.rename_field('seq', Const.INPUT)
    test_data.rename_field('len', Const.INPUT_LEN)
    test_data.rename_field('label', Const.TARGET)

    test_data.set_input(Const.INPUT, Const.INPUT_LEN)
    test_data.set_target(Const.TARGET)
    train_data.set_input(Const.INPUT, Const.INPUT_LEN)
    train_data.set_target(Const.TARGET)

    test_data, dev_data = test_data.split(0.5)
    return train_data, dev_data, test_data, vocab
def test_fastnlp_1min_tutorial(self):
    # tutorials/fastnlp_1min_tutorial.ipynb
    data_path = "test/data_for_tests/tutorial_sample_dataset.csv"
    ds = DataSet.read_csv(data_path, headers=('raw_sentence', 'label'), sep='\t')
    print(ds[1])

    # lowercase the raw sentences
    ds.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')

    # convert the label to int
    ds.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)

    def split_sent(ins):
        return ins['raw_sentence'].split()

    ds.apply(split_sent, new_field_name='words', is_input=True)

    # split into train / dev sets
    train_data, dev_data = ds.split(0.3)
    print("Train size: ", len(train_data))
    print("Test size: ", len(dev_data))

    from fastNLP import Vocabulary
    vocab = Vocabulary(min_freq=2)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])

    # index the sentences with Vocabulary.to_index(word)
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                     new_field_name='words', is_input=True)
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                   new_field_name='words', is_input=True)

    from fastNLP.models import CNNText
    model = CNNText((len(vocab), 50), num_classes=5, padding=2, dropout=0.1)

    from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam
    trainer = Trainer(model=model,
                      train_data=train_data,
                      dev_data=dev_data,
                      loss=CrossEntropyLoss(),
                      optimizer=Adam(),
                      metrics=AccuracyMetric(target='target'))
    trainer.train()
    print('Train finished!')
def load_conll_with_glove(
        data_dir,
        data_path='train.pos',
        glove_path="",
        # glove_path='/remote-home/ygxu/dataset/glove.empty.txt',
        load_glove=True,
        vocabs=None):
    path = os.path.join(data_dir, data_path)
    print(f"start loading dataset from {path}.")

    from dataset import MyConllLoader
    ds = MyConllLoader().load(path)
    print(ds)
    ds.rename_field('word_seq', 'sentence')
    ds.rename_field('label_seq', 'label')
    # ds = DataSet.read_pos(path, headers=('sentence', 'label'), sep='\t')
    # ds.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')
    # ds.apply(lambda x: x['sentence'].strip().split(), new_field_name='sentence')
    ds.apply(lambda x: len(x['sentence']) * [1.],
             new_field_name='word_seq_origin_len', is_input=True)

    if vocabs is None:
        vocab = Vocabulary(max_size=30000, min_freq=2, unknown='<unk>', padding='<pad>')
        ds.apply(lambda x: [vocab.add(word) for word in x['sentence']])
        vocab.build_vocab()
        vocab_label = Vocabulary(max_size=200, unknown=None, padding='<pad>')
        ds.apply(lambda x: [vocab_label.add(label) for label in x['label']])
        vocab_label.build_vocab()
    else:
        vocab, vocab_label = vocabs

    ds.apply(lambda x: [vocab.to_index(w) for w in x['sentence']],
             new_field_name='word_seq', is_input=True)
    ds.apply(lambda x: [vocab_label.to_index(w) for w in x['label']],
             new_field_name='truth', is_input=True, is_target=True)

    if not load_glove:
        print(f"successfully loaded dataset from {path}")
        return ds

    embedding, _ = EmbedLoader().load_embedding(300, glove_path, 'glove', vocab)
    print(f"successfully loaded dataset and embedding from {path}")
    return ds, embedding, (vocab, vocab_label)
def get_vocabulary(train_data, test_data):
    # build the vocabulary with Vocabulary.add(word)
    vocab = Vocabulary(min_freq=0, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['poem']])
    vocab.build_vocab()

    # index the sentences with Vocabulary.to_index(word)
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['poem']],
                     new_field_name='words')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['poem']],
                    new_field_name='words')
    return vocab, train_data, test_data
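# Minimal, hypothetical sketch of how get_vocabulary above could be called:
# it assumes two DataSets that already carry a 'poem' field holding lists of
# characters; the sample poems are placeholders, not project data.
from fastNLP import DataSet, Instance, Vocabulary

def _toy_poem_dataset():
    ds = DataSet()
    for poem in ['床前明月光', '疑是地上霜']:
        ds.append(Instance(poem=list(poem)))
    return ds

demo_vocab, demo_train, demo_test = get_vocabulary(_toy_poem_dataset(),
                                                   _toy_poem_dataset())
print(len(demo_vocab), demo_train[0]['words'])  # vocabulary size and indexed characters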
def __init__(self, path=".data/yelp", dataset="yelp", batch_size=32):
    if dataset == "yelp":
        dataset = DataSet()
        for db_set in ['train']:
            text_file = os.path.join(path, 'sentiment.' + db_set + '.text')
            label_file = os.path.join(path, 'sentiment.' + db_set + '.labels')
            with io.open(text_file, 'r', encoding="utf-8") as tf, \
                    io.open(label_file, 'r', encoding="utf-8") as lf:
                for text in tf:
                    label = lf.readline()
                    dataset.append(Instance(text=text, label=label))

        dataset.apply(lambda x: x['text'].lower(), new_field_name='text')
        dataset.apply(lambda x: ['<start>'] + x['text'].split() + ['<eos>'],
                      new_field_name='words')
        dataset.drop(lambda x: len(x['words']) > 1 + 15 + 1)
        dataset.apply(lambda x: x['words'] + ['<pad>'] * (17 - len(x['words'])),
                      new_field_name='words')
        dataset.apply(lambda x: int(x['label']), new_field_name='label_seq',
                      is_target=True)

        _train_data, _test_data = dataset.split(0.3)

        _vocab = Vocabulary(min_freq=2)
        _train_data.apply(lambda x: [_vocab.add(word) for word in x['words']])
        _vocab.build_vocab()

        _train_data.apply(lambda x: [_vocab.to_index(word) for word in x['words']],
                          new_field_name='word_seq', is_input=True)
        _test_data.apply(lambda x: [_vocab.to_index(word) for word in x['words']],
                         new_field_name='word_seq', is_input=True)

        self.train_data = _train_data
        self.test_data = _test_data
        self.vocab = _vocab
        self.batch_size = batch_size
        self.train_iter = iter(Batch(dataset=self.train_data,
                                     batch_size=self.batch_size,
                                     sampler=SequentialSampler()))
def test_rebuild(self):
    # after build_vocab, newly added words must not change existing indices
    vocab = Vocabulary()
    text = [str(idx) for idx in range(10)]
    vocab.update(text)
    for i in text:
        self.assertEqual(int(i) + 2, vocab.to_index(i))
    indexes = []
    for word, index in vocab:
        indexes.append((word, index))
    vocab.add_word_lst([str(idx) for idx in range(10, 13)])
    for idx, pair in enumerate(indexes):
        self.assertEqual(pair[1], vocab.to_index(pair[0]))
    for i in range(13):
        self.assertEqual(int(i) + 2, vocab.to_index(str(i)))
def process_poems(file_name, sentence_len, vocab_size):
    sentences = []
    with open(file_name, "r", encoding='utf-8') as f:
        for line in f.readlines():
            try:
                line = line.strip()
                if line:
                    # content = line.replace(' ', '').replace(',','').replace('。','')
                    content = line.replace(' ', '')  # keep punctuation
                    if len(content) < 10 or len(content) > sentence_len:
                        continue
                    # print(content)
                    content = content + end_token
                    sentences.append(content)
            except ValueError:
                pass

    dataset = DataSet()
    for sentence in sentences:
        instance = Instance()
        instance['raw_sentence'] = sentence
        instance['target'] = sentence[1:] + sentence[-1]
        dataset.append(instance)
    dataset.set_input("raw_sentence")
    dataset.set_target("target")
    # for iter in dataset:
    #     print(iter)
    print("dataset_size:", len(dataset))

    train_data, dev_data = dataset.split(0.2)
    train_data.rename_field("raw_sentence", "sentence")
    dev_data.rename_field("raw_sentence", "sentence")

    # build the vocabulary
    vocab = Vocabulary(max_size=vocab_size, min_freq=2, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['sentence']])
    vocab.build_vocab()
    print("vocabulary_size:", len(vocab))

    # index the sentences with the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['sentence']],
                     new_field_name='sentence')
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['target']],
                     new_field_name='target')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['sentence']],
                   new_field_name='sentence')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['target']],
                   new_field_name='target')
    return train_data, dev_data, vocab
def word_to_id(glove_data, glove_matrix, vocab_dict_path, file_path):
    if not os.path.exists(glove_data) or not os.path.exists(glove_matrix):
        data, feature_words, user_num, item_num = feature_word(file_path)
        vocab = Vocabulary(max_size=len(feature_words) + 1, unknown='unk', padding='PAD')
        vocab.add_word_lst(feature_words)
        vocab.build_vocab()
        matrix = EmbedLoader.load_with_vocab(vocab_dict_path, vocab)
        matrix = torch.tensor(matrix)
        for d in range(len(data)):
            review = []
            for word in data[d]['reviewText']:
                review.append(vocab.to_index(word))
            data[d]['reviewText'] = review
        with open(glove_data, 'wb') as f:
            pickle.dump(data, f)
        with open(glove_matrix, 'wb') as f:
            pickle.dump(matrix, f)

    with open(glove_data, 'rb') as f:
        glove_data = pickle.load(f)
    with open(glove_matrix, 'rb') as f:
        matrix = pickle.load(f)
    return glove_data, matrix, len(glove_data[0]['reviewText'])
def data_process():
    with open('./data.txt', encoding='utf-8') as fp:
        out = fp.readlines()
    data = list(out)
    poem = []
    cnt = 0
    for temp in data:
        cnt += 1
        if cnt % 2 == 0:
            rec = re.sub(',', '', temp)
            poem.append(rec[:-1])

    poem_normalized = []
    for i in range(len(poem)):
        if len(poem[i]) < 80:
            poem[i] = ' ' * (80 - len(poem[i])) + poem[i]
            poem_normalized.append(poem[i])
        else:
            poem_normalized.append(poem[i][:80])

    vocab = Vocabulary(min_freq=2)
    for temp in poem_normalized:
        for x in temp:
            vocab.add(x)
    vocab.build_vocab()

    dataset = []
    for temp in poem_normalized:
        dataset.append([vocab.to_index(x) for x in temp])
    return vocab, np.array(dataset)
def pre_process(file_name):
    poem = []
    with open(file_name, 'r', encoding='utf-8') as f:
        for index, line in enumerate(f.readlines()):
            if index % 2 == 1:
                raw_line = line.strip()
                raw_line = re.sub(',', '', raw_line)
                raw_line = re.sub('。', '', raw_line)
                length = len(raw_line)
                if length < 100:
                    raw_line = raw_line + '~' * (100 - length)
                poem.append(raw_line[:100])

    word_dict = Vocabulary()
    for line in poem:
        for character in line:
            word_dict.add(character)
    word_dict.build_vocab()

    data = []
    for pi in poem:
        p = []
        for ch in pi:
            p.append(word_dict.to_index(ch))
        data.append(p)
    data = np.array(data)
    return word_dict, data
def test_same_vector(self):
    vocab = Vocabulary().add_word_lst(["The", "the", "THE"])
    embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, lower=True)
    words = torch.LongTensor([[vocab.to_index(word) for word in ["The", "the", "THE"]]])
    words = embed(words)
    embed_0 = words[0, 0]
    for i in range(1, words.size(1)):
        assert torch.sum(embed_0 == words[0, i]).eq(len(embed_0))
def test_index(self):
    vocab = Vocabulary()
    vocab.update(text)
    res = [vocab[w] for w in set(text)]
    self.assertEqual(len(res), len(set(res)))
    res = [vocab.to_index(w) for w in set(text)]
    self.assertEqual(len(res), len(set(res)))
def get_vocab(dataset):
    vocabulary = Vocabulary(unknown=unk_str, padding=pad_str)
    for data, _ in dataset:
        vocabulary.add_word_lst(data)
    print('vocab', len(vocabulary))
    print('pad', vocabulary.to_index(pad_str))
    return vocabulary
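# Hypothetical usage of get_vocab above: it expects an iterable of
# (tokens, label) pairs plus module-level unk_str / pad_str constants;
# the values below are placeholders for illustration only.
unk_str, pad_str = '<unk>', '<pad>'
toy_pairs = [(['hello', 'world'], 0), (['hello', 'there'], 1)]
toy_vocab = get_vocab(toy_pairs)
print(toy_vocab.to_index('hello'), toy_vocab.to_index(unk_str))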
def test_roberta_ebembedding_2(self):
    # check that only_use_pretrain_vocab and truncate_embed work as expected
    Embedding = RobertaEmbedding
    weight_path = 'test/data_for_tests/embedding/small_roberta'
    vocab = Vocabulary().add_word_lst("this is a texta and".split())
    embed1 = Embedding(vocab,
                       model_dir_or_name=weight_path,
                       layers=list(range(3)),
                       only_use_pretrain_bpe=True,
                       truncate_embed=True,
                       min_freq=1)
    # embed_bpe_vocab_size = len(vocab) - 1 + 2  # drop NotInBERT, add ##a and [CLS]
    # self.assertEqual(embed_bpe_vocab_size, len(embed1.model.tokenzier.vocab))

    embed2 = Embedding(vocab,
                       model_dir_or_name=weight_path,
                       layers=list(range(3)),
                       only_use_pretrain_bpe=True,
                       truncate_embed=False,
                       min_freq=1)
    # embed_bpe_vocab_size = num_word  # drop NotInBERT
    # self.assertEqual(embed_bpe_vocab_size, len(embed2.model.tokenzier.vocab))

    embed3 = Embedding(vocab,
                       model_dir_or_name=weight_path,
                       layers=list(range(3)),
                       only_use_pretrain_bpe=False,
                       truncate_embed=True,
                       min_freq=1)
    # embed_bpe_vocab_size = len(vocab) + 2  # add ##a and [CLS]
    # self.assertEqual(embed_bpe_vocab_size, len(embed3.model.tokenzier.vocab))

    embed4 = Embedding(vocab,
                       model_dir_or_name=weight_path,
                       layers=list(range(3)),
                       only_use_pretrain_bpe=False,
                       truncate_embed=False,
                       min_freq=1)
    # embed_bpe_vocab_size = num_word + 1  # add ##a
    # self.assertEqual(embed_bpe_vocab_size, len(embed4.model.tokenzier.vocab))

    # the following tensors should be identical in every configuration
    embed1.eval()
    embed2.eval()
    embed3.eval()
    embed4.eval()
    tensor = torch.LongTensor(
        [[vocab.to_index(w) for w in 'this is a texta and'.split()]])
    t1 = embed1(tensor)
    t2 = embed2(tensor)
    t3 = embed3(tensor)
    t4 = embed4(tensor)
    self.assertEqual((t1 - t2).sum(), 0)
    self.assertEqual((t1 - t3).sum(), 0)
    self.assertEqual((t1 - t4).sum(), 0)
def get_vocabulary(dataset):
    vocabulary = Vocabulary(min_freq=2, unknown='<oov>', padding='<pad>')
    # vocabulary.add_word('<eos>')
    # vocabulary.add_word('<start>')
    dataset.apply(lambda x: [vocabulary.add(word) for word in x['input']])
    vocabulary.build_vocab()
    print('pad:', vocabulary.to_index('<pad>'))
    print('Vocab size:', len(vocabulary))
    return vocabulary
def load_dataset(
        data_dir='/remote-home/ygxu/workspace/Product_all',
        data_path='mr.task.train',
        # bert_dir='/home/ygxu/BERT/BERT_English_uncased_L-12_H-768_A_12',
        bert_dir='/remote-home/ygxu/workspace/BERT/BERT_English_uncased_L-24_H-1024_A_16',
):
    path = os.path.join(data_dir, data_path)

    ds = DataSet.read_csv(path, headers=('label', 'raw_sentence'), sep='\t')
    ds.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
    ds.apply(lambda x: int(x['label']), new_field_name='label_seq', is_target=True)

    def transfer_bert_to_fastnlp(ins):
        result = "[CLS] "
        bert_text = ins['bert_tokenize_list']
        for text in bert_text:
            result += text + " "
        return result.strip()

    with open(os.path.join(bert_dir, 'vocab.txt')) as f:
        lines = f.readlines()
    vocabs = []
    for line in lines:
        vocabs.append(line[:-1])
    vocab_bert = Vocabulary(unknown=None, padding=None)
    vocab_bert.add_word_lst(vocabs)
    vocab_bert.build_vocab()
    vocab_bert.unknown = '[UNK]'
    vocab_bert.padding = '[PAD]'

    from pytorch_pretrained_bert import BertTokenizer, BertModel
    tokenizer = BertTokenizer.from_pretrained(os.path.join(bert_dir, 'vocab.txt'))

    ds.apply(lambda x: tokenizer.tokenize(x['raw_sentence']),
             new_field_name='bert_tokenize_list')
    ds.apply(transfer_bert_to_fastnlp, new_field_name='bert_tokenize')
    ds.apply(lambda x: [vocab_bert.to_index(word) for word in x['bert_tokenize_list']],
             new_field_name='index_words', is_input=True)
    ds.rename_field('index_words', 'tokens')
    ds.apply(lambda x: [1.] * len(x['tokens']), new_field_name='masks', is_input=True)
    return ds
def test_same_vector2(self):
    vocab = Vocabulary().add_word_lst(["The", 'a', 'b', "the", "THE", "B", 'a', "A"])
    embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6B-100d', lower=True)
    words = torch.LongTensor([[vocab.to_index(word)
                               for word in ["The", "the", "THE", 'b', "B", 'a', 'A']]])
    words = embed(words)
    embed_0 = words[0, 0]
    for i in range(1, 3):
        assert torch.sum(embed_0 == words[0, i]).eq(len(embed_0))
    embed_0 = words[0, 3]
    for i in range(3, 5):
        assert torch.sum(embed_0 == words[0, i]).eq(len(embed_0))
def test_from_dataset(self):
    start_char = 65
    num_samples = 10

    # 0 dim
    dataset = DataSet()
    for i in range(num_samples):
        ins = Instance(char=chr(start_char + i))
        dataset.append(ins)
    vocab = Vocabulary()
    vocab.from_dataset(dataset, field_name='char')
    for i in range(num_samples):
        self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
    vocab.index_dataset(dataset, field_name='char')

    # 1 dim
    dataset = DataSet()
    for i in range(num_samples):
        ins = Instance(char=[chr(start_char + i)] * 6)
        dataset.append(ins)
    vocab = Vocabulary()
    vocab.from_dataset(dataset, field_name='char')
    for i in range(num_samples):
        self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
    vocab.index_dataset(dataset, field_name='char')

    # 2 dim
    dataset = DataSet()
    for i in range(num_samples):
        ins = Instance(char=[[chr(start_char + i) for _ in range(6)]
                             for _ in range(6)])
        dataset.append(ins)
    vocab = Vocabulary()
    vocab.from_dataset(dataset, field_name='char')
    for i in range(num_samples):
        self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
    vocab.index_dataset(dataset, field_name='char')
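# A short standalone sketch of the Vocabulary.from_dataset / index_dataset
# pattern exercised by the test above; the toy sentences are made up.
from fastNLP import DataSet, Instance, Vocabulary

demo_ds = DataSet()
for chars in [['a', 'b', 'c'], ['a', 'c']]:
    demo_ds.append(Instance(char=chars))
demo_vocab = Vocabulary()
demo_vocab.from_dataset(demo_ds, field_name='char')   # count tokens and build the vocab
demo_vocab.index_dataset(demo_ds, field_name='char')  # replace tokens with their indices in place
print(demo_ds[0]['char'])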
def test_fit(self):
    """Encode texts into character-level vectors."""
    print('{} test_fit {}'.format('-' * 15, '-' * 15))
    texts = ['温都尔站', '东乌广厦', '国电四郎', '阿尔善站', '朱日和基']
    vocab = Vocabulary()
    for text in texts:
        vocab.add_word_lst(list(text))
    print(len(vocab))

    embed = StaticEmbedding(
        vocab, model_dir_or_name='./data/cn_char_fastnlp_100d.txt')
    texts_to_id = [[vocab.to_index(word) for word in list(text)]
                   for text in ['朱日和', '东台变']]
    print(texts_to_id)  # [[16, 17, 18], [6, 1, 1]]
    words = torch.LongTensor(texts_to_id)  # map the texts to indices
    print(embed(words).size())  # torch.Size([2, 3, 100])
def load_dataset_with_glove(data_dir,
                            data_path='mr.task.train',
                            glove_path="",
                            load_glove=True,
                            vocabs=None):
    path = os.path.join(data_dir, data_path)
    print(f"start loading dataset from {path}.")

    ds = DataSet.read_csv(path, headers=('label', 'sentence'), sep='\t')
    ds.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')
    ds.apply(lambda x: x['sentence'].strip().split(), new_field_name='sentence')
    ds.apply(lambda x: len(x['sentence']) * [1.], new_field_name='mask', is_input=True)
    ds.apply(lambda x: int(x['label']), new_field_name='label', is_target=True)

    if vocabs is None:
        vocab = Vocabulary(max_size=30000, min_freq=2, unknown='<unk>', padding='<pad>')
        ds.apply(lambda x: [vocab.add(word) for word in x['sentence']])
        vocab.build_vocab()
    else:
        vocab = vocabs

    ds.apply(lambda x: [vocab.to_index(w) for w in x['sentence']],
             new_field_name='data', is_input=True)

    if not load_glove:
        print(f"successfully loaded dataset from {path}")
        return ds

    embedding, _ = EmbedLoader().load_embedding(300, glove_path, 'glove', vocab)
    print(f"successfully loaded dataset and embedding from {path}")
    return ds, embedding, vocab