def makeVocabulary(filename, size, char=False):
    vocab = dict.Dict(
        [dict.PAD_WORD, dict.UNK_WORD, dict.BOS_WORD, dict.EOS_WORD])
    if char:
        # In this variant char mode only registers the extra special token;
        # the corpus itself is still added at the word level below.
        vocab.addSpecial(dict.SPA_WORD)

    with open(filename, 'rb') as f:
        for sent in f.readlines():
            for word in sent.decode('utf-8').strip().split():
                # Skip the 'END' sentinel; the trailing space presumably
                # keeps word-level entries distinct from char-level ones.
                if word != 'END':
                    vocab.add(word + " ")

    originalSize = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), originalSize))
    return vocab
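# Assumed-interface sketch: every function in this file relies on a
# project-local `dict` module (it shadows the built-in name) providing
# PAD_WORD/UNK_WORD/BOS_WORD/EOS_WORD/SPA_WORD constants and a Dict class.
# The reconstruction below is inferred from the call sites only; the token
# strings and pruning details are assumptions, not the project's real code.

PAD_WORD, UNK_WORD, BOS_WORD, EOS_WORD, SPA_WORD = \
    '<blank>', '<unk>', '<s>', '</s>', '<space>'


class Dict(object):
    def __init__(self, data=None, lower=False):
        self.idxToLabel = {}
        self.labelToIdx = {}
        self.frequencies = {}
        self.lower = lower
        if isinstance(data, str):
            self.loadFile(data)        # a path: read "label idx" lines
        elif data is not None:
            for label in data:         # a list: register special tokens
                self.addSpecial(label)

    def size(self):
        return len(self.idxToLabel)

    def loadFile(self, filename):
        # Each line holds a label and its index, whitespace-separated.
        with open(filename, encoding='utf-8') as f:
            for line in f:
                fields = line.split()
                label, idx = fields[0], int(fields[1])
                self.idxToLabel[idx] = label
                self.labelToIdx[label] = idx

    def add(self, label):
        if self.lower:
            label = label.lower()
        if label in self.labelToIdx:
            idx = self.labelToIdx[label]
        else:
            idx = len(self.idxToLabel)
            self.idxToLabel[idx] = label
            self.labelToIdx[label] = idx
        self.frequencies[idx] = self.frequencies.get(idx, 0) + 1
        return idx

    def addSpecial(self, label):
        # The real class keeps specials out of pruning; this sketch does
        # not distinguish them.
        return self.add(label)

    def prune(self, size):
        # Keep the `size` most frequent entries.
        if size >= self.size():
            return self
        by_freq = sorted(self.frequencies.items(),
                         key=lambda kv: kv[1], reverse=True)
        pruned = Dict(lower=self.lower)
        for idx, _ in by_freq[:size]:
            pruned.add(self.idxToLabel[idx])
        return pruned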
def initVocabulary(name, dataFile, vocabFile, vocabSize, char=False):
    '''
    Inputs:
        name: str, name of the vocabulary being built.
        dataFile: str, path to the source text file.
        vocabFile: str, path to an existing vocabulary file; the first
            column of each line is the label, the second is the index.
        vocabSize: int, size of the vocabulary to build, i.e. the number
            of words it contains.
        char: bool, whether to tokenize at the character level.
    Return:
        A Dict object exposing various methods plus the idxToLabel,
        labelToIdx and frequencies dictionaries.
    '''
    vocab = None
    if vocabFile is not None:
        # If given, load the existing word dictionary.
        print('Reading ' + name + ' vocabulary from \'' + vocabFile + '\'...')
        vocab = dict.Dict()
        vocab.loadFile(vocabFile)  # first column: label, second column: index
        print('Loaded ' + str(vocab.size()) + ' ' + name + ' words')

    if vocab is None:
        # If a dictionary is still missing, generate it.
        print('Building ' + name + ' vocabulary...')
        vocab = makeVocabulary(dataFile, vocabSize, char=char)

    print()
    return vocab
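# Hypothetical usage of initVocabulary (the paths and the 50000 size are
# illustrative, not from the project): with vocabFile=None the vocabulary is
# built from the data file; an existing "label idx" file is loaded instead.
src_vocab = initVocabulary('source', 'data/train.src', None, 50000)
tgt_vocab = initVocabulary('target', 'data/train.tgt', 'data/tgt.vocab', 50000)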
def makeVocabulary(filename, size, is_target, char=False):
    # The target side starts from an empty special list; the source side
    # reserves the PAD/UNK/BOS/EOS specials.
    if is_target:
        vocab = dict.Dict([], lower=opt.lower)
    else:
        vocab = dict.Dict(
            [dict.PAD_WORD, dict.UNK_WORD, dict.BOS_WORD, dict.EOS_WORD],
            lower=opt.lower)
    if char:
        vocab.addSpecial(dict.SPA_WORD)

    if type(filename) != list:
        filename = [filename]

    lengths = []
    for _filename in filename:
        with open(_filename) as f:
            for sent in f.readlines():
                for word in sent.strip().split():
                    lengths.append(len(word))
                    if char:
                        for ch in word:
                            vocab.add(ch)
                    else:
                        vocab.add(word + " ")

    print('max: %d, min: %d, avg: %.2f' %
          (max(lengths), min(lengths), sum(lengths) / len(lengths)))

    originalSize = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), originalSize))
    return vocab
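# Hypothetical calls for the is_target variant (paths and size are
# illustrative): the target dictionary starts empty, while the source one
# reserves the four specials.
src_vocab = makeVocabulary('data/train.src', 50000, is_target=False)
tgt_vocab = makeVocabulary('data/train.tgt', 50000, is_target=True)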
def makeVocabulary(filename, size, char=False):
    '''
    Inputs:
        filename: str, path to the source text file.
        size: int, size of the vocabulary to build, i.e. the number of
            words it contains.
        char: bool, whether to tokenize at the character level.
    Return:
        A Dict object exposing various methods plus the idxToLabel,
        labelToIdx and frequencies dictionaries.
    '''
    # PAD, UNK, BOS and EOS are special tokens, mapped to 0, 1, 2 and 3.
    vocab = dict.Dict(
        [dict.PAD_WORD, dict.UNK_WORD, dict.BOS_WORD, dict.EOS_WORD],
        lower=opt.lower)
    if char:
        vocab.addSpecial(dict.SPA_WORD)

    if type(filename) != list:
        filename = [filename]

    lengths = []
    for _filename in filename:
        with open(_filename) as f:
            for sent in f.readlines():
                for word in sent.strip().split():
                    lengths.append(len(word))
                    if char:
                        for ch in word:
                            vocab.add(ch)
                    else:
                        # The trailing space presumably keeps word-level
                        # entries distinct from char-level entries.
                        vocab.add(word + " ")

    # Maximum, minimum and average word length.
    print('max: %d, min: %d, avg: %.2f' %
          (max(lengths), min(lengths), sum(lengths) / len(lengths)))

    originalSize = vocab.size()
    # prune() keeps the `size` most frequent words.
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), originalSize))
    return vocab
def initVocabulary(name, dataFile, vocabFile, vocabSize, char=False):
    vocab = None
    if vocabFile is not None:
        # If given, load the existing word dictionary.
        print('Reading ' + name + ' vocabulary from \'' + vocabFile + '\'...')
        vocab = dict.Dict()
        vocab.loadFile(vocabFile)
        print('Loaded ' + str(vocab.size()) + ' ' + name + ' words')

    if vocab is None:
        # If a dictionary is still missing, generate it.
        print('Building ' + name + ' vocabulary...')
        vocab = makeVocabulary(dataFile, vocabSize, char=char)

    return vocab
def makeVocabulary(filename, size):
    vocab = dict.Dict(
        [dict.PAD_WORD, dict.UNK_WORD, dict.BOS_WORD, dict.EOS_WORD],
        lower=opt.lower)

    with open(filename) as f:
        for sent in f.readlines():
            for word in sent.split():
                vocab.add(word)

    originalSize = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), originalSize))
    return vocab
def makeVocabulary(filename, size):
    vocab = dict.Dict(
        [dict.PAD_WORD, dict.UNK_WORD, dict.BOS_WORD, dict.EOS_WORD],
        lower=opt.lower)

    # Defaults; overridden per file below depending on whether the file
    # matches the source or the target suffix.
    max_tokens = opt.trun_src
    max_lengths = opt.src_length
    char = opt.src_char

    if type(filename) == str:
        filename = [filename]

    for _filename in filename:
        if opt.src_suf in _filename:
            max_tokens = opt.trun_src
            max_lengths = opt.src_length
            char = opt.src_char
            print(_filename, ' max tokens: ', max_tokens)
            print(_filename, ' max lengths: ', max_lengths)
        elif opt.tgt_suf in _filename:
            max_tokens = opt.trun_tgt
            max_lengths = opt.tgt_length
            char = opt.tgt_char
            print(_filename, ' max tokens: ', max_tokens)
            print(_filename, ' max lengths: ', max_lengths)
        with open(_filename, encoding='utf8') as f:
            for sent in f.readlines():
                if char:
                    tokens = list(sent.strip())
                else:
                    tokens = sent.strip().split()
                # Drop sentences over the length limit, then truncate the
                # remainder to max_tokens.
                if max_lengths > 0 and len(tokens) > max_lengths:
                    continue
                if max_tokens > 0:
                    tokens = tokens[:max_tokens]
                for word in tokens:
                    vocab.add(word + " ")

    originalSize = vocab.size()
    # size == 0 means "keep everything".
    if size == 0:
        size = originalSize
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), originalSize))
    return vocab
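# This variant reads all of its limits from a global `opt` namespace. A
# hypothetical argparse setup covering the fields it touches (option names
# come from the code above; the defaults are illustrative assumptions):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-lower', action='store_true')
parser.add_argument('-src_suf', default='.src')           # source-file suffix
parser.add_argument('-tgt_suf', default='.tgt')           # target-file suffix
parser.add_argument('-trun_src', type=int, default=0)     # 0 = no truncation
parser.add_argument('-trun_tgt', type=int, default=0)
parser.add_argument('-src_length', type=int, default=0)   # 0 = no length filter
parser.add_argument('-tgt_length', type=int, default=0)
parser.add_argument('-src_char', action='store_true')
parser.add_argument('-tgt_char', action='store_true')
opt = parser.parse_args()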
def makeVocabulary(filename, size, sep=' ', char=False):
    vocab = dict.Dict([dict.PAD_WORD, dict.UNK_WORD], lower=True)
    if char:
        vocab.addSpecial(dict.SPA_WORD)

    if type(filename) != list:
        filename = [filename]

    lengths = []
    for _filename in filename:
        with codecs.open(_filename, 'r', 'utf-8') as f:
            data = js.load(f)
        for sent in data:
            for word in sent.strip().split(sep):
                lengths.append(len(word))
                if char:
                    for ch in word.strip():
                        vocab.add(ch)
                else:
                    vocab.add(word.strip() + " ")

    print('max: %d, min: %d, avg: %.2f' %
          (max(lengths), min(lengths), sum(lengths) / len(lengths)))

    originalSize = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), originalSize))
    return vocab
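# This variant expects the JSON file to hold a flat list of sentence strings
# (inferred from js.load followed by iterating over sentences). A small
# illustrative round trip; 'demo_sents.json' is a made-up path, and the
# imports mirror the ones this module presumably already has:
import codecs
import json as js

sents = ["deep learning for nlp", "vocabulary pruning keeps frequent words"]
with codecs.open('demo_sents.json', 'w', 'utf-8') as f:
    js.dump(sents, f)
vocab = makeVocabulary('demo_sents.json', size=100)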
def main():
    text_dict = dict.Dict('./data/data/text_dict')
    authors_dict = dict.Dict('./data/data/authors_dict')
    with codecs.open('./data/data/label_dict.json', 'r', 'utf-8') as f:
        label_dict = js.load(f)

    # Per-section document-length limits; every section uses text_len=25.
    sections = [('intro', 100), ('related', 500), ('methods', 600),
                ('conclusion', 150), ('abstract', 10)]

    def build_split(split):
        # Assemble the per-split config dicts and run make_data on them.
        text = [{'text_file': './data/data/%s_%s' % (name, split),
                 'text_dict': text_dict,
                 'doc_len': doc_len,
                 'text_len': 25} for name, doc_len in sections]
        title = {'text_file': './data/data/title_%s' % split,
                 'text_dict': text_dict,
                 'text_len': 20}
        authors = {'text_file': './data/data/authors_%s' % split,
                   'text_dict': authors_dict,
                   'text_len': 7}
        label = {'label_file': './data/data/label_%s' % split,
                 'label_dict': label_dict}
        return make_data(text, title, authors, label)

    save_data = {'train': build_split('train'),
                 'val': build_split('val'),
                 'test': build_split('test')}
    torch.save(save_data, './data/data/save_data')
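# Assumed follow-up usage: the bundle written by main() can be reloaded with
# torch.load, yielding the same three splits.
import torch

data = torch.load('./data/data/save_data')
train, val, test = data['train'], data['val'], data['test']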