def __init__(
        self,
        question_path,
        paragraph_path,
        ratio,
        batch_size,
        vocab: Vocab = Ref("model.vocab"),
        batch_first=Ref("model.batch_first", True),
):
    self.vocab = vocab
    question = Field(include_lengths=True, batch_first=batch_first,
                     pad_token=vocab.pad_token)
    question.vocab = vocab
    paragraph = Field(batch_first=batch_first, pad_token=vocab.pad_token)
    paragraph.vocab = vocab
    paragraphs = NestedField(paragraph, include_lengths=True)
    paragraphs.vocab = vocab
    target = Field(sequential=False, use_vocab=False, is_target=True)
    fields = [("question", question), ("paragraphs", paragraphs),
              ("target", target)]

    examples = []
    with open(paragraph_path) as paragraph_file, \
            open(question_path) as question_file:
        for q in question_file:
            q = q.strip()
            ps = [paragraph_file.readline().strip() for _ in range(ratio)]
            examples.append(Example.fromlist([q, ps, 0], fields))

    BaseIRDataset.__init__(self, ratio, batch_size, batch_first)
    TorchTextDataset.__init__(self, examples, fields)
def build_field_vocab(cls, field: Field, counter: Counter,
                      size_multiple: int = 1, **kwargs):
    # PN: original name was _build_field_vocab
    # this is basically copy-pasted from torchtext, except that the digit
    # strings "0" through "127" are also registered as special tokens.
    all_specials = [
        field.unk_token, field.pad_token, field.init_token, field.eos_token
    ] + [str(i) for i in range(128)]
    specials = [tok for tok in all_specials if tok is not None]
    field.vocab = field.vocab_cls(counter, specials=specials, **kwargs)
    if size_multiple > 1:
        cls.pad_vocab_to_multiple(field.vocab, size_multiple)
def attach_tokenizer(field: Field, tokenizer: PreTrainedTokenizer) -> None:
    """Attaches a pretrained tokenizer to a Corpus Field.

    Parameters
    ----------
    field : Field
        Field to which the vocabulary will be attached.
    tokenizer : PreTrainedTokenizer
        Tokenizer that will convert tokens to their index.
    """
    def preprocess(value: Union[str, List[str]]) -> List[str]:
        """We only perform the splitting as a preprocessing step.

        This allows us to still have access to the original tokens,
        including those that will be mapped to <unk> later.
        """
        if isinstance(value, list):
            value = " ".join(value)
        return [tokenizer.convert_ids_to_tokens(t) for t in tokenizer.encode(value)]

    field.preprocessing = preprocess
    field.pad_token = tokenizer.pad_token
    field.vocab = tokenizer
    field.vocab.stoi = tokenizer.vocab
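# Hedged usage sketch for attach_tokenizer above: the tokenizer class, the
# model name, and the surrounding Field arguments are illustrative assumptions,
# not part of the original code.
from typing import List, Union
from torchtext.data import Field
from transformers import BertTokenizer, PreTrainedTokenizer

bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # hypothetical choice
text_field = Field(batch_first=True)
attach_tokenizer(text_field, bert_tokenizer)
# preprocess now yields wordpiece tokens (including the [CLS]/[SEP] added by
# encode), and numericalization goes through the tokenizer's own vocab.
print(text_field.preprocess("The quick brown fox"))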
def load_field(vocab_path: str) -> Field:
    field = Field(init_token='<bos>', eos_token='<eos>',
                  batch_first=True, pad_first=True)
    with open(vocab_path, 'rb') as f:
        field.vocab = pickle.load(f)
    return field
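# Hedged usage sketch for load_field: the pickle path is hypothetical, and it
# assumes the pickled object is a torchtext Vocab matching the field's specials.
field = load_field("checkpoints/vocab.pkl")  # hypothetical path
batch = field.process([field.preprocess("hello world")])
# batch is a padded LongTensor with <bos>/<eos> prepended and appended
print(batch.shape)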
def create_datasets(
    data_path: str, mode: str, word_to_ix=None, word_vocab=None, tag_vocab=None
) -> Union[createDatasetsReturnType, BucketIterator]:
    """
    Used when BERT embeddings are switched off (i.e. we just use randomly
    initialized embeddings). Compiles the data first into a TabularDataset
    object and then into a BucketIterator (similar to a DataLoader) object
    via the function to_iter().
    """
    sent_field = Field(lower=True)
    tag_field = Field()
    data_fields = [('sentence', sent_field), ('tags', tag_field)]

    if mode == TRAIN:
        dataSetNames = [TRAIN, VAL]
    elif mode == TEST:
        dataSetNames = [TEST]
    for data_set in dataSetNames:
        create_csv(os.path.join(data_path, data_set))

    if mode == TRAIN:
        train_dataset, val_dataset = TabularDataset.splits(
            path=data_path, train='train.csv', validation='val.csv',
            format='csv', fields=data_fields, skip_header=True)
        # build the vocab over the train set only
        sent_field.build_vocab(train_dataset)
        tag_field.build_vocab(train_dataset)
        char_to_ix = get_char_to_ix(train_dataset)
        train_iter = to_iter(train_dataset, sent_field.vocab.stoi['<pad>'], batch_size)
        val_iter = to_iter(val_dataset, sent_field.vocab.stoi['<pad>'], 1)
        return train_iter, val_iter, sent_field.vocab, tag_field.vocab, char_to_ix
    elif mode == TEST:
        sent_field.vocab = word_vocab
        tag_field.vocab = tag_vocab
        test_dataset = TabularDataset(path=os.path.join(data_path, 'test.csv'),
                                      format='csv', fields=data_fields,
                                      skip_header=True)
        test_iter = to_iter(test_dataset, word_to_ix['<pad>'], 1)
        return test_iter
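# Hedged usage sketch for create_datasets: the data directory is an assumption;
# TRAIN/TEST are the same mode constants used inside the function, and the test
# call reuses the vocabularies returned by the training call.
train_iter, val_iter, word_vocab, tag_vocab, char_to_ix = create_datasets(
    "data/ner", mode=TRAIN)  # hypothetical path
test_iter = create_datasets("data/ner", mode=TEST,
                            word_to_ix=word_vocab.stoi,
                            word_vocab=word_vocab,
                            tag_vocab=tag_vocab)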
def read_data_set(file_path, vocab):
    """
    Reads the data set from one of the pre-processed CSVs composed of
    columns `label` and `sentence`.

    Parameters
    ---
    file_path : str
        Path to the CSV file.
    vocab : torchtext.Vocab
        Vocabulary to use.

    Returns
    ---
    X : torch.Tensor[num_labels x num_examples x sen_length]
        Sentences on the dataset grouped by labels.
    y : torch.Tensor[num_labels]
        Labels for each group of sentences.
    """
    sentence = Field(batch_first=True, sequential=True, tokenize=simple_tokenizer)
    sentence.vocab = vocab
    label = Field(is_target=True)
    label.vocab = vocab
    data_set = TabularDataset(path=file_path, format='csv', skip_header=True,
                              fields=[('label', label), ('sentence', sentence)])

    sentences_tensor = sentence.process(data_set.sentence)
    labels_tensor = label.process(data_set.label).squeeze()

    # Infer num_labels and group sentences by label
    num_labels = labels_tensor.unique().shape[0]
    num_examples = labels_tensor.shape[0] // num_labels
    y = labels_tensor[::num_examples]
    sen_length = sentences_tensor.shape[-1]
    X = sentences_tensor.view(num_labels, num_examples, sen_length)

    return X, y
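# Hedged sketch of consuming read_data_set: the CSV path and the episode
# slicing below are illustrative assumptions, not from the original code.
X, y = read_data_set("data/train.csv", vocab)  # hypothetical path
print(X.shape)   # (num_labels, num_examples, sen_length)
print(y.shape)   # (num_labels,)
support_set = X[:, :5]  # e.g. take the first 5 sentences of every label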
def load_data_dict(experiment_name, langs, corpora_type, args, device,
                   src_field=None, trg_field=None):
    if src_field is None or trg_field is None:
        src_field = Field(tokenize=str.split, unk_token=UNK_WORD,
                          pad_token=PAD_WORD, init_token=BOS_WORD,
                          eos_token=EOS_WORD)
        trg_field = Field(tokenize=str.split, unk_token=UNK_WORD,
                          pad_token=PAD_WORD, init_token=BOS_WORD,
                          eos_token=EOS_WORD)
    fields = (src_field, trg_field)

    print('Loading src vocab')
    src_vocab = load_vocab(get_vocab_path(experiment_name, langs[0]))
    src_field.vocab = src_field.vocab_cls(
        src_vocab, specials=[UNK_WORD, PAD_WORD, BOS_WORD, EOS_WORD])

    print('Loading trg vocab')
    trg_vocab = load_vocab(get_vocab_path(experiment_name, langs[1]))
    trg_field.vocab = trg_field.vocab_cls(
        trg_vocab, specials=[UNK_WORD, PAD_WORD, BOS_WORD, EOS_WORD])

    args.src_pad_idx = src_field.vocab.stoi[PAD_WORD]
    args.trg_pad_idx = trg_field.vocab.stoi[PAD_WORD]
    args.trg_bos_idx = trg_field.vocab.stoi[BOS_WORD]
    args.trg_eos_idx = trg_field.vocab.stoi[EOS_WORD]
    args.src_vocab_size = len(src_field.vocab)
    args.trg_vocab_size = len(trg_field.vocab)

    print('Loading data')
    data, total_tokens = load_data(experiment_name=experiment_name, langs=langs,
                                   fields=fields, batch_size=args.batch_size,
                                   device=device, corpora_type=corpora_type,
                                   reduce_size=args.data_reduce_size)
    return data, total_tokens, src_field, trg_field
def make_fields(vocab_count, binary=True):
    text_field = Field(batch_first=True, include_lengths=True,
                       tokenize=lambda x: x.split(' '))
    text_field.vocab = Vocab(vocab_count['text'])

    char_nesting_field = Field(batch_first=True, tokenize=list)
    char_field = NestedField(char_nesting_field, tokenize=lambda x: x.split(' '))
    char_nesting_field.vocab = Vocab(vocab_count['chars'])
    char_field.vocab = Vocab(vocab_count['chars'])

    pos1_field = Field(batch_first=True, sequential=False, use_vocab=False)
    pos2_field = Field(batch_first=True, sequential=False, use_vocab=False)

    pos1_rel_field = Field(sequential=True, batch_first=True)
    pos1_rel_field.vocab = Vocab(vocab_count['pos1_rel'])
    pos2_rel_field = Field(sequential=True, batch_first=True)
    pos2_rel_field.vocab = Vocab(vocab_count['pos2_rel'])

    if binary:
        label_field = Field(sequential=False, batch_first=True)
    else:
        label_field = Field(sequential=False, batch_first=True)
        label_field.vocab = Vocab(vocab_count['relation'], specials=[])

    reltype_field = Field(batch_first=True, sequential=False)
    reltype_field.vocab = Vocab(vocab_count['rel_type'])

    fields_dict = {
        'text': [('text', text_field), ('chars', char_field)],
        'pos1': ('pos1', pos1_field),
        'pos2': ('pos2', pos2_field),
        'pos1_rel': ('pos1_rel', pos1_rel_field),
        'pos2_rel': ('pos2_rel', pos2_rel_field),
        'relation': ('relation', label_field),
        'rel_type': ('rel_type', reltype_field)
    }
    return fields_dict
def __init__(self, module_name, train_bs, eval_bs, device, log):
    self.module_name = module_name

    # split_chars = lambda x: list("".join(x.split()))
    split_chars = lambda x: list(x)  # keeps whitespaces

    source = Field(tokenize=split_chars, init_token='<sos>',
                   eos_token='<eos>', batch_first=True)
    target = Field(tokenize=split_chars, init_token='<sos>',
                   eos_token='<eos>', batch_first=True)

    log("Loading FULL datasets ...")
    folder = os.path.join(DATASET_TARGET_DIR, module_name)
    train_dataset, eval_dataset, _ = TranslationDataset.splits(
        path=folder, root=folder,
        exts=(INPUTS_FILE_ENDING, TARGETS_FILE_ENDING),
        fields=(source, target),
        train=TRAIN_FILE_NAME, validation=EVAL_FILE_NAME, test=EVAL_FILE_NAME)

    log("Building vocab ...")
    source.build_vocab(train_dataset)
    target.vocab = source.vocab

    log("Creating iterators ...")
    train_iterator = Iterator(dataset=train_dataset, batch_size=train_bs,
                              train=True, repeat=True, shuffle=True,
                              device=device)
    eval_iterator = Iterator(dataset=eval_dataset, batch_size=eval_bs,
                             train=False, repeat=False, shuffle=False,
                             device=device)

    self.train_dataset = train_dataset
    self.eval_dataset = eval_dataset
    self.train_iterator = train_iterator
    self.eval_iterator = eval_iterator
    self.source = source
    self.target = target
def load_naive_cl(args):
    """ Convenience function to load pickle or dataset """
    if args.tokenizer == 'spacy':
        maslow_text = Field(tokenize=tokenize_en, init_token='<sos>',
                            eos_token='<eos>', lower=True,
                            include_lengths=True, use_vocab=True)
        reiss_text = maslow_text
    elif args.tokenizer == 'raw':
        maslow_text = Field(tokenize=tokenize_raw, init_token='<sos>',
                            eos_token='<eos>', lower=True,
                            include_lengths=True, use_vocab=True)
        reiss_text = maslow_text
    elif args.tokenizer == 'gpt2':
        maslow_text = args.gpt_maslowfield
        reiss_text = args.gpt_reissfield

    # Maslow dataset
    maslow_label = Field(sequential=False, unk_token=None)
    maslow_path = ".data/stories/story_commonsense/torchtext_class/maslow/"
    maslow_iterators = load_naive_iterators(args, maslow_path,
                                            fields=(maslow_text, maslow_label))

    # Reiss dataset
    reiss_label = Field(sequential=False, unk_token=None)
    reiss_path = ".data/stories/story_commonsense/torchtext_class/reiss/"
    reiss_iterators = load_naive_iterators(args, reiss_path,
                                           fields=(reiss_text, reiss_label))

    # Load vocab used for previous model from pickle
    print(f"Found data pickle, loading from {args.prepared_data}")
    with open(args.prepared_data, 'rb') as p:
        d = pickle.load(p)
        combined_vocab = d["combined_vocab"]
        args.emb_dim = d["emb_dim"]
        loaded_vectors = d["loaded_vectors"]

    maslow_text.vocab = combined_vocab
    reiss_text.vocab = combined_vocab

    return maslow_iterators, reiss_iterators, maslow_text, loaded_vectors
def build_field_vocab(cls, field: Field, counter: Counter,
                      size_multiple: int = 1, **kwargs) -> None:
    # PN: original name was _build_field_vocab
    # this is basically copy-pasted from torchtext.
    all_specials = [
        field.unk_token, field.pad_token, field.init_token, field.eos_token
    ]
    specials = [tok for tok in all_specials if tok is not None]
    field.vocab = field.vocab_cls(counter, specials=specials, **kwargs)
    if size_multiple > 1:
        cls.pad_vocab_to_multiple(field.vocab, size_multiple)
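# Minimal sketch of calling build_field_vocab, assuming it is a classmethod on
# some owning dataset helper (called DatasetBase here purely for illustration).
from collections import Counter
from torchtext.data import Field

counter = Counter("the cat sat on the mat".split())
field = Field(batch_first=True)
DatasetBase.build_field_vocab(field, counter)  # hypothetical owning class
print(len(field.vocab))  # unique tokens plus <unk>/<pad> specials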
def predict(sentence, model_path):
    if not os.path.exists(model_path):
        raise Exception("Need to provide model path")
    model = Model(model_path)
    checkpoint = torch.load(model_path,
                            map_location=lambda storage, location: storage)
    vocab = checkpoint['vocab']

    target_field = Field(sequential=True, init_token=START_DECODING,
                         eos_token=STOP_DECODING, pad_token=PAD_TOKEN,
                         batch_first=True, include_lengths=True,
                         unk_token=UNKNOWN_TOKEN, lower=True)
    source_field = Field(sequential=True, init_token=SENTENCE_START,
                         eos_token=SENTENCE_END, pad_token=PAD_TOKEN,
                         batch_first=True, include_lengths=True,
                         unk_token=UNKNOWN_TOKEN, lower=True)
    source_field.vocab = vocab
    target_field.vocab = vocab

    data = [{'src': sentence, 'tgt': ''}]
    predict_data = Mydataset(data=data,
                             fields=(('source', source_field),
                                     ('target', target_field)))
    setattr(args, 'vectors', source_field.vocab.vectors)
    setattr(args, 'vocab_size', len(source_field.vocab.itos))
    # embedding dim taken from the vocab's pretrained vectors, assuming the
    # checkpoint vocab carries them
    setattr(args, 'emb_dim', source_field.vocab.vectors.shape[1])
def create_dataset(config: Config,
                   device: torch.device,
                   vocab: Vocab,
                   rics: List[str],
                   seqtypes: List[SeqType]) -> Iterator:

    fields = dict()
    fields[SeqType.ArticleID.value] = (SeqType.ArticleID.value, RawField())

    time_field = Field(use_vocab=False, batch_first=True, sequential=False)
    fields['jst_hour'] = (SeqType.Time.value, time_field)

    token_field = Field(use_vocab=True,
                        init_token=SpecialToken.BOS.value,
                        eos_token=SpecialToken.EOS.value,
                        pad_token=SpecialToken.Padding.value,
                        unk_token=SpecialToken.Unknown.value)
    fields['processed_tokens'] = (SeqType.Token.value, token_field)

    tensor_type = torch.FloatTensor if device.type == 'cpu' else torch.cuda.FloatTensor

    for (ric, seqtype) in itertools.product(rics, seqtypes):
        n = N_LONG_TERM if seqtype.value.endswith('long') else N_SHORT_TERM
        price_field = Field(use_vocab=False,
                            fix_length=n,
                            batch_first=True,
                            pad_token=0.0,
                            preprocessing=lambda xs: [float(x) for x in xs],
                            tensor_type=tensor_type)
        key = stringify_ric_seqtype(ric, seqtype)
        fields[key] = (key, price_field)

    # load an alignment for prediction
    predict = TabularDataset(path='output/alignment-predict.json',
                             format='json',
                             fields=fields)
    token_field.vocab = vocab

    # make an iterator for prediction
    return Iterator(predict,
                    batch_size=1,
                    device=-1 if device.type == 'cpu' else device,
                    repeat=False,
                    sort=False)
def __init__(self, data_file, vocab_file, batch_size=256):
    self.batch_size = batch_size
    smi_field = Field(sequential=True, init_token='<sos>', eos_token=' ',
                      pad_token=' ', include_lengths=True, batch_first=True,
                      tokenize=smi_tokenizer)
    property_field = Field(sequential=False, use_vocab=False)

    # load smile data
    with open(data_file, 'r') as f:
        mol_strs = f.read().strip().split('\n')
    mol_strs = [mol.replace(' ', '') for mol in mol_strs]
    mol_strs = [smi_field.preprocess(mol) for mol in mol_strs]

    smi_examples = []
    fields = [('smile', smi_field), ('property', property_field)]
    for mol in mol_strs:
        ex = Example.fromlist([mol, [1, 2, 3]], fields)
        smi_examples.append(ex)

    # load or build vocab
    if os.path.isfile(vocab_file):
        print('load vocab from:', vocab_file)
        smi_field.vocab = pickle.load(open(vocab_file, 'rb'))
    else:
        print('build and save vocab file:', vocab_file)
        smi_field.build_vocab(mol_strs)
        pickle.dump(smi_field.vocab, open(vocab_file, 'wb'), protocol=2)

    self.vocab = smi_field.vocab
    self.vocab_size = len(smi_field.vocab.itos)
    self.padding_idx = smi_field.vocab.stoi[smi_field.pad_token]
    self.sos_idx = smi_field.vocab.stoi[smi_field.init_token]
    self.eos_idx = smi_field.vocab.stoi[smi_field.eos_token]
    self.unk_idx = smi_field.vocab.stoi[smi_field.unk_token]

    self.dataset_smi = Dataset(smi_examples, fields=fields)
    self.train_smi = Dataset(smi_examples[:-5000], fields=fields)
    self.test_smi = Dataset(smi_examples[-5000:], fields=fields)
def test_loader(path):
    with open("Data/text.pickle", "rb") as fp:
        vocab = pickle.load(fp)
    tokenize = lambda x: x.split()
    TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
    TEXT.vocab = vocab
    tst_datafields = [("titles", TEXT)]
    tst = TabularDataset(path='Data/test.csv', format='csv',
                         skip_header=True, fields=tst_datafields)
    test_iter = Iterator(tst, batch_size=32, sort=False,
                         sort_within_batch=False, repeat=False)
    test_dl = BatchWrapper(test_iter, "titles", None)
    return test_dl, tst
def preprocess_couplet():
    SRC = Field(include_lengths=True, init_token="<sos>", eos_token="<eos>",
                pad_token="<pad>", unk_token="<unk>", lower=True,
                batch_first=False, tokenize=lambda text: text.split())
    TRG = Field(include_lengths=True, init_token="<sos>", eos_token="<eos>",
                pad_token="<pad>", unk_token="<unk>", lower=True,
                batch_first=False, tokenize=lambda text: text.split())
    _train, _test = TabularDataset.splits(
        path="data/couplet", root="data", train="train.tsv", test="test.tsv",
        format='csv', skip_header=False, fields=[("src", SRC), ("trg", TRG)],
        csv_reader_params={"quoting": csv.QUOTE_NONE, "delimiter": "\t"})
    SRC.build_vocab(_train.src, _train.trg, min_freq=1)
    TRG.vocab = SRC.vocab
    return _train, _test, SRC, TRG
def init_lm(config_path, state_path, model_cls_name: str):
    model_cls = MODEL_CLASSES[model_cls_name]
    hp = load_config(config_path).get('hp')
    get_path = create_get_path_fn(state_path)

    # Loading vocab
    field = Field(eos_token=EOS_TOKEN, batch_first=True,
                  tokenize=char_tokenize, pad_first=True)
    field.vocab = pickle.load(open(get_path('vocab', 'pickle'), 'rb'))

    print('Loading models..')
    device = None if torch.cuda.is_available() else 'cpu'

    if model_cls is RNNLM:
        lm = cudable(RNNLM(hp.model_size, field.vocab, n_layers=hp.n_layers)).eval()
        lm.load_state_dict(torch.load(get_path('lm'), map_location=device))
    elif model_cls is ConditionalLM:
        lm = cudable(ConditionalLM(hp.model_size, field.vocab)).eval()
        lm.load_state_dict(torch.load(get_path('lm'), map_location=device))
    elif model_cls is CharLMFromEmbs:
        rnn_lm = cudable(RNNLM(hp.model_size, field.vocab, n_layers=hp.n_layers))
        style_embed = cudable(nn.Embedding(2, hp.model_size))
        rnn_lm.load_state_dict(torch.load(get_path('lm'), map_location=device))
        style_embed.load_state_dict(
            torch.load(get_path('style_embed'), map_location=device))
        lm = cudable(CharLMFromEmbs(rnn_lm, style_embed, n_layers=hp.n_layers)).eval()
    else:
        raise NotImplementedError

    return lm, field
def load_dataset(config, device):
    label_dict = {"observing": 0, "against": 1, "for": 2}
    LABEL = Field(use_vocab=False, sequential=False, dtype=torch.long,
                  preprocessing=lambda x: label_dict[x.strip()])
    SEQ = Field(dtype=torch.long, lower=True, batch_first=True,
                preprocessing=lambda x: x[:45], include_lengths=True)
    SENT = Field(dtype=torch.long, lower=True, batch_first=True,
                 preprocessing=lambda x: x[:45], include_lengths=False)
    DOC = NestedField(SENT, tokenize=lambda s: s.strip().split(' </s> '),
                      preprocessing=lambda s: [x for x in s[:45] if x],
                      dtype=torch.long, include_lengths=True)

    fields = [('label', LABEL), ('claim', SEQ), ('hline', SEQ),
              ('abst', SEQ), ('body', DOC)]

    train, test = TabularDataset.splits(path="../stance_data/", format="tsv",
                                        fields=fields, train=config.train_file,
                                        test=config.test_file)
    train, val = train.split(split_ratio=0.80)
    vectors = GloVe(name="6B", dim=config.embed_dim,
                    cache='/users4/jwduan/vectors/')
    DOC.build_vocab(train, val, test, vectors=vectors)
    SEQ.build_vocab()
    SEQ.vocab = DOC.vocab

    config.vocab_size = len(DOC.vocab)
    train_loader, val_loader, test_loader = Iterator.splits(
        (train, val, test), batch_sizes=(config.batch_size, 256, 256),
        sort_key=lambda x: len(x.body), sort=True, device=device,
        shuffle=True, repeat=False)

    return (train_loader, val_loader, test_loader), DOC.vocab.vectors
def reformat_data(data, data_torchaudio, trg_min_freq, trg_max_size, tok_fun,
                  trg_vocab_file=None, trg_vocab=None, lowercase=True):
    train_iter = data
    src_field = Noprocessfield(sequential=False, use_vocab=False,
                               dtype=torch.double, include_lengths=True)
    trg_field = Field(init_token=BOS_TOKEN, eos_token=EOS_TOKEN,
                      pad_token=PAD_TOKEN, tokenize=tok_fun,
                      unk_token=UNK_TOKEN, batch_first=True,
                      lower=lowercase, include_lengths=True)

    if trg_vocab is None:
        trg_vocab = build_vocab(min_freq=trg_min_freq, max_size=trg_max_size,
                                dataset=data_torchaudio, trg_field=trg_field,
                                vocab_file=trg_vocab_file)
    trg_field.vocab = trg_vocab

    entry_list = []
    for i, batch in enumerate(iter(train_iter)):  # reactivate training
        entry_list.append(Entry(batch[0][0].squeeze(), batch[0][1]))
    train_data = Dataset(entry_list, [('src', src_field), ('trg', trg_field)])
    return train_data, trg_vocab, src_field, trg_field
def getData():
    german = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>",
                   eos_token="<eos>", pad_token="<pad>", unk_token="<unk>")
    english = Field(tokenize=tokenize_eng, lower=True, init_token="<sos>",
                    eos_token="<eos>", pad_token="<pad>", unk_token="<unk>")

    print("===============================before ")
    train_data, valid_data, test_data = Multi30k.splits(
        exts=(".ennsw", ".en"), fields=(german, english),
        # root='.data',
        train='train', validation='val', test='test2016',
        path='.data/multi30k')

    # train_data, valid_data, test_data = Multi30k.splits(
    #     exts=(".tgtnsw", ".tgt"), fields=(german, english),
    #     # root='.data',
    #     train='train', validation='valid', test='test',
    #     path='/data/chaudhryz/uwstudent1/data_zaid_short')

    # Example sentence pair:
    # The study's questions are carefully worded and chosen.
    # The study questions were carefully worded and chosen.

    # train_data, valid_data, test_data = Multi30k.splits(
    #     exts=(".src", ".tgt"), fields=(german, english),
    #     # root='.data',
    #     train='train', validation='valid', test='test',
    #     path='/data/chaudhryz/uwstudent1/GDATA')

    # german.build_vocab(train_data, max_size=10000, min_freq=2)
    # english.build_vocab(train_data, max_size=10000, min_freq=2)
    # german.vocab.init_token = "<sos>"
    # german.vocab.eos_token = "<eos>"
    # english.vocab.init_token = "<sos>"
    # english.vocab.eos_token = "<eos>"

    # print("Train")
    # for i in range(10):
    #     # print(train_data[i].src, train_data[i].trg)
    #     printSent(train_data[i].src)
    #     printSent(train_data[i].trg)

    # print("Test")
    # for i in range(10):
    #     # print(train_data[i].src, train_data[i].trg)
    #     printSent(test_data[i].src)
    #     printSent(test_data[i].trg)
    # exit()

    # a = {'GermanVocab': german.vocab, 'EnglishVocab': english.vocab}
    # with open('filename.pickle', 'wb') as handle:
    #     pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # load the previously pickled vocabs
    with open('filename.pickle', 'rb') as handle:
        b = pickle.load(handle)
    german.vocab = b['GermanVocab']
    english.vocab = b['EnglishVocab']

    return german.vocab, english.vocab, train_data, valid_data, test_data
    target.append(labels[file[:-4]])
    return list(zip(sentences, target))


# Define the Fields
TEXT = Field(sequential=True, tokenize=lambda x: jb.lcut(x), lower=True, use_vocab=True)
LABEL = Field(sequential=False, use_vocab=False)
FIELDS = [('text', TEXT), ('category', LABEL)]

# Load the pre-built Chinese vocabulary
with open("vocab.pkl", 'rb') as vocab:
    TEXT.vocab = pickle.load(vocab)

# ----------------------------- Please load the model you are most satisfied with -------------------------------
# Load the model (load whichever one you consider your best).
# Note that model_path is a relative path, at the same level as this file.
# For example, if your model is temp.pth under the results folder, then model_path = 'results/temp.pth'.

# Create the model instance
vocab_size = len(TEXT.vocab)
model = Net(vocab_size)
model_path = "results/model.pth"
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))


# ------------------------- Do not modify the inputs and outputs of the predict function -------------------------
def predict(text):
def train_data():
    tokenize = lambda x: x.split()
    Text_src = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>',
                     include_lengths=True, lower=True)
    Answer = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>',
                   include_lengths=True, lower=True)
    Text_tgt = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>',
                     include_lengths=True, init_token='<SOS>', lower=True)

    trn_datafields = [("source", Text_src),
                      ("target", Text_tgt),
                      ("answer", Answer)]
    trn, val = TabularDataset.splits(
        path="../data/" + str(data_name),  # the root directory where the data lies
        train='train.json',
        validation='validation.json',
        format='json',
        # skip_header=True,  # if your csv has a header, pass this so it doesn't get processed as data!
        fields={'source': trn_datafields[0],
                'target': trn_datafields[1],
                'answer': trn_datafields[2]})

    # Text_src.build_vocab(trn, max_size=vocab_size)
    Text_src.build_vocab(trn, max_size=src_vocab_size)
    Text_tgt.build_vocab(trn, max_size=tgt_vocab_size)
    Answer.build_vocab(trn)
    Text_src.vocab.load_vectors("glove.840B.300d")
    Text_tgt.vocab.load_vectors("glove.840B.300d")

    train_iter, val_iter = BucketIterator.splits(
        (trn, val),  # the datasets we want the iterator to draw data from
        batch_sizes=(batch_size, batch_size),
        device=-1,  # if you want to use the GPU, specify the GPU number here
        sort_key=lambda x: len(x.source),  # how the BucketIterator should group the data
        sort_within_batch=True,
        shuffle=True,
        repeat=False)

    Text_tgt_r = ReversibleField(sequential=True, include_lengths=True,
                                 eos_token='<EOS>', init_token='<SOS>', lower=True)
    Text_tgt_r.vocab = Text_tgt.vocab
    Text_src_r = ReversibleField(sequential=True, include_lengths=True,
                                 eos_token='<EOS>', lower=True)
    Text_src_r.vocab = Text_src.vocab
    Text_ans_r = ReversibleField(sequential=True, tokenize=tokenize,
                                 eos_token='<EOS>', include_lengths=True, lower=True)
    Text_ans_r.vocab = Answer.vocab

    src_pad = Text_src.vocab.stoi['<pad>']
    src_unk = Text_src.vocab.stoi['<unk>']
    src_eos = Text_src.vocab.stoi['<EOS>']
    src_special = [src_pad, src_unk, src_eos]

    ans_pad = Answer.vocab.stoi['<pad>']
    ans_unk = Answer.vocab.stoi['<unk>']
    ans_eos = Answer.vocab.stoi['<EOS>']
    ans_special = [ans_pad, ans_unk, ans_eos]

    tgt_pad = Text_tgt.vocab.stoi['<pad>']
    tgt_unk = Text_tgt.vocab.stoi['<unk>']
    tgt_eos = Text_tgt.vocab.stoi['<EOS>']
    tgt_sos = Text_tgt.vocab.stoi['<SOS>']
    tgt_special = [tgt_pad, tgt_unk, tgt_eos, tgt_sos]

    # discriminator data iterator
    passage = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>',
                    include_lengths=True, lower=True)
    ans = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>',
                include_lengths=True, lower=True)
    ques = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>',
                 include_lengths=True, lower=True)
    target = Field(sequential=False, use_vocab=False)
    disc_trn_datafields = [("question", ques), ("answer", ans),
                           ("passage", passage), ("target", target)]
    disc_trn = TabularDataset(
        path="../data/" + str(data_name) + "/disc.json",  # the root directory where the data lies
        # train='disc.json',
        format='json',
        # skip_header=True,  # if your csv has a header, pass this so it doesn't get processed as data!
        fields={'question': disc_trn_datafields[0],
                'answer': disc_trn_datafields[1],
                'passage': disc_trn_datafields[2],
                'target': disc_trn_datafields[3]})
    passage.vocab = Text_src.vocab
    ans.vocab = Answer.vocab
    ques.vocab = Text_tgt.vocab

    disc_train_iter = BucketIterator(
        dataset=disc_trn,  # the dataset we want the iterator to draw data from
        batch_size=batch_size,
        device=-1,  # if you want to use the GPU, specify the GPU number here
        sort_key=lambda x: len(x.question),  # how the BucketIterator should group the data
        sort_within_batch=True,
        shuffle=True,
        repeat=False)

    # raw data iterator
    Text_tgt_raw = ReversibleField(sequential=True, tokenize=tokenize,
                                   include_lengths=True, lower=True)
    trn_datafields = [("source", Text_tgt_raw), ("target", Text_tgt_raw)]
    trn_raw, val_raw = TabularDataset.splits(
        path="../data/" + str(data_name),  # the root directory where the data lies
        train='train.json',
        validation='validation.json',
        format='json',
        # skip_header=True,  # if your csv has a header, pass this so it doesn't get processed as data!
        fields={'source': trn_datafields[0], 'target': trn_datafields[1]})
    Text_tgt_raw.build_vocab(val_raw)

    train_iter_raw, val_iter_raw = BucketIterator.splits(
        (trn_raw, val_raw),  # the datasets we want the iterator to draw data from
        batch_sizes=(batch_size, batch_size),
        device=-1,  # if you want to use the GPU, specify the GPU number here
        sort_key=lambda x: len(x.source),  # how the BucketIterator should group the data
        sort_within_batch=True,
        shuffle=True,
        repeat=False)

    return train_iter, val_iter, src_special, tgt_special, Text_tgt_r, val_iter_raw, \
        Text_tgt_raw, Text_src_r, Text_src, Text_tgt, ans_special, Text_ans_r, disc_train_iter
def __init__(self,
             path,
             batch_size,
             extensions=(".src", ".trg"),
             src_vocab: Vocab = Ref("model.src_vocab"),
             trg_vocab: Vocab = Ref("model.trg_vocab"),
             level=Ref("model.level"),
             sort=False,
             sort_within_batch=False,
             batch_by_words=True,
             batch_first=Ref("model.batch_first", True),
             multiple: int = Ref("exp_global.multiple", 1),
             max_len=1000,
             subword_model=None,
             subword_alpha=0.1,
             subword_nbest=64):
    self.max_len = max_len
    self.src_vocab = src_vocab
    self.trg_vocab = trg_vocab

    tokenize = None
    if level != "word":
        tokenize = list
    if subword_model is not None:
        import sentencepiece as spm
        self.subword_model = spm.SentencePieceProcessor()
        self.subword_model.load(subword_model)
        tokenize = self.split_subwords
    else:
        self.subword_model = None
    self.subword_alpha = subword_alpha
    self.subword_nbest = subword_nbest

    logger.info(f"Loading {path}")
    src = Field(batch_first=batch_first, tokenize=tokenize,
                include_lengths=True, preprocessing=None,
                postprocessing=self.postprocess_src)
    src.vocab = src_vocab

    if os.path.exists(os.path.expanduser(path + extensions[1])):
        has_target = True
        trg = Field(batch_first=batch_first, tokenize=tokenize,
                    include_lengths=True, init_token=src_vocab.bos_token,
                    eos_token=trg_vocab.eos_token, is_target=True,
                    preprocessing=None, postprocessing=self.postprocess_trg)
        trg.vocab = trg_vocab
        fields = [('src', src), ('trg', trg)]
        TorchTextDataset.__init__(
            self,
            self.load_parallel_data(os.path.expanduser(path + extensions[0]),
                                    os.path.expanduser(path + extensions[1]),
                                    fields),
            fields)
    else:
        has_target = False
        fields = [('src', src)]
        TorchTextDataset.__init__(
            self,
            self.load_source_data(os.path.expanduser(path + extensions[0]),
                                  fields[0]),
            fields)

    BaseTranslationDataset.__init__(self, batch_size, level, sort,
                                    sort_within_batch, batch_by_words,
                                    batch_first, multiple, has_target)
# %%
# region Dataset & DataLoader
dataset = Dataset(examples, fields)
train, valid, test = dataset.split(split_ratio=[0.6, 0.2, 0.2],
                                   stratified=False, strata_field='label')

# vocab
vectors = GloVe(name='6B', dim=300)
source = [
    getattr(dataset, item)
    for item in ['title', 'abstr', 'intro', 'relat', 'metho', 'concl']
]
TITLE.build_vocab(source, vectors=vectors, max_size=opt.vocab_size)
TITLE.vocab.vectors.unk_init = init.xavier_uniform
ABSTR.vocab = TITLE.vocab
INTRO.vocab = TITLE.vocab
RELAT.vocab = TITLE.vocab
METHO.vocab = TITLE.vocab
CONCL.vocab = TITLE.vocab
AUTHO.build_vocab(train, max_size=1600)

# Iterator
if not opt.notrain:
    train_iter, valid_iter = BucketIterator.splits((train, valid),
                                                   batch_size=opt.batch_size,
                                                   sort=False)
test_iter = BucketIterator(test, batch_size=opt.batch_size, sort=False,
                           train=False,
def __init__(self, data_file, vocab_file, batch_size=256, property_column=None):
    self.batch_size = batch_size
    smi_field = Field(sequential=True, init_token='<sos>', eos_token=' ',
                      pad_token=' ', include_lengths=True, batch_first=True,
                      tokenize=smi_tokenizer)
    property_field = Field(sequential=False, use_vocab=False, dtype=torch.float)

    # load smile data
    # with open(data_file, 'r') as f:
    #     mol_strs = f.read().strip().split('\n')
    # mol_strs = [mol.replace(' ', '') for mol in mol_strs]
    # mol_strs = [smi_field.preprocess(mol) for mol in mol_strs]
    mol_strs = []
    smi_examples = []
    fields = [('smile', smi_field), ('property', property_field)]
    for index, row in data_file.iterrows():
        mol_str = smi_field.preprocess(row['smiles'])
        # prop_str = property_field.preprocess(row[property_column].tolist())
        if property_column is not None:
            ex = Example.fromlist([mol_str, row[property_column].tolist()], fields)
        else:
            ex = Example.fromlist([mol_str, [0]], fields)
        mol_strs.append(mol_str)
        smi_examples.append(ex)

    # load or build vocab
    if os.path.isfile(vocab_file):
        print('load vocab from:', vocab_file)
        smi_field.vocab = pickle.load(open(vocab_file, 'rb'))
    else:
        print('build and save vocab file:', vocab_file)
        smi_field.build_vocab(mol_strs)
        pickle.dump(smi_field.vocab, open(vocab_file, 'wb'), protocol=2)

    self.dset_num = len(mol_strs)
    # self.dset_test_num = int(self.dset_num * 0.2)
    self.vocab = smi_field.vocab
    self.vocab_size = len(smi_field.vocab.itos)
    if property_column is not None:
        self.prop_size = len(property_column)
    else:
        self.prop_size = 0
    self.padding_idx = smi_field.vocab.stoi[smi_field.pad_token]
    self.sos_idx = smi_field.vocab.stoi[smi_field.init_token]
    self.eos_idx = smi_field.vocab.stoi[smi_field.eos_token]
    self.unk_idx = smi_field.vocab.stoi[smi_field.unk_token]

    self.dataset_smi = Dataset(smi_examples, fields=fields)
    self.train_smi, self.test_smi = self.dataset_smi.split(0.8)
    self.dset_test_num = len(self.test_smi)
def load_dataset(hparams, is_eval=False, test_data_path=None):
    batch_size = hparams.batch_size
    max_copy_token_num = hparams.max_copy_token_num
    pointer_copy_tokens = hparams.max_copy_token_num if hparams.copy else 0

    def tokenize(text):
        return text.strip('\r').split()

    src_field = Field(tokenize=tokenize, include_lengths=True,
                      init_token='<ssos>', eos_token='<seos>')
    tgt_field = Field(tokenize=tokenize, include_lengths=True,
                      init_token='<sos>', eos_token='<eos>')
    fields = [('src', src_field), ('tgt', tgt_field)]

    if not hparams.share_vocab:
        logger.info('[VOCAB] Constructing two vocabs for the src and tgt')
        logger.info('[VOCAB] Loading src vocab from: %s' % hparams.src_vocab_path)
        load_vocab(hparams.src_vocab_path, src_field)
        logger.info('[VOCAB] src vocab size: %d' % len(src_field.vocab.itos))
        logger.info('[VOCAB] Loading tgt vocab from: %s' % hparams.tgt_vocab_path)
        load_vocab(hparams.tgt_vocab_path, tgt_field,
                   pointer_copy_tokens=pointer_copy_tokens)
        logger.info('[VOCAB] tgt vocab size: %d' % len(tgt_field.vocab.itos))
    else:
        logger.info('[VOCAB] Constructing a sharing vocab for the src and tgt')
        logger.info('[VOCAB] Loading src&tgt vocab from: %s' % hparams.src_vocab_path)
        load_vocab(hparams.vocab_path, src_field,
                   pointer_copy_tokens=pointer_copy_tokens,
                   special_tokens=[tgt_field.unk_token, tgt_field.pad_token,
                                   tgt_field.init_token, tgt_field.eos_token])
        tgt_field.vocab = src_field.vocab
        logger.info('[VOCAB] src vocab size: %d' % len(src_field.vocab.itos))
        logger.info('[VOCAB] tgt vocab size: %d' % len(tgt_field.vocab.itos))

    def sort_key(x):
        return len(x.tgt) + len(x.src) * 100

    device = 'cuda' if hparams.cuda else 'cpu'

    val, max_val_len = get_dataset(hparams.val_data_path_prefix, fields=fields,
                                   max_src_len=hparams.max_src_len,
                                   max_tgt_len=hparams.max_tgt_len,
                                   pointer_copy=hparams.copy,
                                   word_freq_dict=src_field.vocab.stoi)
    test, max_test_len = get_dataset(
        hparams.test_data_path_prefix if not test_data_path else test_data_path,
        fields=fields, pointer_copy=hparams.copy,
        max_src_len=hparams.max_src_len, max_tgt_len=hparams.max_tgt_len,
        word_freq_dict=src_field.vocab.stoi)
    if hparams.copy:
        assert max_val_len + 1 < max_copy_token_num, max_val_len
        assert max_test_len + 1 < max_copy_token_num, max_test_len

    if not is_eval:
        logger.info('[DATASET] Training Mode')
        train, max_train_len = get_dataset(hparams.train_data_path_prefix,
                                           fields=fields,
                                           pointer_copy=hparams.copy,
                                           max_src_len=hparams.max_src_len,
                                           max_tgt_len=hparams.max_tgt_len,
                                           word_freq_dict=src_field.vocab.stoi)
        if hparams.copy:
            assert max_train_len + 1 < max_copy_token_num, max_train_len
        train_iter = BucketIterator(train, batch_size=batch_size, repeat=False,
                                    shuffle=True, sort_key=sort_key, sort=False,
                                    train=True, sort_within_batch=True,
                                    device=device)
        val_iter = BucketIterator(val, batch_size=batch_size, repeat=False,
                                  shuffle=True, sort_key=sort_key, sort=False,
                                  train=False, sort_within_batch=True,
                                  device=device)
        test_iter = Iterator(test, batch_size=batch_size, repeat=False,
                             shuffle=False, sort_key=sort_key, sort=False,
                             train=False, sort_within_batch=False,
                             device=device)
        return train_iter, val_iter, test_iter, src_field, tgt_field
    else:
        logger.info('[DATASET] Eval/Inference Mode')
        val_iter = Iterator(val, batch_size=batch_size, repeat=False,
                            shuffle=False, sort_key=sort_key, sort=False,
                            train=False, sort_within_batch=False, device=device)
        test_iter = Iterator(test, batch_size=batch_size, repeat=False,
                             shuffle=False, sort_key=sort_key, sort=False,
                             train=False, sort_within_batch=False, device=device)
        return None, val_iter, test_iter, src_field, tgt_field
        validation=val_path, test=test_path, format='tsv', fields=data_fields)

    # TR.build_vocab(train, min_freq=MIN_FREQ, max_size=params.vocab_size)
    # EN.build_vocab(train, min_freq=MIN_FREQ, max_size=params.vocab_size)
    print("Building vocab...")
    MIN_FREQ = 1
    if USE_NEW_DOUBLE_TR and args.bpe:
        TR_CONTEXT.build_vocab(train.src, train.src_context,
                               min_freq=MIN_FREQ, max_size=VOCAB_SIZE)
        TR_SRC.vocab = TR_CONTEXT.vocab
        TR = TR_SRC
    else:
        TR.build_vocab(train, min_freq=MIN_FREQ, max_size=VOCAB_SIZE)
    EN.build_vocab(train, min_freq=MIN_FREQ, max_size=VOCAB_SIZE)
    pad_idx = EN.vocab.stoi[PAD]

    print('making validation iterator')
    valid_iter = Iterator(val, batch_size=BATCH_SIZE, device=device,
                          repeat=False, sort=False, train=False)
    print('done')
import json
import string
from pathlib import Path
from typing import Tuple

from torchtext.data import Field, RawField

import numpy as np

from utils.entities_list import Entities_list
from utils.class_utils import keys_vocab_cls, iob_labels_vocab_cls, entities_vocab_cls

MAX_BOXES_NUM = 130  # limit the max number of boxes per document
MAX_TRANSCRIPT_LEN = 70  # limit the max text length per box

# text string label converter
TextSegmentsField = Field(sequential=True, use_vocab=True,
                          include_lengths=True, batch_first=True)
TextSegmentsField.vocab = keys_vocab_cls

# iob string label converter
IOBTagsField = Field(sequential=True, is_target=True,
                     use_vocab=True, batch_first=True)
IOBTagsField.vocab = iob_labels_vocab_cls


class Document:
    def __init__(self, boxes_and_transcripts_file: Path, image_file: Path,
                 resized_image_size: Tuple[int, int] = (560, 784),
                 iob_tagging_type: str = 'box_level',
                 entities_file: Path = None,
                 training: bool = True,
                 image_index=None,
                 max_boxes_num=MAX_BOXES_NUM,
                 max_transcript_len=MAX_TRANSCRIPT_LEN):
        '''
        :param boxes_and_transcripts_file: gt or ocr results file
        :param image_file: whole images file
        :param resized_image_size: resize whole image size, (w, h)
def translate(cfg_file, ckpt: str, output_path: str = None) -> None:
    """
    Interactive translation function.
    Loads model from checkpoint and translates either the stdin input or
    asks for input to translate interactively.
    The input has to be pre-processed according to the data that the model
    was trained on, i.e. tokenized or split into subwords.
    Translations are printed to stdout.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output file
    """

    def _load_line_as_data(line):
        """ Create a dataset from one line via a temporary file. """
        # write src input to temporary file
        tmp_name = "tmp"
        tmp_suffix = ".src"
        tmp_filename = tmp_name + tmp_suffix
        with open(tmp_filename, "w") as tmp_file:
            tmp_file.write("{}\n".format(line))

        test_data = MonoDataset(path=tmp_name, ext=tmp_suffix, field=src_field)

        # remove temporary file
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)

        return test_data

    logger = make_logger()

    def _translate_data(test_data):
        """ Translates given dataset, using parameters from outer scope. """
        # pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
            hypotheses_raw, attention_scores = validate_on_data(
                model, data=test_data, batch_size=batch_size,
                batch_type=batch_type, level=level,
                max_output_length=max_output_length, eval_metric="",
                use_cuda=use_cuda, loss_function=None, beam_size=beam_size,
                beam_alpha=beam_alpha, logger=logger)
        return hypotheses

    cfg = load_config(cfg_file)

    # when checkpoint is not specified, take oldest from model dir
    if ckpt is None:
        model_dir = cfg["training"]["model_dir"]
        ckpt = get_latest_checkpoint(model_dir)

    batch_size = cfg["training"].get("eval_batch_size",
                                     cfg["training"].get("batch_size", 1))
    batch_type = cfg["training"].get("eval_batch_type",
                                     cfg["training"].get("batch_type", "sentence"))
    use_cuda = cfg["training"].get("use_cuda", False)
    level = cfg["data"]["level"]
    max_output_length = cfg["training"].get("max_output_length", None)

    # read vocabs
    src_vocab_file = cfg["data"].get(
        "src_vocab", cfg["training"]["model_dir"] + "/src_vocab.txt")
    trg_vocab_file = cfg["data"].get(
        "trg_vocab", cfg["training"]["model_dir"] + "/trg_vocab.txt")
    src_vocab = Vocabulary(file=src_vocab_file)
    trg_vocab = Vocabulary(file=trg_vocab_file)

    data_cfg = cfg["data"]
    level = data_cfg["level"]
    lowercase = data_cfg["lowercase"]

    tok_fun = lambda s: list(s) if level == "char" else s.split()

    src_field = Field(init_token=None, eos_token=EOS_TOKEN,
                      pad_token=PAD_TOKEN, tokenize=tok_fun,
                      batch_first=True, lower=lowercase,
                      unk_token=UNK_TOKEN, include_lengths=True)
    src_field.vocab = src_vocab

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    # whether to use beam search for decoding, <2: greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 1)
        beam_alpha = cfg["testing"].get("alpha", -1)
    else:
        beam_size = 1
        beam_alpha = -1

    if not sys.stdin.isatty():
        # input file given
        test_data = MonoDataset(path=sys.stdin, ext="", field=src_field)
        hypotheses = _translate_data(test_data)

        if output_path is not None:
            # write to output file if given
            output_path_set = "{}".format(output_path)
            with open(output_path_set, mode="w", encoding="utf-8") as out_file:
                for hyp in hypotheses:
                    out_file.write(hyp + "\n")
            logger.info("Translations saved to: %s.", output_path_set)
        else:
            # print to stdout
            for hyp in hypotheses:
                print(hyp)
    else:
        # enter interactive mode
        batch_size = 1
        batch_type = "sentence"
        while True:
            try:
                src_input = input("\nPlease enter a source sentence "
                                  "(pre-processed): \n")
                if not src_input.strip():
                    break

                # every line has to be made into dataset
                test_data = _load_line_as_data(line=src_input)

                hypotheses = _translate_data(test_data)
                print("JoeyNMT: {}".format(hypotheses[0]))

            except (KeyboardInterrupt, EOFError):
                print("\nBye.")
                break
def __init__(self, module_name, train_bs, eval_bs, device, vocab=None,
             base_folder=None, train_name=None, eval_name=None, x_ext=None,
             y_ext=None, tokens=None, specials=None, tokenizer=None,
             sort_within_batch=None, shuffle=None):
    self.module_name = module_name

    # split_chars = lambda x: list("".join(x.split()))
    split_chars = lambda x: list(x)  # keeps whitespaces
    if not tokenizer:
        tokenizer = split_chars

    # NOTE: on Jul-20-2020, removed fix_length=200 since it forces
    # all batches to be of size (batch_size, 200) which
    # really wastes GPU memory
    source = Field(tokenize=tokenizer, init_token='<sos>',
                   eos_token='<eos>', batch_first=True)
    target = Field(tokenize=tokenizer, init_token='<sos>',
                   eos_token='<eos>', batch_first=True)

    base_folder = os.path.expanduser(base_folder)
    folder = os.path.join(base_folder, module_name)
    # fix slashes
    folder = os.path.abspath(folder)
    print("loading FULL datasets from folder={}".format(folder))

    train_dataset, eval_dataset, _ = TranslationDataset.splits(
        path=folder, root=folder, exts=(x_ext, y_ext),
        fields=(source, target),
        train=train_name, validation=eval_name, test=eval_name)

    if vocab:
        print("Setting vocab to prebuilt file...")
        source.vocab = vocab
        target.vocab = vocab
    elif tokens:
        print("Building vocab from tokens...")
        # source.build_vocab(tokens, specials)
        counter = Counter(tokens)
        source.vocab = source.vocab_cls(counter, specials=specials)
        target.vocab = source.vocab
    else:
        print("Building vocab from TRAIN and EVAL datasets...")
        source.build_vocab(train_dataset, eval_dataset)
        target.vocab = source.vocab

    print("Creating iterators ...")
    do_shuffle = True if shuffle is None else shuffle
    train_iterator = Iterator(dataset=train_dataset, batch_size=train_bs,
                              train=True, repeat=True, shuffle=do_shuffle,
                              sort_within_batch=sort_within_batch,
                              device=device)
    eval_iterator = Iterator(dataset=eval_dataset, batch_size=eval_bs,
                             train=False, repeat=False, shuffle=False,
                             sort_within_batch=sort_within_batch,
                             device=device)

    self.train_dataset = train_dataset
    self.eval_dataset = eval_dataset
    self.train_iterator = train_iterator
    self.eval_iterator = eval_iterator
    self.source = source
    self.target = target