def _get_embtype(self, emb_type):
    # set up preinitialized embeddings
    try:
        import torchtext.vocab as vocab
    except ImportError as ex:
        print('Please install torchtext with `pip install torchtext`')
        raise ex
    pretrained_dim = 300
    if emb_type.startswith('glove'):
        if 'twitter' in emb_type:
            init = 'glove-twitter'
            name = 'twitter.27B'
            pretrained_dim = 200
        else:
            init = 'glove'
            name = '840B'
        embs = vocab.GloVe(
            name=name, dim=pretrained_dim,
            cache=modelzoo_path(self.opt.get('datapath'),
                                'models:glove_vectors'))
    elif emb_type.startswith('fasttext'):
        init = 'fasttext'
        embs = vocab.FastText(
            language='en',
            cache=modelzoo_path(self.opt.get('datapath'),
                                'models:fasttext_vectors'))
    else:
        raise RuntimeError('embedding type {} not implemented. check arg, '
                           'submit PR to this function, or override it.'
                           ''.format(emb_type))
    return embs, init
def _get_embtype(self, emb_type):
    # set up preinitialized embeddings
    try:
        import torchtext.vocab as vocab
    except ImportError as ex:
        print('Please install torchtext with `pip install torchtext`')
        raise ex
    pretrained_dim = 300
    if emb_type.startswith('glove'):
        if 'twitter' in emb_type:
            init = 'glove-twitter'
            name = 'twitter.27B'
            pretrained_dim = 200
        else:
            init = 'glove'
            name = '840B'
        embs = vocab.GloVe(
            name=name, dim=pretrained_dim,
            cache=modelzoo_path(self.opt.get('datapath'),
                                'models:glove_vectors'))
    elif emb_type.startswith('fasttext_cc'):
        init = 'fasttext_cc'
        embs = vocab.FastText(
            language='en',
            cache=modelzoo_path(self.opt.get('datapath'),
                                'models:fasttext_cc_vectors'))
    elif emb_type.startswith('fasttext'):
        init = 'fasttext'
        embs = vocab.FastText(
            language='en',
            cache=modelzoo_path(self.opt.get('datapath'),
                                'models:fasttext_vectors'))
    else:
        # emb_type does not match any of the embedding types above, so treat
        # it as a file path to an embedding file; if it is not a valid file,
        # raise an error.
        assert os.path.isfile(emb_type), \
            'emb_type: {} does not match any known embedding type '.format(emb_type) + \
            'and is not a path to an embedding file!'
        init = os.path.basename(emb_type)
        cache = '.vector_cache'
        if not os.path.exists(cache):
            os.makedirs(cache)
        embs = vocab.Vectors(emb_type, cache=cache)
    return embs, init
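# A minimal sketch of how the (embs, init) pair returned by a dispatcher like
# _get_embtype above can be copied into a model's lookup table. This is an
# illustration, not part of the snippets: the `tok2ind` dictionary layout is
# borrowed from them, while `build_embedding_layer` is a hypothetical helper;
# only `embs.stoi`, `embs.vectors`, and `embs.dim` are real torchtext attributes.
import torch
import torch.nn as nn


def build_embedding_layer(tok2ind, embs, freeze=False):
    """Build an nn.Embedding whose rows come from a torchtext Vectors object
    for known tokens and stay at zero for out-of-vocabulary tokens."""
    weight = torch.zeros(len(tok2ind), embs.dim)
    hits = 0
    for word, idx in tok2ind.items():
        if word in embs.stoi:
            weight[idx] = embs.vectors[embs.stoi[word]]
            hits += 1
    print('matched {}/{} tokens against pretrained vectors'.format(hits, len(tok2ind)))
    return nn.Embedding.from_pretrained(weight, freeze=freeze)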
def get_embedding(self, name, embedding_dim):
    if name == 'glove':
        pretrained_type = vocab.GloVe(name='42B', dim=embedding_dim)
    elif name == 'fasttext':
        if embedding_dim != 300:
            raise ValueError("Got embedding dim {}, expected size 300".format(embedding_dim))
        pretrained_type = vocab.FastText('en')
    embedding_len = len(self)
    weights = np.zeros((embedding_len, embedding_dim))
    words_found = 0
    for word, index in self.word2idx.items():
        try:
            # torchtext Vectors.__getitem__ maps unknown words to a zero
            # vector, so index stoi directly to get a KeyError instead
            weights[index] = pretrained_type.vectors[pretrained_type.stoi[word]]
            words_found += 1
        except KeyError:
            if index == 0:
                continue
            weights[index] = np.random.normal(scale=0.6, size=(embedding_dim,))
    print(embedding_len - words_found, "words missing from pretrained")
    return torch.from_numpy(weights).float()
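# A hedged usage sketch for the weight matrix a method like get_embedding
# returns: load it into an nn.Embedding and embed a batch of token ids. The
# call to get_embedding is commented out because it depends on the vocabulary
# class above; the random stand-in matrix is only there to keep this runnable.
import torch
import torch.nn as nn

# weights = my_vocab.get_embedding('glove', 300)   # (vocab_size, 300) FloatTensor
weights = torch.randn(1000, 300)                   # stand-in for the real matrix
embedding = nn.Embedding.from_pretrained(weights, freeze=False)

token_ids = torch.tensor([[1, 5, 42, 0]])          # a padded batch of indices
embedded = embedding(token_ids)                    # shape (1, 4, 300)
print(embedded.shape)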
def __init__(self, opt, shared=None):
    """Set up model if shared params not set, otherwise no work to do."""
    super().__init__(opt, shared)
    opt = self.opt  # there is a deepcopy in the init

    # all instances may need some params
    self.truncate = opt['truncate'] if opt['truncate'] > 0 else None
    self.history = {}
    self.states = {}

    # check for cuda
    self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available()

    if shared:
        # set up shared properties
        self.dict = shared['dict']
        self.START_IDX = shared['START_IDX']
        self.END_IDX = shared['END_IDX']
        self.NULL_IDX = shared['NULL_IDX']
        # answers contains a batch_size list of the last answer produced
        self.answers = shared['answers']
        if 'model' in shared:
            # model is shared during hogwild
            self.model = shared['model']
            self.states = shared['states']
    else:
        # this is not a shared instance of this class, so do full init
        # answers contains a batch_size list of the last answer produced
        self.answers = [None] * opt['batchsize']

        if self.use_cuda:
            print('[ Using CUDA ]')
            torch.cuda.set_device(opt['gpu'])

        if opt.get('model_file') and os.path.isfile(opt['model_file']):
            # load model parameters if available
            print('Loading existing model params from ' + opt['model_file'])
            new_opt, self.states = self.load(opt['model_file'])
            # override model-specific options with stored ones
            opt = self.override_opt(new_opt)

        if opt['dict_file'] is None and opt.get('model_file'):
            # set default dict-file if not set
            opt['dict_file'] = opt['model_file'] + '.dict'

        # load dictionary and basic tokens & vectors
        self.dict = DictionaryAgent(opt)
        self.id = 'Seq2Seq'
        # we use START markers to start our output
        self.START_IDX = self.dict[self.dict.start_token]
        # we use END markers to end our output
        self.END_IDX = self.dict[self.dict.end_token]
        # get index of null token from dictionary (probably 0)
        self.NULL_IDX = self.dict[self.dict.null_token]

        self.model = Seq2seq(
            opt, len(self.dict),
            padding_idx=self.NULL_IDX, start_idx=self.START_IDX,
            end_idx=self.END_IDX,
            longest_label=self.states.get('longest_label', 1))

        if opt['embedding_type'] != 'random':
            # set up preinitialized embeddings
            try:
                import torchtext.vocab as vocab
            except ModuleNotFoundError as ex:
                print('Please install torchtext with `pip install torchtext`')
                raise ex
            if opt['embedding_type'].startswith('glove'):
                init = 'glove'
                embs = vocab.GloVe(name='840B', dim=300)
            elif opt['embedding_type'].startswith('fasttext'):
                init = 'fasttext'
                embs = vocab.FastText(language='en')
            else:
                raise RuntimeError('embedding type not implemented')

            if opt['embeddingsize'] != 300:
                # random projection from the pretrained dim to the model dim
                rp = torch.Tensor(300, opt['embeddingsize']).normal_()
                t = lambda x: torch.mm(x.unsqueeze(0), rp)
            else:
                t = lambda x: x
            cnt = 0
            for w, i in self.dict.tok2ind.items():
                if w in embs.stoi:
                    vec = t(embs.vectors[embs.stoi[w]])
                    self.model.decoder.lt.weight.data[i] = vec
                    cnt += 1
                    if opt['lookuptable'] in ['unique', 'dec_out']:
                        # also set encoder lt, since it's not shared
                        self.model.encoder.lt.weight.data[i] = vec
            print('Seq2seq: initialized embeddings for {} tokens from {}.'
                  ''.format(cnt, init))

        if self.states:
            # set loaded states if applicable
            self.model.load_state_dict(self.states['model'])
        if self.use_cuda:
            self.model.cuda()

    if hasattr(self, 'model'):
        # if model was built, do more setup
        self.clip = opt.get('gradient_clip', 0.2)
        self.rank = opt['rank_candidates']

        # set up tensors once
        self.xs = torch.LongTensor(1, 1)
        self.ys = torch.LongTensor(1, 1)
        if self.rank:
            self.cands = torch.LongTensor(1, 1, 1)

        # set up criteria
        self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX)

        if self.use_cuda:
            # push to cuda (`async` is a reserved word on Python 3.7+,
            # so use the non_blocking keyword instead)
            self.xs = self.xs.cuda(non_blocking=True)
            self.ys = self.ys.cuda(non_blocking=True)
            if self.rank:
                self.cands = self.cands.cuda(non_blocking=True)
            self.criterion.cuda()

        # set up optimizer
        lr = opt['learningrate']
        optim_class = Seq2seqAgent.OPTIM_OPTS[opt['optimizer']]
        kwargs = {'lr': lr}
        if opt['optimizer'] == 'sgd':
            kwargs['momentum'] = 0.95
            kwargs['nesterov'] = True

        if opt['embedding_type'].endswith('fixed'):
            print('Seq2seq: fixing embedding weights.')
            self.model.decoder.lt.weight.requires_grad = False
            self.model.encoder.lt.weight.requires_grad = False
            if opt['lookuptable'] in ['dec_out', 'all']:
                self.model.decoder.e2s.weight.requires_grad = False
        self.optimizer = optim_class(
            [p for p in self.model.parameters() if p.requires_grad],
            **kwargs)
        if self.states:
            if self.states['optimizer_type'] != opt['optimizer']:
                print('WARNING: not loading optim state since optim class '
                      'changed.')
            else:
                self.optimizer.load_state_dict(self.states['optimizer'])

    self.reset()
def __init__(self, opt, shared=None): """Set up model.""" super().__init__(opt, shared) opt = self.opt # there is a deepcopy in the init #self.opt = opt # all instances may need some params opt['label_smoothing'] = False opt['src_tgt_weight_share'] = False opt['tgt_prj_weight_share'] = False self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.metrics = { 'loss': 0.0, 'num_tokens': 0, 'correct_tokens': 0, 'total_skipped_batches': 0 } self.history = {} self.report_freq = opt.get('report_freq', 0.001) self.use_person_tokens = opt.get('person_tokens', False) self.batch_idx = shared and shared.get('batchindex') or 0 self.rank = opt['rank_candidates'] self.beam_size = opt.get('beam_size', 1) self.topk = opt.get('topk', 1) states = {} # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() if opt.get('numthreads', 1) > 1: torch.set_num_threads(1) if shared: # set up shared properties self.opt = shared['opt'] opt = self.opt self.dict = shared['dict'] self.START_IDX = shared['START_IDX'] self.END_IDX = shared['END_IDX'] self.NULL_IDX = shared['NULL_IDX'] # answers contains a batch_size list of the last answer produced self.answers = shared['answers'] self.model = shared['model'] self.metrics = shared['metrics'] states = shared.get('states', {}) else: # this is not a shared instance of this class, so do full init # answers contains a batch_size list of the last answer produced self.answers = [None] * opt['batchsize'] if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) init_model = None # check first for 'init_model' for loading model from file if opt.get('init_model') and os.path.isfile(opt['init_model']): init_model = opt['init_model'] # next check for 'model_file', this would override init_model if opt.get('model_file') and os.path.isfile(opt['model_file']): init_model = opt['model_file'] if init_model is not None: # load model parameters if available print('[ Loading existing model params from {} ]'.format( init_model)) states = self.load(init_model) if os.path.isfile(init_model + '.dict') or opt['dict_file'] is None: opt['dict_file'] = init_model + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'Transformer' # we use START markers to start our output self.START_IDX = self.dict[self.dict.start_token] # we use END markers to end our output self.END_IDX = self.dict[self.dict.end_token] # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict[self.dict.null_token] if not hasattr(self, 'model_class'): # this allows child classes to override this but inherit init self.model_class = Transformer # self.model = self.model_class( # opt, len(self.dict), padding_idx=self.NULL_IDX, # start_idx=self.START_IDX, end_idx=self.END_IDX, # longest_label=states.get('longest_label', 1)) self.model = self.model_class(len(self.dict), opt) if opt.get('dict_tokenizer' ) == 'bpe' and opt['embedding_type'] != 'random': print('skipping preinitialization of embeddings for bpe') elif not states and opt['embedding_type'] != 'random': # set up preinitialized embeddings try: import torchtext.vocab as vocab except ImportError as ex: print( 'Please install torch text with `pip install torchtext`' ) raise ex pretrained_dim = 300 if opt['embedding_type'].startswith('glove'): if 'twitter' in opt['embedding_type']: init = 'glove-twitter' name = 'twitter.27B' pretrained_dim = 200 else: init = 'glove' name = '840B' embs = vocab.GloVe(name=name, dim=pretrained_dim, cache=modelzoo_path( 
self.opt.get('datapath'), 'models:glove_vectors')) elif opt['embedding_type'].startswith('fasttext'): init = 'fasttext' embs = vocab.FastText(language='en', cache=modelzoo_path( self.opt.get('datapath'), 'models:fasttext_vectors')) else: raise RuntimeError('embedding type not implemented') if opt['embeddingsize'] != pretrained_dim: rp = torch.Tensor(pretrained_dim, opt['embeddingsize']).normal_() t = lambda x: torch.mm(x.unsqueeze(0), rp) else: t = lambda x: x cnt = 0 for w, i in self.dict.tok2ind.items(): if w in embs.stoi: vec = t(embs.vectors[embs.stoi[w]]) self.model.decoder.tgt_word_emb.weight.data[i] = vec cnt += 1 if opt['lookuptable'] in ['unique', 'dec_out']: # also set encoder lt, since it's not shared self.model.encoder.src_word_emb.weight.data[ i] = vec print( 'Transformer: initialized embeddings for {} tokens from {}.' ''.format(cnt, init)) if states: # set loaded states if applicable self.model.load_state_dict(states['model']) if self.use_cuda: self.model.cuda() # set up criteria if opt.get('numsoftmax', 1) > 1: self.criterion = nn.NLLLoss(ignore_index=self.NULL_IDX, size_average=False) else: self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX, size_average=False) if self.use_cuda: self.criterion.cuda() if 'train' in opt.get('datatype', ''): # we only set up optimizers when training # we only set this up for the original instance or hogwild ones self.clip = opt.get('gradient_clip', -1) # set up optimizer lr = opt['learningrate'] optim_class = TransformerAgent.OPTIM_OPTS[opt['optimizer']] kwargs = {'lr': lr} if opt.get('momentum') > 0 and opt['optimizer'] in [ 'sgd', 'rmsprop' ]: kwargs['momentum'] = opt['momentum'] if opt['optimizer'] == 'sgd': kwargs['nesterov'] = True if opt['optimizer'] == 'adam': # https://openreview.net/forum?id=ryQu7f-RZ kwargs['amsgrad'] = True if opt['embedding_type'].endswith('fixed'): print('Transformer: fixing embedding weights.') self.model.decoder.tgt_word_emb.weight.requires_grad = False self.model.encoder.src_word_emb.weight.requires_grad = False if opt['lookuptable'] in ['dec_out', 'all']: # self.model.decoder.e2s.weight.requires_grad = False self.model.tgt_word_prj.weight.requires_grad = False self.optimizer = optim_class( [p for p in self.model.parameters() if p.requires_grad], **kwargs) if states.get('optimizer'): if states['optimizer_type'] != opt['optimizer']: print('WARNING: not loading optim state since optim class ' 'changed.') else: try: self.optimizer.load_state_dict(states['optimizer']) except ValueError: print('WARNING: not loading optim state since model ' 'params changed.') if self.use_cuda: for state in self.optimizer.state.values(): for k, v in state.items(): if isinstance(v, torch.Tensor): state[k] = v.cuda() self.scheduler = optim.lr_scheduler.ReduceLROnPlateau( self.optimizer, 'min', factor=0.5, patience=3, verbose=True) self.reset()
    if not args.no_cuda:
        zero, diag_margin = zero.cuda(), diag_margin.cuda()
    zero, diag_margin = Variable(zero), Variable(diag_margin)

    x = x / torch.norm(x, 2, 1, keepdim=True)
    v = v / torch.norm(v, 2, 1, keepdim=True)
    prod = torch.matmul(x, v.transpose(0, 1))
    diag = torch.diag(prod)
    for_x = torch.max(zero, margin - torch.unsqueeze(diag, 1) + prod) - diag_margin
    for_v = torch.max(zero, margin - torch.unsqueeze(diag, 0) + prod) - diag_margin
    return (torch.sum(for_x) + torch.sum(for_v)) / x.size(0)


if __name__ == '__main__':
    print('Loading a pretrained fastText model...')
    word_embedding = vocab.FastText(language="en")
    # word_embedding = fasttext.load_model(args.fasttext_model)

    print('Loading a dataset...')
    train_data = ReedICML2016(args.img_root, args.caption_root,
                              args.trainclasses_file,
                              word_embedding, args.max_nwords,
                              transforms.Compose([
                                  transforms.Scale(256),
                                  transforms.RandomCrop(224),
                                  transforms.RandomHorizontalFlip(),
                                  transforms.ToTensor(),
                                  transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                       std=[0.229, 0.224, 0.225])
def download(datapath):
    embs = vocab.FastText(language='en',
                          cache=datapath + '/models/fasttext_vectors')
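# A hedged sketch of what a download helper like the one above produces: the
# FastText constructor fetches the English vectors into the cache directory on
# first use and returns a Vectors object. The cache path here is an assumed
# local directory, not one taken from the snippets.
import torchtext.vocab as vocab

embs = vocab.FastText(language='en', cache='./data/models/fasttext_vectors')

print(len(embs.itos), 'words,', embs.dim, 'dimensions')
print(embs.vectors[embs.stoi['hello']][:5])   # first 5 components of a known word
print(embs['unlikely_oov_token'].shape)       # unknown words map to a zero vector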
def __init__(self, opt, shared=None): """Set up model.""" super().__init__(opt, shared) opt = self.opt # there is a deepcopy in the init # all instances may need some params self.encode_max_seq_len = opt[ 'encode_max_seq_len'] if opt['encode_max_seq_len'] > 0 else None self.decode_max_seq_len = opt[ 'decode_max_seq_len'] if opt['decode_max_seq_len'] > 0 else None self.metrics = { 'loss': 0.0, 'num_tokens': 0, 'correct_tokens': 0, 'total_skipped_batches': 0, 'correct_pred': 0, 'pred_count': 0 } self.history = {} # batch share the same persona information self.use_person_tokens = opt.get('use_persona_token', False) self.use_talk_tokens = opt.get('use_talk_token', False) self.use_history_reply = opt.get('history_replies', 'label_else_model') self.add_default_persona = opt.get('add_default_persona', True) self.persona_append_strategy = opt.get('persona_append_strategy', 'concat') self.history_append_strategy = opt.get('history_append_strategy', -1) self.report_freq = opt.get('report_freq', 0.001) self.batch_idx = shared and shared.get('batchindex') or 0 self.rank = opt['rank_candidates'] self.beam_size = opt.get('beam_size', 1) self.topk = opt.get('topk', 1) states = {} # if gpt2 if 'gpt' in ARCH_CHOICE: num_optim_steps = opt['train_size'] * opt[ 'num_train_epochs'] // opt['batchsize'] # override optimizer_step opt['optimizer_step'] = num_optim_steps # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() if shared: # set up shared properties self.opt = shared['opt'] opt = self.opt self.dict = shared['dict'] self.START_IDX = shared['START_IDX'] self.END_IDX = shared['END_IDX'] self.NULL_IDX = shared['NULL_IDX'] # answers contains a batchsize list of the last answer produced self.answers = shared['answers'] self.model = shared['model'] self.metrics = shared['metrics'] self.receiver = shared['receiver'] self.receiver_dict = shared['receiver_dict'] states = shared.get('states', {}) else: # this is not a shared instance of this class, so do full init # answers contains a batchsize list of the last answer produced self.answers = [None] * opt['batchsize'] if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) init_model = None # check first for 'init_model' for loading model from file if opt.get('init_model') and os.path.isfile(opt['init_model']): init_model = opt['init_model'] # next check for 'model_file', this would override init_model if opt.get('model_file') and os.path.isfile(opt['model_file']): init_model = opt['model_file'] if init_model is not None: # load model parameters if available print('[ Loading existing model params from {} ]'.format( init_model)) states = self.load(init_model) if os.path.isfile(init_model + '.dict') or opt['dict_file'] is None: opt['dict_file'] = init_model + '.dict' # load dictionary and basic tokens & vectors self.dict = self.dictionary_class()(opt) self.id = 'Transformer' # we use START markers to start our output self.START_IDX = self.dict[self.dict.start_token] # we use END markers to end our output self.END_IDX = self.dict[self.dict.end_token] # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict[self.dict.null_token] # get vocab size vocab_size = len(self.dict.tok2ind.items()) if ARCH_CHOICE == 'lstm': self.model = Seq2seqModel(opt=opt, num_features=len(self.dict), padding_idx=self.NULL_IDX, start_idx=self.START_IDX, end_idx=self.END_IDX, longest_label=states.get( 'longest_label', 1)) elif ARCH_CHOICE == 'gpt': assert isinstance(self.dict, GPTDictionaryAgent) self.model = 
Gpt2SeqModel( opt=opt, vocab_size=len(self.dict), pad_idx=self.NULL_IDX, start_idx=self.START_IDX, end_idx=self.END_IDX, dict=self.dict, special_token_len=len(self.dict.special_tokens), longest_label=states.get('longest_label', 1)) if opt.get('display_model', False): print_model(self.model) if opt.get('dict_tokenizer' ) == 'bpe' and opt['embedding_type'] != 'random': print('skipping preinitialization of embeddings for bpe') elif not states and opt[ 'embedding_type'] != 'random' and ARCH_CHOICE == 'lstm': # set up preinitialized embeddings try: import torchtext.vocab as vocab except ImportError as ex: print( 'Please install torch text with `pip install torchtext`' ) raise ex pretrained_dim = 300 if opt['embedding_type'].startswith('glove'): if 'twitter' in opt['embedding_type']: init = 'glove-twitter' name = 'twitter.27B' pretrained_dim = 200 else: init = 'glove' name = '840B' embs = vocab.GloVe(name=name, dim=pretrained_dim, cache=modelzoo_path( self.opt.get('datapath'), 'models:glove_vectors')) elif opt['embedding_type'].startswith('fasttext'): init = 'fasttext' embs = vocab.FastText(language='en', cache=modelzoo_path( self.opt.get('datapath'), 'models:fasttext_vectors')) else: raise RuntimeError('embedding type not implemented') if opt['encoder_embed_dim'] != pretrained_dim: rp = torch.Tensor(pretrained_dim, opt['encoder_embed_dim']).normal_() t = lambda x: torch.mm(x.unsqueeze(0), rp) else: t = lambda x: x cnt = 0 for w, i in self.dict.tok2ind.items(): if w in embs.stoi: vec = t(embs.vectors[embs.stoi[w]]) self.model.decoder.tgt_word_emb.weight.data[i] = vec cnt += 1 if opt['lookuptable'] in ['unique', 'dec_out']: # also set encoder lt, since it's not shared self.model.encoder.src_word_emb.weight.data[ i] = vec print('Seq2seq: initialized embeddings for {} tokens from {}.' 
''.format(cnt, init)) if states: # set loaded states if applicable self.model.load_state_dict(states['model']) if self.use_cuda: self.model.cuda() # if select persona if opt['select_persona']: self.receiver, self.receiver_dict = self.load_receiver( opt['receiver_model']) self.receiver.eval() # move to cuda self.receiver.cuda() else: self.receiver = None self.receiver_dict = None vocab_size = len(self.dict.tok2ind.items()) if opt['smoothing'] > 0.0: self.criterion = LabelSmoothingLoss( vocabulary_size=40516, label_smoothing=opt['smoothing'], pad_index=self.NULL_IDX) else: self.criterion = TokenCrossEntropyLoss(pad_index=self.NULL_IDX) self.class_criter = nn.CrossEntropyLoss() self.eval_criterion = TokenCrossEntropyLoss(pad_index=self.NULL_IDX) # whether shuffle persona self.shuffle_persona = opt['shuffle_persona'] if self.use_cuda: self.criterion.cuda() if 'train' in opt.get('datatype', ''): # we only set up optimizers when training # we only set this up for the original instance or hogwild ones self.clip = opt.get('gradient_clip', -1) # set up optimizer lr = opt['lr'] optim_class = TransformerAgent.OPTIM_OPTS[opt['optimizer']] if ARCH_CHOICE == 'lstm': kwargs = {'lr': lr} if opt.get('momentum') > 0 and opt['optimizer'] in [ 'sgd', 'rmsprop' ]: kwargs['momentum'] = opt['momentum'] if opt['optimizer'] == 'sgd': kwargs['nesterov'] = True if opt['optimizer'] == 'adam': kwargs['amsgrad'] = True if opt['embedding_type'].endswith('fixed'): print('Transformer: fixing embedding weights.') self.model.decoder.tgt_word_emb.weight.requires_grad = False self.model.encoder.src_word_emb.weight.requires_grad = False if opt['lookuptable'] in ['dec_out', 'all']: self.model.decoder.e2s.weight.requires_grad = False self.optimizer = optim_class( [p for p in self.model.parameters() if p.requires_grad], **kwargs) elif ARCH_CHOICE == 'gpt': self.optimizer = GPTOptimizer(self.model, opt) if states.get('optimizer'): if states['optimizer_type'] != opt['optimizer']: print('WARNING: not loading optim state since optim class ' 'changed.') else: try: self.optimizer.load_state_dict(states['optimizer']) except ValueError: print('WARNING: not loading optim state since model ' 'params changed.') # if self.use_cuda: # for state in self.optimizer.state.values(): # for k, v in state.items(): # if isinstance(v, torch.Tensor): # state[k] = v.cuda() if ARCH_CHOICE == 'lstm': self.scheduler = optim.lr_scheduler.ReduceLROnPlateau( self.optimizer, 'min', factor=0.5, patience=3, verbose=True) self.step = torch.zeros(1) self.reset()
def __init__(self, opt, shared=None):
    """Set up model if shared params not set, otherwise no work to do."""
    super().__init__(opt, shared)
    opt = self.opt  # there is a deepcopy in the init

    # all instances may need some params
    self.truncate = opt['truncate'] if opt['truncate'] > 0 else None
    self.metrics = {'loss': 0.0, 'num_tokens': 0}
    self.history = {}
    self.report_freq = opt.get('report_freq', 0.001)
    states = {}

    # check for cuda
    self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available()
    if opt.get('numthreads', 1) > 1:
        torch.set_num_threads(1)

    if shared:
        # set up shared properties
        self.opt = shared['opt']
        opt = self.opt
        self.dict = shared['dict']
        self.START_IDX = shared['START_IDX']
        self.END_IDX = shared['END_IDX']
        self.NULL_IDX = shared['NULL_IDX']
        # answers contains a batch_size list of the last answer produced
        self.answers = shared['answers']
        if 'model' in shared:
            # model is shared during hogwild
            self.model = shared['model']
            self.metrics = shared['metrics']
            states = shared['states']
    else:
        # this is not a shared instance of this class, so do full init
        # answers contains a batch_size list of the last answer produced
        self.answers = [None] * opt['batchsize']

        if self.use_cuda:
            print('[ Using CUDA ]')
            torch.cuda.set_device(opt['gpu'])

        init_model = None
        # check first for 'init_model' for loading model from file
        if opt.get('init_model') and os.path.isfile(opt['init_model']):
            init_model = opt['init_model']
        # next check for 'model_file', this would override init_model
        if opt.get('model_file') and os.path.isfile(opt['model_file']):
            init_model = opt['model_file']

        if init_model is not None:
            # load model parameters if available
            print('[ Loading existing model params from {} ]'.format(init_model))
            states = self.load(init_model)

        if ((init_model is not None and os.path.isfile(init_model + '.dict'))
                or opt['dict_file'] is None):
            opt['dict_file'] = init_model + '.dict'

        # load dictionary and basic tokens & vectors
        self.dict = DictionaryAgent(opt)
        self.id = 'Seq2Seq'
        # we use START markers to start our output
        self.START_IDX = self.dict[self.dict.start_token]
        # we use END markers to end our output
        self.END_IDX = self.dict[self.dict.end_token]
        # get index of null token from dictionary (probably 0)
        self.NULL_IDX = self.dict[self.dict.null_token]

        if not hasattr(self, 'model_class'):
            # this allows child classes to override this but inherit init
            self.model_class = Seq2seq
        self.model = self.model_class(
            opt, len(self.dict),
            padding_idx=self.NULL_IDX, start_idx=self.START_IDX,
            end_idx=self.END_IDX,
            longest_label=states.get('longest_label', 1))

        if opt['embedding_type'] != 'random':
            # set up preinitialized embeddings
            try:
                import torchtext.vocab as vocab
            except ModuleNotFoundError as ex:
                print('Please install torchtext with `pip install torchtext`')
                raise ex
            if opt['embedding_type'].startswith('glove'):
                init = 'glove'
                embs = vocab.GloVe(
                    name='840B', dim=300,
                    cache=os.path.join(opt['parlai_home'], 'data', 'models',
                                       'glove_vectors'))
            elif opt['embedding_type'].startswith('fasttext'):
                init = 'fasttext'
                embs = vocab.FastText(
                    language='en',
                    cache=os.path.join(opt['parlai_home'], 'data', 'models',
                                       'fasttext_vectors'))
            else:
                raise RuntimeError('embedding type not implemented')

            if opt['embeddingsize'] != 300:
                # random projection from the pretrained dim to the model dim
                rp = torch.Tensor(300, opt['embeddingsize']).normal_()
                t = lambda x: torch.mm(x.unsqueeze(0), rp)
            else:
                t = lambda x: x
            cnt = 0
            for w, i in self.dict.tok2ind.items():
                if w in embs.stoi:
                    vec = t(embs.vectors[embs.stoi[w]])
                    self.model.decoder.lt.weight.data[i] = vec
                    cnt += 1
                    if opt['lookuptable'] in ['unique', 'dec_out']:
                        # also set encoder lt, since it's not shared
                        self.model.encoder.lt.weight.data[i] = vec
            print('Seq2seq: initialized embeddings for {} tokens from {}.'
                  ''.format(cnt, init))

        if states:
            # set loaded states if applicable
            self.model.load_state_dict(states['model'])
        if self.use_cuda:
            self.model.cuda()

    if hasattr(self, 'model'):
        # if model was built, do more setup
        self.clip = opt.get('gradient_clip', -1)
        self.rank = opt['rank_candidates']

        # set up tensors once
        self.xs = torch.LongTensor(1, 1)
        self.ys = torch.LongTensor(1, 1)
        if self.rank:
            self.cands = torch.LongTensor(1, 1, 1)

        # set up criteria
        if opt.get('numsoftmax', 1) > 1:
            self.criterion = nn.NLLLoss(ignore_index=self.NULL_IDX,
                                        size_average=False)
        else:
            self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX,
                                                 size_average=False)

        if self.use_cuda:
            # push to cuda
            self.xs = self.xs.cuda()
            self.ys = self.ys.cuda()
            if self.rank:
                self.cands = self.cands.cuda()
            self.criterion.cuda()

        # set up optimizer
        lr = opt['learningrate']
        optim_class = Seq2seqAgent.OPTIM_OPTS[opt['optimizer']]
        kwargs = {'lr': lr}
        if opt.get('momentum') > 0 and opt['optimizer'] in ['sgd', 'rmsprop']:
            kwargs['momentum'] = opt['momentum']
            if opt['optimizer'] == 'sgd':
                kwargs['nesterov'] = True
        if opt['optimizer'] == 'adam':
            # https://openreview.net/forum?id=ryQu7f-RZ
            kwargs['amsgrad'] = True

        if opt['embedding_type'].endswith('fixed'):
            print('Seq2seq: fixing embedding weights.')
            self.model.decoder.lt.weight.requires_grad = False
            self.model.encoder.lt.weight.requires_grad = False
            if opt['lookuptable'] in ['dec_out', 'all']:
                self.model.decoder.e2s.weight.requires_grad = False
        self.optimizer = optim_class(
            [p for p in self.model.parameters() if p.requires_grad],
            **kwargs)
        if states.get('optimizer'):
            if states['optimizer_type'] != opt['optimizer']:
                print('WARNING: not loading optim state since optim class '
                      'changed.')
            else:
                self.optimizer.load_state_dict(states['optimizer'])
                if self.use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if isinstance(v, torch.Tensor):
                                state[k] = v.cuda()
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, 'min', factor=0.5, patience=3, verbose=True)

    self.reset()
def load_dataset(path, args, train=True, build_vocab=True):
    def tokenize(x):
        # Characters to exclude
        exclude = u'"%\'()*+,-./:;<=>[\]^_`{|}~'
        # Remove punctuation
        x = x.translate(str.maketrans('', '', exclude))
        return x.split()

    # Create new fields on vocab
    if build_vocab:
        args.TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
        args.TOPIC = Field(sequential=False, use_vocab=True, lower=True)
        args.LABEL = Field(sequential=False, use_vocab=False)

    if args.dataset == 'haha':
        datafields = [('id', None), ('text', args.TEXT),
                      ('is_humor', args.LABEL), ('votes_no', None),
                      ('votes_1', None), ('votes_2', None), ('votes_3', None),
                      ('votes_4', None), ('votes_5', None),
                      ('funniness_average', None)]
    else:
        datafields = [('id', None), ('topic', args.TOPIC),
                      ('is_ironic', args.LABEL), ('message', args.TEXT)]

    dataset = TabularDataset(path=path, format='CSV', skip_header=True,
                             fields=datafields)

    # build vocabulary
    if build_vocab:
        if args.fasttext:
            args.TEXT.build_vocab(dataset, min_freq=2,
                                  vectors=vocab.FastText('es'))
        else:
            args.TEXT.build_vocab(dataset, min_freq=2)
        args.TOPIC.build_vocab(dataset)
        print('vocabulary length : ', len(args.TEXT.vocab))
        print('number of topics : ', len(args.TOPIC.vocab))

    if train:
        # Split dataset
        trn, vld = dataset.split(args.train_percentage)
        target = 'is_humor' if args.dataset == 'haha' else 'is_ironic'

        # study dataset
        vld_num_target = trn_num_target = 0
        for element in trn:
            trn_num_target += int(element.__dict__[target])
        trn_num_not_target = len(trn) - trn_num_target
        for element in vld:
            vld_num_target += int(element.__dict__[target])
        vld_num_not_target = len(vld) - vld_num_target

        # Dataset information
        print('train dataset : {} elements'.format(len(trn)))
        print('train dataset ({}): {} elements. {:.2f}%'.format(
            target, trn_num_target, 100 * trn_num_target / len(trn)))
        print('train dataset (not {}): {} elements. {:.2f}%'.format(
            target, trn_num_not_target, 100 * trn_num_not_target / len(trn)))
        print('validate dataset : {} elements'.format(len(vld)))
        print('validate dataset ({}): {} elements. {:.2f}%'.format(
            target, vld_num_target, 100 * vld_num_target / len(vld)))
        print('validate dataset (not {}): {} elements. {:.2f}%'.format(
            target, vld_num_not_target, 100 * vld_num_not_target / len(vld)))
        return (trn, vld)
    else:
        # Return test dataset
        tst = dataset
        target = 'is_humor' if args.dataset == 'haha' else 'is_ironic'

        # study dataset
        tst_num_target = 0
        for element in tst:
            tst_num_target += int(element.__dict__[target])
        tst_num_not_target = len(tst) - tst_num_target

        # Dataset information
        print('test dataset : {} elements'.format(len(tst)))
        print('test dataset ({}): {} elements. {:.2f}%'.format(
            target, tst_num_target, 100 * tst_num_target / len(tst)))
        print('test dataset (not {}): {} elements. {:.2f}%'.format(
            target, tst_num_not_target, 100 * tst_num_not_target / len(tst)))
        return tst
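# A hedged sketch of wrapping the splits returned by load_dataset above in
# batching iterators, using the legacy torchtext `data.BucketIterator` API that
# Field/TabularDataset belong to. The CSV path, batch sizes, and the use of the
# 'text' / 'is_humor' field names (from the haha configuration) are assumptions.
from torchtext.data import BucketIterator  # legacy torchtext (<0.9) API

trn, vld = load_dataset('data/haha_train.csv', args, train=True)

train_iter, valid_iter = BucketIterator.splits(
    (trn, vld),
    batch_sizes=(32, 64),
    sort_key=lambda ex: len(ex.text),   # bucket by tokenized text length
    sort_within_batch=True,
    repeat=False)

for batch in train_iter:
    tokens = batch.text          # (seq_len, batch) LongTensor of token ids
    labels = batch.is_humor      # label field defined in load_dataset
    break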
def load_data(config, path_transcripts='/vol/work2/galmant/transcripts/'): type_sentence_embedding = config['type_sentence_embedding'] dev_set_list = config['dev_set_list'] test_set_list = config['test_set_list'] punctuations_end_sentence = ['.', '?', '!'] punctuations = string.punctuation #['!','(',')',',','-','.','/',':',';','<','=','>','?','[','\\',']','^','_','{','|','}','~'] #!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ we = None if type_sentence_embedding == 'lstm': we = vocab.FastText(language='en') ''' pretrained_aliases = { "charngram.100d": partial(CharNGram), "fasttext.en.300d": partial(FastText, language="en"), "fasttext.simple.300d": partial(FastText, language="simple"), "glove.42B.300d": partial(GloVe, name="42B", dim="300"), "glove.840B.300d": partial(GloVe, name="840B", dim="300"), "glove.twitter.27B.25d": partial(GloVe, name="twitter.27B", dim="25"), "glove.twitter.27B.50d": partial(GloVe, name="twitter.27B", dim="50"), "glove.twitter.27B.100d": partial(GloVe, name="twitter.27B", dim="100"), "glove.twitter.27B.200d": partial(GloVe, name="twitter.27B", dim="200"), "glove.6B.50d": partial(GloVe, name="6B", dim="50"), "glove.6B.100d": partial(GloVe, name="6B", dim="100"), "glove.6B.200d": partial(GloVe, name="6B", dim="200"), "glove.6B.300d": partial(GloVe, name="6B", dim="300") } ''' #X_all = [] #Y_all = [] X_train = [] Y_train = [] X_dev = [] Y_dev = [] X_test = [] Y_test = [] words_set = set() for file in sorted(glob.glob(path_transcripts + '*')): #TEST #for file in [sorted(glob.glob(path_transcripts+'*'))[0]]: with open(file, newline='') as csvfile: reader = csv.reader(csvfile, delimiter=' ', quotechar='|') X_ = [] Y_ = [] for row in reader: sentence = row[2] old_word = row[2] for word in row[3:]: if any(punctuation in old_word for punctuation in punctuations_end_sentence ) and word and word[0].isupper(): sentence = sentence.strip() n = 0 for i, s in enumerate(sentence): if s in punctuations: sentence_ = list(sentence) sentence_.insert(i + n + 1, ' ') sentence_.insert(i + n, ' ') sentence = ''.join(sentence_) n += 2 #print(sentence) X_.append(sentence) Y_.append(row[1]) sentence = word else: sentence += ' ' + word old_word = word if sentence and row[1]: sentence = sentence.strip() n = 0 for i, s in enumerate(sentence): if s in punctuations: sentence_ = list(sentence) sentence_.insert(i + n + 1, ' ') sentence_.insert(i + n, ' ') sentence = ''.join(sentence_) n += 2 #print(sentence) X_.append(sentence) Y_.append(row[1]) Y = [s.lower() for s in Y_] if type_sentence_embedding == 'lstm': X = [s.lower().split() for s in X_] #Y = [s.lower() for s in Y_] to_del = [] for s in X: for w in s: if w not in we.stoi: to_del.append(w) X = [[w.strip() for w in s if w not in to_del] for s in X] for words_per_sentence in X: words_set = words_set.union(set(words_per_sentence)) else: X = X_ Y = Y #_ if len(X) > 0 and len(Y) > 0: names_episode = file.split('/')[-1] names_season = '.'.join(names_episode.split('.')[:-1]) names_serie = '.'.join(names_episode.split('.')[0]) if names_episode in dev_set_list or names_season in dev_set_list or names_serie in dev_set_list: X_dev.append(X) Y_dev.append(Y) elif names_episode in test_set_list or names_season in test_set_list or names_serie in test_set_list: X_test.append(X) Y_test.append(Y) else: X_train.append(X) Y_train.append(Y) assert len(X) == len(Y) '''threshold_train_dev = int(len(X_all)*0.8) threshold_dev_test = threshold_train_dev + int(len(X_all)*0.1) X_train = X_all[:threshold_train_dev] Y_train = Y_all[:threshold_train_dev] X_dev = 
X_all[threshold_train_dev:threshold_dev_test] Y_dev = Y_all[threshold_train_dev:threshold_dev_test] X_test = X_all[threshold_dev_test:] Y_test = Y_all[threshold_dev_test:]''' #TEST #X_train = X_test #Y_train = Y_test #X_dev = X_test #Y_dev = Y_test #print('X_train',X_train[-1]) #time.sleep(60) return X_train, Y_train, X_dev, Y_dev, X_test, Y_test, words_set, we
def load_data_new(config, path_transcripts='/vol/work3/maurice/Transcripts/'): type_sentence_embedding = config['type_sentence_embedding'] dev_set_list = config['dev_set_list'] test_set_list = config['test_set_list'] punctuations_end_sentence = ['.', '?', '!'] punctuations = string.punctuation #['!','(',')',',','-','.','/',':',';','<','=','>','?','[','\\',']','^','_','{','|','}','~'] #!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ we = None if type_sentence_embedding == 'lstm': we = vocab.FastText(language='en') ''' pretrained_aliases = { "charngram.100d": partial(CharNGram), "fasttext.en.300d": partial(FastText, language="en"), "fasttext.simple.300d": partial(FastText, language="simple"), "glove.42B.300d": partial(GloVe, name="42B", dim="300"), "glove.840B.300d": partial(GloVe, name="840B", dim="300"), "glove.twitter.27B.25d": partial(GloVe, name="twitter.27B", dim="25"), "glove.twitter.27B.50d": partial(GloVe, name="twitter.27B", dim="50"), "glove.twitter.27B.100d": partial(GloVe, name="twitter.27B", dim="100"), "glove.twitter.27B.200d": partial(GloVe, name="twitter.27B", dim="200"), "glove.6B.50d": partial(GloVe, name="6B", dim="50"), "glove.6B.100d": partial(GloVe, name="6B", dim="100"), "glove.6B.200d": partial(GloVe, name="6B", dim="200"), "glove.6B.300d": partial(GloVe, name="6B", dim="300") } ''' #X_all = [] #Y_all = [] X_train = [] Y_train = [] X_dev = [] Y_dev = [] X_test = [] Y_test = [] words_set = set() threads = [] for file in sorted(glob.glob(path_transcripts + '*/*')): #TEST #for file in [sorted(glob.glob(path_transcripts+'*'))[0]]: process = Thread(target=read_file, args=[ file, punctuations_end_sentence, words_set, dev_set_list, test_set_list, X_train, Y_train, X_dev, Y_dev, X_test, Y_test, type_sentence_embedding, we ]) process.start() threads.append(process) for process in threads: process.join() print(words_set) '''threshold_train_dev = int(len(X_all)*0.8) threshold_dev_test = threshold_train_dev + int(len(X_all)*0.1) X_train = X_all[:threshold_train_dev] Y_train = Y_all[:threshold_train_dev] X_dev = X_all[threshold_train_dev:threshold_dev_test] Y_dev = Y_all[threshold_train_dev:threshold_dev_test] X_test = X_all[threshold_dev_test:] Y_test = Y_all[threshold_dev_test:]''' #TEST #X_train = X_test #Y_train = Y_test #X_dev = X_test #Y_dev = Y_test #print('X_train',X_train[-1]) #time.sleep(60) return X_train, Y_train, X_dev, Y_dev, X_test, Y_test, words_set, we
def load_data(self, loader,
              custom_preprocessing: data.Pipeline = DEFAULT_DATA_PIPELINE,
              verbose=True):
    self.verbose = verbose

    if self.verbose:
        # create an image folder
        self.img_stats_folder = os.path.join(self.data_path, 'stats')
        create_dir_if_necessary(self.img_stats_folder)

    self.logger.info(
        f'Getting {self.pretrained_word_embeddings} with dimension '
        f'{self.pretrained_word_embeddings_dim}')
    word_vectors: vocab
    word_vectors = None
    if self.pretrained_word_embeddings == 'glove':
        word_vectors = vocab.GloVe(
            name=self.pretrained_word_embeddings_name,
            dim=self.pretrained_word_embeddings_dim)
    elif self.pretrained_word_embeddings == 'fasttext':
        word_vectors = vocab.FastText(language=self.language)
    self.logger.info('Word vectors successfully loaded.')

    self.logger.debug('Start loading dataset')
    self.dataset = loader(self.name, word_vectors, self.configuration,
                          self.batch_size, self.data_path, self.train_file,
                          self.valid_file, self.test_file, self.use_cuda,
                          self.verbose)

    self.vocabs = self.dataset['vocabs']
    self.task = self.dataset['task']
    self.ds_stats = self.dataset['stats']
    self.split_length = self.dataset['split_length']
    self.train_iter, self.valid_iter, self.test_iter = self.dataset['iters']
    self.fields = self.dataset['fields']
    self.target = self.dataset['target']
    self.target_names = [n for n, _ in self.target]
    self.examples = self.dataset['examples']
    self.embedding = self.dataset['embeddings']
    self.dummy_input = self.dataset['dummy_input']
    self.source_field_name = self.dataset['source_field_name']
    self.target_field_name = self.dataset['target_field_name']
    self.padding_field_name = self.dataset['padding_field_name']
    self.baselines = self.dataset['baselines']

    self.target_size = len(self.vocabs[self.target_vocab_index])
    self.source_embedding = self.embedding[self.source_index]
    self.class_labels = list(self.vocabs[self.target_vocab_index].itos)

    self.source_reverser = self.dataset['source_field']
    self.target_reverser = self.target[0]
    self.log_parameters()

    if verbose:
        # sns.set(style="whitegrid")
        sns.set_style("white")
        sns.despine()
        sns.set_color_codes()
        # sns.set_context("paper")
        sns.set(rc={"font.size": 18, "axes.labelsize": 22})
        # sns.set(font_scale=1.7)
        self.show_stats()
    else:
        self._calculate_dataset_stats()

    self.logger.info('Dataset loaded. Ready for training')
import os
import time

import torch
import torchtext.vocab as Vocab
from data import vocab, DATA_ROOT, train_iter, test_iter

# check whether a GPU can be used
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# embedding dimension, hidden units per layer, and number of layers
embed_size, num_hiddens, num_layers = 300, 100, 2
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)

# load pretrained Wikipedia word vectors (fastText); `cache` is the save directory
fasttext_vocab = Vocab.FastText(cache=os.path.join(DATA_ROOT, "fasttext"))


def load_pretrained_embedding(words, pretrained_vocab):
    """Extract the vectors for `words` from a pretrained torchtext vocab."""
    # initialize to zeros
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0])
    oov_count = 0  # out of vocabulary
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
            embed[i, :] = pretrained_vocab.vectors[idx]
        except KeyError:
            oov_count += 1
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
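# A hedged sketch of how the matrix built by load_pretrained_embedding is
# typically copied into the model's lookup table. It assumes the function ends
# with `return embed` (the excerpt above stops just before that), that the
# dataset `vocab` exposes an `itos` list, and that BiRNN stores its lookup
# table as `net.embedding`; all three are assumptions, not taken from the code.
pretrained = load_pretrained_embedding(vocab.itos, fasttext_vocab)

# copy the fastText vectors into the BiRNN embedding layer and freeze them
net.embedding.weight.data.copy_(pretrained)
net.embedding.weight.requires_grad = False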
parser.add_argument('--optim', help='optimizer, Adadelta, Adam or SGD',
                    default='adadelta')
parser.add_argument('--debug',
                    help='debugging mode, only use dev set, not enabled if set 0.',
                    default=1, type=int)
args = parser.parse_args()

# load pre-trained word embeddings
if args.embedding.lower() == 'glove':
    pretrained_embeddings = vocab.GloVe(name='42B')
elif args.embedding.lower() == 'fasttext':
    pretrained_embeddings = vocab.FastText(max_vectors=500000)
else:
    if not os.path.exists('model/GoogleNews-vectors-negative300.bin.gz'):
        os.system(
            'wget https://drive.google.com/uc?export=download&confirm=irnl&id=0B7XkCwpI5KDYNlNUTTlSS21pQmM'
        )
    pretrained_embeddings = load_w2v_vectors(
        'model/GoogleNews-vectors-negative300.bin.gz')

# prepare dataset
logger.info('Preparing dataset...')
train_data = data.Dataset(read_data('data/topicclass/topicclass_train.txt', FIELDS),
                          fields=FIELDS)
valid_data = data.Dataset(read_data('data/topicclass/topicclass_valid.txt', FIELDS),