def get_tokenizer(tokenizer_name):
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    if tokenizer_name.startswith("bert-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        # TransformerXL is trained on data pretokenized with MosesTokenizer
        tokenizer = MosesTokenizer()
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name == "MosesTokenizer":
        tokenizer = MosesTokenizer()
    elif tokenizer_name == "SplitChars":
        tokenizer = SplitCharsTokenizer()
    elif tokenizer_name == "":
        tokenizer = SpaceTokenizer()
    else:
        tokenizer = None
    return tokenizer
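# Minimal usage sketch, assuming the module-level `log` object and the tokenizer
# classes referenced by get_tokenizer above are importable in this module.
if __name__ == "__main__":
    roberta_tok = get_tokenizer("roberta-base")  # returns a RobertaTokenizer
    # convert_tokens_to_ids on the output of tokenize() gives the BPE token ids
    ids = roberta_tok.convert_tokens_to_ids(roberta_tok.tokenize("Hello world"))
    print(ids)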
def __init__(self, pretrain_path, max_length):
    nn.Module.__init__(self)
    self.bert = RobertaForSequenceClassification.from_pretrained(pretrain_path, num_labels=2)
    # self.bert = RobertaModel.from_pretrained(pretrain_path)
    self.max_length = max_length
    self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    self.modelName = 'Roberta'
def __init__(self, args, dictionary, left_pad=False):
    super().__init__(dictionary)
    self.dropout = args.dropout

    from pytorch_transformers import RobertaModel, BertModel
    from pytorch_transformers.file_utils import PYTORCH_TRANSFORMERS_CACHE
    from pytorch_transformers import RobertaConfig, RobertaTokenizer, BertConfig, BertTokenizer

    if args.pretrained_bert_model.startswith('roberta'):
        self.embed = RobertaModel.from_pretrained(
            args.pretrained_bert_model,
            cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
        # self.context = RobertaModel.from_pretrained(args.pretrained_bert_model,
        #     cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
        self.config = RobertaConfig.from_pretrained(args.pretrained_bert_model)
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        self.embed = BertModel.from_pretrained(
            args.pretrained_bert_model,
            cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
        # self.context = BertModel.from_pretrained(args.pretrained_bert_model,
        #     cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
        self.config = BertConfig.from_pretrained(args.pretrained_bert_model)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    self.padding_idx = self.tokenizer.convert_tokens_to_ids(self.tokenizer.pad_token)
def check_iterator(pretrained_model, file, max_seq_length=None):
    batch_size = 10
    name = pretrained_model.split('-')[0].lower()
    tokenizer = RobertaTokenizer.from_pretrained(pretrained_model) if name == 'roberta' \
        else BertTokenizer.from_pretrained(pretrained_model)
    reader = BertNLIDatasetReader(pretrained_model,
                                  lazy=True,
                                  percent_data=0.001,
                                  max_seq_length=max_seq_length)
    iterator = BasicIterator(batch_size=batch_size, max_instances_in_memory=10000)

    for batch_dict in iterator(reader.read(file), num_epochs=1):
        assert batch_dict['input_ids'].size() == batch_dict['token_type_ids'].size() \
            == batch_dict['attention_mask'].size()
        for idx in range(batch_dict['input_ids'].size(0)):
            input_ids = batch_dict['input_ids'][idx].numpy().tolist()
            token_type_ids = batch_dict['token_type_ids'][idx]
            attention_mask = batch_dict['attention_mask'][idx]
            premise = batch_dict['metadata'][idx]['premise_tokens']
            hypothesis = batch_dict['metadata'][idx]['hypothesis_tokens']

            num_extra_tokens = 3 if name == 'bert' else 4
            num_input_ids = len(premise) + len(hypothesis) + num_extra_tokens

            # Check input ids
            if name == 'bert':
                assert input_ids[:num_input_ids] == tokenizer.convert_tokens_to_ids(
                    ['[CLS]'] + premise + ['[SEP]'] + hypothesis + ['[SEP]'])
                segment_divide = len(premise) + 2
                assert input_ids[:segment_divide][-1] == 102
                assert torch.sum(token_type_ids[:segment_divide]) == 0
                assert torch.sum(token_type_ids[segment_divide:num_input_ids]) \
                    == num_input_ids - segment_divide
                assert torch.sum(token_type_ids[num_input_ids:]) == 0
            else:
                assert input_ids[:num_input_ids] == tokenizer.convert_tokens_to_ids(
                    ['<s>'] + premise + ['</s>'] * 2 + hypothesis + ['</s>'])

            # Check attention mask
            assert torch.sum(attention_mask[:num_input_ids]).item() == num_input_ids
            assert torch.sum(attention_mask[num_input_ids:]).item() == 0
def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = True):
    super(RoBERTa, self).__init__()
    self.config_keys = ['max_seq_length', 'do_lower_case']
    self.max_seq_length = max_seq_length
    self.do_lower_case = do_lower_case

    self.roberta = RobertaModel.from_pretrained(model_name_or_path)
    self.tokenizer = RobertaTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case)
    self.cls_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.cls_token])[0]
    self.sep_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.sep_token])[0]
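# Minimal sketch of how the cls/sep token ids captured above are typically used to
# frame a token-id sequence, assuming a tokenizer loaded as in the snippet; `model`
# stands in for an instance of the class, and `sketch_encode` is a hypothetical helper.
def sketch_encode(model, text):
    token_ids = model.tokenizer.convert_tokens_to_ids(model.tokenizer.tokenize(text))
    # leave room for the two special tokens
    token_ids = token_ids[:model.max_seq_length - 2]
    # RoBERTa expects <s> ... </s>, i.e. cls_token_id first and sep_token_id last
    return [model.cls_token_id] + token_ids + [model.sep_token_id]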
def add_pytorch_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in pytorch_transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name

    if tokenizer_name.startswith("bert-"):
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    # Note: the original check used the misspelled prefix "transo-xl-", so the
    # special tokens were never added for Transformer-XL models.
    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transfo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place that can be simplified by the "model-before-preprocess"
    # reorganization; we can pass the tokenizer created in the model here, see issue <TBD>

    # Do not use tokenizer.vocab_size: it does not include newly added tokens.
    vocab_size = len(tokenizer)
    if tokenizer_name.startswith("roberta-"):
        if tokenizer.convert_ids_to_tokens(vocab_size - 1) is None:
            vocab_size -= 1
        else:
            log.info("Time to delete vocab_size-1 in preprocess.py !!!")
        # Due to a quirk in huggingface's file, the last token of RobertaTokenizer is None;
        # remove this workaround when they fix the problem.

    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added pytorch_transformers vocab (%s): %d tokens", tokenizer_name, len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(word, input_module_tokenizer_name(tokenizer_name))
def __init__(self, model):
    super().__init__()
    if 'roberta' in model:
        print("Roberta model: {}".format(model))
        self.tokenizer = RobertaTokenizer.from_pretrained(model)
        self.bert = RobertaModel.from_pretrained(model)
    else:
        print("Bert model: {}".format(model))
        self.tokenizer = BertTokenizer.from_pretrained(model)
        self.bert = BertModel.from_pretrained(model)
    self.dim = self.bert.pooler.dense.in_features
    self.max_len = self.bert.embeddings.position_embeddings.num_embeddings
    if use_cuda:
        self.cuda()
def __init__(self, opt):
    self.opt = opt

    if 'roberta' in opt.pretrained_bert_name:
        tokenizer = RobertaTokenizer.from_pretrained(opt.pretrained_bert_name)
        transformer = RobertaModel.from_pretrained(opt.pretrained_bert_name,
                                                   output_attentions=True)
    elif 'bert' in opt.pretrained_bert_name:
        tokenizer = BertTokenizer.from_pretrained(opt.pretrained_bert_name)
        transformer = BertModel.from_pretrained(opt.pretrained_bert_name,
                                                output_attentions=True)
    elif 'xlnet' in opt.pretrained_bert_name:
        tokenizer = XLNetTokenizer.from_pretrained(opt.pretrained_bert_name)
        transformer = XLNetModel.from_pretrained(opt.pretrained_bert_name,
                                                 output_attentions=True)

    # The original condition `if 'bert' or 'xlnet' in opt.model_name` is always true,
    # because the non-empty string 'bert' is truthy; check both substrings explicitly.
    if 'bert' in opt.model_name or 'xlnet' in opt.model_name:
        tokenizer = Tokenizer4Pretrain(tokenizer, opt.max_seq_len)
        self.model = opt.model_class(transformer, opt).to(opt.device)
    # elif 'xlnet' in opt.model_name:
    #     tokenizer = Tokenizer4Pretrain(tokenizer, opt.max_seq_len)
    #     self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset

    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, train_file, valid_file, test_file, dict_path, bert_path, bert_name,
             batch_size=4, seq_length=20, negative_sample_rate=1.0, negative_sample_size=25):
    self.batch_size = batch_size
    self.negative_sampling_rate = negative_sample_rate
    self.negative_sampling_size = negative_sample_size
    self.seq_length = seq_length
    self.ent_dict = {}

    if bert_name == 'roberta-base':
        self.tokenizer = RobertaTokenizer.from_pretrained(bert_path + bert_name)
    elif bert_name == 'bert-base-uncased':
        self.tokenizer = BertTokenizer.from_pretrained(bert_path + bert_name)

    logger.info('reading entity dict...')
    with open(dict_path) as f:
        for line in f:
            entity, entity_id = line.strip('\n').split('\t')
            self.ent_dict[entity] = int(entity_id)
    logger.info('there are {} entities'.format(len(self.ent_dict)))

    self.train_corpus = self.read_file(train_file)[:96106]
    self.total_train_instances = 96106
    random.shuffle(self.train_corpus)
    logger.info('there are {} instances in {}'.format(self.total_train_instances, train_file))

    self.valid_corpus = self.read_file(valid_file)
    self.total_valid_instances = len(self.valid_corpus)
    random.shuffle(self.valid_corpus)
    logger.info('there are {} instances in {}'.format(self.total_valid_instances, valid_file))

    self.test_corpus = self.read_file(test_file)
    self.total_test_instances = len(self.test_corpus)
    random.shuffle(self.test_corpus)
    logger.info('there are {} instances in {}'.format(self.total_test_instances, test_file))
def check_tokenizer_dataset(pretrained_model, file):
    name = pretrained_model.split('-')[0].lower()
    assert name in ['bert', 'roberta']
    tokenizer = RobertaTokenizer.from_pretrained(pretrained_model) if name == 'roberta' \
        else BertTokenizer.from_pretrained(pretrained_model)
    reader = BertNLIDatasetReader(pretrained_model, lazy=True, percent_data=0.001)

    for instance in reader.read(file):
        input_ids = instance['input_ids'].array.tolist()
        token_type_ids = instance['token_type_ids'].array
        attention_mask = instance['attention_mask'].array
        premise_tokens = instance['metadata'].metadata['premise_tokens']
        hypothesis_tokens = instance['metadata'].metadata['hypothesis_tokens']

        assert len(input_ids) == len(token_type_ids) == len(attention_mask)
        assert attention_mask.all() == 1
        assert reader._label_dict[instance['metadata'].metadata['label']] == instance['label'].array

        if name == 'bert':
            # The original code dropped the `assert` here, so the comparison had no effect.
            assert tokenizer.convert_tokens_to_ids(
                ['[CLS]'] + premise_tokens + ['[SEP]'] + hypothesis_tokens + ['[SEP]']) == input_ids
            segment_divide = len(premise_tokens) + 2
            assert input_ids[:segment_divide][-1] == 102
            assert token_type_ids[:segment_divide].all() == 0
            assert token_type_ids[segment_divide:].all() == 1
        elif name == 'roberta':
            assert tokenizer.convert_tokens_to_ids(
                ['<s>'] + premise_tokens + ['</s>', '</s>'] + hypothesis_tokens + ['</s>']) == input_ids
def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = True):
    super(RoBERTa, self).__init__()
    self.config_keys = ['max_seq_length', 'do_lower_case']
    self.do_lower_case = do_lower_case

    if max_seq_length > 510:
        logging.warning(
            "RoBERTa only allows a max_seq_length of 510 (512 with special tokens). "
            "Value will be set to 510")
        max_seq_length = 510
    self.max_seq_length = max_seq_length

    self.roberta = RobertaModel.from_pretrained(model_name_or_path)
    self.tokenizer = RobertaTokenizer.from_pretrained(model_name_or_path,
                                                      do_lower_case=do_lower_case)
    self.cls_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.cls_token])[0]
    self.sep_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.sep_token])[0]
def __init__(self, pretrained_model, percent_data=1, max_seq_length=None, lazy=False) -> None:
    super().__init__(lazy)
    assert percent_data > 0 and percent_data <= 1
    self.percent_data = percent_data
    self.max_seq_length = max_seq_length

    self.tokenizer_class = pretrained_model.split('-')[0].lower()
    if self.tokenizer_class == 'roberta':
        self._tokenizer = RobertaTokenizer.from_pretrained(pretrained_model)
    elif self.tokenizer_class == 'bert':
        self._tokenizer = BertTokenizer.from_pretrained(pretrained_model, do_lower_case=True)
    else:
        raise ValueError('tokenizer_model must either be roberta or bert')

    self.sep_id = self._tokenizer.encode(self._tokenizer.sep_token)[0]
    self._label_dict = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
print("Build bert model.") bert_model = RobertaModel(RobertaConfig().from_pretrained(args.roberta_model)) print("Build Drop model.") network = NumericallyAugmentedBertNet(bert_model, hidden_size=bert_model.config.hidden_size, dropout_prob=0.0, use_gcn=args.use_gcn, gcn_steps=args.gcn_steps) if args.cuda: network.cuda() print("Load from pre path {}.".format(args.pre_path)) network.load_state_dict(torch.load(args.pre_path)) print("Load data from {}.".format(args.inf_path)) tokenizer = RobertaTokenizer.from_pretrained(args.roberta_model) inf_iter = DropBatchGen(args, tokenizer, DropReader(tokenizer, passage_length_limit=463, question_length_limit=46)._read(args.inf_path)) print("Start inference...") result = {} network.eval() with torch.no_grad(): for batch in tqdm(inf_iter): output_dict = network(**batch) for i in range(len(output_dict["question_id"])): result[output_dict["question_id"][i]] = output_dict["answer"][i]["predicted_answer"] with open(args.dump_path, "w", encoding="utf8") as f: json.dump(result, f)
def __init__(self, args, dictionary, embed_tokens, left_pad=False):
    super().__init__(dictionary)
    self.dropout = args.dropout

    # from pytorch_transformers import RobertaModel
    from fairseq.modules.roberta_causal_mask import RobertaCasulMaskModel, BertCasulMaskModel
    from pytorch_transformers.file_utils import PYTORCH_TRANSFORMERS_CACHE
    from pytorch_transformers import RobertaConfig, RobertaTokenizer, BertConfig, BertTokenizer

    if args.roberta_model.startswith('roberta'):
        self.roberta = RobertaCasulMaskModel.from_pretrained(
            args.roberta_model,
            cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
        self.config = RobertaConfig.from_pretrained(args.roberta_model)
        self.tokenizer = RobertaTokenizer.from_pretrained(args.roberta_model)
    else:
        self.roberta = BertCasulMaskModel.from_pretrained(
            args.roberta_model,
            cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
        self.config = BertConfig.from_pretrained(args.roberta_model)
        self.tokenizer = BertTokenizer.from_pretrained(args.roberta_model)

    self.config.output_attentions = True
    self.roberta.pooler.dense.weight.requires_grad = False
    self.roberta.pooler.dense.bias.requires_grad = False

    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    # self.embed_tokens = embed_tokens
    # self.embed_scale = math.sqrt(embed_dim)
    self.args = args

    # if args.sentence_transformer_arch == 'fairseq':
    #     self.padding_idx = embed_tokens.padding_idx
    #     self.sent_embed_positions = PositionalEmbedding(
    #         1024, embed_dim, self.padding_idx,
    #         left_pad=False,
    #         learned=args.encoder_learned_pos,
    #     )
    #     self.doc_layers = nn.ModuleList([])
    #     self.doc_layers.extend([
    #         TransformerEncoderLayer(args)
    #         for i in range(args.encoder_layers)
    #     ])

    if args.sentence_transformer_arch == 'bert':
        # from pytorch_transformers import RobertaConfig, RobertaTokenizer
        # self.config = RobertaConfig.from_pretrained(args.roberta_model)
        # self.config.output_attentions = True
        # self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

        embed_dim = self.config.hidden_size
        print('*** padding idx before ***', embed_tokens.padding_idx)
        self.padding_idx = self.tokenizer.convert_tokens_to_ids(self.tokenizer.pad_token)
        print('*** padding idx after ***', self.padding_idx)

        # let's assume each document has at most 128-self.padding_idx-1 sentences
        # in case of roberta, it is 126
        self.sent_position_embeddings = nn.Embedding(128, embed_dim)
        if args.encoder_layers:
            self.config.num_hidden_layers = args.encoder_layers
        if args.dropout:
            self.config.hidden_dropout_prob = args.dropout
        if args.attention_dropout:
            self.config.attention_probs_dropout_prob = args.attention_dropout

        if args.attn_type == 'attn_score':
            self.sent_encoder = AttnScoreBertEncoder(self.config)
        elif args.attn_type == 'attn_prob':
            self.sent_encoder = BertEncoder(self.config)
        else:
            raise Exception('--attn-type doesn\'t support {} yet !'.format(args.attn_type))
        self.sent_encoder.apply(self._init_weights)

        print('*** sentence encoder config ***')
        print(self.config)
    else:
        raise Exception('--sentence-transformer-arch doesn\'t support {} yet!'.format(
            args.sentence_transformer_arch))
def make_binary_dataset(input_prefix, output_prefix, lang, append_eos=False):
    if lang == args.target_lang:
        dict = flexible_dictionary.FlexibleDictionary.load(dict_path(lang))
    else:
        # dict = bert_dictionary.BertDictionary.load(dict_path(lang))
        dict = gpt2_dictionary.GPT2Dictionary.load(dict_path(lang))
    print('| [{}] Dictionary: {} types | {} types (for real)'.format(lang, len(dict) - 1, len(dict)))

    ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_path(output_prefix, lang, 'bin'))

    def consumer(tensor):
        ds.add_item(tensor)

    input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
    if lang == args.target_lang:
        res = Tokenizer.binarize(input_file, dict, consumer, append_eos=append_eos)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], res['ntok'],
            100 * res['nunk'] / res['ntok'],
            dict.unk_word if hasattr(dict, 'unk_word') else '<no_unk_word>'))
    else:
        # read article
        # from pytorch_pretrained_bert.tokenization import BertTokenizer
        # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        from pytorch_transformers import RobertaTokenizer
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

        def penn_token2orig_token(sent):
            # -LRB- -RRB- -LSB- -RSB- -LCB- -RCB-
            '''
            penn2orig = {"``": '"', "''": '"',
                         "-LRB-": '(', "-RRB-": ')',
                         "-LSB-": '[', "-RSB-": ']',
                         "-LCB-": '{', "-RCB-": '}'}
            '''
            penn2orig = {"-LRB-": '(', "-RRB-": ')',
                         "-LSB-": '[', "-RSB-": ']',
                         "-LCB-": '{', "-RCB-": '}',
                         "-lrb-": '(', "-rrb-": ')',
                         "-lsb-": '[', "-rsb-": ']',
                         "-lcb-": '{', "-rcb-": '}'}
            words = sent.strip().split()
            words = [wd if not wd in penn2orig else penn2orig[wd] for wd in words]
            return ' '.join(words)

        num_token, num_unk_token = 0, 0
        num_seq = 0
        skip_line = 0
        for line in open(input_file, encoding='utf8'):
            sents = line.strip().split('<S_SEP>')
            sents = sents[0:args.max_num_sentences]
            sents = [' '.join(sent.strip().split()[0:args.max_num_words]) for sent in sents]
            # print(sents)
            sents = [tokenizer.tokenize(penn_token2orig_token(sent)) for sent in sents]
            article_wids = []
            for i, sent in enumerate(sents):
                # sometimes there are too many tokens
                MAXLEN = 500
                if len(sent) > MAXLEN:
                    # sent = sent[0:MAXLEN]
                    print(' '.join(sent))
                    skip_line += 1
                    print(skip_line)
                    continue
                if i != 0:
                    article_wids.append(dict.sep_index)
                wids = tokenizer.convert_tokens_to_ids(sent)
                # wids_vocab = [dict.index(word) for word in sent]
                # assert wids == wids_vocab, 'word indices should be the same!'
                article_wids.extend(wids)
                for wid in wids:
                    if wid == dict.unk_index:
                        num_unk_token += 1
                    num_token += 1

            num_seq += 1
            tensor = torch.IntTensor(article_wids)
            # print( dict.string_complete(tensor) )
            ds.add_item(tensor)

        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, num_seq, num_token,
            100 * num_unk_token / num_token,
            dict.unk_word if hasattr(dict, 'unk_word') else '<no_unk_word>'))

    ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))
def test_roberta_embeddings():
    roberta_model: str = "roberta-base"

    tokenizer = RobertaTokenizer.from_pretrained(roberta_model)
    model = RobertaModel.from_pretrained(
        pretrained_model_name_or_path=roberta_model, output_hidden_states=True
    )
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    with torch.no_grad():
        tokens = tokenizer.tokenize("<s> " + s + " </s>")

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        hidden_states = model(tokens_tensor)[-1]

        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    #    0       1      2      3        4         5       6     7       8      9       10    11     12     13     14    15
    #
    # '<s>', 'Ber', 'lin', 'Ġand', 'ĠMunich', 'Ġhave', 'Ġa', 'Ġlot', 'Ġof', 'Ġpupp', 'ete', 'er', 'Ġto', 'Ġsee', 'Ġ.', '</s>'
    #          \     /       |        |          |       |     |       |       \      |     /      |      |      |
    #           Berlin      and     Munich      have     a    lot      of         puppeteer        to    see     .
    #
    #             0          1        2           3      4     5       6              7            8      9      10

    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        embeddings = RoBERTaEmbeddings(
            pretrained_model_name_or_path=roberta_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)

        return flair_sentence

    # First subword embedding
    sentence_first_subword = embed_sentence(sentence=s, pooling_operation="first")

    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[0].embedding.tolist()

    puppeteer_first_subword_embedding_ref = first_layer[9].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert puppeteer_first_subword_embedding_ref == puppeteer_first_subword_embedding_actual

    # Last subword embedding
    sentence_last_subword = embed_sentence(sentence=s, pooling_operation="last")

    # The first token is split into two subwords.
    # As we use "last" as pooling operation, we consider the last subword as "first token" here.
    first_token_embedding_ref = first_layer[2].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[0].embedding.tolist()

    puppeteer_last_subword_embedding_ref = first_layer[11].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert puppeteer_last_subword_embedding_ref == puppeteer_last_subword_embedding_actual

    # First and last subword embedding
    sentence_first_last_subword = embed_sentence(sentence=s, pooling_operation="first_last")

    first_token_embedding_ref = torch.cat([first_layer[1], first_layer[2]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[0].embedding.tolist()

    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[9], first_layer[11]]
    ).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[
        7
    ].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (
        puppeteer_first_last_subword_embedding_ref
        == puppeteer_first_last_subword_embedding_actual
    )

    # Mean of all subword embeddings
    sentence_mean_subword = embed_sentence(sentence=s, pooling_operation="mean")

    first_token_embedding_ref = calculate_mean_embedding([first_layer[1], first_layer[2]]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[0].embedding.tolist()

    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[9], first_layer[10], first_layer[11]]
    ).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert puppeteer_mean_subword_embedding_ref == puppeteer_mean_subword_embedding_actual

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(
        sentence="Munich", pooling_operation="first", layers="1,2,3,4"
    )

    ref_embedding_size = 4 * 768
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(
        sentence="Berlin",
        pooling_operation="first",
        layers="1,2,3,4",
        use_scalar_mix=True,
    )

    ref_embedding_size = 1 * 768
    actual_embedding_size = len(sentence_mult_layers_scalar_mix.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--output_dir", default='./proc/roberta_adapter', type=str, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument("--freeze_bert", default=False, type=bool, help="freeze the parameters of original model.") parser.add_argument("--freeze_adapter", default=True, type=bool, help="freeze the parameters of adapter.") parser.add_argument( '--fusion_mode', type=str, default='concat', help='the fusion mode for bert feature and adapter feature |add|concat' ) parser.add_argument("--adapter_transformer_layers", default=2, type=int, help="The transformer layers of adapter.") parser.add_argument("--adapter_size", default=768, type=int, help="The hidden size of adapter.") parser.add_argument("--adapter_list", default="0,11,23", type=str, help="The layer where add an adapter") parser.add_argument( "--adapter_skip_layers", default=0, type=int, help="The skip_layers of adapter according to bert layers") parser.add_argument("--no_cuda", default=False, action='store_true', help="Avoid using CUDA when available") parser.add_argument( '--meta_fac_adaptermodel', default="../pretrained_models/fac-adapter/pytorch_model.bin", type=str, help='the pretrained factual adapter model') parser.add_argument( '--meta_lin_adaptermodel', default="../pretrained_models/lin-adapter/pytorch_model.bin", type=str, help='the pretrained linguistic adapter model') args = parser.parse_args() args.adapter_list = args.adapter_list.split(',') args.adapter_list = [int(i) for i in args.adapter_list] if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # Setup CUDA, GPU device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() args.device = device ## test loading, inference, and save model, load model. # Load pretrained model/tokenizer tokenizer = RobertaTokenizer.from_pretrained('roberta-large') model = RobertaModelwithAdapter(args) model.to(args.device) # Encode text input_ids = torch.tensor([ tokenizer.encode("Here is some text to encode", add_special_tokens=True) ]).to( args.device ) # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model. with torch.no_grad(): last_hidden_states = model(input_ids)[0] # Models outputs are tuples # The last_hidden_state can be used as input for downstream tasks print('last_hidden_states:', last_hidden_states) # Savning model print('Saving model...') model_to_save = model.module if hasattr( model, 'module') else model # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) # Loading saved model print('Loading model...') if hasattr(model, 'module'): model.module.load_state_dict( torch.load(os.path.join(args.output_dir, 'pytorch_model.bin'))) else: # Take care of distributed/parallel training model.load_state_dict( torch.load(os.path.join(args.output_dir, 'pytorch_model.bin')))
def __init__(self, args):
    super().__init__()

    if args.hfroberta_model_dir is not None:
        # load RoBERTa model from file
        roberta_model_name = str(args.hfroberta_model_dir) + "/"
        dict_file = roberta_model_name
        print("loading huggingface RoBERTa model from {}".format(roberta_model_name))
    else:
        # load RoBERTa model from huggingface cache
        roberta_model_name = args.hfroberta_model_name
        dict_file = roberta_model_name

    # When using a cased model, make sure to pass do_lower_case=False directly to BaseTokenizer
    do_lower_case = False
    if 'uncased' in roberta_model_name:
        do_lower_case = True

    # Load pre-trained model tokenizer (vocabulary)
    self.tokenizer = RobertaTokenizer.from_pretrained(dict_file)

    # original vocab
    # The following process is based on gpt_connector.
    # RoBERTa also uses BPE. The bytes_to_unicode function takes all control
    # and whitespace characters in code points 0-255 and shifts them up
    # by 256 to make them printable. So space (code point 32) becomes Ġ (code point 288).
    # (copied from https://github.com/openai/gpt-2/issues/80#issuecomment-487202159).
    #
    # Other control characters will be removed during the vocab intersection process.
    def convert_word(word):
        if word == ROBERTA_UNK:
            return word
        if word == ROBERTA_MASK:
            return word
        if word == ROBERTA_START_SENTENCE:
            return word
        if word == ROBERTA_END_SENTENCE:
            return word
        if word == ROBERTA_PAD:
            return word

        if word.startswith('Ġ'):  # the token starts with a whitespace
            return word[1:]

        # The token does not start with a whitespace: it may not be the head of a word,
        # or it may be the head of a sentence. Need a duplication check?
        return f'_{word}_'

    _, gpt_vocab = zip(*sorted(self.tokenizer.decoder.items()))
    self.vocab = [convert_word(word) for word in gpt_vocab]
    self._init_inverse_vocab()

    # Get UNK symbol as it's written in the original RoBERTa vocab.
    unk_index = self.inverse_vocab[ROBERTA_UNK]  # OPENAI_UNK
    self.unk_symbol = self.tokenizer.decoder[unk_index]

    # Get MASK symbol as it's written in the original RoBERTa vocab.
    mask_index = self.inverse_vocab[ROBERTA_MASK]
    self.mask_symbol = self.tokenizer.decoder[mask_index]

    # Load pre-trained model (weights)
    self.masked_roberta_model = RobertaForMaskedLM.from_pretrained(roberta_model_name)
    self.masked_roberta_model.eval()
    # print(self.masked_roberta_model.config)

    # ... to get hidden states
    self.roberta_model = self.masked_roberta_model.roberta

    # Sanity check.
    # assert len(self.vocab) == self.masked_roberta_model.config.vocab_size
    # assert 0 == self.masked_roberta_model.config.n_special

    self.eos_id = self.inverse_vocab[ROBERTA_END_SENTENCE]  # OPENAI_EOS
    self.model_vocab = self.vocab
    self.pad_id = self.inverse_vocab[ROBERTA_PAD]
    self.unk_index = self.inverse_vocab[ROBERTA_UNK]
    self.mask_index = mask_index
def __init__(self, pretrained_model_name, max_seq_len):
    super().__init__()
    self.tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name)
    self.max_seq_len = max_seq_len
    self.pad_value = 1  # <pad> https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json
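# Minimal padding sketch, assuming a tokenizer built as in the __init__ above:
# RoBERTa's <pad> id is 1 (hence pad_value = 1), so shorter sequences are right-padded
# with 1 up to max_seq_len. `sketch_pad` is a hypothetical helper, not part of the class.
def sketch_pad(tokenizer, text, max_seq_len, pad_value=1):
    ids = tokenizer.encode(text)[:max_seq_len]
    return ids + [pad_value] * (max_seq_len - len(ids))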
def main():
    args = parse_arguments()
    # argument setting
    print("=== Argument Setting ===")
    print("src: " + args.src)
    print("tgt: " + args.tgt)
    print("seed: " + str(args.seed))
    print("train_seed: " + str(args.train_seed))
    print("model_type: " + str(args.model))
    print("max_seq_length: " + str(args.max_seq_length))
    print("batch_size: " + str(args.batch_size))
    print("pre_epochs: " + str(args.pre_epochs))
    print("num_epochs: " + str(args.num_epochs))
    print("temperature: " + str(args.temperature))
    set_seed(args.train_seed)

    if args.model == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # preprocess data
    print("=== Processing datasets ===")
    if args.src == 'blog':
        src_x, src_y = CSV2Array(os.path.join('data', args.src, 'blog.csv'))
    elif args.src == 'airline':
        src_x, src_y = CSV2Array(os.path.join('data', args.src, 'airline.csv'))
    else:
        src_x, src_y = XML2Array(
            os.path.join('data', args.src, 'negative.review'),
            os.path.join('data', args.src, 'positive.review'))

    src_x, src_test_x, src_y, src_test_y = train_test_split(
        src_x, src_y, test_size=0.2, stratify=src_y, random_state=args.seed)

    if args.tgt == 'blog':
        tgt_x, tgt_y = CSV2Array(os.path.join('data', args.tgt, 'blog.csv'))
    elif args.tgt == 'airline':
        tgt_x, tgt_y = CSV2Array(os.path.join('data', args.tgt, 'airline.csv'))
    else:
        tgt_x, tgt_y = XML2Array(
            os.path.join('data', args.tgt, 'negative.review'),
            os.path.join('data', args.tgt, 'positive.review'))

    # The original unpacked the second element of this split into `tgt_test_y` twice;
    # the held-out inputs belong in `tgt_test_x`.
    tgt_train_x, tgt_test_x, tgt_train_y, tgt_test_y = train_test_split(
        tgt_x, tgt_y, test_size=0.2, stratify=tgt_y, random_state=args.seed)

    if args.model == 'roberta':
        src_features = roberta_convert_examples_to_features(
            src_x, src_y, args.max_seq_length, tokenizer)
        src_test_features = roberta_convert_examples_to_features(
            src_test_x, src_test_y, args.max_seq_length, tokenizer)
        tgt_features = roberta_convert_examples_to_features(
            tgt_x, tgt_y, args.max_seq_length, tokenizer)
        tgt_train_features = roberta_convert_examples_to_features(
            tgt_train_x, tgt_train_y, args.max_seq_length, tokenizer)
    else:
        src_features = convert_examples_to_features(
            src_x, src_y, args.max_seq_length, tokenizer)
        src_test_features = convert_examples_to_features(
            src_test_x, src_test_y, args.max_seq_length, tokenizer)
        tgt_features = convert_examples_to_features(
            tgt_x, tgt_y, args.max_seq_length, tokenizer)
        tgt_train_features = convert_examples_to_features(
            tgt_train_x, tgt_train_y, args.max_seq_length, tokenizer)

    # load dataset
    src_data_loader = get_data_loader(src_features, args.batch_size)
    src_data_eval_loader = get_data_loader(src_test_features, args.batch_size)
    tgt_data_train_loader = get_data_loader(tgt_train_features, args.batch_size)
    tgt_data_all_loader = get_data_loader(tgt_features, args.batch_size)

    # load models
    if args.model == 'bert':
        src_encoder = BertEncoder()
        tgt_encoder = BertEncoder()
        src_classifier = BertClassifier()
    elif args.model == 'distilbert':
        src_encoder = DistilBertEncoder()
        tgt_encoder = DistilBertEncoder()
        src_classifier = BertClassifier()
    else:
        src_encoder = RobertaEncoder()
        tgt_encoder = RobertaEncoder()
        src_classifier = RobertaClassifier()
    discriminator = Discriminator()

    if args.load:
        src_encoder = init_model(args, src_encoder, restore=param.src_encoder_path)
        src_classifier = init_model(args, src_classifier, restore=param.src_classifier_path)
        tgt_encoder = init_model(args, tgt_encoder, restore=param.tgt_encoder_path)
        discriminator = init_model(args, discriminator, restore=param.d_model_path)
    else:
        src_encoder = init_model(args, src_encoder)
        src_classifier = init_model(args, src_classifier)
        tgt_encoder = init_model(args, tgt_encoder)
        discriminator = init_model(args, discriminator)

    # train source model
    print("=== Training classifier for source domain ===")
    if args.pretrain:
        src_encoder, src_classifier = pretrain(args, src_encoder, src_classifier, src_data_loader)

    # eval source model
    print("=== Evaluating classifier for source domain ===")
    evaluate(src_encoder, src_classifier, src_data_loader)
    evaluate(src_encoder, src_classifier, src_data_eval_loader)
    evaluate(src_encoder, src_classifier, tgt_data_all_loader)

    for params in src_encoder.parameters():
        params.requires_grad = False
    for params in src_classifier.parameters():
        params.requires_grad = False

    # train target encoder by GAN
    print("=== Training encoder for target domain ===")
    if args.adapt:
        tgt_encoder.load_state_dict(src_encoder.state_dict())
        tgt_encoder = adapt(args, src_encoder, tgt_encoder, discriminator,
                            src_classifier, src_data_loader,
                            tgt_data_train_loader, tgt_data_all_loader)

    # eval target encoder on lambda0.1 set of target dataset
    print("=== Evaluating classifier for encoded target domain ===")
    print(">>> source only <<<")
    evaluate(src_encoder, src_classifier, tgt_data_all_loader)
    print(">>> domain adaption <<<")
    evaluate(tgt_encoder, src_classifier, tgt_data_all_loader)
def main():
    parser = ArgumentParser()
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--model_type", default='roberta', type=str, required=True,
                        help="Model type selected in the list")
    parser.add_argument("--model_name_or_path", default='roberta-large', type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected in the list: ")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--comment", default='', type=str, help="The comment")
    parser.add_argument('--output_dir', type=Path, default="output")
    parser.add_argument("--restore", type=bool, default=True,
                        help="Whether to restore from the last checkpoint; if there are no checkpoints, start from scratch")
    parser.add_argument("--max_seq_length", type=int, default=256,
                        help="max length of token sequence")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", type=bool, default=False,
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--adapter_transformer_layers", default=2, type=int,
                        help="The transformer layers of adapter.")
    parser.add_argument("--adapter_size", default=768, type=int,
                        help="The hidden size of adapter.")
    parser.add_argument("--adapter_list", default="0,11,23", type=str,
                        help="The layers where an adapter is added")
    parser.add_argument("--adapter_skip_layers", default=0, type=int,
                        help="The skip_layers of adapter according to bert layers")
    parser.add_argument('--meta_adapter_model', type=str, help='the pretrained adapter model')
    parser.add_argument("--per_gpu_train_batch_size", default=32, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=64, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=3e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--logging_steps', type=int, default=10,
                        help="How often do we snapshot losses, for inclusion in the progress dump? (0 = disable)")
    parser.add_argument('--save_steps', type=int, default=1000,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument('--eval_steps', type=int, default=None,
                        help="eval every X updates steps.")
    parser.add_argument('--max_save_checkpoints', type=int, default=500,
                        help="The max number of checkpoints to keep; older checkpoints beyond this count are deleted.")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with a step number")
    parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
    parser.add_argument('--negative_sample', type=int, default=0,
                        help='how many negative samples to select')

    # args
    args = parser.parse_args()
    args.adapter_list = args.adapter_list.split(',')
    args.adapter_list = [int(i) for i in args.adapter_list]

    name_prefix = ('maxlen-' + str(args.max_seq_length) + '_' + 'epoch-' + str(args.num_train_epochs)
                   + '_' + 'batch-' + str(args.per_gpu_train_batch_size) + '_' + 'lr-' + str(args.learning_rate)
                   + '_' + 'warmup-' + str(args.warmup_steps) + '_' + str(args.comment))
    args.my_model_name = args.task_name + '_' + name_prefix
    args.output_dir = os.path.join(args.output_dir, args.my_model_name)

    if args.eval_steps is None:
        args.eval_steps = args.save_steps * 2

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    args.output_mode = output_modes[args.task_name]
    processor = processors[args.task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
    config = RobertaConfig.from_pretrained('roberta-large', output_attentions=True)
    pretrained_model = PretrainedModel()
    adapter_model = AdapterModel(args, pretrained_model.config)

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    pretrained_model.to(args.device)
    adapter_model.to(args.device)
    model = (pretrained_model, adapter_model)

    logger.info("Training/evaluation parameters %s", args)
    val_dataset = load_and_cache_examples(args, args.task_name, tokenizer, 'dev', evaluate=True)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, 'train', evaluate=False)
        global_step, tr_loss = train(args, train_dataset, val_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        model_to_save = model.module if hasattr(adapter_model, 'module') else adapter_model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
def main(): parser = argparse.ArgumentParser(description="Training") parser.add_argument( "--dump_path", type=str, required=True, help="The output directory (log, checkpoints, parameters, etc.)") parser.add_argument( "--data_file", type=str, required=True, help= "The binarized file (tokenized + tokens_to_ids) and grouped by sequence." ) parser.add_argument("--token_counts", type=str, required=True, help="The token counts in the data_file for MLM.") parser.add_argument("--force", action='store_true', help="Overwrite dump_path if it already exists.") parser.add_argument("--vocab_size", default=30522, type=int, help="The vocabulary size.") parser.add_argument( "--max_position_embeddings", default=512, type=int, help="Maximum sequence length we can model (including [CLS] and [SEP])." ) parser.add_argument( "--sinusoidal_pos_embds", action='store_false', help= "If true, the position embeddings are simply fixed with sinusoidal embeddings." ) parser.add_argument("--n_layers", default=6, type=int, help="Number of Transformer blocks.") parser.add_argument("--n_heads", default=12, type=int, help="Number of heads in the self-attention module.") parser.add_argument( "--dim", default=768, type=int, help="Dimension through the network. Must be divisible by n_heads") parser.add_argument("--hidden_dim", default=3072, type=int, help="Intermediate dimension in the FFN.") parser.add_argument("--dropout", default=0.1, type=float, help="Dropout.") parser.add_argument("--attention_dropout", default=0.1, type=float, help="Dropout in self-attention.") parser.add_argument("--activation", default='gelu', type=str, help="Activation to use in self-attention") parser.add_argument( "--tie_weights_", action='store_false', help= "If true, we tie the embeddings matrix with the projection over the vocabulary matrix. Default is true." ) parser.add_argument("--from_pretrained_weights", default=None, type=str, help="Load student initialization checkpoint.") parser.add_argument( "--from_pretrained_config", default=None, type=str, help="Load student initialization architecture config.") parser.add_argument("--teacher_type", default="bert", choices=["bert", "roberta"], help="Teacher type (BERT, RoBERTa).") parser.add_argument("--teacher_name", default="bert-base-uncased", type=str, help="The teacher model.") parser.add_argument("--temperature", default=2., type=float, help="Temperature for the softmax temperature.") parser.add_argument( "--alpha_ce", default=0.5, type=float, help="Linear weight for the distillation loss. Must be >=0.") parser.add_argument("--alpha_mlm", default=0.5, type=float, help="Linear weight for the MLM loss. Must be >=0.") parser.add_argument("--alpha_mse", default=0.0, type=float, help="Linear weight of the MSE loss. Must be >=0.") parser.add_argument( "--alpha_cos", default=0.0, type=float, help="Linear weight of the cosine embedding loss. Must be >=0.") parser.add_argument( "--mlm_mask_prop", default=0.15, type=float, help="Proportion of tokens for which we need to make a prediction.") parser.add_argument("--word_mask", default=0.8, type=float, help="Proportion of tokens to mask out.") parser.add_argument("--word_keep", default=0.1, type=float, help="Proportion of tokens to keep.") parser.add_argument("--word_rand", default=0.1, type=float, help="Proportion of tokens to randomly replace.") parser.add_argument( "--mlm_smoothing", default=0.7, type=float, help= "Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec)." 
) parser.add_argument( "--restrict_ce_to_mask", action='store_true', help= "If true, compute the distilation loss only the [MLM] prediction distribution." ) parser.add_argument("--n_epoch", type=int, default=3, help="Number of pass on the whole dataset.") parser.add_argument("--batch_size", type=int, default=5, help="Batch size (for each process).") parser.add_argument( "--tokens_per_batch", type=int, default=-1, help= "If specified, modify the batches so that they have approximately this number of tokens." ) parser.add_argument( "--shuffle", action='store_false', help="If true, shuffle the sequence order. Default is true.") parser.add_argument( "--group_by_size", action='store_false', help= "If true, group sequences that have similar length into the same batch. Default is true." ) parser.add_argument( "--gradient_accumulation_steps", type=int, default=50, help="Gradient accumulation for larger training batches.") parser.add_argument("--warmup_prop", default=0.05, type=float, help="Linear warmup proportion.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") parser.add_argument("--learning_rate", default=5e-4, type=float, help="The initial learning rate for Adam.") parser.add_argument("--adam_epsilon", default=1e-6, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=5.0, type=float, help="Max gradient norm.") parser.add_argument("--initializer_range", default=0.02, type=float, help="Random initialization range.") parser.add_argument( '--fp16', action='store_true', help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit" ) parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
"See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--n_gpu", type=int, default=1, help="Number of GPUs in the node.") parser.add_argument("--local_rank", type=int, default=-1, help="Distributed training - Local rank") parser.add_argument("--seed", type=int, default=56, help="Random seed") parser.add_argument("--log_interval", type=int, default=500, help="Tensorboard logging interval.") parser.add_argument("--checkpoint_interval", type=int, default=4000, help="Checkpoint interval.") args = parser.parse_args() ## ARGS ## init_gpu_params(args) set_seed(args) if args.is_master: if os.path.exists(args.dump_path): if not args.force: raise ValueError( f'Serialization dir {args.dump_path} already exists, but you have not precised wheter to overwrite it' 'Use `--force` if you want to overwrite it') else: shutil.rmtree(args.dump_path) if not os.path.exists(args.dump_path): os.makedirs(args.dump_path) logger.info( f'Experiment will be dumped and logged in {args.dump_path}') ### SAVE PARAMS ### logger.info(f'Param: {args}') with open(os.path.join(args.dump_path, 'parameters.json'), 'w') as f: json.dump(vars(args), f, indent=4) git_log(args.dump_path) assert (args.from_pretrained_weights is None and args.from_pretrained_config is None) or \ (args.from_pretrained_weights is not None and args.from_pretrained_config is not None) ### TOKENIZER ### if args.teacher_type == 'bert': tokenizer = BertTokenizer.from_pretrained(args.teacher_name) elif args.teacher_type == 'roberta': tokenizer = RobertaTokenizer.from_pretrained(args.teacher_name) special_tok_ids = {} for tok_name, tok_symbol in tokenizer.special_tokens_map.items(): idx = tokenizer.all_special_tokens.index(tok_symbol) special_tok_ids[tok_name] = tokenizer.all_special_ids[idx] logger.info(f'Special tokens {special_tok_ids}') args.special_tok_ids = special_tok_ids ## DATA LOADER ## logger.info(f'Loading data from {args.data_file}') with open(args.data_file, 'rb') as fp: data = pickle.load(fp) assert os.path.isfile(args.token_counts) logger.info( f'Loading token counts from {args.token_counts} (already pre-computed)' ) with open(args.token_counts, 'rb') as fp: counts = pickle.load(fp) assert len(counts) == args.vocab_size token_probs = np.maximum(counts, 1)**-args.mlm_smoothing for idx in special_tok_ids.values(): token_probs[idx] = 0. 
# do not predict special tokens token_probs = torch.from_numpy(token_probs) train_dataloader = Dataset(params=args, data=data) logger.info(f'Data loader created.') ## STUDENT ## if args.from_pretrained_weights is not None: assert os.path.isfile(args.from_pretrained_weights) assert os.path.isfile(args.from_pretrained_config) logger.info( f'Loading pretrained weights from {args.from_pretrained_weights}') logger.info( f'Loading pretrained config from {args.from_pretrained_config}') stu_architecture_config = DistilBertConfig.from_json_file( args.from_pretrained_config) stu_architecture_config.output_hidden_states = True student = DistilBertForMaskedLM.from_pretrained( args.from_pretrained_weights, config=stu_architecture_config) else: args.vocab_size_or_config_json_file = args.vocab_size stu_architecture_config = DistilBertConfig(**vars(args), output_hidden_states=True) student = DistilBertForMaskedLM(stu_architecture_config) if args.n_gpu > 0: student.to(f'cuda:{args.local_rank}') logger.info(f'Student loaded.') ## TEACHER ## if args.teacher_type == 'bert': teacher = BertForMaskedLM.from_pretrained(args.teacher_name, output_hidden_states=True) elif args.teacher_type == 'roberta': teacher = RobertaForMaskedLM.from_pretrained(args.teacher_name, output_hidden_states=True) if args.n_gpu > 0: teacher.to(f'cuda:{args.local_rank}') logger.info(f'Teacher loaded from {args.teacher_name}.') ## DISTILLER ## torch.cuda.empty_cache() distiller = Distiller(params=args, dataloader=train_dataloader, token_probs=token_probs, student=student, teacher=teacher) distiller.train() logger.info("Let's go get some drinks.")
def __init__(self, filename, maxlen=30):
    self.df = pd.read_csv(filename)
    self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    self.maxlen = maxlen
path_to_wsc = '../data/wsc_data/enhanced.tense.random.role.syn.voice.scramble.freqnoun.gender.number.adverb.tsv'
wsc_datapoints = pd.read_csv(path_to_wsc, sep='\t')


def replace_pronoun(tokenized_text, pronoun_index, tokenized_option):
    tokenized_text = tokenized_text[:pronoun_index] + tokenized_option + tokenized_text[pronoun_index:]
    new_pronoun_index = pronoun_index + len(tokenized_option)
    tokenized_text.pop(new_pronoun_index)
    return tokenized_text


# Load pre-trained model tokenizer (vocabulary)
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

correct_preds = 0
correct_preds_enhanced = 0
stability_match = 0
all_preds = 0

# Load pre-trained model (weights)
model = RobertaForMaskedLM.from_pretrained('roberta-large')
model.eval()

for q_index, dp_split in wsc_datapoints.iterrows():
    if dp_split['text_adverb'].replace(' ', '') != '-' and dp_split['text_adverb'].replace(' ', ''):
def __init__(self, max_seq_len, pretrained_roberta_name):
    self.tokenizer = RobertaTokenizer.from_pretrained(pretrained_roberta_name)
    self.max_seq_len = max_seq_len
def main(args): assert args.path is not None, '--path required for generation!' assert not args.sampling or args.nbest == args.beam, \ '--sampling requires --nbest to be equal to --beam' assert args.replace_unk is None or args.raw_text, \ '--replace-unk requires a raw text dataset (--raw-text)' # if args.save_path is not None: # if check_file_exists(args): # return import_user_module(args) if args.max_tokens is None and args.max_sentences is None: args.max_tokens = 12000 # print(args) utils.xpprint(args) use_cuda = torch.cuda.is_available() and not args.cpu # Load dataset splits task = tasks.setup_task(args) task.load_dataset(args.gen_subset) print('| {} {} {} examples'.format(args.data, args.gen_subset, len(task.dataset(args.gen_subset)))) # Set dictionaries try: src_dict = getattr(task, 'source_dictionary', None) except NotImplementedError: src_dict = None tgt_dict = task.target_dictionary # Load ensemble print('| loading model(s) from {}'.format(args.path)) models, _model_args = utils.load_ensemble_for_inference( args.path.split(':'), task, model_arg_overrides=eval(args.model_overrides), ) # Optimize ensemble for generation for model in models: model.make_generation_fast_( beamable_mm_beam_size=None if args.no_beamable_mm else args.beam, need_attn=args.print_alignment, ) if args.fp16: model.half() if use_cuda: model.cuda() # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) align_dict = utils.load_align_dict(args.replace_unk) # Load dataset (possibly sharded) itr = task.get_batch_iterator( dataset=task.dataset(args.gen_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=utils.resolve_max_positions( task.max_positions(), # *[model.max_positions() for model in models] ), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=args.required_batch_size_multiple, num_shards=args.num_shards, shard_id=args.shard_id, num_workers=args.num_workers, ).next_epoch_itr(shuffle=False) # Initialize generator gen_timer = StopwatchMeter() generator = task.build_generator(args) # Generate and compute BLEU score if args.sacrebleu: scorer = bleu.SacrebleuScorer() else: scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk()) num_sentences = 0 has_target = True if args.isRoberta: from pytorch_transformers import RobertaTokenizer tokenizer = RobertaTokenizer.from_pretrained('roberta-base') else: tokenizer = None with progress_bar.build_progress_bar(args, itr) as t: wps_meter = TimeMeter() for sample in t: sample = utils.move_to_cuda(sample) if use_cuda else sample if 'net_input' not in sample: continue prefix_tokens = None if args.prefix_size > 0: prefix_tokens = sample['target'][:, :args.prefix_size] gen_timer.start() hypos = task.inference_step(generator, models, sample, prefix_tokens) num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos) gen_timer.stop(num_generated_tokens) for i, sample_id in enumerate(sample['id'].tolist()): has_target = sample['target'] is not None # Remove padding src_tokens = utils.strip_pad( sample['net_input']['src_tokens'][i, :], tgt_dict.pad()) target_tokens = None if has_target: target_tokens = utils.strip_pad( sample['target'][i, :], tgt_dict.pad()).int().cpu() # Either retrieve the original sentences or regenerate them from tokens. 
if align_dict is not None: src_str = task.dataset( args.gen_subset).src.get_original_text(sample_id) target_str = task.dataset( args.gen_subset).tgt.get_original_text(sample_id) else: if src_dict is not None: src_str = src_dict.string(src_tokens, args.remove_bpe) else: src_str = "" if has_target: target_str = tgt_dict.string(target_tokens, args.remove_bpe, escape_unk=True) if not args.quiet: if src_dict is not None: if not args.isRoberta: print('S-{}\t{}'.format(sample_id, src_str)) else: src_text = ''.join(src_str.strip().split()) src_out = tokenizer.convert_tokens_to_string( src_text) print('S-{}\t{}'.format(sample_id, src_out)) if has_target: if not args.isRoberta: print('T-{}\t{}'.format(sample_id, target_str)) else: tgt_text = ''.join(target_str.strip().split()) tgt_out = tokenizer.convert_tokens_to_string( tgt_text) print('T-{}\t{}'.format(sample_id, tgt_out)) # Process top predictions for i, hypo in enumerate( hypos[i][:min(len(hypos), args.nbest)]): hypo_tokens, hypo_str, alignment = utils.post_process_prediction( hypo_tokens=hypo['tokens'].int().cpu(), src_str=src_str, alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None, align_dict=align_dict, tgt_dict=tgt_dict, remove_bpe=args.remove_bpe, ) if not args.quiet: if not args.isRoberta: print('H-{}\t{}\t{}'.format( sample_id, hypo['score'], hypo_str)) else: hypo_text = ''.join(hypo_str.strip().split()) hypo_out = tokenizer.convert_tokens_to_string( hypo_text) print('H-{}\t{}\t{}'.format( sample_id, hypo['score'], hypo_out)) print('P-{}\t{}'.format( sample_id, ' '.join( map( lambda x: '{:.4f}'.format(x), hypo['positional_scores'].tolist(), )))) if args.print_alignment: print('A-{}\t{}'.format( sample_id, ' '.join( map(lambda x: str(utils.item(x)), alignment)))) # Score only the top hypothesis if has_target and i == 0: if align_dict is not None or args.remove_bpe is not None: # Convert back to tokens for evaluation with unk replacement and/or without BPE target_tokens = tgt_dict.encode_line( target_str, add_if_not_exist=True) if hasattr(scorer, 'add_string'): scorer.add_string(target_str, hypo_str) else: scorer.add(target_tokens, hypo_tokens) wps_meter.update(num_generated_tokens) t.log({'wps': round(wps_meter.avg)}) num_sentences += sample['nsentences'] print( '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)' .format(num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg)) if has_target: print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam, scorer.result_string())) return scorer
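# Hedged illustration of the --isRoberta detokenisation step above: fairseq's
# dictionary string is a space-separated sequence of byte-level BPE pieces, so
# the pieces are glued back together and decoded through the tokenizer. The
# sample sentence is illustrative only.
from pytorch_transformers import RobertaTokenizer

roberta_tok = RobertaTokenizer.from_pretrained('roberta-base')
pieces = roberta_tok.tokenize('RoBERTa uses a lossless byte-level BPE.')
bpe_line = ' '.join(pieces)                  # roughly what src_str / hypo_str look like
restored = roberta_tok.convert_tokens_to_string(''.join(bpe_line.strip().split()))
print(restored)                              # the original sentence is recovered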
##########################
#   Utility functions    #
##########################
import torch
import time, sys
from pytorch_transformers import RobertaTokenizer, BertTokenizer

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-cased')


def _truncate_seq_pair(tokens_a, max_length):
    """Truncates a token sequence in place to the maximum length.

    Adapted from https://github.com/huggingface/pytorch-pretrained-BERT
    """
    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a)
        if total_length <= max_length:
            break
        tokens_a.pop()


def get_BERT_vector(sent1, sent2=None, max_sent1_len=400, max_sent2_len=100,
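# Hedged sketch of the two-sequence variant of the truncation heuristic the
# helper above was adapted from: trim the longer of the two token lists one
# token at a time until the pair fits. Not part of the original file.
def _truncate_seq_pair_two(tokens_a, tokens_b, max_length):
    while len(tokens_a) + len(tokens_b) > max_length:
        longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        longer.pop()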
def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers): print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1)) print('input_prefix', input_prefix) print(dict_path(lang)) dict = roberta_dictionary.RobertaDictionary.load(dict_path(lang)) input_file = "{}{}".format( input_prefix, ("." + lang) if lang is not None else "" ) from pytorch_transformers import RobertaTokenizer import torch tokenizer = RobertaTokenizer.from_pretrained('roberta-large') def penn_token2orig_token(sent): # -LRB- -RRB- -LSB- -RSB- -LCB- -RCB- penn2orig = {"``":'"', "''": '"', "-LRB-": '(', "-RRB-": ')', "-LSB-":'[', "-RSB-":']', "-LCB-":'{', "-RCB-":'}', "-lrb-": '(', "-rrb-": ')', "-lsb-": '[', "-rsb-": ']', "-lcb-": '{', "-rcb-": '}', } words = sent.strip().split() words = [wd if not wd in penn2orig else penn2orig[wd] for wd in words] return ' '.join(words) num_token, num_unk_token = 0, 0 num_seq = 0 ds = indexed_dataset.IndexedDatasetBuilder( dataset_dest_file(args, output_prefix, lang, "bin") ) truncated_number = 512 if lang == 'article' else 256 CLS_TOKEN = '<s>' SEP_TOKEN = '</s>' if lang == 'article': for line in open(input_file, encoding='utf8'): article_wids = [] min_src_sentence = 3 max_src_sentence = 100 max_src_ntokens_per_sent = 200 min_src_ntokens_per_sent = 5 sents = line.strip().split('<S_SEP>') sents = [sent.strip().split() for sent in sents] idxs = [i for i, sent in enumerate(sents) if (len(sent) > min_src_ntokens_per_sent)] src = [sents[i][:max_src_ntokens_per_sent] for i in idxs] src = src[:max_src_sentence] src_txt = [' '.join(sent) for sent in src] src_tokens = [tokenizer.tokenize(sent) for sent in src_txt] for i, sent in enumerate(src_tokens): MAX_SENT_NTOKENS = 500 if len(sent) > MAX_SENT_NTOKENS: sent = sent[:MAX_SENT_NTOKENS] if i == 0: input_text = [CLS_TOKEN] + sent + [SEP_TOKEN] elif i != 0: input_text = [SEP_TOKEN] + sent + [SEP_TOKEN] wids = tokenizer.convert_tokens_to_ids(input_text) article_wids.extend(wids) for wid in wids: if wid == dict.unk_index: num_unk_token += 1 num_token += 1 num_seq += 1 article_wids = article_wids[:truncated_number] if len(article_wids) > truncated_number else article_wids if article_wids[-1] != dict.sep_index: article_wids[-1] = dict.sep_index tensor = torch.IntTensor(article_wids) # print( dict.string_complete(tensor) ) ds.add_item(tensor) ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx")) elif lang == 'summary': for line in open(input_file, encoding='utf8'): article_wids = [] max_tgt_ntokens = 500 min_tgt_ntokens = 5 sents = line.strip().split('<S_SEP>') sents = [tokenizer.tokenize(sent) for sent in sents] for i, sent in enumerate(sents): # sometimes, there are too many token in one single sentence # to be specific, there are 8 sentences in the training article longer than 512, so truncate them to 500 # MAX_SENT_LEN = 500 # if len(sent) > MAX_SENT_LEN: # sent = sent[:MAX_SENT_LEN] if i != 0: input_text = [SEP_TOKEN] + sent else: input_text = sent wids = tokenizer.convert_tokens_to_ids(input_text) # wtoks = tokenizer.convert_ids_to_tokens(wids) # wstring = tokenizer.convert_tokens_to_string(wtoks) # wids_vocab = [dict.index(word) for word in input_text] # assert wids == wids_vocab, 'word indices should be the same!' 
article_wids.extend(wids) for wid in wids: if wid == dict.unk_index: num_unk_token += 1 num_token += 1 num_seq += 1 article_wids = article_wids[:truncated_number] if len(article_wids) > truncated_number else article_wids if article_wids[-1] == dict.sep_index: article_wids = article_wids[:len(article_wids)-1] # print(article_wids) if len(article_wids) > truncated_number: print('lang: %s, token len: %d, truncated len: %d' % (lang, len(article_wids), truncated_number)) tensor = torch.IntTensor(article_wids) # print( dict.string_complete(tensor) ) ds.add_item(tensor) ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx")) print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format( lang, input_file, num_seq, num_token, 100 * num_unk_token / num_token, dict.unk_word if hasattr(dict, 'unk_word') else '<no_unk_word>'))
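# Hedged, stand-alone illustration (outside fairseq) of the id-building
# convention used above for articles: the first sentence is wrapped in
# '<s>' ... '</s>', every later sentence in '</s>' ... '</s>', and the result
# is truncated. The sentences and the 512 limit are illustrative.
from pytorch_transformers import RobertaTokenizer

enc_tokenizer = RobertaTokenizer.from_pretrained('roberta-large')


def encode_article(sentences, max_len=512):
    wids = []
    for i, sent in enumerate(sentences):
        pieces = enc_tokenizer.tokenize(sent)
        if i == 0:
            pieces = ['<s>'] + pieces + ['</s>']
        else:
            pieces = ['</s>'] + pieces + ['</s>']
        wids.extend(enc_tokenizer.convert_tokens_to_ids(pieces))
    # the script above additionally forces the final position to the '</s>' id
    return wids[:max_len]


print(encode_article(['The first sentence .', 'The second sentence .']))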
print(srl)
crf = ConditionalRandomField(len(roles_to_idx), None, include_start_end_transitions=True)
print(crf)

model_parameters = filter(lambda p: p.requires_grad, chain(srl.parameters(), crf.parameters()))
num_params = sum([np.prod(p.size()) for p in model_parameters])
print("Total parameters =", num_params)
print(params)

if params.use_bert:
    bert_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    bert_model = RobertaModel.from_pretrained("roberta-base", output_hidden_states=True)
    if params.gpu_id > -1:
        bert_model.cuda()
else:
    bert_tokenizer = None
    bert_model = None

if params.gpu_id > -1:
    srl.cuda()
    crf.cuda()

srl.load_state_dict(torch.load(os.path.join(params.dir, params.modelname)))
crf.load_state_dict(torch.load(os.path.join(params.dir, params.modelname + "crf")))
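# Hedged sketch of how contextual features are usually read out of a
# RobertaModel loaded with output_hidden_states=True, as above. The sentence
# and the choice of layers are illustrative; the original feature-extraction
# code is not shown in this excerpt.
import torch
from pytorch_transformers import RobertaTokenizer, RobertaModel

feat_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
feat_model = RobertaModel.from_pretrained('roberta-base', output_hidden_states=True)
feat_model.eval()

tokens = ['<s>'] + feat_tokenizer.tokenize('The cat sat on the mat .') + ['</s>']
input_ids = torch.tensor([feat_tokenizer.convert_tokens_to_ids(tokens)])
with torch.no_grad():
    outputs = feat_model(input_ids)
sequence_output = outputs[0]            # (1, seq_len, hidden): last-layer states
hidden_states = outputs[-1]             # tuple: embedding layer + one entry per layer
features = torch.stack(hidden_states[-4:]).mean(0)   # one common pooling choice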
def main(): args = parse_arguments() # argument setting print("=== Argument Setting ===") print("src: " + args.src) print("tgt: " + args.tgt) print("alpha: " + str(args.alpha)) print("seed: " + str(args.seed)) print("train_seed: " + str(args.train_seed)) print("model_type: " + str(args.model)) print("max_seq_length: " + str(args.max_seq_length)) print("batch_size: " + str(args.batch_size)) print("num_epochs: " + str(args.num_epochs)) set_seed(args.train_seed) if args.model == 'roberta': tokenizer = RobertaTokenizer.from_pretrained('roberta-base') else: tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # preprocess data print("=== Processing datasets ===") if args.src == 'blog': src_x, src_y = CSV2Array(os.path.join('data', args.src, 'blog.csv')) elif args.src == 'airline': src_x, src_y = CSV2Array(os.path.join('data', args.src, 'airline.csv')) else: src_x, src_y = XML2Array( os.path.join('data', args.src, 'negative.review'), os.path.join('data', args.src, 'positive.review')) src_x, src_test_x, src_y, src_test_y = train_test_split( src_x, src_y, test_size=0.2, stratify=src_y, random_state=args.seed) if args.tgt == 'blog': tgt_x, tgt_y = CSV2Array(os.path.join('data', args.tgt, 'blog.csv')) elif args.tgt == 'airline': tgt_x, tgt_y = CSV2Array(os.path.join('data', args.tgt, 'airline.csv')) else: tgt_x, tgt_y = XML2Array( os.path.join('data', args.tgt, 'negative.review'), os.path.join('data', args.tgt, 'positive.review')) tgt_train_x, _, tgt_train_y, _ = train_test_split(tgt_x, tgt_y, test_size=0.2, stratify=tgt_y, random_state=args.seed) if args.model == 'roberta': src_features = roberta_convert_examples_to_features( src_x, src_y, args.max_seq_length, tokenizer) src_test_features = roberta_convert_examples_to_features( src_test_x, src_test_y, args.max_seq_length, tokenizer) tgt_features = roberta_convert_examples_to_features( tgt_train_x, tgt_train_y, args.max_seq_length, tokenizer) tgt_all_features = roberta_convert_examples_to_features( tgt_x, tgt_y, args.max_seq_length, tokenizer) else: src_features = convert_examples_to_features(src_x, src_y, args.max_seq_length, tokenizer) src_test_features = convert_examples_to_features( src_test_x, src_test_y, args.max_seq_length, tokenizer) tgt_features = convert_examples_to_features(tgt_train_x, tgt_train_y, args.max_seq_length, tokenizer) tgt_all_features = convert_examples_to_features( tgt_x, tgt_y, args.max_seq_length, tokenizer) # load dataset src_data_loader = get_data_loader(src_features, args.batch_size) src_data_loader_eval = get_data_loader(src_test_features, args.batch_size) tgt_data_loader = get_data_loader(tgt_features, args.batch_size) tgt_data_loader_all = get_data_loader(tgt_all_features, args.batch_size) # load models if args.model == 'bert': encoder = BertEncoder() cls_classifier = BertClassifier() dom_classifier = DomainClassifier() elif args.model == 'distilbert': encoder = DistilBertEncoder() cls_classifier = BertClassifier() dom_classifier = DomainClassifier() else: encoder = RobertaEncoder() cls_classifier = RobertaClassifier() dom_classifier = RobertaDomainClassifier() if args.load: encoder = init_model(encoder, restore=param.encoder_path) cls_classifier = init_model(cls_classifier, restore=param.cls_classifier_path) dom_classifier = init_model(dom_classifier, restore=param.dom_classifier_path) else: encoder = init_model(encoder) cls_classifier = init_model(cls_classifier) dom_classifier = init_model(dom_classifier) print("=== Start Training ===") if args.train: encoder, cls_classifier, dom_classifier = train( 
args, encoder, cls_classifier, dom_classifier, src_data_loader, src_data_loader_eval, tgt_data_loader, tgt_data_loader_all) print("=== Evaluating classifier for encoded target domain ===") print(">>> after training <<<") evaluate(encoder, cls_classifier, tgt_data_loader_all)
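# Hedged alternative to the if/else tokenizer selection in main() above: a
# lookup table keyed by the --model argument. The table itself is an
# assumption, not part of the original script; note that the script falls back
# to the BERT vocabulary for 'distilbert' as well.
from pytorch_transformers import RobertaTokenizer, BertTokenizer

TOKENIZERS = {
    'roberta': (RobertaTokenizer, 'roberta-base'),
    'bert': (BertTokenizer, 'bert-base-uncased'),
    'distilbert': (BertTokenizer, 'bert-base-uncased'),
}


def get_tokenizer_for(model_name):
    tokenizer_cls, pretrained = TOKENIZERS[model_name]
    return tokenizer_cls.from_pretrained(pretrained)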