def __init__(self, model_name: str, requires_grad: bool) -> None:
    super().__init__()
    self.transformer_model = AutoModel.from_pretrained(model_name)
    # I'm not sure if this works for all models; open an issue on github if you find a case
    # where it doesn't work.
    self.output_dim = self.transformer_model.config.hidden_size
    for param in self.transformer_model.parameters():
        param.requires_grad = requires_grad
def __init__(self, model_name: str) -> None:
    super().__init__()
    config = PretrainedConfig.from_json_file(path + "/../bert_configs/debug.json")
    self.transformer_model = AutoModel.from_pretrained(model_name, config=config)
    # I'm not sure if this works for all models; open an issue on github if you find a case
    # where it doesn't work.
    self.output_dim = self.transformer_model.config.hidden_size
def __init__(self, model_name: str, requires_grad: bool, freeze_num_layers: int,
             aligning_files: Dict[str, str], xnli_tasks: List[str]) -> None:
    super().__init__()
    self.aligning_layer_num = freeze_num_layers  # align by last frozen layer
    self.transformer_model = AutoModel.from_pretrained(model_name)
    # I'm not sure if this works for all models; open an issue on github if you find a case
    # where it doesn't work.
    self.output_dim = self.transformer_model.config.hidden_size
    # for param in self.transformer_model.parameters():
    #     param.requires_grad = requires_grad

    def get_layer_num(n):
        # Return the first integer component of a parameter name (its layer index),
        # or 0 if the name contains no integer.
        parts = n.split(".")
        for part in parts:
            try:
                return int(part)
            except ValueError:
                continue
        return 0

    if freeze_num_layers > 0:
        for n, p in self.transformer_model.named_parameters():
            layer_num = get_layer_num(n)
            if layer_num < freeze_num_layers:
                p.requires_grad = False

    # Drop the 4-character task prefix to recover the language code of each XNLI task.
    xnli_langs = [t[4:] for t in xnli_tasks]
    print(xnli_langs)
    for lang in xnli_langs:
        name = 'aligning_%s' % lang
        # Default to the identity matrix -> no alignment.
        aligning_matrix = torch.eye(self.output_dim)
        if lang in aligning_files and aligning_files[lang] != '':
            print(lang + " will be aligned")
            aligning_path = cached_path(aligning_files[lang])
            aligning_matrix = torch.FloatTensor(torch.load(aligning_path))
        aligning = torch.nn.Linear(self.output_dim, self.output_dim, bias=False)
        aligning.weight = torch.nn.Parameter(aligning_matrix, requires_grad=False)
        self.add_module(name, aligning)
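# Hedged usage sketch (not part of the original source): the constructor above registers one
# frozen, bias-free linear layer per XNLI language under the name 'aligning_<lang>'. A forward
# pass could look the module up by language and apply it to the contextual embeddings; the
# function and argument names below are hypothetical.
import torch


def apply_alignment(embedder: torch.nn.Module, embeddings: torch.Tensor, lang: str) -> torch.Tensor:
    # Fall back to the unaligned embeddings if no module was registered for this language.
    aligning = getattr(embedder, 'aligning_%s' % lang, None)
    if aligning is None:
        return embeddings
    return aligning(embeddings)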
def __init__(self, model_name: str, requires_grad: bool = True, freeze_num_layers: int = 0,
             aligning_files: Dict[str, str] = None) -> None:
    super().__init__()
    self.aligning_layer_num = freeze_num_layers  # align by last frozen layer
    self.transformer_model = AutoModel.from_pretrained(model_name)
    # I'm not sure if this works for all models; open an issue on github if you find a case
    # where it doesn't work.
    self.output_dim = self.transformer_model.config.hidden_size
    for param in self.transformer_model.parameters():
        param.requires_grad = requires_grad

    def get_layer_num(n):
        # Return the first integer component of a parameter name (its layer index),
        # or 0 if the name contains no integer.
        parts = n.split(".")
        for part in parts:
            try:
                return int(part)
            except ValueError:
                continue
        return 0

    if freeze_num_layers > 0:
        for n, p in self.transformer_model.named_parameters():
            layer_num = get_layer_num(n)
            if layer_num < freeze_num_layers:
                p.requires_grad = False

    if aligning_files is None:
        raise ValueError("aligning_files must map each language to an alignment-matrix file")
    for lang in aligning_files.keys():
        name = 'aligning_%s' % lang
        aligning_path = cached_path(aligning_files[lang])
        aligning_matrix = torch.FloatTensor(torch.load(aligning_path))
        aligning = torch.nn.Linear(self.output_dim, self.output_dim, bias=False)
        aligning.weight = torch.nn.Parameter(aligning_matrix, requires_grad=False)
        self.add_module(name, aligning)
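# Hedged construction example (the enclosing class name 'PretrainedTransformerEmbedder' and the
# file path below are hypothetical; only the constructor signature comes from the code above):
#
#     embedder = PretrainedTransformerEmbedder(
#         model_name='bert-base-multilingual-cased',
#         requires_grad=True,
#         freeze_num_layers=2,
#         aligning_files={'de': '/path/to/de_alignment.pth'},
#     )
#
# Note that this variant raises a ValueError when aligning_files is left at its None default,
# so at least an empty dict must be passed to skip alignment entirely.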
def __init__(self, model_name: str, requires_grad: bool = True, freeze_num_layers: int = 0) -> None:
    super().__init__()
    self.transformer_model = AutoModel.from_pretrained(model_name)
    # I'm not sure if this works for all models; open an issue on github if you find a case
    # where it doesn't work.
    self.output_dim = self.transformer_model.config.hidden_size
    for param in self.transformer_model.parameters():
        param.requires_grad = requires_grad

    def get_layer_num(n):
        # Return the first integer component of a parameter name (its layer index),
        # or 0 if the name contains no integer.
        parts = n.split(".")
        for part in parts:
            try:
                return int(part)
            except ValueError:
                continue
        return 0

    if freeze_num_layers > 0:
        for n, p in self.transformer_model.named_parameters():
            layer_num = get_layer_num(n)
            if layer_num < freeze_num_layers:
                p.requires_grad = False
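# Hedged illustration of the freezing heuristic above, assuming HuggingFace-style parameter
# names such as 'encoder.layer.3.attention.self.query.weight'. The helper below repeats the
# get_layer_num defined inside __init__ so its behaviour can be checked in isolation.
def _get_layer_num(name: str) -> int:
    for part in name.split("."):
        try:
            return int(part)
        except ValueError:
            continue
    return 0


assert _get_layer_num("encoder.layer.3.attention.self.query.weight") == 3
# Parameter names without a layer index map to 0, so (e.g.) the embeddings are also frozen
# whenever freeze_num_layers > 0.
assert _get_layer_num("embeddings.word_embeddings.weight") == 0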
def __init__(self, model_name: str) -> None:
    super().__init__()
    self.transformer_model = AutoModel.from_pretrained(model_name)
    # I'm not sure if this works for all models; open an issue on github if you find a case
    # where it doesn't work.
    self.output_dim = self.transformer_model.config.hidden_size
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--lang", default=None, type=str, required=True)
    parser.add_argument("--input_file", default=None, type=str, required=True)
    parser.add_argument("--out_dir", default=None, type=str, required=True)
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")

    ## Other parameters
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--layers", default="0", type=str)
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
                             "than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('-d', '--emb_dim', type=int, default=1024, help="Embeddings size")
    parser.add_argument('--vocab_file', type=str, default='vocabs/en_50k.vocab',
                        help="Path to vocab file with tokens (one per line) to include in output. "
                             "Should also include <UNK> token. Can use $l as a placeholder for language")
    args = parser.parse_args()

    lang = args.lang
    tokenizer = AutoTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    vocab = vocabulary.Vocabulary()
    vocab.set_from_file(args.vocab_file, oov_token=tokenizer.unk_token)
    print("Loaded vocabulary of size {}".format(vocab.get_vocab_size()))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {} distributed training: {}".format(
        device, n_gpu, bool(args.local_rank != -1)))

    layer_indexes = [int(x) for x in args.layers.split(",")]

    examples = read_examples(args.input_file)
    features = convert_examples_to_features(examples=examples,
                                            seq_length=args.max_seq_length,
                                            tokenizer=tokenizer,
                                            lang=lang)
    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    model = AutoModel.from_pretrained(args.bert_model)
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

    num_occurrences = [0] * vocab.get_vocab_size()
    anchors = {}
    norms = {}
    total_words = 0
    for l in layer_indexes:
        norms[l] = 0.0
        anchors[l] = np.zeros(shape=(vocab.get_vocab_size(), args.emb_dim))
    oov_ind = vocab.get_token_index(vocab._oov_token)

    model.eval()
    for input_ids, input_mask, example_indices in tqdm(eval_dataloader):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        # Indexing the output by layer below assumes the model call returns the encoder layers
        # as a sequence (pytorch-pretrained-bert-style output); depending on the transformers
        # version this may require output_hidden_states=True.
        all_encoder_layers = model(input_ids)
        for b, example_index in enumerate(example_indices):
            feature = features[example_index.item()]
            for (i, token) in enumerate(feature.tokens):
                all_layers = []
                w_id = vocab.get_token_index(token)
                if w_id == oov_ind:
                    continue
                n = num_occurrences[w_id]
                for (j, layer_index) in enumerate(layer_indexes):
                    layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
                    layer_output = layer_output[b]
                    layers = collections.OrderedDict()
                    l = layer_index
                    values = layer_output[i]
                    # Running mean of this token's embedding at layer l (its "anchor").
                    anchors[l][w_id, :] = anchors[l][w_id, :] * (n / (n + 1)) + values[:] / (n + 1)
                    norm = np.linalg.norm(values[:])
                    # Running mean of the embedding norm over all in-vocabulary tokens.
                    norms[l] = norms[l] * (total_words / (total_words + 1)) + norm / (total_words + 1)
                total_words += 1
                num_occurrences[w_id] += 1

    os.makedirs(args.out_dir, exist_ok=True)
    norm_dict = {}
    print('Saving outputs to {}'.format(args.out_dir))
    for l in tqdm(layer_indexes):
        norm_key = 'avg_norm_layer_{}'.format(l)
        norm_dict[norm_key] = norms[l]
        file_path = os.path.join(args.out_dir, 'avg_embeds_{}.txt'.format(l))
        save_embeds(file_path, anchors[l], vocab, num_occurrences, args.emb_dim)
    norm_dict['occurrences'] = num_occurrences
    file_path = os.path.join(args.out_dir, 'norms.json')
    json.dump(norm_dict, open(file_path, 'w'))
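# Hedged example invocation (the script filename is hypothetical; the flags are the ones defined
# by the argparse setup above, and --layers takes a comma-separated list of layer indices):
#
#     python extract_anchors.py \
#         --lang en \
#         --input_file data/en_sentences.txt \
#         --out_dir anchors/en \
#         --bert_model bert-base-multilingual-cased \
#         --layers 0,8 \
#         --vocab_file vocabs/en_50k.vocab
#
# For each requested layer this writes the per-token average ("anchor") embeddings to
# avg_embeds_<layer>.txt, and saves the average embedding norms plus per-token occurrence
# counts to norms.json in --out_dir.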