def __init__(self, model_name: str, requires_grad: bool) -> None:
    super().__init__()
    self.transformer_model = AutoModel.from_pretrained(model_name)
    # I'm not sure if this works for all models; open an issue on github if you find a case
    # where it doesn't work.
    self.output_dim = self.transformer_model.config.hidden_size
    for param in self.transformer_model.parameters():
        param.requires_grad = requires_grad
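A minimal usage sketch of the pattern above, assuming the snippet is the constructor of a hypothetical TransformerEmbedder(torch.nn.Module) wrapper; the class name, the forward method, and the model name are illustrative assumptions, not part of the original example.

import torch
from transformers import AutoModel, AutoTokenizer

class TransformerEmbedder(torch.nn.Module):  # hypothetical wrapper class
    def __init__(self, model_name: str, requires_grad: bool) -> None:
        super().__init__()
        self.transformer_model = AutoModel.from_pretrained(model_name)
        self.output_dim = self.transformer_model.config.hidden_size
        for param in self.transformer_model.parameters():
            param.requires_grad = requires_grad

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        # The first element of the model output is the last hidden state,
        # shape (batch_size, seq_len, output_dim).
        return self.transformer_model(input_ids)[0]

# Example usage (frozen encoder):
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
embedder = TransformerEmbedder("bert-base-uncased", requires_grad=False)
input_ids = tokenizer("Hello world", return_tensors="pt")["input_ids"]
embeddings = embedder(input_ids)  # (1, seq_len, hidden_size)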
Example #2
def __init__(self, model_name: str) -> None:
    super().__init__()
    config = PretrainedConfig.from_json_file(path +
                                             "/../bert_configs/debug.json")
    self.transformer_model = AutoModel.from_pretrained(model_name,
                                                       config=config)
    # I'm not sure if this works for all models; open an issue on github if you find a case
    # where it doesn't work.
    self.output_dim = self.transformer_model.config.hidden_size
Example #3
    def __init__(self, model_name: str, requires_grad: bool,
                 freeze_num_layers: int, aligning_files: Dict[str, str],
                 xnli_tasks: List[str]) -> None:
        super().__init__()

        self.aligning_layer_num = freeze_num_layers  # align by last frozen layer

        self.transformer_model = AutoModel.from_pretrained(model_name)
        # I'm not sure if this works for all models; open an issue on github if you find a case
        # where it doesn't work.
        self.output_dim = self.transformer_model.config.hidden_size

        #for param in self.transformer_model.parameters():
        #    param.requires_grad = requires_grad

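        # Parameter names look like "encoder.layer.3.attention.self.query.weight"
        # (for BERT-style models); get_layer_num returns the first integer found in
        # the dotted name, or 0 when there is none (e.g. embedding parameters).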
        def get_layer_num(n):
            parts = n.split(".")
            for part in parts:
                try:
                    return int(part)
                except ValueError:
                    continue
            return 0

        if freeze_num_layers > 0:
            for n, p in self.transformer_model.named_parameters():
                layer_num = get_layer_num(n)
                if layer_num < freeze_num_layers:
                    p.requires_grad = False

        xnli_langs = [t[4:] for t in xnli_tasks]
        print(xnli_langs)

        for lang in xnli_langs:
            name = 'aligning_%s' % lang

            aligning_matrix = torch.eye(
                self.output_dim)  # default to identity matrix -> no alignment
            if lang in aligning_files and aligning_files[lang] != '':
                print(lang + " will be aligned")
                aligning_path = cached_path(aligning_files[lang])
                aligning_matrix = torch.FloatTensor(torch.load(aligning_path))

            aligning = torch.nn.Linear(self.output_dim,
                                       self.output_dim,
                                       bias=False)
            aligning.weight = torch.nn.Parameter(aligning_matrix,
                                                 requires_grad=False)
            self.add_module(name, aligning)
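A hedged sketch of how the aligning_<lang> modules registered above might be applied in a forward pass; the method signature and the use of the last hidden state are assumptions, not part of the original example.

    # Illustrative only: apply the per-language alignment to the encoder output.
    def forward(self, input_ids: torch.Tensor, lang: str) -> torch.Tensor:
        hidden_states = self.transformer_model(input_ids)[0]
        # Retrieve the module registered above via add_module('aligning_<lang>', ...).
        aligning = getattr(self, 'aligning_%s' % lang)
        return aligning(hidden_states)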
Example #4
    def __init__(self, model_name: str, requires_grad: bool = True,
                 freeze_num_layers: int = 0,
                 aligning_files: Dict[str, str] = None) -> None:
        super().__init__()

        self.aligning_layer_num = freeze_num_layers  # align by last frozen layer

        self.transformer_model = AutoModel.from_pretrained(model_name)
        # I'm not sure if this works for all models; open an issue on github if you find a case
        # where it doesn't work.
        self.output_dim = self.transformer_model.config.hidden_size

        for param in self.transformer_model.parameters():
            param.requires_grad = requires_grad 
        
        def get_layer_num(n):
            parts = n.split(".")
            for part in parts:
                try:
                    return int(part)
                except ValueError:
                    continue
            return 0

        if freeze_num_layers > 0:
            for n, p in self.transformer_model.named_parameters():
                layer_num = get_layer_num(n)
                if layer_num < freeze_num_layers:
                    p.requires_grad = False
        
        if aligning_files is None:
            raise ValueError("aligning_files must be provided")

        for lang in aligning_files.keys():
            name = 'aligning_%s' % lang

            aligning_path = cached_path(aligning_files[lang])
            aligning_matrix = torch.FloatTensor(torch.load(aligning_path))

            aligning = torch.nn.Linear(self.output_dim, self.output_dim, bias=False)
            aligning.weight = torch.nn.Parameter(aligning_matrix, requires_grad=False)
            self.add_module(name, aligning)
Example #5
    def __init__(self, model_name: str, requires_grad: bool = True,
                 freeze_num_layers: int = 0) -> None:
        super().__init__()
        self.transformer_model = AutoModel.from_pretrained(model_name)
        # I'm not sure if this works for all models; open an issue on github if you find a case
        # where it doesn't work.
        self.output_dim = self.transformer_model.config.hidden_size

        for param in self.transformer_model.parameters():
            param.requires_grad = requires_grad 
        
        def get_layer_num(n):
            parts = n.split(".")
            for part in parts:
                try:
                    return int(part)
                except ValueError:
                    continue
            return 0

        if freeze_num_layers > 0:
            for n, p in self.transformer_model.named_parameters():
                layer_num = get_layer_num(n)
                if layer_num < freeze_num_layers:
                    p.requires_grad = False
Example #6
def __init__(self, model_name: str) -> None:
    super().__init__()
    self.transformer_model = AutoModel.from_pretrained(model_name)
    # I'm not sure if this works for all models; open an issue on github if you find a case
    # where it doesn't work.
    self.output_dim = self.transformer_model.config.hidden_size
Example #7
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--lang", default=None, type=str, required=True)
    parser.add_argument("--input_file", default=None, type=str, required=True)
    parser.add_argument("--out_dir", default=None, type=str, required=True)
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )

    ## Other parameters
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--layers", default="0", type=str)
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences longer "
        "than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument("--batch_size",
                        default=32,
                        type=int,
                        help="Batch size for predictions.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('-d',
                        '--emb_dim',
                        type=int,
                        default=1024,
                        help="Embeddings size")
    parser.add_argument(
        '--vocab_file',
        type=str,
        default='vocabs/en_50k.vocab',
        help=
        "Path to vocab file with tokens (one per line) to include in output. Should also include <UNK> token. Can use $l as a placeholder for language"
    )

    args = parser.parse_args()

    lang = args.lang

    tokenizer = AutoTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    vocab = vocabulary.Vocabulary()

    vocab.set_from_file(args.vocab_file, oov_token=tokenizer.unk_token)

    print("Loaded vocabulary of size {}".format(vocab.get_vocab_size()))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {} distributed training: {}".format(
        device, n_gpu, bool(args.local_rank != -1)))

    layer_indexes = [int(x) for x in args.layers.split(",")]

    examples = read_examples(args.input_file)

    features = convert_examples_to_features(examples=examples,
                                            seq_length=args.max_seq_length,
                                            tokenizer=tokenizer,
                                            lang=lang)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    model = AutoModel.from_pretrained(args.bert_model)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.batch_size)

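    # Running state: per-token occurrence counts, a per-layer running average of
    # token embeddings ("anchors"), and a per-layer running average embedding norm.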
    num_occurrences = [0] * vocab.get_vocab_size()
    anchors = {}
    norms = {}
    total_words = 0
    for l in layer_indexes:
        norms[l] = 0.0
        anchors[l] = np.zeros(shape=(vocab.get_vocab_size(), args.emb_dim))

    oov_ind = vocab.get_token_index(vocab._oov_token)
    model.eval()
    for input_ids, input_mask, example_indices in tqdm(eval_dataloader):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)

        # Pass the attention mask so padding tokens don't affect the encoder output.
        all_encoder_layers = model(input_ids, attention_mask=input_mask)

        for b, example_index in enumerate(example_indices):
            feature = features[example_index.item()]

            for (i, token) in enumerate(feature.tokens):
                w_id = vocab.get_token_index(token)
                if w_id == oov_ind:
                    continue

                n = num_occurrences[w_id]
                for (j, layer_index) in enumerate(layer_indexes):
                    layer_output = all_encoder_layers[int(
                        layer_index)].detach().cpu().numpy()
                    layer_output = layer_output[b]
                    l = layer_index
                    values = layer_output[i]

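                    # Incremental mean update: new = old * n / (n + 1) + value / (n + 1),
                    # so anchors[l][w_id] is the average over the n + 1 occurrences seen
                    # so far; the same scheme keeps norms[l] as the running average norm.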
                    anchors[l][w_id, :] = anchors[l][w_id, :] * (
                        n / (n + 1)) + values[:] / (n + 1)
                    norm = np.linalg.norm(values[:])
                    norms[l] = norms[l] * (total_words /
                                           (total_words + 1)) + norm / (
                                               total_words + 1)

                total_words += 1
                num_occurrences[w_id] += 1

    os.makedirs(args.out_dir, exist_ok=True)
    norm_dict = {}
    print('Saving outputs to {}'.format(args.out_dir))
    for l in tqdm(layer_indexes):
        norm_key = 'avg_norm_layer_{}'.format(l)
        norm_dict[norm_key] = float(norms[l])  # cast from np.float64 so json can serialize it
        file_path = os.path.join(args.out_dir, 'avg_embeds_{}.txt'.format(l))
        save_embeds(file_path, anchors[l], vocab, num_occurrences,
                    args.emb_dim)

    norm_dict['occurrences'] = num_occurrences
    file_path = os.path.join(args.out_dir, 'norms.json')
    with open(file_path, 'w') as f:
        json.dump(norm_dict, f)