Example #1
def get_embedding(embed_type):
    if embed_type == 'BioBERT':
        model_loc = './auxiliary/pretrained_bert_tf/biobert_pretrain_output_all_notes_150000/'
        tokenizer = BertTokenizer.from_pretrained(model_loc,
                                                  do_lower_case=True)
        cache_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE,
                                 'distributed_{}'.format(-1))
        model = BertModel.from_pretrained(model_loc, cache_dir=cache_dir)
        indexer = None
    elif embed_type == 'BERT':
        model_loc = './auxiliary/pretrained_bert_tf/bert_pretrain_output_all_notes_150000/'
        tokenizer = BertTokenizer.from_pretrained(model_loc,
                                                  do_lower_case=True)
        cache_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE,
                                 'distributed_{}'.format(-1))
        model = BertModel.from_pretrained(model_loc, cache_dir=cache_dir)
        indexer = None
    elif embed_type == 'CharBERT':
        model_loc = './auxiliary/pretrained_character_bert/general_character_bert/'
        model = CharacterBertModel.from_pretrained(model_loc)
        tokenizer = BertTokenizer.from_pretrained(
            './auxiliary/pretrained_bert_tf/bert_pretrain_output_all_notes_150000/'
        )
        indexer = CharacterIndexer()
    elif embed_type == 'BioCharBERT':
        model_loc = './auxiliary/pretrained_character_bert/medical_character_bert/'
        model = CharacterBertModel.from_pretrained(model_loc)
        tokenizer = BertTokenizer.from_pretrained(
            './auxiliary/pretrained_bert_tf/biobert_pretrain_output_all_notes_150000/'
        )
        indexer = CharacterIndexer()
    else:
        raise ValueError("Unknown embed_type: {}".format(embed_type))
    return indexer, tokenizer, model
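A minimal usage sketch for get_embedding, assuming the pytorch_pretrained_bert-style BertModel that returns (sequence_output, pooled_output); the CharacterBERT branches would additionally need the returned indexer to build character-level input IDs:

# Hedged sketch: embed one sentence with the 'BERT' (WordPiece) variant.
import torch

indexer, tokenizer, model = get_embedding('BERT')
model.eval()

tokens = ['[CLS]'] + tokenizer.tokenize('patient denies chest pain') + ['[SEP]']
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

with torch.no_grad():
    sequence_output, pooled_output = model(input_ids, output_all_encoded_layers=False)
# sequence_output: (1, seq_len, 768) contextual token embeddings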
Example #2
    def __init__(self,
                 encoder_size=64,
                 dim_num_feat=0,
                 dropout=0.2,
                 seq_dropout=0.1,
                 num_outputs=5):
        super(EntityLink_bert, self).__init__()
        # self.word_embedding = nn.Embedding(vocab_size,
        #                                    word_embed_size,
        #                                    padding_idx=0)
        # self.pos_embedding = nn.Embedding(pos_embed_size, pos_dim, padding_idx=0)
        self.seq_dropout = seq_dropout

        self.dropout1d = nn.Dropout2d(self.seq_dropout)
        self.span_extractor = EndpointSpanExtractor(encoder_size * 2,
                                                    combination="x,x+y,y")
        # selfspanextractor performed very poorly here
        bert_model = 'bert-base-chinese'
        self.bert = BertModel.from_pretrained(bert_model)
        self.use_layer = -1
        self.LSTM = LSTMEncoder(embed_size=768,
                                encoder_size=encoder_size,
                                bidirectional=True)
        hidden_size = 100
        self.hidden = nn.Linear(2 * encoder_size, num_outputs)
        self.classify = nn.Sequential(
            nn.BatchNorm1d(4 * 768), nn.Dropout(p=dropout),
            nn.Linear(in_features=4 * 768, out_features=num_outputs))
        self.attn_pool = Attention(2 * encoder_size)
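For context, a small sketch of what the EndpointSpanExtractor configured above produces (AllenNLP-style API assumed): with combination="x,x+y,y" the span representation is three times as wide as the sequence encoding.

# Hedged sketch; tensor shapes are illustrative only.
import torch
from allennlp.modules.span_extractors import EndpointSpanExtractor

encoder_size = 64
extractor = EndpointSpanExtractor(encoder_size * 2, combination="x,x+y,y")

sequence = torch.randn(2, 20, encoder_size * 2)   # (batch, seq_len, 2 * encoder_size)
spans = torch.tensor([[[0, 3], [5, 9]],
                      [[1, 1], [4, 7]]])          # (batch, num_spans, 2) inclusive endpoints
span_reps = extractor(sequence, spans)            # (2, 2, 3 * 2 * encoder_size) = (2, 2, 384)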
Example #3
	def __init__(self, param):
		super().__init__()
		self.args = args = param.args
		self.param = param

		self.bert_exclude = BertModel.from_pretrained(args.bert_model)
		self.drop = nn.Dropout(args.droprate)
Example #4
    def __init__(self, pretrained_model: str, requires_grad: bool = False, top_layer_only: bool = False) -> None:
        model = BertModel.from_pretrained(pretrained_model)

        for param in model.parameters():
            param.requires_grad = requires_grad

        super().__init__(bert_model=model, top_layer_only=top_layer_only)
Example #5
 def __init__(self, bert_path):
     super().__init__()
     # Load the BERT model; parameters remain trainable (requires_grad=True)
     self.bert = BertModel.from_pretrained(bert_path)
     for param in self.bert.parameters():
         param.requires_grad = True
     self.output = nn.Sequential(nn.Dropout(0.2), nn.Linear(768, 3))
Example #6
 def _init_embeddings(self):
     '''
         Initialise embeddings
     '''
     if self.elmo:
         self.embedding_dim += 1024 * 3
         options_file = "/data/models/pytorch/elmo/options/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json"
         weight_file = "/data/models/pytorch/elmo/weights/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5"
         self.elmo_embedder = ElmoEmbedder(options_file,
                                           weight_file,
                                           cuda_device=0)
     if self.glove:
         self.embedding_dim += 300
         self.vocab = glove_embeddings.index.tolist()
         self.num_embeddings = glove_embeddings.shape[0]
         self.glove_embedder = torch.nn.Embedding(self.num_embeddings,
                                                  glove_embeddings.shape[1],
                                                  max_norm=None,
                                                  norm_type=2,
                                                  scale_grad_by_freq=False,
                                                  sparse=False)
         self.glove_embedder.weight.data.copy_(
             torch.from_numpy(glove_embeddings.values))
         self.glove_embedder.weight.requires_grad = False
         self.vocab_hash = {w: i for i, w in enumerate(self.vocab)}
     if self.type:
         self.embedding_dim += type_dim
     if self.token:
         self.embedding_dim += token_dim
     if self.bert:
         self.bert_tokenizer = CustomBertTokenizer.from_pretrained(
             "bert-large-cased", do_lower_case=False)
         self.bert_embedder = BertModel.from_pretrained(
             "bert-large-cased").to(self.device)
Example #7
 def __init__(self,
              bert_model: str,
              ) -> None:
     super().__init__()
     self.bert = BertModel.from_pretrained(bert_model)
     self.dropout = nn.Dropout(self.bert.config.hidden_dropout_prob)
     self.classifier = nn.Linear(self.bert.config.hidden_size, 2)
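The corresponding forward pass for a head like this might look as follows (a sketch; it assumes the older pytorch_pretrained_bert interface where BertModel's second output is the pooled [CLS] vector):

 # Hedged sketch of the matching forward(); argument names are illustrative.
 def forward(self, input_ids, token_type_ids=None, attention_mask=None):
     _, pooled_output = self.bert(input_ids,
                                  token_type_ids=token_type_ids,
                                  attention_mask=attention_mask,
                                  output_all_encoded_layers=False)
     return self.classifier(self.dropout(pooled_output))   # (batch, 2) logits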
Example #8
 def __init__(
     self,
     bert_model=None,
     tokenizer=None,
     language=Language.ENGLISH,
     num_gpus=None,
     cache_dir=".",
     to_lower=True,
     max_len=512,
     layer_index=-1,
     pooling_strategy=PoolingStrategy.MEAN,
 ):
     """Initialize the encoder's underlying model and tokenizer
     
     Args:
         bert_model: BERT model to use for encoding. Defaults to pretrained BertModel.
         tokenizer: Tokenizer to use for preprocessing. Defaults to pretrained BERT tokenizer.
         language: The pretrained model's language. Defaults to Language.ENGLISH.
          num_gpus: The number of gpus to use. Defaults to None, which uses all available GPUs.
          cache_dir: Location of BERT's cache directory. Defaults to ".".
          to_lower: True to lowercase before tokenization. Defaults to True.
         max_len: Maximum number of tokens.
         layer_index: The layer from which to extract features. 
                      Defaults to the last layer; can also be a list of integers for experimentation.
         pooling_strategy: Pooling strategy to aggregate token embeddings into sentence embedding.
     """
     self.model = (bert_model.model.bert if bert_model else
                   BertModel.from_pretrained(language, cache_dir=cache_dir))
     self.tokenizer = (tokenizer if tokenizer else Tokenizer(
         language, to_lower=to_lower, cache_dir=cache_dir))
     self.num_gpus = num_gpus
     self.max_len = max_len
     self.layer_index = layer_index
     self.pooling_strategy = pooling_strategy
     self.has_cuda = self.cuda
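To illustrate PoolingStrategy.MEAN, a masked mean over the extracted token embeddings could be computed roughly as below (a sketch; the encoder's actual pooling code is not part of this snippet):

# Hedged sketch: average last-layer token vectors while ignoring padding.
def mean_pool(token_embeddings, attention_mask):
    # token_embeddings: (batch, seq_len, hidden); attention_mask: (batch, seq_len)
    mask = attention_mask.unsqueeze(-1).float()
    summed = (token_embeddings * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts                        # (batch, hidden)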
Example #9
def dump_row(data, dump_path="../data/data_h5py/"):
    model_type = "bert-base-uncased"
    model = BertModel.from_pretrained(model_type).cuda()
    tokenizer = BertTokenizer.from_pretrained(model_type)
    for index in tqdm(data.index):
        _id = data['QID'].iloc[index]
        d = data.iloc[index].to_dict()
        query = d['Query']
        passages = d['Passages']
        label = d['RelevantPassage']
        query_tensor = process_sentence(str(query), tokenizer,
                                        model).cpu().detach().numpy()
        passage_tensor = []
        for passage in passages:
            passage_tensor.append(
                process_sentence(str(passage), tokenizer, model))
        passage_tensor = torch.cat(passage_tensor, 0).cpu().detach().numpy()
        label = torch.LongTensor([label]).numpy()

        data_dict = dict(query=query_tensor,
                         passages=passage_tensor,
                         label=label)
        with h5py.File(f'{dump_path}/{_id}.hdf5', 'w') as h:
            for k, v in data_dict.items():
                h.create_dataset(k, data=v)
        del data_dict
        del passage_tensor
        del query_tensor
        gc.collect()
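The process_sentence helper is not shown above; a hypothetical reconstruction consistent with how it is called (one sentence in, one fixed-size vector out) might look like this, with the pooling choice and truncation length being assumptions:

# Hypothetical helper, not from the original source: encode one sentence and
# return its [CLS] representation as a (1, hidden_size) tensor on the GPU.
def process_sentence(sentence, tokenizer, model, max_len=512):
    tokens = ['[CLS]'] + tokenizer.tokenize(sentence)[:max_len - 2] + ['[SEP]']
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)]).cuda()
    with torch.no_grad():
        sequence_output, _ = model(input_ids, output_all_encoded_layers=False)
    return sequence_output[:, 0, :]               # (1, hidden_size)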
Example #10
    def __init__(self, model, requires_grad=True):
        super(BertEmbedding, self).__init__()

        self.bert = BertModel.from_pretrained(model)
        #self.bert = self.bert.requires_grad_(requires_grad)
        self.requires_grad = requires_grad
        self.hidden_size = self.bert.config.hidden_size
Example #12
 def __init__(self, config):
     super(PairModel, self).__init__()
     self.bert = BertModel.from_pretrained(config.bert_path)
     for param in self.bert.parameters():
         param.requires_grad = True
     self.out = nn.Linear(config.hidden_size, config.num_labels)
     self.config = config
Example #13
    def __init__(self,
                 bert_model: str,
                 device: torch.device,
                 use_layer: list,
                 linear_hidden_size=64,
                 dist_embed_dim=4,
                 token_dist_ratio=4,
                 bert_cache=None):
        super().__init__()
        self.device = device
        self.use_layer = use_layer

        self.bert_cache = bert_cache
        if bert_model in ("bert-base-uncased", "bert-base-cased"):
            self.bert_hidden_size = 768
        elif bert_model in ("bert-large-uncased", "bert-large-cased"):
            self.bert_hidden_size = 1024
        else:
            raise ValueError("Unsupported BERT model.")

        self.bert = BertModel.from_pretrained(
            bert_model, cache_dir=self.bert_cache).to(device)

        self.head = Head(self.bert_hidden_size,
                         linear_hidden_size=linear_hidden_size,
                         dist_embed_dim=dist_embed_dim,
                         token_dist_ratio=token_dist_ratio,
                         use_layers=use_layer).to(device)
Example #14
    def __init__(
        self, pretrained_model: str,
        requires_grad: Union[List[int], str] = [],
        top_layer_only: bool = False
    ) -> None:
        model = BertModel.from_pretrained(pretrained_model)
        if isinstance(requires_grad, str):
            if requires_grad == "None":
                for param in model.parameters():
                    param.requires_grad = False
            elif requires_grad == "all":
                for param in model.parameters():
                    param.requires_grad = True
            else:
                raise NotImplementedError("Work in progress")
        elif isinstance(requires_grad, list):
            if len(requires_grad) == 0:
                # no finetuning required
                for param in model.parameters():
                    param.requires_grad = False
            else:
                # Finetune the pooling layer and the
                # layers mentioned in the list
                grad_str = "|".join([str(x) for x in requires_grad])
                match_str = r"encoder\.layer\.({0}).*".format(grad_str)
                for name, param in model.named_parameters():
                    if re.match(match_str, name) is not None or "pooler" in name:
                        param.requires_grad = True
                        logger.info(f"Layer {name} is finetuned")
                    else:
                        param.requires_grad = False

        super().__init__(bert_model=model, top_layer_only=top_layer_only)
Example #15
 def __init__(self, n_classes, hidden_size=768):
     super(VanillaBert, self).__init__()
     self.n_classes = n_classes
     self.hidden_size = hidden_size
     self.bert = BertModel.from_pretrained("bert-base-uncased")
     self.linear = torch.nn.Linear(self.hidden_size, self.n_classes)
     self.softmax = torch.nn.LogSoftmax(dim=1)
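Since the module ends in LogSoftmax, the matching forward pass would pair with torch.nn.NLLLoss; a sketch under that assumption, again using the two-output BertModel interface:

 # Hedged sketch of the forward pass; output is log-probabilities, so train
 # with torch.nn.NLLLoss() rather than CrossEntropyLoss.
 def forward(self, input_ids, token_type_ids=None, attention_mask=None):
     _, pooled = self.bert(input_ids,
                           token_type_ids=token_type_ids,
                           attention_mask=attention_mask,
                           output_all_encoded_layers=False)
     return self.softmax(self.linear(pooled))     # (batch, n_classes)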
Example #16
def compute_represenation(sents,
                          bert_model,
                          logger,
                          device="cuda",
                          reprer=None):
    if reprer is None:
        model = BertModel.from_pretrained(bert_model).to(device)
    else:
        model = reprer.model
    model.eval()
    batch_size = 100
    for i in range(0, len(sents), batch_size):
        items = sents[i:min(len(sents), i + batch_size)]
        with torch.no_grad():
            input_ids = torch.tensor([item.input_ids for item in items],
                                     dtype=torch.long).to(device)
            segment_ids = torch.tensor([item.segment_ids for item in items],
                                       dtype=torch.long).to(device)
            input_mask = torch.tensor([item.input_mask for item in items],
                                      dtype=torch.long).to(device)
            all_encoder_layers, _ = model(
                input_ids, segment_ids,
                input_mask)  # batch_size x seq_len x target_size
        layer_output = all_encoder_layers[-1].detach().cpu().numpy(
        )  # batch_size x seq_len x target_size
        for j, item in enumerate(items):
            item.representation = layer_output[j][0]
        # item.representation = layer_output
        if i % (10 * batch_size) == 0:
            logger.info(
                '  Compute sentence representation. To {}...'.format(i))
    logger.info('  Finish.')
Example #17
    def __init__(self, bert_model, args):
        super(RawBertCls, self).__init__()
        self.backbone = BertModel.from_pretrained(
            'data/.cache/bert-base-uncased.tar.gz')

        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, 2)
Example #18
def main(raw_args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name",
                        type=str,
                        required=True,
                        help="model name e.g. bert-base-uncased")
    parser.add_argument("--cache_dir",
                        type=str,
                        default=None,
                        required=False,
                        help="Directory containing pytorch model")
    parser.add_argument("--pytorch_model_path",
                        type=str,
                        required=True,
                        help="/path/to/<pytorch-model-name>.bin")
    parser.add_argument("--tf_cache_dir",
                        type=str,
                        required=True,
                        help="Directory in which to save tensorflow model")
    args = parser.parse_args(raw_args)

    model = BertModel.from_pretrained(
        pretrained_model_name_or_path=args.model_name,
        state_dict=torch.load(args.pytorch_model_path),
        cache_dir=args.cache_dir)

    convert_pytorch_checkpoint_to_tf(model=model,
                                     ckpt_dir=args.tf_cache_dir,
                                     model_name=args.model_name)
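Driven programmatically, the converter above could be invoked roughly as follows; the paths are placeholders:

# Hedged usage sketch: call main() with an explicit argument list.
main(raw_args=[
    "--model_name", "bert-base-uncased",
    "--pytorch_model_path", "/path/to/pytorch_model.bin",
    "--tf_cache_dir", "/path/to/tf_checkpoint_dir",
])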
Example #19
    def __init__(self, args):
        super(MultimodalBertEncoder, self).__init__()
        self.args = args
        bert = BertModel.from_pretrained(args.bert_model)
        self.txt_embeddings = bert.embeddings

        if args.task in ["vsnli", 'msnews']:
            ternary_embeds = nn.Embedding(3, args.hidden_sz)
            ternary_embeds.weight.data[:2].copy_(
                bert.embeddings.token_type_embeddings.weight)
            ternary_embeds.weight.data[2].copy_(
                bert.embeddings.token_type_embeddings.weight.data.mean(dim=0))
            self.txt_embeddings.token_type_embeddings = ternary_embeds

        self.img_embeddings = ImageBertEmbeddings(args, self.txt_embeddings)
        self.img_encoder = ImageEncoder(args)
        self.encoder = bert.encoder
        self.pooler = bert.pooler
        self.clf = nn.Linear(args.hidden_sz, args.n_classes)

        # GPU Options
        if args.multiGPU:
            self.img_embeddings = nn.DataParallel(self.img_embeddings)
            self.encoder = nn.DataParallel(self.encoder)
            self.img_encoder = nn.DataParallel(self.img_encoder)
Example #20
 def __init__(self, opt):
     super(TagWordModel, self).__init__()
     self.register_buffer('dummy', torch.Tensor(1, 1).fill_(-float("inf")))
     self.bert = BertModel.from_pretrained(opt.bert_model,
                                           cache_dir=CACHEDIR)
     for lay in self.bert.encoder.layer:
         lay.output.dropout.p = opt.drop
Example #21
    def __init__(self, job_config, use_pretrain, tokenizer, cache_dir, device,
                 write_log, summary_writer):
        self.job_config = job_config

        if not use_pretrain:
            model_config = self.job_config.get_model_config()
            bert_config = BertConfig(**model_config)
            bert_config.vocab_size = len(tokenizer.vocab)

            self.bert_encoder = BertModel(bert_config)
        # Use pretrained bert weights
        else:
            self.bert_encoder = BertModel.from_pretrained(
                self.job_config.get_model_file_type(), cache_dir=cache_dir)
            bert_config = self.bert_encoder.config

        self.network = MTLRouting(self.bert_encoder,
                                  write_log=write_log,
                                  summary_writer=summary_writer)

        #config_data=self.config['data']

        # Pretrain Dataset
        self.network.register_batch(BatchType.PRETRAIN_BATCH,
                                    "pretrain_dataset",
                                    loss_calculation=BertPretrainingLoss(
                                        self.bert_encoder, bert_config))

        self.device = device
Example #22
def bertModel(*args, **kwargs):
    """
    BertModel is the basic BERT Transformer model with a layer of summed token,
    position and sequence embeddings followed by a series of identical
    self-attention blocks (12 for BERT-base, 24 for BERT-large).

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])
        # Load bertModel
        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertModel', 'bert-base-cased')
        >>> model.eval()
        # Predict hidden states features for each layer
        >>> with torch.no_grad():
                encoded_layers, _ = model(tokens_tensor, segments_tensors)
    """
    model = BertModel.from_pretrained(*args, **kwargs)
    return model
Example #23
    def __init__(self,
                 vocab: Vocabulary,
                 bert_model: Union[str, BertModel],
                 embedding_dropout: float = 0.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 label_smoothing: float = None,
                 ignore_span_metric: bool = False,
                 srl_eval_path: str = DEFAULT_SRL_EVAL_PATH) -> None:
        super().__init__(vocab, regularizer)

        if isinstance(bert_model, str):
            self.bert_model = BertModel.from_pretrained(bert_model)
        else:
            self.bert_model = bert_model

        self.num_classes = self.vocab.get_vocab_size("labels")
        if srl_eval_path is not None:
            # For the span based evaluation, we don't want to consider labels
            # for verb, because the verb index is provided to the model.
            self.span_metric = SrlEvalScorer(srl_eval_path,
                                             ignore_classes=["V"])
        else:
            self.span_metric = None
        self.tag_projection_layer = Linear(self.bert_model.config.hidden_size,
                                           self.num_classes)

        self.embedding_dropout = Dropout(p=embedding_dropout)
        self._label_smoothing = label_smoothing
        self.ignore_span_metric = ignore_span_metric
        initializer(self)
Example #24
    def __init__(self, args):
        self.config = args.config

        if not args.use_pretrain:

            if args.progressive_layer_drop:
                print("BertConfigPreLnLayerDrop")
                from nvidia.modelingpreln_layerdrop import BertForPreTrainingPreLN, BertForMaskedLM, BertConfig
            else:
                from nvidia.modelingpreln import BertForPreTrainingPreLN, BertForMaskedLM, BertConfig

            bert_config = BertConfig(**self.config["bert_model_config"])
            bert_config.vocab_size = len(args.tokenizer.vocab)

            # Padding for divisibility by 8
            if bert_config.vocab_size % 8 != 0:
                bert_config.vocab_size += 8 - (bert_config.vocab_size % 8)
            print("VOCAB SIZE:", bert_config.vocab_size)

            self.network = BertForPreTrainingPreLN(bert_config, args)
            # self.network = BertForMaskedLM(bert_config)                   # something else would need to be changed for this to work
        # Use pretrained bert weights
        else:
            self.bert_encoder = BertModel.from_pretrained(
                self.config['bert_model_file'],
                cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
                'distributed_{}'.format(args.local_rank))
            bert_config = self.bert_encoder.config

        self.device = None
Example #25
    def __init__(self,
                 pretrained_model: str, 
                 requires_grad: bool = False,
                 dropout: float = 0.0,
                 first_layer_only: bool = False,
                 second_to_last_layer_only: bool = False,
                 last_layer_only: bool = False,
                 sum_last_four_layers: bool = False,
                 concat_last_four_layers: bool = False,
                 sum_all_layers: bool = False,
                 scalar_mix: bool = False) -> None:
        model = BertModel.from_pretrained(pretrained_model)

        for param in model.parameters():
            param.requires_grad = requires_grad

        super().__init__(bert_model=model,
                         dropout=dropout,
                         first_layer_only=first_layer_only,
                         second_to_last_layer_only=second_to_last_layer_only,
                         last_layer_only=last_layer_only,
                         sum_last_four_layers=sum_last_four_layers,
                         concat_last_four_layers=concat_last_four_layers,
                         sum_all_layers=sum_all_layers,
                         scalar_mix=scalar_mix)
Example #26
 def __init__(self,
              n_input=768,
              n_output=128,
              bert_model='bert-base-uncased'):
     super(ProtNet, self).__init__()
     self.bert = BertModel.from_pretrained(
         '../Fewshot-Learning-with-BERT-master/bert-base-uncased')
Example #27
    def __init__(self, config):
        super(bc_RNN, self).__init__()

        self.config = config
        self.encoder = BertModel.from_pretrained("bert-base-uncased")

        context_input_size = (config.num_layers * config.encoder_hidden_size)

        self.context_encoder = layer.ContextRNN(context_input_size,
                                                config.context_size,
                                                config.rnn, config.num_layers,
                                                config.dropout)

        self.context2decoder = layer.FeedForward(config.context_size,
                                                 config.num_layers *
                                                 config.context_size,
                                                 num_layers=1,
                                                 activation=config.activation,
                                                 isActivation=True)

        self.decoder2output = layer.FeedForward(config.num_layers *
                                                config.context_size,
                                                config.num_classes,
                                                num_layers=1,
                                                isActivation=False)
        self.dropoutLayer = nn.Dropout(p=config.dropout)
Example #28
    def __init__(self, word_vec_mat, max_length=100, word_embedding_dim=768, dpos_embedding_dim=50, dmask_embedding_dim=50):
        nn.Module.__init__(self)
        self.max_length = max_length
        self.word_embedding_dim = word_embedding_dim
        self.dpos_embedding_dim = dpos_embedding_dim
        self.dmask_embedding_dim = dmask_embedding_dim
        #self.bert_token = BertTokenizer.from_pretrained('bert-base-uncased')
        #self.bert_embedding = BertModel.from_pretrained('bert-base-uncased')
        self.bert_token = BertTokenizer.from_pretrained('./models/bert-base-uncased-vocab.txt')
        self.bert_embedding = BertModel.from_pretrained('./models')
 
        
        # Word embedding
        #unk = torch.randn(1, word_embedding_dim) / math.sqrt(word_embedding_dim)
        #blk = torch.zeros(1, word_embedding_dim)
        #word_vec_mat = torch.from_numpy(word_vec_mat)
        self.word_embedding = nn.Embedding(400002, 50, padding_idx=word_vec_mat.shape[0] + 1)
        #self.word_embedding.weight.data.copy_(torch.cat((word_vec_mat, unk, blk), 0))
        #self.bword_embedding.weight.data.copy_(torch.cat((word_vec_mat, unk, blk), 0))
        #Position Embedding
        self.pos1_embedding = nn.Embedding(80, 5, padding_idx=0)
        self.pos2_embedding = nn.Embedding(80, 5, padding_idx=0)
        self.dpos1_embedding = nn.Embedding(2*self.max_length,dpos_embedding_dim,padding_idx=0)
        self.dpos2_embedding = nn.Embedding(2*self.max_length,dpos_embedding_dim,padding_idx=0)
        #dmask embedding
        self.dmask1_embedding = nn.Embedding(2*self.max_length,dmask_embedding_dim,padding_idx=0)
        self.dmask2_embedding = nn.Embedding(2*self.max_length,dmask_embedding_dim,padding_idx=0)
Example #29
    def __init__(self,
        bert_model,
        rnn_size,
        labels_vocab_size,
        num_layers=2,
        dropout=0.5
    ):
        torch.nn.Module.__init__(self)

        self.bert = BertModel.from_pretrained(bert_model)

        self.dropout = torch.nn.Dropout(dropout)

        bert_size = self.bert.config.hidden_size

        self.rnn = torch.nn.LSTM(
            bert_size,
            rnn_size,
            num_layers,
            bidirectional=True,
            batch_first=True,
            dropout=dropout,
        )

        self.output = torch.nn.Linear(rnn_size*2, labels_vocab_size)  # *2 because of bidi
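A matching forward pass for this tagger would run BERT, apply dropout, feed the sequence through the BiLSTM, and project to per-token label logits (a sketch assuming the two-output BertModel interface used elsewhere on this page):

    # Hedged sketch of forward(); returns per-token logits over the label vocabulary.
    def forward(self, input_ids, attention_mask=None):
        sequence_output, _ = self.bert(input_ids,
                                       attention_mask=attention_mask,
                                       output_all_encoded_layers=False)
        rnn_out, _ = self.rnn(self.dropout(sequence_output))
        return self.output(rnn_out)               # (batch, seq_len, labels_vocab_size)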
Example #30
def extractBert():
    model = BertModel.from_pretrained(modelPath[args.model])
    #print(model);exit(10)
    embeddings = model.embeddings.word_embeddings
    print(embeddings.num_embeddings)
    print(embeddings.weight.size())
    weight = embeddings.weight.detach().numpy()
    tokenizer = BertTokenizer.from_pretrained(modelPath[args.model])
    #print(tokenizer.ids_to_tokens)
    #for i in range(10):
    #    print(weight[i])
    with open(programmingalpha.Bert768 + "embeddings.txt", "w") as f:
        vec_strs = []
        for i in range(len(weight)):
            vec = weight[i]
            vec_str = list(map(lambda x: str(x), vec))
            token = tokenizer.ids_to_tokens[i]
            vec_str.insert(0, token)
            vec_str = " ".join(vec_str)
            vec_strs.append(vec_str + "\n")

        def turnIndexs(index1, index2):
            tmp = vec_strs[index1]
            vec_strs[index1] = vec_strs[index2]
            vec_strs[index2] = tmp

        turnIndexs(0, 1)
        turnIndexs(0, 100)
        f.writelines(vec_strs)
Example #31
    def __init__(self,
                 encoder,
                 decoder,
                 emb_type,
                 emb_dim,
                 vocab_size,
                 conv_hidden,
                 encoder_hidden,
                 encoder_layer,
                 isTrain=True,
                 n_hop=1,
                 dropout=0.0):
        super().__init__()
        self._encoder = encoder
        self._decoder = decoder
        self._emb_type = emb_type

        self._sent_enc = ConvSentEncoder(vocab_size, emb_dim, conv_hidden,
                                         dropout, emb_type)

        # BERT
        if emb_type == 'BERT':
            self._bert = BertModel.from_pretrained(
                '/path/to/uncased_L-24_H-1024_A-16')
            self._bert.eval()
            for p in self._bert.parameters():
                p.requires_grad = False
            self._bert_w = nn.Linear(1024 * 4, emb_dim)

        # Sentence Encoder
        if encoder == 'BiLSTM':
            enc_out_dim = encoder_hidden * 2  # bidirectional
            self._art_enc = LSTMEncoder(3 * conv_hidden,
                                        encoder_hidden,
                                        encoder_layer,
                                        dropout=dropout,
                                        bidirectional=True)
        elif encoder == 'Transformer':
            enc_out_dim = encoder_hidden
            self._art_enc = TransformerEncoder(3 * conv_hidden, encoder_hidden,
                                               encoder_layer, decoder)

            self._emb_w = nn.Linear(3 * conv_hidden, encoder_hidden)
            self.sent_pos_embed = nn.Embedding.from_pretrained(
                get_sinusoid_encoding_table(1000, enc_out_dim, padding_idx=0),
                freeze=True)
        elif encoder == 'DeepLSTM':
            enc_out_dim = encoder_hidden
            self._isTrain = isTrain
            self._art_enc = DeepLSTM(3 * conv_hidden, encoder_hidden,
                                     encoder_layer, 0.1)

        # Decoder
        decoder_hidden = encoder_hidden
        decoder_layer = encoder_layer
        if decoder == 'PN':
            self._extractor = LSTMPointerNet(enc_out_dim, decoder_hidden,
                                             decoder_layer, dropout, n_hop)
        else:
            self._ws = nn.Linear(enc_out_dim, 2)
Example #32
 def __init__(self, name, **kwargs):
     super(BERTBaseEmbeddings, self).__init__(name=name, **kwargs)
     global BERT_TOKENIZER
     self.dsz = kwargs.get('dsz')
     if BERT_TOKENIZER is None:
         BERT_TOKENIZER = BertTokenizer.from_pretrained(kwargs.get('embed_file'))
     self.model = BertModel.from_pretrained(kwargs.get('embed_file'))
     self.vocab = BERT_TOKENIZER.vocab
     self.vsz = len(BERT_TOKENIZER.vocab)  # 30522 self.model.embeddings.word_embeddings.num_embeddings
     self.layer_indices = kwargs.get('layers', [-1, -2, -3, -4])
     self.operator = kwargs.get('operator', 'concat')