Code example #1
    def __init__(self, pretrained_model: str, requires_grad: bool = False, top_layer_only: bool = False) -> None:
        model = BertModel.from_pretrained(pretrained_model)

        for param in model.parameters():
            param.requires_grad = requires_grad

        super().__init__(bert_model=model, top_layer_only=top_layer_only)
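A minimal sketch of the same freeze-via-requires_grad pattern, written against the older pytorch_pretrained_bert API that these snippets appear to use; the import and the 'bert-base-uncased' model name are assumptions for illustration only.

from pytorch_pretrained_bert import BertModel  # assumed import; not shown in the snippet

bert = BertModel.from_pretrained('bert-base-uncased')  # assumed model name
for param in bert.parameters():
    param.requires_grad = False  # same freezing pattern as requires_grad=False above

# Sanity check: no encoder parameter will receive gradients.
assert not any(p.requires_grad for p in bert.parameters())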
Code example #2
 def __init__(self, name, **kwargs):
     super(BERTBaseEmbeddings, self).__init__(name=name, **kwargs)
     global BERT_TOKENIZER
     self.dsz = kwargs.get('dsz')
     if BERT_TOKENIZER is None:
         BERT_TOKENIZER = BertTokenizer.from_pretrained(kwargs.get('embed_file'))
     self.model = BertModel.from_pretrained(kwargs.get('embed_file'))
     self.vocab = BERT_TOKENIZER.vocab
     self.vsz = len(BERT_TOKENIZER.vocab)  # 30522, i.e. self.model.embeddings.word_embeddings.num_embeddings
     self.layer_indices = kwargs.get('layers', [-1, -2, -3, -4])
     self.operator = kwargs.get('operator', 'concat')
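The layers/operator defaults above select the last four encoder layers and combine them by concatenation. Below is a hedged sketch of that combination step; combine_layers is an illustrative name, and all_encoder_layers stands for the list the older BertModel API returns when output_all_encoded_layers=True.

import torch

def combine_layers(all_encoder_layers, layer_indices=(-1, -2, -3, -4), operator='concat'):
    # Each selected layer has shape (batch, seq_len, hidden).
    picked = [all_encoder_layers[i] for i in layer_indices]
    if operator == 'concat':
        return torch.cat(picked, dim=-1)          # (batch, seq_len, 4 * hidden)
    return torch.stack(picked, dim=0).sum(dim=0)  # simple element-wise fallback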
Code example #3
    def __init__(self,
                 update_embedding=False,
                 embedding_reduction='none',
                 pretrained_model_name='bert-base-uncased',
                 cache_dir='../data/bert_cache'):
        super().__init__()

        # Check if choice of pretrained model is valid
        assert pretrained_model_name in ('bert-base-uncased',
                                         'bert-large-uncased',
                                         'bert-base-cased')

        # Load pre-trained BERT model
        self.bert = BertModel.from_pretrained(
            pretrained_model_name_or_path=pretrained_model_name,
            cache_dir=cache_dir)
        self.embedding = self.bert.embeddings
        self.embedding_size = self.embedding.word_embeddings.embedding_dim

        self.reduction = embedding_reduction

        # Keep or remove BERT parameters from optimization (frozen when update_embedding=False)
        for param in self.bert.parameters():
            param.requires_grad = update_embedding
Code example #4
 def __init__(self, hiddenDim, tagsetSize, batchSize):
     super(NetEDU, self).__init__()
     self.hiddenDim = hiddenDim  # 768
     self.batchSize = batchSize
     self.tagsetSize = tagsetSize
     self.bert = BertModel.from_pretrained('bert-base-chinese').cuda()
     # classification layer
     self.hidden2tag = nn.Linear(
         self.hiddenDim, self.tagsetSize)  # convert to label set size
     # dropout layer
     self.dropout = nn.Dropout(0.1)
     # CRF layer
     self.transitions = nn.Parameter(
         torch.randn(self.tagsetSize, self.tagsetSize).cuda())  # initialize
     self.transitions.data[
         tagToIdx['[START]'], :] = -10000.  # no transition to SOS
     self.transitions.data[:, tagToIdx[
         '[END]']] = -10000.  # no transition from EOS except to PAD
     self.transitions.data[:, tagToIdx[
         '[PAD]']] = -10000.  # no transition from PAD except to PAD
     self.transitions.data[tagToIdx[
         '[PAD]'], :] = -10000.  # no transition to PAD except from EOS
     self.transitions.data[tagToIdx['[PAD]'], tagToIdx['[END]']] = 0.
     self.transitions.data[tagToIdx['[PAD]'], tagToIdx['[PAD]']] = 0.
Code example #5
    def __init__(self, bert_model_path, n_tgt_vocab, len_max_seq, d_word_vec=768, d_model=768, d_inner=3072,
                 n_layers=12, n_head=12, d_k=64, d_v=64, dropout=0.1):

        super().__init__()

        self.encoder = BertModel.from_pretrained(bert_model_path)
        self.config = BertConfig(bert_model_path+'bert_config.json')
        self.decoder = Decoder(
            n_tgt_vocab=n_tgt_vocab, len_max_seq=len_max_seq,
            d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
            n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
            dropout=dropout)
        self.tgt_word_prj = nn.Linear(d_model, n_tgt_vocab, bias=False)
        nn.init.xavier_normal_(self.tgt_word_prj.weight)
        self.tgt_word_prj.weight = self.decoder.tgt_word_emb.weight
        self.x_logit_scale = (d_model ** -0.5)
        self.o_l = nn.Linear(d_model, 512, bias=False)
        self.h_l = nn.Linear(512, 1, bias=True)
        nn.init.xavier_normal_(self.o_l.weight)
        nn.init.xavier_normal_(self.h_l.weight)
        self.a_l_1 = nn.Linear(d_model, 512, bias=False)
        self.a_l_2 = nn.Linear(d_model, 512, bias=False)
        nn.init.xavier_normal_(self.a_l_1.weight)
        nn.init.xavier_normal_(self.a_l_2.weight)
Code example #6
    def __init__(self,
                 config,
                 cls_sup: bool = False,
                 evidence_lambda=0.8,
                 extra_yesno_lambda=0.5):
        super(BertQAYesnoCLSHierarchical, self).__init__(config)
        print(f'The model {self.__class__.__name__} is loading...')
        print(f'The coefficient of evidence loss is {evidence_lambda}')
        print(f'Use cls extra supervision: {cls_sup}')
        print(f'The extra yesno loss lambda is {extra_yesno_lambda}')

        layers.set_seq_dropout(True)
        layers.set_my_dropout_prob(config.hidden_dropout_prob)

        self.bert = BertModel(config)
        # self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # self.answer_choice = nn.Linear(config.hidden_size, 2)
        self.doc_word_sum = layers.AttentionScore(config.hidden_size,
                                                  250,
                                                  do_similarity=False)
        self.que_word_sum = layers.AttentionScore(config.hidden_size,
                                                  250,
                                                  do_similarity=False)
        self.doc_sen_sum = layers.AttentionScore(config.hidden_size,
                                                 250,
                                                 do_similarity=False)

        self.cls_sup = cls_sup
        self.extra_yesno_lam = extra_yesno_lambda
        if cls_sup:
            self.extra_predictor = nn.Linear(config.hidden_size, 3)

        self.yesno_predictor = nn.Linear(config.hidden_size * 2, 3)
        self.evidence_lam = evidence_lambda

        self.apply(self.init_bert_weights)
Code example #7
 def __init__(
         self,
         config,
         output_attentions=False,
         keep_multihead_output=False,
         cls_alpha=1.0,
         mask_p=0.0,
 ):
     super(BertForCoQA, self).__init__(config)
     self.cls_alpha = cls_alpha
     self.mask_p = mask_p
     self.output_attentions = output_attentions
     self.bert = BertModel(
         config,
         output_attentions=output_attentions,
         keep_multihead_output=keep_multihead_output,
     )
     # self.qa_outputs_mid = nn.Linear(config.hidden_size, config.hidden_size)
     # self.dropout = nn.Dropout(config.hidden_dropout_prob)  # NOTE: It hurts.
     self.qa_outputs = nn.Linear(config.hidden_size, 2)
     # self.cls_outputs_mid = nn.Linear(config.hidden_size,
     #                                  config.hidden_size)
     self.cls_outputs = nn.Linear(config.hidden_size, 4)
     self.apply(self.init_bert_weights)
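The qa_outputs head above emits two logits per token. The sketch below shows the conventional way such a projection is split into start/end span logits; it is not this project's actual forward(), and split_span_logits is an illustrative name.

import torch
import torch.nn as nn

def split_span_logits(sequence_output: torch.Tensor, qa_outputs: nn.Linear):
    # sequence_output: (batch, seq_len, hidden); qa_outputs projects hidden -> 2.
    logits = qa_outputs(sequence_output)               # (batch, seq_len, 2)
    start_logits, end_logits = logits.split(1, dim=-1)
    return start_logits.squeeze(-1), end_logits.squeeze(-1)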
Code example #8
    def __init__(self,
                 config,
                 evidence_lambda: float = 0.8,
                 my_dropout_p: float = 0.2,
                 tf_layers: int = 1,
                 tf_inter_size: int = 3072):
        super(BertHierarchicalTransformer, self).__init__(config)
        logger.info(f'Model {__class__.__name__} is loading...')
        logger.info(f'Model parameters:')
        logger.info(f'Evidence lambda: {evidence_lambda}')
        layers.set_seq_dropout(True)
        layers.set_my_dropout_prob(my_dropout_p)
        self.bert = BertModel(config)
        self.query_self_attn = layers.MultiHeadPooling(config.hidden_size, 6)
        self.value_self_attn = layers.MultiHeadPooling(config.hidden_size, 6)
        # self.sentence_input = layers.BertSentInput(config)
        config.num_hidden_layers = tf_layers
        config.intermediate_size = tf_inter_size
        self.sentence_encoder = BertEncoder(config)
        self.attention_score = layers.AttentionScore(config.hidden_size, 256)

        # Output layer
        self.evidence_lambda = evidence_lambda
        self.predictor = nn.Linear(config.hidden_size * 2, 3)
Code example #9
    def __init__(self, num_choices, bert_config_file, init_embeddings):
        self.num_choices = num_choices
        self.bert_config = BertConfig.from_json_file(bert_config_file)
        BertPreTrainedModel.__init__(self, self.bert_config)

        self.bert = BertModel(self.bert_config)
        self.apply(self.init_bert_weights)
        self.dropout = nn.Dropout(self.bert_config.hidden_dropout_prob)

        self.vocab_size, self.embed_size = np.shape(init_embeddings)
        self.embed = nn.Embedding.from_pretrained(
            torch.FloatTensor(init_embeddings), freeze=False)

        #self.classifier = nn.Linear(self.bert_config.hidden_size + self.embed_size, 1)
        self.classifier = nn.Linear(self.bert_config.hidden_size, 1)
        self.reshape = nn.Linear(self.bert_config.hidden_size,
                                 self.embed_size,
                                 bias=False)
        self.reshape_know = nn.Linear(self.embed_size,
                                      self.bert_config.hidden_size,
                                      bias=True)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=-1)
        self.activation = nn.Sigmoid()
Code example #10
    def __init__(self,
                 config,
                 num_classes,
                 encoding_type='bio',
                 target_vocab=None,
                 dropout=0.2):
        super(SubjectModel, self).__init__(config)

        self.bert = BertModel(config)

        self.apply(self.init_bert_weights)

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(config.hidden_size, num_classes)

        trans = None
        if target_vocab is not None and encoding_type is not None:
            trans = allowed_transitions(target_vocab,
                                        encoding_type=encoding_type,
                                        include_start_end=True)

        self.crf = ConditionalRandomField(num_classes,
                                          include_start_end_trans=True,
                                          allowed_transitions=trans)
Code example #11
    def __init__(self,
                 config,
                 evidence_lambda=0.8,
                 negative_lambda=1.0,
                 add_entropy: bool = False):
        super(BertQAYesnoHierarchicalNeg, self).__init__(config)
        print(f'The model {self.__class__.__name__} is loading...')
        print(f'The coefficient of evidence loss is {evidence_lambda}')
        print(f'The coefficient of negative samples loss is {negative_lambda}')
        print(f'Add entropy loss: {add_entropy}')

        layers.set_seq_dropout(True)
        layers.set_my_dropout_prob(config.hidden_dropout_prob)

        self.bert = BertModel(config)
        # self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # self.answer_choice = nn.Linear(config.hidden_size, 2)

        self.doc_sen_self_attn = layers.LinearSelfAttnAllennlp(
            config.hidden_size)
        self.que_self_attn = layers.LinearSelfAttn(config.hidden_size)

        self.word_similarity = layers.AttentionScore(config.hidden_size,
                                                     250,
                                                     do_similarity=False)
        self.vector_similarity = layers.AttentionScore(config.hidden_size,
                                                       250,
                                                       do_similarity=False)

        # self.yesno_predictor = nn.Linear(config.hidden_size, 2)
        self.yesno_predictor = nn.Linear(config.hidden_size * 2, 3)
        self.evidence_lam = evidence_lambda
        self.negative_lam = negative_lambda
        self.add_entropy = add_entropy

        self.apply(self.init_bert_weights)
Code example #12
    def __init__(self, config, evidence_lambda=0.8, num_choices=4):
        super(BertRACEHierarchicalTopK, self).__init__(config)
        logger.info(f'The model {self.__class__.__name__} is loading...')
        logger.info(f'Currently the number of choices is {num_choices}')
        logger.info(f'The coefficient of evidence loss is {evidence_lambda}')

        layers.set_seq_dropout(True)
        layers.set_my_dropout_prob(config.hidden_dropout_prob)
        rep_layers.set_seq_dropout(True)
        rep_layers.set_my_dropout_prob(config.hidden_dropout_prob)

        self.bert = BertModel(config)
        self.doc_sen_self_attn = rep_layers.LinearSelfAttention(config.hidden_size)
        self.que_self_attn = rep_layers.LinearSelfAttention(config.hidden_size)

        self.word_similarity = layers.AttentionScore(config.hidden_size, 250, do_similarity=False)
        self.vector_similarity = layers.AttentionScore(config.hidden_size, 250, do_similarity=False)

        # self.yesno_predictor = nn.Linear(config.hidden_size, 2)
        self.classifier = nn.Linear(config.hidden_size * 2, 1)
        self.evidence_lam = evidence_lambda
        self.num_choices = num_choices

        self.apply(self.init_bert_weights)
Code example #13
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--input_file", default=None, type=str, required=True)
    parser.add_argument("--output_file", default=None, type=str, required=True)
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )

    ## Other parameters
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences longer "
        "than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument("--batch_size",
                        default=32,
                        type=int,
                        help="Batch size for predictions.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {} distributed training: {}".format(
        device, n_gpu, bool(args.local_rank != -1)))

    layer_indexes = [int(x) for x in args.layers.split(",")]

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    examples = read_examples(args.input_file)

    features = convert_examples_to_features(examples=examples,
                                            seq_length=args.max_seq_length,
                                            tokenizer=tokenizer)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    model = BertModel.from_pretrained(args.bert_model)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.batch_size)

    model.eval()
    with open(args.output_file, "w", encoding='utf-8') as writer:
        for input_ids, input_mask, example_indices in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)

            all_encoder_layers, _ = model(input_ids,
                                          token_type_ids=None,
                                          attention_mask=input_mask)
            all_encoder_layers = all_encoder_layers

            for b, example_index in enumerate(example_indices):
                feature = features[example_index.item()]
                unique_id = int(feature.unique_id)
                # feature = unique_id_to_feature[unique_id]
                output_json = collections.OrderedDict()
                output_json["linex_index"] = unique_id
                all_out_features = []
                for (i, token) in enumerate(feature.tokens):
                    all_layers = []
                    for (j, layer_index) in enumerate(layer_indexes):
                        layer_output = all_encoder_layers[int(
                            layer_index)].detach().cpu().numpy()
                        layer_output = layer_output[b]
                        layers = collections.OrderedDict()
                        layers["index"] = layer_index
                        layers["values"] = [
                            round(x.item(), 6) for x in layer_output[i]
                        ]
                        all_layers.append(layers)
                    out_features = collections.OrderedDict()
                    out_features["token"] = token
                    out_features["layers"] = all_layers
                    all_out_features.append(out_features)
                output_json["features"] = all_out_features
                writer.write(json.dumps(output_json) + "\n")
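Each line written to --output_file is a JSON record with a linex_index and per-token features. A hedged sketch of a reader for that format (read_bert_features is an illustrative name):

import json

def read_bert_features(path):
    # Yield (unique_id, per-token features) pairs from the JSON-lines output above.
    with open(path, encoding='utf-8') as f:
        for line in f:
            record = json.loads(line)
            yield record["linex_index"], record["features"]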
Code example #14
    def __init__(self, params, vocab_size, hidden_size, emb_dim, dropout, tok2id):
        global CUDA
        super(Seq2Seq, self).__init__()

        self.vocab_size = vocab_size
        self.hidden_dim = hidden_size
        self.emb_dim = emb_dim
        self.dropout = dropout
        self.pad_id = 0
        self.tok2id = tok2id
        self.params = params

        self.embeddings = nn.Embedding(self.vocab_size, self.emb_dim, self.pad_id)
        self.encoder = LSTMEncoder(
            self.emb_dim, self.hidden_dim, layers=1, bidirectional=True, dropout=self.dropout)

        self.h_t_projection = nn.Linear(hidden_size, hidden_size)
        self.c_t_projection = nn.Linear(hidden_size, hidden_size)

        self.bridge = nn.Linear(768 if self.params['bert_encoder'] else self.hidden_dim, self.hidden_dim)

        if self.params['transformer_decoder']:
            self.decoder = transformer.TransformerDecoder(
                num_layers=self.params['transformer_layers'],
                d_model=self.hidden_dim,
                heads=8,
                d_ff=self.hidden_dim,
                copy_attn=False,
                self_attn_type='scaled-dot',
                dropout=self.dropout,
                embeddings=self.embeddings,
                max_relative_positions=0)
        else:
            self.decoder = StackedAttentionLSTM(
                params, self.emb_dim, self.hidden_dim, layers=1, dropout=self.dropout)

        self.output_projection = nn.Linear(self.hidden_dim, self.vocab_size)

        self.softmax = nn.Softmax(dim=-1)
        # for training
        self.log_softmax = nn.LogSoftmax(dim=-1)

        self.init_weights()

        # pretrained embs from bert (after init to avoid overwrite)
        if self.params['bert_word_embeddings'] or \
        self.params['bert_full_embeddings'] or \
        self.params['bert_encoder']:
            model = BertModel.from_pretrained(
                self.params['bert_model'],
                self.params['working_dir'] + '/cache')

            if self.params['bert_word_embeddings']:
                self.embeddings = model.embeddings.word_embeddings

            if self.params['bert_encoder']:
                self.encoder = model
                # share bert word embeddings with decoder
                self.embeddings = model.embeddings.word_embeddings

            if self.params['bert_full_embeddings']:
                self.embeddings = model.embeddings

        if self.params['freeze_embeddings']:
            for param in self.embeddings.parameters():
                param.requires_grad = False

        self.enrich_input = torch.ones(hidden_size)
        if CUDA:
            self.enrich_input = self.enrich_input.cuda()
        self.enricher = nn.Linear(hidden_size, hidden_size)
Code example #15
                             path=args.db_fi,
                             align_strat=args.align_strat,
                             subsample_all=args.subsample_all)
    else:
        sentdb = data.SentDB(args.sent_fi,
                             args.tag_fi,
                             tokenizer,
                             args.val_sent_fi,
                             args.val_tag_fi,
                             lower=args.lower,
                             align_strat=args.align_strat,
                             subsample_all=args.subsample_all)

        nebert = model.bert
        if args.zero_shot and "newne" not in args.just_eval:
            nebert = BertModel.from_pretrained(args.bert_model,
                                               cache_dir=CACHEDIR)
            nebert = nebert.to(device)

        def avg_bert_emb(x):
            mask = (x != 0)
            rep, _ = nebert(x,
                            attention_mask=mask.long(),
                            output_all_encoded_layers=False)
            mask = mask.float().unsqueeze(2)  # bsz x T x 1
            avgs = (rep * mask).sum(1) / mask.sum(1)  # bsz x hid
            return avgs

        nebsz, nne = 128, 500
        # we always compute neighbors w/ cosine; seems to be a bit better
        model.eval()
        sentdb.compute_top_neighbs(nebsz,
Code example #16
File: bert.py / Project: wangluolin/Web-20
 def __init__(self, config):
     super(Model, self).__init__()
     self.bert = BertModel.from_pretrained(config.bert_path)
     for param in self.bert.parameters():
         param.requires_grad = True
     self.fc = nn.Linear(config.hidden_size, config.num_classes)
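The snippet shows only the constructor. Below is a hedged sketch of what the forward pass might look like, assuming the older pytorch_pretrained_bert API in which BertModel returns (encoded_layers, pooled_output); classify is an illustrative name.

def classify(model, input_ids, attention_mask):
    # Pooled [CLS] representation -> class logits via the fc layer defined above.
    _, pooled = model.bert(input_ids,
                           attention_mask=attention_mask,
                           output_all_encoded_layers=False)
    return model.fc(pooled)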
Code example #17
File: models.py / Project: THU-KEG/NGS
 def __init__(self, config):
     super(DMCNN_Encoder_argument0, self).__init__(config)
     self.bert = BertModel(config)
     self.dropout = nn.Dropout(p=keepProb)
     self.maxpooling = nn.MaxPool1d(SenLen)
Code example #18
    def __init__(self, config):
        super(BertForUtteranceEncoding, self).__init__(config)

        self.config = config
        self.bert = BertModel(config)
Code example #19
     token_embedding = ElmoTokenEmbedder(options_file,
                                         weights_file,
                                         dropout=DROPOUT,
                                         projection_dim=PROJECT_DIM)
 elif EMBEDDING_TYPE == "_elmo_retrained_2":
     options_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                 "options_2.json")
     weights_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                 "weights_2.hdf5")
     token_embedding = ElmoTokenEmbedder(options_file,
                                         weights_file,
                                         dropout=DROPOUT,
                                         projection_dim=PROJECT_DIM)
 elif EMBEDDING_TYPE == "_bert":
     print("Loading bert model")
     model = BertModel.from_pretrained('bert-base-uncased')
     token_embedding = BertEmbedder(model)
     PROJECT_DIM = 768
 else:
     print("Error: Some weird Embedding type", EMBEDDING_TYPE)
     exit()
 word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
 HIDDEN_DIM = 200
 params = Params({
     'input_dim': PROJECT_DIM,
     'hidden_dims': HIDDEN_DIM,
     'activations': 'relu',
     'num_layers': NUM_LAYERS,
     'dropout': DROPOUT
 })
 attend_feedforward = FeedForward.from_params(params)
Code example #20
def main():
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    args = get_args()
    log_info.print_args(args)

    device, n_gpu = initialization.init_cuda_from_args(args, logger=logger)
    initialization.init_seed(args, n_gpu=n_gpu, logger=logger)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    initialization.save_args(args)
    task = get_task(args.task_name, args.data_dir)

    # prepare examples, load model as encoder
    tokenizer = shared_model_setup.create_tokenizer(
        bert_model_name=args.bert_model,
        bert_load_mode=args.bert_load_mode,
        do_lower_case=args.do_lower_case,
        bert_vocab_path=args.bert_vocab_path,
    )
    all_state = shared_model_setup.load_overall_state(args.bert_load_path,
                                                      relaxed=True)

    # Load Model...
    if args.bert_load_mode == "state_model_only":
        state_dict = all_state['model']
        bert_as_encoder = BertModel.from_state_dict(
            config_file=args.bert_config_json_path, state_dict=state_dict)
    else:
        assert args.bert_load_mode == "from_pretrained"
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(
            args.local_rank)
        bert_as_encoder = BertModel.from_pretrained(
            pretrained_model_name_or_path=args.bert_model, cache_dir=cache_dir)

    bert_as_encoder.to(device)

    runner_param = RunnerParameters(
        max_seq_length=args.max_seq_length,
        local_rank=args.local_rank,
        n_gpu=n_gpu,
        fp16=args.fp16,
        learning_rate=args.learning_rate,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        t_total=None,
        warmup_proportion=args.warmup_proportion,
        num_train_epochs=args.num_train_epochs,
        train_batch_size=args.train_batch_size,
        eval_batch_size=args.eval_batch_size,
    )

    runner = EmbeddingTaskRunner(bert_model=bert_as_encoder,
                                 optimizer=None,
                                 tokenizer=tokenizer,
                                 label_list=task.get_labels(),
                                 device=device,
                                 rparams=runner_param)

    # Run training set encoding...
    print("Run training set encoding ... ")
    train_examples = task.get_train_examples()
    train_dataset = runner.run_encoding(train_examples,
                                        verbose=True,
                                        mode='train')
    print("saving embeddings ... ")
    torch.save(train_dataset, os.path.join(args.output_dir, "train.dataset"))

    # Run development set encoding ...
    eval_examples = task.get_dev_examples()
    eval_dataset = runner.run_encoding(eval_examples,
                                       verbose=True,
                                       mode='eval')
    print("saving embeddings ... ")
    torch.save(eval_dataset, os.path.join(args.output_dir, 'dev.dataset'))

    # Run test set encoding ...
    test_examples = task.get_test_examples()
    test_dataset = runner.run_encoding(test_examples,
                                       verbose=True,
                                       mode='test')
    print("saving embeddings ... ")
    torch.save(test_dataset, os.path.join(args.output_dir, "test.dataset"))

    # HACK for MNLI mis-matched set ...
    if args.task_name == 'mnli':
        print("=== Start embedding task for MNLI mis-matched ===")
        mm_eval_examples = MnliMismatchedProcessor().get_dev_examples(
            task.data_dir)
        mm_eval_dataset = runner.run_encoding(mm_eval_examples,
                                              verbose=True,
                                              mode='eval')
        print("=== Saving eval dataset ===")
        torch.save(mm_eval_dataset,
                   os.path.join(args.output_dir, "mm_dev.dataset"))
        print("=== Saved ===")

        mm_test_examples = MnliMismatchedProcessor().get_test_examples(
            task.data_dir)
        mm_test_dataset = runner.run_encoding(mm_test_examples,
                                              verbose=True,
                                              mode='test')
        print("=== Saving tensor dataset ===")
        torch.save(mm_test_dataset,
                   os.path.join(args.output_dir, "mm_test.dataset"))
        print("=== Saved ===")
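The tensors saved above can be loaded back directly with torch.load; a minimal sketch, where output_dir is a placeholder for args.output_dir in the script above.

import os
import torch

output_dir = "path/to/output_dir"  # illustrative placeholder for args.output_dir
train_dataset = torch.load(os.path.join(output_dir, "train.dataset"))
eval_dataset = torch.load(os.path.join(output_dir, "dev.dataset"))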
Code example #21
class BertQAYesnoHierarchicalReinforceRACE(BertPreTrainedModel):
    """
    Hard attention using reinforce learning
    """
    def __init__(self,
                 config,
                 evidence_lambda=0.8,
                 num_choices=4,
                 sample_steps: int = 5,
                 reward_func: int = 0,
                 freeze_bert=False):
        super(BertQAYesnoHierarchicalReinforceRACE, self).__init__(config)
        logger.info(f'The model {self.__class__.__name__} is loading...')
        logger.info(f'The coefficient of evidence loss is {evidence_lambda}')
        logger.info(f'Currently the number of choices is {num_choices}')
        logger.info(f'Sample steps: {sample_steps}')
        logger.info(f'Reward function: {reward_func}')
        logger.info(f'Whether to freeze BERT parameters: {freeze_bert}')

        layers.set_seq_dropout(True)
        layers.set_my_dropout_prob(config.hidden_dropout_prob)
        rep_layers.set_seq_dropout(True)
        rep_layers.set_my_dropout_prob(config.hidden_dropout_prob)

        self.bert = BertModel(config)

        if freeze_bert:
            for p in self.bert.parameters():
                p.requires_grad = False

        self.doc_sen_self_attn = rep_layers.LinearSelfAttention(
            config.hidden_size)
        self.que_self_attn = rep_layers.LinearSelfAttention(config.hidden_size)

        self.word_similarity = layers.AttentionScore(config.hidden_size,
                                                     250,
                                                     do_similarity=False)
        self.vector_similarity = layers.AttentionScore(config.hidden_size,
                                                       250,
                                                       do_similarity=False)

        # self.yesno_predictor = nn.Linear(config.hidden_size * 2, 3)
        self.classifier = nn.Linear(config.hidden_size * 2, 1)
        self.evidence_lam = evidence_lambda
        self.sample_steps = sample_steps
        self.reward_func = [self.reinforce_step,
                            self.reinforce_step_1][reward_func]
        self.num_choices = num_choices

        self.apply(self.init_bert_weights)

    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                labels=None,
                sentence_span_list=None,
                sentence_ids=None,
                max_sentences: int = 0):
        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
        flat_token_type_ids = token_type_ids.view(
            -1,
            token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(
            -1,
            attention_mask.size(-1)) if attention_mask is not None else None
        sequence_output, _ = self.bert(flat_input_ids,
                                       flat_token_type_ids,
                                       flat_attention_mask,
                                       output_all_encoded_layers=False)

        # mask: 1 for masked value and 0 for true value
        # doc, que, doc_mask, que_mask = layers.split_doc_que(sequence_output, token_type_ids, attention_mask)
        doc_sen, que, doc_sen_mask, que_mask, sentence_mask = \
            rep_layers.split_doc_sen_que(sequence_output, flat_token_type_ids, flat_attention_mask, sentence_span_list,
                                         max_sentences=max_sentences)

        batch, max_sen, doc_len = doc_sen_mask.size()

        que_vec = self.que_self_attn(que, que_mask).view(batch, 1, -1)

        doc = doc_sen.reshape(batch, max_sen * doc_len, -1)
        word_sim = self.word_similarity(que_vec,
                                        doc).view(batch * max_sen, doc_len)
        doc = doc_sen.reshape(batch * max_sen, doc_len, -1)
        doc_mask = doc_sen_mask.reshape(batch * max_sen, doc_len)
        word_hidden = rep_layers.masked_softmax(word_sim, doc_mask,
                                                dim=1).unsqueeze(1).bmm(doc)

        word_hidden = word_hidden.view(batch, max_sen, -1)

        doc_vecs = self.doc_sen_self_attn(doc,
                                          doc_mask).view(batch, max_sen, -1)

        sentence_sim = self.vector_similarity(que_vec, doc_vecs)
        if self.training:
            _sample_prob, _sample_log_prob = self.sample_one_hot(
                sentence_sim, sentence_mask)
            loss_and_reward, _ = self.reward_func(word_hidden, que_vec, labels,
                                                  _sample_prob,
                                                  _sample_log_prob)
            output_dict = {'loss': loss_and_reward}
        else:
            _prob, _ = self.sample_one_hot(sentence_sim, sentence_mask)
            loss, _choice_logits = self.simple_step(word_hidden, que_vec,
                                                    labels, _prob)
            sentence_scores = rep_layers.masked_softmax(sentence_sim,
                                                        sentence_mask,
                                                        dim=-1).squeeze_(1)
            output_dict = {
                'sentence_logits': sentence_scores.float(),
                'loss': loss,
                'choice_logits': _choice_logits.float()
            }

        return output_dict

    def sample_one_hot(self, _similarity, _mask):
        _probability = rep_layers.masked_softmax(_similarity, _mask)
        dtype = _probability.dtype
        _probability = _probability.float()
        # _log_probability = masked_log_softmax(_similarity, _mask)
        if self.training:
            _distribution = Categorical(_probability)
            _sample_index = _distribution.sample((self.sample_steps, ))
            logger.debug(str(_sample_index.size()))
            new_shape = (self.sample_steps, ) + _similarity.size()
            logger.debug(str(new_shape))
            _sample_one_hot = F.one_hot(_sample_index,
                                        num_classes=_similarity.size(-1))
            # _sample_one_hot = _similarity.new_zeros(new_shape).scatter(-1, _sample_index.unsqueeze(-1), 1.0)
            logger.debug(str(_sample_one_hot.size()))
            _log_prob = _distribution.log_prob(
                _sample_index)  # sample_steps, batch, 1
            assert _log_prob.size() == new_shape[:-1], (_log_prob.size(),
                                                        new_shape)
            _sample_one_hot = _sample_one_hot.transpose(
                0, 1)  # batch, sample_steps, 1, max_sen
            _log_prob = _log_prob.transpose(0, 1)  # batch, sample_steps, 1
            return _sample_one_hot.to(dtype=dtype), _log_prob.to(dtype=dtype)
        else:
            _max_index = _probability.float().max(dim=-1, keepdim=True)[1]
            _one_hot = torch.zeros_like(_similarity).scatter_(
                -1, _max_index, 1.0)
            # _log_prob = _log_probability.gather(-1, _max_index)
            return _one_hot, None

    def reinforce_step(self, hidden, q_vec, label, prob, log_prob):
        batch, max_sen, hidden_dim = hidden.size()
        assert q_vec.size() == (batch, 1, hidden_dim)
        assert prob.size() == (batch, self.sample_steps, 1, max_sen)
        assert log_prob.size() == (batch, self.sample_steps, 1)
        expanded_hidden = hidden.unsqueeze(1).expand(-1, self.sample_steps, -1,
                                                     -1)
        h = prob.matmul(expanded_hidden).squeeze(
            2)  # batch, sample_steps, hidden_dim
        q = q_vec.expand(-1, self.sample_steps, -1)
        # _logits = self.classifier(torch.cat([h, q], dim=2)).view(-1, self.num_choices)  # batch, sample_steps, 3
        # Note the rank of dimension here
        _logits = self.classifier(torch.cat([h, q], dim=2)).view(label.size(0), self.num_choices, self.sample_steps)\
            .transpose(1, 2).reshape(-1, self.num_choices)
        expanded_label = label.unsqueeze(1).expand(
            -1, self.sample_steps).reshape(-1)
        _loss = F.cross_entropy(_logits, expanded_label)
        corrects = (_logits.max(dim=-1)[1] == expanded_label).to(hidden.dtype)
        log_prob = log_prob.reshape(label.size(0), self.num_choices,
                                    self.sample_steps).transpose(
                                        1, 2).mean(dim=-1)
        reward1 = (log_prob.reshape(-1) *
                   corrects).sum() / (self.sample_steps * label.size(0))
        return _loss - reward1, _logits

    def reinforce_step_1(self, hidden, q_vec, label, prob, log_prob):
        batch, max_sen, hidden_dim = hidden.size()
        assert q_vec.size() == (batch, 1, hidden_dim)
        assert prob.size() == (batch, self.sample_steps, 1, max_sen)
        assert log_prob.size() == (batch, self.sample_steps, 1)
        expanded_hidden = hidden.unsqueeze(1).expand(-1, self.sample_steps, -1,
                                                     -1)
        h = prob.matmul(expanded_hidden).squeeze(
            2)  # batch, sample_steps, hidden_dim
        q = q_vec.expand(-1, self.sample_steps, -1)
        # _logits = self.classifier(torch.cat([h, q], dim=2)).view(-1, self.num_choices)  # batch * sample_steps, 3
        _logits = self.classifier(torch.cat([h, q], dim=2)).view(label.size(0), self.num_choices, self.sample_steps)\
            .transpose(1, 2).reshape(-1, self.num_choices)
        expanded_label = label.unsqueeze(1).expand(
            -1, self.sample_steps).reshape(-1)  # batch * sample_steps

        _loss = F.cross_entropy(_logits, expanded_label)

        _final_log_prob = F.log_softmax(_logits, dim=-1)
        # ignore_mask = (expanded_label == -1)
        # expanded_label = expanded_label.masked_fill(ignore_mask, 0)
        selected_log_prob = _final_log_prob.gather(
            1, expanded_label.unsqueeze(1)).squeeze(-1)  # batch * sample_steps
        assert selected_log_prob.size() == (
            label.size(0) * self.sample_steps, ), selected_log_prob.size()
        log_prob = log_prob.reshape(label.size(0), self.num_choices,
                                    self.sample_steps).transpose(
                                        1, 2).mean(dim=-1)
        # reward2 = - (log_prob.reshape(-1) * (selected_log_prob * (1 - ignore_mask).to(log_prob.dtype))).sum() / (
        #         self.sample_steps * batch)
        reward2 = -(log_prob.reshape(-1) * selected_log_prob).sum() / (
            self.sample_steps * label.size(0))

        return _loss - reward2, _logits

    def simple_step(self, hidden, q_vec, label, prob):
        batch, max_sen, hidden_dim = hidden.size()
        assert q_vec.size() == (batch, 1, hidden_dim)
        assert prob.size() == (batch, 1, max_sen)
        h = prob.bmm(hidden)
        _logits = self.classifier(torch.cat([h, q_vec],
                                            dim=2)).view(-1, self.num_choices)
        if label is not None:
            _loss = F.cross_entropy(_logits, label)
        else:
            _loss = _logits.new_zeros(1)
        return _loss, _logits
Code example #22
def extract_embeddings(dataname,
                       layer_indexes=[-1, -2, -3, -4],
                       bert_model='bert-large-uncased',
                       max_seq_length=128,
                       batch_size=32,
                       data_dir='../data/'):

    input_corpus = data_dir + dataname + '/corpus.txt'
    input_vocab = data_dir + dataname + '/vocab.txt'
    output_embedding = data_dir + dataname + '/bert_embeddings.pickle'
    #    output_tokenized_corpus = data_dir + dataname + '/tokenized_corpus.pickle'

    reader = open(input_corpus, 'r', encoding='utf8')
    total_lines = get_num_lines(input_corpus)
    vocab = set([
        each.split('\t')[0] + '||'
        for each in open(input_vocab, 'r', encoding='utf8').read().split('\n')
    ])

    tokenizer = BertTokenizer.from_pretrained(bert_model)

    model = BertModel.from_pretrained(bert_model)
    model.cuda()
    model.eval()

    batch_sentences = {}
    fout = open(output_embedding, 'wb')
    with torch.no_grad():
        for sent_id, line in enumerate(tqdm(reader, total=total_lines)):

            if len(batch_sentences) < batch_size and sent_id < total_lines - 1:
                line = line.strip()
                terms = line.split(' ')
                intersection = set(terms).intersection(vocab)
                if intersection:
                    raw_line = re.sub(r"\|\|.+?\|\|", '',
                                      line).replace('_', ' ')
                    sent_info = get_sent_info(raw_line, intersection, sent_id,
                                              max_seq_length)
                    batch_sentences[sent_id] = {
                        'raw_sent': raw_line,
                        'sent_info': sent_info
                    }
            else:

                batch_tokenized_sents = [
                    tokenize_sent(sent_id, values['raw_sent'], max_seq_length,
                                  tokenizer)
                    for sent_id, values in batch_sentences.items()
                ]

                batch_input_ids = torch.tensor(
                    [sent['input_ids'] for sent in batch_tokenized_sents],
                    dtype=torch.long).cuda()
                batch_input_mask = torch.tensor(
                    [sent['input_mask'] for sent in batch_tokenized_sents],
                    dtype=torch.long).cuda()

                all_encoder_layers, _ = model(batch_input_ids,
                                              attention_mask=batch_input_mask)

                ### performance bottleneck
                #s = time.time()
                emb_encoder_layers = torch.stack(
                    all_encoder_layers)[layer_indexes]
                for idx, sent_id in enumerate(batch_sentences):
                    sent_info = batch_sentences[sent_id]['sent_info']
                    [term_info.__setitem__('embedding', emb_encoder_layers[:, idx, term_info['loc'][0]:term_info['loc'][1], :].detach().cpu().numpy().astype(np.float16))\
                     for term_info in sent_info]
                #print(time.time()-s)
                ### performance bottleneck

                pickle.dump(batch_sentences, fout)
                batch_sentences = {}

                line = line.strip()
                terms = line.split(' ')
                intersection = set(terms).intersection(vocab)
                if intersection:
                    raw_line = re.sub(r"\|\|.+?\|\|", '',
                                      line).replace('_', ' ')
                    sent_info = get_sent_info(raw_line, intersection, sent_id,
                                              max_seq_length)
                    batch_sentences[sent_id] = {
                        'raw_sent': raw_line,
                        'sent_info': sent_info
                    }

    reader.close()
    fout.close()
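A hedged call sketch for extract_embeddings; the dataset name 'my_corpus' and the implied ../data/my_corpus/{corpus.txt, vocab.txt} layout are assumptions, and a CUDA device is required since the function calls model.cuda().

extract_embeddings('my_corpus',                  # assumed dataset name
                   layer_indexes=[-1, -2, -3, -4],
                   bert_model='bert-large-uncased',
                   max_seq_length=128,
                   batch_size=32,
                   data_dir='../data/')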
Code example #23
    def __init__(self, corpus=None, emb_size=256, jemb_drop_out=0.1, bert_model='bert-base-uncased', \
     coordmap=True, leaky=False, dataset=None, light=False,seg=False,att=False,args=None):
        super(grounding_model, self).__init__()
        self.coordmap = coordmap
        self.light = light
        self.seg = seg
        self.att = att
        self.lstm = (corpus is not None)
        self.emb_size = emb_size
        if bert_model == 'bert-base-uncased':
            self.textdim = 768
        else:
            self.textdim = 1024
        ## Visual model
        self.visumodel = Darknet(config_path='./model/yolov3.cfg')
        self.visumodel.load_weights('./saved_models/yolov3.weights')
        # self.visumodel = torch.hub.load('pytorch/vision:v0.6.0', 'deeplabv3_resnet101', pretrained=True)
        # self.visumodel =deeplabv3_resnet101(pretrained=False, progress=True, num_classes=21, aux_loss=None)
        self.intmd_fea = []
        ## Text model
        if self.lstm:
            self.textdim, self.embdim = 1024, 512
            self.textmodel = RNNEncoder(vocab_size=len(corpus),
                                        word_embedding_size=self.embdim,
                                        word_vec_size=self.textdim // 2,
                                        hidden_size=self.textdim // 2,
                                        bidirectional=True,
                                        input_dropout_p=0.2,
                                        variable_lengths=True)
        else:
            self.textmodel = BertModel.from_pretrained(bert_model)

        ## Mapping module
        self.mapping_visu = nn.Sequential(
            OrderedDict([
                ('0', ConvBatchNormReLU(1024,
                                        emb_size,
                                        1,
                                        1,
                                        0,
                                        1,
                                        leaky=leaky)),
                ('1', ConvBatchNormReLU(512, emb_size, 1, 1, 0, 1,
                                        leaky=leaky)),
                ('2', ConvBatchNormReLU(256, emb_size, 1, 1, 0, 1,
                                        leaky=leaky))
            ]))
        self.mapping_lang = torch.nn.Sequential(
            nn.Linear(self.textdim, emb_size),
            nn.BatchNorm1d(emb_size),
            nn.ReLU(),
            nn.Dropout(jemb_drop_out),
            nn.Linear(emb_size, emb_size),
            nn.BatchNorm1d(emb_size),
            nn.ReLU(),
        )
        embin_size = emb_size * 2
        if self.coordmap:
            embin_size += 8
        if self.light:
            self.fcn_emb = nn.Sequential(
                OrderedDict([
                    ('0',
                     torch.nn.Sequential(
                         ConvBatchNormReLU(embin_size,
                                           emb_size,
                                           1,
                                           1,
                                           0,
                                           1,
                                           leaky=leaky), )),
                    ('1',
                     torch.nn.Sequential(
                         ConvBatchNormReLU(embin_size,
                                           emb_size,
                                           1,
                                           1,
                                           0,
                                           1,
                                           leaky=leaky), )),
                    ('2',
                     torch.nn.Sequential(
                         ConvBatchNormReLU(embin_size,
                                           emb_size,
                                           1,
                                           1,
                                           0,
                                           1,
                                           leaky=leaky), )),
                ]))
            self.fcn_out = nn.Sequential(
                OrderedDict([
                    ('0',
                     torch.nn.Sequential(
                         nn.Conv2d(emb_size, 3 * 5, kernel_size=1), )),
                    ('1',
                     torch.nn.Sequential(
                         nn.Conv2d(emb_size, 3 * 5, kernel_size=1), )),
                    ('2',
                     torch.nn.Sequential(
                         nn.Conv2d(emb_size, 3 * 5, kernel_size=1), )),
                ]))
        else:
            self.fcn_emb = nn.Sequential(
                OrderedDict([
                    (
                        '0',
                        torch.nn.Sequential(
                            ConvBatchNormReLU(embin_size,
                                              emb_size,
                                              1,
                                              1,
                                              0,
                                              1,
                                              leaky=leaky),
                            # Self_Attn(emb_size,'relu'),
                            ConvBatchNormReLU(emb_size,
                                              emb_size,
                                              3,
                                              1,
                                              1,
                                              1,
                                              leaky=leaky),
                            # Self_Attn(emb_size, 'relu'),
                            ConvBatchNormReLU(emb_size,
                                              emb_size,
                                              1,
                                              1,
                                              0,
                                              1,
                                              leaky=leaky),
                            # NLBlockND(in_channels=emb_size, dimension=2)
                        )),
                    (
                        '1',
                        torch.nn.Sequential(
                            ConvBatchNormReLU(embin_size,
                                              emb_size,
                                              1,
                                              1,
                                              0,
                                              1,
                                              leaky=leaky),
                            # Self_Attn(emb_size, 'relu'),
                            ConvBatchNormReLU(emb_size,
                                              emb_size,
                                              3,
                                              1,
                                              1,
                                              1,
                                              leaky=leaky),
                            # Self_Attn(emb_size, 'relu'),
                            ConvBatchNormReLU(emb_size,
                                              emb_size,
                                              1,
                                              1,
                                              0,
                                              1,
                                              leaky=leaky),
                            # NLBlockND(in_channels=emb_size, dimension=2)
                        )),
                    (
                        '2',
                        torch.nn.Sequential(
                            ConvBatchNormReLU(embin_size,
                                              emb_size,
                                              1,
                                              1,
                                              0,
                                              1,
                                              leaky=leaky),
                            # Self_Attn(emb_size, 'relu'),
                            ConvBatchNormReLU(emb_size,
                                              emb_size,
                                              3,
                                              1,
                                              1,
                                              1,
                                              leaky=leaky),
                            # Self_Attn(emb_size, 'relu'),
                            ConvBatchNormReLU(emb_size,
                                              emb_size,
                                              1,
                                              1,
                                              0,
                                              1,
                                              leaky=leaky),
                            # NLBlockND(in_channels=emb_size, dimension=2)
                        )),
                ]))
            self.fcn_out = nn.Sequential(
                OrderedDict([
                    ('0',
                     torch.nn.Sequential(
                         ConvBatchNormReLU(emb_size,
                                           emb_size // 2,
                                           1,
                                           1,
                                           0,
                                           1,
                                           leaky=leaky),
                         nn.Conv2d(emb_size // 2, 3, kernel_size=1),
                     )),
                    ('1',
                     torch.nn.Sequential(
                         ConvBatchNormReLU(emb_size,
                                           emb_size // 2,
                                           1,
                                           1,
                                           0,
                                           1,
                                           leaky=leaky),
                         nn.Conv2d(emb_size // 2, 3, kernel_size=1),
                     )),
                    ('2',
                     torch.nn.Sequential(
                         ConvBatchNormReLU(emb_size,
                                           emb_size // 2,
                                           1,
                                           1,
                                           0,
                                           1,
                                           leaky=leaky),
                         nn.Conv2d(emb_size // 2, 3, kernel_size=1),
                     )),
                ]))
            if self.att:
                self.attn_emb = Self_Attn(4, emb_size, 'relu')
            # self.fcn_emb=torch.nn.Sequential(
            #         ConvBatchNormReLU(embin_size, emb_size, 1, 1, 0, 1, leaky=leaky),
            #         # Self_Attn(emb_size,'relu'),
            #         ConvBatchNormReLU(emb_size, emb_size, 3, 1, 1, 1, leaky=leaky),
            #         # Self_Attn(emb_size, 'relu'),
            #         ConvBatchNormReLU(emb_size, emb_size, 1, 1, 0, 1, leaky=leaky),)

            self.fcn_out_offset = nn.Sequential(
                OrderedDict([
                    ('0',
                     torch.nn.Sequential(
                         ConvBatchNormReLU(emb_size,
                                           emb_size // 2,
                                           1,
                                           1,
                                           0,
                                           1,
                                           leaky=leaky),
                         nn.Conv2d(emb_size // 2, 2, kernel_size=1),
                     )),
                    ('1',
                     torch.nn.Sequential(
                         ConvBatchNormReLU(emb_size,
                                           emb_size // 2,
                                           1,
                                           1,
                                           0,
                                           1,
                                           leaky=leaky),
                         nn.Conv2d(emb_size // 2, 2, kernel_size=1),
                     )),
                    ('2',
                     torch.nn.Sequential(
                         ConvBatchNormReLU(emb_size,
                                           emb_size // 2,
                                           1,
                                           1,
                                           0,
                                           1,
                                           leaky=leaky),
                         nn.Conv2d(emb_size // 2, 2, kernel_size=1),
                     )),
                ]))
            self.fcn_out_center = nn.Sequential(
                OrderedDict([
                    (
                        '0',
                        torch.nn.Sequential(
                            ConvBatchNormReLU(emb_size,
                                              emb_size,
                                              1,
                                              1,
                                              0,
                                              1,
                                              leaky=leaky),
                            ConvBatchNormReLU(emb_size,
                                              emb_size,
                                              3,
                                              1,
                                              1,
                                              1,
                                              leaky=leaky),
                            # NLBlockND(in_channels=emb_size, dimension=2),
                            ConvBatchNormReLU(emb_size,
                                              emb_size // 2,
                                              1,
                                              1,
                                              0,
                                              1,
                                              leaky=leaky),
                            nn.Conv2d(emb_size // 2,
                                      int(args.size / 32) *
                                      int(args.size / 32),
                                      kernel_size=1),
                        )),
                    (
                        '1',
                        torch.nn.Sequential(
                            ConvBatchNormReLU(emb_size,
                                              emb_size,
                                              1,
                                              1,
                                              0,
                                              1,
                                              leaky=leaky),
                            ConvBatchNormReLU(emb_size,
                                              emb_size,
                                              3,
                                              1,
                                              1,
                                              1,
                                              leaky=leaky),
                            # NLBlockND(in_channels=emb_size, dimension=2),
                            ConvBatchNormReLU(emb_size,
                                              emb_size // 2,
                                              1,
                                              1,
                                              0,
                                              1,
                                              leaky=leaky),
                            nn.Conv2d(emb_size // 2,
                                      int(args.size / 16) *
                                      int(args.size / 16),
                                      kernel_size=1),
                        )),
                    (
                        '2',
                        torch.nn.Sequential(
                            ConvBatchNormReLU(emb_size,
                                              emb_size,
                                              1,
                                              1,
                                              0,
                                              1,
                                              leaky=leaky),
                            ConvBatchNormReLU(emb_size,
                                              emb_size,
                                              3,
                                              1,
                                              1,
                                              1,
                                              leaky=leaky),
                            # NLBlockND(in_channels=emb_size, dimension=2),
                            ConvBatchNormReLU(emb_size,
                                              emb_size,
                                              1,
                                              1,
                                              0,
                                              1,
                                              leaky=leaky),
                            nn.Conv2d(emb_size,
                                      int(args.size / 8) * int(args.size / 8),
                                      kernel_size=1),
                        )),
                ]))

        # if self.seg:
        self.segmentation = ReferCam()
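The fragment above only constructs the multi-scale prediction heads. As a hedged illustration (an assumption, not the author's actual forward pass), the sketch below shows how such heads are typically applied to one fused feature map per pyramid level; `apply_grounding_heads` and `fused_feats` are hypothetical names.

# Hypothetical helper (assumption): apply the three heads defined above to a list of
# fused feature maps, one per pyramid level.
def apply_grounding_heads(model, fused_feats):
    boxes, offsets, centers = [], [], []
    for i, feat in enumerate(fused_feats):                # three pyramid levels
        boxes.append(model.fcn_out[i](feat))              # (B, 3, H, W)
        offsets.append(model.fcn_out_offset[i](feat))     # (B, 2, H, W)
        centers.append(model.fcn_out_center[i](feat))     # (B, (size/stride)^2, H, W)
    return boxes, offsets, centers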
Code example #24
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--input_file", default=None, type=str, required=True)
    parser.add_argument("--output_file", default=None, type=str, required=True)
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    # Other parameters
    # parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using uncased model.")
    # parser.add_argument("--layers", default="-2", type=str)
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences longer "
        "than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument("--batch_size",
                        default=32,
                        type=int,
                        help="Batch size for predictions.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")

    args = parser.parse_args()

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {} n_gpu: {}".format(device, n_gpu))

    # layer_indexes = [int(x) for x in args.layers.split(",")]
    layer_index = -2  # second-to-last, which showed reasonable performance in BERT paper

    dset = BertSingleSeqDataset(args.input_file, args.bert_model,
                                args.max_seq_length)
    model = BertModel.from_pretrained(args.bert_model)
    model.to(device)

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    eval_sampler = SequentialSampler(dset)
    eval_dataloader = DataLoader(dset,
                                 sampler=eval_sampler,
                                 batch_size=args.batch_size,
                                 collate_fn=pad_collate,
                                 num_workers=8)

    model.eval()
    torch.set_grad_enabled(False)
    with h5py.File(args.output_file, "w") as h5_f:
        for batch in tqdm(eval_dataloader):
            input_ids = batch.token_ids.to(device)
            input_mask = batch.token_ids_mask.to(device)
            unique_ids = batch.unique_id

            all_encoder_layers, _ = model(
                input_ids, token_type_ids=None,
                attention_mask=input_mask)  # (#layers, bsz, #tokens, hsz)
            layer_output = all_encoder_layers[layer_index].detach().cpu(
            ).numpy()  # (bsz, #tokens, hsz)
            print("layer_output", layer_output.shape)

            for batch_idx, unique_id in enumerate(unique_ids):
                original_token_embeddings = get_original_token_embedding(
                    layer_output[batch_idx], batch.token_ids_mask[batch_idx],
                    batch.token_map[batch_idx])
                h5_f.create_dataset(str(unique_id),
                                    data=original_token_embeddings,
                                    dtype=np.float32)
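The script above writes one float32 dataset per `unique_id`. A minimal reader sketch follows; it only assumes the file layout produced by the writer, and the function name and comment shapes are illustrative.

# Hypothetical reader for the HDF5 file written above.
import h5py
import numpy as np

def load_bert_features(h5_path, unique_id):
    with h5py.File(h5_path, "r") as h5_f:
        return np.asarray(h5_f[str(unique_id)])  # (#original_tokens, hidden_size)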
Code example #25
class BertQAYesnoHierarchicalHardRACE(BertPreTrainedModel):
    """
    Hard:
    Hard attention, using Gumbel softmax or reinforcement learning.
    """
    def __init__(self,
                 config,
                 evidence_lambda=0.8,
                 num_choices=4,
                 use_gumbel=True,
                 freeze_bert=False):
        super(BertQAYesnoHierarchicalHardRACE, self).__init__(config)
        logger.info(f'The model {self.__class__.__name__} is loading...')
        logger.info(f'The coefficient of evidence loss is {evidence_lambda}')
        logger.info(f'Currently the number of choices is {num_choices}')
        logger.info(f'Use gumbel: {use_gumbel}')
        logger.info(f'Freeze BERT\'s parameters: {freeze_bert}')

        layers.set_seq_dropout(True)
        layers.set_my_dropout_prob(config.hidden_dropout_prob)
        rep_layers.set_seq_dropout(True)
        rep_layers.set_my_dropout_prob(config.hidden_dropout_prob)

        self.bert = BertModel(config)

        if freeze_bert:
            for p in self.bert.parameters():
                p.requires_grad = False

        # self.doc_sen_self_attn = layers.LinearSelfAttnAllennlp(config.hidden_size)
        # self.que_self_attn = layers.LinearSelfAttn(config.hidden_size)
        self.doc_sen_self_attn = rep_layers.LinearSelfAttention(
            config.hidden_size)
        self.que_self_attn = rep_layers.LinearSelfAttention(config.hidden_size)

        self.word_similarity = layers.AttentionScore(config.hidden_size,
                                                     250,
                                                     do_similarity=False)
        self.vector_similarity = layers.AttentionScore(config.hidden_size,
                                                       250,
                                                       do_similarity=False)

        self.classifier = nn.Linear(config.hidden_size * 2, 1)
        self.evidence_lam = evidence_lambda
        self.use_gumbel = use_gumbel
        self.num_choices = num_choices

        self.apply(self.init_bert_weights)

    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                labels=None,
                sentence_span_list=None,
                sentence_ids=None,
                max_sentences: int = 0):
        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
        flat_token_type_ids = token_type_ids.view(
            -1,
            token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(
            -1,
            attention_mask.size(-1)) if attention_mask is not None else None
        sequence_output, _ = self.bert(flat_input_ids,
                                       flat_token_type_ids,
                                       flat_attention_mask,
                                       output_all_encoded_layers=False)

        # mask: 1 for masked value and 0 for true value
        # doc, que, doc_mask, que_mask = layers.split_doc_que(sequence_output, token_type_ids, attention_mask)
        doc_sen, que, doc_sen_mask, que_mask, sentence_mask = \
            rep_layers.split_doc_sen_que(sequence_output, flat_token_type_ids, flat_attention_mask, sentence_span_list,
                                         max_sentences=max_sentences)

        batch, max_sen, doc_len = doc_sen_mask.size()
        # que_len = que_mask.size(1)

        # que_vec = layers.weighted_avg(que, self.que_self_attn(que, que_mask)).view(batch, 1, -1)
        que_vec = self.que_self_attn(que, que_mask).view(batch, 1, -1)

        doc = doc_sen.reshape(batch, max_sen * doc_len, -1)
        word_sim = self.word_similarity(que_vec,
                                        doc).view(batch * max_sen, doc_len)
        doc = doc_sen.reshape(batch * max_sen, doc_len, -1)
        doc_mask = doc_sen_mask.reshape(batch * max_sen, doc_len)
        word_hidden = rep_layers.masked_softmax(word_sim, doc_mask,
                                                dim=1).unsqueeze(1).bmm(doc)

        word_hidden = word_hidden.view(batch, max_sen, -1)

        doc_vecs = self.doc_sen_self_attn(doc,
                                          doc_mask).view(batch, max_sen, -1)

        sentence_sim = self.vector_similarity(que_vec, doc_vecs)
        sentence_hidden = self.hard_sample(
            sentence_sim,
            use_gumbel=self.use_gumbel,
            dim=-1,
            hard=True,
            mask=sentence_mask).bmm(word_hidden).squeeze(1)

        choice_logits = self.classifier(
            torch.cat([sentence_hidden, que_vec.squeeze(1)],
                      dim=1)).reshape(-1, self.num_choices)

        sentence_scores = rep_layers.masked_softmax(sentence_sim,
                                                    sentence_mask,
                                                    dim=-1).squeeze_(1)
        output_dict = {
            'choice_logits':
            choice_logits.float(),
            'sentence_logits':
            sentence_scores.reshape(choice_logits.size(0), self.num_choices,
                                    max_sen).detach().cpu().float(),
        }
        loss = 0
        if labels is not None:
            choice_loss = F.cross_entropy(choice_logits, labels)
            loss += choice_loss
        if sentence_ids is not None:
            log_sentence_sim = rep_layers.masked_log_softmax(
                sentence_sim.squeeze(1), sentence_mask, dim=-1)
            sentence_loss = F.nll_loss(log_sentence_sim,
                                       sentence_ids.view(batch),
                                       reduction='sum',
                                       ignore_index=-1)
            loss += self.evidence_lam * sentence_loss / choice_logits.size(0)
        output_dict['loss'] = loss
        return output_dict

    def hard_sample(self, logits, use_gumbel, dim=-1, hard=True, mask=None):
        if use_gumbel:
            if self.training:
                probs = rep_layers.gumbel_softmax(logits,
                                                  mask=mask,
                                                  hard=hard,
                                                  dim=dim)
                return probs
            else:
                probs = rep_layers.masked_softmax(logits, mask, dim=dim)
                index = probs.max(dim, keepdim=True)[1]
                y_hard = torch.zeros_like(logits).scatter_(dim, index, 1.0)
                return y_hard
        else:
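            # The non-Gumbel branch (reinforcement-learning-style sampling per the
            # class docstring) is left unimplemented in this snippet.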
            pass
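For reference, a generic straight-through Gumbel-softmax sketch is shown below. It is assumed to mirror what `rep_layers.gumbel_softmax` does during training, but it is the textbook formulation rather than the project's exact (masked) implementation.

# Generic straight-through Gumbel-softmax (assumption: illustrative only, ignores masking).
import torch
import torch.nn.functional as F

def straight_through_gumbel_softmax(logits, tau=1.0, dim=-1):
    gumbels = -torch.empty_like(logits).exponential_().log()     # ~ Gumbel(0, 1)
    y_soft = F.softmax((logits + gumbels) / tau, dim=dim)
    index = y_soft.max(dim, keepdim=True)[1]
    y_hard = torch.zeros_like(logits).scatter_(dim, index, 1.0)
    # Forward pass uses the hard one-hot; gradients flow through the soft probabilities.
    return y_hard - y_soft.detach() + y_soft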
Code example #26
File: matcher.py  Project: Mehrad0711/mt-dnn-1
class SANBertNetwork(nn.Module):
    def __init__(self, opt, bert_config=None):
        super(SANBertNetwork, self).__init__()
        self.dropout_list = nn.ModuleList()
        self.bert_config = BertConfig.from_dict(opt)
        self.bert = BertModel(self.bert_config)
        if opt.get('dump_feature', False):
            self.opt = opt
            return
        if opt['update_bert_opt'] > 0:
            for p in self.bert.parameters():
                p.requires_grad = False
        mem_size = self.bert_config.hidden_size
        self.decoder_opt = opt['answer_opt']
        self.scoring_list = nn.ModuleList()
        labels = [int(ls) for ls in opt['label_size'].split(',')]
        task_dropout_p = opt['tasks_dropout_p']
        self.bert_pooler = None

        for task, lab in enumerate(labels):
            decoder_opt = self.decoder_opt[task]
            dropout = DropoutWrapper(task_dropout_p[task], opt['vb_dropout'])
            self.dropout_list.append(dropout)
            if decoder_opt == 1:
                out_proj = SANClassifier(mem_size, mem_size, lab, opt, prefix='answer', dropout=dropout)
                self.scoring_list.append(out_proj)
            else:
                out_proj = nn.Linear(self.bert_config.hidden_size, lab)
                self.scoring_list.append(out_proj)

        self.opt = opt
        self._my_init()
        self.set_embed(opt)

    def _my_init(self):
        def init_weights(module):
            if isinstance(module, (nn.Linear, nn.Embedding)):
                # Slightly different from the TF version which uses truncated_normal for initialization
                # cf https://github.com/pytorch/pytorch/pull/5617
                module.weight.data.normal_(mean=0.0, std=self.bert_config.initializer_range * self.opt['init_ratio'])
            elif isinstance(module, BertLayerNorm):
                # Slightly different from the BERT PyTorch version, which appears to be a bug.
                # Note that it only affects training from scratch. For detailed discussions, please contact xiaodl@.
                # Layer normalization (https://arxiv.org/abs/1607.06450)
                # support both old/latest version
                if 'beta' in dir(module) and 'gamma' in dir(module):
                    module.beta.data.zero_()
                    module.gamma.data.fill_(1.0)
                else:
                    module.bias.data.zero_()
                    module.weight.data.fill_(1.0)
            if isinstance(module, nn.Linear):
                module.bias.data.zero_()

        self.apply(init_weights)

    def nbert_layer(self):
        return len(self.bert.encoder.layer)

    def freeze_layers(self, max_n):
        assert max_n < self.nbert_layer()
        for i in range(0, max_n):
            self.freeze_layer(i)

    def freeze_layer(self, n):
        assert n < self.nbert_layer()
        layer = self.bert.encoder.layer[n]
        for p in layer.parameters():
            p.requires_grad = False

    def set_embed(self, opt):
        bert_embeddings = self.bert.embeddings
        emb_opt = opt['embedding_opt']
        if emb_opt == 1:
            for p in bert_embeddings.word_embeddings.parameters():
                p.requires_grad = False
        elif emb_opt == 2:
            for p in bert_embeddings.position_embeddings.parameters():
                p.requires_grad = False
        elif emb_opt == 3:
            for p in bert_embeddings.token_type_embeddings.parameters():
                p.requires_grad = False
        elif emb_opt == 4:
            for p in bert_embeddings.token_type_embeddings.parameters():
                p.requires_grad = False
            for p in bert_embeddings.position_embeddings.parameters():
                p.requires_grad = False

    def forward(self, input_ids, token_type_ids, attention_mask, premise_mask=None, hyp_mask=None, task_id=0):
        all_encoder_layers, pooled_output = self.bert(input_ids, token_type_ids, attention_mask)
        sequence_output = all_encoder_layers[-1]
        if self.bert_pooler is not None:
            pooled_output = self.bert_pooler(sequence_output)
        decoder_opt = self.decoder_opt[task_id]
        if decoder_opt == 1:
            max_query = hyp_mask.size(1)
            assert max_query > 0
            assert premise_mask is not None
            assert hyp_mask is not None
            hyp_mem = sequence_output[:, :max_query, :]
            logits = self.scoring_list[task_id](sequence_output, hyp_mem, premise_mask, hyp_mask)
        else:
            pooled_output = self.dropout_list[task_id](pooled_output)
            logits = self.scoring_list[task_id](pooled_output)
        return logits
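The `freeze_layer`/`freeze_layers` helpers above implement a common partial-freezing pattern. A standalone sketch of the same idea against a plain `pytorch_pretrained_bert` `BertModel` (a minimal example, not part of the mt-dnn code) follows.

# Freeze the lower encoder layers of a vanilla BertModel (illustrative values).
from pytorch_pretrained_bert.modeling import BertModel

bert = BertModel.from_pretrained('bert-base-uncased')
n_frozen = 6  # freeze the lower half of a 12-layer encoder
for layer in bert.encoder.layer[:n_frozen]:
    for p in layer.parameters():
        p.requires_grad = False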
Code example #27
def inference(params):
    voc, all_def_embs, id2def = load_req(params)

    mapping = load_model(params, torch.tensor(voc.embedding))
    mapping.eval()
    torch.set_grad_enabled(False)

    if params.model_type == 'baseline':
        import tensorflow_hub as hub
        import tensorflow as tf
        sent_embed = hub.Module(
            "https://tfhub.dev/google/universal-sentence-encoder-large/3")
        input_sent = tf.placeholder(tf.string, shape=(None))
        encoded = sent_embed(input_sent)
        with tf.Session() as sess:
            sess.run(
                [tf.global_variables_initializer(),
                 tf.tables_initializer()])
            sess.graph.finalize()

            while True:
                _, w_id, ctx = get_input(voc)
                if w_id is None: continue
                ctx_emb = np.round(
                    sess.run(encoded, {
                        input_sent: [ctx]
                    }).astype(np.float64), 6)  # (1, 512)
                answer(mapping, all_def_embs, id2def, ctx_emb, w_id)

    elif params.model_type == 'ELMo':
        from allennlp.commands.elmo import ElmoEmbedder
        elmo = ElmoEmbedder()

        while True:
            word, w_id, ctx = get_input(voc)
            if w_id is None: continue
            ctx = ctx.split()
            ctx_emb = elmo.embed_sentence(ctx)  # (3, seq_len, 1024)

            word_pos, _ = find_varaint_word(word, ctx)
            if word_pos is None: continue

            ctx_emb = ctx_emb[:, word_pos][np.newaxis, :]
            answer(mapping, all_def_embs, id2def, ctx_emb, w_id)

    else:
        from pytorch_pretrained_bert.tokenization import BertTokenizer
        from pytorch_pretrained_bert.modeling import BertModel

        tokenizer = BertTokenizer.from_pretrained(params.bert_model,
                                                  do_lower_case=True)
        model = BertModel.from_pretrained(params.bert_model)
        model.to(device)
        model.eval()

        while True:
            word, w_id, ctx = get_input(voc)
            if w_id is None: continue

            _, word = find_varaint_word(word, ctx.split())
            if word is None: continue

            input_ids, input_mask, key_ids = convert_examples_to_features(
                word, ctx, 128, tokenizer)

            all_encoder_layers, _ = model(input_ids,
                                          token_type_ids=None,
                                          attention_mask=input_mask)

            ctx_emb = np.zeros((params.n_feats, 3, params.emb1_dim))
            for j, ly_id in enumerate(reversed(range(-params.n_feats,
                                                     0))):  # -1, -2, -3 ...
                layer_output = all_encoder_layers[ly_id].detach().cpu().numpy(
                ).astype(np.float64).squeeze()
                ctx_emb[j, :len(key_ids)] = np.round(
                    layer_output[key_ids], 6).tolist()  # (3, 768/1024)

            ctx_emb = np.transpose(ctx_emb,
                                   (0, 2, 1))  # (n_feats, 768/1024, 3)
            answer(mapping, all_def_embs, id2def, ctx_emb, w_id)
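`convert_examples_to_features` is project-specific; the sketch below shows the tokenization it is assumed to perform with `BertTokenizer` (the helper name, truncation, and padding policy are illustrative).

# Hypothetical single-sentence encoding with pytorch_pretrained_bert's BertTokenizer.
import torch

def encode_context(ctx, tokenizer, max_seq_length=128):
    tokens = ['[CLS]'] + tokenizer.tokenize(ctx)[:max_seq_length - 2] + ['[SEP]']
    ids = tokenizer.convert_tokens_to_ids(tokens)
    mask = [1] * len(ids) + [0] * (max_seq_length - len(ids))
    ids = ids + [0] * (max_seq_length - len(ids))
    return torch.tensor([ids]), torch.tensor([mask])  # (1, max_seq_length) each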
Code example #28
    def __init__(self, label_num=4):
        super(FlatBertModel, self).__init__()
        self.sentences_encoder = BertModel.from_pretrained('bert-base-cased')
        self.hidden_size = self.sentences_encoder.config.hidden_size

        self.classifer = nn.Linear(self.hidden_size, label_num)
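The example only shows the constructor. A hedged sketch of how such a model is typically used for classification (a hypothetical helper, not the original forward method) could look like this:

# Hypothetical classification step for FlatBertModel (assumption).
def classify_sentences(model, input_ids, token_type_ids=None, attention_mask=None):
    _, pooled_output = model.sentences_encoder(input_ids,
                                               token_type_ids=token_type_ids,
                                               attention_mask=attention_mask,
                                               output_all_encoded_layers=False)
    return model.classifer(pooled_output)  # (batch, label_num); attribute name as defined above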
Code example #29
def main(args):

    # to pick up here.
    if args.data_type == "matres":
        label_map = matres_label_map
    elif args.data_type == "tbd":
        label_map = tbd_label_map

    all_labels = list(OrderedDict.fromkeys(label_map.values()))

    args._label_to_id = OrderedDict([(all_labels[l], l)
                                     for l in range(len(all_labels))])
    args._id_to_label = OrderedDict([(l, all_labels[l])
                                     for l in range(len(all_labels))])
    print(args._label_to_id)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    if args.load_model_dir:
        output_model_file = os.path.join(args.load_model_dir,
                                         "pytorch_model.bin")
        model_state_dict = torch.load(output_model_file)
        bert_model = BertModel.from_pretrained('bert-base-uncased',
                                               state_dict=model_state_dict)
    else:
        bert_model = BertModel.from_pretrained('bert-base-uncased')

    train_data = pickle.load(open(args.data_dir + "/train.pickle", "rb"))
    print("process train...")
    data = [
        parallel(v, k, args, tokenizer, bert_model)
        for k, v in train_data.items()
    ]

    if args.data_type in ['tbd']:
        print("process dev...")
        dev_data = pickle.load(open(args.data_dir + "/dev.pickle", "rb"))
        dev_data = [
            parallel(v, k, args, tokenizer, bert_model)
            for k, v in dev_data.items()
        ]
        data += dev_data

    # doc splits
    if args.data_type in ['matres']:
        train_docs, dev_docs = train_test_split(args.train_docs,
                                                test_size=0.2,
                                                random_state=args.seed)
    # TBDense data has given splits on train/dev/test
    else:
        train_docs = args.train_docs
        dev_docs = args.dev_docs

    if not os.path.isdir(args.save_data_dir):
        os.mkdir(args.save_data_dir)

    if 'all' in args.split:
        print("process test...")
        test_data = pickle.load(open(args.data_dir + "/test.pickle", "rb"))
        test_data = [
            parallel(v, k, args, tokenizer, bert_model)
            for k, v in test_data.items()
        ]
        print(len(test_data))
        print(args.save_data_dir)

        with open(args.save_data_dir + '/test.pickle', 'wb') as handle:
            pickle.dump(test_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
        handle.close()

        split_and_save(train_docs, dev_docs, data, args.seed,
                       args.save_data_dir)

        # quick trick to reduce number of tokens in GloVe
        # reduce_vocab(data + test_data, args.save_data_dir, args.w2i, args.glove)
    return
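A minimal sketch for reloading one of the pickled splits written above (the path is illustrative; the per-example record layout produced by `parallel()` is project-specific and not assumed here).

# Hypothetical reload of a saved split.
import pickle

with open('save_data_dir/test.pickle', 'rb') as handle:
    test_data = pickle.load(handle)
print(len(test_data), 'test examples reloaded')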
Code example #30
    # This forward is only used for prediction, not for training;
    # don't confuse it with _forward_alg above.
    def forward(self, input_ids, segment_ids, input_mask):
        # Get the emission scores from the BiLSTM
        bert_feats = self._get_bert_features(input_ids, segment_ids,
                                             input_mask)

        # Find the best path, given the features.
        value, score, label_seq_ids = self._viterbi_decode(bert_feats)
        return value, score, label_seq_ids


start_label_id = conllProcessor.get_start_label_id()
stop_label_id = conllProcessor.get_stop_label_id()

bert_model = BertModel.from_pretrained(bert_model_scale)
model = BERT_CRF_NER(bert_model, start_label_id, stop_label_id,
                     len(label_list), max_seq_length, batch_size, device)

#%%
if load_checkpoint and os.path.exists(output_dir +
                                      '/ner_bert_crf_checkpoint.pt'):
    checkpoint = torch.load(output_dir + '/ner_bert_crf_checkpoint.pt',
                            map_location='cpu')
    start_epoch = checkpoint['epoch'] + 1
    valid_acc_prev = checkpoint['valid_acc']
    valid_f1_prev = checkpoint['valid_f1']
    pretrained_dict = checkpoint['model_state']
    net_state_dict = model.state_dict()
    # The snippet is truncated here; the completion below is an assumption based on the
    # standard pattern of loading only the checkpoint entries present in the current model.
    pretrained_dict_selected = {
        k: v
        for k, v in pretrained_dict.items() if k in net_state_dict
    }
    net_state_dict.update(pretrained_dict_selected)
    model.load_state_dict(net_state_dict)
Code example #31
def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str,
                                     model_name: str):
    """
    :param model:BertModel Pytorch model instance to be converted
    :param ckpt_dir: Tensorflow model directory
    :param model_name: model name
    :return:

    Currently supported HF models:
        Y BertModel
        N BertForMaskedLM
        N BertForPreTraining
        N BertForMultipleChoice
        N BertForNextSentencePrediction
        N BertForSequenceClassification
        N BertForQuestionAnswering
    """

    tensors_to_transpose = ("dense.weight", "attention.self.query",
                            "attention.self.key", "attention.self.value")

    var_map = (('layer.', 'layer_'), ('word_embeddings.weight',
                                      'word_embeddings'),
               ('position_embeddings.weight', 'position_embeddings'),
               ('token_type_embeddings.weight', 'token_type_embeddings'),
               ('.', '/'), ('LayerNorm/weight', 'LayerNorm/gamma'),
               ('LayerNorm/bias', 'LayerNorm/beta'), ('weight', 'kernel'))

    if not os.path.isdir(ckpt_dir):
        os.makedirs(ckpt_dir)

    state_dict = model.state_dict()

    def to_tf_var_name(name: str):
        for patt, repl in iter(var_map):
            name = name.replace(patt, repl)
        return 'bert/{}'.format(name)

    def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session):
        tf_dtype = tf.dtypes.as_dtype(tensor.dtype)
        tf_var = tf.get_variable(dtype=tf_dtype,
                                 shape=tensor.shape,
                                 name=name,
                                 initializer=tf.zeros_initializer())
        session.run(tf.variables_initializer([tf_var]))
        session.run(tf_var)
        return tf_var

    tf.reset_default_graph()
    with tf.Session() as session:
        for var_name in state_dict:
            tf_name = to_tf_var_name(var_name)
            torch_tensor = state_dict[var_name].numpy()
            if any([x in var_name for x in tensors_to_transpose]):
                torch_tensor = torch_tensor.T
            tf_var = create_tf_var(tensor=torch_tensor,
                                   name=tf_name,
                                   session=session)
            tf.keras.backend.set_value(tf_var, torch_tensor)
            tf_weight = session.run(tf_var)
            print("Successfully created {}: {}".format(
                tf_name, np.allclose(tf_weight, torch_tensor)))

        saver = tf.train.Saver(tf.trainable_variables())
        saver.save(
            session,
            os.path.join(ckpt_dir,
                         model_name.replace("-", "_") + ".ckpt"))
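A hedged usage sketch for the converter above (model name and checkpoint directory are illustrative; it assumes the script's own TensorFlow/NumPy imports are in place).

# Hypothetical call to the converter defined above.
from pytorch_pretrained_bert.modeling import BertModel

model = BertModel.from_pretrained('bert-base-uncased')
convert_pytorch_checkpoint_to_tf(model=model,
                                 ckpt_dir='./tf_ckpt',
                                 model_name='bert-base-uncased')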
Code example #32
    random.shuffle(train_examples)
    random.shuffle(val_examples)
    train_dataloader = DataLoader(dataset=DomainData(train_examples,
                                                     label_list,
                                                     max_seq_length,
                                                     tokenizer),
                                  batch_size=batch_size,
                                  shuffle=True,
                                  drop_last=False)
    val_dataset = DomainData(val_examples, label_list, max_seq_length,
                             tokenizer)
    val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size)

    num_train_steps = int(len(train_examples) / batch_size * total_epoch_num)

    bert = BertModel.from_pretrained(bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE)
    generator = Generator1(noise_size=noise_size,
                           output_size=768,
                           hidden_sizes=[768],
                           dropout_rate=0.1)
    discriminator = Discriminator(input_size=768,
                                  hidden_sizes=[768],
                                  num_labels=len(label_list),
                                  dropout_rate=0.1)

    bert.to(device)
    if multi_gpu:
        bert = torch.nn.DataParallel(bert, device_ids=device_ids)

    generator.to(device)
    discriminator.to(device)
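To close, a minimal sketch (an assumption, not the project's actual training loop) of how the pieces above fit together: BERT's 768-d pooled output supplies "real" features, the generator maps noise to fake features of the same size, and the discriminator scores both. The `generator`/`discriminator` call signatures below are assumed.

# Hypothetical sanity check reusing the objects built above (bert, generator,
# discriminator, device, noise_size, max_seq_length).
import torch

dummy_ids = torch.randint(0, 30522, (4, max_seq_length), device=device)
with torch.no_grad():
    _, pooled = bert(dummy_ids, output_all_encoded_layers=False)   # (4, 768) real features
noise = torch.randn(4, noise_size, device=device)
fake_feats = generator(noise)                                      # assumed to return (4, 768)
real_logits = discriminator(pooled)                                # assumed interface
fake_logits = discriminator(fake_feats)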