Example No. 1
 def __init__(self):
     super().__init__()
     config = BertConfig()
     config.output_hidden_states = True
     self.bert = BertModel.from_pretrained('bert-base-uncased',
                                           config=config)
     self.bertTokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
Example No. 2
    def __init__(self,
                 *,
                 pretrained_model_name=None,
                 config_filename=None,
                 vocab_size=None,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 max_position_embeddings=512,
                 random_init=False,
                 **kwargs):
        TrainableNM.__init__(self, **kwargs)

        # Check that only one of pretrained_model_name, config_filename, and
        # vocab_size was passed in
        total = 0

        if pretrained_model_name is not None:
            total += 1
        if config_filename is not None:
            total += 1
        if vocab_size is not None:
            total += 1

        if total != 1:
            raise ValueError(
                "Only one of pretrained_model_name, vocab_size, " +
                "or config_filename should be passed into the " +
                "BERT constructor.")

        if vocab_size is not None:
            config = BertConfig(
                vocab_size_or_config_json_file=vocab_size,
                hidden_size=hidden_size,
                num_hidden_layers=num_hidden_layers,
                num_attention_heads=num_attention_heads,
                intermediate_size=intermediate_size,
                hidden_act=hidden_act,
                max_position_embeddings=max_position_embeddings)
            model = BertModel(config)
        elif pretrained_model_name is not None:
            model = BertModel.from_pretrained(pretrained_model_name)
        elif config_filename is not None:
            config = BertConfig.from_json_file(config_filename)
            model = BertModel(config)
        else:
            raise ValueError(
                "Either pretrained_model_name or vocab_size must" +
                "be passed into the BERT constructor")

        model.to(self._device)

        self.add_module("bert", model)
        self.config = model.config

        if random_init:
            self.apply(
                lambda module: transformer_weights_init(module, xavier=False))
Example No. 3
def load_model():
    model_dir = '../../model/model/'
    # BertConfig.from_pretrained returns a new config object, so capture its
    # return value (the original instance call discarded it).
    config = BertConfig.from_pretrained(
        '../../model/bert-cased/', num_labels=3, output_attentions=True)
    model = BertAttn(config,
                     option='feed',
                     dropout=0.1,
                     gpu=False,
                     seed=0,
                     do_lower_case=False)
    class_weights = [0.6058, 0.1161, 0.2781]
    model.set_focal_loss(alpha=class_weights, gamma=-1)
    model.load_model(True, model_dir)
    return model
Example No. 4
def main(args):

    if args.dataset == 'sim-R':
        from BERTDST_utils.simR_data_utils import (
            prepare_dataset, MultiWozDataset, make_turn_label, postprocessing,
            state_equal, SLOT, OP)
    if args.dataset == 'sim-M':
        from BERTDST_utils.simM_data_utils import (
            prepare_dataset, MultiWozDataset, make_turn_label, postprocessing,
            state_equal, SLOT, OP)
    if args.dataset == 'DSTC2':
        from BERTDST_utils.DSTC2_data_utils import (
            prepare_dataset, MultiWozDataset, make_turn_label, postprocessing,
            state_equal, SLOT, OP)
    if args.dataset == 'WOZ2.0':
        from BERTDST_utils.WOZ_data_utils import (
            prepare_dataset, MultiWozDataset, make_turn_label, postprocessing,
            state_equal, SLOT, OP)
    if args.dataset == 'MultiWOZ2.1':
        from BERTDST_utils.MultiWOZ_data_utils import (
            prepare_dataset, MultiWozDataset, make_turn_label, postprocessing,
            state_equal, OP, make_slot_meta)
        ontology = json.load(open(args.ontology_data_path))
        SLOT, ontology = make_slot_meta(ontology)

    slot_meta = SLOT
    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)
    data = prepare_dataset(1.0, args.test_data_path, tokenizer, slot_meta,
                           args.test_size_window, args.max_seq_length,
                           args.test_MG)

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = 0.1
    op2id = OP
    model = MGDST(model_config, len(op2id), len(slot_meta))
    ckpt = torch.load(args.model_ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt)

    model.eval()
    model.to(device)

    model_evaluation(make_turn_label, postprocessing, state_equal, OP, model,
                     data, tokenizer, slot_meta, 0, args.test_size_window,
                     args.test_MG)
Example No. 5
    def __init__(self): 
        super(Bert, self).__init__()

        lm_path = config.get('model_config')['language_model_path']
        self.tokenizer = BertTokenizer.from_pretrained(
            os.path.join(lm_path, 'bert-base-uncased-vocab.txt'))
        modelConfig = BertConfig.from_pretrained(
            os.path.join(lm_path, 'bert_config.json'))
        self.textExtractor = BertModel.from_pretrained(
            os.path.join(lm_path, 'pytorch_model.bin'), config=modelConfig)
Example No. 6
        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = BertConfig(
                vocab_size_or_config_json_file=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
                intermediate_size=self.intermediate_size,
                hidden_act=self.hidden_act,
                hidden_dropout_prob=self.hidden_dropout_prob,
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range)

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
Example No. 7
    def __init__(self, args, dictionary, left_pad=False):
        super().__init__(dictionary)
        self.dropout = args.dropout

        from pytorch_transformers import RobertaModel, BertModel
        from pytorch_transformers.file_utils import PYTORCH_TRANSFORMERS_CACHE
        from pytorch_transformers import RobertaConfig, RobertaTokenizer, BertConfig, BertTokenizer

        if args.pretrained_bert_model.startswith('roberta'):
            self.embed = RobertaModel.from_pretrained(
                args.pretrained_bert_model,
                cache_dir=PYTORCH_TRANSFORMERS_CACHE /
                'distributed_{}'.format(args.distributed_rank))
            # self.context = RobertaModel.from_pretrained(args.pretrained_bert_model,
            #         cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
            self.config = RobertaConfig.from_pretrained(
                args.pretrained_bert_model)
            self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

        else:
            self.embed = BertModel.from_pretrained(
                args.pretrained_bert_model,
                cache_dir=PYTORCH_TRANSFORMERS_CACHE /
                'distributed_{}'.format(args.distributed_rank))
            # self.context = BertModel.from_pretrained(args.pretrained_bert_model,
            #         cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
            self.config = BertConfig.from_pretrained(
                args.pretrained_bert_model)

            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.padding_idx = self.tokenizer.convert_tokens_to_ids(
            self.tokenizer.pad_token)
Example No. 8
    def __init__(self, opt):
        self.opt = opt

        if 'bert' in opt.model_name:
            tokenizer = Tokenizer4Bert(opt.max_seq_len,
                                       opt.pretrained_bert_name)
            # bert = BertModel.from_pretrained(opt.pretrained_bert_name)
            config = BertConfig.from_pretrained(opt.pretrained_bert_name,
                                                output_attentions=True)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name,
                                             config=config)
            self.pretrained_bert_state_dict = bert.state_dict()
            self.model = opt.model_class(bert, opt).to(opt.device)
        else:
            tokenizer = build_tokenizer(
                fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
                max_seq_len=opt.max_seq_len,
                dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
            embedding_matrix = build_embedding_matrix(
                word2idx=tokenizer.word2idx,
                embed_dim=opt.embed_dim,
                dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                    str(opt.embed_dim), opt.dataset))
            self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

        self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
        self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)

        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(
                torch.cuda.memory_allocated(device=opt.device.index)))
        self._print_args()
Example No. 9
    def __init__(self, args, device, checkpoint=None, bert_from_extractive=None):
        super(AbsSummarizer, self).__init__()
        self.args = args
        self.device = device
        self.bert = Bert(args.large, args.temp_dir, args.finetune_bert)

        if bert_from_extractive is not None:
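            # Copy encoder weights from the extractive checkpoint; n[11:] strips
            # the 'bert.model.' prefix so the keys match this model's BertModel.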
            self.bert.model.load_state_dict(
                dict([(n[11:], p) for n, p in bert_from_extractive.items() if n.startswith('bert.model')]), strict=True)

        if (args.encoder == 'baseline'):
            bert_config = BertConfig(self.bert.model.config.vocab_size, hidden_size=args.enc_hidden_size,
                                     num_hidden_layers=args.enc_layers, num_attention_heads=8,
                                     intermediate_size=args.enc_ff_size,
                                     hidden_dropout_prob=args.enc_dropout,
                                     attention_probs_dropout_prob=args.enc_dropout)
            self.bert.model = BertModel(bert_config)

        if(args.max_pos>512):
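            # Positions beyond BERT's 512 limit: keep the pretrained embeddings
            # for the first 512 positions and repeat the last pretrained position
            # embedding for every position from 512 up to args.max_pos.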
            my_pos_embeddings = nn.Embedding(args.max_pos, self.bert.model.config.hidden_size)
            my_pos_embeddings.weight.data[:512] = self.bert.model.embeddings.position_embeddings.weight.data
            my_pos_embeddings.weight.data[512:] = self.bert.model.embeddings.position_embeddings.weight.data[-1][None,:].repeat(args.max_pos-512,1)
            self.bert.model.embeddings.position_embeddings = my_pos_embeddings
        self.vocab_size = self.bert.model.config.vocab_size
        tgt_embeddings = nn.Embedding(self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0)
        if (self.args.share_emb):
            tgt_embeddings = self.bert.model.embeddings.word_embeddings

        self.decoder = TransformerDecoder(
            self.args.dec_layers,
            self.args.dec_hidden_size, heads=self.args.dec_heads,
            d_ff=self.args.dec_ff_size, dropout=self.args.dec_dropout, embeddings=tgt_embeddings)

        self.generator = get_generator(self.vocab_size, self.args.dec_hidden_size, device)
        self.generator[0].weight = self.decoder.embeddings.weight


        if checkpoint is not None:
            self.load_state_dict(checkpoint['model'], strict=True)
        else:
            for module in self.decoder.modules():
                if isinstance(module, (nn.Linear, nn.Embedding)):
                    module.weight.data.normal_(mean=0.0, std=0.02)
                elif isinstance(module, nn.LayerNorm):
                    module.bias.data.zero_()
                    module.weight.data.fill_(1.0)
                if isinstance(module, nn.Linear) and module.bias is not None:
                    module.bias.data.zero_()
            for p in self.generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
                else:
                    p.data.zero_()
            if(args.use_bert_emb):
                tgt_embeddings = nn.Embedding(self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0)
                tgt_embeddings.weight = copy.deepcopy(self.bert.model.embeddings.word_embeddings.weight)
                self.decoder.embeddings = tgt_embeddings
                self.generator[0].weight = self.decoder.embeddings.weight

        self.to(device)
Example No. 10
    def __init__(self, args, device, checkpoint):
        super(ExtSummarizer, self).__init__()
        self.args = args
        self.device = device
        self.bert = Bert(args, args.temp_dir, args.finetune_bert)

        self.ext_layer = ExtTransformerEncoder(self.bert.model.config.hidden_size, args.ext_ff_size, args.ext_heads,
                                               args.ext_dropout, args.ext_layers)
        if (args.encoder == 'baseline'):
            bert_config = BertConfig(self.bert.model.config.vocab_size, hidden_size=args.ext_hidden_size,
                                     num_hidden_layers=args.ext_layers, num_attention_heads=args.ext_heads, intermediate_size=args.ext_ff_size)
            self.bert.model = BertModel(bert_config)
            self.ext_layer = Classifier(self.bert.model.config.hidden_size)

        if(args.max_pos>512):
            my_pos_embeddings = nn.Embedding(args.max_pos, self.bert.model.config.hidden_size)
            my_pos_embeddings.weight.data[:512] = self.bert.model.embeddings.position_embeddings.weight.data
            my_pos_embeddings.weight.data[512:] = self.bert.model.embeddings.position_embeddings.weight.data[-1][None,:].repeat(args.max_pos-512,1)
            self.bert.model.embeddings.position_embeddings = my_pos_embeddings


        if checkpoint is not None:
            self.load_state_dict(checkpoint['model'], strict=True)
        else:
            if args.param_init != 0.0:
                for p in self.ext_layer.parameters():
                    p.data.uniform_(-args.param_init, args.param_init)
            if args.param_init_glorot:
                for p in self.ext_layer.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)

        self.to(device)
Example No. 11
    def __init__(self, args, device, checkpoint):
        super(ExtSummarizer, self).__init__()
        self.args = args
        self.device = device
        self.bert = Bert(args.large, args.temp_dir, args.finetune_bert)

        self.ext_layer = ExtTransformerEncoder(
            self.bert.model.config.hidden_size, args.ext_ff_size,
            args.ext_heads, args.ext_dropout, args.ext_layers)
        if (args.encoder == 'baseline'):
            bert_config = BertConfig(self.bert.model.config.vocab_size,
                                     hidden_size=args.hidden_size,
                                     num_hidden_layers=6,
                                     num_attention_heads=8,
                                     intermediate_size=args.ff_size)
            self.bert.model = BertModel(bert_config)
            self.ext_layer = Classifier(self.bert.model.config.hidden_size)

        if checkpoint is not None:
            self.load_state_dict(checkpoint['model'], strict=True)
        else:
            if args.param_init != 0.0:
                for p in self.ext_layer.parameters():
                    p.data.uniform_(-args.param_init, args.param_init)
            if args.param_init_glorot:
                for p in self.ext_layer.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)

        self.to(device)
Example No. 12
    def __init__(self, vocab_size, tag_to_ix, hidden_dim, n_layers):
        super(BERT_BiLSTM_CRF, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        config = BertConfig.from_pretrained('bert-base-multilingual-cased')
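        # Note: BertModel(config) below builds the architecture with randomly
        # initialized weights; BertModel.from_pretrained('bert-base-multilingual-cased')
        # would also load the pretrained checkpoint.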
        self.model = BertModel(config)

        self.lstm = nn.LSTM(768,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim * 2, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size, device=device))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()
Example No. 13
def createCsvData():
    config = BertConfig.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
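    # Note: BertModel(config) uses the pretrained configuration but leaves the
    # weights randomly initialized; BertModel.from_pretrained('bert-base-uncased')
    # would load the pretrained weights as well.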
    model = BertModel(config)
    with Cd("lemmadata"):
        with open("id_to_sent.json") as sent_id_dict_file:
            sent_id_dict = json.load(sent_id_dict_file)
        for dir_item in os.listdir():
            if os.path.isfile(dir_item):
                if dir_item.endswith(".json") and dir_item != "id_to_sent.json":
                    print(dir_item)
                    with open(dir_item, "r") as f:
                        lemma_data = json.load(f)
                    with Cd("vectors"):
                        with open(dir_item[:-5]+".csv", "w") as vector_file:
                            writer = csv.writer(vector_file, delimiter=",")
                            for instance in lemma_data:
                                inst_sent_id = instance["sent_id"]
                                inst_sense = instance["sense"]
                                inst_sent = sent_id_dict[str(inst_sent_id)]
                                if len(inst_sent) > 511:
                                    continue
                                vector = vectorizeWordInContext(inst_sent, instance["pos"], tokenizer, model)
                                vec_list = vector.detach().tolist()
                                row_data = [inst_sent_id, instance["pos"], inst_sense] + vec_list
                                writer.writerow(row_data)
Example No. 14
    def __init__(self,
                 num_labels=2,
                 model_type='bert-base-uncased',
                 token_layer='token-cls',
                 output_logits=True):
        super(BertForWSD, self).__init__()

        self.config = BertConfig()  # default config; only hidden_dropout_prob is read below
        self.token_layer = token_layer
        self.num_labels = num_labels  # was hard-coded to 2, ignoring the num_labels argument
        self.bert = BertModel.from_pretrained(model_type)
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        self.output_logits = output_logits

        # Define which token selection layer to use
        if token_layer == 'token-cls':
            self.tokenselectlayer = TokenClsLayer()
        elif token_layer in ['sent-cls', 'sent-cls-ws']:
            self.tokenselectlayer = SentClsLayer()
        else:
            raise ValueError(
                "Unidentified parameter for token selection layer")

        self.classifier = nn.Linear(768, num_labels)
        if not output_logits:
            self.softmax = nn.Softmax(dim=1)  # to be checked!!!

        nn.init.xavier_normal_(self.classifier.weight)
Example No. 15
    def __init__(self):
        model_dir = '/var/model/bert'
        if not os.path.isdir(model_dir):
            model_dir = os.path.abspath(os.path.dirname(__file__) + '/../../var/model/bert')

        self.use_gpu: bool = torch.cuda.is_available()
        self.config: BertConfig = BertConfig.from_json_file(model_dir + '/config.json')
        self.tokenizer: BertTokenizer = BertTokenizer.from_pretrained(model_dir + '/vocab.txt', do_lower_case=False)

        self.model_masked: BertForMaskedLM = BertForMaskedLM.from_pretrained(model_dir + '/model.bin', config=self.config)
        self.model: BertModel = self.model_masked.bert

        # freeze bert encoder
        for param in self.model.parameters():
            param.requires_grad = False
        for param in self.model_masked.parameters():
            param.requires_grad = False

        self.model.encoder.output_hidden_states = True
        self.model.eval()
        self.model_masked.eval()

        if self.use_gpu:
            self.model.cuda()
            self.model_masked.cuda()
Example No. 16
 def __init__(self, name='bert-base-uncased', dropout=0.1, num_class=2):
     super(BertC, self).__init__()
     config = BertConfig.from_pretrained(name)
     self.bert = BertModel_attack(config)
     self.proj = nn.Linear(config.hidden_size, num_class)
     self.loss_f = nn.CrossEntropyLoss()
     self.drop = nn.Dropout(p=dropout)
Example No. 17
def main():

    bert_base_config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2)
    bert_base_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=bert_base_config)
    count = 0
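    # Each parameter tensor contributes the product of its dimensions
    # (equivalent to param.numel()).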
    for name, param in bert_base_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in bert_base_uncased: ', count)

    roberta_config = RobertaConfig.from_pretrained('roberta-base', num_labels=2)
    roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base',config=roberta_config)
    count = 0
    for name, param in roberta_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in roberta: ', count)

    albert_config = AlbertConfig.from_pretrained('albert-base-v2', num_labels=2)
    albert_model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', config=albert_config)
    count = 0
    for name, param in albert_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in albert: ', count)
Example No. 18
    def __init__(self, hidden_dim, n_layers, tagset_size):
        super(BertLSTM, self).__init__()
        config = BertConfig.from_pretrained('bert-base-multilingual-cased')
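        # Note: BertModel(config) below creates a randomly initialized encoder;
        # BertModel.from_pretrained('bert-base-multilingual-cased') would load
        # the pretrained weights.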
        self.model = BertModel(config)

        self.decoder = nn.LSTM(768, hidden_dim, n_layers)

        self.hiddentotag = nn.Linear(hidden_dim, tagset_size)
Example No. 19
 def load_model(model_name: str, do_lower_case=False):
     config = BertConfig.from_pretrained(model_name)
     tokenizer = BertTokenizer.from_pretrained(model_name,
                                               do_lower_case=do_lower_case)
     model = BertForQuestionAnswering.from_pretrained(model_name,
                                                      from_tf=False,
                                                      config=config)
     return model, tokenizer
Example No. 20
 def load_model(self, model_path: str, do_lower_case=False):
     config = BertConfig.from_pretrained(model_path + "/config.json")
     tokenizer = BertTokenizer.from_pretrained(model_path,
                                               do_lower_case=do_lower_case)
     model = BertForQuestionAnswering.from_pretrained(model_path,
                                                      from_tf=False,
                                                      config=config)
     return model, tokenizer
Example No. 21
def main():
    torch.cuda.empty_cache()
    parser = setup_parser()
    args = parser.parse_args()
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory already exists and is not empty.")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    args.n_gpu = torch.cuda.device_count()
    args.device = device
    set_seed(args)
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: {}".format(args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    ##Load Models
    config = BertConfig.from_pretrained(args.config_name)
    tokenizer = BertTokenizer.from_pretrained(args.text_encoder_checkpoint,
                                              do_lower_case=args.do_lower_case)
    text_encoder = BertModel.from_pretrained(args.text_encoder_checkpoint,
                                             config=config)
    graph_encoder = GraphEncoder(args.n_hidden, args.min_score)
    if args.graph_encoder_checkpoint:
        graph_encoder.gcnnet.load_state_dict(
            torch.load(args.graph_encoder_checkpoint))

    medsts_classifier = PairClassifier(config.hidden_size + args.n_hidden, 1)
    medsts_c_classifier = PairClassifier(config.hidden_size + args.n_hidden, 5)
    medsts_type_classifier = PairClassifier(config.hidden_size + args.n_hidden,
                                            4)
    model = MedstsNet(text_encoder, graph_encoder, medsts_classifier,
                      medsts_c_classifier, medsts_type_classifier)
    model.to(args.device)

    args.n_gpu = 1

    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer,
                                                evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info('global step = {}, average loss = {}'.format(
            global_step, tr_loss))
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        logger.info("saving model checkpoint to {}".format(args.output_dir))
        model_to_save = model.module if hasattr(model, 'module') else model
        # model_to_save.save_pretrained(args.output_dir)
        torch.save(model_to_save.state_dict(),
                   os.path.join(args.output_dir, 'saved_model.pth'))
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
Example No. 22
def start_inference(data, dialogue_type, dest, batchsize, bert_model, cuda):

    assert torch.cuda.is_available(), 'PyTorch not running on GPU! #sadpanda'

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(100)

    dialogue_type_dict = {'DB': 'db_response_new', 'normal': 'response'}

    config = BertConfig.from_pretrained(bert_model)
    tokenizer = BertTokenizer.from_pretrained(bert_model)
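    # Note: BertForNextSentencePrediction(config) yields untrained weights;
    # BertForNextSentencePrediction.from_pretrained(bert_model) would load the
    # pretrained checkpoint before running inference.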
    model = BertForNextSentencePrediction(config)
    model.cuda()
    model.eval()

    df = pd.read_csv(data, usecols=['id'])
    df.dropna(inplace=True)
    row_count = df.shape[0]
    del df

    chunk_count = math.ceil(row_count / batchsize)

    with open(dest, 'w+'):
        pass

    cols = ['context', dialogue_type_dict[dialogue_type]]
    for i, chunk in enumerate(
            tqdm(pd.read_csv(open(data, 'r'),
                             usecols=cols,
                             chunksize=batchsize),
                 desc='Batches',
                 total=chunk_count)):
        samples = get_batch(chunk, dialogue_type_dict[dialogue_type])

        assert len(samples) == chunk.shape[0], 'Some samples went missing!'

        if batchsize == 1:
            results = convert_single_example_to_features(samples, tokenizer)
        else:
            results = convert_examples_to_features(samples, tokenizer)

        with torch.no_grad():
            input_ids = torch.tensor([x.input_ids for x in results]).cuda()
            token_type_ids = torch.tensor([x.input_type_ids
                                           for x in results]).cuda()
            attention_mask = torch.tensor([x.input_mask
                                           for x in results]).cuda()

            outputs = model(input_ids,
                            token_type_ids=token_type_ids,
                            attention_mask=attention_mask)[0]
            outputs = torch.softmax(outputs, dim=1)
        db_probs = outputs[:, 1]

        with open(dest, 'a') as f:
            f.write('\n'.join([str(x) for x in db_probs.tolist()]) + '\n')
Example No. 23
def main():
    torch.cuda.empty_cache()
    parser = setup_parser()
    args = parser.parse_args()
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory already exists and is not empty.")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    args.n_gpu = torch.cuda.device_count()
    args.device = device
    set_seed(args)
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: {}".format(args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()

    ##Load Models
    config = BertConfig.from_pretrained(args.config_name)
    tokenizer = BertTokenizer.from_pretrained(args.text_encoder_checkpoint,
                                              do_lower_case=args.do_lower_case)
    text_encoder = BertModel.from_pretrained(args.text_encoder_checkpoint,
                                             config=config)
    graph_encoder = GraphEncoder(args.n_hidden, args.min_score)

    medsts_classifier = PairClassifier(config.hidden_size + args.n_hidden, 1)
    medsts_c_classifier = PairClassifier(config.hidden_size + args.n_hidden, 5)
    medsts_c2_classifier = PairClassifier(config.hidden_size + args.n_hidden,
                                          2)
    medsts_type_classifier = PairClassifier(config.hidden_size + args.n_hidden,
                                            4)
    model = MedstsNet(text_encoder, graph_encoder, medsts_classifier,
                      medsts_c_classifier, medsts_c2_classifier,
                      medsts_type_classifier)
    if args.text_only:
        medsts_classifier = PairClassifier(config.hidden_size, 1)
        medsts_c_classifier = PairClassifier(config.hidden_size, 5)
        medsts_c2_classifier = PairClassifier(config.hidden_size, 2)
        medsts_type_classifier = PairClassifier(config.hidden_size, 4)
        model = MedstsNet_Textonly(text_encoder, medsts_classifier,
                                   medsts_c_classifier, medsts_c2_classifier,
                                   medsts_type_classifier)

    model.to(args.device)

    args.n_gpu = 1

    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer,
                                                evaluate=False,
                                                reverse=True)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info('global step = {}, average loss = {}'.format(
            global_step, tr_loss))
Example No. 24
def start(check_accr=False):
    bert_config = BertConfig.from_json_file(config.bert_config_root)
    model = BertCloze(bert_config, num_choices=10)
    load_model(model, config.pretrained_bert_root)
    generate_prob(model)
    generate_result(i_range=5)
    if check_accr:
        check_result()
    print("程序运行完成")
Example No. 25
def load_artifacts(model_path):
    """ Loads pretrained model , tokenizer , config."""
    model_class = BertForQuestionAnswering
    model = model_class.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    config = BertConfig.from_pretrained(model_path)
    model.to("cpu")
    model.eval()
    return model, tokenizer, config
Example No. 26
def load_artifacts(model_path):
    """ Loads pretrained model , tokenizer , config."""
    model_class = BertForSequenceClassification
    model = model_class.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    config = BertConfig.from_pretrained(model_path)
    model.to("cpu")
    model.eval()
    return model, tokenizer, config
Example No. 27
    def __init__(self, code_length):  # code_length is the dimensionality of the fc projection output
        super(TextNet, self).__init__()

        modelConfig = BertConfig.from_pretrained(
            './data/bert-base-uncased-config.json')
        self.textExtractor = BertModel.from_pretrained(
            './data/bert-base-uncased-pytorch_model.bin', config=modelConfig)
        # self.textExtractor.eval()
        embedding_dim = self.textExtractor.config.hidden_size
Example No. 28
 def __init__(self, code_length=1024):
     super(TextNet, self).__init__()
     modelConfig = BertConfig.from_pretrained(
         '/home/hengyuli/cross-modal/model/bert_config.json')
     self.textExtractor = BertModel.from_pretrained(
         '/home/hengyuli/cross-modal/model/pytorch_model.bin',
         config=modelConfig)
     embedding_dim = self.textExtractor.config.hidden_size
     self.fc = nn.Linear(embedding_dim, code_length)
     self.tanh = torch.nn.Tanh()
Example No. 29
    def __init__(self, code_length):  # code_length is the dimensionality of the fc projection output
        super(TextNet, self).__init__()

        modelConfig = BertConfig.from_pretrained('bert-base-chinese')
        self.textExtractor = BertModel.from_pretrained('bert-base-chinese',
                                                       config=modelConfig)
        embedding_dim = self.textExtractor.config.hidden_size  # embedding_dim should be the output dimension at the point where the model is truncated

        self.fc = nn.Linear(embedding_dim, code_length)
        self.tanh = torch.nn.Tanh()
Example No. 30
    def __init__(self):
        super(Bert, self).__init__()

        self.tokenizer = BertTokenizer.from_pretrained(
            '../pretrained/bert-base-uncased/bert-base-uncased-vocab.txt')
        modelConfig = BertConfig.from_pretrained(
            '../pretrained/bert-base-uncased/bert_config.json')
        self.textExtractor = BertModel.from_pretrained(
            '../pretrained/bert-base-uncased/pytorch_model.bin',
            config=modelConfig)