def load_model():
    model_dir = '../../model/model/'
    # BertConfig.from_pretrained is a classmethod returning a new config;
    # calling it on an instance discards the result, so load the config
    # directly and pass the extra kwargs in the same call.
    config = BertConfig.from_pretrained('../../model/bert-cased/',
                                        num_labels=3,
                                        output_attentions=True)
    model = BertAttn(config, option='feed', dropout=0.1, gpu=False,
                     seed=0, do_lower_case=False)
    class_weights = [0.6058, 0.1161, 0.2781]
    model.set_focal_loss(alpha=class_weights, gamma=-1)
    model.load_model(True, model_dir)
    return model
def createCsvData():
    config = BertConfig.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # Load the pretrained weights; BertModel(config) alone would build a
    # randomly initialized encoder, which is useless for vectorization.
    model = BertModel.from_pretrained('bert-base-uncased', config=config)
    with Cd("lemmadata"):
        with open("id_to_sent.json") as sent_id_dict_file:
            sent_id_dict = json.load(sent_id_dict_file)
        for dir_item in os.listdir():
            if os.path.isfile(dir_item):
                if dir_item.endswith(".json") and dir_item != "id_to_sent.json":
                    print(dir_item)
                    with open(dir_item, "r") as f:
                        lemma_data = json.load(f)
                    with Cd("vectors"):
                        with open(dir_item[:-5] + ".csv", "w") as vector_file:
                            writer = csv.writer(vector_file, delimiter=",")
                            for instance in lemma_data:
                                inst_sent_id = instance["sent_id"]
                                inst_sense = instance["sense"]
                                inst_sent = sent_id_dict[str(inst_sent_id)]
                                # Skip sentences that would exceed BERT's
                                # 512-position limit.
                                if len(inst_sent) > 511:
                                    continue
                                vector = vectorizeWordInContext(
                                    inst_sent, instance["pos"], tokenizer, model)
                                vec_list = vector.detach().tolist()
                                row_data = [inst_sent_id, instance["pos"],
                                            inst_sense] + vec_list
                                writer.writerow(row_data)
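# createCsvData relies on a vectorizeWordInContext helper that is defined
# elsewhere. A minimal sketch of such a helper, assuming `sent` is a list of
# words and that the contextual vector is the last hidden state at the word's
# first WordPiece position (a sketch under those assumptions, not the
# original implementation):
def vectorizeWordInContext(sent, pos, tokenizer, model):
    pieces, word_start = ['[CLS]'], 1
    for i, word in enumerate(sent):
        if i == pos:
            word_start = len(pieces)  # first WordPiece of the target word
        pieces.extend(tokenizer.tokenize(word))
    pieces.append('[SEP]')
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(pieces)])
    with torch.no_grad():
        last_hidden = model(input_ids)[0]  # (1, seq_len, hidden_size)
    return last_hidden[0, word_start]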
def main():
    bert_base_config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2)
    bert_base_model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', config=bert_base_config)
    count = 0
    for name, param in bert_base_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in bert_base_uncased: ', count)

    roberta_config = RobertaConfig.from_pretrained('roberta-base', num_labels=2)
    roberta_model = RobertaForSequenceClassification.from_pretrained(
        'roberta-base', config=roberta_config)
    count = 0
    for name, param in roberta_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in roberta: ', count)

    albert_config = AlbertConfig.from_pretrained('albert-base-v2', num_labels=2)
    albert_model = AlbertForSequenceClassification.from_pretrained(
        'albert-base-v2', config=albert_config)
    count = 0
    for name, param in albert_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in albert: ', count)
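# The three counting loops above are identical, and torch already provides
# Tensor.numel() for the inner size product. A minimal helper expressing the
# same tally (a sketch, not part of the original script):
def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)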
def __init__(self, name='bert-base-uncased', dropout=0.1, num_class=2):
    super(BertC, self).__init__()
    config = BertConfig.from_pretrained(name)
    self.bert = BertModel_attack(config)
    self.proj = nn.Linear(config.hidden_size, num_class)
    self.loss_f = nn.CrossEntropyLoss()
    self.drop = nn.Dropout(p=dropout)
def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        # bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        config = BertConfig.from_pretrained(opt.pretrained_bert_name,
                                            output_attentions=True)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name, config=config)
        self.pretrained_bert_state_dict = bert.state_dict()
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, args, dictionary, left_pad=False):
    super().__init__(dictionary)
    self.dropout = args.dropout
    from pytorch_transformers import RobertaModel, BertModel
    from pytorch_transformers.file_utils import PYTORCH_TRANSFORMERS_CACHE
    from pytorch_transformers import RobertaConfig, RobertaTokenizer, BertConfig, BertTokenizer
    if args.pretrained_bert_model.startswith('roberta'):
        self.embed = RobertaModel.from_pretrained(
            args.pretrained_bert_model,
            cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
        # self.context = RobertaModel.from_pretrained(args.pretrained_bert_model,
        #     cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
        self.config = RobertaConfig.from_pretrained(args.pretrained_bert_model)
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        self.embed = BertModel.from_pretrained(
            args.pretrained_bert_model,
            cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
        # self.context = BertModel.from_pretrained(args.pretrained_bert_model,
        #     cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
        self.config = BertConfig.from_pretrained(args.pretrained_bert_model)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.padding_idx = self.tokenizer.convert_tokens_to_ids(self.tokenizer.pad_token)
def __init__(self):
    super(Bert, self).__init__()
    self.tokenizer = BertTokenizer.from_pretrained(
        os.path.join(config.get('model_config')['language_model_path'],
                     'bert-base-uncased-vocab.txt'))
    modelConfig = BertConfig.from_pretrained(
        os.path.join(config.get('model_config')['language_model_path'],
                     'bert_config.json'))
    self.textExtractor = BertModel.from_pretrained(
        os.path.join(config.get('model_config')['language_model_path'],
                     'pytorch_model.bin'),
        config=modelConfig)
def __init__(self, vocab_size, tag_to_ix, hidden_dim, n_layers):
    super(BERT_BiLSTM_CRF, self).__init__()
    self.hidden_dim = hidden_dim
    self.n_layers = n_layers
    self.vocab_size = vocab_size
    self.tag_to_ix = tag_to_ix
    self.tagset_size = len(tag_to_ix)
    config = BertConfig.from_pretrained('bert-base-multilingual-cased')
    # Note: BertModel(config) builds the architecture with randomly
    # initialized weights; pretrained weights must be loaded separately
    # if they are intended.
    self.model = BertModel(config)
    self.lstm = nn.LSTM(768, hidden_dim, num_layers=n_layers, bidirectional=True)
    # Maps the output of the LSTM into tag space.
    self.hidden2tag = nn.Linear(hidden_dim * 2, self.tagset_size)
    # Matrix of transition parameters. Entry i,j is the score of
    # transitioning *to* i *from* j.
    self.transitions = nn.Parameter(
        torch.randn(self.tagset_size, self.tagset_size, device=device))
    # These two statements enforce the constraint that we never transfer
    # to the start tag and we never transfer from the stop tag.
    self.transitions.data[tag_to_ix[START_TAG], :] = -10000
    self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
    self.hidden = self.init_hidden()
def load_model(model_name: str, do_lower_case=False):
    config = BertConfig.from_pretrained(model_name)
    tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=do_lower_case)
    model = BertForQuestionAnswering.from_pretrained(model_name, from_tf=False, config=config)
    return model, tokenizer
def __init__(self, hidden_dim, n_layers, tagset_size):
    super(BertLSTM, self).__init__()
    config = BertConfig.from_pretrained('bert-base-multilingual-cased')
    # Note: BertModel(config) starts from random weights; use
    # BertModel.from_pretrained(...) if pretrained weights are intended.
    self.model = BertModel(config)
    self.decoder = nn.LSTM(768, hidden_dim, n_layers)
    self.hiddentotag = nn.Linear(hidden_dim, tagset_size)
def load_model(self, model_path: str, do_lower_case=False):
    config = BertConfig.from_pretrained(model_path + "/config.json")
    tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=do_lower_case)
    model = BertForQuestionAnswering.from_pretrained(model_path, from_tf=False, config=config)
    return model, tokenizer
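# A minimal usage sketch for the QA loaders above; the checkpoint name,
# question, and context are illustrative, and the model is assumed to return
# (start_logits, end_logits) as in the pytorch_transformers-era API:
model, tokenizer = load_model('bert-large-uncased-whole-word-masking-finetuned-squad')
question = "Who wrote Hamlet?"
context = "Hamlet is a tragedy written by William Shakespeare."
q_tokens, c_tokens = tokenizer.tokenize(question), tokenizer.tokenize(context)
tokens = ['[CLS]'] + q_tokens + ['[SEP]'] + c_tokens + ['[SEP]']
segment_ids = [0] * (len(q_tokens) + 2) + [1] * (len(c_tokens) + 1)
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
with torch.no_grad():
    start_logits, end_logits = model(
        input_ids, token_type_ids=torch.tensor([segment_ids]))[:2]
start, end = start_logits.argmax().item(), end_logits.argmax().item()
print(' '.join(tokens[start:end + 1]).replace(' ##', ''))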
def start_inference(data, dialogue_type, dest, batchsize, bert_model, cuda):
    assert torch.cuda.is_available(), 'PyTorch not running on GPU! #sadpanda'
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(100)
    dialogue_type_dict = {'DB': 'db_response_new', 'normal': 'response'}
    config = BertConfig.from_pretrained(bert_model)
    tokenizer = BertTokenizer.from_pretrained(bert_model)
    # Load the pretrained weights; BertForNextSentencePrediction(config)
    # alone would run inference with randomly initialized parameters.
    model = BertForNextSentencePrediction.from_pretrained(bert_model, config=config)
    model.cuda()
    model.eval()
    df = pd.read_csv(data, usecols=['id'])
    df.dropna(inplace=True)
    row_count = df.shape[0]
    del df
    chunk_count = math.ceil(row_count / batchsize)
    # Truncate the destination file before appending batch results.
    with open(dest, 'w+'):
        pass
    cols = ['context', dialogue_type_dict[dialogue_type]]
    for i, chunk in enumerate(
            tqdm(pd.read_csv(open(data, 'r'), usecols=cols, chunksize=batchsize),
                 desc='Batches', total=chunk_count)):
        samples = get_batch(chunk, dialogue_type_dict[dialogue_type])
        assert len(samples) == chunk.shape[0], 'Some samples went missing!'
        if batchsize == 1:
            results = convert_single_example_to_features(samples, tokenizer)
        else:
            results = convert_examples_to_features(samples, tokenizer)
        with torch.no_grad():
            input_ids = torch.tensor([x.input_ids for x in results]).cuda()
            token_type_ids = torch.tensor([x.input_type_ids for x in results]).cuda()
            attention_mask = torch.tensor([x.input_mask for x in results]).cuda()
            outputs = model(input_ids, token_type_ids=token_type_ids,
                            attention_mask=attention_mask)[0]
            outputs = torch.softmax(outputs, dim=1)
            db_probs = outputs[:, 1]
        with open(dest, 'a') as f:
            f.write('\n'.join([str(x) for x in db_probs.tolist()]) + '\n')
def main():
    torch.cuda.empty_cache()
    parser = setup_parser()
    args = parser.parse_args()
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) \
            and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory already exists and is not empty.")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    args.n_gpu = torch.cuda.device_count()
    args.device = device
    set_seed(args)
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: {}".format(args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    # Load models
    config = BertConfig.from_pretrained(args.config_name)
    tokenizer = BertTokenizer.from_pretrained(args.text_encoder_checkpoint,
                                              do_lower_case=args.do_lower_case)
    text_encoder = BertModel.from_pretrained(args.text_encoder_checkpoint, config=config)
    graph_encoder = GraphEncoder(args.n_hidden, args.min_score)
    medsts_classifier = PairClassifier(config.hidden_size + args.n_hidden, 1)
    medsts_c_classifier = PairClassifier(config.hidden_size + args.n_hidden, 5)
    medsts_c2_classifier = PairClassifier(config.hidden_size + args.n_hidden, 2)
    medsts_type_classifier = PairClassifier(config.hidden_size + args.n_hidden, 4)
    model = MedstsNet(text_encoder, graph_encoder, medsts_classifier,
                      medsts_c_classifier, medsts_c2_classifier,
                      medsts_type_classifier)
    if args.text_only:
        medsts_classifier = PairClassifier(config.hidden_size, 1)
        medsts_c_classifier = PairClassifier(config.hidden_size, 5)
        medsts_c2_classifier = PairClassifier(config.hidden_size, 2)
        medsts_type_classifier = PairClassifier(config.hidden_size, 4)
        model = MedstsNet_Textonly(text_encoder, medsts_classifier,
                                   medsts_c_classifier, medsts_c2_classifier,
                                   medsts_type_classifier)
    model.to(args.device)
    args.n_gpu = 1
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer,
                                                evaluate=False, reverse=True)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info('global step = {}, average loss = {}'.format(global_step, tr_loss))
def main():
    torch.cuda.empty_cache()
    parser = setup_parser()
    args = parser.parse_args()
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) \
            and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory already exists and is not empty.")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    args.n_gpu = torch.cuda.device_count()
    args.device = device
    set_seed(args)
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: {}".format(args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)
    # Load models
    config = BertConfig.from_pretrained(args.config_name)
    tokenizer = BertTokenizer.from_pretrained(args.text_encoder_checkpoint,
                                              do_lower_case=args.do_lower_case)
    text_encoder = BertModel.from_pretrained(args.text_encoder_checkpoint, config=config)
    graph_encoder = GraphEncoder(args.n_hidden, args.min_score)
    if args.graph_encoder_checkpoint:
        graph_encoder.gcnnet.load_state_dict(torch.load(args.graph_encoder_checkpoint))
    medsts_classifier = PairClassifier(config.hidden_size + args.n_hidden, 1)
    medsts_c_classifier = PairClassifier(config.hidden_size + args.n_hidden, 5)
    medsts_type_classifier = PairClassifier(config.hidden_size + args.n_hidden, 4)
    model = MedstsNet(text_encoder, graph_encoder, medsts_classifier,
                      medsts_c_classifier, medsts_type_classifier)
    model.to(args.device)
    args.n_gpu = 1
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer,
                                                evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info('global step = {}, average loss = {}'.format(global_step, tr_loss))
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        logger.info("saving model checkpoint to {}".format(args.output_dir))
        model_to_save = model.module if hasattr(model, 'module') else model
        # model_to_save.save_pretrained(args.output_dir)
        torch.save(model_to_save.state_dict(),
                   os.path.join(args.output_dir, 'saved_model.pth'))
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
def __init__(self, code_length):
    # code_length: the output dimension of the fc projection
    super(TextNet, self).__init__()
    modelConfig = BertConfig.from_pretrained('./data/bert-base-uncased-config.json')
    self.textExtractor = BertModel.from_pretrained(
        './data/bert-base-uncased-pytorch_model.bin', config=modelConfig)
    # self.textExtractor.eval()
    embedding_dim = self.textExtractor.config.hidden_size
def load_artifacts(model_path):
    """Loads pretrained model, tokenizer, config."""
    model_class = BertForQuestionAnswering
    model = model_class.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    config = BertConfig.from_pretrained(model_path)
    model.to("cpu")
    model.eval()
    return model, tokenizer, config
def load_artifacts(model_path):
    """Loads pretrained model, tokenizer, config."""
    model_class = BertForSequenceClassification
    model = model_class.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    config = BertConfig.from_pretrained(model_path)
    model.to("cpu")
    model.eval()
    return model, tokenizer, config
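# A minimal usage sketch for the sequence-classification load_artifacts above,
# assuming a binary classifier checkpoint; the path and example text are
# illustrative:
model, tokenizer, config = load_artifacts('./checkpoints/sst2-bert')
tokens = ['[CLS]'] + tokenizer.tokenize('a gripping, beautifully shot film') + ['[SEP]']
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
with torch.no_grad():
    logits = model(input_ids)[0]          # (1, num_labels)
print(logits.argmax(dim=1).item())        # predicted class index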
def __init__(self, code_length=1024):
    super(TextNet, self).__init__()
    modelConfig = BertConfig.from_pretrained(
        '/home/hengyuli/cross-modal/model/bert_config.json')
    self.textExtractor = BertModel.from_pretrained(
        '/home/hengyuli/cross-modal/model/pytorch_model.bin', config=modelConfig)
    embedding_dim = self.textExtractor.config.hidden_size
    self.fc = nn.Linear(embedding_dim, code_length)
    self.tanh = torch.nn.Tanh()
def __init__(self):
    super(Bert, self).__init__()
    self.tokenizer = BertTokenizer.from_pretrained(
        '../pretrained/bert-base-uncased/bert-base-uncased-vocab.txt')
    modelConfig = BertConfig.from_pretrained(
        '../pretrained/bert-base-uncased/bert_config.json')
    self.textExtractor = BertModel.from_pretrained(
        '../pretrained/bert-base-uncased/pytorch_model.bin', config=modelConfig)
def __init__(self, code_length):
    # code_length: the output dimension of the fc projection
    super(TextNet, self).__init__()
    modelConfig = BertConfig.from_pretrained('bert-base-chinese')
    self.textExtractor = BertModel.from_pretrained('bert-base-chinese',
                                                   config=modelConfig)
    # embedding_dim is the hidden size at the model's truncation point
    embedding_dim = self.textExtractor.config.hidden_size
    self.fc = nn.Linear(embedding_dim, code_length)
    self.tanh = torch.nn.Tanh()
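# The TextNet variants above only show __init__. A minimal sketch of a
# matching forward method, assuming [CLS] pooling of the last hidden layer
# (the original forward pass is not shown in this file):
def forward(self, tokens, segments, input_masks):
    output = self.textExtractor(tokens, token_type_ids=segments,
                                attention_mask=input_masks)
    text_embeddings = output[0][:, 0, :]   # [CLS] vector of each sequence
    features = self.fc(text_embeddings)    # project to code_length
    return self.tanh(features)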
def __init__(self, opt):
    self.opt = opt
    if opt.model_name.lower() in ['vh_bert', 'bert_att', 'my_lcf']:
        tokenizer = BertTokenizer.from_pretrained(opt.pretrained_bert_name)
        config = BertConfig.from_pretrained(opt.pretrained_bert_name,
                                            output_attentions=True)
        self.model = opt.model_class(config).to(opt.device)
    elif 'bert' in opt.model_name.lower():
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        config = BertConfig.from_pretrained(opt.pretrained_bert_name,
                                            output_attentions=True)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name, config=config)
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='./cache/{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='./cache/{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def build_model(do_lower_case, num_labels):
    config = BertConfig.from_pretrained(MODEL_NAME, num_labels=num_labels,
                                        output_hidden_states=True)
    tokenizer = TOKENIZER_CLASS.from_pretrained(MODEL_NAME, do_lower_case=do_lower_case)
    model = MODEL_CLASS.from_pretrained(MODEL_NAME, config=config)
    model.to(get_device())
    return model, tokenizer
def __init__(self, code_length):
    # code_length: the output dimension of the fc projection
    super(TextNet, self).__init__()
    # model_name was only present as a comment in the original, which would
    # raise a NameError; define it before use.
    model_name = 'bert-base-multilingual-cased'
    modelConfig = BertConfig.from_pretrained(model_name)
    self.textExtractor = BertModel.from_pretrained(model_name, config=modelConfig)
    # embedding_dim is the hidden size at the model's truncation point
    embedding_dim = self.textExtractor.config.hidden_size
    self.fc = nn.Linear(embedding_dim, code_length)  # code_length is the feature dimension
    self.tanh = torch.nn.Tanh()
def __init__(self, model_name: str) -> None:
    super().__init__()
    config = BertConfig.from_pretrained(model_name)
    self.input_dim = config.hidden_size
    self.output_dim = config.vocab_size
    # TODO(mattg): It's possible that we could use some kind of cache like we have in
    # allennlp.modules.token_embedders.bert_token_embedder.PretrainedBertModel. That way, we
    # would only load the BERT weights once. Though, it's not clear how to do that here, as we
    # need to load `BertForMaskedLM`, not just `BertModel`...
    bert_model = BertForMaskedLM.from_pretrained(model_name)
    self.bert_lm_head = bert_model.cls  # pylint: disable=no-member
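# A minimal sketch of applying the captured bert_lm_head above: it maps
# encoder hidden states to vocabulary logits. Shapes assume a bert-base
# model (hidden size 768), and `lm` names an instance of the class above;
# both are assumptions for illustration:
hidden_states = torch.randn(1, 7, 768)     # (batch, seq_len, hidden_size)
logits = lm.bert_lm_head(hidden_states)    # (batch, seq_len, vocab_size)
assert logits.shape[-1] == lm.output_dim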
def __init__(self, model_state_dict) -> None:
    no_cuda = True
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-chinese',
                                                   do_lower_case=False)
    config = BertConfig.from_pretrained('bert-base-chinese')
    self.model = BertForQuestionAnswering(config)
    self.model.load_state_dict(torch.load(model_state_dict, map_location='cpu'))
    self.model.to(self.device)
    self.model.eval()
    # TODO
def __init__(self, code_length):
    super(TextNet, self).__init__()
    modelConfig = BertConfig.from_pretrained(
        '/home/disk1/zhaoyuying/models/modeling_bert/bert-base-uncased-config.json')
    self.textExtractor = BertModel.from_pretrained(
        '/home/disk1/zhaoyuying/models/modeling_bert/bert-base-uncased-pytorch_model.bin',
        config=modelConfig)
    embedding_dim = self.textExtractor.config.hidden_size
    self.fc = nn.Linear(embedding_dim, code_length)
    self.tanh = torch.nn.Tanh()
def load_pretrained_model(model_path: str, lower_case=True):
    """
    Imports pretrained BERT model from the official format as seen on:
    https://github.com/google-research/bert

    :param model_path: Path to the model checkpoint file
    :param lower_case: select False if loading cased model
    :return: pretrained model and its tokenizer
    """
    config = BertConfig.from_pretrained(model_path + "/bert_config.json")
    tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=lower_case)
    model = BertForQuestionAnswering.from_pretrained(model_path, from_tf=False, config=config)
    return model, tokenizer
def init(maxlen=512):
    global config, tokenizer, model, sim_model, MAX_LENGTH
    MAX_LENGTH = maxlen
    bert_model_name = 'bert-base-uncased'
    config = BertConfig.from_pretrained(bert_model_name)
    config.output_hidden_states = True
    tokenizer = BertTokenizer.from_pretrained(bert_model_name)
    model = BertForMaskedLM.from_pretrained(bert_model_name, config=config)
    model.to(DEVICE)
    model.eval()
    sim_model = smodel.WebBertSimilarity(device=DEVICE)
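# A minimal usage sketch for init() above: predict a masked token with the
# loaded masked LM. It relies on the module-level globals init() populates;
# the example sentence is illustrative:
init()
tokens = ['[CLS]', 'the', 'capital', 'of', 'france', 'is', '[MASK]', '.', '[SEP]']
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)]).to(DEVICE)
with torch.no_grad():
    logits = model(input_ids)[0]            # (1, seq_len, vocab_size)
pred_id = logits[0, tokens.index('[MASK]')].argmax().item()
print(tokenizer.convert_ids_to_tokens([pred_id])[0])   # likely 'paris'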
def train(
    root=True,
    binary=False,
    bert="bert-large-uncased",
    epochs=30,
    batch_size=8,
    save=False,
):
    trainset = SSTDataset("train", root=root, binary=binary)
    devset = SSTDataset("dev", root=root, binary=binary)
    testset = SSTDataset("test", root=root, binary=binary)
    config = BertConfig.from_pretrained(bert)
    if not binary:
        config.num_labels = 5
    model = BertForSequenceClassification.from_pretrained(bert, config=config)
    model = model.to(device)
    lossfn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    # range(1, epochs) would run one epoch too few; use epochs + 1 so the
    # loop actually runs `epochs` times.
    for epoch in range(1, epochs + 1):
        train_loss, train_acc = train_one_epoch(model, lossfn, optimizer,
                                                trainset, batch_size=batch_size)
        val_loss, val_acc = evaluate_one_epoch(model, lossfn, optimizer,
                                               devset, batch_size=batch_size)
        test_loss, test_acc = evaluate_one_epoch(model, lossfn, optimizer,
                                                 testset, batch_size=batch_size)
        logger.info(f"epoch={epoch}")
        logger.info(
            f"train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, test_loss={test_loss:.4f}"
        )
        logger.info(
            f"train_acc={train_acc:.3f}, val_acc={val_acc:.3f}, test_acc={test_acc:.3f}"
        )
        if save:
            label = "binary" if binary else "fine"
            nodes = "root" if root else "all"
            torch.save(model, f"{bert}__{nodes}__{label}__e{epoch}.pickle")
    logger.success("Done!")
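# An illustrative invocation of train() above, sized for a quick smoke test;
# the argument values are assumptions, not recommended settings:
train(root=True, binary=True, bert="bert-base-uncased", epochs=2,
      batch_size=8, save=False)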
def load_artifacts(model_path, is_quantized=False):
    """Loads pretrained model, tokenizer, config."""
    model_class = BertForSequenceClassification
    print("quantized_ouput/" if is_quantized else model_path)
    if not is_quantized:
        model = model_class.from_pretrained(model_path)
    else:
        model = torch.load("4bit_quantized_model.bin")
    tokenizer = BertTokenizer.from_pretrained(model_path)
    config = BertConfig.from_pretrained(
        "quantized_ouput/" if is_quantized else model_path)
    model.to("cpu")
    model.eval()
    return model, tokenizer, config