def load_classification_model():
    global trainer
    global tokenizer

    mod = 'mtn_models/pytorch_model.bin'
    tok = 'mtn_models/vocab.txt'
    conf = 'mtn_models/config.json'

    tokenizer = BertTokenizer.from_pretrained(tok,
                                              do_lower_case=False,
                                              do_basic_tokenize=True,
                                              never_split=never_split_tokens,
                                              truncation=True)
    config = PretrainedConfig.from_pretrained(conf, num_labels=6)
    model = BertForSequenceClassification.from_pretrained(mod, config=config)

    training_args = TrainingArguments("./train")
    training_args.do_train = True
    training_args.evaluate_during_training = True
    training_args.adam_epsilon = 1e-8
    training_args.learning_rate = 2e-5
    training_args.warmup_steps = 0
    training_args.per_gpu_train_batch_size = 16
    training_args.per_gpu_eval_batch_size = 16
    training_args.num_train_epochs = 3
    # training_args.logging_steps = (len(train_features) - 1) // training_args.per_gpu_train_batch_size + 1
    training_args.save_steps = training_args.logging_steps
    training_args.seed = 42

    trainer = Trainer(model=model, args=training_args)
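# Hedged usage sketch (not part of the original snippet): it assumes
# load_classification_model() above has been called so that the `tokenizer`
# and `trainer` globals are populated, and that the mtn_models checkpoint is
# available locally. The input sentence is purely illustrative.
import torch

load_classification_model()
inputs = tokenizer("an example sentence to classify", return_tensors="pt", truncation=True)
inputs = {k: v.to(trainer.args.device) for k, v in inputs.items()}  # keep tensors on the Trainer's device
with torch.no_grad():
    outputs = trainer.model(**inputs)
logits = outputs[0]  # first element is the logits, whether a tuple or a ModelOutput is returned
predicted_label = int(torch.argmax(logits, dim=-1))
print(predicted_label)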
def load_model_from_s3(self, model_path: str, s3_bucket: str, model_prefix: str):
    if model_path and s3_bucket and model_prefix:
        obj = s3.get_object(Bucket=s3_bucket, Key=model_prefix)
        config = PretrainedConfig.from_pretrained(f'{model_path}/model_config.json')
        state = torch.load(io.BytesIO(obj['Body'].read()))
        model = MarianMTModel.from_pretrained(pretrained_model_name_or_path=None,
                                              state_dict=state,
                                              config=config)
        return model
    else:
        raise KeyError('Error loading model from s3')
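# A minimal standalone sketch of the same flow as load_model_from_s3 above.
# The bucket name, object key, and local config path are hypothetical; the
# original method additionally assumes a module-level boto3 client named `s3`.
import io

import boto3
import torch
from transformers import MarianMTModel, PretrainedConfig

s3 = boto3.client('s3')
obj = s3.get_object(Bucket='my-model-bucket', Key='marian/pytorch_model.bin')  # hypothetical bucket/key
config = PretrainedConfig.from_pretrained('./model/model_config.json')  # local config saved beforehand
state = torch.load(io.BytesIO(obj['Body'].read()), map_location='cpu')
model = MarianMTModel.from_pretrained(pretrained_model_name_or_path=None,
                                      state_dict=state,
                                      config=config)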
def __init__(self):
    self.tokenizer = AutoTokenizer.from_pretrained(
        "Alaeddin/convbert-base-turkish-ner-cased")
    self.model = AutoModelForTokenClassification.from_pretrained(
        "Alaeddin/convbert-base-turkish-ner-cased")
    self.config = PretrainedConfig.from_pretrained(
        "Alaeddin/convbert-base-turkish-ner-cased")
    self.pipeline = pipeline('ner',
                             model=self.model,
                             tokenizer=self.tokenizer,
                             config=self.config)
    self.nlp = spacy.load("en_core_web_sm")
    self.nlp_grouped = TokenClassificationPipeline(model=self.model,
                                                   tokenizer=self.tokenizer,
                                                   grouped_entities=True)
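# A minimal standalone sketch of how the NER pipeline assembled in __init__
# above might be used; the example sentence is illustrative and the pipeline is
# rebuilt here so the sketch runs on its own.
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

tokenizer = AutoTokenizer.from_pretrained("Alaeddin/convbert-base-turkish-ner-cased")
model = AutoModelForTokenClassification.from_pretrained("Alaeddin/convbert-base-turkish-ner-cased")
ner = pipeline('ner', model=model, tokenizer=tokenizer)

print(ner("Mustafa Kemal Atatürk 1881'de Selanik'te doğdu."))  # token-level entity predictions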
def run_multi_process_generation(args):
    config = PretrainedConfig.from_pretrained(args.model_name_or_path,
                                              cache_dir=args.cache_dir)

    # get model type from saved config
    if hasattr(config, 'model_type'):
        args.model_type = getattr(config, 'model_type')
        # bart and mbart share the same config
        # check which model we are actually using
        if args.model_type == 'bart':
            try:
                if config.normalize_before and config.add_final_layer_norm and config.scale_embedding:
                    args.model_type = 'mbart'
            except AttributeError:
                args.model_type = 'bart'
    else:
        raise ValueError('Model should be either GPT2, BART, MBART, or Marian')

    if args.trained_model_type and args.trained_model_type != '' and args.model_type != args.trained_model_type:
        raise ValueError(
            'The loaded model type does not match with what the user provided')

    if args.model_type == 'marian' and args.model_name_or_path.rsplit('-', 1)[1] in MARIAN_GROUP_MEMBERS:
        if not args.tgt_lang:
            raise ValueError(
                'For translation task using Marian model, if target language is a group of languages, '
                'you have to specify the --tgt_lang flag.')
        elif args.tgt_lang not in MARIAN_GROUP_MEMBERS[args.model_name_or_path.rsplit('-', 1)[1]]:
            raise ValueError(
                'Target language is not in the model group languages, please specify the correct target language.')

    if args.model_type == 'marian' and args.model_name_or_path.rsplit('-', 1)[1] not in MARIAN_GROUP_MEMBERS and args.tgt_lang:
        logger.warning(
            'Target language should not be provided when using models with single language pairs, '
            'otherwise the translation outputs will be incorrect; thus we ignore the target language you provided...')
        args.tgt_lang = None

    if args.model_type == 'mbart' and not (args.tgt_lang and args.src_lang):
        raise ValueError(
            'Source and Target language should be provided when using mBART cc25 model')

    if args.prompt_column is not None and args.copy is not None and args.copy != 0:
        raise ValueError(
            'Cannot copy from the input and use prompt at the same time. Disable either --copy or --prompt_column.')

    if args.gold_column is None:
        args.gold_column = args.input_column

    args.device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()

    if args.output_file is not None:
        if not os.path.exists(os.path.dirname(args.output_file)):
            os.makedirs(os.path.dirname(args.output_file), exist_ok=False)

    set_seed(args)

    if args.n_gpu > 1:
        if args.input_file is None:
            raise ValueError(
                'Cannot use multiple GPUs when reading from stdin. You should provide an --input_file')
        # Independent multi-GPU generation
        all_processes = []
        all_input_files = split_file_on_disk(args.input_file, args.n_gpu)
        for gpu_idx in range(args.n_gpu):
            copy_args = copy.copy(args)
            if torch.cuda.is_available() and not args.no_cuda:
                copy_args.device = torch.device("cuda:" + str(gpu_idx))
            copy_args.n_gpu = 1
            copy_args.input_file = all_input_files[gpu_idx]
            copy_args.output_file = get_part_path(args.output_file, gpu_idx)

            p = Process(target=run_single_process_generation,
                        args=(copy_args, config))
            all_processes.append(p)
            p.start()

        for p in all_processes:
            p.join()

        for file in all_input_files:
            os.remove(file)
        combine_files_on_disk(args.output_file,
                              args.n_gpu,
                              line_group_size=sum(args.num_samples),
                              delete=True)
    else:
        run_single_process_generation(args, config)
def run_multi_process_generation(args):
    config = PretrainedConfig.from_pretrained(args.model_name_or_path,
                                              cache_dir=args.cache_dir)

    # get model type from saved config
    if hasattr(config, 'model_type'):
        args.model_type = getattr(config, 'model_type')
        if args.model_type == 'mbart' and '-50-' in args.model_name_or_path:
            args.model_type = 'mbart50'
    else:
        raise ValueError('Model should be either GPT2, BART, MBART, or Marian')

    # check arguments validity
    check_args(args)

    if sum([
            args.mask_tokens, args.delete_tokens, args.infill_text,
            args.permute_sentences, args.rotate_sentence
    ]) >= 2:
        raise ValueError(
            'Mixing denoising techniques is unlikely to work. Please use one method per run')

    if (args.mask_tokens or args.delete_tokens or args.rotate_sentence) and args.model_type == 'mbart':
        raise ValueError(
            'MBART is pretrained only with text_infilling and permute_sentences noising methods. '
            'Applying other noising techniques is unlikely to work')

    if args.trained_model_type and args.trained_model_type != '' and args.model_type != args.trained_model_type:
        raise ValueError(
            'The loaded model type does not match with what the user provided')

    if args.prompt_column is not None and args.copy is not None and args.copy != 0:
        raise ValueError(
            'Cannot copy from the input and use prompt at the same time. Disable either --copy or --prompt_column.')

    if args.gold_column is None:
        args.gold_column = args.input_column

    args.device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()

    if args.output_file is not None:
        if not os.path.exists(os.path.dirname(args.output_file)):
            os.makedirs(os.path.dirname(args.output_file), exist_ok=False)

    set_seed(args)

    if args.n_gpu > 1:
        if args.input_file is None:
            raise ValueError(
                'Cannot use multiple GPUs when reading from stdin. You should provide an --input_file')
        logger.info('Running generation in parallel on {} GPUs'.format(args.n_gpu))
        # Independent multi-GPU generation
        all_processes = []
        all_input_files = split_file_on_disk(args.input_file, args.n_gpu)
        for gpu_idx in range(args.n_gpu):
            copy_args = copy.copy(args)
            if torch.cuda.is_available() and not args.no_cuda:
                copy_args.device = torch.device("cuda:" + str(gpu_idx))
            copy_args.n_gpu = 1
            copy_args.input_file = all_input_files[gpu_idx]
            copy_args.output_file = get_part_path(args.output_file, gpu_idx)

            p = Process(target=run_single_process_generation,
                        args=(copy_args, config))
            all_processes.append(p)
            p.start()

        for p in all_processes:
            p.join()

        for file in all_input_files:
            os.remove(file)
        combine_files_on_disk(args.output_file,
                              args.n_gpu,
                              line_group_size=sum(args.num_samples),
                              delete=True)
    else:
        run_single_process_generation(args, config)
from transformers import PretrainedConfig, AutoTokenizer, MarianMTModel, MarianTokenizer
import torch

model_path = './model'
config = PretrainedConfig.from_pretrained(f'{model_path}/model_config.json')
state = torch.load(f'{model_path}/Marian_pytorch_model_fr-en.bin')

src = 'fr'
trg = 'en'
mname = f'Helsinki-NLP/opus-mt-{src}-{trg}'
text = 'bonjour'

model = MarianMTModel.from_pretrained(pretrained_model_name_or_path=None,
                                      state_dict=state,
                                      config=config)

# src = 'en'
# trg = 'fr'
# mname = f'Helsinki-NLP/opus-mt-{src}-{trg}'
# text = 'hello'
# tokenizer = MarianTokenizer.from_pretrained(mname)  #.save_pretrained('./model')

tokenizer = AutoTokenizer.from_pretrained('./model')  #.save_pretrained('./model')

tokenized_text = tokenizer.encode(text, return_tensors='pt')
translation = model.generate(tokenized_text)
s = tokenizer.batch_decode(translation, skip_special_tokens=True)[0]
print(s)