Example #1
def load_classification_model():
    # Expose the trainer and tokenizer as module-level globals for later use
    global trainer
    global tokenizer
    mod = 'mtn_models/pytorch_model.bin'
    tok = 'mtn_models/vocab.txt'
    conf = 'mtn_models/config.json'
    # never_split_tokens is a project-level list of tokens the tokenizer must keep intact
    tokenizer = BertTokenizer.from_pretrained(tok,
                                              do_lower_case=False,
                                              do_basic_tokenize=True,
                                              never_split=never_split_tokens,
                                              truncation=True)
    # Six-way sequence classification head on top of the BERT encoder
    config = PretrainedConfig.from_pretrained(conf, num_labels=6)
    model = BertForSequenceClassification.from_pretrained(mod, config=config)

    training_args = TrainingArguments("./train")

    training_args.do_train = True
    training_args.evaluate_during_training = True
    training_args.adam_epsilon = 1e-8
    training_args.learning_rate = 2e-5
    training_args.warmup_steps = 0
    training_args.per_gpu_train_batch_size = 16
    training_args.per_gpu_eval_batch_size = 16
    training_args.num_train_epochs = 3
    #training_args.logging_steps = (len(train_features) - 1) // training_args.per_gpu_train_batch_size + 1
    training_args.save_steps = training_args.logging_steps
    training_args.seed = 42

    trainer = Trainer(model=model, args=training_args)
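
A minimal inference sketch for the globals set above; the sample sentence is a placeholder, and trainer.model is simply the underlying BERT classifier, so this only illustrates one way the loaded objects could be used.

import torch

load_classification_model()
inputs = tokenizer("Sample sentence to classify", return_tensors="pt", truncation=True)
with torch.no_grad():
    logits = trainer.model(**inputs).logits
print(int(torch.argmax(logits, dim=-1)))  # index of the predicted label (0-5)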
Example #2
	def load_model_from_s3(self, model_path:str, s3_bucket: str, model_prefix:str):
		if model_path and s3_bucket and model_prefix:
			obj = s3.get_object(Bucket=s3_bucket, Key=model_prefix)

			config = PretrainedConfig.from_pretrained(f'{model_path}/model_config.json')
			state = torch.load(io.BytesIO(obj['Body'].read()))

			model = MarianMTModel.from_pretrained(
					pretrained_model_name_or_path=None, state_dict=state, config=config)

			return model
		else:
			raise KeyError('Error loading model from s3')
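
The method above relies on a module-level s3 client and a few imports the snippet does not show; a minimal sketch of that surrounding setup might look like the following, with boto3 assumed as the S3 client library.

import io
import boto3
import torch
from transformers import PretrainedConfig, MarianMTModel

s3 = boto3.client('s3')  # the module-level client the method calls get_object on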
Example #3
def __init__(self):
    # Turkish NER model and tokenizer, wired into both a standard and a grouped-entities pipeline
    self.tokenizer = AutoTokenizer.from_pretrained(
        "Alaeddin/convbert-base-turkish-ner-cased")
    self.model = AutoModelForTokenClassification.from_pretrained(
        "Alaeddin/convbert-base-turkish-ner-cased")
    self.config = PretrainedConfig.from_pretrained(
        "Alaeddin/convbert-base-turkish-ner-cased")
    self.pipeline = pipeline('ner',
                             model=self.model,
                             tokenizer=self.tokenizer,
                             config=self.config)
    self.nlp = spacy.load("en_core_web_sm")
    self.nlp_grouped = TokenClassificationPipeline(
        model=self.model, tokenizer=self.tokenizer, grouped_entities=True)
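
A short usage sketch for the pipelines built in this constructor; the wrapper class name and the Turkish sample sentence are placeholders, not part of the original example.

ner = NerService()  # hypothetical class that owns the __init__ above
for ent in ner.nlp_grouped("Mustafa Kemal Atatürk 1919'da Samsun'a çıktı."):
    print(ent["entity_group"], ent["word"], round(float(ent["score"]), 3))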
Example #4
def run_multi_process_generation(args):
    config = PretrainedConfig.from_pretrained(args.model_name_or_path,
                                              cache_dir=args.cache_dir)

    # get model type from saved config
    if hasattr(config, 'model_type'):
        args.model_type = getattr(config, 'model_type')
        # bart and mbart share the same config
        # check which model we are actually using
        if args.model_type == 'bart':
            try:
                if config.normalize_before and config.add_final_layer_norm and config.scale_embedding:
                    args.model_type = 'mbart'
            except AttributeError as e:
                args.model_type = 'bart'
    else:
        raise ValueError('Model should be either GPT2, BART, MBART, or Marian')

    if args.trained_model_type and args.trained_model_type != '' and args.model_type != args.trained_model_type:
        raise ValueError(
            'The loaded model type does not match with what the user provided')

    if args.model_type == 'marian' and args.model_name_or_path.rsplit(
            '-', 1)[1] in MARIAN_GROUP_MEMBERS:
        if not args.tgt_lang:
            raise ValueError(
                'For translation task using Marian model, if target language is a group of languages, '
                'you have to specify the --tgt_lang flag.')
        elif args.tgt_lang not in MARIAN_GROUP_MEMBERS[
                args.model_name_or_path.rsplit('-', 1)[1]]:
            raise ValueError(
                'Target language is not in the model group languages, please specify the correct target language.'
            )

    if args.model_type == 'marian' and args.model_name_or_path.rsplit(
            '-', 1)[1] not in MARIAN_GROUP_MEMBERS and args.tgt_lang:
        logger.warning(
            'Target language should not be provided when using models with single language pairs, '
            'otherwise the translation outputs will be incorrect; thus we ignore the target language you provided...'
        )
        args.tgt_lang = None

    if args.model_type == 'mbart' and not (args.tgt_lang and args.src_lang):
        raise ValueError(
            'Source and Target language should be provided when using mBART cc25 model'
        )

    if args.prompt_column is not None and args.copy is not None and args.copy != 0:
        raise ValueError(
            'Cannot copy from the input and use prompt at the same time. Disable either --copy or --prompt_column.'
        )

    if args.gold_column is None:
        args.gold_column = args.input_column
    args.device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()

    if args.output_file is not None:
        if not os.path.exists(os.path.dirname(args.output_file)):
            os.makedirs(os.path.dirname(args.output_file), exist_ok=False)

    set_seed(args)

    if args.n_gpu > 1:
        if args.input_file is None:
            raise ValueError(
                'Cannot use multiple GPUs when reading from stdin. You should provide an --input_file'
            )
        # Independent multi-GPU generation
        all_processes = []
        all_input_files = split_file_on_disk(args.input_file, args.n_gpu)
        for gpu_idx in range(args.n_gpu):
            copy_args = copy.copy(args)
            if torch.cuda.is_available() and not args.no_cuda:
                copy_args.device = torch.device("cuda:" + str(gpu_idx))
            copy_args.n_gpu = 1
            copy_args.input_file = all_input_files[gpu_idx]
            copy_args.output_file = get_part_path(args.output_file, gpu_idx)

            p = Process(target=run_single_process_generation,
                        args=(copy_args, config))
            all_processes.append(p)
            p.start()

        for p in all_processes:
            p.join()

        for file in all_input_files:
            os.remove(file)
        combine_files_on_disk(args.output_file,
                              args.n_gpu,
                              line_group_size=sum(args.num_samples),
                              delete=True)

    else:
        run_single_process_generation(args, config)
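
The helpers split_file_on_disk, get_part_path, and combine_files_on_disk come from the example's own project rather than from transformers; a rough sketch of what the first two might do is shown below, purely to make the multi-GPU fan-out readable (the real implementations may split and recombine the files differently).

def split_file_on_disk(path, n_parts):
    # Naive sketch: write every n_parts-th line to its own part file and return the paths.
    with open(path) as f:
        lines = f.readlines()
    part_paths = []
    for i in range(n_parts):
        part_path = f'{path}.part{i}'
        with open(part_path, 'w') as out:
            out.writelines(lines[i::n_parts])
        part_paths.append(part_path)
    return part_paths

def get_part_path(path, idx):
    # Per-GPU output file derived from the requested output file name.
    return f'{path}.part{idx}'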
Example #5
def run_multi_process_generation(args):
    config = PretrainedConfig.from_pretrained(args.model_name_or_path,
                                              cache_dir=args.cache_dir)

    # get model type from saved config
    if hasattr(config, 'model_type'):
        args.model_type = getattr(config, 'model_type')
        if args.model_type == 'mbart' and '-50-' in args.model_name_or_path:
            args.model_type = 'mbart50'
    else:
        raise ValueError('Model should be either GPT2, BART, MBART, or Marian')

    # check arguments validity
    check_args(args)

    if sum([
            args.mask_tokens, args.delete_tokens, args.infill_text,
            args.permute_sentences, args.rotate_sentence
    ]) >= 2:
        raise ValueError(
            'Mixing denoising techniques is unlikely to work. Please use one method per run'
        )

    if (args.mask_tokens or args.delete_tokens
            or args.rotate_sentence) and args.model_type == 'mbart':
        raise ValueError(
            'MBART is pretrained only with text_infilling and permute_sentences noising methods. '
            'Applying other noising techniques is unlikely to work')

    if args.trained_model_type and args.trained_model_type != '' and args.model_type != args.trained_model_type:
        raise ValueError(
            'The loaded model type does not match with what the user provided')

    if args.prompt_column is not None and args.copy is not None and args.copy != 0:
        raise ValueError(
            'Cannot copy from the input and use prompt at the same time. Disable either --copy or --prompt_column.'
        )

    if args.gold_column is None:
        args.gold_column = args.input_column
    args.device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()

    if args.output_file is not None:
        if not os.path.exists(os.path.dirname(args.output_file)):
            os.makedirs(os.path.dirname(args.output_file), exist_ok=False)

    set_seed(args)

    if args.n_gpu > 1:
        if args.input_file is None:
            raise ValueError(
                'Cannot use multiple GPUs when reading from stdin. You should provide an --input_file'
            )
        logger.info('Running generation in parallel on {} GPUs'.format(
            args.n_gpu))
        # Independent multi-GPU generation
        all_processes = []
        all_input_files = split_file_on_disk(args.input_file, args.n_gpu)
        for gpu_idx in range(args.n_gpu):
            copy_args = copy.copy(args)
            if torch.cuda.is_available() and not args.no_cuda:
                copy_args.device = torch.device("cuda:" + str(gpu_idx))
            copy_args.n_gpu = 1
            copy_args.input_file = all_input_files[gpu_idx]
            copy_args.output_file = get_part_path(args.output_file, gpu_idx)

            p = Process(target=run_single_process_generation,
                        args=(copy_args, config))
            all_processes.append(p)
            p.start()

        for p in all_processes:
            p.join()

        for file in all_input_files:
            os.remove(file)
        combine_files_on_disk(args.output_file,
                              args.n_gpu,
                              line_group_size=sum(args.num_samples),
                              delete=True)

    else:
        run_single_process_generation(args, config)
Example #6
from transformers import PretrainedConfig, AutoTokenizer, MarianMTModel, MarianTokenizer
import torch

model_path = './model'
config = PretrainedConfig.from_pretrained(f'{model_path}/model_config.json')
state = torch.load(f'{model_path}/Marian_pytorch_model_fr-en.bin')

src = 'fr'
trg = 'en'
mname = f'Helsinki-NLP/opus-mt-{src}-{trg}'
text = 'bonjour'

model = MarianMTModel.from_pretrained(pretrained_model_name_or_path=None,
                                      state_dict=state,
                                      config=config)

# src = 'en'
# trg = 'fr'
# mname = f'Helsinki-NLP/opus-mt-{src}-{trg}'
# text = 'hello'
# tokenizer = MarianTokenizer.from_pretrained(mname)#.save_pretrained('./model')
tokenizer = AutoTokenizer.from_pretrained(
    './model')  #.save_pretrained('./model')
tokenized_text = tokenizer.encode(text, return_tensors='pt')
translation = model.generate(tokenized_text)
s = tokenizer.batch_decode(translation, skip_special_tokens=True)[0]
print(s)
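
The commented-out lines hint at how the local ./model directory was produced; one plausible way to create it beforehand, reusing the file names from the paths above, is:

import torch
from transformers import MarianMTModel, MarianTokenizer

mname = 'Helsinki-NLP/opus-mt-fr-en'
tokenizer = MarianTokenizer.from_pretrained(mname)
model = MarianMTModel.from_pretrained(mname)

tokenizer.save_pretrained('./model')                                       # tokenizer + vocab files
model.config.to_json_file('./model/model_config.json')                     # config read back above
torch.save(model.state_dict(), './model/Marian_pytorch_model_fr-en.bin')   # weights read back above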