Python AutoModelForTokenClassificationの例、transformers.AutoModelForTokenClassification Pythonの例

コード例 #1

0

ファイルを表示

    def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
        """Fetch the 'bert.ner' weights and load the model and tokenizer from them.

        Args:
            cache_dir: Forwarded to ``download_model`` as its second argument.
            verbose: Forwarded to ``download_model``.
        """
        from transformers import AutoModelForTokenClassification, AutoTokenizer

        # Tag inventory exposed on the instance.
        self.label_list = [
            "O",
            "B-MISC", "I-MISC",
            "B-PER", "I-PER",
            "B-ORG", "I-ORG",
            "B-LOC", "I-LOC",
        ]

        # download the model or load the model path
        model_dir = download_model(
            'bert.ner',
            cache_dir,
            process_func=_unzip_process_func,
            verbose=verbose,
        )

        # Model and tokenizer are both read from the same location.
        self.model = AutoModelForTokenClassification.from_pretrained(model_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)

コード例 #2

0

ファイルを表示

ファイル: packer.py プロジェクト: jackyzha0/versailles

def pack_ner():
    """Bundle a pretrained NER model and tokenizer into an ``NERService``.

    The model and tokenizer are packed together under the "model" artifact
    key, the service is saved, and the value returned by ``svc.save()`` is
    printed.
    """
    svc = NERService()

    # The tokenizer comes from a different checkpoint than the model weights.
    ner_model = AutoModelForTokenClassification.from_pretrained(
        "dbmdz/bert-large-cased-finetuned-conll03-english")
    ner_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

    svc.pack("model", {"model": ner_model, "tokenizer": ner_tokenizer})
    print(f"NER service packed: {svc.save()}")

コード例 #3

0

ファイルを表示

    def __init__(self,
                 model_type: str = "BERT",
                 model_name: str = "dslim/bert-base-NER",
                 load_path: str = ""):
        """Load a token-classification model and build its pipeline and trainer.

        Args:
            model_type: Passed to ``get_adaptor``, the base initializer and
                ``TOCTrainer``.
            model_name: Checkpoint to load when ``load_path`` is empty.
            load_path: When non-empty, the model is loaded from here instead
                of from ``model_name``.
        """
        self.adaptor = get_adaptor(model_type)

        # A non-empty load_path takes precedence over the named checkpoint.
        checkpoint = load_path if load_path != "" else model_name
        model = AutoModelForTokenClassification.from_pretrained(checkpoint)

        super().__init__(model_type, model_name, model)

        # self.model, self.tokenizer, self._device and self.logger are not
        # assigned in this method; presumably the base initializer sets
        # them -- TODO confirm.
        cuda_index = detect_cuda_device_number()
        self._pipeline = TokenClassificationPipeline(
            model=self.model,
            tokenizer=self.tokenizer,
            device=cuda_index,
        )
        self._trainer = TOCTrainer(
            self.model,
            model_type,
            self.tokenizer,
            self._device,
            self.logger,
        )

コード例 #4

0

ファイルを表示

 def _build_model(self):
     """Create the token-classification model and attach the configured adapter.

     Returns:
         The loaded model. The value returned by ``load_adapter`` is stored
         on ``self._adapter_internal_name``.
     """
     label_to_id = {name: idx for idx, name in enumerate(self._labels)}
     config = AutoConfig.from_pretrained(
         self._model_name,
         num_labels=len(self._labels),
         id2label=self._label_map,
         label2id=label_to_id,
     )
     model = AutoModelForTokenClassification.from_pretrained(
         self._model_name,
         config=config,
     )
     adapter_handle = model.load_adapter(self._adapter_name, "text_task")
     self._adapter_internal_name = adapter_handle
     return model

コード例 #5

0

ファイルを表示

ファイル: run_ptst.py プロジェクト: kmkurn/ptst-semeval2021

def evaluate(
    _log,
    _run,
    temperature=1.0,
    artifacts_dir="artifacts",
    load_params="model.pth",
    device="cpu",
    save_confusion_matrix=False,
):
    """Evaluate a trained target model.

    Args:
        _log: Logger used for progress messages.
        _run: Forwarded to ``print_accs`` as ``run``.
        temperature: Forwarded to ``RoBERTagger``.
        artifacts_dir: Directory holding the parameter file; the labels list
            and confusion matrix are also written here.
        load_params: File name of the saved state dict inside ``artifacts_dir``.
        device: Device the model is moved to after its parameters are loaded.
        save_confusion_matrix: Forwarded to ``run_eval`` as ``confusion``.

    Returns:
        The ``"f1"`` entry of the evaluation scores.
    """
    model_name = "clulab/roberta-timex-semeval"
    _log.info("Loading %s", model_name)
    config = AutoConfig.from_pretrained(model_name)
    base_classifier = AutoModelForTokenClassification.from_pretrained(
        model_name, config=config)
    model = RoBERTagger(base_classifier, config.num_labels, temperature)

    artifacts_dir = Path(artifacts_dir)
    params_path = artifacts_dir / load_params
    _log.info("Loading model parameters from %s", params_path)
    # The state dict is mapped to CPU first, then the model is moved.
    model.load_state_dict(torch.load(params_path, "cpu"))
    model.to(device)

    _log.info("Evaluating")
    eval_score, _ = run_eval(
        model,
        config.id2label,
        read_samples_(),
        confusion=save_confusion_matrix,
    )
    confusion = eval_score.pop("confusion", None)
    print_accs(eval_score, on="test", run=_run)

    if confusion is None:
        return eval_score["f1"]

    # Gather every label that occurs in a confusion key; "O" always gets
    # index 0 and the remaining labels follow in sorted order.
    seen = {label for pair in confusion for label in pair}
    seen.discard("O")
    labels = ["O"] + sorted(seen)

    index_of = {label: idx for idx, label in enumerate(labels)}
    matrix = np.zeros((len(labels), len(labels)))
    for pair, count in confusion.items():
        matrix[index_of[pair[0]], index_of[pair[1]]] = count

    labels_path = artifacts_dir / "labels.pkl"
    _log.info("Saving labels list in %s", labels_path)
    with open(labels_path, "wb") as f:
        pickle.dump(labels, f)

    matrix_path = artifacts_dir / "confusion.npy"
    _log.info("Saving confusion matrix in %s", matrix_path)
    np.save(matrix_path, matrix)

    return eval_score["f1"]

コード例 #6

0

ファイルを表示

def load_model(pred_config, args, device):
    """Load the trained token-classification model in evaluation mode.

    Args:
        pred_config: Object whose ``model_dir`` must exist on disk.
        args: Object whose ``model_dir`` is passed to ``from_pretrained``.
        device: Device the loaded model is moved to.

    Returns:
        The loaded model, moved to ``device`` and switched to eval mode.

    Raises:
        Exception: If ``pred_config.model_dir`` does not exist, or if loading
            fails. In the latter case the underlying error is attached as
            ``__cause__``.
    """
    # Check whether model exists
    if not os.path.exists(pred_config.model_dir):
        raise Exception("Model doesn't exists! Train first!")

    try:
        # Config will be automatically loaded from model_dir
        model = AutoModelForTokenClassification.from_pretrained(args.model_dir)
        model.to(device)
        model.eval()
    except Exception as err:
        # Only ordinary errors are caught (not KeyboardInterrupt/SystemExit),
        # and the root cause is chained so it stays visible in tracebacks.
        raise Exception("Some model files might be missing...") from err
    else:
        logger.info("***** Model Loaded *****")

    return model

コード例 #7

0

ファイルを表示

ファイル: doc.py プロジェクト: okanvk/Turkish-Wikipedia-Based-Knowledge-Graph

 def __init__(self):
     """Load the NER checkpoint, two NER pipelines and a spaCy model.

     The tokenizer, model and config are all read from one checkpoint name.
     """
     checkpoint = "Alaeddin/convbert-base-turkish-ner-cased"

     self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
     self.model = AutoModelForTokenClassification.from_pretrained(checkpoint)
     self.config = PretrainedConfig.from_pretrained(checkpoint)

     # Plain NER pipeline built through the factory function.
     self.pipeline = pipeline(
         'ner',
         model=self.model,
         tokenizer=self.tokenizer,
         config=self.config,
     )

     self.nlp = spacy.load("en_core_web_sm")

     # Second pipeline over the same model, with grouped_entities enabled.
     self.nlp_grouped = TokenClassificationPipeline(
         model=self.model,
         tokenizer=self.tokenizer,
         grouped_entities=True,
     )

コード例 #8

0

ファイルを表示

ファイル: token_classification.py プロジェクト: Novetta/adaptnlp

    def load(cls, model_name_or_path: str) -> AdaptiveModel:
        """Class method for loading and constructing this tagger

        * **model_name_or_path** - A key string of one of Transformer's pre-trained Token Tagger Model or a `HFModelResult`

        An `HFModelResult` is reduced to its `name` attribute before loading.

        Note: To search for valid models, you should use the AdaptNLP `model_hub` API
        """
        checkpoint = (model_name_or_path.name
                      if isinstance(model_name_or_path, HFModelResult)
                      else model_name_or_path)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForTokenClassification.from_pretrained(checkpoint)
        return cls(tokenizer, model)

コード例 #9

0

ファイルを表示

 def __init__(self):
     """Load the generation, chat and NER models together with their tokenizers.

     Every checkpoint is read from a relative path under ``model/transformers/``.
     """
     load_causal_lm = AutoModelForCausalLM.from_pretrained
     load_tokenizer = AutoTokenizer.from_pretrained

     # Text generation
     self.gen_model = load_causal_lm('model/transformers/gen_model')
     # Add specific options if needed
     self.gen_tokenizer = load_tokenizer('model/transformers/gen_tokenizer')

     # Chat
     self.chat_tokenizer = load_tokenizer("model/transformers/chat_tokenizer")
     self.chat_model = load_causal_lm("model/transformers/chat_model")

     # Named-entity recognition
     self.ner_model = AutoModelForTokenClassification.from_pretrained(
         "model/transformers/ner_model")
     self.ner_tokenizer = load_tokenizer("model/transformers/ner_tokenizer")

コード例 #10

0

ファイルを表示

 def load_model(self):
     """Load the tokenizer, the model and the label table for this instance.

     The tokenizer and model are read from ``self.model_path`` and the model
     is moved to ``self.device``. The file at ``self.labels_file`` is read
     line by line into a dict mapping the zero-based line index to the line
     text with newline characters removed; that dict is stored on
     ``self.lablels_dict``.

     Returns:
         A ``(model, tokenizer)`` tuple.
     """
     tokenizer = AutoTokenizer.from_pretrained(self.model_path)
     model = AutoModelForTokenClassification.from_pretrained(self.model_path)
     model.to(self.device)

     # The context manager closes the labels file even if reading fails.
     with open(self.labels_file, 'r') as labels_fh:
         lablels_dict = {
             index: line.replace("\n", '')
             for index, line in enumerate(labels_fh)
         }

     # The attribute name (spelling included) is kept as-is: code outside
     # this method may read it.
     self.lablels_dict = lablels_dict
     return model, tokenizer

コード例 #11

0

ファイルを表示

ファイル: trace.py プロジェクト: frankfliu/djl-demo

def transformers_model_downloader(app):
    """Download, trace and save the model for one application.

    Nothing is done when ``<app>/<app>.pt`` already exists or when ``app`` is
    not one of the three known application names. Otherwise the model is
    traced with ``torch.neuron.trace`` on a fixed example sentence and saved
    to ``<app>/<app>.pt``, and the tokenizer is saved into the ``app``
    directory.

    Args:
        app: Application name; also used as the output directory name.
    """
    model_file = os.path.join(app, app + ".pt")
    if os.path.isfile(model_file):
        print("model already downloaded: " + model_file)
        return

    print("Download model for: ", model_file)

    # Per-application settings; the actual loading happens once below.
    if app == "text_classification":
        model_cls = AutoModelForSequenceClassification
        model_name, max_length = "bert-base-uncased", 150
        extra_kwargs = {"num_labels": 2}
    elif app == "question_answering":
        model_cls = AutoModelForQuestionAnswering
        model_name, max_length = "distilbert-base-uncased-distilled-squad", 128
        extra_kwargs = {}
    elif app == "token_classification":
        model_cls = AutoModelForTokenClassification
        model_name, max_length = "bert-base-uncased", 150
        extra_kwargs = {"num_labels": 9}
    else:
        print("Unknown application: " + app)
        return

    model = model_cls.from_pretrained(model_name,
                                      torchscript=True,
                                      **extra_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

    # Fixed sentence, padded to max_length, used as the tracing example.
    encoded = tokenizer.encode_plus(
        "How is the weather",
        max_length=max_length,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='pt',
    )
    example_inputs = encoded['input_ids'], encoded['attention_mask']

    traced_model = torch.neuron.trace(model, example_inputs)

    # Export to saved model
    os.makedirs(app, exist_ok=True)
    traced_model.save(model_file)

    tokenizer.save_pretrained(app)

    logging.info("Compile model %s success.", app)

コード例 #12

0

ファイルを表示

    def initialize(self, ctx):
        """Set up the model, tokenizer and NER pipeline from the context.

        Reads ``batch_size``, ``model_dir`` and ``gpu_id`` from
        ``ctx.system_properties``, loads the labels from ``labels.txt`` in the
        model directory, builds the config/model/tokenizer from that
        directory, and marks the handler as initialized.

        Args:
            ctx: Object exposing ``manifest``, ``metrics`` and
                ``system_properties``.
        """
        self.manifest = ctx.manifest
        self.metrics = ctx.metrics
        logger.info(f"Manifest: {self.manifest}")

        properties = ctx.system_properties
        self._batch_size = properties["batch_size"]
        logger.info(f"properties: {properties}")

        model_dir = properties.get("model_dir")

        # gpu_id is only consulted when CUDA is available.
        if torch.cuda.is_available():
            self.device = torch.device("cuda:" + str(properties.get("gpu_id")))
        else:
            self.device = torch.device("cpu")

        labels = get_labels(os.path.join(model_dir, "labels.txt"))
        id_to_label = dict(enumerate(labels))
        label_to_id = {label: idx for idx, label in enumerate(labels)}

        config = AutoConfig.from_pretrained(
            os.path.join(model_dir, "config.json"),
            num_labels=len(labels),
            id2label=id_to_label,
            label2id=label_to_id,
        )

        # Read model serialize/pt file
        self.model = AutoModelForTokenClassification.from_pretrained(
            model_dir,
            config=config,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)

        # ignore_subwords=True was left disabled by the original author.
        self.nlp = pipeline(
            "ner",
            model=self.model,
            tokenizer=self.tokenizer,
            ignore_labels=[],
            grouped_entities=True,
            device=self.device.index,
        )

        logger.debug(
            "Transformer model from path {0} loaded successfully".format(
                model_dir))

        self.initialized = True

コード例 #13

0

ファイルを表示

    def __init__(self, cfg: DictConfig, trainer: Trainer = None):
        """Build the tokenizer, token-classification model, loss and metric.

        Args:
            cfg: Configuration; ``cfg.tokenizer`` and ``cfg.transformer`` name
                the tokenizer and model checkpoints.
            trainer: Forwarded to the base initializer.
        """
        # The tokenizer is created before the base initializer runs;
        # presumably the parent's setup relies on it -- TODO confirm.
        self._tokenizer = AutoTokenizer.from_pretrained(
            cfg.tokenizer,
            add_prefix_space=True,
        )
        super().__init__(cfg=cfg, trainer=trainer)

        tag_count = len(constants.ALL_TAG_LABELS)
        self.num_labels = tag_count
        self.model = AutoModelForTokenClassification.from_pretrained(
            cfg.transformer,
            num_labels=tag_count,
        )

        # Loss Functions
        self.loss_fct = nn.CrossEntropyLoss(
            ignore_index=constants.LABEL_PAD_TOKEN_ID,
        )

        # setup to track metrics
        self.classification_report = ClassificationReport(
            tag_count,
            mode='micro',
            dist_sync_on_step=True,
        )

コード例 #14

0

ファイルを表示

ファイル: named_entity_recognition.py プロジェクト: zini-julia/nlp-recipes

 def __init__(self,
              model_name="bert-base-cased",
              num_labels=2,
              cache_dir="."):
     """Load a token-classification model and hand it to the base initializer.

     Args:
         model_name: Checkpoint used for both the config and the model.
         num_labels: Number of labels set on the config.
         cache_dir: Cache directory passed to both ``from_pretrained`` calls
             and to the base initializer.
     """
     token_config = AutoConfig.from_pretrained(
         model_name,
         num_labels=num_labels,
         cache_dir=cache_dir,
     )
     token_model = AutoModelForTokenClassification.from_pretrained(
         model_name,
         cache_dir=cache_dir,
         config=token_config,
         output_loading_info=False,
     )
     super().__init__(
         model_name=model_name,
         model=token_model,
         cache_dir=cache_dir,
     )

コード例 #15

0

ファイルを表示

 def __init__(self,
              model_name="bert-base-cased-finetuned-mrpc",
              task="SequenceClassification"):
     """Load a tokenizer and the model that matches ``task``.

     Args:
         model_name: Checkpoint for the tokenizer and, except for token
             classification, for the model.
         task: One of the short codes ``"SC"``, ``"QA"``, ``"LM"``, ``"TC"``
             or the matching long name ``"SequenceClassification"``,
             ``"QuestionAnswering"``, ``"LanguageModeling"``,
             ``"TokenClassification"``.

     Raises:
         ValueError: If ``task`` is not one of the names above.
     """
     # The declared default "SequenceClassification" used to match none of
     # the short-code branches, so self.model was silently never set. Long
     # names are mapped onto the original short codes here.
     aliases = {
         "SequenceClassification": "SC",
         "QuestionAnswering": "QA",
         "LanguageModeling": "LM",
         "TokenClassification": "TC",
     }
     task_code = aliases.get(task, task)
     if task_code not in ("SC", "QA", "LM", "TC"):
         # Fail fast rather than leave the instance without a model attribute.
         raise ValueError(f"Unsupported task: {task!r}")

     self.tokenizer = AutoTokenizer.from_pretrained(model_name)
     if task_code == "SC":
         self.model = AutoModelForSequenceClassification.from_pretrained(
             model_name)
     elif task_code == "QA":
         self.model = AutoModelForQuestionAnswering.from_pretrained(
             model_name)
     elif task_code == "LM":
         self.model = AutoModelWithLMHead.from_pretrained(model_name)
     else:
         # NOTE(review): token classification ignores model_name and always
         # loads this fixed checkpoint, while the tokenizer still comes from
         # model_name -- kept as in the original; confirm this is intended.
         self.model = AutoModelForTokenClassification.from_pretrained(
             "dbmdz/bert-large-cased-finetuned-conll03-english")

コード例 #16

0

ファイルを表示

ファイル: ner_analyzer.py プロジェクト: rongpenl/obsei

    def __init__(self, **data: Any):
        """Initialise the base class from ``data`` and build the NER pipeline.

        The model is loaded from ``self.model_name_or_path``. A tokenizer is
        loaded only when ``self.tokenizer_name`` is truthy; otherwise ``None``
        is passed to the pipeline factory.
        """
        super().__init__(**data)

        ner_model = AutoModelForTokenClassification.from_pretrained(
            self.model_name_or_path)
        ner_tokenizer = (
            AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=True)
            if self.tokenizer_name
            else None
        )

        self._pipeline = pipeline(
            'ner',
            model=ner_model,
            tokenizer=ner_tokenizer,
            grouped_entities=self.grouped_entities,
        )

コード例 #17

0

ファイルを表示

    def __init__(
        self,
        checkpoint_directory: str,
        batch_size: int = 16,
        max_seq_length: int = None,
    ):
        """
        Load the model, tokenizer and annotation metadata from a checkpoint.

        Args:
            checkpoint_directory: path holding the model/tokenizer files plus
                "annotation_classes.json" (and "max_seq_length.json", which is
                read only when max_seq_length is not given)
            batch_size: used in dataloader
            max_seq_length: when None, the value is read from
                "max_seq_length.json" in the checkpoint directory
        """
        # 0. device
        device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)

        # 1: max_seq_length (an explicit argument wins over the stored value)
        if max_seq_length is None:
            with open(join(checkpoint_directory, "max_seq_length.json"), "r") as f:
                max_seq_length = json.load(f)
        self.max_seq_length = max_seq_length

        # 2. annotation
        with open(join(checkpoint_directory, "annotation_classes.json"), "r") as f:
            self.annotation_classes = json.load(f)
        id2label = dict(enumerate(self.annotation_classes))
        label2id = {label: idx for idx, label in id2label.items()}

        self.annotation_scheme = derive_annotation_scheme(id2label)

        # 3. model: loaded, put in eval mode, then moved to the device
        model = AutoModelForTokenClassification.from_pretrained(
            checkpoint_directory,
            id2label=id2label,
            label2id=label2id,
            return_dict=False,
        )
        model.eval()
        self.model = model.to(self.device)

        # 4. tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint_directory)

        # 5. batch_size (dataloader)
        self.batch_size = batch_size

コード例 #18

0

ファイルを表示

ファイル: models.py プロジェクト: uds-lsv/transfer-distant-transformer-african

def create_base_model(exp_config):
    """Create a token-classification model from an experiment config mapping.

    Args:
        exp_config: Mapping providing the "labels", "model_name" and "device"
            entries.

    Returns:
        The model, moved to ``exp_config["device"]``.
    """
    labels = exp_config["labels"]
    checkpoint = exp_config["model_name"]

    # id2label uses string keys; label2id maps back to integer indices.
    id_to_label = {str(idx): name for idx, name in enumerate(labels)}
    label_to_id = {name: idx for idx, name in enumerate(labels)}

    config = AutoConfig.from_pretrained(
        checkpoint,
        num_labels=len(labels),
        id2label=id_to_label,
        label2id=label_to_id,
    )

    model = AutoModelForTokenClassification.from_pretrained(checkpoint,
                                                            config=config)
    model.to(exp_config["device"])
    return model

コード例 #19

0

ファイルを表示

    def load(self, train_output):
        """Restore the tokenizer, model and batching settings from a training result.

        Args:
            train_output: Mapping with the keys 'model_path', 'model_type',
                'batch_size', 'pad_token_label_id', 'label_map' and
                'dataset_params_dict'.
        """
        pretrained_model = train_output['model_path']
        model_type = train_output['model_type']
        self._model_type = model_type

        # The unpacked classes are not used below; the lookup is kept so an
        # unknown model type still raises here.
        _, _model_class, _tokenizer_class = MODEL_CLASSES[model_type]

        self._tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
        self._model = AutoModelForTokenClassification.from_pretrained(pretrained_model)

        self._batch_size = train_output['batch_size']
        # Id of the tokenizer's pad token.
        pad_ids = self._tokenizer.convert_tokens_to_ids([self._tokenizer.pad_token])
        self._pad_token = pad_ids[0]
        self._pad_token_label_id = train_output['pad_token_label_id']
        self._label_map = train_output['label_map']
        self._mask_padding_with_zero = True
        self._dataset_params_dict = train_output['dataset_params_dict']

        self._batch_padding = SpanLabeledTextDataset.get_padding_function(
            self._model_type,
            self._tokenizer,
            self._pad_token_label_id,
        )

コード例 #20

0

ファイルを表示

ファイル: run_model.py プロジェクト: alvaroalon2/NER4COVID

def create_pipeline(model_name):
    """Build a grouped-entity token-classification pipeline for ``model_name``.

    Args:
        model_name: Checkpoint used for the config, tokenizer and model.

    Returns:
        A ``TokenClassificationPipeline`` created with ``framework='pt'``,
        ``task='ner'`` and ``grouped_entities=True``.
    """
    ner_config = AutoConfig.from_pretrained(model_name)
    ner_tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        use_fast=True,
        return_offsets_mapping=True,
    )
    ner_model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        config=ner_config,
    )
    return TokenClassificationPipeline(
        model=ner_model,
        tokenizer=ner_tokenizer,
        framework='pt',
        task='ner',
        grouped_entities=True,
    )

コード例 #21

0

ファイルを表示

    def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """Score the checkpoint of the finished epoch and prune it if it is not better.

        Reloads the checkpoint, evaluates it with ``student_performance`` on
        ``self.teacher_sets``, appends one CSV line (precision, recall, f1 for
        each of the three metric dicts) to ``<output_dir>/results.csv`` and
        prints it. If the mean of the first and third f1 does not exceed
        ``self.best_f1``, the checkpoint's weight and optimizer files are
        removed; otherwise ``self.best_f1`` is updated.

        Also rebinds the module-level ``label_map``.
        """
        # NOTE(review): 956 looks like a hard-coded steps-per-epoch count used
        # to rebuild the checkpoint directory name -- confirm against the
        # training setup.
        model_path = args.output_dir + "/checkpoint-" + str(956 * int(state.epoch))

        global label_map
        label_map = dict(enumerate(self.labels))
        num_labels = len(self.labels)

        config = AutoConfig.from_pretrained(
            model_path,
            num_labels=num_labels,
            id2label=label_map,
            label2id={label: i for i, label in enumerate(self.labels)},
            cache_dir=None,
        )
        model = AutoModelForTokenClassification.from_pretrained(
            model_path,
            from_tf=False,
            config=config,
            cache_dir=None,
        )

        trainer = Trainer(
            model=model,
        )

        m1, m2, m3 = student_performance(trainer, self.teacher_sets)

        # Nine comma-separated values: precision, recall, f1 for m1, m2, m3.
        results = ", ".join(
            str(metrics[key])
            for metrics in (m1, m2, m3)
            for key in ('precision', 'recall', 'f1')
        ) + "\n"
        # The context manager closes the file even if the write fails.
        with open(args.output_dir + "/results.csv", "a") as results_file:
            results_file.write(results)

        print(results)

        mean_f1 = (m1['f1'] + m3['f1']) / 2
        if mean_f1 <= self.best_f1:
            for suffix in ("/pytorch_model.bin", "/optimizer.pt"):
                delete_filename = model_path + suffix
                # Opening in 'w' mode creates the file when it is absent, so
                # the remove below cannot fail on a missing file.
                with open(delete_filename, 'w'):
                    pass
                os.remove(delete_filename)
            print("deleted")
        else:
            self.best_f1 = mean_f1

コード例 #22

0

ファイルを表示

ファイル: keyword_extractor.py プロジェクト: ldzhangyx/music-nlp-chatbot

def keyword_extractor(input_sentence, pos_list = ['0', '1', '2']):
    """Return the words of ``input_sentence`` whose predicted tag id is in ``pos_list``.

    Args:
        input_sentence: Text to tag.
        pos_list: Predicted label ids to keep, given as strings.

    Returns:
        A list of the selected entries of the word list (nltk tokens wrapped
        in '<SOS>' / '<EOS>').
    """
    input_words = ['<SOS>']
    input_words.extend(nltk.word_tokenize(input_sentence))
    input_words.append('<EOS>')

    checkpoint = "vblagoje/bert-english-uncased-finetuned-pos"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForTokenClassification.from_pretrained(checkpoint)

    encoded = tokenizer(input_sentence)
    token_ids = torch.tensor(encoded['input_ids']).unsqueeze(0)
    logits = model(token_ids).logits.data.squeeze(0)
    predicted_ids = torch.argmax(logits, dim=1).tolist()

    # what we need are ADJ and ADV (the id-to-tag mapping is not visible here)
    # NOTE(review): input_words is indexed by model token position, which
    # assumes the tokenizer yields one token per nltk word plus two special
    # tokens -- TODO confirm; extra sub-word tokens would shift or overrun it.
    return [
        input_words[position]
        for position, label in enumerate(predicted_ids)
        if str(label) in pos_list
    ]

コード例 #23

0

ファイルを表示

ファイル: token_classifier.py プロジェクト: Kotwic4/ocr-correction

    def __init__(self,
                 output_dir,
                 labels: List[str],
                 ignore_sub_tokens_labes: bool,
                 spliting_strategy: Optional[str],
                 sentence_strategy: Optional[str],
                 prediction_strategy: Optional[str],
                 model_name_or_path=None,
                 loaded_model=None):
        """Store the classifier settings and obtain config, tokenizer and model.

        Args:
            output_dir: Stored on the instance.
            labels: Label names; their order defines the label ids.
            ignore_sub_tokens_labes: Stored on the instance.
            spliting_strategy: Stored on the instance.
            sentence_strategy: Stored on the instance.
            prediction_strategy: Stored on the instance.
            model_name_or_path: Checkpoint to load config, tokenizer and model
                from; only used when ``loaded_model`` is None.
            loaded_model: Optional ``(config, tokenizer, model)`` triple that
                is used as-is instead of loading.
        """
        self.output_dir = output_dir

        self.labels = labels
        self.num_labels = len(self.labels)
        self.label_map: Dict[int, str] = dict(enumerate(self.labels))
        self.label2id = {name: idx for idx, name in enumerate(self.labels)}

        self.ignore_sub_tokens_labes = ignore_sub_tokens_labes
        self.spliting_strategy = spliting_strategy
        self.sentence_strategy = sentence_strategy
        self.prediction_strategy = prediction_strategy

        if loaded_model is None:
            # Config, tokenizer and model all come from the same checkpoint.
            self.config = AutoConfig.from_pretrained(
                model_name_or_path,
                num_labels=self.num_labels,
                id2label=self.label_map,
                label2id=self.label2id,
            )
            self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
            self.model = AutoModelForTokenClassification.from_pretrained(
                model_name_or_path,
                # from_tf is enabled when the path contains ".ckpt"
                from_tf=".ckpt" in model_name_or_path,
                config=self.config,
            )
        else:
            self.config, self.tokenizer, self.model = loaded_model

        self.max_seq_length = 128

コード例 #24

0

ファイルを表示

ファイル: sanity_check.py プロジェクト: minstar/biobert-pytorch

def main():
    """Run an integrated-gradient saliency pass over two example sentences.

    Loads the "bert-base-cased" config (93 labels), tokenizer and
    token-classification model, wraps the examples in a ``NewsDataset`` /
    ``DataLoader`` with batch size 1, and calls
    ``IntegratedGradient.saliency_interpret`` on it.
    """
    model_name = "bert-base-cased"
    config = AutoConfig.from_pretrained(model_name, num_labels=93)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        from_tf=".ckpt" in model_name,
        config=config,
    )

    criterion = nn.CrossEntropyLoss()
    batch_size = 1

    # The original author left loading of a fine-tuned state dict and of a
    # pickled label encoder disabled here.

    test_example = [
        ["Interpretation of HuggingFase's model decision"],
        ["Transformer-based models have taken a leading role "
         "in NLP today."],
    ]

    test_dataset = NewsDataset(
        data_list=test_example,
        tokenizer=tokenizer,
        max_length=config.max_position_embeddings,
    )
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=batch_size,
                                 shuffle=False)

    integrated_grad = IntegratedGradient(model,
                                         criterion,
                                         tokenizer,
                                         show_progress=True,
                                         encoder="bert")
    instances = integrated_grad.saliency_interpret(test_dataloader)

コード例 #25

0

ファイルを表示

    def __init__(self, hparams, tag2idx):
        """Build the BERT token classifier plus a linear tag-projection layer.

        Args:
            hparams: Mapping with the keys 'embedding_dim', 'batch_size',
                'seq_length', 'device' and 'bert' (the checkpoint to load).
            tag2idx: Tag-to-index mapping; its length is the tag-set size.
        """
        super().__init__()

        # Copy the plain hyper-parameters onto same-named attributes.
        for key in ('embedding_dim', 'batch_size', 'seq_length', 'device'):
            setattr(self, key, hparams[key])

        self.tag2idx = tag2idx
        self.tagset_size = len(tag2idx)

        # Hidden states are requested from the model in addition to its
        # regular output.
        self.bert = AutoModelForTokenClassification.from_pretrained(
            hparams['bert'],
            output_hidden_states=True,
        )

        self.hidden2tag = nn.Linear(self.embedding_dim, self.tagset_size)
        self.Softmax = nn.Softmax(dim=-1)

コード例 #26

0

ファイルを表示

    def __init__(self,
                 model_name="dumitrescustefan/bert-base-romanian-cased-v1",
                 tokenizer_name=None,
                 lr=2e-05,
                 model_max_length=512,
                 bio2tag_list=None,
                 tag_list=None):
        """Load the tokenizer and token-classification model and reset the metric buffers.

        Args:
            model_name: Checkpoint for the model.
            tokenizer_name: Checkpoint for the tokenizer; ``None`` or ``""``
                falls back to ``model_name``.
            lr: Stored on the instance.
            model_max_length: Stored on the instance.
            bio2tag_list: List whose length is used as the number of labels;
                ``None`` means an empty list.
            tag_list: Stored on the instance; ``None`` means an empty list.
        """
        super().__init__()

        # The defaults used to be shared `[]` literals that were stored on
        # the instance, so every instance created with defaults aliased the
        # same list objects. A fresh list is created per instance instead.
        bio2tag_list = [] if bio2tag_list is None else bio2tag_list
        tag_list = [] if tag_list is None else tag_list

        if tokenizer_name is None or tokenizer_name == "":
            tokenizer_name = model_name

        print("Loading AutoModel [{}] ...".format(model_name))
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name,
                                                       strip_accents=False)
        self.model = AutoModelForTokenClassification.from_pretrained(
            model_name, num_labels=len(bio2tag_list), from_flax=False)
        self.dropout = nn.Dropout(0.2)

        self.lr = lr
        self.model_max_length = model_max_length
        self.bio2tag_list = bio2tag_list
        self.tag_list = tag_list
        self.num_labels = len(bio2tag_list)

        # Per-phase accumulators, each starting empty.
        self.train_loss = []
        self.valid_y_hat = []
        self.valid_y = []
        self.valid_loss = []
        self.test_y_hat = []
        self.test_y = []
        self.test_loss = []

        # check cls, sep and pad tokens
        if self.tokenizer.cls_token_id is None:
            print(
                f"*** Warning, tokenizer {tokenizer_name} has no defined CLS token: sequences will not be marked with special chars! ***"
            )
        if self.tokenizer.sep_token_id is None:
            print(
                f"*** Warning, tokenizer {tokenizer_name} has no defined SEP token: sequences will not be marked with special chars! ***"
            )

        # add pad token
        self.validate_pad_token()

コード例 #27

0

ファイルを表示

ファイル: token_classifier.py プロジェクト: Kotwic4/ocr-correction

    def load(output_dir):
        """Rebuild a ``TokenClassifier`` from a saved output directory.

        Config, tokenizer and model are loaded from ``output_dir``; the labels
        and strategy options are read from ``settings.json`` in the same
        directory. Missing strategy options default to ``None`` and a missing
        ``ignore_sub_tokens_labes`` defaults to ``False``.

        Args:
            output_dir: Directory containing the saved model and settings.json.

        Returns:
            The reconstructed ``TokenClassifier``.
        """
        loaded = (
            AutoConfig.from_pretrained(output_dir),
            AutoTokenizer.from_pretrained(output_dir),
            AutoModelForTokenClassification.from_pretrained(output_dir),
        )

        with open(f'{output_dir}/settings.json', 'r') as settings_file:
            settings = json.load(settings_file)

        return TokenClassifier(
            output_dir=output_dir,
            labels=settings["labels"],
            loaded_model=loaded,
            sentence_strategy=settings.get("sentence_strategy"),
            spliting_strategy=settings.get("spliting_strategy"),
            prediction_strategy=settings.get("prediction_strategy"),
            ignore_sub_tokens_labes=settings.get("ignore_sub_tokens_labes", False),
        )

コード例 #28

0

ファイルを表示

ファイル: transformers.py プロジェクト: ShantanuNair/FARM

    def _convert_to_transformers_ner(adaptive_model, prediction_head):
        """Build a transformers token-classification model from an adaptive model.

        The language model's config is updated in place with the label
        information from ``prediction_head``. A new model is created from
        that config, its base model is replaced by the adaptive model's
        language model, and the classifier weights are loaded from the first
        layer of the prediction head's feed-forward block.

        Returns:
            The assembled transformers model.
        """
        language_model = adaptive_model.language_model
        config = language_model.model.config
        label_list = prediction_head.label_list

        # add more info to config
        config.num_labels = prediction_head.num_labels
        config.id2label = dict(enumerate(label_list))
        config.label2id = {label: idx for idx, label in enumerate(label_list)}
        config.finetuning_task = "token_classification"
        config.language = language_model.language

        # init model
        transformers_model = AutoModelForTokenClassification.from_config(config)

        # transfer weights for language model + prediction head
        setattr(transformers_model,
                transformers_model.base_model_prefix,
                language_model.model)
        head_weights = prediction_head.feed_forward.feed_forward[0].state_dict()
        transformers_model.classifier.load_state_dict(head_weights)

        return transformers_model

コード例 #29

0

ファイルを表示

    def get_this_model(task, model_config):
        """Load the Auto model class matching ``task`` from ``checkpoint_path``.

        Args:
            task: Task identifier compared against the task constants.
            model_config: Config passed to ``from_pretrained``.

        Returns:
            The loaded model, or ``None`` when ``task`` matches no known task.
        """
        from transformers import AutoModelForSequenceClassification
        from transformers import AutoModelForSeq2SeqLM
        from transformers import AutoModelForMultipleChoice
        from transformers import AutoModelForTokenClassification

        # Choose the model class first; the load call is the same for all.
        if task in (SEQCLASSIFICATION, SEQREGRESSION):
            model_cls = AutoModelForSequenceClassification
        elif task == TOKENCLASSIFICATION:
            model_cls = AutoModelForTokenClassification
        elif task in NLG_TASKS:
            model_cls = AutoModelForSeq2SeqLM
        elif task == MULTICHOICECLASSIFICATION:
            model_cls = AutoModelForMultipleChoice
        else:
            return None

        # checkpoint_path is not a parameter; it is resolved from an
        # enclosing scope.
        return model_cls.from_pretrained(checkpoint_path, config=model_config)

コード例 #30

0

ファイルを表示

ファイル: ner_model_predict_legacy.py プロジェクト: af-ai-center/nerblackbox

    def _preparations_data_predict(self):
        """
        Create the annotation object and the model used for prediction.

        :created attr: annotation        [Annotation]
        :created attr: model             [transformers AutoModelForTokenClassification]
        :return: -
        """
        # annotation (classes are stored as a JSON string in the hparams)
        annotation_classes = json.loads(self.hparams.annotation_classes)
        self.annotation = Annotation(annotation_classes)

        # model
        self.model = AutoModelForTokenClassification.from_pretrained(
            self.pretrained_model_name,
            num_labels=len(self.annotation.classes),
            return_dict=False,
        )

        # due to additional special tokens the embedding matrix is resized
        # to the tokenizer's length
        vocab_size = len(self.tokenizer)
        self.model.resize_token_embeddings(vocab_size)