Example #1
    def __init__(self, model: str = None, service: str = "classification"):
        """
        Constructor to the class that does the Classification in the back end
        :param model: Transfomer model that will be used for Classification Task
        :param service: string to represent the service, this will be defaulted to classification
        """
        if model is None:
            model = "distilbert"
        # path to all the files that will be used for inference
        self.path = f"./{service}/{model}/"
        # json file for mapping of network output to the correct category
        self.mapping = self.path + "mapping.json"
        self.model_path = self.path + "model.bin"
        # Select the model based on the passed model name. Default: distilbert.
        # Note: only distilbert is currently supported, so both branches are identical.
        if model == "distilbert":
            self.model = DistillBERTClass()
            self.tokenizer = DistilBertTokenizerFast.from_pretrained(self.path)
        else:
            self.model = DistillBERTClass()
            self.tokenizer = DistilBertTokenizerFast.from_pretrained(self.path)

        self.model.load_state_dict(
            torch.load(self.model_path, map_location=device))
        self.model.eval()

        with open(self.mapping) as f:
            self.config = json.load(f)
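A minimal usage sketch for this constructor; the class name Classifier is hypothetical, and it assumes ./classification/distilbert/ contains model.bin, mapping.json, and the tokenizer files:

# Hypothetical usage; `Classifier` stands in for the class the __init__ above belongs to.
clf = Classifier(model="distilbert", service="classification")
inputs = clf.tokenizer("sample text", return_tensors="pt", truncation=True)
with torch.no_grad():
    outputs = clf.model(**inputs)  # DistillBERTClass forward; exact signature is project-specific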
Example #2
def get_train_test_embeddings():
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    de_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-german-cased')
    # Alternative German tokenizer: 'bert-base-german-dbmdz-cased'

    train_texts, train_qualities = read_en_de_split('./data/en-de')
    test_texts, test_qualities = read_en_de_split('./data/en-de', train=False)

    print(max([len(i) for i in train_texts[0]]), max([len(i) for i in train_texts[1]]))
    train_encodings = ((tokenizer(train_texts[0], max_length=201, truncation=True, padding='max_length'),
                        de_tokenizer(train_texts[1], max_length=201, truncation=True, padding='max_length')),
                       train_qualities)
    test_encodings = ((tokenizer(test_texts[0], max_length=201, truncation=True, padding='max_length'),
                       de_tokenizer(test_texts[1], max_length=201, truncation=True, padding='max_length')),
                      test_qualities)
    return train_encodings, test_encodings
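A sketch of wrapping one of the returned ((en_enc, de_enc), qualities) tuples in a torch Dataset for training; this Dataset class is an assumption, not part of the original code:

import torch

class PairedQEDataset(torch.utils.data.Dataset):
    # Wraps one ((en_encodings, de_encodings), qualities) tuple as returned above.
    def __init__(self, pair):
        (self.en, self.de), self.qualities = pair

    def __getitem__(self, idx):
        item = {f'en_{k}': torch.tensor(v[idx]) for k, v in self.en.items()}
        item.update({f'de_{k}': torch.tensor(v[idx]) for k, v in self.de.items()})
        item['quality'] = torch.tensor(self.qualities[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.qualities)

train_ds = PairedQEDataset(get_train_test_embeddings()[0])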
Example #3
 def __init__(self, text_mod: MimicText, clf):
     super().__init__()
     self.text_mod = text_mod
     self.clf = clf
     tokenizer_path = Path(__file__).parent.parent / 'classifiers/tokenizer'
     if not tokenizer_path.exists():
         tokenizer = DistilBertTokenizerFast.from_pretrained(
             'distilbert-base-uncased')
         tokenizer.save_pretrained(tokenizer_path)
     else:
         tokenizer = DistilBertTokenizerFast.from_pretrained(tokenizer_path)
     self.tokenizer = tokenizer
    def makeUnilabelModel(self, modelName, num_labels=10, root='', **kwargs):
        if modelName == 'distilbert-base-uncased':
            tokenizer = DistilBertTokenizerFast.from_pretrained(
                'distilbert-base-uncased')
            model = DistilBertForSequenceClassification.from_pretrained(
                root + "distilbert-base-uncased",
                num_labels=num_labels,
                **kwargs)
        elif modelName == 'gpt2':
            tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            model = GPT2ForSequenceClassification.from_pretrained(
                root + "gpt2", num_labels=num_labels, **kwargs)
            model.resize_token_embeddings(len(tokenizer))
            # add padding token
            model.config.pad_token_id = tokenizer('[PAD]').input_ids[0]
        elif modelName == 'bertweet':
            tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base')
            model = AutoModelForSequenceClassification.from_pretrained(
                root + "vinai/bertweet-base", num_labels=num_labels, **kwargs)
        elif modelName == 'distilroberta-base':
            tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
            model = AutoModelForSequenceClassification.from_pretrained(
                root + "distilroberta-base", num_labels=num_labels, **kwargs)
        elif modelName == 'lstm':
            tokenizer = AutoTokenizer.from_pretrained(
                'distilbert-base-uncased')
            model = LSTMCclassifier(128, 64, 2, tokenizer.vocab_size,
                                    num_labels)
        else:
            raise ValueError(f'modelName not recognized: {modelName}')

        return tokenizer, model
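A hedged usage sketch; factory stands in for whatever object defines makeUnilabelModel:

tokenizer, model = factory.makeUnilabelModel('distilbert-base-uncased', num_labels=4)
enc = tokenizer("some text", return_tensors="pt", truncation=True)
logits = model(**enc).logits  # shape (1, num_labels)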
Example #5
 def get_bert_tokenizer(model):
     if model == "bert-base-uncased":
         return BertTokenizerFast.from_pretrained(model)
     elif model == "distilbert-base-uncased":
         return DistilBertTokenizerFast.from_pretrained(model)
     else:
         raise ValueError(f"Model: {model} not recognized.")
Example #6
def getTweets():
    print("Getting tweets now ...")

    # Default keyword used when the search box is submitted empty
    keyword = request.args.get(
        'keyword', default='coronavirus covid vaccine vaccination COVID-19')

    # Fetch the 20 most recent tweets matching the query. Change the argument
    # in `items()` to decrease or increase the number of retrieved tweets.
    # The larger the number, the longer the retrieval time.
    query = keyword  # text from the search box
    tweets_ = tweepy.Cursor(api.search, query, result_type='recent').items(20)
    tweets = [tweet.text for tweet in tweets_]

    print("Done ... retrieving tweets from API based on the keyword=" +
          keyword)

    df = pd.DataFrame(data=tweets, columns=['Tweet'])
    print("Done ... creating dataframe")

    # Iterate over the tweet texts in `tweets` and pass each item to the model
    # to obtain a prediction, then write those predictions to a Pandas dataframe
    model = pipeline(
        'sentiment-analysis',
        model=DistilBertForSequenceClassification.from_pretrained("model"),
        tokenizer=DistilBertTokenizerFast.from_pretrained(
            'distilbert-base-uncased'))
    results = [model(tweet) for tweet in tweets]
    df['Sentiment'] = [LABELS[s[0].get('label')] for s in results]
    df['Score'] = [s_[0].get('score') for s_ in results]

    print("Done ... sentiment-analysis")
    print(df)
    return render_template("covid.html", data=list(df.values.tolist()))
Example #7
    @classmethod
    def load_from_pretrained(cls, path: Path):
        model = TFDistilBertForSequenceClassification.from_pretrained(path)
        tokenizer = DistilBertTokenizerFast.from_pretrained(path)
        processing_pipeline = TransformersProcessingPipeline(
            TextPipeline.encode_dataset, tokenizer)
        validate_variables(model, tokenizer, processing_pipeline)

        return cls(model=model, processing_pipeline=processing_pipeline)
Example #8
 def __init__(self, model_path, tag_path):
     with open(tag_path, "r") as tag_file:
         file_content = tag_file.read().strip()
         self.id_to_tag = file_content.splitlines()
     self.model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=len(self.id_to_tag))
     self.model.load_state_dict(torch.load(model_path))
     self.model.eval()
     self.tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
 def load_tokenizer(self):
     tokenizer = DistilBertTokenizerFast.from_pretrained(
         "distilbert-base-uncased",
         padding="max_length",
         max_length=self.params.max_len,
         truncation=True,
         is_split_into_words=True)
     return tokenizer
 def load_model(self, model_name: str = "bert_ner_test"):
     # TODO model loaded from mlflow
     # Load model and tokenizer.
     config = DistilBertConfig.from_pretrained(model_name)
     model = DistilBertForTokenClassification.from_pretrained(
         model_name, config=config)
     tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
     return model, config, tokenizer
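A minimal inference sketch for the token-classification class at the top of this example; the prediction loop is an assumption, since the original only shows loading:

import torch

def predict_tags(tagger, sentence):
    # Tokenize, run the model, and map argmax ids back to tag strings.
    enc = tagger.tokenizer(sentence, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = tagger.model(**enc).logits  # (1, seq_len, num_labels)
    ids = logits.argmax(dim=-1).squeeze(0).tolist()
    tokens = tagger.tokenizer.convert_ids_to_tokens(enc["input_ids"].squeeze(0))
    return [(tok, tagger.id_to_tag[i]) for tok, i in zip(tokens, ids)]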
Example #11
def get_bert_tokenizer(bert_model_type):
    if bert_model_type in [
            'bert-base-uncased', 'prod-bert-base-uncased', 'bert-base-cased',
            'bert-large-uncased', 'tune_bert-base-uncased_nsp',
            'bert-large-uncased-whole-word-masking',
            'bert-large-uncased-whole-word-masking-finetuned-squad'
    ]:
        if '-cased' in bert_model_type:
            do_lower_case = False
        else:
            do_lower_case = True  # default
        return BertTokenizerFast(vocab_file=BERT_VOCAB_FILE[bert_model_type],
                                 do_lower_case=do_lower_case)
    elif bert_model_type in [
            'roberta-base', 'prod-roberta-base-cased', 'roberta-large',
            'roberta-large-mnli', 'distilroberta-base'
    ]:
        return RobertaTokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type],
            merges_file=BERT_MERGE_FILE[bert_model_type],
            add_prefix_space=True)
    elif bert_model_type in ['xlnet-base-cased']:
        if '-uncased' in bert_model_type:
            do_lower_case = True
        else:
            do_lower_case = False  # default
        return XLNetTokenizer(vocab_file=BERT_VOCAB_FILE[bert_model_type],
                              do_lower_case=do_lower_case)
    elif bert_model_type in [
            'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1',
            'albert-xxlarge-v1'
    ]:
        return AlbertTokenizer(vocab_file=BERT_VOCAB_FILE[bert_model_type])
    elif bert_model_type in ['gpt2', 'gpt2-medium']:
        tokenizer = GPT2TokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type],
            merges_file=BERT_MERGE_FILE[bert_model_type],
            add_prefix_space=True)
        # https://github.com/huggingface/transformers/issues/3859
        tokenizer.pad_token = tokenizer.eos_token
        return tokenizer
    elif bert_model_type in ['transfo-xl']:
        return TransfoXLTokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type])
    elif bert_model_type in [
            'distilbert-base-uncased',
            'distilbert-base-uncased-distilled-squad'
    ]:
        if '-cased' in bert_model_type:
            do_lower_case = False
        else:
            do_lower_case = True  # default
        return DistilBertTokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type],
            do_lower_case=do_lower_case)
    else:
        raise ValueError(
            f'`bert_model_type` not understood: {bert_model_type}')
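BERT_VOCAB_FILE and BERT_MERGE_FILE are lookup tables defined elsewhere in this project; a sketch of their likely shape, with placeholder paths:

# Hypothetical layout; the real paths live in the project's configuration.
BERT_VOCAB_FILE = {
    'bert-base-uncased': '/models/bert-base-uncased/vocab.txt',
    'roberta-base': '/models/roberta-base/vocab.json',
    'gpt2': '/models/gpt2/vocab.json',
}
BERT_MERGE_FILE = {
    'roberta-base': '/models/roberta-base/merges.txt',
    'gpt2': '/models/gpt2/merges.txt',
}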
Example #12
def getBertTokenizer(model):
    if model == 'bert-base-uncased':
        tokenizer = BertTokenizerFast.from_pretrained(model)
    elif model == 'distilbert-base-uncased':
        tokenizer = DistilBertTokenizerFast.from_pretrained(model)
    else:
        raise ValueError(f'Model: {model} not recognized.')

    return tokenizer
Example #13
def generate_tokenizer_and_model(model_name): 
    if model_name == "bert-base-uncased":
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
    elif model_name == 'distilbert-base-uncased':
        tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
        model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
    else:
        raise ValueError(f"Model: {model_name} not recognized.")

    return tokenizer, model
    def __init__(self, data_path, split):
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(
            'distilbert-base-uncased')

        self.original_sentences, self.original_spans, self.fixed_spans = \
            self.get_sentences_from_data_split(data_path, split)

        self.token_ids, self.offsets, self.att_masks, self.special_masks, self.labels_ids = \
            self.preprocess_and_tokenize(self.original_sentences, self.fixed_spans)
Example #15
def create_dataset(data_dir: Text) -> Tuple[CabbyDataset, CabbyDataset, CabbyDataset]:
  '''Loads data and creates the train, validation and test datasets.
  Arguments:
    data_dir: The directory of the data.
  Returns:
    The train, validation and test datasets.
  '''

  LABELS = data.Field(
      sequential=False,
      preprocessing=lambda xs: 1 if xs == "manhattan" else 0,
      use_vocab=False, 
      batch_first=True, 
  )
  TEXT = data.Field(
      use_vocab=False,
      batch_first=True,
      sequential=False,  
  )

  train_ds, valid_ds, test_ds = data.TabularDataset.splits(
      path=data_dir,
      format='tsv',
      skip_header=False,
      train='train.tsv',
      validation='dev.tsv',
      test='test.tsv',
      fields=[
          ('label', LABELS),
          ('instructions', TEXT)])

  logging.info('Data sample: %s', vars(train_ds[0]))

  # Get list of instructions.
  train_texts = [train_ds.examples[idx].instructions for idx in range(len(train_ds))]
  val_texts = [valid_ds.examples[idx].instructions for idx in range(len(valid_ds))]
  test_texts = [test_ds.examples[idx].instructions for idx in range(len(test_ds))]

  # Get list of labels.
  train_labels = [train_ds.examples[idx].label for idx in range(len(train_ds))]
  val_labels = [valid_ds.examples[idx].label for idx in range(len(valid_ds))]
  test_labels = [test_ds.examples[idx].label for idx in range(len(test_ds))]


  # Tokenize instructions.
  tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
  train_encodings = tokenizer(train_texts, truncation=True, padding=True, add_special_tokens=True)
  val_encodings = tokenizer(val_texts, truncation=True, padding=True, add_special_tokens=True)
  test_encodings = tokenizer(test_texts, truncation=True, padding=True, add_special_tokens=True)

  # Create Cabby dataset.
  train_dataset = CabbyDataset(train_encodings, train_labels)
  val_dataset = CabbyDataset(val_encodings, val_labels)
  test_dataset = CabbyDataset(test_encodings, test_labels)

  return train_dataset, val_dataset, test_dataset
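A sketch of consuming the returned datasets with a DataLoader (batch size is arbitrary):

from torch.utils.data import DataLoader

train_dataset, val_dataset, test_dataset = create_dataset('./data')
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)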
 def __init__(self, model='bert-base-uncased'):
     super().__init__(model, "BERT")
     self.mlm = None  # Masked Language Model
     self.nsp = None  # Next Sentence Prediction
     self.qa = None  # Question Answering
     self.tokenizer = DistilBertTokenizerFast.from_pretrained(
         'distilbert-base-uncased')
     self.masked_token = self.tokenizer.mask_token
     self.sep_token = self.tokenizer.sep_token
     self.cls_token = self.tokenizer.cls_token
Example #17
    def __init__(self, max_vocab_size = 30522, embedding_dim = 768, from_pt=False):
        super().__init__(max_vocab_size, embedding_dim)
        config = transformers.DistilBertConfig()
        config.vocab_size = max_vocab_size
        config.dim = embedding_dim
        self.model = transformers.modeling_tf_distilbert.TFDistilBertModel.from_pretrained(self.model_path, config=config, from_pt=from_pt)
        from transformers import DistilBertTokenizerFast
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(self.tokenizer_path)

        self.pipeline = transformers.pipeline(task='feature-extraction', model=self.model, tokenizer=self.tokenizer)
 def __init__(self, path=None, model_name=None):
     if path:
         self.model = DistilBertForSequenceClassification.from_pretrained(
             path)
         tokenizer_path = os.path.join(path, "model/")
         if os.path.exists(tokenizer_path):
             self.tokenizer = DistilBertTokenizerFast.from_pretrained(
                 tokenizer_path)
         else:
             self.tokenizer = DistilBertTokenizerFast.from_pretrained(
                 "distilbert-base-uncased")
     elif model_name:
         config = DistilBertConfig.from_pretrained(model_name,
                                                   return_dict=True,
                                                   num_labels=2)
         self.model = DistilBertForSequenceClassification.from_pretrained(
             model_name, config=config)
         self.tokenizer = DistilBertTokenizerFast.from_pretrained(
             model_name)
Example #19
 def __init__(self, model_path=None, use_cuda=False):
     if not model_path:
         model_path = get_model_path()
     if not os.path.exists(model_path):
         raise FileNotFoundError("Cannot find model under " + model_path)
     self.device = "cuda" if use_cuda and torch.cuda.is_available() else "cpu"
     self.model = DistilBertForTokenClassification.from_pretrained(model_path)
     self.model.to(self.device)
     self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
     self.label_map = self.get_label_map(model_path)
Example #20
def main():
    # define parser and arguments
    args = get_train_test_args()
    util.set_seed(args.seed)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    DistilBert = DistilBertModel.from_pretrained('distilbert-base-uncased')
    Experts = [DistilBertQA(DistilBertModel.from_pretrained('distilbert-base-uncased')).to(device) for _ in range(args.num_experts)]
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    gate_model = GateNetwork(384, 3, 3, DistilBert.config).to(device)
    print(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")
        args.device = device
        trainer = train.Trainer(args, log)
        train_dataset, _ = get_dataset(args, args.train_datasets, args.train_dir, tokenizer, 'train')
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets, args.val_dir, tokenizer, 'val')
        train_loader = DataLoader(train_dataset,
                                batch_size=args.batch_size,
                                sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=1,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(Experts, gate_model, train_loader, val_loader, val_dict, args.num_experts)
    if args.do_eval:
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = train.Trainer(args, log)
        # load model
        restore_model("",args.num_experts, Experts, gate_model)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets, args.eval_dir, tokenizer, split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=1,
                                 sampler=SequentialSampler(eval_dataset))
        args.device = device
        eval_preds, eval_scores = trainer.evaluate(Experts, gate_model, eval_loader,
                                                   eval_dict, return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
def get_tokenizer(model_type='BERT'):
    if model_type == 'distilBERT':
        tokenizer = DistilBertTokenizerFast.from_pretrained(
            'distilbert-base-uncased')
    elif model_type == 'BERT':
        tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    elif model_type == 'alBERT':
        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    else:
        raise ValueError(f'model_type not allowed: {model_type}')
    return tokenizer
Example #22
def __criar_base_treinamento_validacao():
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'neuralmind/bert-base-portuguese-cased',
        model_max_length=512,
        do_lower_case=False)
    # Fetch the labeled dataset from a JSONL file generated by the Doccano annotation tool
    textos, tags = __get_textos_tags()
    textos, tags = __pre_processar_base(textos, tags, tokenizer)
    # Split texts whose token count exceeds the supported maximum into smaller texts.
    train_texts, val_texts, train_tags, val_tags = train_test_split(
        textos, tags, test_size=.2, random_state=42)
    return tags, tokenizer, train_tags, train_texts, val_tags, val_texts
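A hedged sketch of the tokenization step these splits typically feed; it assumes the texts are already split into word lists (as NER tag alignment requires):

# Assumption: train_texts is a list of word lists aligned with train_tags.
train_encodings = tokenizer(train_texts, is_split_into_words=True,
                            truncation=True, padding=True,
                            return_offsets_mapping=True)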
def get_tokenizer() -> DistilBertTokenizerFast:
    """
    Returns tokenizer for that model.

    Parameters:
        None
    Returns:
        tokenizer (DistilBertTokenizerFast) : loaded and set tokenizer.
    """
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased')
    return tokenizer
Example #24
 def tokenize(self):
     """
     Tokenize DLND data
     """
     tokenizer = DistilBertTokenizerFast.from_pretrained(
         'distilbert-base-uncased',
         cache_dir=TRANSFORMERS_CACHE_DIR,
         local_files_only=True)
     return tokenizer(
         [('. '.join(src_docs), '. '.join(tgt_docs))
          for src_docs, tgt_docs in zip(self.data[-3], self.data[-2])],
         padding=True,
         truncation=True)
    def test_load_from_pretrained(self, tmp_path):
        pickle_dir = tmp_path / "pickle_dir"
        pickle_dir.mkdir()

        model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
        tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

        model.save_pretrained(pickle_dir)
        tokenizer.save_pretrained(pickle_dir)

        modeling_pipeline = TransformersModelingPipeline.load_from_pretrained(str(pickle_dir))

        assert isinstance(modeling_pipeline, TransformersModelingPipeline)
Example #26
 def __init__(self, hparams):
     self.train = {}
     self.dev = {}
     self.test = {}
     self.data_turns = {}
     self.data_path = hparams.data_path
     self.tokenizer = DistilBertTokenizerFast.from_pretrained(
         "distilbert-base-uncased")
     self.batch_size = hparams.batch_size
     self.max_len = hparams.max_len
     self.max_value_len = hparams.max_value_len
     self.max_context_len = hparams.max_context_len
     self.no_history = hparams.no_history
Example #27
 def __init__(self, model):
     """
     Inicializa com um modelo passado como parâmetro.  Utiliza o tokenizador para língua portuguesa
     neuralmind/bert-base-portuguese-cased.
     """
     super().__init__()
     self.tokenizer = DistilBertTokenizerFast.from_pretrained(
         'neuralmind/bert-base-portuguese-cased',
         model_max_length=512,
         do_lower_case=False)
     self.nlp = pipeline('ner',
                         model=model,
                         tokenizer=self.tokenizer,
                         grouped_entities=True)
def fine_tune_model(dir):

    train_texts, train_labels = read_imdb_split(dir + '/train/')
    test_texts, test_labels = read_imdb_split(dir + '/test/')
    train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)

    # Use the DistilBert tokenizer. To match a pretrained model we usually need the same
    # tokenization and numericalization as the model. Fortunately, the tokenizer classes from
    # transformers provide the correct preprocessing for each pretrained model.
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    # Pass truncation=True and padding=True, which ensures all sequences are padded to the same
    # length and truncated to no longer than the model's maximum input length.

    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True)
    test_encodings = tokenizer(test_texts, truncation=True, padding=True)

    # Now, let's turn our labels and encodings into a Dataset object. In PyTorch, this is done by
    # subclassing torch.utils.data.Dataset and implementing __len__ and __getitem__ (a sketch of
    # IMDbDataset follows this function).

    train_dataset = IMDbDataset(train_encodings, train_labels)
    val_dataset = IMDbDataset(val_encodings, val_labels)
    test_dataset = IMDbDataset(test_encodings, test_labels)

    training_args = TrainingArguments(
        output_dir='./results',  # output directory
        num_train_epochs=3,  # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=64,  # batch size for evaluation
        warmup_steps=500,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir='./logs',  # directory for storing logs
        logging_steps=10,
    )

    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=train_dataset,  # training dataset
        eval_dataset=val_dataset  # evaluation dataset
    )

    trainer.train()
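IMDbDataset is not shown in this example; a sketch of what it typically looks like, following the standard pattern the comment above describes (an assumption, not the project's actual class):

import torch

class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Convert one encoded example (and its label) to tensors.
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)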
Example #29
    def prepare_dataset(self):
        saved_path = os.path.join(self.spot_and_diff_dir, self.mode + '.pt')
        if os.path.exists(saved_path):
            print(self.mode, 'data exists, read from', saved_path)
        else:
            raw_dataset = []
            neg_dataset = []
            tokenizer = DistilBertTokenizerFast.from_pretrained(
                'distilbert-base-cased')
            with open(self.data_dir) as f:
                data = json.load(f)
                for i in range(len(data)):
                    idx = data[i]['img_id']
                    sentences = data[i]['sentences']
                    self.all_sentences += sentences

                    img_0, img_1 = img2tensor(self.img_dir, idx)
                    sample = {
                        'img_0': img_0,
                        'img_1': img_1,
                        'sentences': sentences,
                        'label': 1
                    }
                    raw_dataset.append(sample)
                    if i % 100 == 99:
                        print(i, '/', len(data))
                for i in range(len(data)):
                    # Here, we do shallow copy to avoid dict-level in-place modification
                    current_sample = {**raw_dataset[i]}
                    current_sample['sentences'] = current_sample['sentences'].copy()

                    if self.torch_bernoulli() >= .0:
                        negative_sample = self.augment_sample_sentences(
                            current_sample, n_replacement=1)
                        current_sample['label'] = 0
                        neg_dataset.append(current_sample)
                    if i % 100 == 99:
                        print(i, '/', len(data))

            new_dataset = raw_dataset + neg_dataset
            for r in new_dataset:
                r['sentences'] = text2tensor(r['sentences'], tokenizer)

            os.makedirs(self.spot_and_diff_dir, exist_ok=True)
            torch.save(new_dataset, saved_path)
            print('Saved to', saved_path)
            print(len(raw_dataset), 'positive samples')
            print(len(neg_dataset), 'negative samples')
Example #30
 def load_model(self):
     if self.model_name == 'distilbert-base-uncased':
         from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
         self.tokenizer = DistilBertTokenizerFast.from_pretrained(self.model_name)
         self.model = DistilBertForSequenceClassification.from_pretrained(self.model_name)
     elif self.model_name == 'distilbert-base-multilingual-cased':
         from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
         self.tokenizer = DistilBertTokenizerFast.from_pretrained(self.model_name)
         self.model = DistilBertForSequenceClassification.from_pretrained(self.model_name)
     elif self.model_name == 'bert-base-uncased':
         from transformers import BertTokenizerFast, BertForSequenceClassification
         self.tokenizer = BertTokenizerFast.from_pretrained(self.model_name)
         self.model = BertForSequenceClassification.from_pretrained(self.model_name)        
     elif self.model_name == 'bert-base-cased-finetuned-mrpc':
         from transformers import AutoTokenizer, AutoModelForSequenceClassification
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
         self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)
     elif self.model_name == 'bert-base-multilingual-cased':
         from transformers import BertTokenizerFast, BertForSequenceClassification
         self.tokenizer = BertTokenizerFast.from_pretrained(self.model_name)
         self.model = BertForSequenceClassification.from_pretrained(self.model_name)
     else:
         raise ValueError(f'Unrecognized model name: {self.model_name}')
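A hedged usage sketch; the owning class is assumed to store model_name before load_model is called:

# Hypothetical wrapper usage.
wrapper.model_name = 'distilbert-base-uncased'
wrapper.load_model()
enc = wrapper.tokenizer("hello world", return_tensors="pt")
logits = wrapper.model(**enc).logits  # (1, num_labels)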