Example No. 1
    def __init__(self,
                 model_directory: str,
                 predictor_name: str,
                 device="cuda") -> None:
        self.device = device
        self.config = RobertaConfig.from_pretrained(model_directory)
        # Load in model related information
        self._tokenizer = RobertaTokenizerFast.from_pretrained(
            model_directory, add_special_tokens=False)
        self._model = model = RobertaForSequenceClassification.from_pretrained(
            model_directory, config=self.config).to(device)
        self._model.eval()
        # Prepare optimizer
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
            },
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ]
            },
        ]
        self._optimizer = AdamW(optimizer_grouped_parameters)
        self._optimizer.load_state_dict(
            torch.load(os.path.join(model_directory, "optimizer.pt")))
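Note on the snippet above: as written, neither parameter group sets weight_decay, so splitting on no_decay has no effect. A minimal self-contained sketch of the usual pattern follows (model name, decay value, and learning rate are illustrative assumptions, not taken from the original):

from torch.optim import AdamW
from transformers import RobertaForSequenceClassification

model = RobertaForSequenceClassification.from_pretrained("roberta-base")
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        # regular weights get weight decay
        "params": [p for n, p in model.named_parameters()
                   if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.01,
    },
    {
        # biases and LayerNorm weights are excluded from weight decay
        "params": [p for n, p in model.named_parameters()
                   if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)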
Example No. 2
def tokenize(args):
    src, tgt = args

    if not os.path.exists(src):
        return

#     print(src, tgt)
#     return

    tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

    print("START", flush = True)
    with open(src, "r", encoding = "utf-8") as read_f:
        text = read_f.read()
    print(f"Read {src}", flush = True)

    tokens = tokenizer.tokenize(text)
    print(f"Tokenized {src}", flush = True)
    del text

    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    print(f"To Token IDs {src}", flush = True)

    with open(tgt, "wb") as dump_f:
        pickle.dump(token_ids, dump_f)
    print(f"Dump {tgt}", flush = True)
    print("END", flush = True)
Example No. 3
def build(image_set, args):

    img_dir = Path(args.vg_img_path)
    if image_set == "val":
        # We validate on the minival for efficiency
        image_set = "miniv"

    if image_set == "miniv":
        ann_file = Path(
            args.phrasecut_ann_path) / f"finetune_phrasecut_miniv.json"
        image_set = "val"
    else:
        ann_file = Path(
            args.phrasecut_ann_path) / f"finetune_phrasecut_{image_set}.json"

    if args.test:
        ann_file = Path(
            args.phrasecut_ann_path) / f"finetune_phrasecut_test.json"

    tokenizer = RobertaTokenizerFast.from_pretrained(args.text_encoder_type)
    dataset = PhrasecutDetection(
        img_dir,
        ann_file,
        transforms=make_coco_transforms(image_set, cautious=True),
        return_masks=args.masks,
        return_tokens=True,  # args.contrastive_align_loss,
        tokenizer=tokenizer,
    )
    return dataset
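A hypothetical invocation of this builder (every path and flag on the args namespace is an illustrative assumption; PhrasecutDetection and make_coco_transforms come from the surrounding project):

from argparse import Namespace

args = Namespace(
    vg_img_path="data/vg/images",
    phrasecut_ann_path="data/phrasecut/annotations",
    text_encoder_type="roberta-base",
    masks=True,
    test=False,
)
dataset = build("train", args)  # builds the PhraseCut training split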
Example No. 4
    def get_data(self, data, all_data=None, train=True, smoothing_label_factor=0.4):
        
        if all_data is None:
            all_data = data

        if train:
            self.tokenizer = RobertaTokenizerFast.from_pretrained('blinoff/roberta-base-russian-v0')
            self.encoder = OneHotEncoder()
            self.encoder.fit(np.array(all_data['class']).reshape(-1, 1))

        X = data['purp'].apply(lambda x: ' '.join(re.findall(r'[\w\d\+]+[\.,]*[\w\d\+]*', x)))
        result = self.tokenizer(list(X), padding='longest')

        tokens = np.array(result['input_ids'])
        attn_mask = np.array(result['attention_mask'])

        if train:
            self.padding_len = tokens.shape[1]
        else:
            tokens = pad_sequences(tokens, padding='post', maxlen=self.padding_len)
            attn_mask = pad_sequences(attn_mask, padding='post', maxlen=self.padding_len)

        y = self.encoder.transform(np.array(data['class']).reshape(-1, 1)).toarray()
        if train:
            y = smooth_labels(y, factor=smoothing_label_factor)

        return tokens, attn_mask, y
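The smooth_labels helper is not shown above; a common label-smoothing implementation, given here only as a sketch of what it might look like, spreads a fraction of the probability mass uniformly across classes:

import numpy as np

def smooth_labels(labels, factor=0.1):
    # labels: one-hot array of shape (n_samples, n_classes)
    labels = labels * (1.0 - factor)            # shrink the hot class
    labels = labels + factor / labels.shape[1]  # redistribute uniformly
    return labels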
Example No. 5
    def __init__(self, cfg, device):
        super().__init__()
        tokenizer = RobertaTokenizerFast.from_pretrained('./bird_bpe_vocab', max_len=256)
        _config = RobertaConfig(
            vocab_size=tokenizer._tokenizer.get_vocab_size(),
            hidden_size=512,
            num_hidden_layers=4,
            num_attention_heads=8,
            max_position_embeddings=256,
            pad_token_id=1,
            eos_token_id=0,
            bos_token_id=2,
            output_attentions=False,
            output_hidden_states=False
        )
        _model = RobertaForMaskedLM(_config)
        _model.load_state_dict(torch.load('bert_small/checkpoint-1100/pytorch_model.bin'))
        _model.eval()
        self.tokenizer = tokenizer
        self._model = _model
        self.device = device
        self.pad_token = 0
        self.batch_size = cfg.batch_size
        self.proj = None
        if cfg.proj_lang:
            self.proj = nn.Sequential(*[EqualisedLinearLayer(512, cfg.latent_dim, weight_scaling=cfg.weight_scaling), nn.Tanh()])
Example No. 6
def load_mask_predictor(model_name='roberta-large'):
    logger.info(
        "Downloading RoBERTa model from Hugging Face for Masked Text Prediction"
    )
    model = RobertaForMaskedLM.from_pretrained(model_name)
    tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
    device_number = torch.cuda.current_device() if torch.cuda.is_available(
    ) else -1
    predictor = FillMaskPipeline(model=model,
                                 tokenizer=tokenizer,
                                 device=device_number)

    def _postprocess_mask_prediction_token(text):
        return text[1:] if text[0] == "Ġ" else text

    def predict_mask(masked_text: str,
                     options: Optional[List[str]] = None,
                     num_results: int = 1) -> List[Dict[str, Any]]:

        results = predictor(masked_text, targets=options, top_k=num_results)

        parsed_results = []
        for result in results:
            parsed_result = {
                "word":
                _postprocess_mask_prediction_token(result['token_str']),
                "softmax": result["score"]
            }
            parsed_results.append(parsed_result)
        return parsed_results

    return predict_mask
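Usage sketch for the returned closure (the sentence and the number of results are illustrative; <mask> is RoBERTa's mask token):

predict_mask = load_mask_predictor("roberta-large")
predictions = predict_mask("The capital of France is <mask>.", num_results=3)
# each entry looks like {"word": ..., "softmax": ...}
for p in predictions:
    print(p["word"], p["softmax"])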
Example No. 7
def less_than_n_tokens(data, n):
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
    tokenizer.add_special_tokens(
        {"additional_special_tokens": [AddedToken('<skip>', lstrip=True), AddedToken('<no_skip>', lstrip=True)]})

    # splits_ratio = [1, 1, 0]
    splits_ratio = [1]
    splits = []
    for split, ratio in zip([data], splits_ratio):
        text = split['social_assesment'].tolist()[:int(len(split['social_assesment']) * ratio)]
        n_samples = len(text)
        if n_samples == 0:
            continue

        batch_size = 10000
        batch_idx = 0
        while batch_idx * batch_size < n_samples:
            batch_text = text[batch_idx * batch_size: min((batch_idx + 1) * batch_size, n_samples)]

            encoded_texts = tokenizer(batch_text, return_attention_mask=False, truncation=False, padding=False)['input_ids']
            # Collect DataFrame index labels (batch offset + position, assuming a default
            # RangeIndex) rather than batch-local positions, so rows beyond the first
            # batch are dropped correctly.
            greater_than_n_indices = []
            for text_idx, encoded_text in enumerate(encoded_texts):
                text_length = len(encoded_text)
                if text_length > n:
                    greater_than_n_indices.append(batch_idx * batch_size + text_idx)
            split = split.drop(greater_than_n_indices, axis=0)

            print('batch ' + str(batch_idx) + ' done.')
            batch_idx += 1

        splits.append(split)

    # Return once every split has been processed, not from inside the loop.
    return splits
Example No. 8
    def __init__(self, hyper_params: Namespace):
        super().__init__()
        print(hyper_params)
        self.model_name = hyper_params.pretrained_encoder
        self.lower_case = "uncased" in self.model_name

        if self.model_name.startswith("bert"):
            # BERT
            self.tokenizer = BertTokenizerFast.from_pretrained(
                self.model_name, do_lower_case=self.lower_case)
            self.pretrained_model = BertModel.from_pretrained(self.model_name)
        else:
            # RoBERTa
            self.tokenizer = RobertaTokenizerFast.from_pretrained(
                self.model_name)
            self.pretrained_model = RobertaModel.from_pretrained(
                self.model_name)
            self.CLS = "<s>"
            self.SEP = "</s>"

        # Add the new tag tokens to the tokenizer (applies to both BERT and RoBERTa)
        self.new_tags = [
            self.START_ARG, self.END_ARG, self.START_PRED, self.END_PRED
        ]
        n_new_tags = self.tokenizer.add_tokens(self.new_tags)
        assert len(
            self.new_tags) == n_new_tags, "Couldn't add all the new tokens!"
        self.pretrained_model.resize_token_embeddings(len(self.tokenizer))

        self.max_len = 100  #self.pretrained_model.embeddings.position_embeddings.weight.size(0)
        self.dim = self.pretrained_model.embeddings.position_embeddings.weight.size(
            1)
Example No. 9
    def __init__(self, model: str = None, service: str = "sentiment"):
        """
        Constructor to the class that does the Sentiment Analysis Processing in the back end
        :param model: Transformer model that will be used for sentiment analysis
        :param service: string to represent the service, this will be defaulted to sentiment
        """
        if model is None:
            model = "distilbert"
        # path to all the files that will be used for inference
        self.path = f"./app/api/{model}/"
        # json file for mapping of network output to the correct category
        self.mapping = self.path + "mapping.json"
        self.model_path = self.path + "model.bin"
        # Selecting the correct model based on the passed model input. Default distilbert
        if model == "roberta":
            self.model = RobertaClass()
            self.tokenizer = RobertaTokenizerFast.from_pretrained(self.path)
        elif model == "distilbert":
            self.model = DistilBertClass()
            self.tokenizer = DistilBertTokenizerFast.from_pretrained(self.path)
        else:
            self.model = DistilBertClass()
            self.tokenizer = DistilBertTokenizerFast.from_pretrained(self.path)

        self.model.eval()
        self.model.load_state_dict(
            torch.load(self.model_path, map_location=device))

        with open(self.mapping) as f:
            self.config = json.load(f)
Example No. 10
    def test_multiple_sequences(self):
        tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
        model = FlaxRobertaModel.from_pretrained("roberta-base")

        sequences = [
            "this is an example sentence", "this is another", "and a third one"
        ]
        encodings = tokenizer(sequences,
                              return_tensors=TensorType.JAX,
                              padding=True,
                              truncation=True)

        @jax.jit
        def model_jitted(input_ids, attention_mask=None, token_type_ids=None):
            return model(input_ids, attention_mask, token_type_ids)

        with self.subTest("JIT Disabled"):
            with jax.disable_jit():
                tokens, pooled = model_jitted(**encodings)
                self.assertEqual(tokens.shape, (3, 7, 768))
                self.assertEqual(pooled.shape, (3, 768))

        with self.subTest("JIT Enabled"):
            jitted_tokens, jitted_pooled = model_jitted(**encodings)

            self.assertEqual(jitted_tokens.shape, (3, 7, 768))
            self.assertEqual(jitted_pooled.shape, (3, 768))
Example No. 11
    def __init__(self, use_gpu=True, tokenizer=None):
        super().__init__()
        MODEL_NAME = 'iarfmoose/roberta-small-bulgarian-pos'
        if tokenizer:
            self.tokenizer = tokenizer
        else:
            self.tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)
        self.model = RobertaForTokenClassification.from_pretrained(MODEL_NAME)
        self.model.to(self.device)

        self.tag_to_id = {
            'ADJ': 0,
            'ADP': 1,
            'PUNCT': 2,
            'ADV': 3,
            'AUX': 4,
            'SYM': 5,
            'INTJ': 6,
            'CCONJ': 7,
            'X': 8,
            'NOUN': 9,
            'DET': 10,
            'PROPN': 11,
            'NUM': 12,
            'VERB': 13,
            'PART': 14,
            'PRON': 15,
            'SCONJ': 16
        }

        self.id_to_tag = {self.tag_to_id[tag]: tag for tag in self.tag_to_id}
Example No. 12
    def __init__(self, config: Dict, datapoints: List[Datapoint]):
        self.data = []
        base_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
        full_model_output_path = os.path.join(base_dir,
                                              config["model_output_path"])
        tokenizer = RobertaTokenizerFast.from_pretrained(
            config["tokenizer_path"],
            cache_dir=full_model_output_path,
            padding_side="right")
        for datapoint in datapoints:
            tokenized = tokenizer(datapoint.statement,
                                  padding="max_length",
                                  max_length=config["max_seq_len"],
                                  truncation=True,
                                  return_tensors="np",
                                  return_token_type_ids=True,
                                  return_attention_mask=True,
                                  return_special_tokens_mask=True)
            # Only a single encoding since only a single datapoint tokenized
            self.data.append({
                "ids": tokenized.data["input_ids"].squeeze(),
                "type_ids": tokenized.data["token_type_ids"].squeeze(),
                "attention_mask": tokenized.data["attention_mask"].squeeze(),
                "special_tokens_mask": tokenized.data["special_tokens_mask"].squeeze(),
                "label": np.array(int(datapoint.label))
            })
Example No. 13
    def test_from_pytorch(self):
        with torch.no_grad():
            with self.subTest("roberta-base"):
                tokenizer = RobertaTokenizerFast.from_pretrained(
                    "roberta-base")
                fx_model = FlaxRobertaModel.from_pretrained("roberta-base")
                pt_model = RobertaModel.from_pretrained("roberta-base")

                # Check for simple input
                pt_inputs = tokenizer.encode_plus(
                    "This is a simple input",
                    return_tensors=TensorType.PYTORCH)
                fx_inputs = tokenizer.encode_plus(
                    "This is a simple input", return_tensors=TensorType.JAX)
                pt_outputs = pt_model(**pt_inputs)
                fx_outputs = fx_model(**fx_inputs)

                self.assertEqual(
                    len(fx_outputs), len(pt_outputs),
                    "Output lengths differ between Flax and PyTorch")

                for fx_output, pt_output in zip(fx_outputs,
                                                pt_outputs.to_tuple()):
                    self.assert_almost_equals(fx_output, pt_output.numpy(),
                                              5e-3)
Example No. 14
def get_model_and_tokenizer(args, type='pattern'):
    if type == 'pattern':
        dropout = args.pattern_dropout
    elif type == 'classifier':
        dropout = args.classifier_dropout
    else:
        raise ValueError('"type" argument for "get_model_and_tokenizer" must be "pattern" or "classifier", not {}'.format(type))

    model, tokenizer = None, None
    if 'roberta' in args.model_name:
        tokenizer = RobertaTokenizerFast.from_pretrained(args.model_name)
        if args.model_type == 'sequence_classification':
            model = RobertaForSequenceClassification.from_pretrained(args.model_name,
                                                                     hidden_dropout_prob=dropout,
                                                                     attention_probs_dropout_prob=dropout,
                                                                     num_labels=args.num_labels)
        elif args.model_type == 'MLM':
            model = CompactRobertaForMaskedLM.from_pretrained(args.model_name,
                                                                  hidden_dropout_prob=dropout,
                                                                  attention_probs_dropout_prob=dropout)
        elif args.model_type == 'soft_label_classification':
            model = RobertaForSoftLabelSequenceClassification.from_pretrained(args.model_name,
                                                                     hidden_dropout_prob=dropout,
                                                                     attention_probs_dropout_prob=dropout,
                                                                     num_labels=args.num_labels)
    if model and args.eval:
        model = model.from_pretrained(args.model_dir)
    if model and tokenizer:
        model.resize_token_embeddings(len(tokenizer))
        return model, tokenizer
    raise Exception('no such model: name "{}", type "{}"'.format(args.model_name, args.model_type))
Example No. 15
def build(image_set, args):

    img_dir = Path(args.flickr_img_path) / f"{image_set}"

    if args.GT_type == "merged":
        identifier = "mergedGT"
    elif args.GT_type == "separate":
        identifier = "separateGT"
    else:
        assert False, f"{args.GT_type} is not a valid type of annotation for flickr"

    if args.test:
        ann_file = Path(
            args.flickr_ann_path) / f"final_flickr_{identifier}_test.json"
    else:
        ann_file = Path(args.flickr_ann_path
                        ) / f"final_flickr_{identifier}_{image_set}.json"

    tokenizer = RobertaTokenizerFast.from_pretrained(args.text_encoder_type)
    dataset = FlickrDetection(
        img_dir,
        ann_file,
        transforms=make_coco_transforms(image_set, cautious=True),
        return_masks=False,
        return_tokens=True,  # args.contrastive_align_loss,
        tokenizer=tokenizer,
        is_train=image_set == "train")
    return dataset
Example No. 16
def build(image_set, args):
    img_dir = Path(args.coco_path) / "train2014"

    refexp_dataset_name = args.refexp_dataset_name
    if refexp_dataset_name in ["refcoco", "refcoco+", "refcocog"]:
        if args.test:
            test_set = args.test_type
            ann_file = Path(args.refexp_ann_path) / f"finetune_{refexp_dataset_name}_{test_set}.json"
        else:
            ann_file = Path(args.refexp_ann_path) / f"finetune_{refexp_dataset_name}_{image_set}.json"
    elif refexp_dataset_name in ["all"]:
        ann_file = Path(args.refexp_ann_path) / f"final_refexp_{image_set}.json"
    else:
        assert False, f"{refexp_dataset_name} not a valid datasset name for refexp"

    tokenizer = RobertaTokenizerFast.from_pretrained(args.text_encoder_type)
    dataset = RefExpDetection(
        img_dir,
        ann_file,
        transforms=make_coco_transforms(image_set, cautious=True),
        return_masks=args.masks,
        return_tokens=True,
        tokenizer=tokenizer,
    )
    return dataset
Example No. 17
    def __init__(self, batch_size=256):
        self.input_ids_list = []
        self.attention_mask_list = []
        self.label_list = []
        self.batch_size = batch_size
        self.roberta_tokenizer = RobertaTokenizerFast.from_pretrained(
            "roberta-base")
Example No. 18
def run():
    #load and prepare data
    train, test = load_data()
    train, test = prepare_input(train), prepare_input(test, True)

    #train-test split
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        list(train["input"].values),
        list(train["label_numeric"].values),
        test_size=.2,
        random_state=5)

    #tokenize and train
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-large')
    train_encodings, val_encodings = tokenize_data(train_texts,
                                                   tokenizer), tokenize_data(
                                                       val_texts, tokenizer)
    model = start_train(train_encodings, train_labels, val_encodings,
                        val_labels)

    #validate and predict on test and write test output
    validate_model(model, tokenizer, val_texts, val_labels)
    predict_on_test(model, tokenizer, test)

    #save model
    model.save_pretrained("data/roberta_model")
Example No. 19
    def __init__(self, use_gpu=True, tokenizer=None):
        super().__init__()
        MODEL_NAME = 'iarfmoose/roberta-small-bulgarian-ner'
        if tokenizer:
            self.tokenizer = tokenizer
        else:
            self.tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)

        self.model = RobertaForTokenClassification.from_pretrained(MODEL_NAME)
        self.model.to(self.device)

        self.tag_to_id = {
            'O': 0,
            'I-PRO': 1,
            'I-PER': 2,
            'I-ORG': 3,
            'I-LOC': 4,
            'I-EVT': 5,
            'B-PRO': 6,
            'B-PER': 7,
            'B-ORG': 8,
            'B-LOC': 9,
            'B-EVT': 10
        }

        self.id_to_tag = {self.tag_to_id[tag]: tag for tag in self.tag_to_id}
Example No. 20
def tokenize(data_x, name, length):
    print(f"Started tokenizer for {name} at length {length}")
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
    mm = np.memmap(f'vectors/{name}_{length}.mm',
                   dtype='int32',
                   mode='w+',
                   shape=(len(data_x), 3, length))
    # Runs in batches of 1000 due to memory; the slow tokenizer has issues with some
    # files, which makes it take hours even though batching isn't otherwise needed.
    # Ceiling division so a trailing partial batch isn't silently dropped.
    for i in range((len(data_x) + 999) // 1000):
        tokens = tokenizer.batch_encode_plus(data_x[(i * 1000):((i + 1) *
                                                                1000)],
                                             add_special_tokens=True,
                                             pad_to_max_length=True,
                                             truncation=True,
                                             max_length=length,
                                             return_attention_mask=True,
                                             return_token_type_ids=True,
                                             return_tensors='np')
        mm[(i * 1000):((i + 1) * 1000),
           0, :] = np.array(tokens.get('input_ids'))
        mm[(i * 1000):((i + 1) * 1000),
           1, :] = np.array(tokens.get('attention_mask'))
        mm[(i * 1000):((i + 1) * 1000),
           2, :] = np.array(tokens.get('token_type_ids'))
    print(f"Finished tokenizer for {name} at length {length}")
Example No. 21
def convert_to_long_model(model_name, tokenizer_name, save_model_to,
                          attention_window, max_pos):
    """
    Starting from the roberta-base checkpoint, the following function converts it into an instance 
    of RobertaLong.

    Args:
        save_model_to (str): path to output dir
        attention_window (int): 
        max_pos (int): max model position before adding extra 2 tokens for roberta models

    Returns:
        transformers.RobertaForMaskedLM: RoBERTa model with LM head on top
    """
    model = RobertaForMaskedLM.from_pretrained(model_name)
    tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_name,
                                                     model_max_length=max_pos)
    config = model.config

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.roberta.embeddings.position_embeddings.weight.shape
    max_pos += 2  # NOTE: RoBERTa has positions 0,1 reserved, so embedding size is max position + 2
    config.max_position_embeddings = max_pos
    assert max_pos > current_max_pos

    # allocate a larger position embedding matrix
    new_pos_embed = model.roberta.embeddings.position_embeddings.weight.new_empty(
        max_pos, embed_size)

    # copy position embeddings over and over to initialize the new position embeddings
    # (this assumes max_pos - 2 is a multiple of current_max_pos - 2; otherwise the
    # final chunk assignment fails with a shape mismatch)
    k = 2
    step = current_max_pos - 2
    while k < max_pos - 1:
        new_pos_embed[k:(k + step)] = model.roberta.embeddings.position_embeddings.weight[2:]
        k += step
    model.roberta.embeddings.position_embeddings.weight.data = new_pos_embed

    # replace each layer's self-attention with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.roberta.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = layer.attention.self.query
        longformer_self_attn.key = layer.attention.self.key
        longformer_self_attn.value = layer.attention.self.value

        longformer_self_attn.query_global = layer.attention.self.query
        longformer_self_attn.key_global = layer.attention.self.key
        longformer_self_attn.value_global = layer.attention.self.value

        layer.attention.self = longformer_self_attn

    logger.info(f'      saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer
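A hedged example of calling the conversion (checkpoint names, output path, window size, and max_pos are illustrative; a 4096 max length with a 512-token window mirrors the usual Longformer setup):

model, tokenizer = convert_to_long_model(
    model_name="roberta-base",
    tokenizer_name="roberta-base",
    save_model_to="tmp/roberta-base-4096",
    attention_window=512,
    max_pos=4096,
)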
Example No. 22
def getTokenizer(model_name):
    if 'roberta' in model_name:
        return RobertaTokenizerFast.from_pretrained(model_name,
                                                    add_prefix_space=False)
    elif model_name.startswith('bert'):
        return BertTokenizerFast.from_pretrained(model_name,
                                                 add_prefix_space=False)
    elif 'bart' in model_name:
        return RobertaTokenizerFast.from_pretrained(
            'roberta-large', add_prefix_space=False
        )  #check https://github.com/huggingface/transformers/blob/68e19f1c228c92d5d800533f558faff24b57127a/src/transformers/tokenization_bart.py#L27
    elif 'electra' in model_name:
        return ElectraTokenizerFast.from_pretrained(model_name,
                                                    add_prefix_space=False)
    else:
        return AutoTokenizer.from_pretrained(model_name,
                                             add_prefix_space=False)
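Usage sketch (the model name and sample text are illustrative assumptions):

tokenizer = getTokenizer('roberta-base')
print(tokenizer.tokenize('RoBERTa tokenizers split text with byte-level BPE.'))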
Example No. 23
    @classmethod
    def setUpClass(cls):
        # setUpClass must be a classmethod; unittest calls it on the class, not an instance.
        cls.tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
        cls.dataset = pd.DataFrame.from_dict({
            "question": ["question 0", "question 1"],
            "passage": ["passage 0", "passage 1"],
            "idx": [0, 1],
            "label": [True, False],
        })
        cls.max_seq_len = 4
Example No. 24
def build(dataset_file, image_set, args):

    if dataset_file == "clevr_question":
        if args.clevr_variant == "humans":
            assert args.no_detection, "CLEVR-Humans doesn't have boxes, please disable detection"
            im_set = image_set
            if args.test:
                im_set = "test"
            ann_file = Path(
                args.clevr_ann_path) / f"CLEVR-Humans-{im_set}.json"
            img_dir = Path(args.clevr_img_path) / f"{im_set}"
            image_set = "train" if im_set == "train" else "val"
        elif args.clevr_variant == "cogent":
            assert image_set != "train", "Please train CoGenT with 'clevr' dataset, not 'clevr_question'"
            im_set = args.cogent_set
            ann_file = Path(
                args.clevr_ann_path) / f"CLEVR_{im_set}_questions.json"
            img_dir = Path(args.clevr_img_path) / f"{im_set}"
            image_set = "train" if im_set == "train" else "val"
        elif args.clevr_variant == "normal":
            im_set = image_set
            if args.test:
                im_set = "test"

            ann_file = Path(
                args.clevr_ann_path) / f"CLEVR_{im_set}_questions.json"
            img_dir = Path(args.clevr_img_path) / f"{im_set}"
            image_set = "train" if im_set == "train" else "val"
        else:
            assert False, f"Unknown clevr variant {args.clevr_variant}"
        print("loading ", img_dir, ann_file)
        return ClevrQuestion(
            img_dir,
            ann_file,
            transforms=make_clevr_transforms(image_set, cautious=True),
        )
    tokenizer = RobertaTokenizerFast.from_pretrained(args.text_encoder_type)

    img_dir = Path(args.clevr_img_path) / f"{image_set}"
    ann_file = Path(args.clevr_ann_path) / f"{image_set}.json"

    if args.clevr_variant == "cogent":
        im_set = "trainA" if image_set == "train" else "valA"
        img_dir = Path(args.clevr_img_path) / f"{image_set}A"

    dataset = ClevrDetection(
        img_dir,
        ann_file,
        transforms=make_clevr_transforms(image_set, cautious=True),
        return_masks=False,
        return_tokens=True,
        tokenizer=tokenizer,
        do_qa=args.do_qa,
    )

    return dataset
Example No. 25
    def __init__(self, path_to_model):
        # model:
        self.model = RobertaForClaimDetection(n_classes=2, unfreeze=False)
        checkpoint = torch.load(path_to_model)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.eval()
        self.model.to(DEVICE)

        # tokenizer:
        self.tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
Example No. 26
    def get_fast_tokenizer(self):
        if 'roberta' in self.bert_name:
            tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case=True)
        elif 'xlnet' in self.bert_name:
            tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
        else:
            tokenizer = BertWordPieceTokenizer(
                "data/.bert-base-uncased-vocab.txt",
                lowercase=True)
        return tokenizer
Example No. 27
def get_tokenizer(vocab_size):
    pretrained_tokenizer_path = Path(
        'experiments/tokenizers') / f'{tokenizer_type}-{vocab_size}'
    logger.info(
        f'loading {tokenizer_type}-{vocab_size} tokenizer from {pretrained_tokenizer_path}'
    )
    if transformer_type == 'roberta':
        return RobertaTokenizerFast.from_pretrained(
            str(pretrained_tokenizer_path), max_len=512)
    return BertTokenizerFast.from_pretrained(str(pretrained_tokenizer_path),
                                             max_len=512)
Example No. 28
    def __init__(self, config: Bunch) -> None:
        pl.LightningModule.__init__(self)
        self.config = config
        self.model = RobertaForSequenceClassification.from_pretrained(
            config.pretrained_model)
        roberta_tokenizer = RobertaTokenizerFast.from_pretrained(
            self.config.pretrained_model)
        tokenizer = PreTrainedTokenizer(roberta_tokenizer,
                                        self.config.max_tokens_per_tweet)
        self.data_processor = DataProcessor(config, tokenizer)
        self.loss = CrossEntropyLoss()
Example No. 29
    def __init__(self, use_gpu=True):
        tokenizer = RobertaTokenizerFast.from_pretrained(
            'iarfmoose/roberta-small-bulgarian')
        self.pos_tagger = POSTagger(use_gpu=use_gpu, tokenizer=tokenizer)
        self.ner_tagger = NERTagger(use_gpu=use_gpu, tokenizer=tokenizer)
        self.entity_types = {
            'PRO': 'PRODUCT',
            'PER': 'PERSON',
            'ORG': 'ORGANISATION',
            'LOC': 'LOCATION',
            'EVT': 'EVENT'
        }
Example No. 30
def test_featurize():
    """Test that RxnFeaturizer.featurize() correctly featurizes the reactions,
    correctly outputs the input_ids and attention_mask.
    """
    from transformers import RobertaTokenizerFast
    from deepchem.feat.reaction_featurizer import RxnFeaturizer
    tokenizer = RobertaTokenizerFast.from_pretrained(
        "seyonec/PubChem10M_SMILES_BPE_450k")
    featurizer = RxnFeaturizer(tokenizer, sep_reagent=True)
    reaction = ['CCS(=O)(=O)Cl.OCCBr>CCN(CC)CC.CCOCC>CCS(=O)(=O)OCCBr']
    feats = featurizer.featurize(reaction)
    assert feats.shape == (1, 2, 2, 1)