Code example #1
def tokenize_and_cache_data(data_dir,
                            output_dir,
                            tokenizer=None,
                            tokenizer_path=None,
                            n_sentences=0,
                            use_overflow=False,
                            two_segments=True,
                            delete_existing=False,
                            max_length=512):

    if not tokenizer:
        tokenizer = BertWordPieceTokenizer(tokenizer_path)

    tokenizer.enable_truncation(max_length=max_length)
    tokenizer.enable_padding(max_length=max_length)

    num_tokens = 0
    num_examples = 0

    if delete_existing:
        rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    pbar = tqdm(os.listdir(data_dir))
    for path in pbar:
        result = process_one_file(data_dir, path, tokenizer, output_dir,
                                  n_sentences, use_overflow, two_segments)
        num_examples += result['num_examples']
        num_tokens += result['num_tokens']

        pbar.set_description(
            f"{num_tokens} tokens, {num_examples} examples, "
            f"{num_tokens / (num_examples * max_length):.2%} non-pad tokens"
        )
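For reference, a minimal self-contained sketch of the truncation/padding setup used above; the vocab path is a placeholder, and the keyword accepted by enable_padding differs between tokenizers versions (length vs. max_length):

from tokenizers import BertWordPieceTokenizer

tok = BertWordPieceTokenizer("vocab.txt", lowercase=True)  # placeholder vocab file
tok.enable_truncation(max_length=512)
tok.enable_padding(length=512)  # older tokenizers versions use max_length= instead

enc = tok.encode("A short sentence.")
print(len(enc.ids))             # 512 after padding
print(sum(enc.attention_mask))  # number of non-pad tokens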
Code example #2
def get_transformer_tokenizer(vocab_path, max_tokens, device="cpu"):
    """
    Return a tokenizer to be used with Transformer-based models
    """
    wp_tokenizer = BertWordPieceTokenizer(vocab_path, lowercase=True)
    wp_tokenizer.enable_padding(direction="right", pad_type_id=1)
    wp_tokenizer.enable_truncation(max_tokens)
    return TransformerSquadTokenizer(wp_tokenizer, device=device)
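A quick, hedged sketch of what the padding options above do for a question/context pair; the vocab path and the added length argument are illustrative assumptions, and TransformerSquadTokenizer is project-specific and not shown:

from tokenizers import BertWordPieceTokenizer

wp = BertWordPieceTokenizer("vocab.txt", lowercase=True)        # placeholder vocab file
wp.enable_padding(direction="right", pad_type_id=1, length=32)  # length added for illustration
wp.enable_truncation(32)

enc = wp.encode("Who wrote it?", "The paper was written in 2019.")
print(enc.type_ids)  # 0s for the question segment, 1s for the context and padding positions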
Code example #3
    def __init__(self, tokenizer: AutoTokenizer, file_path: str, args):
        print(file_path)
        assert os.path.isfile(file_path)

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, args.bert_model_type + "_cached_mlm_" + filename)

        if os.path.exists(cached_features_file):
            print("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.samples = torch.load(handle)
        else:
            print("Creating features from dataset file at %s", directory)

            # Get the faster tokenizer from tokenizers package
            tokenizer.save_vocabulary(vocab_path='.')
            fast_tokenizer = BertWordPieceTokenizer("vocab.txt",
                                                    lowercase=args.lowercase)
            fast_tokenizer.enable_truncation(tokenizer.max_len)
            fast_tokenizer.enable_padding(max_length=tokenizer.max_len,
                                          pad_token=tokenizer.pad_token)

            self.samples = []

            # Load data over here
            df = pd.read_json(file_path)
            print('SQUAD data: ')

            for _, row in tqdm(df.iterrows(), total=df.shape[0]):
                for paragraph in row['data']['paragraphs']:
                    context = paragraph['context']
                    for qa_pair in paragraph['qas']:
                        question = qa_pair['question']

                        batch = fast_tokenizer.encode(question, context)
                        self.samples.append({
                            'input_ids':
                            batch.ids,
                            'attention_mask':
                            batch.attention_mask
                        })

                        for encoding in batch.overflowing:
                            self.samples.append({
                                'input_ids':
                                encoding.ids,
                                'attention_mask':
                                encoding.attention_mask
                            })

            df = None

            print("Saving features into cached file: ", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                torch.save(self.samples,
                           handle,
                           pickle_protocol=pickle.HIGHEST_PROTOCOL)
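A minimal sketch of the pair encoding and overflow handling the dataset above relies on (vocab path and texts are placeholders); once truncation is enabled, the tokenizers library exposes the cut-off remainder through Encoding.overflowing:

from tokenizers import BertWordPieceTokenizer

fast_tokenizer = BertWordPieceTokenizer("vocab.txt", lowercase=True)  # placeholder vocab file
fast_tokenizer.enable_truncation(max_length=64)
fast_tokenizer.enable_padding(length=64)

question = "What is the capital of France?"
context = "Paris is the capital of France. " * 20  # long enough to overflow

batch = fast_tokenizer.encode(question, context)
samples = [{"input_ids": batch.ids, "attention_mask": batch.attention_mask}]
for encoding in batch.overflowing:  # the truncated-away remainder, one window at a time
    samples.append({"input_ids": encoding.ids, "attention_mask": encoding.attention_mask})
print(len(samples))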
Code example #4
def tokenize(texts: pd.Series,
             tokenizer: BertWordPieceTokenizer,
             chunk_size: int = 240,
             maxlen: int = 512) -> np.array:
    '''Tokenize input text, return in a form of array'''
    tokenizer.enable_truncation(max_length=maxlen)
    try:
        tokenizer.enable_padding(max_length=maxlen)
    except TypeError:
        tokenizer.enable_padding(length=maxlen)
    all_ids = []

    for i in range(0, len(texts), chunk_size):
        text_chunk = texts[i:i + chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])

    return np.array(all_ids)
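A small usage sketch for the helper above; the vocab path and texts are placeholders:

import pandas as pd
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer("vocab.txt", lowercase=True)  # placeholder vocab file
texts = pd.Series(["first comment", "a second, slightly longer comment"])
ids = tokenize(texts, tokenizer, chunk_size=2, maxlen=128)
print(ids.shape)  # (2, 128): every row is padded/truncated to maxlen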
Code example #5
File: _dataset.py  Project: shaharazulay/dna-bert
class BERT16SDataset(Dataset):
	"""
	A torch Dataset class designed to load 16S data from a TSV file and encode it for BERT.
	:param vocab_path: str, path to the pre-trained bert tokenizer vocab file.
	:param data_path: str, path to the 16S data file.
	:param block_size: int, maximal BERT input length (an encoded sample will be padded to this length if too short)
	:param max_word_length: int, the maximal word length the tokenizer can encode.
	"""
	def __init__(self, vocab_path: str, data_path: str, block_size=512, max_word_length=100):

		assert os.path.isfile(data_path)
		assert os.path.isfile(vocab_path)

		_logger.info(f"Loading BERT tokenizer using vocab file {vocab_path}")
		self.tokenizer = BertWordPieceTokenizer(
			vocab_path,
			handle_chinese_chars=False,
			lowercase=False)
		self.tokenizer.enable_truncation(block_size)
		self.tokenizer.enable_padding(max_length=block_size)

		_logger.info(f"Loading 16S dataset file at {data_path}...")
		self._16s_corpus_df = pd.read_csv(data_path, sep='\t')
		_logger.info(f"16S corpus is of shape {self._16s_corpus_df.shape}")

		self.samples = self._16s_corpus_df.seq.values.tolist()
		self.max_word_length = max_word_length

	def __len__(self):
		return len(self._16s_corpus_df)

	def __getitem__(self, i):
		sample = self._split_sequence_by_max_word_length(self.samples[i])
		tokens = self.tokenizer.encode(sample)
		return torch.tensor(tokens.ids, dtype=torch.long)

	def _split_sequence_by_max_word_length(self, seq):
		"""
		Split a 16S sequence (usually ~1K bases long) into whitespace-separated chunks that the tokenizer can encode.
		:param seq: str, 16S sequence
		:return: str
		"""
		chunks = [seq[i: i + self.max_word_length] for i in range(0, len(seq), self.max_word_length)]
		return ' '.join(chunks)
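A hedged usage sketch; the file paths are placeholders and the TSV is assumed to have a seq column, as the class above expects:

from torch.utils.data import DataLoader

dataset = BERT16SDataset("vocab.txt", "16s_sequences.tsv", block_size=512, max_word_length=100)
loader = DataLoader(dataset, batch_size=8, shuffle=True)
for batch in loader:
    print(batch.shape)  # (8, 512): padded token id tensors
    break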
Code example #6
class Tokenizer:
    def __init__(self, lang):
        """
        A Tokenizer class to load and train a custom tokenizer
        Using the Hugging Face tokenization library for the same
        """
        self.tokenizer_dir = r"data/{}".format(lang)
        if not os.path.exists(self.tokenizer_dir):
            os.mkdir(self.tokenizer_dir)
        self.vocab = self.tokenizer_dir + "/vocab.txt"
        if os.path.exists(self.vocab):
            print("Initialized tokenizer using cached vocab file {}".format(self.vocab))
            self.tokenizer = BertWordPieceTokenizer(vocab_file=self.vocab)
        else:
            self.tokenizer = BertWordPieceTokenizer()

        self.tokenizer.enable_padding(max_length=MAX_LENGTH)
        self.tokenizer.enable_truncation(max_length=MAX_LENGTH)

    def train_tokenizer(self, sentences):
        """
        Train a tokenizer with a list of sentences
        """

        if not os.path.exists(self.vocab):
            print("Training tokenizer for {}".format(self.tokenizer_dir))
            # The Hugging Face tokenizer trains from files, so write the sentences to a temporary file
            with open(self.tokenizer_dir + "/data.txt", "w+", encoding="utf-8") as f:
                for sentence in sentences:
                    f.write(sentence + "\n")
            self.tokenizer.train([self.tokenizer_dir + "/data.txt"])
            self.tokenizer.save(self.tokenizer_dir)
            print("Trained a tokenizer with vocab size {}".format(self.tokenizer.get_vocab_size()))

            # Removing the temp file
            os.remove(self.tokenizer_dir + "/data.txt")

    def encode(self, decoded):
        return self.tokenizer.encode(decoded)

    def decode(self, encoded):
        return self.tokenizer.decode_batch(encoded)
Code example #7
def get_preds(list_of_texts):
    transformer_layer = (transformers.TFDistilBertModel.from_pretrained(
        'distilbert-base-multilingual-cased'))

    model = build_model(transformer_layer, max_len=MAX_LEN)
    model.load_weights('model/weights')

    #model = tf.keras.models.load_model('model')

    print('weights loaded')

    tokenizer = transformers.DistilBertTokenizer.from_pretrained(
        'distilbert-base-multilingual-cased')
    tokenizer.save_pretrained('.')
    # Reload it with the huggingface tokenizers library
    fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)

    fast_tokenizer.enable_truncation(max_length=MAX_LEN)
    fast_tokenizer.enable_padding(length=MAX_LEN)

    all_ids = []
    encs = fast_tokenizer.encode_batch(list_of_texts)
    all_ids.extend([enc.ids for enc in encs])

    all_ids = np.array(all_ids).astype(np.float32)

    to_predict = create_test(all_ids)

    predictions = model.predict(to_predict)
    #print(predictions*10)

    for prediction in predictions:
        print(prediction)

    dic = {'predictions': predictions}

    parsed = []
    # TODO: convert predictions to a JSON response, e.g.:
    #   response = pd.DataFrame(dic)
    #   parsed = response.to_json(orient='columns')
    return parsed, predictions
Code example #8
def train(args, rep):
    # Set random seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Rename output dir based on arguments
    if args.output_dir == "":
        cwd = os.getcwd()
        base = args.model_name_or_path.split("/")[-1]
        model_type = "_example" if args.example else "_linear"
        data_path = '_' + '_'.join(
            args.train_data_path.split("/")[-2:]).replace(".csv", "")
        mlm_on = "_mlmtrain" if args.mlm_data_path == "" or args.mlm_data_path == args.train_data_path else "_mlmfull"
        mlm_pre = "_mlmpre" if args.mlm_pre else ""
        mlm_dur = "_mlmdur" if args.mlm_during else ""
        observer = "_observer" if args.use_observers else ""
        name = base + model_type + data_path + mlm_on + mlm_pre + mlm_dur + observer + "_v{}".format(
            rep)
        args.output_dir = os.path.join(cwd, "checkpoints", name)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    elif args.num_epochs == 0:
        pass
    else:
        raise Exception("Directory {} already exists".format(args.output_dir))
        #pass

    json.dump(args.__dict__,
              open(os.path.join(args.output_dir, 'args.json'), "w+"))

    # Save args
    torch.save(args, os.path.join(args.output_dir, "run_args"))

    # Configure tensorboard writer
    tb_writer = SummaryWriter(log_dir=args.output_dir)

    # Configure tokenizer
    token_vocab_name = os.path.basename(args.token_vocab_path).replace(
        ".txt", "")
    tokenizer = BertWordPieceTokenizer(args.token_vocab_path,
                                       lowercase=args.do_lowercase)
    tokenizer.enable_padding(max_length=args.max_seq_length)
    tokenizer.save(args.output_dir + "/tokenizer")

    # Data readers
    if args.task == "intent":
        dataset_initializer = IntentDataset
    elif args.task == "slot":
        if 'taskmaster' in args.train_data_path:
            dataset_initializer = TMSlotDataset
        else:
            dataset_initializer = SlotDataset
    elif args.task == "response":
        dataset_initializer = ResponseSelectionDataset
    elif args.task == "dst":
        dataset_initializer = StateTrackingDataset
    elif args.task == "top":
        dataset_initializer = TOPDataset
    else:
        raise ValueError("Not a valid task type: {}".format(args.task))

    train_dataset = dataset_initializer(args.train_data_path, tokenizer,
                                        args.max_seq_length, token_vocab_name)

    if args.mlm_data_path != '':
        mlm_dataset = dataset_initializer(args.mlm_data_path, tokenizer,
                                          args.max_seq_length,
                                          token_vocab_name)
    else:
        mlm_dataset = train_dataset

    val_dataset = dataset_initializer(
        args.val_data_path, tokenizer, 512,
        token_vocab_name) if args.val_data_path else None

    test_dataset = dataset_initializer(args.test_data_path, tokenizer, 512,
                                       token_vocab_name)

    # Data loaders
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=args.train_batch_size,
                                  shuffle=True,
                                  pin_memory=True)

    mlm_dataloader = DataLoader(dataset=mlm_dataset,
                                batch_size=args.train_batch_size,
                                shuffle=True,
                                pin_memory=True)

    val_dataloader = DataLoader(dataset=val_dataset,
                                batch_size=1,
                                pin_memory=True) if val_dataset else None

    test_dataloader = DataLoader(dataset=test_dataset,
                                 batch_size=1,
                                 pin_memory=True)

    # Load model
    if args.task == "intent":
        if args.example:
            model = ExampleIntentBertModel(
                args.model_name_or_path,
                dropout=args.dropout,
                num_intent_labels=len(train_dataset.intent_label_to_idx),
                use_observers=args.use_observers)
        else:
            model = IntentBertModel(args.model_name_or_path,
                                    dropout=args.dropout,
                                    num_intent_labels=len(
                                        train_dataset.intent_label_to_idx),
                                    use_observers=args.use_observers)
    elif args.task == "slot":
        if args.example:
            model = ExampleSlotBertModel(args.model_name_or_path,
                                         dropout=args.dropout,
                                         num_slot_labels=len(
                                             train_dataset.slot_label_to_idx),
                                         use_observers=args.use_observers)
        else:
            model = SlotBertModel(args.model_name_or_path,
                                  dropout=args.dropout,
                                  num_slot_labels=len(
                                      train_dataset.slot_label_to_idx),
                                  use_observers=args.use_observers)
    elif args.task == "response":
        model = ResponseSelectionBertModel(args.model_name_or_path,
                                           dropout=args.dropout)
    elif args.task == "dst":
        model = StateTrackingBertModel(
            args.model_name_or_path,
            dropout=args.dropout,
            num_slot_labels=train_dataset.slot_lengths)
    elif args.task == "top":
        if args.example:
            model = ExampleJointSlotIntentBertModel(
                args.model_name_or_path,
                dropout=args.dropout,
                num_intent_labels=len(train_dataset.intent_label_to_idx),
                num_slot_labels=len(train_dataset.slot_label_to_idx))
        else:
            model = JointSlotIntentBertModel(
                args.model_name_or_path,
                dropout=args.dropout,
                num_intent_labels=len(train_dataset.intent_label_to_idx),
                num_slot_labels=len(train_dataset.slot_label_to_idx))
    else:
        raise ValueError("Cannot instantiate model for task: {}".format(
            args.task))

    if torch.cuda.is_available():
        model.to(args.device)

    if args.mlm_pre or args.mlm_during:
        pre_model = BertPretrain(args.model_name_or_path)
        mlm_optimizer = AdamW(pre_model.parameters(),
                              lr=args.learning_rate,
                              eps=args.adam_epsilon)
        if torch.cuda.is_available():
            pre_model.to(args.device)

    # MLM Pre-train
    if args.mlm_pre and args.num_epochs > 0:
        # Maintain most recent score per label.
        for epoch in trange(3, desc="Pre-train Epochs"):
            pre_model.train()
            epoch_loss = 0
            num_batches = 0
            for batch in tqdm(mlm_dataloader):
                num_batches += 1

                # Train model
                if "input_ids" in batch:
                    inputs, labels = mask_tokens(batch["input_ids"].cuda(),
                                                 tokenizer)
                else:
                    inputs, labels = mask_tokens(batch["ctx_input_ids"].cuda(),
                                                 tokenizer)

                loss = pre_model(inputs, labels)
                if args.grad_accum > 1:
                    loss = loss / args.grad_accum
                loss.backward()
                epoch_loss += loss.item()

                if args.grad_accum <= 1 or num_batches % args.grad_accum == 0:
                    if args.max_grad_norm > 0:
                        torch.nn.utils.clip_grad_norm_(pre_model.parameters(),
                                                       args.max_grad_norm)

                    mlm_optimizer.step()
                    pre_model.zero_grad()

            LOGGER.info("Epoch loss: {}".format(epoch_loss / num_batches))

        # Transfer BERT weights
        model.bert_model = pre_model.bert_model.bert

    # Train
    optimizer = AdamW(model.parameters(),
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    global_step = 0
    metrics_to_log = {}
    best_score = -1
    patience = 0
    for epoch in trange(args.num_epochs, desc="Epoch"):
        model.train()
        epoch_loss = 0
        num_batches = 0

        if args.task == "top" and args.example:
            # Pre-fill cache but don't return anything
            retrieve_examples(train_dataset, None, None, task="top")

        for batch in tqdm(train_dataloader):
            num_batches += 1
            global_step += 1

            # Transfer to gpu
            if torch.cuda.is_available():
                for key, val in batch.items():
                    if type(batch[key]) is list:
                        continue

                    batch[key] = batch[key].to(args.device)

            # Train model
            if args.task == "intent":
                if args.example:
                    examples = retrieve_examples(train_dataset,
                                                 batch["intent_label"],
                                                 batch["ind"],
                                                 task="intent")

                    _, intent_loss = model(
                        input_ids=batch["input_ids"],
                        attention_mask=batch["attention_mask"],
                        token_type_ids=batch["token_type_ids"],
                        intent_label=batch["intent_label"],
                        example_input=examples["input_ids"],
                        example_mask=examples["attention_mask"],
                        example_token_types=examples["token_type_ids"],
                        example_intents=examples["intent_label"])
                else:
                    _, intent_loss = model(
                        input_ids=batch["input_ids"],
                        attention_mask=batch["attention_mask"],
                        token_type_ids=batch["token_type_ids"],
                        intent_label=batch["intent_label"])
                if args.grad_accum > 1:
                    intent_loss = intent_loss / args.grad_accum
                intent_loss.backward()
                epoch_loss += intent_loss.item()
            elif args.task == "slot":
                if args.example:
                    examples = retrieve_examples(train_dataset,
                                                 batch["slot_labels"],
                                                 batch["ind"],
                                                 task="slot",
                                                 num=64)

                    _, slot_loss = model(
                        input_ids=batch["input_ids"],
                        attention_mask=batch["attention_mask"],
                        token_type_ids=batch["token_type_ids"],
                        slot_labels=batch["slot_labels"],
                        example_word_inds=examples["word_ind"],
                        example_input=examples["input_ids"],
                        example_mask=examples["attention_mask"],
                        example_token_types=examples["token_type_ids"],
                        example_slots=examples["slot_labels"])
                else:
                    _, slot_loss = model(
                        input_ids=batch["input_ids"],
                        attention_mask=batch["attention_mask"],
                        token_type_ids=batch["token_type_ids"],
                        slot_labels=batch["slot_labels"])
                if args.grad_accum > 1:
                    slot_loss = slot_loss / args.grad_accum
                slot_loss.backward()
                epoch_loss += slot_loss.item()
            elif args.task == "response":
                resp_loss = model(
                    ctx_input_ids=batch["ctx_input_ids"],
                    ctx_attention_mask=batch["ctx_attention_mask"],
                    ctx_token_type_ids=batch["ctx_token_type_ids"],
                    rsp_input_ids=batch["rsp_input_ids"],
                    rsp_attention_mask=batch["rsp_attention_mask"],
                    rsp_token_type_ids=batch["rsp_token_type_ids"])
                resp_loss.backward()
                epoch_loss += resp_loss.item()
            elif args.task == "dst":
                _, state_loss = model(input_ids=batch["input_ids"],
                                      attention_mask=batch["attention_mask"],
                                      token_type_ids=batch["token_type_ids"],
                                      state_label=batch["state_label"])
                state_loss.backward()
                epoch_loss += state_loss.item()
            elif args.task == "top":
                if args.example:
                    # Get intent examples
                    intent_examples = retrieve_examples(train_dataset,
                                                        batch["intent_label"],
                                                        batch["ind"],
                                                        task="intent",
                                                        num=32)

                    # Get slot examples
                    slot_examples = retrieve_examples(train_dataset,
                                                      batch["slot_labels"],
                                                      batch["ind"],
                                                      task="slot",
                                                      num=32)

                    loss = model(input_ids=batch["input_ids"],
                                 attention_mask=batch["attention_mask"],
                                 token_type_ids=batch["token_type_ids"],
                                 intent_label=batch["intent_label"],
                                 slot_labels=batch["slot_labels"],
                                 intent_examples=intent_examples,
                                 slot_examples=slot_examples)
                else:
                    _, _, loss = model(input_ids=batch["input_ids"],
                                       attention_mask=batch["attention_mask"],
                                       token_type_ids=batch["token_type_ids"],
                                       intent_label=batch["intent_label"],
                                       slot_labels=batch["slot_labels"])
                if args.grad_accum > 1:
                    loss = loss / args.grad_accum
                loss.backward()
                epoch_loss += loss.item()

            if args.grad_accum <= 1 or num_batches % args.grad_accum == 0:
                if args.max_grad_norm > 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                model.zero_grad()

        LOGGER.info("Epoch loss: {}".format(epoch_loss / num_batches))

        # Evaluate and save checkpoint
        score = evaluate(model,
                         val_dataloader,
                         train_dataloader,
                         tokenizer,
                         task=args.task,
                         example=args.example,
                         device=args.device)
        metrics_to_log["eval_score"] = score
        LOGGER.info("Task: {}, score: {}---".format(args.task, score))

        if score < best_score:
            patience += 1
        else:
            patience = 0

        if score > best_score:
            LOGGER.info("New best results found for {}! Score: {}".format(
                args.task, score))
            torch.save(model.state_dict(),
                       os.path.join(args.output_dir, "model.pt"))
            torch.save(optimizer.state_dict(),
                       os.path.join(args.output_dir, "optimizer.pt"))
            best_score = score

        for name, val in metrics_to_log.items():
            tb_writer.add_scalar(name, val, global_step)

        if patience >= args.patience:
            LOGGER.info("Stopping early due to patience")
            break

        # Run MLM during training
        if args.mlm_during:
            pre_model.train()
            epoch_loss = 0
            num_batches = 0
            for batch in tqdm(mlm_dataloader):
                num_batches += 1

                # Train model
                if "input_ids" in batch:
                    inputs, labels = mask_tokens(batch["input_ids"].cuda(),
                                                 tokenizer)
                else:
                    inputs, labels = mask_tokens(batch["ctx_input_ids"].cuda(),
                                                 tokenizer)

                loss = pre_model(inputs, labels)

                if args.grad_accum > 1:
                    loss = loss / args.grad_accum

                loss.backward()
                epoch_loss += loss.item()

                if args.grad_accum <= 1 or num_batches % args.grad_accum == 0:
                    if args.max_grad_norm > 0:
                        torch.nn.utils.clip_grad_norm_(pre_model.parameters(),
                                                       args.max_grad_norm)

                    mlm_optimizer.step()
                    pre_model.zero_grad()

            LOGGER.info("MLMloss: {}".format(epoch_loss / num_batches))

    # Evaluate on test set
    LOGGER.info("Loading up best model for test evaluation...")
    model.load_state_dict(torch.load(os.path.join(args.output_dir,
                                                  "model.pt")))
    score = evaluate(model,
                     test_dataloader,
                     train_dataloader,
                     tokenizer,
                     task=args.task,
                     example=args.example,
                     device=args.device)
    print("Best result for {}: Score: {}".format(args.task, score))
    tb_writer.add_scalar("final_test_score", score, global_step)
    tb_writer.close()
    return score
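The mask_tokens helper used in the MLM loops above is project code and is not shown here. As a rough, illustrative sketch (not the project's implementation), standard BERT-style masking of 15% of positions with the usual 80/10/10 split could look like the following, assuming the tokenizer exposes token_to_id and get_vocab_size:

import torch

def mask_tokens_sketch(inputs: torch.Tensor, tokenizer, mlm_probability: float = 0.15):
    """Illustrative only: prepare masked inputs and labels for MLM training."""
    labels = inputs.clone()
    mask_id = tokenizer.token_to_id("[MASK]")
    pad_id = tokenizer.token_to_id("[PAD]")

    probability_matrix = torch.full(labels.shape, mlm_probability)
    probability_matrix[labels == pad_id] = 0.0   # never mask padding
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100               # -100 is ignored by the usual cross-entropy loss

    # 80% of masked positions become [MASK]
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = mask_id

    # 10% become a random token; the remaining 10% keep the original token
    indices_random = (torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
                      & masked_indices & ~indices_replaced)
    random_tokens = torch.randint(tokenizer.get_vocab_size(), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_tokens[indices_random]
    return inputs, labels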
Code example #9
# Download the pre-trained BERT model for Brazilian Portuguese (PT-BR)
if not os.path.exists('bert-base-portuguese-cased_pytorch_checkpoint.zip'):
  wget.download("https://neuralmind-ai.s3.us-east-2.amazonaws.com/nlp/bert-base-portuguese-cased/bert-base-portuguese-cased_pytorch_checkpoint.zip")
  !unzip bert-base-portuguese-cased_pytorch_checkpoint.zip -d bert-portuguese

# Create the tokenizer from a vocabulary file
# lowercase=False (inputs are NOT converted to lowercase; original casing is kept)
# strip_accents=False (accents are kept)
tokenizer = BertWordPieceTokenizer("vocab.txt", lowercase=False, strip_accents=False)

# Show the tokenizer information
print(tokenizer)

# Enable truncation and padding
tokenizer.enable_truncation(max_length=60)
tokenizer.enable_padding()


# Tokenize all sentences in a single batch
# .tolist() is needed because `sentencas` is a NumPy array
output = tokenizer.encode_batch(sentencas.tolist())

# The tokenizer returns a list of Encoding objects
# Each object exposes the ids, tokens, and attention_mask attributes
# Iterate over the encodings to collect the ids and attention masks into lists
ids=[x.ids for x in output]
attention_mask = [x.attention_mask for x in output]

print(len(ids))
print(len(attention_mask))
Code example #10
class InferNER(object):

    def __init__(self, head_directories, head_configs, device=None,
                 from_huner=False, lowercase=False):
        """

        :param head_directories: list containing the directory paths to the head models.
        :param head_configs: a list containing the paths to the head config files.
        :param device: One of 'cpu' or 'cuda'. Defaults to 'cpu'.
        :param lowercase: preprocessing option. If predicting an entity type,
            like Gene, where the case matters, set to False (default).
        """
        # SET DEVICE
        if not device:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        assert len(head_directories) == len(head_configs)

        # LOAD TOKENIZER AND MODELS
        self.models = []
        for i, head in enumerate(head_directories):
            # LOAD BASE MODEL
            print(f'Loading BERT pre-trained model {head}')
            self.bert = BertModel.from_pretrained(head, from_tf=False)
            # LOAD HEAD
            print(f'Loading {head}')
            path_to_head_config = os.path.join(head, head_configs[i])
            self.path_to_vocab = os.path.join(head, 'vocab.txt')
            self.head_directory = head
            self.head_config = BertConfig.from_pretrained(path_to_head_config)
            head_config_dict = json.load(open(os.path.join(self.head_directory, head_configs[i]), 'rb'))
            self.head = SubwordClassificationHead(head_config_dict['head_task'], labels=head_config_dict['labels'])
            print(self.head.from_pretrained(self.head_directory))

            # Collect models
            self.models.append({'head': self.head,
                                'base': self.bert,
                                'entity_type': head.split('_')[-3],
                                'dataset': head.split('_')[-4]})

        # LOAD TOKENIZER AND SET OPTIONS
        print('Loading Tokenizer and setting options')
        self.tokenizer = BertWordPieceTokenizer(self.path_to_vocab,  # uses last head loaded for vocab
                                                lowercase=lowercase)
        self.tokenizer.enable_padding(direction='right',
                                      pad_id=0,
                                      max_length=self.head_config.max_position_embeddings)

        # CONSTRUCT PROCESSORS
        head_name = os.path.basename(self.head_directory)
        self.sentencizer = TransformersLanguage(trf_name=head_name, meta={"lang": "en"})
        self.sentencizer.add_pipe(self.sentencizer.create_pipe("sentencizer"))

        print('Loaded BERT head, config, tokenizer, and sentencizer')
        self.labels = sorted(self.head.config.labels)  # Fine-tuning may have been done on sorted labels.

        self.from_huner = from_huner

    def run_document(self, path_to_document, output_filename=None, output_directory="."):

        output_filename_fullpath = os.path.join(output_directory, output_filename)
        if os.path.exists(output_filename_fullpath):
            print(f"{output_filename_fullpath} exists, skipping")
            return

        with open(path_to_document, encoding='utf-8') as f:
            document_as_string = f.read()  # does this work for large documents?

        self.output_dict = {'tokens': [],
                            'sentence_spans': [],
                            'document_spans': [],
                            'probability': [],
                            'labels': []
                            }

        sentencized_document = self.sentencizer(document_as_string)
        number_of_sentences = len(list(sentencized_document.sents))
        test_stop = 10000000

        # number_of_sentences = test_stop
        for model in self.models:
            self.head = model['head']
            self.bert = model['base']

            if self.from_huner:
                model_entity_type = model['entity_type']
                model_dataset = model['dataset']
                document_entity_type = os.path.basename(path_to_document).split("_")[0]
                document_dataset = os.path.basename(path_to_document).split("_")[1].replace('.txt', '')
                if model_entity_type != document_entity_type:
                    print(model_entity_type, document_entity_type)
                    continue
                if model_dataset != document_dataset:
                    print(model_dataset, document_dataset)
                    continue

            for sentence_idx, sentence in enumerate(sentencized_document.sents):
                annotation_start = time.time()
                if sentence_idx > test_stop:
                    break

                print(f'\nAnnotating sentence {sentence_idx} of {number_of_sentences}')

                self.sentence = sentence
                self.sentence_idx = sentence_idx
                self.sentence_encoding = self.tokenizer.encode(self.sentence.string)
                if len(self.sentence_encoding) > 512:
                    print(f"In document {os.path.basename(output_filename)}, this sentence exeeds the maximum token sequence size\n{self.sentence}")
                    print("Skipping document")
                    raise Exception

                # PREPARE MODEL INPUT
                input_ids = torch.tensor([self.sentence_encoding.ids], dtype=torch.long)
                attention_mask = torch.tensor([self.sentence_encoding.attention_mask], dtype=torch.long)
                token_type_ids = torch.tensor([self.sentence_encoding.type_ids], dtype=torch.long)
                self.document = sentencized_document
                self.tokens = self.sentence_encoding.tokens
                self.spans = self.sentence_encoding.offsets
                self.input_ids = input_ids

                # RUN EXAMPLE THROUGH BERT
                self.bert.eval()
                if not next(self.bert.parameters()).is_cuda:
                    self.bert.to(device=self.device)
                self.head.eval()
                if not next(self.head.parameters()).is_cuda:
                    self.head.to(device=self.device)
                with torch.no_grad():
                    print(f"BERT Head: {self.head}")
                    print(f"On {self.device} device")
                    input_ids = input_ids.to(device=self.device)
                    attention_mask = attention_mask.to(device=self.device)
                    self.bert_outputs = self.bert(input_ids=input_ids,
                                                  attention_mask=attention_mask,
                                                  # token_type_ids=token_type_ids,
                                                  token_type_ids=None,
                                                  position_ids=None)[0]
                    self.subword_scores = self.head(self.bert_outputs)[0]
                    self.subword_scores_softmax = softmax(self.subword_scores,
                                                          dim=2)  # Get probabilities for each label

                    self.predicted_label_keys = self.subword_scores_softmax.max(2)[1][0]
                    self.predicted_label_probabilities = self.subword_scores_softmax.max(2)[0][0].cpu().numpy()

                    self.labels = sorted(self.head.config.labels)
                    self.predicted_labels = [self.labels[label_key] for label_key in
                                             self.predicted_label_keys.cpu().numpy()]

                    token_mask = self.sentence_encoding.special_tokens_mask

                    # List of indices containing subwords
                    subwords_idx = [index_of_subword for index_of_subword, mask in enumerate(token_mask) if mask == 0]

                    self.predicted_label_probabilities = [self.predicted_label_probabilities[i] for i in subwords_idx]
                    self.output_tokens = [self.sentence_encoding.tokens[i] for i in subwords_idx]

                    # Print subword spans
                    self.output_spans_within_sentence = [
                        " ".join([str(span_idx) for span_idx in self.sentence_encoding.offsets[i]])
                        for i in subwords_idx]
                    self.output_spans_within_document = [" ".join(
                        [str(span_idx + self.sentence.start_char) for span_idx in self.sentence_encoding.offsets[i]])
                                                         for i in subwords_idx]
                    # Print labels
                    self.output_labels = [self.predicted_labels[i].replace("NP", model['entity_type']) for i in
                                          subwords_idx]  # Generalize to task type

                    # Update document output
                    self.output_dict['tokens'] = self.output_dict['tokens'] + self.output_tokens
                    self.output_dict['sentence_spans'] = self.output_dict[
                                                             'sentence_spans'] + self.output_spans_within_sentence
                    self.output_dict['document_spans'] = self.output_dict[
                                                             'document_spans'] + self.output_spans_within_document
                    self.output_dict['probability'] = self.output_dict[
                                                          'probability'] + self.predicted_label_probabilities
                    self.output_dict['labels'] = self.output_dict['labels'] + self.output_labels
                    annotation_end = time.time()
                    print(
                        f'finished sentence {sentence_idx} of {number_of_sentences} in {annotation_end - annotation_start:0.2f} seconds')

        if self.output_dict:
            self.output_table = pd.DataFrame.from_dict(self.output_dict)
            if output_filename:
                self.output_table.to_csv(output_filename_fullpath, sep='\t', header=True, index=True, index_label="#")
            else:
                self.output_table.to_csv(os.path.join(output_directory, 'example_output.tsv'), sep='\t', header=True,
                                         index=True, index_label="#")

    def run_all_documents(self, path_to_document_dir, output_directory=".", recursive=False):
        print('started running all documents')
        if not os.path.exists(output_directory):
            os.makedirs(output_directory)
        file_list = []
        if recursive:
            print(f'Looking for files to add in {path_to_document_dir}. Searching Recursively')
            for root, directories, filenames in os.walk(path_to_document_dir):
                for filename in filenames:
                    file_list.append(os.path.join(root, filename))
        else:
            print(f'Looking for files to add. Searching {path_to_document_dir}')
            for filename in os.listdir(path_to_document_dir):
                file_list.append(os.path.join(path_to_document_dir, filename))

        log = open('infer.log', 'a')
        failed_list_log = open('infer_failed_list.log', 'a')
        for input_document in file_list:
            if not input_document.endswith(".txt"):
                continue
            output_basename = os.path.basename(input_document).replace('.txt', '') + "_biobert_annotated"
            output_filename = output_basename + ".tsv"

            # Check if the out file exists already, if so skip it.
            if os.path.exists(os.path.join(output_directory, output_filename)):
                print(f'Skipping document {input_document}. \nResults already in {output_directory}/{output_filename}')
                continue
            print(f'Running document {input_document}. \nSaving Results to {output_directory}/{output_filename}')
            try:
                self.run_document(input_document, output_filename, output_directory)
            except Exception as e:
                print(f"Failed to process {output_filename}. See log for error.")
                print(f"Failed to process {output_filename}: {e}", file=log)
                print(f"{output_filename}", file=failed_list_log)
            finally:
                pass

    def __str__(self):
        return " ".join(sent.text for sent in self.document.sents)
Code example #11
    model.compile(Adam(lr=6e-6),
                  loss='binary_crossentropy',
                  metrics=['accuracy', 'AUC'])

    return model


transformer_layer = (transformers.TFDistilBertModel.from_pretrained(
    'distilbert-base-multilingual-cased'))

model = build_model(transformer_layer, max_len=MAX_LEN)
model.load_weights('/home/aziz/vneuron/model/weights')

fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
fast_tokenizer.enable_truncation(max_length=MAX_LEN)
fast_tokenizer.enable_padding(length=MAX_LEN)

app = Flask(__name__)


@app.route('/')
def index():

    return render_template('index.html')


@app.route('/predict', methods=['POST'])
def predict():
    text = request.form['content']
    text = [str(text)]
    all_ids = []
Code example #12
    parser.add_argument("--batch_size", type=int, default=1)
    args = parser.parse_args()

    settings = dm_fast_mapping.EnvironmentSettings(seed=0,
                                                   level_name=args.level_name)
    env = dm_fast_mapping.load_from_docker(settings)
    wrapped_env = FastSlowEnvWrapper(env)

    temp_timestep = wrapped_env.reset()
    vision_dim = temp_timestep.observation["RGB_INTERLEAVED"].shape

    tokenizer = BertWordPieceTokenizer(
        DIR_PATH + "/embedding_files/bert-base-uncased-vocab.txt",
        lowercase=True)
    tokenizer.enable_padding(pad_id=3,
                             length=args.language_dim,
                             pad_token="[PAD]")

    policy = Agent(
        language_dim=args.language_dim,
        vision_dim=vision_dim,
        num_embeddings=args.num_embeddings,
        embedding_dim=args.embedding_dim,
        memory_hidden_dim=args.memory_hidden_dim,
        tokenizer=tokenizer,
    )

    evaluations = [eval_policy(policy, settings)]

    replay_buffer = MultiModalReplayBuffer(
        args.buffer_size,
Code example #13
File: _base.py  Project: sycomix/odin-ai
 def tokenizer(
         self) -> Union[BaseTokenizer, CountVectorizer, TfidfVectorizer]:
     pkl_path = os.path.join(self.tokenizer_path, "model.pkl")
     if self._tokenizer is not None:
         return self._tokenizer
     ### get pickled tokenizer
     if os.path.exists(pkl_path) and not self.retrain_tokenizer:
         with open(pkl_path, 'rb') as f:
             tokenizer = pickle.load(f)
     ### train new tokenizer
     else:
         self.retrain_tokenizer = False
         if self.algorithm == 'bert':
             from tokenizers import BertWordPieceTokenizer
             tokenizer = BertWordPieceTokenizer(
                 vocab_file=None if self._init_vocabulary is None else os.
                 path.join(self.cache_path, "bert_vocab.txt"))
             tokenizer.enable_truncation(max_length=self.max_length)
             tokenizer.enable_padding(length=self.max_length)
             # train the tokenizer
             if self._init_vocabulary is None:
                 path = os.path.join(self.cache_path, 'train.txt')
                 with open(path, 'w') as f:
                     for i in chain(self.train_text, self.valid_text,
                                    self.test_text):
                         if len(i) == 0:
                             continue
                         f.write(i + "\n" if i[-1] != "\n" else i)
                 tokenizer.train(files=path,
                                 vocab_size=self.vocab_size,
                                 min_frequency=self.min_frequency,
                                 limit_alphabet=self.limit_alphabet,
                                 show_progress=True)
             tokenizer.save_model(self.tokenizer_path)
         elif self.algorithm in ('count', 'tf', 'tfidf'):
             if self.algorithm == 'count':
                 tokenizer = CountVectorizer(
                     input='content',
                     ngram_range=self.ngram_range,
                     min_df=self.min_frequency,
                     max_df=self.max_frequency,
                     max_features=self.vocab_size,
                     vocabulary=self._init_vocabulary,
                     tokenizer=_simple_tokenizer,
                     stop_words='english')
             elif self.algorithm in ('tf', 'tfidf'):
                 tokenizer = TfidfVectorizer(
                     input='content',
                     ngram_range=self.ngram_range,
                     min_df=self.min_frequency,
                     max_df=self.max_frequency,
                     max_features=self.vocab_size,
                     stop_words='english',
                     vocabulary=self._init_vocabulary,
                     tokenizer=_simple_tokenizer,
                     use_idf=False if self.algorithm == 'tf' else True)
             tokenizer.fit((_simple_preprocess(i) for i in chain(
                 self.train_text, self.valid_text, self.test_text)))
         else:
             raise NotImplementedError
         # save the pickled model
         with open(pkl_path, "wb") as f:
             pickle.dump(tokenizer, f)
     ### assign and return
     self._tokenizer = tokenizer
     return self._tokenizer
Code example #14
# Inference runtime
import onnxruntime as ort
from tokenizers import BertWordPieceTokenizer

# Helper scripts
from .PreprocessData import normalize_text, truncate_text
from .Predict import get_ids_and_masks, predict

# Initialize ONNX runtime and language model tokenizer
vocab_file_path = os.path.join(os.path.dirname(__file__),
                               "Model/bert-base-uncased-vocab.txt")
onnx_file_path = os.path.join(os.path.dirname(__file__),
                              "Model/watchdog_model.onnx")

tokenizer = BertWordPieceTokenizer(vocab_file_path)
tokenizer.enable_padding(pad_id=0, pad_token="[PAD]", length=128)
tokenizer.enable_truncation(max_length=128)

ort_session = ort.InferenceSession(onnx_file_path)


def main(req: func.HttpRequest) -> func.HttpResponse:
    logging.info('Invoked TextQualityWatchdog Skill.')

    try:
        body = json.dumps(req.get_json())

        if body:
            logging.info(body)
            values = json.loads(body)['values']
            results = {}
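get_ids_and_masks and predict above are project helpers that are not shown. A hedged sketch of feeding one padded encoding to the ONNX session might look like the following; the input names "input_ids" and "attention_mask" are assumptions that depend on how the model was exported:

import numpy as np

def predict_single(text: str) -> np.ndarray:
    # Illustrative only: encode one text, build int64 arrays, and run the ONNX session above.
    encoding = tokenizer.encode(text)
    input_ids = np.array([encoding.ids], dtype=np.int64)
    attention_mask = np.array([encoding.attention_mask], dtype=np.int64)
    outputs = ort_session.run(
        None,
        {"input_ids": input_ids, "attention_mask": attention_mask},  # assumed input names
    )
    return outputs[0]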
Code example #15
def main():
    
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

     ## Other parameters
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--max_seq_length", default=512, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Rul evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--num_eval_docs", default=1000, type=int,
                        help="number of docs per query in eval set.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")     
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument('--logging_steps', type=int, default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps', type=int, default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--msmarco_output", action='store_true',
                        help="Return msmarco output format file")

    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
        args.n_gpu = 1  # force single-GPU training
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))
    
    # Set seed
    set_seed(args)
    num_labels=2
    config = BertConfig.from_pretrained("bert-base-uncased", num_labels=num_labels)
    tokenizer = BertWordPieceTokenizer(f"{args.data_dir}/bert_based_uncased_vocab.txt", lowercase=True)
    tokenizer.enable_truncation(args.max_seq_length)
    tokenizer.enable_padding('right',max_length=args.max_seq_length)
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
    model.to(args.device)

    args.output_mode='classification'

    logger.info("Training/evaluation parameters %s", args)

    if args.do_train:
        dataset_path = f'{args.data_dir}/triples.unique.eq.train.small.csv'
        train_dataset=LazyTextDataset(dataset_path, tokenizer,args.max_seq_length)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)


    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))



    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = BertWordPieceTokenizer(f"{args.data_dir}/bert_based_uncased_vocab.txt", lowercase=True)
        tokenizer.enable_truncation(args.max_seq_length)
        tokenizer.enable_padding('right',max_length=args.max_seq_length)
        checkpoints = [args.output_dir]  # can specify only one checkpoint: checkpoints = [f'{args.data_dir}/checkpoint-{args.checkpoint}']
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
            
            model = BertForSequenceClassification.from_pretrained(checkpoint)
            model.to(args.device)
            evaluate(args, model, tokenizer, prefix=prefix, set_name='eval', global_step=global_step)
            

    return results


if __name__ == "__main__":
    main()
Code example #16
def main():
    start_time = time.time()
    args = parse_args()
    make_directories(args.output_dir)

    # Start Tensorboard and log hyperparams.
    tb_writer = SummaryWriter(args.output_dir)
    tb_writer.add_hparams(vars(args), {})

    file_log_handler = logging.FileHandler(
        os.path.join(args.output_dir, 'log.txt'))
    logger.addHandler(file_log_handler)

    # Get list of text and list of label (integers) from disk.
    train_text, train_label_id_list, eval_text, eval_label_id_list = \
        get_examples_and_labels(args.dataset)

    # Augment training data.
    if (args.augmentation_recipe is not None) and len(
            args.augmentation_recipe):
        import pandas as pd

        if args.augmentation_recipe == 'textfooler':
            aug_csv = '/p/qdata/jm8wx/research/text_attacks/textattack/outputs/attack-1590551967800.csv'
        elif args.augmentation_recipe == 'tf-adjusted':
            aug_csv = '/p/qdata/jm8wx/research/text_attacks/textattack/outputs/attack-1590564015768.csv'
        else:
            raise ValueError(
                f'Unknown augmentation recipe {args.augmentation_recipe}')

        aug_df = pd.read_csv(aug_csv)

        # filter skipped outputs
        aug_df = aug_df[aug_df['original_text'] != aug_df['perturbed_text']]

        print(
            f'Augmentation recipe {args.augmentation_recipe} / augmentation num. examples {args.augmentation_num} / len {len(aug_df)}'
        )

        original_text = aug_df['original_text']
        perturbed_text = aug_df['perturbed_text']

        # convert `train_text` and `train_label_id_list` to an np array so things are faster
        train_text = np.array(train_text)
        train_label_id_list = np.array(train_label_id_list)

        x_adv_list = []
        x_adv_id_list = []
        for (x, x_adv) in zip(original_text, perturbed_text):
            x = x.replace('[[', '').replace(']]', '')
            x_adv = x_adv.replace('[[', '').replace(']]', '')
            x_idx = (train_text == x).nonzero()[0][0]
            x_adv_label = train_label_id_list[x_idx]
            x_adv_id_list.append(x_adv_label)
            x_adv_list.append(x_adv)

        # truncate to `args.augmentation_num` examples
        if (args.augmentation_num >= 0):
            perm = list(range(len(x_adv_list)))
            random.shuffle(perm)
            perm = perm[:args.augmentation_num]
            x_adv_list = [x_adv_list[i] for i in perm]
            x_adv_id_list = [x_adv_id_list[i] for i in perm]

        train_text = train_text.tolist() + x_adv_list
        train_label_id_list = train_label_id_list.tolist() + x_adv_id_list

        print(
            f'Augmentation added {len(x_adv_list)} examples, for a total of {len(train_text)}'
        )

    label_id_len = len(train_label_id_list)
    num_labels = len(set(train_label_id_list))
    logger.info('num_labels: %s', num_labels)

    train_examples_len = len(train_text)

    if len(train_label_id_list) != train_examples_len:
        raise ValueError(
            f'Number of train examples ({train_examples_len}) does not match number of labels ({len(train_label_id_list)})'
        )
    if len(eval_label_id_list) != len(eval_text):
        raise ValueError(
            f'Number of test examples ({len(eval_text)}) does not match number of labels ({len(eval_label_id_list)})'
        )

    print_cuda_memory(args)
    # old INFO:__main__:Loaded data and tokenized in 189.66675066947937s

    # @TODO support other vocabularies, or at least, support case
    tokenizer = BertWordPieceTokenizer('bert-base-uncased-vocab.txt',
                                       lowercase=True)
    tokenizer.enable_padding(max_length=args.max_seq_len)
    tokenizer.enable_truncation(max_length=args.max_seq_len)

    logger.info(f'Tokenizing training data. (len: {train_examples_len})')
    train_text_ids = [
        encoding.ids for encoding in tokenizer.encode_batch(train_text)
    ]
    logger.info(f'Tokenizing test data (len: {len(eval_label_id_list)})')
    eval_text_ids = [
        encoding.ids for encoding in tokenizer.encode_batch(eval_text)
    ]
    load_time = time.time()
    logger.info(f'Loaded data and tokenized in {load_time-start_time}s')

    print_cuda_memory(args)

    # Load pre-trained model (weights)
    logger.info('Loading model: %s', args.model_dir)
    logger.info('Model class: (vanilla) BertForSequenceClassification.')
    model = BertForSequenceClassification.from_pretrained(
        args.model_dir, num_labels=num_labels)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    model.to(device)
    # print(model)

    # multi-gpu training
    if args.num_gpus > 1:
        model = torch.nn.DataParallel(model)
    logger.info(f'Training model across {args.num_gpus} GPUs')

    num_train_optimization_steps = int(
        train_examples_len / args.batch_size /
        args.grad_accum_steps) * args.num_train_epochs

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
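    # Standard BERT fine-tuning practice: biases and LayerNorm weights are
    # excluded from weight decay; every other parameter gets decay 0.01.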
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # warmup_proportion is a fraction of the total steps, so convert it to a step count
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(args.warmup_proportion * num_train_optimization_steps),
        num_training_steps=num_train_optimization_steps)

    global_step = 0

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", train_examples_len)
    logger.info("  Batch size = %d", args.batch_size)
    logger.info("  Max sequence length = %d", args.max_seq_len)
    logger.info("  Num steps = %d", num_train_optimization_steps)

    wandb.log({'train_examples_len': train_examples_len})

    train_input_ids = torch.tensor(train_text_ids, dtype=torch.long)
    train_label_ids = torch.tensor(train_label_id_list, dtype=torch.long)
    train_data = TensorDataset(train_input_ids, train_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size)

    eval_input_ids = torch.tensor(eval_text_ids, dtype=torch.long)
    eval_label_ids = torch.tensor(eval_label_id_list, dtype=torch.long)
    eval_data = TensorDataset(eval_input_ids, eval_label_ids)
    eval_sampler = RandomSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.batch_size)

    def get_eval_acc():
        correct = 0
        total = 0
        for input_ids, label_ids in tqdm.tqdm(eval_dataloader,
                                              desc="Evaluating accuracy"):
            input_ids = input_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model(input_ids)[0]

            correct += (logits.argmax(dim=1) == label_ids).sum()
            total += len(label_ids)

        return float(correct) / total

    def save_model():
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, args.weights_name)
        output_config_file = os.path.join(args.output_dir, args.config_name)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)

        logger.info(
            f'Best acc found. Saved model config and weights to {args.output_dir}.'
        )

    global_step = 0

    def save_model_checkpoint(checkpoint_name=None):
        # Save model checkpoint
        checkpoint_name = checkpoint_name or 'checkpoint-{}'.format(
            global_step)
        output_dir = os.path.join(args.output_dir, checkpoint_name)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir)
        torch.save(args, os.path.join(output_dir, 'training_args.bin'))
        logger.info('Checkpoint saved to %s.', output_dir)

    print_cuda_memory(args)
    model.train()
    best_eval_acc = 0
    steps_since_best_eval_acc = 0

    def loss_backward(loss):
        if args.num_gpus > 1:
            loss = loss.mean(
            )  # mean() to average on multi-gpu parallel training
        if args.grad_accum_steps > 1:
            loss = loss / args.grad_accum_steps
        loss.backward()

    for epoch in tqdm.trange(int(args.num_train_epochs), desc="Epoch"):
        prog_bar = tqdm.tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(prog_bar):
            print_cuda_memory(args)
            batch = tuple(t.to(device) for t in batch)
            input_ids, labels = batch
            logits = model(input_ids)[0]
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
            if global_step % args.tb_writer_step == 0:
                tb_writer.add_scalar('loss', loss, global_step)
                tb_writer.add_scalar('lr', scheduler.get_last_lr()[0], global_step)
            loss_backward(loss)
            prog_bar.set_description(f"Loss {loss.item()}")
            if (step + 1) % args.grad_accum_steps == 0:
                optimizer.step()
                scheduler.step()
                # Clear gradients only after an optimizer update so that
                # accumulation across grad_accum_steps batches is preserved.
                optimizer.zero_grad()
                global_step += 1

                # Save model checkpoint to file.
                if global_step % args.checkpoint_steps == 0:
                    save_model_checkpoint()

        # Check accuracy after each epoch.
        eval_acc = get_eval_acc()
        tb_writer.add_scalar('epoch_eval_acc', eval_acc, global_step)
        wandb.log({'epoch_eval_acc': eval_acc, 'epoch': epoch})

        if args.checkpoint_every_epoch:
            save_model_checkpoint(f'epoch-{epoch}')

        logger.info(f'Eval acc: {eval_acc*100}%')
        if eval_acc > best_eval_acc:
            best_eval_acc = eval_acc
            steps_since_best_eval_acc = 0
            save_model()
        else:
            steps_since_best_eval_acc += 1
            if (args.early_stopping_epochs > 0) and (
                    steps_since_best_eval_acc > args.early_stopping_epochs):
                logger.info(
                    f'Stopping early: it has been {args.early_stopping_epochs} epochs since validation accuracy last improved'
                )
                break
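
The inner loop above scales the loss by grad_accum_steps and only calls optimizer.step() every grad_accum_steps batches. A stripped-down sketch of that accumulation pattern, with placeholder model, dataloader, optimizer, scheduler and accum_steps names:

accum_steps = 4  # placeholder
optimizer.zero_grad()
for step, (input_ids, labels) in enumerate(dataloader):
    logits = model(input_ids)[0]
    loss = torch.nn.CrossEntropyLoss()(logits, labels)
    (loss / accum_steps).backward()   # gradients accumulate across accum_steps batches
    if (step + 1) % accum_steps == 0:
        optimizer.step()              # one parameter update per accum_steps batches
        scheduler.step()
        optimizer.zero_grad()         # clear gradients only after the update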
Code example #17
0
def main():
    parser = ArgumentParser('GLUE evaluation example')
    parser.add_argument(
        '--glue_dir',
        type=str,
        metavar='PATH',
        required=True,
        help='Path to directory containing the GLUE tasks data.')
    parser.add_argument(
        '--output_dir',
        type=str,
        metavar='PATH',
        required=True,
        help=
        'Path to the output directory (for logs, checkpoints, parameters, etc.).'
    )
    parser.add_argument('-f',
                        '--force',
                        action='store_true',
                        help='Overwrite output_dir if it already exists.')
    parser.add_argument(
        '--task_name',
        type=str,
        default=None,
        choices=GLUE_TASKS,
        help='The specific GLUE task to train and/or evaluate on.')
    parser.add_argument('--do_train',
                        action='store_true',
                        help='Whether to run training.')
    parser.add_argument('--do_eval',
                        action='store_true',
                        help='Whether to run eval (on the dev set).')
    parser.add_argument('--config_file',
                        type=str,
                        metavar='PATH',
                        required=True,
                        help='Path to the model configuration.')
    parser.add_argument('--weights_file',
                        type=str,
                        metavar='PATH',
                        required=True,
                        help='Path to the model initialization weights.')
    parser.add_argument('--tokenizer_vocab_file',
                        type=str,
                        metavar='PATH',
                        required=True,
                        help='Path to the tokenizer vocabulary.')
    parser.add_argument('--overwrite_cache',
                        action='store_true',
                        help='Overwrite the cache if it already exists.')
    parser.add_argument('--max_sequence_len',
                        type=int,
                        default=128,
                        metavar='N',
                        help='The maximum length of a sequence.')
    parser.add_argument('--do_lower_case',
                        action='store_true',
                        help='Whether to lowercase the input when tokenizing.')
    parser.add_argument('-n',
                        '--num_epochs',
                        type=int,
                        default=3,
                        metavar='N',
                        help='The number of distillation epochs.')
    parser.add_argument('--per_gpu_train_batch_size',
                        type=int,
                        default=8,
                        metavar='N',
                        help='The batch size per GPU used during training.')
    parser.add_argument('--per_gpu_eval_batch_size',
                        type=int,
                        default=8,
                        metavar='N',
                        help='The batch size per GPU used during evaluation.')
    parser.add_argument('-lr',
                        '--learning_rate',
                        type=float,
                        default=2e-5,
                        metavar='F',
                        help='The initial learning rate.')
    parser.add_argument('--epsilon',
                        type=float,
                        default=1e-8,
                        metavar='F',
                        help="Adam's epsilon.")
    parser.add_argument('--warmup_prop',
                        type=float,
                        default=0.05,
                        metavar='F',
                        help='Linear warmup proportion.')
    parser.add_argument(
        '--num_gradient_accumulation_steps',
        type=int,
        default=1,
        metavar='N',
        help=
        'The number of gradient accumulation steps (for larger batch sizes).')
    parser.add_argument('--max_gradient_norm',
                        type=float,
                        default=1.0,
                        metavar='F',
                        help='The maximum gradient norm.')
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        metavar='N',
                        help='Random seed.')
    parser.add_argument('-c',
                        '--use_cuda',
                        action='store_true',
                        help='Whether to use cuda or not.')
    parser.add_argument(
        '-d',
        '--use_distributed',
        action='store_true',
        help='Whether to use distributed training (distillation) or not.')
    parser.add_argument('--local_rank',
                        type=int,
                        default=-1,
                        metavar='N',
                        help='Local process rank.')
    params = parser.parse_args()

    if not params.use_distributed:
        params.local_rank = 0
        params.train_batch_size = params.per_gpu_train_batch_size
        params.eval_batch_size = params.per_gpu_eval_batch_size
    else:
        params.num_gpus = torch.cuda.device_count()
        params.train_batch_size = params.per_gpu_train_batch_size * params.num_gpus
        params.eval_batch_size = params.per_gpu_eval_batch_size * params.num_gpus
    params.is_master = params.local_rank == 0

    if params.use_cuda:
        device = torch.device('cuda', params.local_rank)
    else:
        device = torch.device('cpu')

    # make output_dir
    if Path(params.output_dir).is_dir() and not params.force:
        raise ValueError(
            f'Output directory {params.output_dir} already exists. Use `--force` if you want to overwrite it.'
        )
    if params.is_master:
        Path(params.output_dir).mkdir(parents=True, exist_ok=params.force)

        # dump params
        json.dump(vars(params),
                  open(Path(params.output_dir) / 'params.json', 'w'),
                  indent=4,
                  sort_keys=True)
    params.glue_dir = Path(params.glue_dir)
    params.output_dir = Path(params.output_dir)
    params.device = device

    # initialize multi-GPU
    if params.use_distributed:
        if params.is_master:
            logger.info('Initializing PyTorch distributed')
        torch.cuda.set_device(params.local_rank)
        dist.init_process_group(backend='nccl', init_method='env://')

    # set seed(s)
    if params.is_master:
        logger.info('Setting random seed(s)')
    random.seed(params.seed)
    np.random.seed(params.seed)
    torch.manual_seed(params.seed)
    if params.use_distributed:
        torch.cuda.manual_seed_all(params.seed)

    # initialize the tokenizer
    if params.is_master:
        logger.info('Initializing the tokenizer')
    tokenizer = BertWordPieceTokenizer(params.tokenizer_vocab_file,
                                       lowercase=params.do_lower_case)

    # enable truncation and padding
    tokenizer.enable_truncation(params.max_sequence_len)
    tokenizer.enable_padding(length=params.max_sequence_len)

    # go over each task
    if params.task_name is not None:
        tasks = [params.task_name]
        output_dirs = [params.output_dir]
    else:
        tasks = GLUE_TASKS
        output_dirs = [
            params.output_dir / task / str(params.seed) for task in tasks
        ]

    for task, task_output_dir in zip(tasks, output_dirs):
        # prepare the GLUE task
        if params.is_master:
            logger.info(f'Preparing the {task} GLUE task')

        # make task_output_dir
        if task_output_dir.is_dir() and not params.force:
            raise ValueError(
                f'Task output directory {task_output_dir} already exists. Use `--force` if you want to overwrite it.'
            )
        if params.is_master:
            task_output_dir.mkdir(parents=True, exist_ok=params.force)

        # initialize the model
        if params.is_master:
            logger.info(f'{task} - Initializing the model')
        config = DistilBertConfig.from_pretrained(
            params.config_file,
            num_labels=len(GLUE_TASKS_MAPPING[task]['labels']),
            finetuning_task=task)
        model = DistilBertForSequenceClassification.from_pretrained(
            params.weights_file, config=config)

        # send model to device
        model = model.to(params.device)

        # perform the training
        if params.do_train:
            # initialize the training dataset
            if params.is_master:
                logger.info(f'{task} - Initializing the training dataset')
            train_dataset = GLUETaskDataset(
                task=task,
                glue_dir=params.glue_dir,
                split='train',
                tokenizer=tokenizer,
                overwrite_cache=params.overwrite_cache)

            # initialize the sampler
            if params.is_master:
                logger.info(f'{task} - Initializing the training sampler')
            train_sampler = DistributedSampler(
                train_dataset) if params.use_distributed else RandomSampler(
                    train_dataset)

            # initialize the dataloader
            if params.is_master:
                logger.info(f'{task} - Initializing the training dataloader')
            train_dataloader = DataLoader(dataset=train_dataset,
                                          sampler=train_sampler,
                                          batch_size=params.train_batch_size)

            # initialize the optimizer
            if params.is_master:
                logger.info(f'{task} - Initializing the optimizer')
            optimizer = optim.Adam(model.parameters(),
                                   lr=params.learning_rate,
                                   eps=params.epsilon,
                                   betas=(0.9, 0.98))

            # initialize the learning rate scheduler
            if params.is_master:
                logger.info(
                    f'{task} - Initializing the learning rate scheduler')
            num_steps_epoch = len(train_dataloader)
            num_train_steps = math.ceil(
                num_steps_epoch / params.num_gradient_accumulation_steps *
                params.num_epochs)
            num_warmup_steps = math.ceil(num_train_steps * params.warmup_prop)

            def lr_lambda(current_step):
                if current_step < num_warmup_steps:
                    return float(current_step) / float(max(
                        1, num_warmup_steps))
                return max(
                    0.0,
                    float(num_train_steps - current_step) /
                    float(max(1, num_train_steps - num_warmup_steps)))
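            # Example: with num_warmup_steps=100 and num_train_steps=1000 the
            # multiplier is 0.5 at step 50, 1.0 at step 100, 0.5 at step 550
            # and 0.0 at step 1000 (linear warmup followed by linear decay).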

            lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer=optimizer,
                                                       lr_lambda=lr_lambda,
                                                       last_epoch=-1)

            # initialize distributed data parallel (DDP)
            if params.use_distributed:
                if params.is_master:
                    logger.info('Initializing DDP')
                model = DDP(model,
                            device_ids=[params.local_rank],
                            output_device=params.local_rank)

            # start training
            if params.is_master:
                logger.info(f'{task} - Starting the training')
            train(task=task,
                  model=model,
                  dataloader=train_dataloader,
                  optimizer=optimizer,
                  num_epochs=params.num_epochs,
                  lr_scheduler=lr_scheduler,
                  num_gradient_accumulation_steps=params.
                  num_gradient_accumulation_steps,
                  max_gradient_norm=params.max_gradient_norm,
                  device=params.device,
                  use_distributed=params.use_distributed,
                  is_master=params.is_master,
                  use_tqdm=True,
                  logger=logger)

            # save the finetuned model
            if params.is_master:
                # take care of distributed training
                model_to_save = model.module if hasattr(model,
                                                        'module') else model
                model_to_save.config.architectures = [
                    model_to_save.__class__.__name__
                ]

                logger.info(f'{task} - Saving the finetuned model config')
                json.dump(vars(model_to_save.config),
                          open(task_output_dir /
                               TRAINED_CONFIG_FILE_TEMPLATE.format(
                                   model_name=model_to_save.__class__.__name__,
                                   task=task),
                               mode='w'),
                          indent=4,
                          sort_keys=True)

                logger.info(f'{task} - Saving the finetuned model weights')
                torch.save(
                    model_to_save.state_dict(),
                    task_output_dir / TRAINED_WEIGHTS_FILE_TEMPLATE.format(
                        model_name=model_to_save.__class__.__name__,
                        task=task))

                # reload the model
                if params.do_eval:
                    if params.is_master:
                        logger.info(f'{task} - Reloading the model')
                    config = DistilBertConfig.from_pretrained(
                        str(task_output_dir /
                            TRAINED_CONFIG_FILE_TEMPLATE.format(
                                model_name=model_to_save.__class__.__name__,
                                task=task)),
                        num_labels=len(GLUE_TASKS_MAPPING[task]['labels']),
                        finetuning_task=task)
                    model = DistilBertForSequenceClassification.from_pretrained(
                        str(task_output_dir /
                            TRAINED_WEIGHTS_FILE_TEMPLATE.format(
                                model_name=model_to_save.__class__.__name__,
                                task=task)),
                        config=config)
                    model = model.to(params.device)

        # perform the evaluation
        if params.do_eval and params.is_master:
            # initialize the evaluation dataset
            logger.info(f'{task} - Initializing the evaluation dataset')
            eval_datasets = [
                GLUETaskDataset(task=task,
                                glue_dir=params.glue_dir,
                                split='dev',
                                tokenizer=tokenizer,
                                overwrite_cache=params.overwrite_cache)
            ]

            # hot fix for MNLI-MM
            if task == 'MNLI':
                eval_datasets.append(
                    GLUETaskDataset(task='MNLI-MM',
                                    glue_dir=params.glue_dir,
                                    split='dev',
                                    tokenizer=tokenizer))

            for eval_dataset in eval_datasets:
                # initialize the sampler
                logger.info(
                    f'{eval_dataset.task} - Initializing the evaluation sampler'
                )
                eval_sampler = SequentialSampler(eval_dataset)

                # initialize the dataloader
                logger.info(
                    f'{eval_dataset.task} - Initializing the evaluation dataloader'
                )
                eval_dataloader = DataLoader(dataset=eval_dataset,
                                             sampler=eval_sampler,
                                             batch_size=params.eval_batch_size)

                # start evaluating
                logger.info(f'{eval_dataset.task} - Starting the evaluation')
                results = evaluate(task=task,
                                   model=model,
                                   dataloader=eval_dataloader,
                                   device=params.device,
                                   use_tqdm=True)

                # log results
                logger.info(f'{eval_dataset.task} - Evaluation results:')
                for key, result in results.items():
                    logger.info(f'{eval_dataset.task} -  {key}: {result}')

                # dump results
                json.dump(results,
                          open(
                              task_output_dir / RESULTS_FILE_TEMPLATE.format(
                                  model_name=model.__class__.__name__,
                                  task=eval_dataset.task), 'w'),
                          indent=4)

        if params.is_master:
            logger.info(f'Done with the {task} GLUE task')
Code example #18
0
File: masking.py Project: cybo1112/cantoformer
from google.cloud import storage
import tokenizers
from transformers import BertTokenizer
from tokenizers import BertWordPieceTokenizer
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.utils.data.sampler import RandomSampler
import numpy as np
import random
import jieba
import logging
logging.getLogger("jieba").setLevel(logging.WARNING)

tokenizer = BertWordPieceTokenizer(vocab_file='../tokenizer/vocab.txt')
tokenizer.add_special_tokens(["<nl>"])
tokenizer.enable_truncation(max_length=512)
tokenizer.enable_padding(length=512)
client = storage.Client()
blobs = []
size = 0
for blob in client.list_blobs('tfrc-tfrc', prefix='public_model/corpus/'):
    if (blob.name.endswith('.txt')):
        blobs.append(blob)

sub_blobs = random.sample(blobs, 5)


def iterator_gen(generator, handler=None, parallel=False):
    try:
        import gc
        import multiprocessing as multiprocessing
        if parallel:
Code example #19
0
#     tokenizer.save('./', 'token_test')
# else:

#     tokenizer = ByteLevelBPETokenizer( "./{}-vocab.json".format('token_test'), "./{}-merges.txt".format('token_test'),
#         add_prefix_space=True,
#     )

# # Now we can encode
# encoded = tokenizer.encode("will be back later.  http://plurk.com/p/rp3k7,will be back later, loooove u @mahboi #blessed")
# print(encoded.tokens)
# print(encoded.offsets)

from tokenizers import BertWordPieceTokenizer
# My arbitrary sentence
sentence = "[CLS] will be back later.  www.facebook.com ,will be back later, loooove u @mahboi #blessed"
# Bert vocabularies
# Instantiate a Bert tokenizers
tokenizer = BertWordPieceTokenizer("bert-large-uncased-vocab.txt",
                                   lowercase=True,
                                   clean_text=True)
tokenizer.add_tokens(['[LINK]'])

tokenizer.enable_padding(max_length=100)
WordPieceEncoder = tokenizer.encode(sentence)
# Print the ids, tokens and offsets
print(WordPieceEncoder.ids)
print(WordPieceEncoder.tokens)
print(WordPieceEncoder.offsets)
print(tokenizer.get_vocab()['[PAD]'])
print(tokenizer.decode(WordPieceEncoder.ids))
Code example #20
0
class Tweets(Dataset):
    def __init__(self, device='cpu', pad=150, test=False, N=4):
        self.samples = []
        self.pad = pad

        self.tokenizer = BertWordPieceTokenizer(
            "./data/bert-base-uncased-vocab.txt",
            lowercase=True,
            clean_text=True)

        self.tokenizer.enable_padding(max_length=pad -
                                      1)  # -1 for sentiment token

        self.tokenizer.add_special_tokens(['[POS]'])
        self.tokenizer.add_special_tokens(['[NEG]'])
        self.tokenizer.add_special_tokens(['[NEU]'])
        self.vocab = self.tokenizer.get_vocab()

        self.sent_t = {
            'positive': self.tokenizer.token_to_id('[POS]'),
            'negative': self.tokenizer.token_to_id('[NEG]'),
            'neutral': self.tokenizer.token_to_id('[NEU]')
        }

        self.pos_set = {'UNK': 0}
        all_pos = load('help/tagsets/upenn_tagset.pickle').keys()

        for i, p in enumerate(all_pos):
            self.pos_set[p] = i + 1

        self.tweet_tokenizer = TweetTokenizer()

        data = None
        if test is True:
            data = pd.read_csv(TEST_PATH).values
            for row in data:
                tid, tweet, sentiment = tuple(row)

                pos_membership = [0] * len(tweet)

                pos_tokens = self.tweet_tokenizer.tokenize(tweet)
                pos = nltk.pos_tag(pos_tokens)
                offset = 0

                for i, token in enumerate(pos_tokens):
                    start = tweet.find(token, offset)
                    end = start + len(token)
                    if pos[i][1] in self.pos_set:
                        pos_membership[start:end] = [self.pos_set[pos[i][1]]
                                                     ] * len(token)
                    offset += len(token)

                tokens = self.tokenizer.encode(tweet)
                word_to_index = tokens.ids
                offsets = tokens.offsets

                token_pos = [0] * len(word_to_index)
                # get pos info
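                # ids 0, 101 and 102 are [PAD], [CLS] and [SEP] in the
                # bert-base-uncased vocabulary, so they get no POS tag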
                for i, (s, e) in enumerate(offsets):
                    if word_to_index[i] == 0 or word_to_index[
                            i] == 101 or word_to_index[i] == 102:
                        pass
                    elif s != e:
                        sub = pos_membership[s:e]
                        token_pos[i] = max(set(sub), key=sub.count)

                token_pos = [0] + token_pos
                word_to_index = [self.sent_t[sentiment]] + word_to_index
                offsets = [(0, 0)] + offsets
                offsets = np.array([[off[0], off[1]] for off in offsets])
                word_to_index = np.array(word_to_index)
                token_pos = np.array(token_pos)

                self.samples.append({
                    'tid': tid,
                    'sentiment': sentiment,
                    'tweet': word_to_index,
                    'offsets': offsets,
                    'raw_tweet': tweet,
                    'pos': token_pos
                })

        else:

            data = pd.read_csv(TRAIN_PATH).values
            if N > 0:
                data = augment_n(data, N=N)

            for row in data:
                tid, tweet, selection, sentiment = tuple(row)

                char_membership = [0] * len(tweet)
                pos_membership = [0] * len(tweet)
                si = tweet.find(selection)
                if si < 0:
                    char_membership[0:] = [1] * len(char_membership)
                else:
                    char_membership[si:si +
                                    len(selection)] = [1] * len(selection)

                pos_tokens = self.tweet_tokenizer.tokenize(tweet)
                pos = nltk.pos_tag(pos_tokens)
                offset = 0

                for i, token in enumerate(pos_tokens):
                    start = tweet.find(token, offset)
                    end = start + len(token)
                    if pos[i][1] in self.pos_set:
                        pos_membership[start:end] = [self.pos_set[pos[i][1]]
                                                     ] * len(token)
                    offset += len(token)

                tokens = self.tokenizer.encode(tweet)
                word_to_index = tokens.ids
                offsets = tokens.offsets

                token_membership = [0] * len(word_to_index)
                token_pos = [0] * len(word_to_index)

                # Inclusive indices
                start = None
                end = None
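                # start/end are stored 1-based because a sentiment token is
                # prepended to word_to_index below, shifting every index by one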
                for i, (s, e) in enumerate(offsets):
                    if word_to_index[i] == 0 or word_to_index[
                            i] == 101 or word_to_index[i] == 102:
                        token_membership[i] = -1
                    elif sum(char_membership[s:e]) > 0:
                        token_membership[i] = 1
                        if start is None:
                            start = i + 1
                        end = i + 1

                # get pos info
                for i, (s, e) in enumerate(offsets):
                    if word_to_index[i] == 0 or word_to_index[
                            i] == 101 or word_to_index[i] == 102:
                        pass
                    elif s != e:
                        sub = pos_membership[s:e]
                        token_pos[i] = max(set(sub), key=sub.count)

                if start is None:
                    print("Data Point Error")
                    print(tweet)
                    print(selection)
                    continue
                # token_membership = torch.LongTensor(token_membership).to(device)
                word_to_index = [self.sent_t[sentiment]] + word_to_index
                token_membership = [-1] + token_membership
                offsets = [(0, 0)] + offsets
                token_pos = [0] + token_pos

                offsets = np.array([[off[0], off[1]] for off in offsets])
                word_to_index = np.array(word_to_index)
                token_membership = np.array(token_membership).astype('float')
                token_pos = np.array(token_pos)

                if tid is None:
                    raise Exception('None field detected')
                if sentiment is None:
                    raise Exception('None field detected')
                if word_to_index is None:
                    raise Exception('None field detected')
                if token_membership is None:
                    raise Exception('None field detected')
                if selection is None:
                    raise Exception('None field detected')
                if tweet is None:
                    raise Exception('None field detected')
                if start is None:
                    raise Exception('None field detected')
                if end is None:
                    raise Exception('None field detected')
                if offsets is None:
                    raise Exception('None field detected')

                self.samples.append({
                    'tid': tid,
                    'sentiment': sentiment,
                    'tweet': word_to_index,
                    'selection': token_membership,
                    'raw_selection': selection,
                    'raw_tweet': tweet,
                    'start': start,
                    'end': end,
                    'offsets': offsets,
                    'pos': token_pos
                })

    def get_splits(self, val_size=.3):
        N = len(self.samples)
        indices = np.random.permutation(N)
        split = int(N * (1 - val_size))
        train_indices = indices[0:split]
        valid_indices = indices[split:]
        return train_indices, valid_indices

    def k_folds(self, k=5):
        N = len(self.samples)
        indices = np.random.permutation(N)
        return np.array_split(indices, k)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        try:
            return self.samples[idx]
        except TypeError:
            pass
        return [self.samples[i] for i in idx]
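
A minimal usage sketch for the dataset above (assumes TRAIN_PATH points at the training csv; the variable names are illustrative only):

dataset = Tweets(device='cpu', pad=150, test=False, N=0)
train_indices, valid_indices = dataset.get_splits(val_size=0.3)
print(len(dataset), len(train_indices), len(valid_indices))
print(dataset[0]['sentiment'], dataset[0]['raw_tweet'])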