def __init__(self, t: PreTrainedTokenizer, args, file_path: str, block_size=512):
        assert os.path.isfile(file_path)
        logger.info("Creating features from dataset file at %s", file_path)
        
        # -------------------------- CHANGES START
        bert_vocab_path = os.path.join(args.tokenizer_name, "vocab.txt")
        if os.path.exists(bert_vocab_path):
            logger.info("Loading BERT tokenizer")
            from tokenizers import BertWordPieceTokenizer
            tokenizer = BertWordPieceTokenizer(bert_vocab_path, handle_chinese_chars=False, lowercase=False)
            tokenizer.enable_truncation(512)
        else:
            from tokenizers import ByteLevelBPETokenizer
            from tokenizers.processors import BertProcessing
            logger.info("Loading RoBERTa tokenizer")
            
            tokenizer = ByteLevelBPETokenizer(
                os.path.join(args.tokenizer_name, "vocab.json"),
                os.path.join(args.tokenizer_name, "merges.txt")
            )
            tokenizer._tokenizer.post_processor = BertProcessing(
                ("</s>", tokenizer.token_to_id("</s>")),
                ("<s>", tokenizer.token_to_id("<s>")),
            )
            tokenizer.enable_truncation(max_length=512)

        logger.info("Reading file %s", file_path)
        with open(file_path, encoding="utf-8") as f:
            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

        logger.info("Running tokenization")
        self.examples = tokenizer.encode_batch(lines)
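# A minimal sketch (not part of the original example) of the __len__/__getitem__
# pair that usually accompanies an __init__ like the one above; the class name
# and the idea of wrapping tokenizers.Encoding objects are assumptions for
# illustration only.
import torch
from torch.utils.data import Dataset

class LineByLineSketchDataset(Dataset):
    def __init__(self, encodings):
        self.examples = encodings  # list of tokenizers.Encoding objects

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        # Each Encoding already respects the truncation configured above.
        return torch.tensor(self.examples[i].ids, dtype=torch.long)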
def train_tokenizer(captions):
    print('Create training file...')
    train_samples = [sample for samples in captions for sample in samples]
    with open('train_tokenizer.txt', 'a') as f:
        for sample in train_samples:
            f.write(sample + '\n')  # one caption per line for the trainer
    # init
    bwpt = BertWordPieceTokenizer(vocab_file=None,
                                  unk_token='[UNK]',
                                  sep_token='[SEP]',
                                  cls_token='[CLS]',
                                  clean_text=True,
                                  handle_chinese_chars=True,
                                  strip_accents=True,
                                  lowercase=True,
                                  wordpieces_prefix='##')
    print('Tokenizer training...')
    bwpt.train(files=['train_tokenizer.txt'],
               vocab_size=30000,
               min_frequency=5,
               limit_alphabet=1000,
               special_tokens=['[PAD]', '[UNK]', '[CLS]', '[MASK]', '[SEP]'])

    bwpt.save('.', 'captions')

    # initialization of a trained tokenizer
    tokenizer = BertWordPieceTokenizer('captions-vocab.txt')
    tokenizer.enable_truncation(max_length=16)
    print('Tokenizer is ready to use...')
    return tokenizer
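# Usage sketch for train_tokenizer above (hedged: `captions` is assumed to be a
# list of lists of caption strings, as implied by the nested comprehension, and
# the save()/vocab-file behaviour follows the tokenizers version used here).
captions = [["a dog runs in the park", "a brown dog outside"],
            ["two cats sleep on a couch"]]
caption_tokenizer = train_tokenizer(captions)
encoding = caption_tokenizer.encode("a dog runs in the park")
print(encoding.tokens)  # WordPiece tokens, truncated to 16
print(encoding.ids)     # matching vocabulary ids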
def tokenize_and_cache_data(data_dir,
                            output_dir,
                            tokenizer=None,
                            tokenizer_path=None,
                            n_sentences=0,
                            use_overflow=False,
                            two_segments=True,
                            delete_existing=False,
                            max_length=512):

    if not tokenizer:
        tokenizer = BertWordPieceTokenizer(tokenizer_path)

    tokenizer.enable_truncation(max_length=max_length)
    tokenizer.enable_padding(max_length=max_length)

    num_tokens = 0
    num_examples = 0

    if delete_existing:
        rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    pbar = tqdm(os.listdir(data_dir))
    for path in pbar:
        result = process_one_file(data_dir, path, tokenizer, output_dir,
                                  n_sentences, use_overflow, two_segments)
        num_examples += result['num_examples']
        num_tokens += result['num_tokens']

        pbar.set_description(
            f"{num_tokens} tokens, {num_examples} examples, {num_tokens/(num_examples*max_length)} non-pad tokens"
        )
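# Usage sketch (hedged): process_one_file is assumed to live in the same module,
# and the paths below are placeholders.
tokenize_and_cache_data("raw_text/", "cached_examples/",
                        tokenizer_path="vocab.txt",
                        max_length=256)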
def get_transformer_tokenizer(vocab_path, max_tokens, device="cpu"):
    """
    Return a tokenizer to be used with Transformer-based models
    """
    wp_tokenizer = BertWordPieceTokenizer(vocab_path, lowercase=True)
    wp_tokenizer.enable_padding(direction="right", pad_type_id=1)
    wp_tokenizer.enable_truncation(max_tokens)
    return TransformerSquadTokenizer(wp_tokenizer, device=device)
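# Usage sketch (hedged): TransformerSquadTokenizer is assumed to be defined
# elsewhere in this codebase; the vocab path is a placeholder.
squad_tokenizer = get_transformer_tokenizer("bert-base-uncased-vocab.txt",
                                            max_tokens=384, device="cpu")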
Example #5
    def __init__(self, tokenizer: AutoTokenizer, file_path: str, args):
        print(file_path)
        assert os.path.isfile(file_path)

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, args.bert_model_type + "_cached_mlm_" + filename)

        if os.path.exists(cached_features_file):
            print("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.samples = torch.load(handle)
        else:
            print("Creating features from dataset file at %s", directory)

            # Get the faster tokenizer from tokenizers package
            tokenizer.save_vocabulary(vocab_path='.')
            fast_tokenizer = BertWordPieceTokenizer("vocab.txt",
                                                    lowercase=args.lowercase)
            fast_tokenizer.enable_truncation(tokenizer.max_len)
            fast_tokenizer.enable_padding(max_length=tokenizer.max_len,
                                          pad_token=tokenizer.pad_token)

            self.samples = []

            # Load data over here
            df = pd.read_json(file_path)
            print('SQUAD data: ')

            for _, row in tqdm(df.iterrows(), total=df.shape[0]):
                for paragraph in row['data']['paragraphs']:
                    context = paragraph['context']
                    for qa_pair in paragraph['qas']:
                        question = qa_pair['question']

                        batch = fast_tokenizer.encode(question, context)
                        self.samples.append({
                            'input_ids':
                            batch.ids,
                            'attention_mask':
                            batch.attention_mask
                        })

                        for encoding in batch.overflowing:
                            self.samples.append({
                                'input_ids':
                                encoding.ids,
                                'attention_mask':
                                encoding.attention_mask
                            })

            df = None

            print("Saving features into cached file: ", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                torch.save(self.samples,
                           handle,
                           pickle_protocol=pickle.HIGHEST_PROTOCOL)
def tokenize(texts: pd.Series,
             tokenizer: BertWordPieceTokenizer,
             chunk_size: int = 240,
             maxlen: int = 512) -> np.ndarray:
    '''Tokenize the input texts and return their token ids as a 2-D array.'''
    tokenizer.enable_truncation(max_length=maxlen)
    try:
        tokenizer.enable_padding(max_length=maxlen)
    except TypeError:
        tokenizer.enable_padding(length=maxlen)
    all_ids = []

    for i in range(0, len(texts), chunk_size):
        text_chunk = texts[i:i + chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])

    return np.array(all_ids)
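# Usage sketch (hedged): "vocab.txt" is a placeholder WordPiece vocabulary file.
import pandas as pd
from tokenizers import BertWordPieceTokenizer

texts = pd.Series(["first example sentence", "second example sentence"])
wp_tok = BertWordPieceTokenizer("vocab.txt", lowercase=True)
ids = tokenize(texts, wp_tok, chunk_size=2, maxlen=64)
print(ids.shape)  # (2, 64): one padded/truncated row of token ids per text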
Example #7
class BERT16SDataset(Dataset):
	"""
	A torch dataset class designed to load 16S data from a tsv file and encode it for BERT.
	:param vocab_path: str, path to the pre-trained BERT tokenizer vocab file.
	:param data_path: str, path to the 16S data file.
	:param block_size: int, maximal BERT input length (an encoded sample will be padded to this length if too short).
	:param max_word_length: int, the maximal word length the tokenizer can encode.
	"""
	def __init__(self, vocab_path: str, data_path: str, block_size=512, max_word_length=100):

		assert os.path.isfile(data_path)
		assert os.path.isfile(vocab_path)

		_logger.info(f"Loading BERT tokenizer using vocab file {vocab_path}")
		self.tokenizer = BertWordPieceTokenizer(
			vocab_path,
			handle_chinese_chars=False,
			lowercase=False)
		self.tokenizer.enable_truncation(block_size)
		self.tokenizer.enable_padding(max_length=block_size)

		_logger.info(f"Loading 16S dataset file at {data_path}...")
		self._16s_corpus_df = pd.read_csv(data_path, sep='\t')
		_logger.info(f"16S corpus is of shape {self._16s_corpus_df.shape}")

		self.samples = self._16s_corpus_df.seq.values.tolist()
		self.max_word_length = max_word_length

	def __len__(self):
		return len(self._16s_corpus_df)

	def __getitem__(self, i):
		sample = self._split_sequence_by_max_word_length(self.samples[i])
		tokens = self.tokenizer.encode(sample)
		return torch.tensor(tokens.ids, dtype=torch.long)

	def _split_sequence_by_max_word_length(self, seq):
		"""
		split a 16S sequence (~1K long usually) into white-spaces separated chunks that the tokenizer can encode.
		:param seq: str, 16S sequence
		:return: str
		"""
		chunks = [seq[i: i + self.max_word_length] for i in range(0, len(seq), self.max_word_length)]
		return ' '.join(chunks)
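# Usage sketch (hedged): the paths are placeholders and the tsv is expected to
# contain a `seq` column, as read above.
from torch.utils.data import DataLoader

dataset = BERT16SDataset("vocab.txt", "16s_sequences.tsv", block_size=512)
loader = DataLoader(dataset, batch_size=8)
batch = next(iter(loader))
print(batch.shape)  # (8, 512) tensor of token ids, padded/truncated by the tokenizer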
def get_preds(list_of_texts):
    transformer_layer = (transformers.TFDistilBertModel.from_pretrained(
        'distilbert-base-multilingual-cased'))

    model = build_model(transformer_layer, max_len=MAX_LEN)
    model.load_weights('model/weights')

    #model = tf.keras.models.load_model('model')

    print('weights loaded')

    tokenizer = transformers.DistilBertTokenizer.from_pretrained(
        'distilbert-base-multilingual-cased')
    tokenizer.save_pretrained('.')
    # Reload it with the huggingface tokenizers library
    fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)

    fast_tokenizer.enable_truncation(max_length=MAX_LEN)
    fast_tokenizer.enable_padding(length=MAX_LEN)

    all_ids = []
    encs = fast_tokenizer.encode_batch(list_of_texts)
    all_ids.extend([enc.ids for enc in encs])

    all_ids = np.array(all_ids).astype(np.float32)

    to_predict = create_test(all_ids)

    predictions = model.predict(to_predict)
    #print(predictions*10)

    for prediction in predictions:
        print(prediction)

    dic = {'predictions': predictions}

    parsed = []
    #response = pd.DataFrame(dic)
    #parsed = response.to_json(orient = 'columns') #not sure if works
    #json.dumps(parsed)           #to be reviewed
    return parsed, predictions
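# Usage sketch (hedged): build_model, create_test, MAX_LEN and the saved model
# weights are assumed to exist as in the surrounding project.
parsed, predictions = get_preds(["great service, would recommend",
                                 "terrible experience"])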
Example #9
class Tokenizer:
    def __init__(self, lang):
        """
        A Tokenizer class that loads a cached vocabulary or trains a custom
        tokenizer using the Hugging Face `tokenizers` library.
        """
        self.tokenizer_dir = r"data/{}".format(lang)
        if not os.path.exists(self.tokenizer_dir):
            os.mkdir(self.tokenizer_dir)
        self.vocab = self.tokenizer_dir + "/vocab.txt"
        if os.path.exists(self.vocab):
            print("Initialized tokenizer using cached vocab file {}".format(self.vocab))
            self.tokenizer = BertWordPieceTokenizer(vocab_file=self.vocab)
        else:
            self.tokenizer = BertWordPieceTokenizer()

        self.tokenizer.enable_padding(max_length=MAX_LENGTH)
        self.tokenizer.enable_truncation(max_length=MAX_LENGTH)

    def train_tokenizer(self, sentences):
        """
        Train a tokenizer with a list of sentences
        """

        if not os.path.exists(self.vocab):
            print("Training tokenizer for {}".format(self.tokenizer_dir))
            # The Hugging Face trainer only accepts files, so write the sentences to a temporary file
            with open(self.tokenizer_dir + "/data.txt", "w+", encoding="utf-8") as f:
                for sentence in sentences:
                    f.write(sentence + "\n")
            self.tokenizer.train([self.tokenizer_dir + "/data.txt"])
            self.tokenizer.save(self.tokenizer_dir)
            print("Trained a tokenizer with vocab size {}".format(self.tokenizer.get_vocab_size()))

            # Removing the temp file
            os.remove(self.tokenizer_dir + "/data.txt")

    def encode(self, decoded):
        return self.tokenizer.encode(decoded)

    def decode(self, encoded):
        return self.tokenizer.decode_batch(encoded)
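# Usage sketch (hedged): MAX_LENGTH is assumed to be a module-level constant
# (e.g. MAX_LENGTH = 128), a data/ directory is assumed to exist, and saving
# follows the older tokenizers API used by this class.
en_tokenizer = Tokenizer("en")
en_tokenizer.train_tokenizer(["hello world", "how are you doing today"])
enc = en_tokenizer.encode("hello world")
print(enc.ids, enc.attention_mask)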
Example #10
class SentimentModel:
    def __init__(self, model_dir):
        # load session and graph
        self.sess = tf.Session(graph=tf.Graph())
        tf.saved_model.loader.load(self.sess, ['serve'], export_dir=model_dir)
        self.tokenizer = BertWordPieceTokenizer(os.path.join(model_dir, 'vocab.txt'))
        self.tokenizer.enable_truncation(max_length=MAX_LEN)

    def predict(self, text):
        tokenized = self.tokenizer.encode(text)
        token_ids = tokenized.ids
        segment_ids = tokenized.type_ids
        token_ids, segment_ids = np.array([token_ids]), np.array([segment_ids])
        # placeholder
        input_token = self.sess.graph.get_tensor_by_name('Input-Token:0')
        input_segment = self.sess.graph.get_tensor_by_name('Input-Segment:0')
        output = self.sess.graph.get_tensor_by_name('label/Softmax:0')

        probas = self.sess.run([output], feed_dict={input_token: token_ids,
                                                    input_segment: segment_ids})
        return tuple(probas[0][0].tolist())
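# Usage sketch (hedged): the directory is a placeholder TF1 SavedModel export
# that also contains vocab.txt, and MAX_LEN is assumed to be defined globally.
sentiment = SentimentModel("saved_sentiment_model/")
probas = sentiment.predict("this product exceeded my expectations")
print(probas)  # e.g. (prob_negative, prob_positive)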
Example #11
def train_tokenizer(filename, params):
    """
    Train a BertWordPieceTokenizer with the specified params and save it
    """
    # Get tokenization params
    save_location = params["tokenizer_path"]
    max_length = params["max_length"]
    min_freq = params["min_freq"]
    vocabsize = params["vocab_size"]

    # lowercase must be passed to the constructor; assigning do_lower_case afterwards has no effect
    tokenizer = BertWordPieceTokenizer(lowercase=False)
    special_tokens = ["[S]","[PAD]","[/S]","[UNK]","[MASK]", "[SEP]","[CLS]"]
    tokenizer.train(files=[filename], vocab_size=vocabsize, min_frequency=min_freq, special_tokens = special_tokens)

    tokenizer._tokenizer.post_processor = BertProcessing(("[SEP]", tokenizer.token_to_id("[SEP]")), ("[CLS]", tokenizer.token_to_id("[CLS]")),)
    tokenizer.enable_truncation(max_length=max_length)

    print("Saving tokenizer ...")
    if not os.path.exists(save_location):
        os.makedirs(save_location)
    tokenizer.save(save_location)
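# Usage sketch (hedged): the params dict mirrors the keys read at the top of
# the function; "corpus.txt" is a placeholder training file.
params = {
    "tokenizer_path": "tokenizer/",
    "max_length": 128,
    "min_freq": 2,
    "vocab_size": 30000,
}
train_tokenizer("corpus.txt", params)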
def main():
    
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

     ## Other parameters
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--max_seq_length", default=512, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Rul evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--num_eval_docs", default=1000, type=int,
                        help="number of docs per query in eval set.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")     
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument('--logging_steps', type=int, default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps', type=int, default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--msmarco_output", action='store_true',
                        help="Return msmarco output format file")

    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 1  # force single-GPU training even if more devices are available
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt = '%m/%d/%Y %H:%M:%S',
                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
    #                args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))
    
    # Set seed
    set_seed(args)
    num_labels=2
    config = BertConfig.from_pretrained("bert-base-uncased", num_labels=num_labels)
    tokenizer = BertWordPieceTokenizer(f"{args.data_dir}/bert_based_uncased_vocab.txt", lowercase=True)
    tokenizer.enable_truncation(args.max_seq_length)
    tokenizer.enable_padding(direction='right', max_length=args.max_seq_length)
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
    model.to(args.device)

    args.output_mode='classification'

    logger.info("Training/evaluation parameters %s", args)

    if args.do_train:
        dataset_path = f'{args.data_dir}/triples.unique.eq.train.small.csv'
        train_dataset = LazyTextDataset(dataset_path, tokenizer, args.max_seq_length)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)


    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))



    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = BertWordPieceTokenizer(f"{args.data_dir}/bert_based_uncased_vocab.txt", lowercase=True)
        tokenizer.enable_truncation(args.max_seq_length)
        tokenizer.enable_padding(direction='right', max_length=args.max_seq_length)
        checkpoints = [args.output_dir]  # can specify only one checkpoint, e.g. checkpoints = [f'{args.data_dir}/checkpoint-{args.checkpoint}']
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
            
            model = BertForSequenceClassification.from_pretrained(checkpoint)
            model.to(args.device)
            evaluate(args, model, tokenizer, prefix=prefix, set_name='eval', global_step=global_step)
            

    return results


if __name__ == "__main__":
    main()
Example #13
# DOWNLOAD THE PRE-TRAINED PT-BR BERT CHECKPOINT
if not os.path.exists('bert-base-portuguese-cased_pytorch_checkpoint.zip'):
  wget.download("https://neuralmind-ai.s3.us-east-2.amazonaws.com/nlp/bert-base-portuguese-cased/bert-base-portuguese-cased_pytorch_checkpoint.zip")
  !unzip bert-base-portuguese-cased_pytorch_checkpoint.zip -d bert-portuguese

# CREATE THE TOKENIZER FROM A VOCABULARY FILE
# LOWERCASE = FALSE (DOES NOT LOWERCASE THE INPUTS; KEEPS THE ORIGINAL CASING)
# STRIP_ACCENTS = FALSE (KEEPS THE ACCENTS)
tokenizer = BertWordPieceTokenizer("vocab.txt", lowercase=False, strip_accents=False)

# PRINT THE TOKENIZER INFO
print(tokenizer)

# ENABLE TRUNCATION AND PADDING
tokenizer.enable_truncation(max_length=60)
tokenizer.enable_padding()


# TOKENIZE ALL SENTENCES IN ONE BATCH
# .TOLIST() IS NEEDED BECAUSE `sentencas` IS A NUMPY ARRAY
output = tokenizer.encode_batch(sentencas.tolist())

# THE TOKENIZER RETURNS A LIST OF Encoding OBJECTS
# EACH OBJECT EXPOSES ids, tokens AND attention_mask
# LOOP OVER THE OBJECTS TO COLLECT THE ids AND attention_mask INTO LISTS
ids=[x.ids for x in output]
attention_mask = [x.attention_mask for x in output]

print(len(ids))
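# Sketch (hedged): turn the id/mask lists into PyTorch tensors; this works
# because enable_padding() above pads every encoding in the batch to the same
# length. `torch` is an added import for this sketch.
import torch

input_ids = torch.tensor(ids, dtype=torch.long)
attention_masks = torch.tensor(attention_mask, dtype=torch.long)
print(input_ids.shape, attention_masks.shape)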
Example #14
class Vocab:
    """Regulard vocabulary for holding the conversations and number of words."""

    DEFAULT_CONTEXT = 'default'

    def __init__(self, max_seq_len: int, conversation_depth: int = 4):
        self.words = {}
        self._context = Vocab.DEFAULT_CONTEXT
        self.conversations = {}
        self._held_conversations = {}
        self.conversation_depth = conversation_depth
        self.longest = 0
        self.longest_tokenized = 0
        self.tokenizer = BertWordPieceTokenizer(
            'data/bert-base-uncased-vocab.txt', lowercase=True)
        self.tokenizer.enable_truncation(max_seq_len)

    def add_word(self, word: str) -> None:
        word = word.lower()
        if not word in self.words:
            self.words[word] = 0
        self.words[word] += 1

    def add_sentence(self, sentence: str) -> None:
        [self.add_word(s) for s in sentence.split()]

    def switch_context(self, new_context: str) -> None:
        if self._context in self._held_conversations and len(
                self._held_conversations[
                    self._context]) > self.conversation_depth:
            self.conversations[self._context].append(self._held_conversations[
                self._context][-self.conversation_depth:][::-1])
        self._context = new_context

    def add_conversation(self, conversation: Dict[str, object]) -> None:
        if not self._context in self.conversations:
            self.conversations[self._context] = []
        self.add_line(conversation)
        lc = self._held_conversations[self._context][-1]
        line = lc['line'].split()
        if len(line) > self.longest:
            self.longest = len(line)
        tokenized = self.tokenizer.encode(lc['line'])
        if len(tokenized.ids) > self.longest_tokenized:
            self.longest_tokenized = len(tokenized.ids)

    def add_line(self, conversation: Dict[str, object]) -> bool:
        if not self._context in self._held_conversations or len(
                self._held_conversations[self._context]) == 0:
            self._held_conversations[self._context] = [conversation]
            return True
        hc = self._held_conversations[self._context]  # Held Conversation
        lc = hc[-1]  # Last conversation

        same_speaker = len(lc['speaker']) > 0 and lc[
            'speaker'] == conversation['speaker'] and lc['speaker'] != 'NTP'
        continuing_line = (len(lc['speaker']) == 0 or lc['speaker'] == 'NTP') and \
            (len(conversation['speaker']) == 0 or conversation['speaker'] == 'NTP') \
            and len(conversation['line']) > 0 and conversation['line'][0].islower()

        if same_speaker or continuing_line and conversation['when'] - lc[
                'when'] < 1000 * 60 * 1.5:
            hc[-1]['when'] = conversation['when']
            hc[-1]['line'] += f" {conversation['line']}"
            return False
        if len(self._held_conversations[self._context]) >= 2:
            if self.conversation_depth > 2:
                self.conversations[self._context].append(hc[-2:][::-1])
            self.conversations[self._context].append(
                hc[-min(self.conversation_depth, len(hc)):][::-1])
        hc.append(conversation)
        return True

    def get_tokenizer(self) -> BaseTokenizer:
        return self.tokenizer
Example #15
import torch
from google.cloud import storage
import tokenizers
from transformers import BertTokenizer
from tokenizers import BertWordPieceTokenizer
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.utils.data.sampler import RandomSampler
import numpy as np
import random
import jieba
import logging
logging.getLogger("jieba").setLevel(logging.WARNING)

tokenizer = BertWordPieceTokenizer(vocab_file='../tokenizer/vocab.txt')
tokenizer.add_special_tokens(["<nl>"])
tokenizer.enable_truncation(max_length=512)
tokenizer.enable_padding(length=512)
client = storage.Client()
blobs = []
size = 0
for blob in client.list_blobs('tfrc-tfrc', prefix='public_model/corpus/'):
    if (blob.name.endswith('.txt')):
        blobs.append(blob)

sub_blobs = random.sample(blobs, 5)


def iterator_gen(generator, handler=None, parallel=False):
    try:
        import gc
        import multiprocessing as multiprocessing
Example #16
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=6e-6),
                  loss='binary_crossentropy',
                  metrics=['accuracy', 'AUC'])

    return model


transformer_layer = (transformers.TFDistilBertModel.from_pretrained(
    'distilbert-base-multilingual-cased'))

model = build_model(transformer_layer, max_len=MAX_LEN)
model.load_weights('/home/aziz/vneuron/model/weights')

fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
fast_tokenizer.enable_truncation(max_length=MAX_LEN)
fast_tokenizer.enable_padding(length=MAX_LEN)

app = Flask(__name__)


@app.route('/')
def index():

    return render_template('index.html')


@app.route('/predict', methods=['POST'])
def predict():
    text = request.form['content']
    text = [str(text)]
Example #17
 def tokenizer(
         self) -> Union[BaseTokenizer, CountVectorizer, TfidfVectorizer]:
     pkl_path = os.path.join(self.tokenizer_path, "model.pkl")
     if self._tokenizer is not None:
         return self._tokenizer
     ### get pickled tokenizer
     if os.path.exists(pkl_path) and not self.retrain_tokenizer:
         with open(pkl_path, 'rb') as f:
             tokenizer = pickle.load(f)
     ### train new tokenizer
     else:
         self.retrain_tokenizer = False
         if self.algorithm == 'bert':
             from tokenizers import BertWordPieceTokenizer
             tokenizer = BertWordPieceTokenizer(
                 vocab_file=None if self._init_vocabulary is None else os.
                 path.join(self.cache_path, "bert_vocab.txt"))
             tokenizer.enable_truncation(max_length=self.max_length)
             tokenizer.enable_padding(length=self.max_length)
             # train the tokenizer
             if self._init_vocabulary is None:
                 path = os.path.join(self.cache_path, 'train.txt')
                 with open(path, 'w') as f:
                     for i in chain(self.train_text, self.valid_text,
                                    self.test_text):
                         if len(i) == 0:
                             continue
                         f.write(i + "\n" if i[-1] != "\n" else i)
                 tokenizer.train(files=path,
                                 vocab_size=self.vocab_size,
                                 min_frequency=self.min_frequency,
                                 limit_alphabet=self.limit_alphabet,
                                 show_progress=True)
             tokenizer.save_model(self.tokenizer_path)
         elif self.algorithm in ('count', 'tf', 'tfidf'):
             if self.algorithm == 'count':
                 tokenizer = CountVectorizer(
                     input='content',
                     ngram_range=self.ngram_range,
                     min_df=self.min_frequency,
                     max_df=self.max_frequency,
                     max_features=self.vocab_size,
                     vocabulary=self._init_vocabulary,
                     tokenizer=_simple_tokenizer,
                     stop_words='english')
             elif self.algorithm in ('tf', 'tfidf'):
                 tokenizer = TfidfVectorizer(
                     input='content',
                     ngram_range=self.ngram_range,
                     min_df=self.min_frequency,
                     max_df=self.max_frequency,
                     max_features=self.vocab_size,
                     stop_words='english',
                     vocabulary=self._init_vocabulary,
                     tokenizer=_simple_tokenizer,
                     use_idf=False if self.algorithm == 'tf' else True)
             tokenizer.fit((_simple_preprocess(i) for i in chain(
                 self.train_text, self.valid_text, self.test_text)))
         else:
             raise NotImplementedError
         # save the pickled model
         with open(pkl_path, "wb") as f:
             pickle.dump(tokenizer, f)
     ### assign and return
     self._tokenizer = tokenizer
     return self._tokenizer
Example #18
import onnxruntime as ort
from tokenizers import BertWordPieceTokenizer

# Helper scripts
from .PreprocessData import normalize_text, truncate_text
from .Predict import get_ids_and_masks, predict

# Initialize ONNX runtime and language model tokenizer
vocab_file_path = os.path.join(os.path.dirname(__file__),
                               "Model/bert-base-uncased-vocab.txt")
onnx_file_path = os.path.join(os.path.dirname(__file__),
                              "Model/watchdog_model.onnx")

tokenizer = BertWordPieceTokenizer(vocab_file_path)
tokenizer.enable_padding(pad_id=0, pad_token="[PAD]", length=128)
tokenizer.enable_truncation(max_length=128)

ort_session = ort.InferenceSession(onnx_file_path)


def main(req: func.HttpRequest) -> func.HttpResponse:
    logging.info('Invoked TextQualityWatchdog Skill.')

    try:
        body = json.dumps(req.get_json())

        if body:
            logging.info(body)
            values = json.loads(body)['values']
            results = {}
            results["values"] = []
 def train_tokenizer(captions):
     # initialize the tokenizer from the generated vocabulary file
     tokenizer = BertWordPieceTokenizer('captions-vocab.txt')
     tokenizer.enable_truncation(max_length=16)
     return tokenizer
Example #20
class Vocab:
    """Regulard vocabulary for holding the conversations and number of words."""

    DEFAULT_CONTEXT = 'default'
    SENTENCE_CUTOFF_DURATION = 1000 * 6

    def __init__(self, max_seq_len: int, conversation_depth: int = 4):
        self.words = {}
        self._context = Vocab.DEFAULT_CONTEXT
        self.conversations = {}
        self._held_conversations = {}
        self.conversation_depth = conversation_depth
        self.longest = 0
        self.longest_tokenized = 0
        self.tokenizer = BertWordPieceTokenizer(
            'data/bert-base-uncased-vocab.txt', lowercase=True)
        self.tokenizer.enable_truncation(max_seq_len)

    def add_word(self, word: str) -> None:
        word = word.lower()
        if not word in self.words:
            self.words[word] = 0
        self.words[word] += 1

    def add_sentence(self, sentence: str) -> None:
        [self.add_word(s) for s in sentence.split()]

    def switch_context(self, new_context: str) -> None:
        if self._context in self._held_conversations and len(
                self._held_conversations[
                    self._context]) > self.conversation_depth:
            self.conversations[self._context].append(self._held_conversations[
                self._context][-self.conversation_depth:])
        self._context = new_context

    def add_conversation(self, conversation: Dict[str, object]) -> None:
        if not self._context in self.conversations:
            self.conversations[self._context] = []
        self.add_line(conversation)
        lc = self._held_conversations[self._context][-1]
        line = lc['line'].split()
        if len(line) > self.longest:
            self.longest = len(line)
        tokenized = self.tokenizer.encode(lc['line'])
        if len(tokenized.ids) > self.longest_tokenized:
            self.longest_tokenized = len(tokenized.ids)

    def add_line(self, conversation: Dict[str, object]) -> bool:
        if not self._context in self._held_conversations or len(
                self._held_conversations[self._context]) == 0:
            self._held_conversations[self._context] = [conversation]
            return True
        hc = self._held_conversations[self._context]  # Held Conversation
        lc = hc[-1]  # Last conversation

        same_speaker = len(
            lc['speaker']) > 0 and lc['speaker'] == conversation[
                'speaker'] and not lc['speaker'] in ['NTP', 'Text']
        continuing_line = (len(lc['speaker']) == 0 or lc['speaker'] in ['NTP', 'Text']) and \
            (len(conversation['speaker']) == 0 or conversation['speaker'] in ['NTP', 'Text']) \
            and len(conversation['line']) > 0 and conversation['line'][0].islower()

        if same_speaker or continuing_line and conversation['when'] - lc[
                'when'] < Vocab.SENTENCE_CUTOFF_DURATION:
            lc['when'] = conversation['when']
            lc['line'] += f" {conversation['line']}"
            return False
        if len(self._held_conversations[self._context]) >= 2:
            self.conversations[self._context].append(
                hc[-min(self.conversation_depth, len(hc)):])
        hc.append(conversation)
        if conversation['when'] - lc['when'] >= Vocab.SENTENCE_CUTOFF_DURATION:
            self._held_conversations[self._context] = [conversation]
        return True

    def get_tokenizer(self) -> BaseTokenizer:
        return self.tokenizer

    def get_conversations(self, in_seq_len: int, out_seq_len: int, \
        add_two_person: bool = True) -> List[Dict[str,List[int]]]:
        conversations = []
        for conversation in self.conversations.values():
            for dialogue in conversation:
                inputs = [
                    self.tokenizer.encode(y['line']) for y in dialogue[:-1]
                ][::-1]
                target = self.tokenizer.encode(dialogue[-1]['line'])
                target.pad(out_seq_len)
                target.truncate(out_seq_len)
                if add_two_person and self.conversation_depth > 2 and len(
                        dialogue) > 2 and len(inputs) > 0:
                    inputs[0].pad(in_seq_len)
                    inputs[0].truncate(in_seq_len)
                    conversations.append({
                        'inputs': inputs[0].ids,
                        'target': target.ids,
                        'mask': inputs[0].attention_mask
                    })
                inputs = Encoding.merge(inputs)
                inputs.pad(in_seq_len)
                inputs.truncate(in_seq_len)
                conversations.append({
                    'inputs': inputs.ids,
                    'target': target.ids,
                    'mask': inputs.attention_mask
                })
        return conversations
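# Usage sketch (hedged): 'data/bert-base-uncased-vocab.txt' must exist, as
# hard-coded in __init__; conversation dicts carry 'speaker', 'line' and 'when'
# keys, matching add_line above.
vocab = Vocab(max_seq_len=64, conversation_depth=4)
vocab.add_conversation({'speaker': 'A', 'line': 'Hi there.', 'when': 0})
vocab.add_conversation({'speaker': 'B', 'line': 'Hello!', 'when': 5000})
pairs = vocab.get_conversations(in_seq_len=64, out_seq_len=64)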
def main():
    start_time = time.time()
    args = parse_args()
    make_directories(args.output_dir)

    # Start Tensorboard and log hyperparams.
    tb_writer = SummaryWriter(args.output_dir)
    tb_writer.add_hparams(vars(args), {})

    file_log_handler = logging.FileHandler(
        os.path.join(args.output_dir, 'log.txt'))
    logger.addHandler(file_log_handler)

    # Get list of text and list of label (integers) from disk.
    train_text, train_label_id_list, eval_text, eval_label_id_list = \
        get_examples_and_labels(args.dataset)

    # Augment training data.
    if (args.augmentation_recipe is not None) and len(
            args.augmentation_recipe):
        import pandas as pd

        if args.augmentation_recipe == 'textfooler':
            aug_csv = '/p/qdata/jm8wx/research/text_attacks/textattack/outputs/attack-1590551967800.csv'
        elif args.augmentation_recipe == 'tf-adjusted':
            aug_csv = '/p/qdata/jm8wx/research/text_attacks/textattack/outputs/attack-1590564015768.csv'
        else:
            raise ValueError(
                f'Unknown augmentation recipe {args.augmentation_recipe}')

        aug_df = pd.read_csv(aug_csv)

        # filter skipped outputs
        aug_df = aug_df[aug_df['original_text'] != aug_df['perturbed_text']]

        print(
            f'Augmentation recipe {args.augmentation_recipe} / augmentation num. examples {args.augmentation_num}/ len {len(aug_df)}'
        )

        original_text = aug_df['original_text']
        perturbed_text = aug_df['perturbed_text']

        # convert `train_text` and `train_label_id_list` to an np array so things are faster
        train_text = np.array(train_text)
        train_label_id_list = np.array(train_label_id_list)

        x_adv_list = []
        x_adv_id_list = []
        for (x, x_adv) in zip(original_text, perturbed_text):
            x = x.replace('[[', '').replace(']]', '')
            x_adv = x_adv.replace('[[', '').replace(']]', '')
            x_idx = (train_text == x).nonzero()[0][0]
            x_adv_label = train_label_id_list[x_idx]
            x_adv_id_list.append(x_adv_label)
            x_adv_list.append(x_adv)

        # truncate to `args.augmentation_num` examples
        if (args.augmentation_num >= 0):
            perm = list(range(len(x_adv_list)))
            random.shuffle(perm)
            perm = perm[:args.augmentation_num]
            x_adv_list = [x_adv_list[i] for i in perm]
            x_adv_id_list = [x_adv_id_list[i] for i in perm]

        train_text = train_text.tolist() + x_adv_list
        train_label_id_list = train_label_id_list.tolist() + x_adv_id_list

        print(
            f'Augmentation added {len(x_adv_list)} examples, for a total of {len(train_text)}'
        )

    label_id_len = len(train_label_id_list)
    num_labels = len(set(train_label_id_list))
    logger.info('num_labels: %s', num_labels)

    train_examples_len = len(train_text)

    if len(train_label_id_list) != train_examples_len:
        raise ValueError(
            f'Number of train examples ({train_examples_len}) does not match number of labels ({len(train_label_id_list)})'
        )
    if len(eval_label_id_list) != len(eval_text):
        raise ValueError(
            f'Number of test examples ({len(eval_text)}) does not match number of labels ({len(eval_label_id_list)})'
        )

    print_cuda_memory(args)
    # old INFO:__main__:Loaded data and tokenized in 189.66675066947937s

    # @TODO support other vocabularies, or at least, support case
    tokenizer = BertWordPieceTokenizer('bert-base-uncased-vocab.txt',
                                       lowercase=True)
    tokenizer.enable_padding(max_length=args.max_seq_len)
    tokenizer.enable_truncation(max_length=args.max_seq_len)

    logger.info(f'Tokenizing training data. (len: {train_examples_len})')
    train_text_ids = [
        encoding.ids for encoding in tokenizer.encode_batch(train_text)
    ]
    logger.info(f'Tokenizing test data (len: {len(eval_label_id_list)})')
    eval_text_ids = [
        encoding.ids for encoding in tokenizer.encode_batch(eval_text)
    ]
    load_time = time.time()
    logger.info(f'Loaded data and tokenized in {load_time-start_time}s')

    print_cuda_memory(args)

    # Load pre-trained model tokenizer (vocabulary)
    logger.info('Loading model: %s', args.model_dir)
    # Load pre-trained model (weights)
    logger.info(f'Model class: (vanilla) BertForSequenceClassification.')
    model = BertForSequenceClassification.from_pretrained(
        args.model_dir, num_labels=num_labels)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    model.to(device)
    # print(model)

    # multi-gpu training
    if args.num_gpus > 1:
        model = torch.nn.DataParallel(model)
    logger.info(f'Training model across {args.num_gpus} GPUs')

    num_train_optimization_steps = int(
        train_examples_len / args.batch_size /
        args.grad_accum_steps) * args.num_train_epochs

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_proportion,
        num_training_steps=num_train_optimization_steps)

    global_step = 0

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", train_examples_len)
    logger.info("  Batch size = %d", args.batch_size)
    logger.info("  Max sequence length = %d", args.max_seq_len)
    logger.info("  Num steps = %d", num_train_optimization_steps)

    wandb.log({'train_examples_len': train_examples_len})

    train_input_ids = torch.tensor(train_text_ids, dtype=torch.long)
    train_label_ids = torch.tensor(train_label_id_list, dtype=torch.long)
    train_data = TensorDataset(train_input_ids, train_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size)

    eval_input_ids = torch.tensor(eval_text_ids, dtype=torch.long)
    eval_label_ids = torch.tensor(eval_label_id_list, dtype=torch.long)
    eval_data = TensorDataset(eval_input_ids, eval_label_ids)
    eval_sampler = RandomSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.batch_size)

    def get_eval_acc():
        correct = 0
        total = 0
        for input_ids, label_ids in tqdm.tqdm(eval_dataloader,
                                              desc="Evaluating accuracy"):
            input_ids = input_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model(input_ids)[0]

            correct += (logits.argmax(dim=1) == label_ids).sum()
            total += len(label_ids)

        return float(correct) / total

    def save_model():
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, args.weights_name)
        output_config_file = os.path.join(args.output_dir, args.config_name)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)

        logger.info(
            f'Best acc found. Saved tokenizer, model config, and model to {args.output_dir}.'
        )

    global_step = 0

    def save_model_checkpoint(checkpoint_name=None):
        # Save model checkpoint
        checkpoint_name = checkpoint_name or 'checkpoint-{}'.format(
            global_step)
        output_dir = os.path.join(args.output_dir, checkpoint_name)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir)
        torch.save(args, os.path.join(output_dir, 'training_args.bin'))
        logger.info('Checkpoint saved to %s.', output_dir)

    print_cuda_memory(args)
    model.train()
    best_eval_acc = 0
    steps_since_best_eval_acc = 0

    def loss_backward(loss):
        if args.num_gpus > 1:
            loss = loss.mean(
            )  # mean() to average on multi-gpu parallel training
        if args.grad_accum_steps > 1:
            loss = loss / args.grad_accum_steps
        loss.backward()

    for epoch in tqdm.trange(int(args.num_train_epochs), desc="Epoch"):
        prog_bar = tqdm.tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(prog_bar):
            print_cuda_memory(args)
            batch = tuple(t.to(device) for t in batch)
            input_ids, labels = batch
            logits = model(input_ids)[0]
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
            if global_step % args.tb_writer_step == 0:
                tb_writer.add_scalar('loss', loss, global_step)
                tb_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], global_step)
            loss_backward(loss)
            prog_bar.set_description(f"Loss {loss.item()}")
            if (step + 1) % args.grad_accum_steps == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1
            # Save model checkpoint to file.
            if global_step % args.checkpoint_steps == 0:
                save_model_checkpoint()

            model.zero_grad()

        # Check accuracy after each epoch.
        eval_acc = get_eval_acc()
        tb_writer.add_scalar('epoch_eval_acc', eval_acc, global_step)
        wandb.log({'epoch_eval_acc': eval_acc, 'epoch': epoch})

        if args.checkpoint_every_epoch:
            save_model_checkpoint(f'epoch-{epoch}')

        logger.info(f'Eval acc: {eval_acc*100}%')
        if eval_acc > best_eval_acc:
            best_eval_acc = eval_acc
            steps_since_best_eval_acc = 0
            save_model()
        else:
            steps_since_best_eval_acc += 1
            if (args.early_stopping_epochs > 0) and (
                    steps_since_best_eval_acc > args.early_stopping_epochs):
                logger.info(
                    f'Stopping early since it\'s been {args.early_stopping_epochs} steps since validation acc increased'
                )
                break
Example #22
def main():
    parser = ArgumentParser('GLUE evaluation example')
    parser.add_argument(
        '--glue_dir',
        type=str,
        metavar='PATH',
        required=True,
        help='Path to directory containing the GLUE tasks data.')
    parser.add_argument(
        '--output_dir',
        type=str,
        metavar='PATH',
        required=True,
        help=
        'Path to the output directory (for logs, checkpoints, parameters, etc.).'
    )
    parser.add_argument('-f',
                        '--force',
                        action='store_true',
                        help='Overwrite output_dir if it already exists.')
    parser.add_argument(
        '--task_name',
        type=str,
        default=None,
        choices=GLUE_TASKS,
        help='The specific GLUE task to train and/or evaluate on.')
    parser.add_argument('--do_train',
                        action='store_true',
                        help='Whether to run training.')
    parser.add_argument('--do_eval',
                        action='store_true',
                        help='Whether to run eval (on the dev set).')
    parser.add_argument('--config_file',
                        type=str,
                        metavar='PATH',
                        required=True,
                        help='Path to the model configuration.')
    parser.add_argument('--weights_file',
                        type=str,
                        metavar='PATH',
                        required=True,
                        help='Path to the model initialization weights.')
    parser.add_argument('--tokenizer_vocab_file',
                        type=str,
                        metavar='PATH',
                        required=True,
                        help='Path to the tokenizer vocabulary.')
    parser.add_argument('--overwrite_cache',
                        action='store_true',
                        help='Overwrite the cache if it already exists.')
    parser.add_argument('--max_sequence_len',
                        type=int,
                        default=128,
                        metavar='N',
                        help='The maximum length of a sequence.')
    parser.add_argument('--do_lower_case',
                        action='store_true',
                        help='Whether to lowercase the input when tokenizing.')
    parser.add_argument('-n',
                        '--num_epochs',
                        type=int,
                        default=3,
                        metavar='N',
                        help='The number of distillation epochs.')
    parser.add_argument('--per_gpu_train_batch_size',
                        type=int,
                        default=8,
                        metavar='N',
                        help='The batch size per GPU used during training.')
    parser.add_argument('--per_gpu_eval_batch_size',
                        type=int,
                        default=8,
                        metavar='N',
                        help='The batch size per GPU used during evaluation.')
    parser.add_argument('-lr',
                        '--learning_rate',
                        type=float,
                        default=2e-5,
                        metavar='F',
                        help='The initial learning rate.')
    parser.add_argument('--epsilon',
                        type=float,
                        default=1e-8,
                        metavar='F',
                        help="Adam's epsilon.")
    parser.add_argument('--warmup_prop',
                        type=float,
                        default=0.05,
                        metavar='F',
                        help='Linear warmup proportion.')
    parser.add_argument(
        '--num_gradient_accumulation_steps',
        type=int,
        default=1,
        metavar='N',
        help=
        'The number of gradient accumulation steps (for larger batch sizes).')
    parser.add_argument('--max_gradient_norm',
                        type=float,
                        default=1.0,
                        metavar='F',
                        help='The maximum gradient norm.')
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        metavar='N',
                        help='Random seed.')
    parser.add_argument('-c',
                        '--use_cuda',
                        action='store_true',
                        help='Whether to use cuda or not.')
    parser.add_argument(
        '-d',
        '--use_distributed',
        action='store_true',
        help='Whether to use distributed training (distillation) or not.')
    parser.add_argument('--local_rank',
                        type=int,
                        default=-1,
                        metavar='N',
                        help='Local process rank.')
    params = parser.parse_args()

    if not params.use_distributed:
        params.local_rank = 0
        params.train_batch_size = params.per_gpu_train_batch_size
        params.eval_batch_size = params.per_gpu_eval_batch_size
    else:
        params.num_gpus = torch.cuda.device_count()
        params.train_batch_size = params.per_gpu_train_batch_size * params.num_gpus
        params.eval_batch_size = params.per_gpu_eval_batch_size * params.num_gpus
    params.is_master = params.local_rank == 0

    if params.use_cuda:
        device = torch.device('cuda', params.local_rank)
    else:
        device = torch.device('cpu')

    # make output_dir
    if Path(params.output_dir).is_dir() and not params.force:
        raise ValueError(
            f'Output directory {params.output_dir} already exists. Use `--force` if you want to overwrite it.'
        )
    if params.is_master:
        Path(params.output_dir).mkdir(parents=True, exist_ok=params.force)

        # dump params
        json.dump(vars(params),
                  open(Path(params.output_dir) / 'params.json', 'w'),
                  indent=4,
                  sort_keys=True)
    params.glue_dir = Path(params.glue_dir)
    params.output_dir = Path(params.output_dir)
    params.device = device

    # initialize multi-GPU
    if params.use_distributed:
        if params.is_master:
            logger.info('Initializing PyTorch distributed')
        torch.cuda.set_device(params.local_rank)
        dist.init_process_group(backend='nccl', init_method='env://')

    # set seed(s)
    if params.is_master:
        logger.info('Setting random seed(s)')
    random.seed(params.seed)
    np.random.seed(params.seed)
    torch.manual_seed(params.seed)
    if params.use_distributed:
        torch.cuda.manual_seed_all(params.seed)

    # initialize the tokenizer
    if params.is_master:
        logger.info('Initializing the tokenizer')
    tokenizer = BertWordPieceTokenizer(params.tokenizer_vocab_file,
                                       lowercase=params.do_lower_case)

    # enable truncation and padding
    tokenizer.enable_truncation(params.max_sequence_len)
    tokenizer.enable_padding(length=params.max_sequence_len)

    # go over each task
    if params.task_name is not None:
        tasks = [params.task_name]
        output_dirs = [params.output_dir]
    else:
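        # no explicit task given: run all GLUE tasks, nesting outputs per task and seed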
        tasks = GLUE_TASKS
        output_dirs = [
            params.output_dir / task / str(params.seed) for task in tasks
        ]

    for task, task_output_dir in zip(tasks, output_dirs):
        # prepare the GLUE task
        if params.is_master:
            logger.info(f'Preparing the {task} GLUE task')

        # make task_output_dir
        if task_output_dir.is_dir() and not params.force:
            raise ValueError(
                f'Task output directory {task_output_dir} already exists. Use `--force` if you want to overwrite it.'
            )
        if params.is_master:
            task_output_dir.mkdir(parents=True, exist_ok=params.force)

        # initialize the model
        if params.is_master:
            logger.info(f'{task} - Initializing the model')
        config = DistilBertConfig.from_pretrained(
            params.config_file,
            num_labels=len(GLUE_TASKS_MAPPING[task]['labels']),
            finetuning_task=task)
        model = DistilBertForSequenceClassification.from_pretrained(
            params.weights_file, config=config)

        # send model to device
        model = model.to(params.device)

        # perform the training
        if params.do_train:
            # initialize the training dataset
            if params.is_master:
                logger.info(f'{task} - Initializing the training dataset')
            train_dataset = GLUETaskDataset(
                task=task,
                glue_dir=params.glue_dir,
                split='train',
                tokenizer=tokenizer,
                overwrite_cache=params.overwrite_cache)

            # initialize the sampler
            if params.is_master:
                logger.info(f'{task} - Initializing the training sampler')
            train_sampler = DistributedSampler(
                train_dataset) if params.use_distributed else RandomSampler(
                    train_dataset)

            # initialize the dataloader
            if params.is_master:
                logger.info(f'{task} - Initializing the training dataloader')
            train_dataloader = DataLoader(dataset=train_dataset,
                                          sampler=train_sampler,
                                          batch_size=params.train_batch_size)

            # initialize the optimizer
            if params.is_master:
                logger.info(f'{task} - Initializing the optimizer')
            optimizer = optim.Adam(model.parameters(),
                                   lr=params.learning_rate,
                                   eps=params.epsilon,
                                   betas=(0.9, 0.98))

            # initialize the learning rate scheduler
            if params.is_master:
                logger.info(
                    f'{task} - Initializing the learning rate scheduler')
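            # total optimizer steps = batches per epoch / accumulation steps, times the number of epochs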
            num_steps_epoch = len(train_dataloader)
            num_train_steps = math.ceil(
                num_steps_epoch / params.num_gradient_accumulation_steps *
                params.num_epochs)
            num_warmup_steps = math.ceil(num_train_steps * params.warmup_prop)

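            # linear warmup from 0 to the peak learning rate over the first `warmup_prop`
            # fraction of the training steps, then linear decay back down to 0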
            def lr_lambda(current_step):
                if current_step < num_warmup_steps:
                    return float(current_step) / float(max(
                        1, num_warmup_steps))
                return max(
                    0.0,
                    float(num_train_steps - current_step) /
                    float(max(1, num_train_steps - num_warmup_steps)))

            lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer=optimizer,
                                                       lr_lambda=lr_lambda,
                                                       last_epoch=-1)

            # initialize distributed data parallel (DDP)
            if params.use_distributed:
                if params.is_master:
                    logger.info('Initializing DDP')
                model = DDP(model,
                            device_ids=[params.local_rank],
                            output_device=params.local_rank)

            # start training
            if params.is_master:
                logger.info(f'{task} - Starting the training')
            train(task=task,
                  model=model,
                  dataloader=train_dataloader,
                  optimizer=optimizer,
                  num_epochs=params.num_epochs,
                  lr_scheduler=lr_scheduler,
                  num_gradient_accumulation_steps=params.num_gradient_accumulation_steps,
                  max_gradient_norm=params.max_gradient_norm,
                  device=params.device,
                  use_distributed=params.use_distributed,
                  is_master=params.is_master,
                  use_tqdm=True,
                  logger=logger)

            # save the finetuned model
            if params.is_master:
                # take care of distributed training
                model_to_save = model.module if hasattr(model,
                                                        'module') else model
                model_to_save.config.architectures = [
                    model_to_save.__class__.__name__
                ]

                logger.info(f'{task} - Saving the finetuned model config')
                json.dump(vars(model_to_save.config),
                          open(task_output_dir /
                               TRAINED_CONFIG_FILE_TEMPLATE.format(
                                   model_name=model_to_save.__class__.__name__,
                                   task=task),
                               mode='w'),
                          indent=4,
                          sort_keys=True)

                logger.info(f'{task} - Saving the finetuned model weights')
                torch.save(
                    model_to_save.state_dict(),
                    task_output_dir / TRAINED_WEIGHTS_FILE_TEMPLATE.format(
                        model_name=model_to_save.__class__.__name__,
                        task=task))

                # reload the model
                if params.do_eval:
                    logger.info(f'{task} - Reloading the model')
                    config = DistilBertConfig.from_pretrained(
                        str(task_output_dir /
                            TRAINED_CONFIG_FILE_TEMPLATE.format(
                                model_name=model_to_save.__class__.__name__,
                                task=task)),
                        num_labels=len(GLUE_TASKS_MAPPING[task]['labels']),
                        finetuning_task=task)
                    model = DistilBertForSequenceClassification.from_pretrained(
                        str(task_output_dir /
                            TRAINED_WEIGHTS_FILE_TEMPLATE.format(
                                model_name=model_to_save.__class__.__name__,
                                task=task)),
                        config=config)
                    model = model.to(params.device)

        # perform the evaluation
        if params.do_eval and params.is_master:
            # initialize the evaluation dataset
            logger.info(f'{task} - Initializing the evaluation dataset')
            eval_datasets = [
                GLUETaskDataset(task=task,
                                glue_dir=params.glue_dir,
                                split='dev',
                                tokenizer=tokenizer,
                                overwrite_cache=params.overwrite_cache)
            ]

            # hot fix for MNLI-MM
            if task == 'MNLI':
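                # MNLI ships a second "mismatched" dev set, evaluated here as its own task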
                eval_datasets.append(
                    GLUETaskDataset(task='MNLI-MM',
                                    glue_dir=params.glue_dir,
                                    split='dev',
                                    tokenizer=tokenizer))

            for eval_dataset in eval_datasets:
                # initialize the sampler
                logger.info(
                    f'{eval_dataset.task} - Initializing the evaluation sampler'
                )
                eval_sampler = SequentialSampler(eval_dataset)

                # initialize the dataloader
                logger.info(
                    f'{eval_dataset.task} - Initializing the evaluation dataloader'
                )
                eval_dataloader = DataLoader(dataset=eval_dataset,
                                             sampler=eval_sampler,
                                             batch_size=params.eval_batch_size)

                # start evaluating
                logger.info(f'{eval_dataset.task} - Starting the evaluation')
                results = evaluate(task=task,
                                   model=model,
                                   dataloader=eval_dataloader,
                                   device=params.device,
                                   use_tqdm=True)

                # log results
                logger.info(f'{eval_dataset.task} - Evaluation results:')
                for key, result in results.items():
                    logger.info(f'{eval_dataset.task} -  {key}: {result}')

                # dump results
                json.dump(results,
                          open(
                              task_output_dir / RESULTS_FILE_TEMPLATE.format(
                                  model_name=model.__class__.__name__,
                                  task=eval_dataset.task), 'w'),
                          indent=4)

        if params.is_master:
            logger.info(f'Done with the {task} GLUE task')
Example #23
print(movie_reviews.columns.values)
print(movie_reviews.sentiment.unique())

y = movie_reviews["sentiment"]

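# map the sentiment labels to integers: "positive" -> 1, everything else -> 0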
y = np.array(list(map(lambda x: 1 if x == "positive" else 0, y)))

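# download the pretrained BERT tokenizer and save its files locally so the fast
# WordPiece tokenizer below can be built from the resulting vocab.txt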
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from saved file
tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt", lowercase=True)
tokenizer.enable_truncation(MAX_SEQ_LEN - 2)
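# truncate to MAX_SEQ_LEN - 2, presumably to leave room for the [CLS] and [SEP]
# tokens when the final input features are assembled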

train_count = 40000
test_count = 2000

X_train = convert_sentences_to_features(reviews[:train_count], tokenizer)
X_test = convert_sentences_to_features(reviews[train_count : train_count + test_count], tokenizer)

one_hot_encoded = to_categorical(y)
# one_hot_encoded = tf.one_hot(y, 1)

# y_train = one_hot_encoded[:40000]
# y_test = one_hot_encoded[40000:]