import os
import sys
import tokenizers

sys.path.append("/usr/src/app/kaggle/tweet-sentiment-extraction")

MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 50
BERT_PATH = "inputs/bert-large-uncased-wwm-finetuned-squad"

TOKENIZER = tokenizers.BertWordPieceTokenizer(
    os.path.join(BERT_PATH, "vocab.txt"),
    lowercase=True,
)

INPUT_DIR = 'inputs'
OUT_DIR = 'models'
TRAIN_PATH = os.path.join(INPUT_DIR, "train.csv")
TEST_PATH = os.path.join(INPUT_DIR, "test.csv")
SAMPLE_PATH = os.path.join(INPUT_DIR, "sample_submission.csv")

FOLD0_ONLY = False
DEBUG = False
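For context, a minimal usage sketch of a config module like this one (the module name `config` and the sample tweet are assumptions, not part of the original example): the shared tokenizer encodes a tweet, and the returned offsets are what the extraction model later uses to map predicted token spans back to character spans.

# Hypothetical usage sketch, assuming this file is saved as config.py and the
# vocab.txt under BERT_PATH exists.
import config

enc = config.TOKENIZER.encode("I love this movie!")
print(enc.tokens)   # wordpiece tokens, including [CLS] and [SEP]
print(enc.offsets)  # (start, end) character offsets per token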
Example No. 2
# reference: https://huggingface.co/clagator/biobert_squad2_cased

from transformers import BertForQuestionAnswering
import torch
import time
import tokenizers
import os
BERT_PATH = "model"
# print(f"Bert path: {BERT_PATH}")
model = BertForQuestionAnswering.from_pretrained(f'{BERT_PATH}/')
tokenizer = tokenizers.BertWordPieceTokenizer(f"{BERT_PATH}/vocab.txt",
                                              lowercase=True)


def get_answer(question, context):
    tok_text = tokenizer.encode(question, context)
    tokens = tok_text.tokens
    # Placeholder label tensors; not used during inference.
    start_positions = torch.tensor([0])
    end_positions = torch.tensor([1])

    input_ids = tok_text.ids
    token_type_ids = tok_text.type_ids

    mask = [1] * len(token_type_ids)
    max_len = 50
    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)

    # Assumed completion of the truncated example: run the model and return
    # the highest-scoring answer span.
    with torch.no_grad():
        output = model(
            input_ids=torch.tensor([input_ids]),
            attention_mask=torch.tensor([mask]),
            token_type_ids=torch.tensor([token_type_ids]),
        )
    start_idx = int(torch.argmax(output.start_logits))
    end_idx = int(torch.argmax(output.end_logits))
    return " ".join(tokens[start_idx:end_idx + 1])
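A quick illustrative call (the question and context strings are invented for this sketch, and the `model` directory above is assumed to hold a SQuAD-finetuned checkpoint plus its vocab.txt):

if __name__ == "__main__":
    # Short invented example; question + context stay under the 50-token budget.
    context = ("BioBERT is a biomedical language representation model "
               "pre-trained on large-scale biomedical corpora.")
    print(get_answer("What is BioBERT pre-trained on?", context))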
Example No. 3
import logging
import os

import tokenizers
# create_spt_model is NeMo's SentencePiece training helper; the exact import
# path below is an assumption based on NeMo's tokenizer scripts.
from nemo.collections.common.tokenizers.sentencepiece_tokenizer import create_spt_model


def __process_data(
    text_path: str,
    dst_folder: str,
    vocab_size: int,
    tokenizer_type: str,
    spe_type: str,
    spe_character_coverage: float,
    spe_train_extremely_large_corpus: bool,
    spe_sample_size: int,
    spe_max_sentencepiece_length: int,
    spe_bos: bool,
    spe_eos: bool,
    spe_pad: bool,
    lower_case: bool,
):
    """
    Converts flac to wav and build manifests's json
    Args:
        text_path: source with text lines
        dst_folder: where wav files will be stored
        vocab_size: vocabular size used in encoding the text
        tokenizer_type: type of tokenization to perform - wpe or spe
        spe_type: type of tokenization model used for spe.
        spe_character_coverage: float value between 0 and 1 (as a percentage). For languages with a vast charset,
            can be < 1.0, but for all other languages, it should be set as 1.0
        spe_sample_size: int, default of -1. If positive integer is used, samples the dataset
            by given sample size.
        spe_train_extremely_large_corpus: bool. If dataset is too large, and user has sufficient RAM,
            this flag can be set to try to trained the tokenizer. Will silently fail if it runs out of RAM.
        spe_max_sentencepiece_length: Limits the maximum length of the SentencePiece subword that can be constructed.
            By default, no limit is placed.
        spe_bos: Bool flag, whether to add <s> to SentencePiece tokenizer vocabulary.
        spe_eos: Bool flag, whether to add </s> to SentencePiece tokenizer vocabulary.
        spe_pad: Bool flag, whether to add <pad> to SentencePiece tokenizer vocabulary.
        lower_case: whether to tokenize with lower case character set only (for english)

    Returns:
    """
    if tokenizer_type == 'spe':
        if spe_max_sentencepiece_length > 0:
            tokenizer_dir = os.path.join(
                dst_folder,
                'tokenizer_{}_{}_v{}_max{}'.format(tokenizer_type, spe_type,
                                                   vocab_size,
                                                   spe_max_sentencepiece_length))
        else:
            tokenizer_dir = os.path.join(
                dst_folder,
                'tokenizer_{}_{}_v{}'.format(tokenizer_type, spe_type,
                                             vocab_size))

        if not os.path.exists(tokenizer_dir):
            os.makedirs(tokenizer_dir)

        if os.path.exists(os.path.join(tokenizer_dir, 'tokenizer.model')):
            logging.warning(
                "Model file already exists, overriding old model file!")
            os.remove(os.path.join(tokenizer_dir, 'tokenizer.model'))

        tokenizer_path, vocab_path = create_spt_model(
            data_file=text_path,
            vocab_size=vocab_size,
            sample_size=spe_sample_size,
            do_lower_case=lower_case,
            output_dir=tokenizer_dir,
            tokenizer_type=spe_type,
            character_coverage=spe_character_coverage,
            train_extremely_large_corpus=spe_train_extremely_large_corpus,
            max_sentencepiece_length=spe_max_sentencepiece_length,
            bos=spe_bos,
            eos=spe_eos,
            pad=spe_pad,
        )

    else:
        tokenizer_dir = os.path.join(
            dst_folder, 'tokenizer_{}_v{}'.format(tokenizer_type, vocab_size))

        if not os.path.exists(tokenizer_dir):
            os.makedirs(tokenizer_dir)

        tokenizer = tokenizers.BertWordPieceTokenizer(lowercase=lower_case)

        tokenizer.train(text_path, vocab_size=vocab_size)
        tokenizer.save_model(tokenizer_dir)

    return tokenizer_dir
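For orientation, a hedged sketch of how this helper might be invoked; the paths, vocabulary size, and SPE settings below are invented, and on the WordPiece ('wpe') branch the SPE-specific arguments are simply ignored.

# Illustrative call with made-up arguments: train a WordPiece tokenizer from a
# plain-text corpus and print where the vocab files were written.
tokenizer_dir = __process_data(
    text_path="corpus/train_text.txt",
    dst_folder="tokenizers_out",
    vocab_size=8000,
    tokenizer_type="wpe",
    spe_type="unigram",
    spe_character_coverage=1.0,
    spe_train_extremely_large_corpus=False,
    spe_sample_size=-1,
    spe_max_sentencepiece_length=-1,
    spe_bos=False,
    spe_eos=False,
    spe_pad=False,
    lower_case=True,
)
print("tokenizer written to", tokenizer_dir)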
Example No. 4
    def __init__(self, df):
        self.df = df
        self.tokenizer = tokenizers.BertWordPieceTokenizer(
            "/mnt/bert-base-uncased/vocab.txt", lowercase=True
        )
        self.max_len = 128
Example No. 5
    print('[encoder] Found {} input files'.format(len(input_files)))

    output_prefix = 'sequences_'
    output_prefix += 'uppercase' if args.uppercase else 'lowercase'
    output_prefix += '_max_seq_len_' + str(args.max_seq_len)
    output_prefix += '_next_seq_task_' + str(args.next_seq_prob > 0).lower()

    args.output_dir = os.path.join(args.output_dir, output_prefix)
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    if args.tokenizer == 'wordpiece':
        tokenizer = tokenizers.BertWordPieceTokenizer(
            vocab=args.vocab_file,
            clean_text=True,
            handle_chinese_chars=True,
            lowercase=not args.uppercase,
        )
    elif args.tokenizer == 'bpe':
        tokenizer = tokenizers.ByteLevelBPETokenizer(
            vocab=args.vocab_file,
            add_prefix_space=True,
            lowercase=not args.uppercase,
            trim_offsets=True,
        )
    else:
        raise ValueError('Unsupported tokenizer: {}'.format(args.tokenizer))
    #tokenizer.enable_padding(length=args.max_seq_len)
    #tokenizer.enable_truncation(max_length=args.max_seq_len)

    params = []
    for i, ifile in enumerate(input_files):
        ofile = os.path.join(args.output_dir, 'train_{}.hdf5'.format(i))
Example No. 6
        output_string = text

    #output_string = post_process(output_string)
    output_string = output_string.strip()

    return output_string


def filt_thresh(idx, thresh=0):
    # Keep only the scores above `thresh`, then return the positions of the
    # non-zero survivors; fall back to index 0 if nothing remains.
    idx = idx[idx > thresh]

    return np.nonzero(idx)[0] if len(np.nonzero(idx)[0]) > 0 else np.array([0])


BERT_TOKENIZER = tokenizers.BertWordPieceTokenizer("../input/vocab.txt",
                                                   lowercase=True,
                                                   add_special_tokens=False)
ROBERT_TOKENIZER = tokenizers.ByteLevelBPETokenizer(
    vocab_file="../input/roberta-base/vocab.json",
    merges_file="../input/roberta-base/merges.txt",
    lowercase=True,
    add_prefix_space=True)

CV_FILE = '../model/exper/cv.csv'
MAX_LEN = 100

df = pd.read_csv("../input/train.csv")
print(df.keys())
pred_df = pd.read_csv(CV_FILE).dropna()
print(pred_df.keys())
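As a small illustration (not part of the original script), encoding the same string with both tokenizers makes the difference visible: WordPiece marks subword continuations with "##", while the byte-level BPE tokenizer marks leading spaces with "Ġ" in its tokens.

sample_text = "the weather is gloomy today"
print(BERT_TOKENIZER.encode(sample_text).tokens)
print(ROBERT_TOKENIZER.encode(sample_text).tokens)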
Example No. 7
import tokenizers

MAX_LEN = 128
TRAIN_BATCH_SIZE = 32
TEST_BATCH_SIZE = 4
VALID_BATCH_SIZE = 8
EPOCHS = 10
BASE_MODEL_PATH = 'bert-base-uncased'
VOCAB_PATH = '/content/vocab.txt'
MODEL_PATH = 'model.bin'
TOKENIZER = tokenizers.BertWordPieceTokenizer(VOCAB_PATH, lowercase=True)
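A hedged sketch of how a config like this is typically consumed by a dataset class (the sentence is invented, and /content/vocab.txt is assumed to be a standard BERT vocabulary file):

# Hypothetical usage, not part of the original config: fix every encoding to
# MAX_LEN by enabling truncation and padding on the fast tokenizer.
TOKENIZER.enable_truncation(max_length=MAX_LEN)
TOKENIZER.enable_padding(length=MAX_LEN)

encoded = TOKENIZER.encode("an example sentence for token classification")
assert len(encoded.ids) == MAX_LEN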
Example No. 8
import transformers
import tokenizers

tok = tokenizers.BertWordPieceTokenizer(
    '/content/drive/MyDrive/notebooks/bertNER1/vacob/bert-base-uncased.txt',
    lowercase=True)

MAX_LEN = 128
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 8
EPOCHS = 10
EARLYSTOPPING = 6
METADATA_PATH = "D:/bertNER1/model/meta.bin"
MODEL_PATH = "/content/drive/MyDrive/notebooks/bertNER1/model/model.bin"
TRAINING_FILE = '/content/drive/MyDrive/notebooks/bertNER1/sample.txt'
TOKENIZER = transformers.BertTokenizer.from_pretrained('bert-base-uncased',
                                                       do_lower_case=True)
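Note that this example sets up two tokenizers: `tok` is the fast tokenizers-library WordPiece wrapper built from the vocab file on Drive, while `TOKENIZER` is the standard transformers BertTokenizer. A small illustrative check (assuming the Drive vocab file matches bert-base-uncased) would be:

# Illustrative only: both tokenizers should assign the same ids, including
# [CLS]/[SEP], to simple lower-cased text if their vocabularies match.
fast_ids = tok.encode("hello world").ids
slow_ids = TOKENIZER.encode("hello world")
print(fast_ids == slow_ids)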