Code example #1
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TextClassificationPipeline
from flask import Flask, request
import json
from config import SentimentClassificationConfig

config = SentimentClassificationConfig.from_json("config.json")

app = Flask(__name__)

tokenizer = DistilBertTokenizer.from_pretrained(config.model_path)
model = DistilBertForSequenceClassification.from_pretrained(config.model_path)

if config.use_cuda:
    model = model.cuda()


# Build the classification pipeline once at startup instead of on every request.
sentiment_classifier = TextClassificationPipeline(
    model=model, tokenizer=tokenizer, device=0 if config.use_cuda else -1)


@app.route('/api/rest/classify_sentiment', methods=["POST"])
def classify_sentiment():
    rest_request = json.loads(request.data.decode('utf-8'))
    sentence = str(rest_request["sentence"])
    result = sentiment_classifier(sentence)
    return json.dumps(result)


if __name__ == '__main__':
    app.run(host=config.host, port=config.port, debug=True)

#curl --header "Content-Type: application/json" --request POST --data '{"sentence":"You are so cute!"}' http://localhost:5555/api/rest/classify_sentiment
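
The `SentimentClassificationConfig` helper imported above is project-specific and not shown. A minimal sketch consistent with how it is used here (only the four fields the app actually reads; everything else about the class is an assumption):

# config.py -- hypothetical sketch; only fields referenced by the app.
import json


class SentimentClassificationConfig:
    def __init__(self, model_path, use_cuda, host, port):
        self.model_path = model_path
        self.use_cuda = use_cuda
        self.host = host
        self.port = port

    @classmethod
    def from_json(cls, path):
        with open(path) as f:
            return cls(**json.load(f))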
Code example #2
        device = torch.device("cuda:0")

    # model configuration
    batch_size = args.batch_size
    lr = args.lr
    weight_decay = args.weight_decay
    n_epochs = args.n_epochs
    if args.full_bert:
        bert_model = 'bert-base-uncased'
        bert_config = BertConfig.from_pretrained(bert_model, num_labels=2)
        tokenizer = BertTokenizer.from_pretrained(bert_model)
    else:
        bert_model = 'distilbert-base-uncased'
        bert_config = DistilBertConfig.from_pretrained(bert_model,
                                                       num_labels=2)
        tokenizer = DistilBertTokenizer.from_pretrained(bert_model)

    # wandb initialization
    wandb.init(project="domain-adaptation-twitter-emnlp",
               name=args.run_name,
               config={
                   "epochs": n_epochs,
                   "learning_rate": lr,
                   "warmup": args.warmup_steps,
                   "weight_decay": weight_decay,
                   "batch_size": batch_size,
                   "train_split_percentage": args.train_pct,
                   "bert_model": bert_model,
                   "seed": seed,
                   "pretrained_model": args.pretrained_model,
                   "tags": ",".join(args.tags)
Code example #3
import torch
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering


class BERTQA:

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased',
                                                    return_token_type_ids=True)
    model = DistilBertForQuestionAnswering.from_pretrained(
        'distilbert-base-uncased-distilled-squad')
    MAX_TOKENS = 512
    MAX_TOKENS_QUESTION = 30
    MAX_TOKENS_DOCUMENT = MAX_TOKENS - MAX_TOKENS_QUESTION - 2  # [SEP] and [CLS]

    def __init__(self):
        pass

    def get_token_length(self, string):
        tokens = self.tokenizer.encode(string)
        return len(tokens)

    def chunk_document(self, document, re_consolidate=True):
        '''Chunks up a long document into optimally large pieces so that they
        can be passed to BERT. Activating `re_consolidate` will put the chunks
        back together to make them as large as possible for improved
        performance.
        '''
        document_length = self.get_token_length(document)
        if document_length > self.MAX_TOKENS_DOCUMENT:
            approved_chunks = []
            paragraphs = document.split('\n')
            paragraphs = [par for par in paragraphs if par]
            for paragraph in paragraphs:
                paragraph_length = self.get_token_length(paragraph)
                if paragraph_length > self.MAX_TOKENS_DOCUMENT:
                    sentences = paragraph.split('.')
                    sentences = [sen for sen in sentences if sen]
                    for sentence in sentences:
                        sentence_length = self.get_token_length(sentence)
                        if sentence_length > self.MAX_TOKENS_DOCUMENT:
                            print("Ignoring overlong sentence.")
                        else:
                            approved_chunks.append(sentence)
                else:
                    approved_chunks.append(paragraph)
            if re_consolidate:
                lengths = [
                    self.get_token_length(chunk) for chunk in approved_chunks
                ]
                consolidated_chunks = []
                running_length = 0
                current_chunk = ''
                for chunk, length in zip(approved_chunks, lengths):
                    if (running_length + length) < self.MAX_TOKENS_DOCUMENT:
                        current_chunk += chunk
                        running_length += length
                    else:
                        consolidated_chunks.append(current_chunk)
                        current_chunk = chunk
                        running_length = length
                if current_chunk:  # keep the final, partially filled chunk
                    consolidated_chunks.append(current_chunk)
                return consolidated_chunks
            else:
                return approved_chunks
        else:
            return [document]

    def answer_question(self, question, document):
        '''Takes a `question` string and a `document` string (which contains
        the answer), and identifies the words within the `document` that are
        the answer.
        '''
        question_length = self.get_token_length(question)
        document_length = self.get_token_length(document)
        if question_length > self.MAX_TOKENS_QUESTION:
            msg = f'Question exceeds max token length ({str(question_length)}).'
            raise ValueError(msg)
        if document_length > self.MAX_TOKENS_DOCUMENT:
            msg = f'Document exceeds max token length ({str(document_length)}).'
            raise ValueError(msg)
        encoding = self.tokenizer.encode_plus(question, document)
        input_ids, attention_mask = encoding["input_ids"], encoding[
            "attention_mask"]
        with torch.no_grad():  # inference only; no gradients needed
            start_scores, end_scores = self.model(
                torch.tensor([input_ids]),
                attention_mask=torch.tensor([attention_mask]))
        confidence = float(max(torch.max(start_scores), torch.max(end_scores)))

        start_token = torch.argmax(start_scores)
        end_token = torch.argmax(end_scores)
        ans_tokens = input_ids[start_token:end_token + 1]
        answer_tokens = self.tokenizer.convert_ids_to_tokens(
            ans_tokens, skip_special_tokens=True)
        if not answer_tokens:  # TODO Understand this bug
            return '<NO ANSWER>', -10
        else:
            answer = answer_tokens[0]
            for token in answer_tokens[1:]:
                if token[0:2] == '##':
                    answer += token[2:]
                else:
                    answer += ' ' + token
            return answer, confidence

    def answer_question_chunked(self, question, document, re_consolidate=True):
        chunks = self.chunk_document(document, re_consolidate=re_consolidate)
        responses = []
        for chunk in tqdm(chunks):
            answer, confidence = self.answer_question(question, chunk)
            response = {
                'answer': answer,
                'confidence': confidence,
                'chunk': chunk
            }
            responses.append(response)
        responses.sort(key=lambda x: -x['confidence'])
        return responses
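
A minimal usage sketch for this class (the file name and question are hypothetical):

if __name__ == '__main__':
    qa = BERTQA()
    with open('article.txt', encoding='utf-8') as f:  # hypothetical input
        document = f.read()
    responses = qa.answer_question_chunked('Who is the author?', document)
    for response in responses[:3]:
        print(response['confidence'], response['answer'])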
Code example #4
if build_local_vectors:
    if local_vec_generator == "w2v":
        corpus2wordVSM(all_data_file, w2v_file, txt_file_name,
                       local_vector_size, 5, 5, 4)

    elif local_vec_generator == "bert":
        pass  #TODO train model here instead of externally
        # generate_local_bert_embeddings()

if local_vec_generator == "w2v":
    local_model = models.Word2Vec.load(w2v_file)  #CBOW
    local_tokenizer = None
elif local_vec_generator == "bert":
    # from_pretrained is a classmethod; no need to build a fresh model first
    local_model = DistilBertModel.from_pretrained(local_bert_model_location)
    local_tokenizer = DistilBertTokenizer.from_pretrained(
        local_bert_model_location)

util = Utility.Utility()
preprocessor = Preprocess.Preprocess()

data_dir = os.path.abspath(
    f"data/multichannel_{global_vec_generator}_global_{local_vec_generator}_local/"
)

if global_vec_generator == "glove":
    global_dim = 50
else:
    global_dim = 768

if not os.path.exists(data_dir):
    os.makedirs(data_dir)
Code example #5
def main():
    parser = argparse.ArgumentParser(
        description='argument parsing for testing')

    parser.add_argument('--data_dir',
                        default='data',
                        type=str,
                        help='path to data directory - default: \'data\'')

    parser.add_argument('--review_file',
                        default='yelp_reviews_test1000.csv',
                        type=str,
                        help='file name containing reviews')

    parser.add_argument('--batch_size',
                        default=32,
                        type=int,
                        help='batch size - default: 32')

    parser.add_argument('--model_save',
                        default='./model_save',
                        type=str,
                        help='directory to pull model')

    parser.add_argument('--nolog', action='store_true', help='disable logging')

    # parse input arguments
    clargs = parser.parse_args()

    # log to file and stdout
    if clargs.nolog:
        print("Not logging")
    else:
        sys.stdout = Logger('test')

    print("")
    print("==========================================")
    print("-------------Confirm Arguments------------")
    print("==========================================")

    print("Data directory for test data: {0:s}".format(clargs.data_dir))
    print("Test reviews file: {0:s}".format(clargs.review_file))
    print("Batch size of {0:d}".format(clargs.batch_size))
    print("Loading model from: {0:s}".format(clargs.model_save))

    print("")
    print("==========================================")
    print("---------------Generate Data--------------")
    print("==========================================")

    path = clargs.data_dir
    fn = clargs.review_file
    filename = path + "/" + fn

    t0 = time.perf_counter()
    print("Reading in training data from {0:s}".format(clargs.review_file))
    reviews_df = pd.read_csv(filename)
    reviews_df = reviews_df[['text', 'stars']]
    TEST_SIZE = len(reviews_df.index)
    elapsed = time.perf_counter() - t0
    print("Finished reading {0:d} entries | Took {1:0.2f} seconds".format(
        TEST_SIZE, elapsed))

    # load the model from save
    print("")
    print("==========================================")
    print("----------------Load Model----------------")
    print("==========================================")

    print("Loading model and tokenizer from directory")
    model_path = clargs.model_save
    json_infile = model_path + '/' + 'hyperparams.json'
    with open(json_infile, 'r') as infile:
        hyper_json = json.load(infile)

    if 'model' not in hyper_json or hyper_json['model'] == 'bert':
        print("Loading normal bert model")
        tokenizer = BertTokenizer.from_pretrained(model_path)
        model = BertForSequenceClassification.from_pretrained(model_path)
    else:
        print("Loading distilbert Model")
        tokenizer = DistilBertTokenizer.from_pretrained(model_path)
        model = DistilBertForSequenceClassification.from_pretrained(model_path)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    print("Tokenizing the data to be tested")
    dataset = extract_features(reviews_df, tokenizer)
    test_dataloader = DataLoader(dataset,
                                 sampler=SequentialSampler(dataset),
                                 batch_size=clargs.batch_size,
                                 drop_last=False)

    # test the model against some test data
    print("")
    print("==========================================")
    print("----------------Test Model----------------")
    print("==========================================")
    print("Testing - Split {0:d} examples into {1:d} batches".format(
        TEST_SIZE, len(test_dataloader)))
    test_loss, test_acc, pred_labels, actual_labels = evaluate(
        model, device, test_dataloader, TEST_SIZE)
    mae = mean_abs_error(pred_labels, actual_labels)
    mse = mean_square_error(pred_labels, actual_labels)
    conf_matrix = confusion_matrix(pred_labels, actual_labels)
    print("")
    print("==========================================")
    print("---------------TEST RESULTS---------------")
    print("==========================================")
    print("")
    print("-----------TRAINING HYPERPARAMS-----------")
    print("Data directory: {0:s}".format(hyper_json['dataDirectory']))
    print("Reviews file: {0:s}".format(hyper_json['dataFile']))
    print("Batch size of {0:s}".format(hyper_json['batchSize']))
    print("Train ratio of {0:s}".format(hyper_json['trainRatio']))
    print("Train for {0:s} epochs".format(hyper_json['numEpochs']))
    print("")
    print("Testing accuracy: ", test_acc)
    print("Mean absolute error: ", mae)
    print("Mean square error: ", mse)
    print("")
    print("-------------CONFUSION MATRIX-------------")
    print("")
    print(conf_matrix)
    print("")
    target_names = ['1 star', '2 star', '3 star', '4 star', '5 star']
    print(
        metrics.classification_report(actual_labels,
                                      pred_labels,
                                      digits=3,
                                      target_names=target_names))
Code example #6
File: dataset.py  Project: gheyret/imdb-sentiment
 def __init__(self, data_path, seq_length):
     self.data = pd.read_csv(data_path).astype('object')
     self.seq_length = seq_length
     self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
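
This `__init__` is excerpted from a torch `Dataset`. A self-contained sketch of how such a class is typically completed (the class name and the `review`/`sentiment` column names are assumptions, not taken from the project):

import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizer


class ImdbDataset(Dataset):  # hypothetical name
    def __init__(self, data_path, seq_length):
        self.data = pd.read_csv(data_path).astype('object')
        self.seq_length = seq_length
        self.tokenizer = DistilBertTokenizer.from_pretrained(
            'distilbert-base-uncased')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        # 'review' and 'sentiment' column names are assumptions
        enc = self.tokenizer(row['review'],
                             truncation=True,
                             max_length=self.seq_length,
                             padding='max_length',
                             return_tensors='pt')
        label = torch.tensor(int(row['sentiment']))
        return enc['input_ids'].squeeze(0), enc['attention_mask'].squeeze(0), label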
Code example #7
def wsd(
    model_name='distilbert-base-uncased',  #ensemble-distil-1-albert-1 / albert-xxlarge-v2 / bert-base-uncased
    classifier_input='token-embedding-last-1-layers',  # token-embedding-last-layer / token-embedding-last-n-layers
    classifier_hidden_layers=[],
    reduce_options=True,
    freeze_base_model=True,
    max_len=512,
    batch_size=32,
    test=False,
    lr=5e-5,
    eps=1e-8,
    n_epochs=50,
    cls_token=False,  # If true, the cls token is used instead of the relevant-word token
    cache_embeddings=True,  # If true, the embeddings from the base model are saved to disk so that they only need to be computed once
    save_classifier=True  # If true, the classifier part of the network is saved after each epoch, and the training is automatically resumed from this saved network if it exists
):
    train_path = "wsd_train.txt"
    test_path = "wsd_test_blind.txt"
    n_classes = 222
    device = 'cuda'

    import __main__ as main
    print("Script: " + os.path.basename(main.__file__))

    print("Loading base model %s..." % model_name)
    if model_name.startswith('ensemble-distil-'):
        last_n_distil = int(model_name.replace('ensemble-distil-', "")[0])
        last_n_albert = int(model_name[-1])
        from transformers import AlbertTokenizer
        from transformers.modeling_albert import AlbertModel
        base_model = AlbertModel.from_pretrained('albert-xxlarge-v2',
                                                 output_hidden_states=True,
                                                 output_attentions=False)
        tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v2')
        print(
            "Ensemble model with DistilBert last %d layers and Albert last %d layers"
            % (last_n_distil, last_n_albert))
    elif model_name.startswith('distilbert'):
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        base_model = DistilBertModel.from_pretrained(model_name,
                                                     num_labels=n_classes,
                                                     output_hidden_states=True,
                                                     output_attentions=False)
    elif model_name.startswith('bert'):
        from transformers import BertTokenizer, BertModel
        tokenizer = BertTokenizer.from_pretrained(model_name)
        base_model = BertModel.from_pretrained(model_name,
                                               num_labels=n_classes,
                                               output_hidden_states=True,
                                               output_attentions=False)
    elif model_name.startswith('albert'):
        from transformers import AlbertTokenizer
        from transformers.modeling_albert import AlbertModel
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
        base_model = AlbertModel.from_pretrained(model_name,
                                                 output_hidden_states=True,
                                                 output_attentions=False)

    use_n_last_layers = 1
    if classifier_input == 'token-embedding-last-layer':
        use_n_last_layers = 1
    elif classifier_input.startswith(
            'token-embedding-last-') and classifier_input.endswith('-layers'):
        use_n_last_layers = int(
            classifier_input.replace('token-embedding-last-',
                                     "").replace('-layers', ""))
    else:
        raise ValueError("Invalid classifier_input argument")
    print("Using the last %d layers" % use_n_last_layers)

    def tokenize(text):  # avoid shadowing the builtin `str`
        return tokenizer.tokenize(text)[:max_len - 2]

    SENSE = LabelField(is_target=True)
    LEMMA = LabelField()
    TOKEN_POS = LabelField(use_vocab=False)
    TEXT = Field(tokenize=tokenize,
                 pad_token=tokenizer.pad_token,
                 init_token=tokenizer.cls_token,
                 eos_token=tokenizer.sep_token)
    EXAMPLE_ID = LabelField(use_vocab=False)
    fields = [('sense', SENSE), ('lemma', LEMMA), ('token_pos', TOKEN_POS),
              ('text', TEXT), ('example_id', EXAMPLE_ID)]

    def read_data(corpus_file, fields, max_len=None):
        train_id_start = 0
        test_id_start = 76049  # let the ids for the test examples start after the training example indices
        if corpus_file == "wsd_test_blind.txt":
            print("Loading test data...")
            id_start = test_id_start
        else:
            print("Loading train/val data...")
            id_start = train_id_start
        with open(corpus_file, encoding='utf-8') as f:
            examples = []
            for i, line in enumerate(f):
                sense, lemma, word_position, text = line.split('\t')
                # We need to convert from the word position to the token position
                words = text.split()
                pre_word = " ".join(words[:int(word_position)])
                pre_word_tokenized = tokenizer.tokenize(pre_word)
                # +1 takes into account the later addition of the start token
                token_position = len(pre_word_tokenized) + 1
                example_id = id_start + i
                if max_len is None or token_position < max_len - 1:  # ignore examples where the relevant token is cut off due to max_len
                    if cls_token:
                        token_position = 0
                    examples.append(
                        Example.fromlist(
                            [sense, lemma, token_position, text, example_id],
                            fields))
                else:
                    print(
                        "Example %d is skipped because the relevant token was cut off (token pos = %d)"
                        % (example_id, token_position))
                    print(text)
        return Dataset(examples, fields)

    dataset = read_data(train_path, fields, max_len)
    random.seed(0)
    trn, vld = dataset.split(0.7, stratified=True, strata_field='sense')

    TEXT.build_vocab([])
    if model_name.startswith('albert') or model_name.startswith(
            'ensemble-distil-'):

        class Mapping:
            def __init__(self, fn):
                self.fn = fn

            def __getitem__(self, item):
                return self.fn(item)

        TEXT.vocab.stoi = Mapping(tokenizer.sp_model.PieceToId)
        TEXT.vocab.itos = Mapping(tokenizer.sp_model.IdToPiece)
    else:
        TEXT.vocab.stoi = tokenizer.vocab
        TEXT.vocab.itos = list(tokenizer.vocab)
    SENSE.build_vocab(trn)
    LEMMA.build_vocab(trn)

    trn_iter = BucketIterator(trn,
                              device=device,
                              batch_size=batch_size,
                              sort_key=lambda x: len(x.text),
                              repeat=False,
                              train=True,
                              sort=True)
    vld_iter = BucketIterator(vld,
                              device=device,
                              batch_size=batch_size,
                              sort_key=lambda x: len(x.text),
                              repeat=False,
                              train=False,
                              sort=True)

    if freeze_base_model:
        for mat in base_model.parameters():
            mat.requires_grad = False  # Freeze Bert model so that we only train the classifier on top

    if reduce_options:
        lemma_mask = defaultdict(
            lambda: torch.zeros(len(SENSE.vocab), device=device))
        for example in trn:
            lemma = LEMMA.vocab.stoi[example.lemma]
            sense = SENSE.vocab.stoi[example.sense]
            lemma_mask[lemma][sense] = 1
        lemma_mask = dict(lemma_mask)

        def mask(batch_logits, batch_lemmas):
            # Masks out the senses that do not belong to the specified lemma
            for batch_i in range(len(batch_logits)):
                lemma = batch_lemmas[batch_i].item()
                batch_logits[batch_i, :] *= lemma_mask[lemma]
            return batch_logits
    else:

        def mask(batch_logits, batch_lemmas):
            return batch_logits

    flags = ((" cls_token" if cls_token else "") +
             (" reduce_options" if reduce_options else "") +
             (" freeze_base_model" if freeze_base_model else ""))
    experiment_name = (
        model_name + " " +
        (classifier_input
         if not model_name.startswith('ensemble-distil-') else "") +
        " " + str(classifier_hidden_layers) + " (" + flags + "  ) " +
        "max_len=" + str(max_len) + " batch_size=" + str(batch_size) +
        " lr=" + str(lr) + " eps=" + str(eps) +
        (" cache_embeddings" if cache_embeddings else ""))

    if model_name.startswith('ensemble-distil-'):
        model = WSDEnsembleModel(last_n_distil, last_n_albert, n_classes, mask,
                                 classifier_hidden_layers)
    else:
        model = WSDModel(base_model, n_classes, mask, use_n_last_layers,
                         model_name, classifier_hidden_layers,
                         cache_embeddings)
    history = None
    #if save_classifier:
    #    if model.load_classifier(experiment_name):
    #        # Existing saved model loaded
    #        # Also load the corresponding training history
    #        history = read_dict_file("results/"+experiment_name+".txt")

    model.cuda()

    print("Starting experiment  " + experiment_name)
    if test:
        tst = read_data(test_path, fields, max_len=512)
        tst_iter = Iterator(tst,
                            device=device,
                            batch_size=batch_size,
                            sort=False,
                            sort_within_batch=False,
                            repeat=False,
                            train=False)
        batch_predictions = []
        for batch in tst_iter:
            print('.', end='')
            sys.stdout.flush()
            text = batch.text.t()
            with torch.no_grad():
                outputs = model(text,
                                token_positions=batch.token_pos,
                                lemmas=batch.lemma,
                                example_ids=batch.example_id)
                scores = outputs[-1]
            batch_predictions.append(scores.argmax(dim=1))
        batch_preds = torch.cat(batch_predictions, 0).tolist()
        predicted_senses = [SENSE.vocab.itos[pred] for pred in batch_preds]
        with open("test_predictions/" + experiment_name + ".txt", "w") as out:
            out.write("\n".join(predicted_senses))
    else:
        no_decay = ['bias', 'LayerNorm.weight']
        decay = 0.01
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=eps)

        def save_results(history):
            with open("results/" + experiment_name + ".txt", "w") as out:
                out.write(str(history))
            if save_classifier:
                if len(history['val_acc']) < 2 or history['val_acc'][-1] > max(
                        history['val_acc'][:-1]):
                    model.save_classifier(experiment_name, best=True)
                else:
                    model.save_classifier(experiment_name, best=False)

        train(model, optimizer, trn_iter, vld_iter, n_epochs, save_results,
              history)
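
A hypothetical invocation of this function, with argument values chosen purely for illustration:

if __name__ == '__main__':
    wsd(model_name='distilbert-base-uncased',
        classifier_input='token-embedding-last-4-layers',
        reduce_options=True,
        freeze_base_model=True,
        batch_size=32,
        n_epochs=50)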
Code example #8
def main():
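    # NOTE: relies on module-level configuration (tasks, bert_path, cache_dir,
    # use_gpu, epochs, bs, frozen, learning_rate_0, batch_size_train,
    # batch_size_val, eval_interval).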

    ntasks = len(tasks)

    data_args = list()
    configuration = list()
    sub_models = list()
    datasets = list()
    # train_iter = list()
    # dev_iter = list()
    # test_iter = list()
    sub_optimizer = list()
    metrics = list()
    tokenizer = DistilBertTokenizer.from_pretrained(bert_path,
                                                    cache_dir=cache_dir)

    for i in range(ntasks):
        logger.info("Tasks:" + tasks[i])
        data_args.append(GlueDataArgs(task_name=tasks[i]))
        configuration.append(
            DistilBertConfig.from_pretrained(
                bert_path,
                num_labels=glue_tasks_num_labels[tasks[i].lower()],
                finetuning_task=data_args[i].task_name,
                cache_dir=cache_dir))
        if use_gpu:
            sub_models.append(SequenceClassification(configuration[i]).cuda())
        else:
            sub_models.append(SequenceClassification(configuration[i]))

        datasets.append(
            GlueDataSets(data_args[i],
                         tokenizer=tokenizer,
                         cache_dir=cache_dir))
        sub_optimizer.append(
            torch.optim.AdamW(sub_models[i].parameters(), lr=learning_rate_0))
        metrics.append(ComputeMetrics(data_args[i]))
        logger.info("*** DataSet Ready ***")

    if use_gpu:
        Bert_model = DistilBertModel.from_pretrained(bert_path,
                                                     return_dict=True).cuda()
    else:
        Bert_model = DistilBertModel.from_pretrained(bert_path,
                                                     return_dict=True)

    bert_optimizer = torch.optim.AdamW(Bert_model.parameters(),
                                       lr=learning_rate_0)

    # balanced dataset
    train_num = list()
    for i in range(ntasks):
        train_num.append(datasets[i].length("train"))
    #train_nummax =
    #train_num = [x/train_nummax for x in train_num]
    print(train_num)
    iterations = (epochs * max(train_num) // bs) + 1
    #print(iterations)

    sub_scheduler = list()
    for i in range(ntasks):
        sub_scheduler.append(
            torch.optim.lr_scheduler.LambdaLR(
                sub_optimizer[i], lambda step: (1.0 - step / iterations))
        )  #if step <= frozen else learning_rate_1)
    Bert_scheduler = torch.optim.lr_scheduler.LambdaLR(
        bert_optimizer, lambda step:
        (1.0 - step / iterations))  # if step <= frozen else learning_rate_1

    # datasets[i].dataloader("train", batch_size_train[i])
    train_iter = list()
    for i in range(ntasks):
        train_iter.append(
            GlueIterator(datasets[i].dataloader("train", batch_size_train[i])))

    for i in range(1, iterations + 1):

        if i > frozen:
            for p in Bert_model.parameters():
                p.requires_grad = True
            Bert_model.train()
        elif i == frozen:
            for p in Bert_model.parameters():
                p.requires_grad = True
            Bert_model.train()
            logging.info("#####################################")
            logging.info("Release the Traing of the Main Model.")
            logging.info("#####################################")
        else:
            for p in Bert_model.parameters():
                p.requires_grad = False
            Bert_model.eval()

        losses = list()
        loss_rates = list()

        for j in range(ntasks):
            sub_models[j].train()
            data = train_iter[j].next()

            if use_gpu:
                input_ids = data['input_ids'].cuda()
                attention_mask = data['attention_mask'].cuda()
                #token_type_ids=data['token_type_ids'].cuda()
                label = data['labels'].cuda()
            else:
                input_ids = data['input_ids']
                attention_mask = data['attention_mask']
                #token_type_ids=data['token_type_ids']
                label = data['labels']

            output_inter = Bert_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                return_dict=True)  # token_type_ids=token_type_ids,
            losses.append(sub_models[j](input=output_inter, labels=label)[0])

        losssum = sum(losses).item()
        for j in range(ntasks):
            loss_rates.append(losses[j].item() / losssum)

        loss = 0
        printInfo = 'TOTAL/Train {}/{}, lr:{}'.format(i, iterations,
                                                      Bert_scheduler.get_lr())
        for j in range(ntasks):
            loss += losses[j] * batch_size_train[j] * loss_rates[j]
            printInfo += ', loss{}-{:.6f}'.format(j, losses[j].item())
            sub_optimizer[j].zero_grad()

        logging.info(printInfo)

        if i > frozen:
            bert_optimizer.zero_grad()
        loss.backward()

        if i > frozen:
            bert_optimizer.step()

        for j in range(ntasks):
            sub_optimizer[j].step()
            #sub_scheduler[j].step()

        #Bert_scheduler.step()

        if (i % eval_interval == 0):
            evaluate(Bert_model, sub_models, datasets, batch_size_val, metrics,
                     ntasks)
            save_models(Bert_model, sub_models, ntasks, i)

    evaluate(Bert_model, sub_models, datasets, batch_size_val, metrics, ntasks)
    save_models(Bert_model, sub_models, ntasks, iterations)
Code example #9
def main():
    """
    Main function
    """

    # Parse cmd line arguments
    args = nlp_parser.parse_arguments()

    source = ""
    subject = ""
    context = ""
    question = ""
    answer = ""
    squadid = ""

    if args:
        if "text" in args:
            if args["text"]:
                source = args["text"]
        if "subject" in args:
            if args["subject"]:
                subject = args["subject"]
        if "context" in args:
            if args["context"]:
                context = args["context"]
        if "question" in args:
            if args["question"]:
                question = args["question"]
                clean_question = nlp.clean(question)
        if "answer" in args:
            if args["answer"]:
                answer = args["answer"]
        if "squadid" in args:
            if args["squadid"]:
                squadid = args["squadid"]
    else:
        sys.exit("Parser didn't return args correctly")

    # Setup the question, either from a specified SQuAD record
    # or from cmd line arguments.
    # If no question details are provided, a random
    # SQuAD example will be chosen.

    if question:
        if source:
            with open(source, "r") as text_file_handle:
                context = text_file_handle.read()
        else:
            print("No text provided, searching SQuAD dev-2.0 dataset")
            squad_data = nlp.import_squad_data()
            squad_records = squad_data.loc[squad_data["clean_question"] ==
                                           clean_question]
            if squad_records.empty:
                sys.exit(
                    "Question not found in SQuAD data, please provide context using `--text`."
                )
            subject = squad_records["subject"].iloc[0]
            context = squad_records["context"].iloc[0]
            question = squad_records["question"].iloc[0]
            answer = squad_records["answer"]
    else:
        squad_data = nlp.import_squad_data()

        if squadid:
            source = args["squadid"]
            squad_records = squad_data.loc[squad_data["id"] == source]
            i_record = 0
        else:
            if subject:
                print(
                    "Picking a question at random on the subject: ",
                    subject,
                )
                squad_records = squad_data.loc[squad_data["subject"] ==
                                               subject]
            else:
                print(
                    "No SQuAD ID or question provided, picking one at random!")
                squad_records = squad_data

            n_records = len(squad_records.index)
            i_record = random.randint(0, max(0, n_records - 1))

        if squad_records.empty:
            sys.exit(
                "No questions found in SQuAD data, please provide valid ID or subject."
            )

        source = squad_records["id"].iloc[i_record]
        subject = squad_records["subject"].iloc[i_record]
        context = squad_records["context"].iloc[i_record]
        question = squad_records["question"].iloc[i_record]
        answer = squad_records["answer"].iloc[i_record]

    # DistilBERT question answering using pre-trained model.
    token = DistilBertTokenizer.from_pretrained("distilbert-base-uncased",
                                                return_token_type_ids=True)

    model = TFDistilBertForQuestionAnswering.from_pretrained(
        "distilbert-base-uncased-distilled-squad")

    encoding = token.encode_plus(question,
                                 context,
                                 max_length=512,
                                 truncation=True)

    input_ids, attention_mask = (
        encoding["input_ids"],
        encoding["attention_mask"],
    )
    model_output = model(np.array([input_ids]),
                         attention_mask=np.array([attention_mask]))
    start_scores = model_output.start_logits
    end_scores = model_output.end_logits

    answer_ids = input_ids[np.argmax(start_scores):np.argmax(end_scores) + 1]
    answer_tokens = token.convert_ids_to_tokens(answer_ids,
                                                skip_special_tokens=True)
    answer_tokens_to_string = token.convert_tokens_to_string(answer_tokens)

    # Display results
    print("\nDistilBERT question answering example.")
    print("======================================")
    print("Reading from: ", subject, source)
    print("\nContext: ", context)
    print("--")
    print("Question: ", question)
    print("Answer: ", answer_tokens_to_string)
    print("Reference Answers: ", answer)
Code example #10
if __name__ == '__main__':
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    print(f"Using {device}.")

    print(f"Reading {sys.argv[1]}...")
    df = extract_data(sys.argv[1], contain_answers=False).set_index(['id'])
    print(f"DataFrame created.")

    print("Tokenizing the DataFrame...")
    model = DistilBertKnowledge(alpha=0.5)
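    # Export the slow tokenizer's vocab.txt so the Rust-based
    # BertWordPieceTokenizer below can load it directly.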
    DistilBertTokenizer.from_pretrained(
        model.info.pretrained_model).save_pretrained('slow_tokenizer/')
    tokenizer = BertWordPieceTokenizer('slow_tokenizer/vocab.txt',
                                       lowercase=True)
    df = process_dataframe(df, tokenizer, contain_answers=False)
    print("Tokenization complete.")

    dataset = SquadDataset(df, model.info, contain_answers=False)
    loader = DataLoader(dataset, batch_size=16, num_workers=4, pin_memory=True)

    print("Loading model weights...")
    model.load_state_dict(torch.load('model.pt', map_location=device))
    model = model.to(device)
    print("Model loaded.")

    model.eval()
    print("Starting evaluation...")
Code example #11
def main():
    args = parse_arguments(sys.argv[1:])
    set_seed(args['random_seed'])
    df = get_train_data()
    test_df = get_test_data()
    NUM_CLASSES = df['label'].nunique()

    train_texts, val_texts, train_labels, val_labels = train_test_split(df['sentence'], df['label_int'], random_state=args['random_seed'], test_size=.2)
    print(train_texts.shape, val_texts.shape, train_labels.shape, val_labels.shape)

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    train_encodings = tokenizer(train_texts.to_list(), truncation=True, padding=True)
    val_encodings = tokenizer(val_texts.to_list(), truncation=True, padding=True)
    test_encodings = tokenizer(test_df['sentence'].to_list(), truncation=True, padding=True)

    train_dataset = HINTDataset(train_encodings, train_labels.values)
    val_dataset = HINTDataset(val_encodings, val_labels.values)
    test_dataset = HINTDataset(test_encodings, test_df['label_int'].values)

    model = HINTModel(num_classes=NUM_CLASSES)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    model.to(device)
    model.ffn.train()

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

    optim = AdamW(model.parameters(), lr=args['learning_rate'])
    loss_fn = nn.CrossEntropyLoss()

    step = 0
    best_acc = 0

    Path(args['model_dir']).mkdir(parents=True, exist_ok=True)

    for epoch in range(args['epochs']):
        train_loss, train_acc, train_f1 = train_fn(model, train_loader, loss_fn, optim, device)
        val_loss, val_acc, val_f1 = val_fn(model, val_loader, loss_fn, device)
        print(f"{epoch+1}: train: [{train_loss:.3f}, {train_acc:.3f}, {train_f1:.3f}], val: [{val_loss:.3f}, {val_acc:.3f}, {val_f1:.3f}]")
        if val_acc > best_acc:
            best_acc = val_acc
            step = 0
            torch.save(model.state_dict(), f"{args['model_dir']}/{args['model_path']}")
        else:
            step += 1
        if step >= args['max_steps']:
            break

    model.load_state_dict(torch.load(f"{args['model_dir']}/{args['model_path']}", map_location=device))
    print("model successfully loaded!")
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

    preds, probs = inference_fn(model, test_loader, device)
    test_df['preds'] = preds
    test_df['probs'] = probs
    test_df['label_int'] = test_df['label_int'].fillna(NUM_CLASSES + 1)
    test_df['updated_preds'] = test_df['preds']
    test_df.loc[test_df['probs'] <= args['min_prob'], 'updated_preds'] = NUM_CLASSES + 1

    Path(args['output_dir']).mkdir(parents=True, exist_ok=True)
    test_df.to_csv(f"{args['output_dir']}/{args['test_file_name']}", index=False)
    
    acc1 = accuracy_score(test_df['label_int'], test_df['preds'])
    acc2 = accuracy_score(test_df['label_int'], test_df['updated_preds'])

    f11 = f1_score(test_df['label_int'], test_df['preds'], average='weighted')
    f12 = f1_score(test_df['label_int'], test_df['updated_preds'], average='weighted')

    print(f"Default: acc: {acc1}, f1_score: {f11}")
    print(f"Updated with Min Prob: acc: {acc2}, f1_score: {f12}")
Code example #12
File: tokenization.py  Project: skirdey/FARM
    def load(cls,
             pretrained_model_name_or_path,
             revision=None,
             tokenizer_class=None,
             use_fast=True,
             **kwargs):
        """
        Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
        model config or define it manually via `tokenizer_class`.

        :param pretrained_model_name_or_path:  The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
        :type pretrained_model_name_or_path: str
        :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
        :type revision: str
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param use_fast: (Optional, True by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or
            use the Python one (False).
            Only DistilBERT, BERT and Electra fast tokenizers are supported.
        :type use_fast: bool
        :param kwargs:
        :return: Tokenizer
        """
        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        kwargs["revision"] = revision

        if tokenizer_class is None:
            tokenizer_class = cls._infer_tokenizer_class(
                pretrained_model_name_or_path)

        logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
        # return appropriate tokenizer object
        ret = None
        if "AlbertTokenizer" in tokenizer_class:
            if use_fast:
                ret = AlbertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
            else:
                ret = AlbertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif "XLMRobertaTokenizer" in tokenizer_class:
            if use_fast:
                ret = XLMRobertaTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = XLMRobertaTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "RobertaTokenizer" in tokenizer_class:
            if use_fast:
                ret = RobertaTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = RobertaTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "DistilBertTokenizer" in tokenizer_class:
            if use_fast:
                ret = DistilBertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DistilBertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "BertTokenizer" in tokenizer_class:
            if use_fast:
                ret = BertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = BertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "XLNetTokenizer" in tokenizer_class:
            if use_fast:
                ret = XLNetTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
            else:
                ret = XLNetTokenizer.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif "ElectraTokenizer" in tokenizer_class:
            if use_fast:
                ret = ElectraTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = ElectraTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "EmbeddingTokenizer":
            if use_fast:
                logger.error(
                    'EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.'
                )
                ret = EmbeddingTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = EmbeddingTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "CamembertTokenizer" in tokenizer_class:
            if use_fast:
                ret = CamembertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = CamembertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "DPRQuestionEncoderTokenizer" in tokenizer_class:
            if use_fast:
                ret = DPRQuestionEncoderTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DPRQuestionEncoderTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "DPRContextEncoderTokenizer" in tokenizer_class:
            if use_fast:
                ret = DPRContextEncoderTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DPRContextEncoderTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "BigBirdTokenizer" in tokenizer_class:
            if use_fast:
                ret = BigBirdTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = BigBirdTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        if ret is None:
            raise Exception("Unable to load tokenizer")
        else:
            return ret
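
Assuming this `load` is the classmethod of FARM's `Tokenizer` class (as the file path suggests), usage would look like the following; the model name is illustrative:

from farm.modeling.tokenization import Tokenizer

tokenizer = Tokenizer.load("distilbert-base-uncased", use_fast=True)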
Code example #13
def main():
    parser = setup_parser()
    args = parser.parse_args()

    # specifies the path where the biobert or clinical bert model is saved
    if args.bert_model == 'biobert' or args.bert_model == 'clinical_bert':
        args.bert_model = args.model_loc

    print(args.bert_model)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "mednli": MedNLIProcessor,
        "goc": GOCProcessor
    }

    num_labels_task = {"cola": 2, "mnli": 3, "mrpc": 2, "mednli": 3, "goc": 2}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    #if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
    #    raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    #if not os.path.exists(args.output_dir):
    #    os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = DistilBertTokenizer.from_pretrained(
        args.bert_model, do_lower_case=args.do_lower_case)

    print('TRAIN')
    train = processor.get_train_examples(args.data_dir)
    print([(train[i].text_a, train[i].text_b, train[i].label)
           for i in range(3)])
    print('DEV')
    dev = processor.get_dev_examples(args.data_dir)
    print([(dev[i].text_a, dev[i].text_b, dev[i].label) for i in range(3)])
    print('TEST')
    test = processor.get_test_examples(args.data_dir)
    print([(test[i].text_a, test[i].text_b, test[i].label) for i in range(3)])

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(
            args.local_rank))
    model = DocDistilBertForSequenceClassification.from_pretrained(
        args.bert_model, cache_dir=cache_dir, num_labels=num_labels)

    if args.freeze_bert:
        print("FREEZING BERT")
        for param in model.distilbert.parameters():
            param.requires_grad = False

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          correct_bias=False)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=float(num_train_optimization_steps) *
            args.warmup_proportion,
            num_training_steps=num_train_optimization_steps)
        #optimizer = BertAdam(optimizer_grouped_parameters,
        #                     lr=args.learning_rate,
        #                     warmup=args.warmup_proportion,
        #                     t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_documents_to_features(train_examples,
                                                       label_list,
                                                       args.max_seq_length,
                                                       tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        print(len(train_features[0].input_ids))
        print(len(train_features[0].input_ids[0]))
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for epoch_num in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, logits, other = model(input_ids=input_ids,
                                            attention_mask=input_mask,
                                            labels=label_ids)

                #print(loss[0].shape)
                #print(loss[1].shape)
                #print(loss[2].shape)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1

            # Saving checkpoint
            save_checkpoint(model, args.output_dir,
                            "epoch_%d_checkpoint.pth" % epoch_num)

    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(
            model, 'module') else model  # unwrap DataParallel and save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Reload the fine-tuned model from the output directory
        model = DistilBertForSequenceClassification.from_pretrained(
            args.output_dir)
    else:
        model = DistilBertForSequenceClassification.from_pretrained(
            args.bert_model)
    model.to(device)

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_documents_to_features(eval_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss, logits, other = model(input_ids=input_ids,
                                                     attention_mask=input_mask,
                                                     labels=label_ids)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'global_step': global_step,
            'loss': loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if args.do_test and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        test_examples = processor.get_test_examples(args.data_dir)
        test_features = convert_documents_to_features(test_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running testing *****")
        logger.info("  Num examples = %d", len(test_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in test_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in test_features],
                                     dtype=torch.long)
        test_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        # Run prediction for full data
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        test_loss, test_accuracy = 0, 0
        nb_test_steps, nb_test_examples = 0, 0

        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                test_dataloader, desc="Testing"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_test_loss, logits, other = model(input_ids=input_ids,
                                                     attention_mask=input_mask,
                                                     labels=label_ids)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_test_accuracy = accuracy(logits, label_ids)

            test_loss += tmp_test_loss.mean().item()
            test_accuracy += tmp_test_accuracy

            nb_test_examples += input_ids.size(0)
            nb_test_steps += 1

        test_loss = test_loss / nb_test_steps
        test_accuracy = test_accuracy / nb_test_examples
        loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'test_loss': test_loss,
            'test_accuracy': test_accuracy,
            'global_step': global_step,
            'loss': loss
        }

        output_test_file = os.path.join(args.output_dir, "test_results.txt")
        with open(output_test_file, "w") as writer:
            logger.info("***** Test results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
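
# `warmup_linear` is called in the fp16 branch of the training loop above but
# never defined in this snippet. A minimal sketch, matching the linear
# warmup-then-decay schedule from the old pytorch-pretrained-bert utilities
# (assumption):
def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x / warmup
    return max((x - 1.0) / (warmup - 1.0), 0.0)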
Code example #14
import time

import tensorflow as tf
import tensorflow_datasets
from transformers import (BertTokenizer, TFBertForSequenceClassification,
                          DistilBertTokenizer,
                          TFDistilBertForSequenceClassification,
                          glue_convert_examples_to_features)

# Load MRPC data
data = tensorflow_datasets.load('glue/mrpc')

# Pick GPU device (only pick 1 GPU)
gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

# Load tokenizer, model from pretrained model/vocabulary
bert_tokenizer = BertTokenizer.from_pretrained('mrpc/1')
bert_model = TFBertForSequenceClassification.from_pretrained('mrpc/1')

valid_dataset = glue_convert_examples_to_features(data['validation'], bert_tokenizer, max_length=128, task='mrpc')
valid_dataset = valid_dataset.batch(64)

# Time inference for bert_model (the larger model)
start_time = time.time()
bert_results = bert_model.predict(valid_dataset)
bert_execution_time = time.time() - start_time

# Load tokenizer, model from pretrained model/vocabulary
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('mrpc/2')
distilbert_model = TFDistilBertForSequenceClassification.from_pretrained('mrpc/2')

valid_dataset = glue_convert_examples_to_features(data['validation'], distilbert_tokenizer, max_length=128, task='mrpc')
valid_dataset = valid_dataset.batch(64)

# Time inference for distilbert_model (the smaller, distilled model)
start_time = time.time()
distilbert_results = distilbert_model.predict(valid_dataset)
distilbert_execution_time = time.time() - start_time
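
# Report the comparison (sketch using the variables above); DistilBERT is
# expected to be markedly faster than BERT-base at similar MRPC accuracy.
print(f"BERT: {bert_execution_time:.1f}s, "
      f"DistilBERT: {distilbert_execution_time:.1f}s")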
Code example #15
# Handling model output
import torch
from transformers import (DistilBertForSequenceClassification,
                          DistilBertTokenizer)

# the place where we have our DistilBERT model
output_dir = "./model_save"

# maximum length of the comment/tweet; median lies below 64.
MAX_LEN = 64

# telling pytorch to use CPU for predicting outputs
device = torch.device("cpu")

# Load a trained model and vocabulary that you have fine-tuned
model = DistilBertForSequenceClassification.from_pretrained(output_dir)
tokenizer = DistilBertTokenizer.from_pretrained(output_dir)

# Move the model to the chosen device (CPU here).
model.to(device)

# softmax layer for converting predicted logits into probabilities
soft = torch.nn.Softmax(dim=1)  # explicit dim avoids a deprecation warning


def predict_sentiment(sentences):
    """Produces sentiment analysis on a list of sentences

    Args:
        sentences: a list of sentences to classify

    Returns:
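        A list of per-class probability vectors, one per sentence.
    """
    # The original snippet is truncated here; the body below is a minimal
    # sketch of the usual inference loop (assumption, not from the source).
    model.eval()
    results = []
    with torch.no_grad():
        for sentence in sentences:
            encoding = tokenizer(sentence,
                                 truncation=True,
                                 padding="max_length",
                                 max_length=MAX_LEN,
                                 return_tensors="pt").to(device)
            logits = model(**encoding).logits
            results.append(soft(logits).squeeze(0).tolist())
    return results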
Code example #16
    def fit(self, series: pd.Series):
        if self.tokenize_str == "bert":
            if self.doLower:
                tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            else:
                tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

            def generate_BERT_vectors(s):
                toks = tokenizer(s,
                                 return_attention_mask=True,
                                 padding="max_length",
                                 truncation=True)
                return (toks["input_ids"], toks["attention_mask"])

            self.tokenizer = generate_BERT_vectors
        elif self.tokenize_str == "distilbert":
            if self.doLower:
                tokenizer = DistilBertTokenizer.from_pretrained(
                    'distilbert-base-uncased')
            else:
                tokenizer = DistilBertTokenizer.from_pretrained(
                    'distilbert-base-cased')

            def generate_DistilBERT_vectors(s):
                toks = tokenizer(s,
                                 return_attention_mask=True,
                                 padding="max_length",
                                 truncation=True)
                return (toks["input_ids"], toks["attention_mask"])

            self.tokenizer = generate_DistilBERT_vectors
        elif self.tokenize_str == "roberta":
            tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')

            def generate_RoBERTa_vectors(s):
                toks = tokenizer(s,
                                 return_attention_mask=True,
                                 padding="max_length",
                                 truncation=True)
                return (toks["input_ids"], toks["attention_mask"])

            self.tokenizer = generate_RoBERTa_vectors
        elif self.tokenize_str == "fasttext":
            embeddingModel = fasttext.load_model(self.fasttextFile)

            def generate_fasttext_vectors(s):
                words = word_tokenize(s)
                words_embed = [
                    embeddingModel.get_word_vector(w) for w in words
                    if w.isalpha()
                ]
                return words_embed

            self.tokenizer = generate_fasttext_vectors
        elif self.tokenize_str == "bow":
            vectorizer = CountVectorizer()
            vectorizer.fit(series)
            self.tokenizer = vectorizer.transform
        elif self.tokenize_str == "tfidf":
            vectorizer = TfidfVectorizer()
            vectorizer.fit(series)
            self.tokenizer = vectorizer.transform
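
# A standalone sanity check (sketch, not part of the original class): with
# padding="max_length" and no explicit max_length, sequences are padded to
# the model maximum of 512 tokens.
from transformers import DistilBertTokenizer

tok = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
toks = tok("An example sentence.",
           return_attention_mask=True,
           padding="max_length",
           truncation=True)
print(len(toks["input_ids"]))  # 512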
Code example #17
    def __init__(self,
                 binaryClassification: bool,
                 model_str: str,
                 doLower: bool,
                 train_batchSize: int,
                 testval_batchSize: int,
                 learningRate: float,
                 doLearningRateScheduler: bool,
                 labelSentences: dict = None,
                 max_label_len=None,
                 model=None,
                 optimizer=None,
                 device="cpu"):
        self.binaryClassification = binaryClassification
        self.labelSentences = labelSentences
        self.model_str = model_str
        self.tokenizer = None
        self.device = device
        self.train_batchSize = train_batchSize
        self.testval_batchSize = testval_batchSize
        self.learningRate = learningRate
        self.optimizer = optimizer
        self.doLearningRateScheduler = doLearningRateScheduler
        self.learningRateScheduler = None
        self.max_label_len = max_label_len

        if self.binaryClassification:
            self.num_labels = 1
        else:
            self.num_labels = len(self.labelSentences.keys())

        if self.model_str == "distilbert":
            if doLower:
                self.model = DistilBertForSequenceClassification.from_pretrained(
                    'distilbert-base-uncased',
                    num_labels=self.num_labels,
                    output_attentions=False,
                    output_hidden_states=False)
                self.tokenizer = DistilBertTokenizer.from_pretrained(
                    'distilbert-base-uncased')
            else:
                self.model = DistilBertForSequenceClassification.from_pretrained(
                    'distilbert-base-cased',
                    num_labels=self.num_labels,
                    output_attentions=False,
                    output_hidden_states=False)
                self.tokenizer = DistilBertTokenizer.from_pretrained(
                    'distilbert-base-cased')
        elif self.model_str == "bert":
            if doLower:
                self.model = BertForSequenceClassification.from_pretrained(
                    'bert-base-uncased',
                    num_labels=self.num_labels,
                    output_attentions=False,
                    output_hidden_states=False)
                self.tokenizer = BertTokenizer.from_pretrained(
                    'bert-base-uncased')
            else:
                self.model = BertForSequenceClassification.from_pretrained(
                    'bert-base-cased',
                    num_labels=self.num_labels,
                    output_attentions=False,
                    output_hidden_states=False)
                self.tokenizer = BertTokenizer.from_pretrained(
                    'bert-base-cased')
        elif self.model_str == "roberta":
            self.model = RobertaForSequenceClassification.from_pretrained(
                'distilroberta-base',
                num_labels=self.num_labels,
                output_attentions=False,
                output_hidden_states=False)
            self.tokenizer = RobertaTokenizer.from_pretrained(
                'distilroberta-base')
        else:
            if model:
                if binaryClassification:
                    self.model = dict()
                    for key in self.labelSentences.keys():
                        self.model[key] = model
                else:
                    self.model = model
            else:
                logging.error(
                    "If model_str is not predefined, a model needs to be given."
                )
                sys.exit(
                    "If model_str is not predefined, a model needs to be given."
                )
Code example #18
File: embeddings.py  Project: gooppe/ir-20
from typing import Tuple

from transformers import DistilBertModel, DistilBertTokenizer

# PRETRAINED_WEIGHTS is defined elsewhere in the project,
# e.g. 'distilbert-base-uncased'.
def load_distillbert() -> Tuple[DistilBertTokenizer, DistilBertModel]:
    model = DistilBertModel.from_pretrained(PRETRAINED_WEIGHTS)
    tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_WEIGHTS)
    model.eval()

    return tokenizer, model
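
# Usage sketch (not from the source): mean-pool the last hidden state into a
# fixed-size sentence vector.
import torch

tokenizer, model = load_distillbert()
inputs = tokenizer("An example query.", return_tensors="pt")
with torch.no_grad():
    hidden = model(**inputs).last_hidden_state  # (1, seq_len, hidden_dim)
sentence_vec = hidden.mean(dim=1).squeeze(0)    # simple mean pooling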
Code example #19
import numpy as np
import tensorflow as tf
from transformers import TFDistilBertModel, DistilBertTokenizer
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# load huggingface
distilbert = tf.keras.models.load_model('model\\transformer')
model_name = 'distilbert-base-uncased'
huggingface_tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# load keras
bilstm = tf.keras.models.load_model('model\\bilstm')
with open('tokenizer.pickle', 'rb') as handle:
    keras_tokenizer = pickle.load(handle)


def huggingface_classify(input_text, tokenizer, model, max_len=120):
    clean = re.sub(r"[-()\"#/@;:<>{}=~|.?,]", "", str(input_text))
    if 'user' in clean:
        clean = clean.replace('user', '')  # drop leftover @user handles
    tokens = [
        tokenizer.encode_plus(t,
                              max_length=max_len,
                              padding='max_length',
                              truncation=True,
                              add_special_tokens=True) for t in [clean]
    ]
    tensor = np.array([a['input_ids'] for a in tokens])
    results = model.predict(tensor)
    results = np.argmax(results, axis=1)
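    # The snippet is truncated here; returning the predicted class indices
    # is the natural completion (assumption).
    return results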
Code example #20
File: distillbert_ner.py  Project: caonlp/ner
import torch
from transformers import DistilBertTokenizer

tags_vals = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]
tags_vals.append("PAD")
tag2idx = {t: i for i, t in enumerate(tags_vals)}

print("tags_vals: ", tags_vals)
print("tag2idx: ", tag2idx)


MAX_LEN = 512
bs = 2

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# n_gpu = torch.cuda.device_count()
# print("显卡名称: ", torch.cuda.get_device_name(0))

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case = True)

def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
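        # (completion of the truncated snippet)
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels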
Code example #21
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_loss, epoch_accu, y_test_actual, y_test_predicted, y_test_predicted_prob_list


##################################################################

# Prepare the data
df_path = config.df_path

# Returns the dataframe with reduced sentiment labels
# (note: it expects an already preprocessed dataframe)
df_new_reduced, sentiment_map, sentiment_demap = utility.data_process(
    dataset_path=df_path)

# Initiate the tokenizer
bert_tokenizer = DistilBertTokenizer.from_pretrained(
    config.PRE_TRAINED_MODEL_NAME)

# Create an instance of Preprocess (which wraps the Triage dataset class
# internally); it splits the data and encodes it with the given tokenizer.
Preprocess = prepare_data.Preprocess(dataframe=df_new_reduced,
                                     tokenizer=bert_tokenizer,
                                     max_len=config.MAX_LEN,
                                     train_batch_size=config.TRAIN_BATCH_SIZE,
                                     valid_batch_size=config.VALID_BATCH_SIZE,
                                     test_batch_size=config.TEST_BATCH_SIZE)

# Accessing the process_data_for_model method of Preprocess class
training_loader, valid_loader, testing_loader = Preprocess.process_data_for_model(
)
Code example #22
        print(f"URL:{s3_url}")
        os.makedirs(os.path.join(path), exist_ok=True)
        filename = Path(path_to_model)
        r = requests.get(s3_url)
        filename.write_bytes(r.content)

    return path_to_model, path


pretrained_weights = 'distilbert-base-cased'

path_to_model, path = download_model(
    "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-pytorch_model.bin",
    "pytorch_model.bin")

tokenizer = DistilBertTokenizer.from_pretrained(path)
bert_model = DistilBertModel.from_pretrained(path)

labels_list = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]


class linear_model(nn.Module):
    def __init__(self, bert_model, num_labels):
        super().__init__()

        embed_size = bert_model.config.hidden_size
        if pretrained_weights == 'distilbert-base-cased':
            dropout_prob = bert_model.config.dropout
        else:
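            # Completion sketch (the original snippet is truncated here):
            # full BERT configs name the dropout attribute differently.
            dropout_prob = bert_model.config.hidden_dropout_prob
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(embed_size, num_labels)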
Code example #23
    def fit(self, series: pd.Series):
        if self.args["tokenizer"] == "bert":
            if self.doLower:
                tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            else:
                tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

            def generate_BERT_vectors(s):
                toks = tokenizer(s,
                                 return_attention_mask=True,
                                 padding="max_length",
                                 truncation=True,
                                 max_length=self.max_length)
                return (toks["input_ids"], toks["attention_mask"])

            def tokenizer_fun(series):
                return pd.Series(series).progress_apply(
                    generate_BERT_vectors).values

            self.tokenizer = tokenizer_fun

        elif self.args["tokenizer"] == "distilbert":
            if self.doLower:
                # distilbert german uncased should be used, however a pretrained model does not exist
                tokenizer = DistilBertTokenizer.from_pretrained(
                    'distilbert-base-uncased')
            else:
                tokenizer = DistilBertTokenizer.from_pretrained(
                    'distilbert-base-cased')

            def generate_DistilBERT_vectors(s):
                toks = tokenizer(s,
                                 return_attention_mask=True,
                                 padding="max_length",
                                 truncation=True,
                                 max_length=self.max_length)
                return (toks["input_ids"], toks["attention_mask"])

            def tokenizer_fun(series):
                return pd.Series(series).progress_apply(
                    generate_DistilBERT_vectors).values

            self.tokenizer = tokenizer_fun

        elif self.args["tokenizer"] == "xlnet":
            if self.doLower:
                # XLNET uncased should be used, however a pretrained model does not exist
                tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
            else:
                tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

            def generate_XLM_vectors(s):
                toks = tokenizer(s,
                                 return_attention_mask=True,
                                 padding=True,
                                 truncation=True,
                                 max_length=self.max_length)
                return (toks["input_ids"], toks["attention_mask"])

            def tokenizer_fun(series):
                return pd.Series(series).progress_apply(
                    generate_XLM_vectors).values

            self.tokenizer = tokenizer_fun

        elif self.args["tokenizer"] == "roberta":
            if self.doLower:
                # roberta uncased should be used, however a pretrained model does not exist
                tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
            else:
                tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

            def generate_Roberta_vectors(s):
                toks = tokenizer(s,
                                 return_attention_mask=True,
                                 padding="max_length",
                                 truncation=True,
                                 max_length=self.max_length)
                return (toks["input_ids"], toks["attention_mask"])

            def tokenizer_fun(series):
                return pd.Series(series).progress_apply(
                    generate_Roberta_vectors).values

            self.tokenizer = tokenizer_fun

        elif self.args["tokenizer"] == "distilroberta":
            if self.doLower:
                # distilroberta uncased should be used, however a pretrained model does not exist
                tokenizer = RobertaTokenizer.from_pretrained(
                    'distilroberta-base')
            else:
                tokenizer = RobertaTokenizer.from_pretrained(
                    'distilroberta-base')

            def generate_DistilRoberta_vectors(s):
                toks = tokenizer(s,
                                 return_attention_mask=True,
                                 padding="max_length",
                                 truncation=True,
                                 max_length=self.max_length)
                return (toks["input_ids"], toks["attention_mask"])

            def tokenizer_fun(series):
                return pd.Series(series).progress_apply(
                    generate_DistilRoberta_vectors).values

            self.tokenizer = tokenizer_fun

        elif "fasttext" in self.args["tokenizer"]:
            embeddingModel = fasttext.load_model(self.fasttextFile)

            def generate_fasttext_vectors(s):
                words = word_tokenize(s)
                if "mean" in self.args["tokenizer"]:
                    words_embed = [
                        embeddingModel.get_word_vector(w) for w in words
                        if w.isalpha()
                    ]
                    words_embed = np.column_stack(words_embed).mean(axis=1)
                elif "max" in self.args["tokenizer"]:
                    words_embed = [
                        embeddingModel.get_word_vector(w) for w in words
                        if w.isalpha()
                    ]
                    words_embed = np.column_stack(words_embed).max(axis=1)
                else:
                    words = words[:self.max_length]
                    words_embed = [
                        embeddingModel.get_word_vector(w) for w in words
                        if w.isalpha()
                    ]
                return words_embed

            def tokenizer_fun(series):
                if "mean" in self.args["tokenizer"] or "max" in self.args[
                        "tokenizer"]:
                    return np.row_stack(
                        pd.Series(series).progress_apply(
                            generate_fasttext_vectors).values)
                else:
                    return pd.Series(series).progress_apply(
                        generate_fasttext_vectors).values

            self.tokenizer = tokenizer_fun

        elif self.args["tokenizer"] == "bow":
            vectorizer = CountVectorizer(ngram_range=(1, self.args["ngram"]),
                                         lowercase=self.doLower)
            vectorizer.fit(series)

            def tokenizer_fun(series):
                return vectorizer.transform(series)

            self.tokenizer = tokenizer_fun

        elif self.args["tokenizer"] == "tfidf":
            vectorizer = TfidfVectorizer(ngram_range=(1, self.args["ngram"]),
                                         lowercase=self.doLower)
            vectorizer.fit(series)

            def tokenizer_fun(series):
                return vectorizer.transform(series)

            self.tokenizer = tokenizer_fun
Code example #24
def main():
    n_colors = 2
    opt = parser.parse_args()

    data_path = os.path.join(os.getcwd(), 'gp_debias', 'wordlist', opt.lang,
                             'occupation_stereotype_list.tsv')

    if opt.model == 'y':
        device = torch.device('cpu')
        n_model = SequenceClassifier(model_name='distilbert-base-uncased',
                                     num_labels=3,
                                     cache_dir='./cache')

        state_dict = torch.load("trained_1575511705.pth", map_location=device)
        # Build a new OrderedDict without the `module.` prefix that
        # nn.DataParallel adds to parameter names when saving.
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            name = k[7:]  # strip the leading `module.`
            new_state_dict[name] = v
        # load the cleaned params
        n_model.model.load_state_dict(new_state_dict)
        model = n_model.model.distilbert

        print('loaded model')
    else:
        model = DistilBertModel.from_pretrained('distilbert-base-uncased')
    model.eval()
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    stereo_list = get_stereotype_words(data_path)

    if opt.load == 'n':
        basis = create_subspace(opt.lang, model, tokenizer)
    else:
        basis = load_subspace()

    X_vecs, sentences = proj_gen_space(tokenizer, model, stereo_list, basis,
                                       opt.lang)
    norms = norm(X_vecs, axis=2)
    import pprint
    pp = pprint.PrettyPrinter(indent=2)
    for i in range(len(sentences)):
        pp.pprint(
            sorted(list(zip(norms[i], sentences[i][0])), key=lambda x: x[0]))

    stereo_vecs = np.zeros((X_vecs.shape[0], X_vecs.shape[2]))
    for s in range(len(sentences)):
        stereo_vecs[s] = X_vecs[s, sentences[s][1], :]

    sent2bert, labeled_words, vecs_labels = train_kmeans(
        stereo_vecs, stereo_list, n_colors)

    train_svm(vecs_labels)

    print(sorted(labeled_words, key=lambda x: x[1]))
    pca_viz(stereo_vecs, labeled_words, n_colors)
    scores = score_vectors(tokenizer, model, stereo_list, basis, opt.lang)
    #print(scores)
    stereo_scores = list(reversed(sorted(scores, key=lambda x: x[1])))

    print()
    print()
    # print(list(stereo_scores))

    gen_df(labeled_words, stereo_scores)

    print('Done')
Code example #25
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
import pandas as pd
import torch

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased',
                                                return_token_type_ids=True)
model = DistilBertForQuestionAnswering.from_pretrained(
    'distilbert-base-uncased-distilled-squad')

data = pd.read_csv('examples.csv')
for idx, row in data.iterrows():
    context = row['context']
    question = row['question']
    encoding = tokenizer.encode_plus(question, context)
    input_ids, attention_mask = encoding["input_ids"], encoding[
        "attention_mask"]
    start_scores, end_scores = model(torch.tensor([input_ids]),
                                     attention_mask=torch.tensor(
                                         [attention_mask])).values()
    ans_tokens = input_ids[torch.argmax(start_scores
                                        ):torch.argmax(end_scores) + 1]
    answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens,
                                                    skip_special_tokens=True)
    answer_tokens_to_string = tokenizer.convert_tokens_to_string(answer_tokens)
    print(answer_tokens_to_string)
Code example #26
File: bert_base.py  Project: jeffmaxey/lpot
                        default=None,
                        help='output model path and name')
    parser.add_argument('--benchmark',
                        action='store_true',
                        default=False,
                        help='Get benchmark performance of quantized model.')
    parser.add_argument('--benchmark_nums',
                        type=int,
                        default=1000,
                        help="Benchmark numbers of samples")
    parser.add_argument('--accuracy_only',
                        action='store_true',
                        default=False,
                        help="Mode of benchmark")
    args = parser.parse_args()
    tokenizer = DistilBertTokenizer.from_pretrained(args.input_dir,
                                                    do_lower_case=True)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (
        args.task_name, )
    eval_dataset = load_and_cache_examples(args,
                                           args.task_name,
                                           tokenizer,
                                           evaluate=True)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, \
        batch_size=args.eval_batch_size)

    def eval_func(model):
        return evaluate_onnxrt(args, model, tokenizer, eval_dataloader)
Code example #27
from tqdm import tqdm
from joblib import dump, load

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score as acc
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

import torch
from transformers import DistilBertTokenizer, DistilBertModel

PREPROCESS = False
MICRO_MULT = 10

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
transformer = DistilBertModel.from_pretrained('distilbert-base-uncased',
                                              output_hidden_states=True)


def embed(x):
    x = tokenizer.encode(x, add_special_tokens=True)
    x = torch.tensor([x])
    with torch.no_grad():
        last_hs = transformer(x)[0]  # last layer hidden state
        # last_hs: (bs, seq_len, hid_dim); take the final token's hidden state
        final_hs = torch.squeeze(last_hs, 0)[-1, :]
    emb_X = final_hs.detach().numpy()
    return emb_X
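
# Usage sketch: embed a short text; the vector has the model's hidden size
# (768 for distilbert-base-uncased).
vec = embed("The staff were friendly and helpful.")
print(vec.shape)  # (768,)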

Code example #28
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_config_file",
        default=None,
        type=str,
        required=True,
        help="The config json file corresponding to the pre-trained BERT model. "
        "This specifies the model architecture.")
    parser.add_argument(
        "--vocab_file",
        default=None,
        type=str,
        required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument("--debug",
                        default=False,
                        action='store_true',
                        help="Whether to run in debug mode.")
    parser.add_argument("--data_dir",
                        default='data/semeval_14',
                        type=str,
                        help="SemEval data dir")
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        help="SemEval xml for training")
    parser.add_argument("--predict_file",
                        default=None,
                        type=str,
                        help="SemEval csv for prediction")
    parser.add_argument("--extraction_file",
                        default=None,
                        type=str,
                        help="pkl file for extraction")
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help="Whether to lower case the input text. Should be True for uncased "
        "models and False for cased models.")
    parser.add_argument(
        "--max_seq_length",
        default=96,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_pipeline",
                        default=False,
                        action='store_true',
                        help="Whether to run pipeline on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate",
                        default=2e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=8,
                        type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
        "of training.")
    parser.add_argument(
        "--save_proportion",
        default=0.5,
        type=float,
        help="Proportion of steps to save models for. E.g., 0.5 = 50% "
        "of training.")
    parser.add_argument(
        "--verbose_logging",
        default=False,
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--optimize_on_cpu',
        default=False,
        action='store_true',
        help=
        "Whether to perform optimization and keep the optimizer averages on CPU"
    )
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=128,
        help=
        'Loss scaling, positive power of 2 values can improve fp16 convergence.'
    )

    args = parser.parse_args()
    if not args.do_train and not args.do_predict and not args.do_pipeline:
        raise ValueError(
            "At least one of `do_train`, `do_predict` or `do_pipeline` must be True."
        )

    if args.do_train and not args.train_file:
        raise ValueError(
            "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict and not args.predict_file:
        raise ValueError(
            "If `do_predict` is True, then `predict_file` must be specified.")
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info(
                "16-bits training currently not supported in distributed training"
            )
            args.fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info(
        "torch_version: {} device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}"
        .format(torch.__version__, device, n_gpu, bool(args.local_rank != -1),
                args.fp16))

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    logger.info('output_dir: {}'.format(args.output_dir))
    save_path = os.path.join(args.output_dir, 'checkpoint_ml_cls.pth.tar')
    log_path = os.path.join(args.output_dir, 'performance_ml_cls.txt')
    network_path = os.path.join(args.output_dir, 'network_ml_cls.txt')
    parameter_path = os.path.join(args.output_dir, 'parameter_ml_cls.txt')
    predictions_path = os.path.join(args.output_dir, 'predictions_ml_cls.txt')

    logger.info("***** Preparing model *****")
    model = DistillBertForMultilabelClassification()
    model.to(device)

    if args.init_checkpoint is not None and os.path.isfile(save_path):
        checkpoint = torch.load(save_path, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer = torch.optim.Adam(params=model.parameters(),
                                     lr=args.learning_rate)
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info(
            "Loading model from pretrained checkpoint: {}".format(save_path))
    else:
        optimizer = torch.optim.Adam(params=model.parameters(),
                                     lr=args.learning_rate)

    logger.info("***** Preparing data *****")
    training_loader, testing_loader = read_train_data(args, tokenizer, logger)

    if args.do_train:
        logger.info("***** Preparing training *****")

        #bert
        def loss_fn(outputs, targets):
            return torch.nn.BCEWithLogitsLoss()(outputs, targets)

        def calcuate_accu(big_idx, targets):
            n_correct = (big_idx == targets).sum().item()
            return n_correct

        def train(epoch):
            tr_loss = 0
            n_correct = 0
            nb_tr_steps = 0
            nb_tr_examples = 0
            model.train()
            for _, data in enumerate(training_loader, 0):
                ids = data['ids'].to(device, dtype=torch.long)
                mask = data['mask'].to(device, dtype=torch.long)
                #bert
                #token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
                targets = data['targets'].to(device, dtype=torch.float)

                #outputs = model(ids, mask, token_type_ids)
                outputs = model(ids, mask)
                loss = loss_fn(outputs, targets)
                tr_loss += loss.item()
                # NOTE: n_correct is never updated in this multilabel setup,
                # so the accuracy figures printed below remain zero.
                big_val, big_idx = torch.max(outputs.data, dim=1)

                nb_tr_steps += 1
                nb_tr_examples += targets.size(0)

                if _ % 5000 == 0:
                    loss_step = tr_loss / nb_tr_steps
                    accu_step = (n_correct * 100) / nb_tr_examples
                    print(f"Training Loss per 5000 steps: {loss_step}")
                    print(f"Training Accuracy per 5000 steps: {accu_step}")

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            print(
                f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}'
            )
            epoch_loss = tr_loss / nb_tr_steps
            epoch_accu = (n_correct * 100) / nb_tr_examples
            print(f"Training Loss Epoch: {epoch_loss}")
            print(f"Training Accuracy Epoch: {epoch_accu}")
            return

        for epoch in range(args.num_train_epochs):
            train(epoch)

        torch.save(
            {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'step': 0
            }, save_path)

    logger.info("***** Running validation *****")
    f = open(log_path, "a")
    for epoch in range(3):  # repeated validation passes over the same fixed model
        outputs, targets = validation(args, model, device, testing_loader)
        outputs = np.array(outputs) >= 0.5
        from sklearn import metrics
        accuracy = metrics.accuracy_score(targets, outputs)
        recall = metrics.recall_score(targets, outputs, average='samples')
        f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
        f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
        f1 = metrics.f1_score(targets, outputs, average='samples')
        print("epoch: {}, P: {:.4f}, R: {:.4f}, F1: {:.4f}".format(
            epoch + 1, accuracy, recall, f1),
              file=f)
        print(" ", file=f)
    f.close()
    print("epoch: {}, P: {:.4f}, R: {:.4f}, F1: {:.4f}".format(
        epoch + 1, accuracy, recall, f1))
    if args.do_predict:
        logger.info("***** Running prediction *****")

        # restore from best checkpoint
        if save_path and os.path.isfile(save_path) and args.do_train:
            checkpoint = torch.load(save_path)
            model.load_state_dict(checkpoint['model'])
            logger.info("Loading model from finetuned checkpoint: '{}'".format(
                save_path))

        model.eval()
        results = validation(args,
                             model,
                             device,
                             testing_loader,
                             write_pred=True,
                             predictions_path=predictions_path)
Code example #29
        if not discard:
            train = list(train) + [x for x in test if x in no_test]
        test = [x for x in test if x not in no_test]
        yield (train, test)


from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import tensorflow as tf
import tensorflow_addons as tfa

tf.config.threading.set_intra_op_parallelism_threads(8)
tf.config.threading.set_inter_op_parallelism_threads(8)

MODEL_NAME = 'distilbert-base-uncased'

tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)


def create_train_val(x, y, train, val):
    train_encodings = tokenizer(list(x[train].values),
                                truncation=True,
                                padding=True)
    val_encodings = tokenizer(list(x[val].values),
                              truncation=True,
                              padding=True)

    train_dataset = tf.data.Dataset.from_tensor_slices(
        (dict(train_encodings), list(y[train].values)))
    val_dataset = tf.data.Dataset.from_tensor_slices(
        (dict(val_encodings), list(y[val].values)))
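
# The snippet ends here; a typical per-fold continuation (sketch, not from
# the source) would fine-tune and evaluate the model on these datasets:
def train_fold(train_dataset, val_dataset, num_labels=2):
    model = TFDistilBertForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=num_labels)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'])
    model.fit(train_dataset.shuffle(1000).batch(16),
              validation_data=val_dataset.batch(16),
              epochs=2)
    return model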
Code example #30
    def __init__(self, config):
        # self.name, self.num_classes, epochs, batches
        
        self.Configs = config
        self.num_classes = len(config.label_list)
        
        self.train_logits = []
        self.validation_logits = []
        self.test_logits = []

        self.train_texts = []
        self.train_labels = []
        self.validation_texts = []
        self.validation_labels = []
        self.test_texts = []
        self.test_labels = []

        train = pd.read_csv(os.path.join(self.Configs.data_dir, 'train.csv'))
        try:
            dev = pd.read_csv(os.path.join(self.Configs.data_dir, 'dev.csv'))
        except FileNotFoundError:
            dev = None
            print('Validation disabled.')
        test = pd.read_csv(os.path.join(self.Configs.data_dir, 'test.csv'))

        self.train_texts = train['text'].tolist()
        self.train_labels = train['label'].tolist()
        if dev is not None:
            self.validation_texts = dev['text'].tolist()
            self.validation_labels = dev['label'].tolist()
        self.test_texts = test['text'].tolist()
        # Test labels are unknown at this point; use 0 as a placeholder.
        self.test_labels = [0] * len(self.test_texts)

        
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            print('No GPU available, using the CPU instead.')
            self.device = torch.device("cpu")
        
        if self.Configs.model_name == 'albert':
            self.model = AlbertForSequenceClassification.from_pretrained(
                self.Configs.pretrained_model_dir, num_labels=self.num_classes)
            self.tokenizer = AlbertTokenizer.from_pretrained(
                self.Configs.pretrained_model_dir)
        elif self.Configs.model_name == 'distilbert':
            self.model = DistilBertForSequenceClassification.from_pretrained(
                self.Configs.pretrained_model_dir, num_labels=self.num_classes)
            self.tokenizer = DistilBertTokenizer.from_pretrained(
                self.Configs.pretrained_model_dir)
        elif self.Configs.model_name == 'roberta':
            self.model = RobertaForSequenceClassification.from_pretrained(
                self.Configs.pretrained_model_dir, num_labels=self.num_classes)
            self.tokenizer = RobertaTokenizer.from_pretrained(
                self.Configs.pretrained_model_dir)
            
        if torch.cuda.is_available():
            self.model.cuda()