Example #1
    def __init__(self, MODEL_NAME):
        super(Classifier, self).__init__()
        self.l1 = BertModel.from_pretrained(MODEL_NAME)
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.classifier = torch.nn.Linear(768, 2)
        self.dropout = torch.nn.Dropout(0.3)
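
A forward pass is not shown in this example; a minimal sketch (assuming the usual pattern for this kind of head: pooled [CLS] output, intermediate projection, dropout, logits) could look like this:

    def forward(self, input_ids, attention_mask):
        # Pooled [CLS] representation from BERT.
        pooled = self.l1(input_ids, attention_mask=attention_mask)[1]
        x = torch.nn.functional.relu(self.pre_classifier(pooled))
        x = self.dropout(x)
        return self.classifier(x)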
Example #2
    def __init__(self, args) -> None:
        super(SSNDM, self).__init__()

        self.hidden_size = args.hidden_size
        self.ext_ff_size = args.ext_ff_size
        self.ext_head_num = args.ext_head_num
        self.ext_dropout = args.ext_dropout
        self.ext_layer_num = args.ext_layer_num

        self.sect_num = args.sect_num
        self.max_seg_num = args.max_seg_num
        self.max_seg_len = args.max_seg_len

        self.memory_slots = args.memory_slots
        self.memory_dim = args.memory_dim
        self.memory_hops = args.memory_hops

        self.gat_head_num = args.gat_head_num
        self.per_head_dim = self.hidden_size // self.gat_head_num
        self.gat_dropout = args.gat_dropout

        self.bert_version = args.bert_version

        self.segment_encoder = BertModel.from_pretrained(self.bert_version)
        self.ext_transformer = TransformerEncoder(self.hidden_size,
                                                  self.ext_ff_size,
                                                  self.ext_head_num,
                                                  self.ext_dropout,
                                                  self.ext_layer_num)

        self.initial_memory = nn.Parameter(
            torch.normal(mean=0,
                         std=1,
                         size=(self.memory_slots, self.memory_dim)))

        self.section_embeddings = torch.nn.Embedding(self.sect_num,
                                                     self.hidden_size)
        self.segment_position_embeddings = torch.nn.Embedding(
            self.max_seg_num, self.hidden_size)

        self.mlp_pred = torch.nn.Sequential(
            collections.OrderedDict([
                ('pred_dense_1', torch.nn.Linear(self.hidden_size * 3, 1024)),
                ('pred_relu_1', torch.nn.ReLU()),
                ('pred_dense_2', torch.nn.Linear(1024, 512)),
                ('pred_relu_2', torch.nn.ReLU()),
                ('pred_dense_3', torch.nn.Linear(512, 1)),
            ]))

        self.mlp_structural = torch.nn.Sequential(
            collections.OrderedDict([('structural_dense_1',
                                      torch.nn.Linear(self.hidden_size,
                                                      self.hidden_size)),
                                     ('structural_tanh_1', torch.nn.Tanh())]))

        self.mlp_gate = torch.nn.Sequential(
            collections.OrderedDict([
                ('gate_dense_1', torch.nn.Linear(768 * 2, 1)),
                ('gate_sigmoid_1', torch.nn.Sigmoid()),
            ]))
        self.mlp_merge = torch.nn.Sequential(
            collections.OrderedDict([
                ('merge_dense_1', torch.nn.Linear(768 * 2, 768)),
                ('merge_tanh_1', torch.nn.Tanh()),
            ]))

        self.mlp_att_q_lst = nn.ModuleList([
            torch.nn.Linear(self.hidden_size,
                            self.hidden_size // self.gat_head_num)
            for i in range(self.gat_head_num)
        ])
        self.mlp_att_k_lst = nn.ModuleList([
            torch.nn.Linear(self.hidden_size,
                            self.hidden_size // self.gat_head_num)
            for i in range(self.gat_head_num)
        ])
        self.mlp_att_v_lst = nn.ModuleList([
            torch.nn.Linear(self.hidden_size,
                            self.hidden_size // self.gat_head_num)
            for i in range(self.gat_head_num)
        ])

        self.attn_graph = MultiHeadAttention(input_dim=self.hidden_size,
                                             output_dim=self.hidden_size,
                                             dropout=self.gat_dropout)

        self.sigmoid = nn.Sigmoid()
Example #3
                output = np.swapaxes(output, 0, 1)
                list_output.append(output)

                # ====== Construct Cache ====== #
                temp_cache = {}
                for i, sent in enumerate(mini_batch):
                    hash_key = hashlib.sha256(sent.encode()).hexdigest()
                    temp_cache[hash_key] = output[i]
                self.cache.update(temp_cache)

                idx += mini_batch_size
                self.count += mini_batch_size
            output = np.concatenate(list_output, 0)
            te = time.time()
            print('encoding with model', len(sentences), 'processed', self.count, 'took', '{:4.1f}'.format(te-ts))


        te = time.time()
        embedding = self.get_multi_head_embedding(output, heads, head_size)
        return embedding


if __name__ == '__main__':
    model = BertModel.from_pretrained("/home/users/whwodud98/exp/MRPC/bert-base-uncased/last3/0/checkpoint-0")
    model.cuda()
    model = torch.nn.DataParallel(model)
    model.eval()
    tokenizer = BertTokenizer.from_pretrained("/home/users/whwodud98/exp/MRPC/bert-base-uncased/last3/0/checkpoint-0", do_lower_case=True)


Example #4
 def __init__(self, n_classes):
     super(SentimentClassifier, self).__init__()
     self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
     self.drop = nn.Dropout(p=0.3)
     self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
 def __init__(self, feature_engineer=False):
     '''
     @description: initialize the class, e.g. the models
     @param {type} :
     feature_engineer: whether to use feature engineering; if `False`, compare common ML models
     res_model: ResNet model
     resnext_model: ResNeXt model
     wide_model: Wide ResNet model
     bert: BERT model
     ml_data: MLData instance
     @return: None
     '''
     ### TODO
     # 1. Initialize the resnet152 model with torchvision
     # 2. Initialize the resnext101_32x8d model with torchvision
     # 3. Initialize the wide_resnet101_2 model with torchvision
     # 4. Load the BERT model
     print("load")
     self.res_model = torchvision.models.resnet152(pretrained=False)
     self.res_model.load_state_dict(torch.load(config.root_path + '/model/resnet150/resnet152-b121ed2d.pth'))
     self.res_model = self.res_model.to(config.device)
     print("res152 is ok")
     #self.resnext_model =torchvision.models.resnext101_32x8d(pretrained=True)
     #self.resnext_model = self.resnext_model.to(config.device)
     #self.wide_model = torchvision.models.wide_resnet101_2(pretrained=True)
     #self.wide_model = self.wide_model.to(config.device)
     
     self.bert_tokenizer = BertTokenizer.from_pretrained(config.root_path + '/model/bert')
     self.bert = BertModel.from_pretrained(config.root_path + '/model/bert')
     self.bert = self.bert.to(config.device)
     print("bert is ok")
     self.ml_data = MLData(debug_mode=True)
     if feature_engineer:
         self.model = lgb.LGBMClassifier(objective='multiclass',
                                         n_jobs=10,
                                         num_class=33,
                                         num_leaves=30,
                                         reg_alpha=10,
                                         reg_lambda=200,
                                         max_depth=3,
                                         learning_rate=0.05,
                                         n_estimators=2000,
                                         bagging_freq=1,
                                         bagging_fraction=0.9,
                                         feature_fraction=0.8,
                                         seed=1440)
     else:
         self.models = [
             RandomForestClassifier(n_estimators=500,
                                    max_depth=5,
                                    random_state=0),
             LogisticRegression(solver='liblinear', random_state=0),
             MultinomialNB(),
             SVC(),
             lgb.LGBMClassifier(objective='multiclass',
                                n_jobs=10,
                                num_class=33,
                                num_leaves=30,
                                reg_alpha=10,
                                reg_lambda=200,
                                max_depth=3,
                                learning_rate=0.05,
                                n_estimators=2000,
                                bagging_freq=1,
                                bagging_fraction=0.8,
                                feature_fraction=0.8),
         ]
Example #6
                    type=int,
                    required=False,
                    help="Number of Epochs")

    args = parser.parse_args()
    
    if args.featurizer == "cls_featurizer":
        feat = featurizer.cls_featurizer
    elif args.featurizer == "avg_pooling_featurizer":
        feat = featurizer.avg_pooling_featurizer
    else:
        raise ValueError("Please enter name of existing featurizer")

    hf_weights_name = 'bert-base-uncased'
    bert_tokenizer = BertTokenizer.from_pretrained(hf_weights_name)
    bert_model = BertModel.from_pretrained(hf_weights_name)
    for param in bert_model.parameters():
        param.requires_grad = False

    train_dataset = PPDBDataset(corpus_path=args.data_path,
                        tokenizer=bert_tokenizer,
                        encoder=bert_model,
                        seq_len=128) # max seq_length is 129, 128 is appropriate length

    model = logisticRegressionClassifier(2, input_dim=768)
    train(train_dataset, model, encoder=bert_model, featurizer=feat, epochs=args.epochs, batch_size=8)




 def build_embedder(self):
     """ load bert here """
     return BertModel.from_pretrained(self.opt.bert_path)
Example #8
    x = sentence - center
    return np.linalg.norm(x, 2)


data_path = '/home/ubuntu/likun/nlp_data/cluster/li_event.csv'
df = pd.read_csv(data_path)
test_text_col = 'use'

kws_path = '/home/ubuntu/likun/nlp_data/cluster/event_sentiment_kws.csv'
kws = pd.read_csv(kws_path)
sentiments = ['正向', '中立', '负向']
centers = [kws[s][0].replace(';', ' ') for s in sentiments]

bert_model_path = '/home/ubuntu/likun/nlp_pretrained/bert-wwm-ext'
tokenizer = BertTokenizer.from_pretrained(bert_model_path, do_lower_case=True)
model = BertModel.from_pretrained(bert_model_path)

sentence_vectors = [
    get_vector(text, tokenizer, model)
    for text in tqdm.tqdm(df[test_text_col].tolist())
]
center_vectors = [
    get_vector(text, tokenizer, model) for text in tqdm.tqdm(centers)
]

distances = []
positions = []
for sentence in sentence_vectors:
    center_dis = []
    for center in center_vectors:
        center_dis.append(distance(sentence, center))
Example #9
    def __init__(self, uncased, temp_dir, finetune=False):
        super(Bert, self).__init__()
        self.model = BertModel.from_pretrained(
            'huseinzol05/bert-base-bahasa-cased')
        self.finetune = finetune
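
The listed snippet ends at the constructor; a minimal forward-pass sketch (the gradient gating on self.finetune is an assumption, not shown in the original) could be:

    def forward(self, input_ids, attention_mask):
        # Return the token-level encoder outputs; keep BERT frozen unless fine-tuning.
        if self.finetune:
            top_vec = self.model(input_ids, attention_mask=attention_mask)[0]
        else:
            self.eval()
            with torch.no_grad():
                top_vec = self.model(input_ids, attention_mask=attention_mask)[0]
        return top_vec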
Example #10
 def __init__(self, num_labels):
     super(VanillaBertLayer, self).__init__()
     self.bert = BertModel.from_pretrained(config.bert_directory,
                                           output_hidden_states=True,
                                           output_attentions=True,
                                           num_labels=num_labels)
Example #11
                                            num_labels=2)
        config.update({"return_dict": False})
        model = BertForSequenceClassification(config=config)
        #model = CharacterBertForSequenceClassification(config=config)
        model.bert = CharacterBertModel.from_pretrained(os.path.join(
            'pretrained_models', "general_character_bert"),
                                                        config=config)
    else:
        logging.info('Loading %s model', "general_bert")
        config = BertConfig.from_pretrained(os.path.join(
            'pretrained_models', "bert-base-uncased"),
                                            num_labels=2)
        config.update({"return_dict": False})
        model = BertForSequenceClassification(config=config)
        model.bert = BertModel.from_pretrained(os.path.join(
            'pretrained_models', "bert-base-uncased"),
                                               config=config)
    model.to(args.device)

    if args.freeze_backbone:
        # freeze all params except classification weights
        for param_name, param in model.named_parameters():
            if 'classifier' not in param_name:
                param.requires_grad = False

        for param_name, param in model.named_parameters():
            logging.info("Param %s, requires_grad = %s, size: %s" %
                         (param_name, param.requires_grad, param.size()))

    # train model using custom train/eval loop
    # global_step, train_loss, best_val_metric, best_val_epoch = train(
Example #12
import itertools
import sys

import numpy as np
import spacy
import torch
from lxml import etree
from scipy.spatial.distance import cosine
from spacy.tokens.span import Span
from transformers import BertModel, BertTokenizerFast

from cdcrapp import CLIContext
from cdcrapp.model import Task, NewsArticle, SciPaper

torch.cuda.init()

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').cuda()
nlp = spacy.load('en')


def extract_mentions(text: str):
    """Extract named entities and noun phrases"""

    doc = nlp(text)

    return list(doc.ents) + list(doc.noun_chunks)


def get_tokens_by_offset(start: int,
                         end: int,
                         model_inputs: dict,
                         second_doc=False):
Example #13
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased')
model = BertModel.from_pretrained('bert-base-german-cased')
text = 'Wie alt bist du?'
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
print(output.pooler_output.detach().numpy())
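
The output object also carries token-level embeddings; a small follow-up (not part of the original example) to inspect their shape:

print(output.last_hidden_state.shape)  # (1, num_tokens, 768) for bert-base-german-cased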
Example #14
import itertools
import pickle

import torch
from transformers import BertModel, BertTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Extract SVO triples from corpora

flat = itertools.chain.from_iterable

BERT_DIM = 768

# Load pre-trained model tokenizer (vocabulary)
# Crucially, do not do basic tokenization; PTB is tokenized. Just do wordpiece tokenization.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

bert_model = BertModel.from_pretrained('bert-base-multilingual-cased',
                                       output_hidden_states=True)
bert_model.to('cuda')
bert_model.eval()


def aggregate_by_key(f, xs):
    d = {}
    for x in xs:
        key = f(x)
        if key in d:
            d[key].append(x)
        else:
            d[key] = [x]
    return d
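
# Illustrative usage (not part of the original file):
#   aggregate_by_key(len, ["a", "bb", "cc"]) == {1: ["a"], 2: ["bb", "cc"]}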

                    type=str2bool,
                    nargs='?',
                    help="Include original sentence in output")
parser.add_argument('--threshold',
                    default=0.003,
                    type=float,
                    help="Any attention score lower than this is removed")

args = parser.parse_args()

use_cuda = args.use_cuda
language_model = args.language_model

if __name__ == '__main__':
    tokenizer = AutoTokenizer.from_pretrained(language_model)
    encoder = BertModel.from_pretrained(language_model)
    encoder.eval()
    if use_cuda:
        encoder = encoder.cuda()
    input_filename = args.input_filename
    output_filename = args.output_filename
    include_sentence = args.include_text_output

    cached_entity_links = set()

    with open(input_filename, 'r') as f, open(output_filename, 'w') as g:
        for idx, line in enumerate(tqdm(f)):
            sentence = line.strip()
            if len(sentence):
                valid_triplets = []
                doc = Doc()
    def __init__(self):
        super().__init__()

        self.bert = BertModel.from_pretrained(
            "bert-base-chinese", cache_dir="/data1/xul/models/bert/")
        self.qa_outputs = nn.Linear(768, 2)
Example #17
def load_model(use_covidbert=False):
    """Function that loads and returns the CovidBERT model"""

    # # Load CovidBERT
    # if use_covidbert:
    #     print("Loading model...")
    #     model = AutoModelForMaskedLM.from_pretrained("deepset/covid_bert_base")
    #     print("Loading tokenizer...")
    #     tokenizer = AutoTokenizer.from_pretrained("deepset/covid_bert_base")
    #     print("Finished loading the model successfully!")

    #model = SentenceTransformer(model_path)

    # #Load CovidBERT
    # if use_covidbert:
    #     print("Loading model...")
    #     model = AutoModelWithLMHead.from_pretrained("manueltonneau/clinicalcovid-bert-nli")
    #     print("Loading tokenizer...")
    #     print("\n")
    #     tokenizer = AutoTokenizer.from_pretrained("manueltonneau/clinicalcovid-bert-nli")
    #     print("\n")
    #     print("Finished loading the model successfully!")

    #     # Save the model to model path
    #     model_path = os.path.join("models","clinicalcovid")
    #     if not os.path.exists(model_path):
    #         os.makedirs(model_path)
    #     model.save_pretrained(model_path)
    #     tokenizer.save_pretrained(model_path)

    #     model = SentenceTransformer(model_path)

    # Load CovidBERT
    if use_covidbert:
        print("Loading model...")
        model = AutoModelWithLMHead.from_pretrained("gsarti/covidbert-nli")
        print("Loading tokenizer...")
        print("\n")
        tokenizer = AutoTokenizer.from_pretrained("gsarti/covidbert-nli")
        print("\n")
        print("Finished loading the model successfully!")

        # Save the model to model path
        model_path = os.path.join("models", "gsarticovid")
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
        print(f"Successfully saved model to {model_path}")

        print("Loading Sentence Transformer now!")
        word_embedding_model = models.BERT(
            model_path,
            # max_seq_length=args.max_seq_length,
            # do_lower_case=args.do_lower_case
        )
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        rmtree(model_path)
        model.save(model_path)
        print("Finished building Sentence Transformer!")

    # Load regular BERT
    else:
        print("Loading BERT")
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')
        print("Finished loading BERT")

    return model, tokenizer
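
A possible call site for this helper (hypothetical, not part of the original file):

if __name__ == '__main__':
    model, tokenizer = load_model(use_covidbert=False)
    print(type(model).__name__, type(tokenizer).__name__)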
Example #18
from data.mybloodyplots import mybloodyplots
from transformers import BertModel, BertTokenizer
import torch
import numpy
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import re
import os
from torch.nn import CosineSimilarity
from tqdm import tqdm
from scipy.stats import spearmanr
import matplotlib
import matplotlib._color_data as mcd

bert_model = BertModel.from_pretrained('bert-base-uncased',
                                       output_hidden_states=True)
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
cos = CosineSimilarity(dim=-1, eps=1e-6)
os.makedirs('results', exist_ok=True)

datasets = ['men.txt', 'simlex999.txt']
#datasets = ['simlex999.txt']
golden = mcd.CSS4_COLORS['goldenrod']
teal = mcd.CSS4_COLORS['steelblue']
colors = [teal, golden]
current_font_folder = '/import/cogsci/andrea/fonts'

for dataset in datasets:

    word_vectors = defaultdict(list)
Example #19
        cross_attentions = []
        for i in range(len(self.blocks)):
            block_out, self_attn, cross_attn = self.blocks[i](out_, attn_mask = attn_masks, encoder_outputs=encoder_outputs, return_self_attn=return_attn, return_cross_attn=return_attn)             
            #print(i, block_out)
            intermediate_outputs.append(block_out)
            self_attentions.append(self_attn)
            cross_attentions.append(cross_attn)
            out_ = block_out
            block_out = None
        return out_, intermediate_outputs
        
        
if __name__ == "__main__":
    config = {
        "vocab_size":30522,
        "model_dims":768,
        "segments":2,
        "blocks":{
            "attn_heads":12,
            "attn_dims":64,
            "ff_inner_dims":3072
        },
        "numblocks":12,
        "dropout":0.1,
        "layerNormEps":1e-12
    }
    print(config)
    t = Transformer(config)
    from transformers import BertModel, BertConfig
    model = BertModel.from_pretrained("bert-base-uncased")
    print(model.config)
Example #20
 def __init__(self, n_classes, args=None):
     super().__init__()
     self.bert = BertModel.from_pretrained('bert-base-uncased')
     self.fc = nn.Linear(768, n_classes)
 def test_model_from_pretrained(self):
     for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
         model = BertModel.from_pretrained(model_name)
         self.assertIsNotNone(model)
import torch
from transformers import BertModel, BertTokenizer
from pk_classifier.utils import preprocess, encode_bert, tokens_emb, sentence_emb
import spacy

FAKE_TEXTS = [
    'Candida infections have increased due to the growth and expansion of susceptible patient '
    'populations.', 'The clearance (CLz) of midazolam was 3.25l/h', ''
]

SPACY_MODEL = spacy.load("en_ner_bc5cdr_md")
BERT_TOKENIZER = BertTokenizer.from_pretrained("monologg/biobert_v1.1_pubmed")
BERT_MODEL = BertModel.from_pretrained("monologg/biobert_v1.1_pubmed",
                                       output_hidden_states=True)


def test_preprocess():
    docs = list(SPACY_MODEL.pipe(FAKE_TEXTS))
    docs_preprocessed = [
        preprocess(spacy_obj=doc,
                   ident='',
                   unifier=' ;; ',
                   ngram=1,
                   masking=False) for doc in docs
    ]
    assert len(docs_preprocessed) == len(docs)
    assert docs_preprocessed[
        0] == 'candida ;; infect ;; increas ;; growth ;; expans ;; suscept ;; patient ;; popul'
    assert docs_preprocessed[1] == 'clearanc ;; clz ;; midazolam ;; ## ;; lh'
    assert docs_preprocessed[2] == ''
Example #23
def run():
    args = parser.parse_args()
    data = args.data
    nlayer = args.nlayer
    file_path = args.file_path
    save_path = os.path.join(file_path, 'model_params')
    MAX_VOCAB_SIZE = 50000

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    with open('aux_files_transfer/dataset_%d.pkl' % MAX_VOCAB_SIZE, 'rb') as f:
        dataset = pickle.load(f)

    #    skip_list = np.load('aux_files/missed_embeddings_counter_%d.npy' %MAX_VOCAB_SIZE)
    embedding_matrix = np.load('aux_files_transfer/embeddings_glove_%d.npy' %
                               (MAX_VOCAB_SIZE))
    embedding_matrix = torch.tensor(embedding_matrix.T).to(device)

    # pytorch

    if data.lower() == 'imdb':
        data_path = 'aclImdb'

    bert = BertModel.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    with open('attack_results_final_stop_100_AL_150.pkl', 'rb') as f:
        results = pickle.load(f)
    seqs = []
    lens = []
    tgts = []
    for i in range(len(results[1])):
        if np.array(results[1][i]).shape == ():
            continue
        seqs.append(' '.join(
            [dataset.inv_dict[j] for j in results[1][i].tolist() if j != 0]))
        lens.append(results[2][i])
        tgts.append(results[3][i])
    lens = torch.tensor(lens)
    tgts = torch.tensor(tgts)

    data_processed = pre_processing(seqs)
    tokenizer_select = args.tokenizer
    tokenizer_selection = tokenizer_select
    if tokenizer_selection.lower() != 'bert':
        data_processed.processing()
        train_sequences, test_sequences = data_processed.bert_indx(tokenizer)
        print('Self preprocessing')
    else:
        data_processed.bert_tokenize(tokenizer)
        test_sequences = data_processed.bert_indx(tokenizer)
        print(test_sequences[:10])
        print('BERT tokenizer')
    test_text_init = data_processed.numerical(tokenizer, test_sequences)

    max_len = max([len(s) for s in test_text_init])
    test_text = pad_sequences(test_text_init, maxlen=max_len, padding='post')
    all_test_data = TensorDataset(torch.tensor(test_text), lens, tgts)
    all_test_loader_bert = DataLoader(all_test_data,
                                      batch_size=128,
                                      shuffle=True)

    lstm_size = 128
    rnn_state_save = os.path.join(save_path, 'best_bert_0.7_0.001_bert_100')
    model = bert_lstm(
        bert, 2, False, nlayer, lstm_size, True, 0.7
    )  # batch_size=batch_size, embedding_matrix = embedding_matrix, hidden_size = lstm_size, kept_prob = 0.73, num_layers=2, bidirection=True)
    model.eval()
    model.load_state_dict(torch.load(rnn_state_save))
    model = model.to(device)

    model.eval()
    test_pred = torch.tensor([])
    test_targets = torch.tensor([])

    with torch.no_grad():
        for batch_index, (seqs, length,
                          target) in enumerate(all_test_loader_bert):
            seqs = seqs.type(torch.LongTensor)
            len_order = torch.argsort(length, descending=True)
            length = length[len_order]
            seqs = seqs[len_order]
            target = target[len_order]
            seqs, target, length = seqs.to(device), target.to(
                device), length.to(device)

            output, pred_out = model.pred(seqs, length, False)
            test_pred = torch.cat((test_pred, pred_out.cpu()), dim=0)
            test_targets = torch.cat(
                (test_targets, target.type(torch.float).cpu()))

        accuracy = model.evaluate_accuracy(test_pred.numpy(),
                                           test_targets.numpy())
    print('Test Accuracy:{:.4f}.'.format(accuracy))
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    tst_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/test.csv')
    trn_df['is_original'] = 1

    # clean texts
    trn_df = clean_data(trn_df, ['question_title', 'question_body', 'answer'])
    tst_df = clean_data(tst_df, ['question_title', 'question_body', 'answer'])

    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(
                    histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(
            list(
                itertools.chain.from_iterable(
                    fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
                    fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
                    fold_trn_df.answer.apply(lambda x: x.split(' '))))
        ).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        fobj = BCEWithLogitsLoss()
        state_dict = BertModel.from_pretrained(MODEL_PRETRAIN).state_dict()
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=len(LABEL_COL),
            config_path=MODEL_CONFIG_PATH,
            state_dict=state_dict,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            if epoch < 1:
                model.freeze_unfreeze_bert(freeze=True, logger=logger)
            else:
                model.freeze_unfreeze_bert(freeze=False, logger=logger)
            model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader,
                                       DEVICE)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader, DEVICE, mode='valid')

            scheduler.step()
            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [
                    trn_loss,
                ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(val_loss)
            else:
                histories['val_loss'][fold] = [
                    val_loss,
                ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(val_metric)
            else:
                histories['val_metric'][fold] = [
                    val_metric,
                ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(val_metric_raws)
            else:
                histories['val_metric_raws'][fold] = [
                    val_metric_raws,
                ]

            logging_val_metric_raws = ''
            for val_metric_raw in val_metric_raws:
                logging_val_metric_raws += f'{float(val_metric_raw):.4f}, '

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}', logger)
            model = model.to('cpu')
            model = model.module
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model,
                optimizer,
                scheduler,
                histories,
                val_y_preds,
                val_y_trues,
                val_qa_ids,
                fold,
                epoch,
                val_loss,
                val_metric,
            )
        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(
                histories["val_metric"][fold])])
        save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                                      trn_dataset.tokenizer,
                                      clean=False)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''
    for metric_stats_raw in fold_best_metrics_raws_mean:
        fold_raw_stats += f'{float(metric_stats_raw):.4f},'
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
Example #25
"""
Model constants and hyperparameter settings.
"""
import torch
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

# Set the device for running the model; a GPU is used automatically when available.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Name of the pre-trained BERT model used for the tokenizer and the encoder.
PRE_TRAINED_MODEL = "bert-base-cased"

# Define the BERT tokenizer here.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL)

# Define the BERT model.
bert = BertModel.from_pretrained(PRE_TRAINED_MODEL)

# Maximum input sequence length (in tokens).
SENTENCE_LENGTH = 128

# Random seed for model
RANDOM_SEED = 35

# Training test size 10%
training_test_size = 0.1

# Validation test size 50%
validation_test_size = 0.5

# Worker for data loader
WORKER = 4
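
AdamW and get_linear_schedule_with_warmup are imported above but not used in this excerpt; a minimal, hypothetical wiring (TOTAL_STEPS is an assumed placeholder, not from the original) could look like this:

TOTAL_STEPS = 1000  # hypothetical; in practice roughly len(train_loader) * num_epochs
optimizer = AdamW(bert.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=TOTAL_STEPS)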
import logging

import torch
from transformers import BertModel, BertTokenizer

logging.basicConfig(level=logging.INFO)

tokenizer = BertTokenizer.from_pretrained("../wwm/data/aishell_word_table.txt")
model = BertModel.from_pretrained("../wwm/data")


# Corpus
def get_contents():
    contents = []
    with open('../aishell_label/aishell_word_text.txt', 'r',
              encoding='utf-8') as fid:
        for line in fid:
            parts = line.strip().split(' ')
            utt_id = parts[0]
            text = parts[1:]
            contents.append(text)
    return contents


seq = get_contents()
tokens = tokenizer.tokenize(seq)
ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

# Vocabulary

# Fine-tuning
Example #27
 def get_bert(bert_option):
     model = BertModel.from_pretrained(bert_option)
     return model
Example #28
 def __init__(self, drop_rate=0.4, output_size=1):
     super().__init__()
     self.bert = BertModel.from_pretrained('bert-base-uncased')
     self.drop = torch.nn.Dropout(drop_rate)
     self.fc = torch.nn.Linear(768, output_size)  # 768 matches the dimensionality of the BERT output
Example #29
# %% Load library
import torch
from transformers import BertTokenizer, BertConfig, BertModel

# %% Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained("beomi/kcbert-base",
                                          do_lower_case=False)

# %% Initialize model
pt_model_config = BertConfig.from_pretrained("beomi/kcbert-base")
model = BertModel.from_pretrained("beomi/kcbert-base", config=pt_model_config)

# %% check configuration
print(pt_model_config)

# %% Create model inputs for the encoder
sentences = ["안녕하세요", "반갑습니다", "저의 이름은", "로이입니다"]
features = tokenizer(
    sentences,
    max_length=10,
    padding="max_length",
    truncation=True,
)

# %%
print(features.keys())
# %%
print(features['input_ids'])
# %%
print(features['token_type_ids'])
# %%
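# %% A possible next step, not part of the original example: run the encoder on
# the padded batch built above and inspect the output shape.
outputs = model(**{k: torch.tensor(v) for k, v in features.items()})
print(outputs.last_hidden_state.shape)  # expected: torch.Size([4, 10, 768])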
Example #30
import nltk
import torch

# use_cuda = config.use_gpu and torch.cuda.is_available()

if torch.cuda.is_available():
    device = torch.device(
        "cuda:0")  # a specific GPU can be selected here, e.g. cuda:1, cuda:2, ...
    print("Running on the GPU")
else:
    device = torch.device("cpu")
    print("Running on the CPU")

from transformers import BertModel, BertTokenizer, BertForQuestionAnswering

# Create an instance of BertModel
bert_model = BertModel.from_pretrained('bert-base-multilingual-cased')
bert_model = bert_model.to(device)
# Create an instance of the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

hidden_size = 768
hidden_dim = 50
layer_dim = 1
batch_size = 1
max_tot_len = 500
max_ques_len = 90
stride_len = 90


def prepare_data(para_tokens, ques_tokens, start, end):
    all_tokens_len = 0