Example #1
    def __init__(self, args) -> None:
        """Use ELM with fintuned language model for sentiment classification

        Args:
            args (dict): contain all the arguments needed.
                - model_name(str): the name of the transformer model
                - bsz(int): batch size
                - epoch: epochs to train
                - type(str): fintuned type
                  - base: train only ELM
                  - finetune_elm: train transformers with ELM directly
                  - finetune_classifier: train transformers with classifier
                  - finetune_classifier_elm: train transformers with classifier,
                    and use elm replace the classifier
                  - finetune_classifier_beta: train transformers with classifier,
                    and use pinv to calculate beta in classifier
                - learning_rate(float): learning_rate for finetuning
        """
        # load configuration
        self.model_name = args.get('model_name', 'bert-base-uncased')
        self.bsz = args.get('batch_size', 10)
        self.epoch = args.get('epoch_num', 2)
        self.learning_rate = args.get('learning_rate', 0.001)
        self.training_type = args.get('training_type', 'base')
        self.debug = args.get('debug', True)
        self.eval_epoch = args.get('eval_epoch', 1)
        self.lr_decay = args.get('learning_rate_decay', 0.99)
        if torch.cuda.is_available():
            device = torch.device('cuda')
        else:
            device = torch.device('cpu')
        self.device = device
        self.n_gpu = torch.cuda.device_count()

        # load pretrained model
        if (self.model_name == 'bert-base-uncased') or \
                (self.model_name == 'distilbert-base-uncased') or \
                (self.model_name == 'albert-base-v2'):
            self.pretrained_model = AutoModel.from_pretrained(self.model_name)
            self.pretrained_tokenizer = AutoTokenizer.from_pretrained(
                self.model_name)
            input_shape = 768
            output_shape = 256
        elif (self.model_name == 'prajjwal1/bert-tiny'):
            self.pretrained_model = AutoModel.from_pretrained(self.model_name)
            self.pretrained_tokenizer = AutoTokenizer.from_pretrained(
                self.model_name, model_max_length=512)
            input_shape = 128
            output_shape = 64
        elif self.model_name == 'voidful/albert_chinese_xxlarge':
            self.pretrained_model = AlbertForMaskedLM.from_pretrained(
                self.model_name)
            self.pretrained_tokenizer = BertTokenizer.from_pretrained(
                self.model_name)
            input_shape = 768
            output_shape = 256
        else:
            raise TypeError("Unsupported model name")
        self.pretrained_model.to(device)
        device_ids = None
        if self.n_gpu > 1:
            device_ids = range(torch.cuda.device_count())
            self.pretrained_model = DP(self.pretrained_model,
                                       device_ids=device_ids)

        # load specific model
        if (self.training_type == 'finetune_classifier') or \
            (self.training_type == 'finetune_classifier_elm'):
            self.classifier = torch.nn.Sequential(
                torch.nn.Linear(input_shape, 2))
            self.loss_func = torch.nn.CrossEntropyLoss()
            self.classifier.to(device)
            if self.n_gpu > 1:
                self.classifier = DP(self.classifier, device_ids=device_ids)
        if (self.training_type == 'base') or \
            (self.training_type == 'finetune_classifier_elm'):
            self.elm = classic_ELM(input_shape, output_shape)
        if (self.training_type == 'finetune_classifier_linear'):
            self.elm = classic_ELM(None, None)
            self.classifier = torch.nn.Sequential(
                OrderedDict([
                    ('w', torch.nn.Linear(input_shape, output_shape)),
                    ('act', torch.nn.Sigmoid()),
                    ('beta', torch.nn.Linear(output_shape, 2)),
                ]))
            self.loss_func = torch.nn.CrossEntropyLoss()
            self.classifier.to(device)
            if self.n_gpu > 1:
                self.classifier = DP(self.classifier, device_ids=device_ids)

        # load processor, trainer, evaluator, inferer.
        processors = {
            'base': self.__processor_base__,
            'finetune_classifier': self.__processor_base__,
            'finetune_classifier_elm': self.__processor_base__,
            'finetune_classifier_linear': self.__processor_base__,
        }
        trainers = {
            'base':
            self.__train_base__,
            'finetune_classifier':
            self.__train_finetune_classifier__,
            'finetune_classifier_elm':
            self.__train_finetune_classifier_elm__,
            'finetune_classifier_linear':
            self.__train_finetune_classifier_linear__,
        }
        evaluators = {
            'base': self.__eval_base__,
            'finetune_classifier': self.__eval_finetune_classifier__,
            'finetune_classifier_elm': self.__eval_base__,
            'finetune_classifier_linear':
            self.__eval_finetune_classifier_linear__,
        }
        inferers = {
            'base': self.__infer_base__,
            'finetune_classifier': self.__infer_finetune_classifier__,
            'finetune_classifier_elm': self.__infer_finetune_classifier_elm__,
            'finetune_classifier_linear': self.__infer_base__
        }
        self.processor = processors[self.training_type]
        self.trainer = trainers[self.training_type]
        self.evaluator = evaluators[self.training_type]
        self.inferer = inferers[self.training_type]
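
# Hedged usage sketch (not part of the original snippet): the enclosing class name is
# not shown above, so `SentimentELM` below is only a placeholder for it.
args = {
    'model_name': 'bert-base-uncased',
    'batch_size': 16,
    'epoch_num': 3,
    'training_type': 'finetune_classifier_elm',
    'learning_rate': 2e-5,
}
# model = SentimentELM(args)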





import json
import torch
import time
from BiencoderRanker4 import BiencoderRanker
from mention_detection.mention_data_proc_all import ReadTrainDectMent,IterData
from mention_detection.utils import *
from transformers import BertTokenizer
from faiss_indexer import DenseFlatIndexer
#torch.cuda.set_device(1)
tokenizer=BertTokenizer.from_pretrained('./model/bert-large-uncased')

biencoder_params=json.load(open('./model/biencoder/wiki_encoder_large2.json'))
with torch.no_grad():
    ranker=BiencoderRanker(biencoder_params)
    ranker.load_state_dict(torch.load('./model/mybiencoder_wiki.bin'))
    for params in ranker.parameters():
        params.requires_grad=False
    ranker=ranker.to('cpu')

trainfile_path='./Data/train.jsonl'
train_data=ReadTrainDectMent(trainfile_path,True)
train_data.padding()
batch_size=32
dataload = IterData(train_data, batch_size, True, True)
top_k=1
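
# Hedged sketch (assumption, not from the original script): the DenseFlatIndexer imported
# above wraps a flat FAISS index, so top-k candidate retrieval boils down to something
# like the raw-faiss calls below. Vector shapes and the float32 dtype are placeholders.
import numpy as np
import faiss

cand_vecs = np.random.rand(1000, 1024).astype('float32')   # placeholder entity embeddings
query_vecs = np.random.rand(4, 1024).astype('float32')     # placeholder mention embeddings
index = faiss.IndexFlatIP(cand_vecs.shape[1])               # inner-product similarity
index.add(cand_vecs)
scores, ids = index.search(query_vecs, top_k)               # ids has shape (num_queries, top_k)
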
Example #3
                    tmp = [rewards[i]]
                else:
                    tmp.append(rewards[i])
            sent_rewards.append(sum(tmp) / len(tmp))

            token_rewards = []
            for _ in gpt_mapping:
                token_rewards.append(sent_rewards[_])

            return token_rewards

        model.load_state_dict(torch.load(args.load_from))
        print("loading from {}".format(args.load_from))
        model.train()

        bert_tokenizer = BertTokenizer.from_pretrained(args.modelpath)
        scorer = BERTGen(bert_tokenizer.vocab_size, args.dim, args.layers,
                         args.head, args.modelpath)
        scorer.to(args.device)
        scorer.load_state_dict(torch.load('models/BERT_scorer_ep9.pt'))
        scorer.eval()

        optimizer = optim.Adam(model.parameters(), 5e-7)

        avg_loss = 0
        for epoch_idx in range(args.epoch):
            print("start training {}th epoch".format(epoch_idx))
            dataset.shuffle()
            for idx in range(0, dataset.train_len()):
                batch = dataset.get_data(idx, details=True)
                table, sub_columns, title = batch[4:]
logging.getLogger('transformers.tokenization_utils').disabled = True
import numpy as np
import json
import pickle
import datetime
# import spacy
# from allennlp.commands.elmo import ElmoEmbedder
torch.cuda.is_available()


tokenizer_gpt2 = GPT2Tokenizer.from_pretrained('gpt2')
model_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2', output_hidden_states=True)
model_gpt2.eval()
model_gpt2.to('cuda')

tokenizer_bert = BertTokenizer.from_pretrained('bert-base-cased')
model_bert = BertModel.from_pretrained('bert-base-cased')
model_bert.eval()
model_bert.to('cuda')

tokenizer_gpt = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model_gpt = OpenAIGPTModel.from_pretrained('openai-gpt')
model_gpt.eval()
model_gpt.to('cuda')

# weat 1
flowers = ['aster', 'clover', 'hyacinth', 'marigold', 'poppy', 'azalea', 'crocus', 'iris', 'orchid', 'rose',
           'bluebell', 'daffodil', 'lilac', 'pansy', 'tulip', 'buttercup', 'daisy', 'lily', 'peony', 'violet',
           'carnation', 'magnolia', 'petunia', 'zinnia', 'gladiola']  # 'gladiola' deleted since it does not appear
insects = ['ant', 'caterpillar', 'flea', 'locust', 'spider', 'bedbug', 'centipede', 'fly', 'maggot', 'tarantula',
           'bee', 'cockroach', 'gnat', 'mosquito', 'termite', 'beetle', 'cricket', 'hornet', 'moth', 'wasp',
           'dragonfly', 'horsefly', 'roach', 'weevil', 'blackfly']  # 'blackfly' deleted for symmetry since it only appears once
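
# Hedged sketch (not part of the original snippet): one common way to turn each WEAT
# word into a single vector with the BERT model loaded above is to mean-pool the last
# hidden state over the word's subword tokens (special tokens dropped). Whether the
# original experiment pooled this way is an assumption.
def embed_word_bert(word):
    input_ids = tokenizer_bert.encode(word, return_tensors='pt').to('cuda')
    with torch.no_grad():
        last_hidden = model_bert(input_ids)[0]   # (1, seq_len, hidden_size)
    return last_hidden[0, 1:-1].mean(dim=0)      # drop [CLS]/[SEP], average subword vectors

# e.g. flower_vecs = torch.stack([embed_word_bert(w) for w in flowers])
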
def get_estimator(max_len=20,
                  epochs=10,
                  batch_size=64,
                  train_steps_per_epoch=None,
                  eval_steps_per_epoch=None,
                  save_dir=tempfile.mkdtemp(),
                  pretrained_model='bert-base-uncased',
                  data_dir=None):
    # step 1 prepare data
    train_data, eval_data, data_vocab, label_vocab = mitmovie_ner.load_data(
        root_dir=data_dir)
    tokenizer = BertTokenizer.from_pretrained(pretrained_model,
                                              do_lower_case=True)
    tag2idx = char2idx(label_vocab)
    pipeline = fe.Pipeline(train_data=train_data,
                           eval_data=eval_data,
                           batch_size=batch_size,
                           ops=[
                               Tokenize(inputs="x",
                                        outputs="x",
                                        tokenize_fn=tokenizer.tokenize),
                               WordtoId(
                                   inputs="x",
                                   outputs="x",
                                   mapping=tokenizer.convert_tokens_to_ids),
                               WordtoId(inputs="y",
                                        outputs="y",
                                        mapping=tag2idx),
                               PadSequence(max_len=max_len,
                                           inputs="x",
                                           outputs="x"),
                               PadSequence(max_len=max_len,
                                           value=len(tag2idx),
                                           inputs="y",
                                           outputs="y"),
                               AttentionMask(inputs="x", outputs="x_masks")
                           ])

    # step 2. prepare model
    model = fe.build(
        model_fn=lambda: ner_model(max_len, pretrained_model, label_vocab),
        optimizer_fn=lambda: tf.optimizers.Adam(1e-5))
    network = fe.Network(ops=[
        ModelOp(model=model, inputs=["x", "x_masks"], outputs="y_pred"),
        Reshape(inputs="y", outputs="y", shape=(-1, )),
        Reshape(inputs="y_pred",
                outputs="y_pred",
                shape=(-1, len(label_vocab) + 1)),
        CrossEntropy(inputs=("y_pred", "y"), outputs="loss"),
        UpdateOp(model=model, loss_name="loss")
    ])

    traces = [
        Accuracy(true_key="y", pred_key="y_pred"),
        BestModelSaver(model=model, save_dir=save_dir)
    ]

    # step 3 prepare estimator
    estimator = fe.Estimator(network=network,
                             pipeline=pipeline,
                             epochs=epochs,
                             traces=traces,
                             train_steps_per_epoch=train_steps_per_epoch,
                             eval_steps_per_epoch=eval_steps_per_epoch)

    return estimator
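
# Hedged usage sketch (not in the original): FastEstimator drives training and
# evaluation through the estimator's standard fit/test methods, e.g.
#     est = get_estimator(epochs=2, batch_size=32)
#     est.fit()
#     est.test()
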
def extractor():
    tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
    device = torch.device(cuda_num if torch.cuda.is_available() else "cpu")

    model = BertForTokenClassification.from_pretrained(
        PRETRAINED_MODEL_NAME, num_labels=len(tag2idx))

    model = model.to(device)
    model.load_state_dict(
            torch.load(os.path.join(ModelName.format(model_idx),
                'pytorch_model.bin'), map_location="cpu"))

    model.eval()

    def predict(doc):

        names, docs = [], []
        predset = NameDoc(tokenizer=tokenizer, doc=doc)
        dataloader = torch.utils.data.DataLoader(
                predset,batch_size=1,shuffle=True,collate_fn=create_mini_batch)

        with torch.no_grad():

            for tokens, *data in dataloader:

                if next(model.parameters()).is_cuda:
                    data = [t.to(device) for t in data if t is not None]

                tokens_tensors, segments_tensors, masks_tensors, labels = data


                outputs = model(input_ids=tokens_tensors,
                                token_type_ids=None,
                                # token_type_ids=segments_tensors,
                                attention_mask=masks_tensors)

                logits = outputs[0]

                logits = torch.argmax(F.log_softmax(logits,dim=2),dim=2)
                logits = logits.detach().cpu().numpy()

                # Only predict on real tokens; positions with attention mask 0 are not scored
                masks_tensors = masks_tensors.to('cpu').numpy()

                tr = lambda e: ','.join(e) if len(e) == 2 and all_english(''.join(e)) else ''.join(e)

                for i,mask in enumerate(masks_tensors):
                    name, doc = [], ''
                    names.append([])
                    for j, m in enumerate(mask):
                        if m:
                            if logits[i][j] not in (tag2idx['[CLS]'], tag2idx['[SEP]']):
                                doc += tokens[i][j - 1]
                            if logits[i][j] == tag2idx['B-per']:
                                if name:
                                    names[-1].append(tr(name))
                                    name = []
                                name.append(tokens[i][j - 1])
                            elif logits[i][j] == tag2idx['I-per']:
                                name.append(tokens[i][j - 1])
                            elif name:
                                names[-1].append(tr(name))
                                name = []
                        else:
                            break
                    if name: names[-1].append(tr(name))
                    docs.append(doc)

        # need to filter the names from the doc and run classification again
        return names, docs

    nft = namefilter()
    def _ext(doc):
        names, docs = predict(doc)
        print('original names', names)
        return nft(list(set().union(*names)), ''.join(docs))

    return _ext
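
# Hedged usage sketch (not in the original): extractor() returns a callable that takes a
# raw document string, runs token classification, and passes the detected names through
# namefilter(), e.g.
#     ext = extractor()
#     result = ext('some raw document text')
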
Example #7
parser.add_argument('--dictionary',
                    default=None,
                    type=str,
                    help='dictionary path')
args = parser.parse_args()

logger = create_logger(config.root_path + '/logs/main.log')

if __name__ == '__main__':
    model_name = args.model

    x = import_module('models.' + model_name)
    if model_name in ['bert', 'xlnet', 'roberta']:
        config.bert_path = config.root_path + '/model/' + model_name + '/'
        if 'bert' in model_name:
            config.tokenizer = BertTokenizer.from_pretrained(config.bert_path)
        elif 'xlnet' in model_name:
            config.tokenizer = XLNetTokenizer.from_pretrained(config.bert_path)
        elif 'roberta' in model_name:
            config.tokenizer = RobertaTokenizer.from_pretrained(
                config.bert_path)
        else:
            raise NotImplementedError

        config.save_path = config.root_path + '/model/saved_dict/' + model_name + '.ckpt'  # path of the trained model checkpoint
        config.log_path = config.root_path + '/logs/' + model_name
        config.hidden_size = 768
        config.eps = 1e-8
        config.gradient_accumulation_steps = 1
        config.word = True
        config.max_length = 400
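
        # Hedged illustration (not in the original): with the tokenizer and limits configured
        # above, raw text would be encoded along the lines of
        #     ids = config.tokenizer.encode(text, max_length=config.max_length, truncation=True)
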
Example #8
        default=True,
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument(
        "--max_seq_length",
        default=512,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences longer "
        "than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument("--batch_size",
                        default=15,
                        type=int,
                        help="Batch size for predictions.")
    parser.add_argument("--full_data", type=str, required=True)
    parser.add_argument('--tokenizer_path', type=str, required=True)

    args = parser.parse_args()
    tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path)
    examples = read_examples(full_file=args.full_data)
    with gzip.open(args.example_output, 'wb') as fout:
        pickle.dump(examples, fout)

    features = convert_examples_to_features(examples,
                                            tokenizer,
                                            max_seq_length=512,
                                            max_query_length=50)
    with gzip.open(args.feature_output, 'wb') as fout:
        pickle.dump(features, fout)
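
    # Hedged sketch (not in the original): the gzip-pickled features written above can be
    # reloaded the same way for later stages.
    with gzip.open(args.feature_output, 'rb') as fin:
        reloaded_features = pickle.load(fin)
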
    per_device_eval_batch_size=batch_size_per_gpu,
    save_steps=-1,
    evaluate_during_training=True,
    output_dir=model_path,
    overwrite_output_dir=another_version,
    do_train=True,
    do_eval=True,
    do_predict=True,
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
)
set_seed(training_args.seed)

# Data Preprocess
tokenizer = BertTokenizer.from_pretrained(
    tokenizer_path,
    cache_dir=tokenizer_path,
)
tokenizer.save_vocabulary(tokenizer_path)
tokenizer.save_pretrained(tokenizer_path)

train_dataset = (MultipleChoiceDataset(
    data_dir=data_path,
    tokenizer=tokenizer,
    task=task_name,
    max_seq_length=max_seq_length,
    overwrite_cache=overwrite_tokenizer,
    mode=Split.train,
) if training_args.do_train else None)

eval_dataset = (MultipleChoiceDataset(
    data_dir=data_path,
Example #10
    datafile = args.datafile
    data_col = 0
    label_col = int(args.label_col)
    max_len = 100
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    acc_cum = 0
    rec_cum = 0
    pre_cum = 0
    f1_cum = 0
    f1_cum_mic = 0
    acc_arr = []
    rec_arr = []
    pre_arr = []
    f1_arr = []
    f1_arr_mic = []
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True,
                                              add_special_tokens=True,
                                              max_length=max_len,
                                              pad_to_max_length=True)

    #------------------------------------------------------------------------------------------------
    text_data, labels = prepare_dataset(datafile, data_col, label_col, "word-based")

    print("Number of Examples: ", len(text_data))

    encoder = LabelEncoder()
    encoder.fit(labels)
    encoded_labels = encoder.transform(labels)
    class_weights_labels = class_weight.compute_class_weight('balanced',
                                                 np.unique(encoded_labels),
                                                 encoded_labels)

    num_classes = len(list(encoder.classes_))
    print("num_classes: ", num_classes)
    def __init__(self, reviews, targets, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = BertTokenizer.from_pretrained(
            'cl-tohoku/bert-base-japanese')
        self.max_len = max_len
    print(len(all_documents))

    for document_index in tqdm(range(len(all_documents))):
        instances.extend(
            create_instances_from_document(all_documents, document_index,
                                           max_seq_len, short_seq_prob,
                                           max_ngram, masked_lm_prob,
                                           max_predictions_per_seq,
                                           vocab_words))
    random.shuffle(instances)
    return instances


if __name__ == '__main__':
    # input_file = './corpus/pro_data.txt'
    tokenizer = BertTokenizer(vocab_file='./bert_base_pretrain/vocab.txt',
                              do_lower_case=True)
    max_seq_len = 512
    short_seq_prob = 0.3
    max_ngram = 3
    masked_lm_prob = 0.15
    max_predictions_per_seq = 20

    file_list = ['./data/train.data']
    for i, input_file in enumerate(file_list):
        print('Processing data for file {}'.format(i))
        with open('./data/processed_data{}.json'.format(i),
                  'w',
                  encoding='utf8') as f:
            file_examples = create_training_instances(
                input_file, tokenizer, max_seq_len, short_seq_prob, max_ngram,
                masked_lm_prob, max_predictions_per_seq)
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Set device
    os.environ["CUDA_VISIBLE_DEVICES"] = args['cuda']
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    num_labels = 3 if task == 'c' else 2

    # Set tokenizer for different models
    if model_name == 'bert':
        if task == 'all':
            model = MTL_Transformer_LSTM(model_name, model_size, args=args)
        else:
            model = BERT(model_size, args=args, num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(f'bert-{model_size}-uncased')
    elif model_name == 'roberta':
        if task == 'all':
            model = MTL_Transformer_LSTM(model_name, model_size, args=args)
        else:
            model = RoBERTa(model_size, args=args, num_labels=num_labels)
        tokenizer = RobertaTokenizer.from_pretrained(f'roberta-{model_size}')
    elif model_name == 'bert-gate' and task == 'all':
        model_name = model_name.replace('-gate', '')
        model = GatedModel(model_name, model_size, args=args)
        tokenizer = BertTokenizer.from_pretrained(f'bert-{model_size}-uncased')
    elif model_name == 'roberta-gate' and task == 'all':
        model_name = model_name.replace('-gate', '')
        model = GatedModel(model_name, model_size, args=args)
        tokenizer = RobertaTokenizer.from_pretrained(f'roberta-{model_size}')
def RBERTQ1_data_preprocessor(input_file, csv_output_file, features_output_file):

    input_train_data = input_file
    max_sentence_len = 512
    final_data_list = []
    sentence_count = 0
    sentence_list = []
    exception_count = 0
    
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    
    print("Started data preprocessing")
    
    with open(input_train_data) as reader_file:
      for sentence in reader_file:
        #if sentence_count == 5:
          #break
        try:
            input_data = []
        
            # Split the sentence into tokens with BERT tokenizer
            splited_text = sentence.split('\t')
            query_tokenized_text = tokenizer.tokenize(splited_text[1]) 
            tokenized_text = tokenizer.tokenize(splited_text[2])
            #print(tokenized_text)
            sentence_list.append((splited_text[2], splited_text[3]))
            ent1_pos_st = tokenized_text.index('$')
            ent1_pos_end = tokenized_text.index('$', ent1_pos_st+1)
            
            ent2_pos_st = tokenized_text.index('#')
            ent2_pos_end = tokenized_text.index('#', ent2_pos_st+1)
            #print(ent1_pos_st, ent1_pos_end, ent2_pos_st, ent2_pos_end)
        
            if len(query_tokenized_text) > max_sentence_len:
              query_tokenized_text = query_tokenized_text[:max_sentence_len] # If the length of the sentence is more than max length then truncate
        
        
            if len(tokenized_text) > max_sentence_len:
              tokenized_text = tokenized_text[:max_sentence_len] # If the length of the sentence is more than max length then truncate
        
            # Map the token strings to their vocabulary indices.
            query_indexed_tokens = tokenizer.convert_tokens_to_ids(query_tokenized_text)
            # Mark each of the tokens as belonging to sentence "0".
            query_segments_ids = [0] * len(query_tokenized_text)
        
            # Mask the sentence tokens with 1
            query_att_mask = [1] * len(query_indexed_tokens)
        
            # padding the rest of the sequence length
            query_padding_len = max_sentence_len - len(query_indexed_tokens)
        
            # Add the padded token to the indexed tokens
            query_indexed_tokens = query_indexed_tokens + [0]*query_padding_len
        
            # Mask the padded tokens with 0
            query_att_mask = query_att_mask + [0]*query_padding_len
        
            # Mark the padded tokens as belonging to sentence "0"
            query_segments_ids = query_segments_ids + [0]*query_padding_len
        
        
        
            # Map the token strings to their vocabulary indices.
            indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
            # Mark each of the tokens as belonging to sentence "0".
            segments_ids = [0] * len(tokenized_text)
            
            # Mask the sentence tokens with 1
            att_mask = [1] * len(indexed_tokens)
            
            # padding the rest of the sequence length
            padding_len = max_sentence_len - len(indexed_tokens)
        
            # Add the padded token to the indexed tokens
            indexed_tokens = indexed_tokens + [0]*padding_len
        
            # Mask the padded tokens with 0
            att_mask = att_mask + [0]*padding_len
        
            # Mark the padded tokens as belonging to sentence "0"
            segments_ids = segments_ids + [0]*padding_len
        
            # Initialize entity masks
            ent1_mask = [0]*len(att_mask)
            ent2_mask = [0]*len(att_mask)
        
            # Mark the entity masks with 1 in the entity positions
            for ent1_ind in range(ent1_pos_st+1, ent1_pos_end):
              ent1_mask[ent1_ind] = 1
            #print(ent1_mask)
        
            for ent2_ind in range(ent2_pos_st+1, ent2_pos_end):
              ent2_mask[ent2_ind] = 1
        
            input_data.append(indexed_tokens)
            input_data.append(segments_ids)
            input_data.append(att_mask)
            input_data.append(ent1_mask)
            input_data.append(ent2_mask)
            input_data.append(query_indexed_tokens)
            input_data.append(query_segments_ids)
            input_data.append(query_att_mask)
            input_data.append([int(splited_text[3])])
            input_data.append([int(splited_text[0])])
            input_data.append([splited_text[4]])
        
        
            final_data_list.append(input_data)
            sentence_count += 1
            print("sentence count : %d " % sentence_count)
        except ValueError:
            exception_count += 1
            print("exception count : %d " % exception_count)
        except Exception:
            exception_count += 1
            print("general exception")
            print("exception count : %d " % exception_count)
    
    #print("The sentence count is %d" % sentence_count)
    # if os.path.exists(features_file):
    #   print('in if')
    #   final_data_list = torch.load(features_output_file)
    # else:
    torch.save(final_data_list, features_output_file)
    writer = csv.writer(open(csv_output_file, 'w'))
    writer.writerows(final_data_list)
      
    
    # indexed_tokens_tensor = torch.tensor([ind_tokens[0] for ind_tokens in final_data_list])
    # segment_ids_tensor = torch.tensor([seg_ids[1] for seg_ids in final_data_list])
    # att_mask_tensor = torch.tensor([attn[2] for attn in final_data_list])
    # ent1_mask_tensor = torch.tensor([ent1_mask[3] for ent1_mask in final_data_list])
    # ent2_mask_tensor = torch.tensor([ent2_mask[4] for ent2_mask in final_data_list])
    # query_indexed_tokens_tensor = torch.tensor([q_ind_tokens[5] for q_ind_tokens in final_data_list])
    # query_segment_ids_tensor = torch.tensor([q_seg_ids[6] for q_seg_ids in final_data_list])
    # query_att_mask_tensor = torch.tensor([q_attn[7] for q_attn in final_data_list])
    # labels_tensor = torch.tensor([labels[8] for labels in final_data_list])
    # seqid_tensor = torch.tensor([seqid[9] for seqid in final_data_list])
    
    
    # #print(ent1_mask_tensor.shape)
    # print("Finished Data Preprocessing")
    
    # final_dataset = torch.utils.data.TensorDataset(
    #     indexed_tokens_tensor,
    #     segment_ids_tensor,
    #     att_mask_tensor,
    #     ent1_mask_tensor,
    #     ent2_mask_tensor,
    #     query_indexed_tokens_tensor,
    #     query_segment_ids_tensor,
    #     query_att_mask_tensor,
    #     labels_tensor,
    #     seqid_tensor
    # )
    # return final_dataset
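
# Hedged usage sketch (not in the original; the file names below are placeholders):
#     RBERTQ1_data_preprocessor('relations.tsv', 'features.csv', 'features.pt')
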
Example #15
    def __init__(self, split_name='validation'):
        self.splitname = split_name
        self.sen1, self.sen2, self.label = self.read_tsv(self.splitname)
        self.tokenizer = BertTokenizer.from_pretrained(
            scitailConfig.tokenizer_name)
Example #16
print()
print("Number of GPUs: ", n_gpu)
print()

batch_size = batch_size // gradient_accumulation_steps

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if n_gpu > 0:
    torch.cuda.manual_seed_all(seed)

if args.local_rank not in [-1, 0]:
    torch.distributed.barrier()

tokenizer = BertTokenizer.from_pretrained(args.output_dir,
                                          do_lower_case=do_lower_case)
model_qa = BertQA.from_pretrained(args.output_dir)
model_qa.to(device)

dev_features = bert_utils.convert_examples_to_features(dev_InputExamples,
                                                       MAX_SEQ_LENGTH,
                                                       tokenizer)
all_input_ids = torch.tensor([f.input_ids for f in dev_features],
                             dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in dev_features],
                              dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in dev_features],
                               dtype=torch.long)
all_start_positions = torch.tensor([f.start_label_ids for f in dev_features],
                                   dtype=torch.long)
all_end_positions = torch.tensor([f.end_label_ids for f in dev_features],
                                 dtype=torch.long)


if __name__ == "__main__":
    tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
    device = torch.device(cuda_num if torch.cuda.is_available() else "cpu")

    model = BertForTokenClassification.from_pretrained(
        PRETRAINED_MODEL_NAME, num_labels=len(tag2idx))

    model = model.to(device)

    # additional
    max_grad_norm = 1.0
    FULL_FINETUNING = True
    if FULL_FINETUNING:
        # Fine tune model all layer parameters
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
Example #18
def main():
    args = setup_train_args()
    # Log to both a file and the console
    global logger
    logger = create_logger(args)
    # Use the GPU when the user requests it and it is available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    # Seed the CPU RNG so that results are deterministic.
    # Seed the current GPU as well; with multiple GPUs, use torch.cuda.manual_seed_all() to seed all of them.
    # Seeding makes good results reproducible.
    if args.seed:
        set_random_seed(args)

    # Select which GPUs to use for training
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    # Initialize the tokenizer
    tokenizer = BertTokenizer(vocab_file=args.vocab_path)
    # Vocabulary size of the tokenizer
    vocab_size = len(tokenizer)

    global pad_id
    pad_id = tokenizer.convert_tokens_to_ids(PAD)

    # Create the output directory for the lyric (dialogue) model
    if not os.path.exists(args.lyric_model_output_path):
        os.mkdir(args.lyric_model_output_path)
    # Create the output directory for the MMI model
    if not os.path.exists(args.mmi_model_output_path):
        os.mkdir(args.mmi_model_output_path)
    # Load the GPT-2 model
    model, n_ctx = create_model(args, vocab_size)
    model.to(device)
    # Preprocess the raw data: convert the raw corpus into token ids
    if args.raw and args.train_mmi:  # training the MMI model
        preprocess_mmi_raw_data(args, tokenizer, n_ctx)
    elif args.raw and not args.train_mmi:  # training the dialogue generation model
        preprocess_raw_data(args, tokenizer, n_ctx)
    # Whether to use multiple GPUs for parallel training
    multi_gpu = False
    if args.cuda and torch.cuda.device_count() > 1:
        logger.info("Let's use GPUs to train")
        model = DataParallel(
            model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True
    # Count the number of model parameters
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    logger.info('number of model parameters: {}'.format(num_parameters))

    # Load the data
    logger.info("loading training data")
    if args.train_mmi:  # training the MMI model
        with open(args.train_mmi_tokenized_path, "r", encoding="utf8") as f:
            data = f.read()
    else:  # training the generation model
        with open(args.train_tokenized_path, "r", encoding="utf8") as f:
            data = f.read()
    data_list = data.split("\n")
    train_list, test_list = train_test_split(data_list,
                                             test_size=0.1,
                                             random_state=1)
    # Start training
    train(model, device, train_list, multi_gpu, args)
    # Evaluate the model
    evaluate(model, device, test_list, multi_gpu, args)
def train_and_test(data_dir, bert_model="bert-base-uncased", task_name=None,
                   output_dir=None, max_seq_length=80, do_train=False, do_eval=False, do_lower_case=False,
                   train_batch_size=24, eval_batch_size=8, learning_rate=2e-5, num_train_epochs=15,
                   warmup_proportion=0.1,no_cuda=False, local_rank=-1, seed=42, gradient_accumulation_steps=1,
                   optimize_on_cpu=False, fp16=False, loss_scale=128, saved_model=""):
    


    # ## Required parameters
    # parser.add_argument("--data_dir",
    #                     default=None,
    #                     type=str,
    #                     required=True,
    #                     help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    # parser.add_argument("--bert_model", default=None, type=str, required=True,
    #                     help="Bert pre-trained model selected in the list: bert-base-uncased, "
    #                          "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    # parser.add_argument("--task_name",
    #                     default=None,
    #                     type=str,
    #                     required=True,
    #                     help="The name of the task to train.")
    # parser.add_argument("--output_dir",
    #                     default=None,
    #                     type=str,
    #                     required=True,
    #                     help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    # parser.add_argument("--max_seq_length",
    #                     default=128,
    #                     type=int,
    #                     help="The maximum total input sequence length after WordPiece tokenization. \n"
    #                          "Sequences longer than this will be truncated, and sequences shorter \n"
    #                          "than this will be padded.")
    # parser.add_argument("--do_train",
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to run training.")
    # parser.add_argument("--do_eval",
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to run eval on the dev set.")
    # parser.add_argument("--do_lower_case",
    #                     default=False,
    #                     action='store_true',
    #                     help="Set this flag if you are using an uncased model.")
    # parser.add_argument("--train_batch_size",
    #                     default=32,
    #                     type=int,
    #                     help="Total batch size for training.")
    # parser.add_argument("--eval_batch_size",
    #                     default=8,
    #                     type=int,
    #                     help="Total batch size for eval.")
    # parser.add_argument("--learning_rate",
    #                     default=5e-5,
    #                     type=float,
    #                     help="The initial learning rate for Adam.")
    # parser.add_argument("--num_train_epochs",
    #                     default=3.0,
    #                     type=float,
    #                     help="Total number of training epochs to perform.")
    # parser.add_argument("--warmup_proportion",
    #                     default=0.1,
    #                     type=float,
    #                     help="Proportion of training to perform linear learning rate warmup for. "
    #                          "E.g., 0.1 = 10%% of training.")
    # parser.add_argument("--no_cuda",
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether not to use CUDA when available")
    # parser.add_argument("--local_rank",
    #                     type=int,
    #                     default=-1,
    #                     help="local_rank for distributed training on gpus")
    # parser.add_argument('--seed',
    #                     type=int,
    #                     default=42,
    #                     help="random seed for initialization")
    # parser.add_argument('--gradient_accumulation_steps',
    #                     type=int,
    #                     default=1,
    #                     help="Number of updates steps to accumulate before performing a backward/update pass.")
    # parser.add_argument('--optimize_on_cpu',
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to perform optimization and keep the optimizer averages on CPU")
    # parser.add_argument('--fp16',
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to use 16-bit float precision instead of 32-bit")
    # parser.add_argument('--loss_scale',
    #                     type=float, default=128,
    #                     help='Loss scaling, positive power of 2 values can improve fp16 convergence.')

    # args = parser.parse_args()

    
    processors = {
#         "cola": ColaProcessor,
#         "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "stance":StanceProcessor,
        "neg":NegProcessor,
        "tri": TriProcessor
    }

    if local_rank == -1 or no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if fp16:
            logger.info("16-bits training currently not supported in distributed training")
            fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(local_rank != -1))

    if gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            gradient_accumulation_steps))

    train_batch_size = int(train_batch_size / gradient_accumulation_steps)

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    if not do_train and not do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if do_train:
#         if os.path.exists(output_dir) and os.listdir(output_dir):
        if os.path.exists(output_dir):
            pass
#             raise ValueError("Output directory ({}) already exists and is not empty.".format(output_dir))
        else:
            os.makedirs(output_dir, exist_ok=True)

    task_name = task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

#     tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    

    train_examples = None
    num_train_steps = None
    if do_train:
        
        train_df = processor.get_train_df(data_dir)
        test_df = processor.get_test_df(data_dir)
        dev_df = processor.get_dev_df(data_dir)
        
        new_train_df = generate_opp_pers_dataset_not_elim(train_df)
        
        new_train_df.to_csv(os.path.join(data_dir, "tri_train.tsv"),sep='\t',index=False)
        
        new_test_df = generate_opp_pers_dataset_not_elim(test_df)
        
        new_test_df.to_csv(os.path.join(data_dir, "tri_test.tsv"),sep='\t',index=False)
        
        new_dev_df = generate_opp_pers_dataset_not_elim(dev_df)
        
        new_dev_df.to_csv(os.path.join(data_dir, "tri_dev.tsv"),sep='\t',index=False)
        
        train_examples = processor.get_train_examples(data_dir)
        
        num_train_steps = int(
            len(train_examples) / train_batch_size / gradient_accumulation_steps * num_train_epochs)

    # Prepare model
#     model = BertForSequenceClassification.from_pretrained(bert_model,
#                 cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(local_rank), num_labels = 2)

        model = BertForConsistencyCueClassification.from_pretrained('bert-base-uncased', num_labels=2)
        model.to(device)
        
        if fp16:
            model.half()

        if local_rank != -1:
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank],
                                                              output_device=local_rank)
        elif n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Prepare optimizer
        if fp16:
            param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_())
                               for n, param in model.named_parameters()]
        elif optimize_on_cpu:
            param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_())
                               for n, param in model.named_parameters()]
        else:
            param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
            ]
        t_total = num_train_steps
#     print(t_total)
    if local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if do_train:
        optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=t_total)

    global_step = 0
    if do_train:

        claim_features = convert_claims_to_features(train_examples, label_list, max_seq_length, tokenizer)
        logger.info("claims features done")
        train_features = convert_pers_to_features(train_examples, label_list, max_seq_length, tokenizer)
        logger.info("perspective features done")
#         opposite_claim_features = convert_opp_claims_to_features(train_examples, label_list, max_seq_length, tokenizer)
#         logger.info("opposite claim features done")
        opposite_perspective_features = convert_triopp_pers_to_features(train_examples, label_list, max_seq_length, tokenizer)
        logger.info("opp perspective features done")

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        
            
        pers_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        pers_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        pers_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        pers_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

        claims_input_ids = torch.tensor([f.input_ids for f in claim_features], dtype=torch.long)
        claims_input_mask = torch.tensor([f.input_mask for f in claim_features], dtype=torch.long)
        claims_segment_ids = torch.tensor([f.segment_ids for f in claim_features], dtype=torch.long)
        claims_label_ids = torch.tensor([f.label_id for f in claim_features], dtype=torch.long)
        
        
        opp_pers_input_ids = torch.tensor([f.input_ids for f in opposite_perspective_features], dtype=torch.long)
        opp_pers_input_mask = torch.tensor([f.input_mask for f in opposite_perspective_features], dtype=torch.long)
        opp_pers_segment_ids = torch.tensor([f.segment_ids for f in opposite_perspective_features], dtype=torch.long)
        opp_pers_label_ids = torch.tensor([f.label_id for f in opposite_perspective_features], dtype=torch.long)
        
        
#         opp_pers_input_ids = torch.tensor([f.input_ids for f in opposite_perspective_features if f.input_ids], dtype=torch.long)
#         opp_pers_input_mask = torch.tensor([f.input_mask for f in opposite_perspective_features if f.input_mask], dtype=torch.long)
#         opp_pers_segment_ids = torch.tensor([f.segment_ids for f in opposite_perspective_features if f.segment_ids], dtype=torch.long)
#         opp_pers_label_ids = torch.tensor([f.label_id for f in opposite_perspective_features if f.label_id], dtype=torch.long)

#         opp_claims_input_ids = torch.tensor([f.input_ids for f in opposite_claim_features], dtype=torch.long)
#         opp_claims_input_mask = torch.tensor([f.input_mask for f in opposite_claim_features], dtype=torch.long)
#         opp_claims_segment_ids = torch.tensor([f.segment_ids for f in opposite_claim_features], dtype=torch.long)
#         opp_claims_label_ids = torch.tensor([f.label_id for f in opposite_claim_features], dtype=torch.long)
        
#         logger.info("  opp pers id: %d, opp pers mask: %d, opp pers seg: %d, opp pers label: %d, opp calims label: %d, calims label: %d ", len(opp_pers_input_ids),len(opp_pers_input_mask),len(opp_pers_segment_ids),len(opp_pers_label_ids),len(opp_claims_label_ids),len(claims_label_ids))
        
        train_data = TensorDataset(pers_input_ids, pers_input_mask, pers_segment_ids, pers_label_ids, claims_input_ids, claims_input_mask, claims_segment_ids, claims_label_ids, opp_pers_input_ids, opp_pers_input_mask, opp_pers_segment_ids, opp_pers_label_ids)

        if local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

        model.train()

        for _ in trange(int(num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            process_bar = tqdm(train_dataloader)
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, claim_input_ids, claim_input_mask, claim_segment_ids, claim_label_ids, opp_input_ids, opp_input_mask, opp_segment_ids, opp_label_ids = batch
                
                out_results = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids, attention_mask2=claim_input_mask, labels2=claim_label_ids, input_ids3=opp_input_ids, token_type_ids3=opp_segment_ids, attention_mask3=opp_input_mask, labels3=opp_label_ids)
#                 loss = model(input_ids, segment_ids, input_mask, label_ids)
#                 print("out_results:")
#                 print(out_results)
                loss = out_results
            
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if fp16 and loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * loss_scale
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps
                process_bar.set_description("Loss: %0.8f" % (loss.sum().item()))
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % gradient_accumulation_steps == 0:
                    if fp16 or optimize_on_cpu:
                        if fp16 and loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                if param.grad is not None:
                                    param.grad.data = param.grad.data / loss_scale
                        is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
                        if is_nan:
                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
                            loss_scale = loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                    global_step += 1
            print("\nLoss: {}\n".format(tr_loss / nb_tr_steps))
        torch.save(model.state_dict(), output_dir +"distance_concat_margin1_costriplet_cos_siamese_bs24_lr2e_5_epoch15.pth")


    if do_eval and (local_rank == -1 or torch.distributed.get_rank() == 0):
        
        train_df = processor.get_train_df(data_dir)
        test_df = processor.get_test_df(data_dir)
        dev_df = processor.get_dev_df(data_dir)
        
        new_train_df = generate_opp_pers_dataset_not_elim(train_df)
        
        new_train_df.to_csv(os.path.join(data_dir, "tri_train.tsv"),sep='\t',index=False)
        
        new_test_df = generate_opp_pers_dataset_not_elim(test_df)
        
        new_test_df.to_csv(os.path.join(data_dir, "tri_test.tsv"),sep='\t',index=False)
        
        new_dev_df = generate_opp_pers_dataset_not_elim(dev_df)
        
        new_dev_df.to_csv(os.path.join(data_dir, "tri_dev.tsv"),sep='\t',index=False)
     
    
#         test_df = processor.get_test_df(data_dir)
        
#         new_test_df = generate_opp_dataset(test_df)
        
#         new_test_df.to_csv(os.path.join(data_dir, "new_test.tsv"),sep='\t',index=False)
        
#         train_df = processor.get_train_df(data_dir)
        
#         new_train_df = generate_opp_dataset(train_df)
        
#         new_train_df.to_csv(os.path.join(data_dir, "new_train.tsv"),sep='\t',index=False)
        
#         dev_df = processor.get_dev_df(data_dir)
        
#         new_dev_df = generate_opp_dataset(dev_df)
        
#         new_dev_df.to_csv(os.path.join(data_dir, "new_dev.tsv"),sep='\t',index=False)

        eval_examples = processor.get_test_examples(data_dir)
#         eval_examples = processor.get_train_examples(data_dir)
#         eval_examples = processor.get_dev_examples(data_dir)
        claim_features = convert_claims_to_features(eval_examples, label_list, max_seq_length, tokenizer)
        eval_features = convert_pers_to_features(eval_examples, label_list, max_seq_length, tokenizer)
        
#         opposite_claim_features = convert_opp_claims_to_features(eval_examples, label_list, max_seq_length, tokenizer)
        opposite_eval_features = convert_triopp_pers_to_features(eval_examples, label_list, max_seq_length, tokenizer)
            
    
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", eval_batch_size)
        
        pers_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        pers_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        pers_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        pers_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        
        claims_input_ids = torch.tensor([f.input_ids for f in claim_features], dtype=torch.long)
        claims_input_mask = torch.tensor([f.input_mask for f in claim_features], dtype=torch.long)
        claims_segment_ids = torch.tensor([f.segment_ids for f in claim_features], dtype=torch.long)
        claims_label_ids = torch.tensor([f.label_id for f in claim_features], dtype=torch.long)
        
        opp_pers_input_ids = torch.tensor([f.input_ids for f in opposite_eval_features], dtype=torch.long)
        opp_pers_input_mask = torch.tensor([f.input_mask for f in opposite_eval_features], dtype=torch.long)
        opp_pers_segment_ids = torch.tensor([f.segment_ids for f in opposite_eval_features], dtype=torch.long)
        opp_pers_label_ids = torch.tensor([f.label_id for f in opposite_eval_features], dtype=torch.long)
        
#         opp_claims_input_ids = torch.tensor([f.input_ids for f in opposite_claim_features], dtype=torch.long)
#         opp_claims_input_mask = torch.tensor([f.input_mask for f in opposite_claim_features], dtype=torch.long)
#         opp_claims_segment_ids = torch.tensor([f.segment_ids for f in opposite_claim_features], dtype=torch.long)
#         opp_claims_label_ids = torch.tensor([f.label_id for f in opposite_claim_features], dtype=torch.long)
        
#         logger.info("%d%d%d%d", len(pers_input_ids),len(claims_input_ids),len(opp_pers_input_ids),len(opp_claims_input_ids))
        
        eval_data = TensorDataset(pers_input_ids, pers_input_mask, pers_segment_ids, pers_label_ids, claims_input_ids, claims_input_mask, claims_segment_ids, claims_label_ids, opp_pers_input_ids, opp_pers_input_mask, opp_pers_segment_ids, opp_pers_label_ids)
        
#         logger.info(eval_data)
        # Run prediction for full data
#         eval_sampler = SequentialSampler(eval_data)
        eval_sampler = SequentialSampler(eval_data)
#         logger.info("1")
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)
#         print('all_input_ids:')
#         print(all_input_ids)
#         logger.info("2")
        

#         model.load_state_dict(torch.load(saved_model))
        model_state_dict = torch.load(saved_model)
#         logger.info("3")
        model = BertForConsistencyCueClassification.from_pretrained('bert-base-uncased', num_labels=2, state_dict=model_state_dict)
#         logger.info("4")
        model.to(device)
#         logger.info("5")
        
        model.eval()
#         logger.info("6")
        # eval_loss, eval_accuracy = 0, 0

        eval_tp, eval_pred_c, eval_gold_c = 0, 0, 0
        distance_eval_tp, distance_eval_pred_c, distance_eval_gold_c = 0, 0, 0
        eval_loss, eval_accuracy, eval_macro_p, eval_macro_r = 0, 0, 0, 0
        distance_accuracy, distance_eval_macro_p, distance_eval_macro_r = 0, 0, 0

        raw_score = []
        predicted_labels = []
        distance_labels = []
        predicted_prob = []
        gold_labels = []

        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids, claim_input_ids, claim_input_mask, claim_segment_ids, claim_label_ids, opp_input_ids, opp_input_mask, opp_segment_ids, opp_label_ids in eval_dataloader:
            
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            
            claim_input_ids = claim_input_ids.to(device)
            claim_input_mask = claim_input_mask.to(device)
            claim_segment_ids = claim_segment_ids.to(device)
            claim_label_ids = claim_label_ids.to(device)
            
            opp_input_ids = opp_input_ids.to(device)
            opp_input_mask = opp_input_mask.to(device)
            opp_segment_ids = opp_segment_ids.to(device)
            opp_label_ids = opp_label_ids.to(device)
            
#             opp_claim_input_ids = opp_claim_input_ids.to(device)
#             opp_claim_input_mask = opp_claim_input_mask.to(device)
#             opp_claim_segment_ids = opp_claim_segment_ids.to(device)
#             opp_claim_label_ids = opp_claim_label_ids.to(device)

#             print("start")
#             print(input_ids)
#             print(input_mask)
#             print(segment_ids)
#             print(label_ids)
#             print(claim_input_ids)
#             print(claim_input_mask)
#             print(claim_segment_ids)
#             print(claim_label_ids)
#             print("end")
            with torch.no_grad():
                tmp_eval_loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids, attention_mask2=claim_input_mask, labels2=claim_label_ids, input_ids3=opp_input_ids, token_type_ids3=opp_segment_ids, attention_mask3=opp_input_mask, labels3=opp_label_ids)
                
                # A single unlabelled forward pass returns both the classification
                # logits and the distance scores, so the model only needs to be run once.
                logits, distance_logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids, attention_mask2=claim_input_mask, input_ids3=opp_input_ids, token_type_ids3=opp_segment_ids, attention_mask3=opp_input_mask)[:2]
                
#                 predicted_prob.extend(torch.nn.functional.softmax(logits, dim=1))
#                 logits_grid = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids, attention_mask2=claim_input_mask, input_ids3=opp_input_ids, token_type_ids3=opp_segment_ids, attention_mask3=opp_input_mask, input_ids4=opp_claim_input_ids, token_type_ids4=opp_claim_segment_ids, attention_mask4=opp_claim_input_mask)
            
            logits = logits.detach().cpu().numpy()
            distance_logits = distance_logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()

            tmp_eval_accuracy = accuracy(logits, label_ids)
            distance_eval_accuracy = accuracy(distance_logits, label_ids)
            
            tmp_predicted = np.argmax(logits, axis=1)
            distance_predicted = np.argmax(distance_logits, axis=1)
            
            predicted_labels.extend(tmp_predicted.tolist())
            distance_labels.extend(distance_predicted.tolist())
            gold_labels.extend(label_ids.tolist())
            
            # Micro F1 (aggregated tp, fp, fn counts across all examples)
            tmp_tp, tmp_pred_c, tmp_gold_c = tp_pcount_gcount(logits, label_ids)
            eval_tp += tmp_tp
            eval_pred_c += tmp_pred_c
            eval_gold_c += tmp_gold_c
            
            distance_tp, distance_pred_c, distance_gold_c = tp_pcount_gcount(distance_logits, label_ids)
            distance_eval_tp += distance_tp
            distance_eval_pred_c += distance_pred_c
            distance_eval_gold_c += distance_gold_c
            
            # tmp_predicted / distance_predicted already hold the argmax labels computed above
            raw_score += zip(logits, distance_logits, tmp_predicted, distance_predicted, label_ids)
            
            # Macro F1 (averaged P, R across mini batches)
            tmp_eval_p, tmp_eval_r, tmp_eval_f1 = p_r_f1(logits, label_ids)

            eval_macro_p += tmp_eval_p
            eval_macro_r += tmp_eval_r
            
            distance_eval_p, distance_eval_r, distance_eval_f1 = p_r_f1(distance_logits, label_ids)

            distance_eval_macro_p += distance_eval_p
            distance_eval_macro_r += distance_eval_r

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy
            distance_accuracy += distance_eval_accuracy
            
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1


        # Micro F1 (aggregated tp, fp, fn counts across all examples)
        eval_micro_p = eval_tp / eval_pred_c
        eval_micro_r = eval_tp / eval_gold_c
        eval_micro_f1 = 2 * eval_micro_p * eval_micro_r / (eval_micro_p + eval_micro_r)
        
        distance_eval_micro_p = distance_eval_tp / distance_eval_pred_c
        distance_eval_micro_r = distance_eval_tp / distance_eval_gold_c
        distance_eval_micro_f1 = 2 * distance_eval_micro_p * distance_eval_micro_r / (distance_eval_micro_p + distance_eval_micro_r)

        # Macro F1 (averaged P, R across mini batches)
        eval_macro_p = eval_macro_p / nb_eval_steps
        eval_macro_r = eval_macro_r / nb_eval_steps
        eval_macro_f1 = 2 * eval_macro_p * eval_macro_r / (eval_macro_p + eval_macro_r)
        
        distance_eval_macro_p = distance_eval_macro_p / nb_eval_steps
        distance_eval_macro_r = distance_eval_macro_r / nb_eval_steps
        distance_eval_macro_f1 = 2 * distance_eval_macro_p * distance_eval_macro_r / (distance_eval_macro_p + distance_eval_macro_r)
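        # Worked example (illustrative numbers only): with two batches contributing
        # (tp, predicted, gold) counts of (8, 10, 12) and (1, 5, 8), micro-F1 pools the
        # counts first: P = 9/15 = 0.60, R = 9/20 = 0.45, F1 = 2*0.60*0.45/1.05 ~ 0.51.
        # Macro-F1 here instead averages the per-batch P and R before combining them,
        # so the two scores diverge when batches are unevenly sized or unevenly difficult.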

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        distance_accuracy = distance_accuracy / nb_eval_examples
#         print("\nLoss: {}\n".format(eval_loss / nb_eval_steps))
        result = {
                  'eval_loss': eval_loss,
                  'eval_accuracy':eval_accuracy,
                  'eval_micro_p': eval_micro_p,
                  'eval_micro_r': eval_micro_r,
                  'eval_micro_f1': eval_micro_f1,
                  'eval_macro_p': eval_macro_p,
                  'eval_macro_r': eval_macro_r,
                  'eval_macro_f1': eval_macro_f1,
            
                  'distance_accuracy':distance_accuracy,
                  'distance_eval_micro_p': distance_eval_micro_p,
                  'distance_eval_micro_r': distance_eval_micro_r,
                  'distance_eval_micro_f1': distance_eval_micro_f1,
                  'distance_eval_macro_p': distance_eval_macro_p,
                  'distance_eval_macro_r': distance_eval_macro_r,
                  'distance_eval_macro_f1': distance_eval_macro_f1
#                   'global_step': global_step,
#                   'loss': tr_loss/nb_tr_steps
                  }

        output_eval_file = os.path.join(output_dir, "elim_opp_distance_concat_margin1_costriplet_cos_siamese_bs24_lr2e_5_epoch15_eval_results.txt")
        output_raw_score = os.path.join(output_dir, "elim_opp_distance_concat_margin1_costriplet_cos_siamese_bs24_lr2e_5_epoch15_raw_score.csv")
        
#         logger.info(classification_report(gold_labels, predicted_labels, target_names=label_list, digits=4))
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
#             writer.write(classification_report(gold_labels, predicted_labels, target_names=label_list, digits=4))


        with open(output_raw_score, 'w') as fout:
            fields = ["undermine_score", "support_score", "cp_distance", "cop_distance", "predict_label", "distance_label", "gold"]
            writer = csv.DictWriter(fout, fieldnames=fields)
            writer.writeheader()
            for score, distance, pred, distance_pred, gold in raw_score:
                writer.writerow({
                    "undermine_score": str(score[0]),
                    "support_score": str(score[1]),
                    "cp_distance": str(distance[0]),
                    "cop_distance": str(distance[1]),
                    "predict_label": str(pred),
                    "distance_label": str(distance_pred),
                    "gold": str(gold)
                })
from sklearn.model_selection import train_test_split

# stratify makes the split keep the same proportion of each label as in the values
# provided to the parameter (test_size below is illustrative, not from the original).
X_train, X_val, _, _ = train_test_split(df.index.values, df.label.values,
                                        test_size=0.15, stratify=df.label.values)

df['data_type'] = ['not_set'] * df.shape[0]

df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'
df.groupby(['category', 'label', 'data_type']).count()
# This shows how the examples of each label are distributed between train and val.
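
# Sanity-check sketch (assumes the 'label' and 'data_type' columns created above):
# with a stratified split the normalized label counts of the two splits should be
# nearly identical.
for split in ('train', 'val'):
    print(split, df[df.data_type == split].label.value_counts(normalize=True))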

from transformers import BertTokenizer
from torch.utils.data import TensorDataset
# The tokenizer turns raw text into tokens;
# each token is mapped to an integer id from the model's vocabulary.

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)

# Special tokens ([CLS], [SEP]) mark the start and end of each sequence.
# The attention mask marks which positions are real tokens and which are padding.
# Since sentences vary in length, they are padded to a fixed max_length so every
# element in the batch has the same dimensionality.
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type == 'train'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt')
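
# Illustrative peek at the encoding (a sketch, not part of the original pipeline):
# input_ids are vocabulary indices padded to max_length, and attention_mask is 1
# for real tokens and 0 for the padded positions the model should ignore.
sample_ids = encoded_data_train['input_ids'][0]
sample_mask = encoded_data_train['attention_mask'][0]
print(tokenizer.convert_ids_to_tokens(sample_ids[:10].tolist()))
print(int(sample_mask.sum()), 'real tokens out of', sample_mask.shape[0])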

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type == 'val'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt')
def main():

    random.seed(123)
    np.random.seed(123)
    torch.manual_seed(123)

    parser = argparse.ArgumentParser()

    parser.add_argument('--cuda', default=None, type=int, required=True, help='Selected CUDA.')
    parser.add_argument('--batch_size', default=None, type=int, required=True, help='Batch size.')

    args = parser.parse_args()

    models = [
        ('pfx', 1),
        ('pfx', 2),
        ('pfx', 4),
        ('pfx', 8),
        ('pfx', 16),
        ('pfx', 32),
        ('pfx', 64)
    ]

    for m in models:

        print('Mode: {}'.format(m[0]))
        print('Count: {}'.format(m[1]))

        print('Batch size: {}'.format(args.batch_size))

        # Define path to data
        inpath = str(Path('../../data/final').resolve())

        test_path = '{}{}sents_{:02d}_test.txt'.format(inpath, os.sep, m[1])

        # Initialize test loader
        print('Load test data...')
        try:
            test_data = AffixDataset(test_path, m[0])
        except FileNotFoundError:
            print('Bin not found.')
            continue

        test_loader = DataLoader(test_data, batch_size=args.batch_size, collate_fn=collate_sents)

        tok = BertTokenizer.from_pretrained('bert-base-uncased')

        # Define device
        device = torch.device('cuda:{}'.format(args.cuda) if torch.cuda.is_available() else 'cpu')

        # Initialize model
        affix_predictor = AffixPredictor(m[0], freeze=False)

        # Move model to CUDA
        affix_predictor = affix_predictor.to(device)

        mrr_micro, mrr_macro_dict = test_single(test_loader, affix_predictor, m[0], args.cuda)

        with open('results_final/results_hyp_macro.txt', 'a+') as f:
            f.write('{:.3f} & '.format(np.mean(list(mrr_macro_dict.values()))))
        with open('results_final/results_hyp_micro.txt', 'a+') as f:
            f.write('{:.3f} & '.format(mrr_micro))
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        self.build_model()
#LEARNING_RATE_MODEL = 1e-5
#LEARNING_RATE_CLASSIFIER = 1e-3
# MAX_GRAD_NORM = 1.0
EARLY_STOPPING_ROUNDS = 2
NUM_MODELS = 3
MODEL_PATH = "models/bert_{}".format(time.strftime('%Y%m%d%H%M'))
os.mkdir(MODEL_PATH)

train = pd.read_csv('data/train_preprocessed.csv')
test = pd.read_csv('data/test_preprocessed.csv')
train['comment_text'].fillna("", inplace=True)
test['comment_text'].fillna("", inplace=True)
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
X_train_valid_raw, y_train_valid = train['comment_text'].str.lower(), train[classes].values
X_test_raw = test['comment_text'].str.lower()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
X_train_valid = np.array(list(map(lambda x: tokenizer.encode(x, max_length=MAX_LEN, pad_to_max_length=True), X_train_valid_raw)))
X_test = np.array(list(map(lambda x: tokenizer.encode(x, max_length=MAX_LEN, pad_to_max_length=True), X_test_raw)))

class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        self.init_weights()

    def forward(
def main():

    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

    train_sent_pair, train_label = read_data('./data/train/train.csv')
    train_data = {
        'sent': [],
        'sent_mask':[],
        'sent_segment':[]
    }

    for q1, q2 in train_sent_pair:
        sent = '[CLS]' + q1 + '[SEP]' + q2 + '[SEP]'
        token_list = tokenizer.tokenize(sent)
        for i, word in enumerate(token_list):
            if word == '[SEP]':
                q1_len = i + 1
                break
        sent_id = tokenizer.convert_tokens_to_ids(token_list)
        padding_id = [0] * (MAX_Q_LENGTH - len(token_list))
        train_data['sent'].append(sent_id + padding_id)
        train_data['sent_segment'].append([1] * (q1_len) + [0] * (MAX_Q_LENGTH - q1_len))
        train_data['sent_mask'].append([1] * len(token_list) + padding_id)
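
    # Equivalent sketch: tokenizer.encode_plus can build the same three inputs
    # (ids, segment ids, attention mask) in one call, e.g.
    #   enc = tokenizer.encode_plus(q1, q2, max_length=MAX_Q_LENGTH, pad_to_max_length=True)
    #   enc['input_ids'], enc['token_type_ids'], enc['attention_mask']
    # Note that encode_plus marks the first segment with 0s and the second with 1s,
    # whereas the manual loop above uses the opposite convention.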

    t_seqs = torch.tensor(train_data['sent'], dtype=torch.long)
    t_seq_segs = torch.tensor(train_data['sent_segment'], dtype=torch.long)
    t_seq_masks = torch.tensor(train_data['sent_mask'], dtype=torch.long)
    t_labels = torch.tensor(train_label, dtype=torch.long)

    dataset = TensorDataset(t_seqs, t_seq_masks, t_seq_segs, t_labels)
    dataloader = DataLoader(dataset, shuffle=True, batch_size=32)

    device = "cpu"  #    'cuda:0'

    model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=2)
    model.to(device)
    model.train()

    param_optimizer = list(model.named_parameters())
    #print(param_optimizer)
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
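    # Parameters whose names contain any of these substrings (bias and LayerNorm
    # weights) are conventionally excluded from weight decay; they go into the
    # zero-decay group below.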

    optimizer_grouped_parameters = [
        {
            'params':
                [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay':
                0.01
        },
        {
            'params':
                [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
                0.0
        }
    ]

    optimizer = AdamW(optimizer_grouped_parameters,
                         lr=2e-05)

    for i in range(10):
        for step, batch_data in enumerate(
                dataloader):
            batch_data = tuple(t.to(device) for t in batch_data)
            batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels = batch_data
            logits = model(
                batch_seqs, batch_seq_masks, batch_seq_segments, labels=None)
            loss_function = CrossEntropyLoss()
            loss = loss_function(logits[0], batch_labels)
            optimizer.zero_grad()
            loss.backward()
            print("epoch{}, step {}, loss = {}".format(i, step, loss.item()))
            optimizer.step()


    dev_sent_pair, dev_label = read_data('./dev.csv')
    dev_data = {
        'sent': [],
        'sent_mask': [],
        'sent_segment': []
    }

    for q1, q2 in dev_sent_pair:
        sent = '[CLS]' + q1 + '[SEP]' + q2 + '[SEP]'
        token_list = tokenizer.tokenize(sent)
        for i, word in enumerate(token_list):
            if word == '[SEP]':
                q1_len = i + 1
                break
        sent_id = tokenizer.convert_tokens_to_ids(token_list)
        # print(len(token_list) == len(sent_id))
        padding_id = [0] * (MAX_Q_LENGTH - len(token_list))
        dev_data['sent'].append(sent_id + padding_id)
        dev_data['sent_segment'].append([1] * (q1_len) + [0] * (MAX_Q_LENGTH - q1_len))
        dev_data['sent_mask'].append([1] * len(token_list) + padding_id)

    t_seqs = torch.tensor(dev_data['sent'], dtype=torch.long)
    t_seq_segs = torch.tensor(dev_data['sent_segment'], dtype=torch.long)
    t_seq_masks = torch.tensor(dev_data['sent_mask'], dtype=torch.long)
    t_labels = torch.tensor(dev_label, dtype=torch.long)

    dataset = TensorDataset(t_seqs, t_seq_masks, t_seq_segs, t_labels)
    dataloader = DataLoader(dataset, shuffle=True, batch_size=32)

    true_labels = []
    pred_labels = []
    model.eval()

    with torch.no_grad():
        for batch_data in dataloader:
            batch_data = tuple(t.to(device) for t in batch_data)
            batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels = batch_data
            logits = model(
                batch_seqs, batch_seq_masks, batch_seq_segments, labels=None)
            logits = logits[0].argmax(dim=1)
            pred_labels += logits.cpu().numpy().tolist()
            true_labels += batch_labels.cpu().numpy().tolist()

    acc_cnt = 0
    for l_pre, l_true in zip(pred_labels, true_labels):
        if l_pre == l_true:
            acc_cnt += 1
    print('valid acc: {}'.format(acc_cnt / len(pred_labels)))
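    # Equivalent computation (a sketch, assuming scikit-learn is available):
    #   from sklearn.metrics import accuracy_score
    #   print('valid acc: {}'.format(accuracy_score(true_labels, pred_labels)))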
Exemple #25
0
    for i in range(0, len(test_ids_labels)):
        tem_list = test_ids_labels[i].strip().split('\t')
        tem_id = tem_list[0].split('/')[-1].replace('.jpg', '')
        test_ids.append(tem_id)
        test_labels.append(int(tem_list[1]))
        with open(
                '/data/scratch/projects/punim0478/ailis/Fakeddit/fakenews_full/text/'
                + tem_id + '.txt') as f:
            data = f.readline().strip()
        test_text.append(data)

    logger.info('train: ' + str(len(train_ids)) + ' dev: ' +
                str(len(dev_ids)) + ' test: ' + str(len(test_ids)))

    tokenizer = BertTokenizer.from_pretrained(parser.bert_pretrained)

    train_dataset = TextDataset(train_ids, train_text, train_labels, tokenizer)
    dev_dataset = TextDataset(dev_ids, dev_text, dev_labels, tokenizer)
    test_dataset = TextDataset(test_ids, test_text, test_labels, tokenizer)

    dataloaders_train = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=parser.batch_size,
        shuffle=True,
        num_workers=4)
    dataloaders_dev = torch.utils.data.DataLoader(dev_dataset,
                                                  batch_size=parser.batch_size,
                                                  shuffle=False,
                                                  num_workers=4)
    dataloaders_test = torch.utils.data.DataLoader(test_dataset,
                                                   batch_size=parser.batch_size,
                                                   shuffle=False,
                                                   num_workers=4)
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('--mode', choices=['train', 'validate', 'predict'], default='train')
    arg('--run_root', default='.')
    arg('--batch-size', type=int, default=16)
    arg('--step', type=int, default=1)
    arg('--workers', type=int, default=0)
    arg('--lr', type=float, default=0.00003)
    arg('--adam_epsilon', type=float, default=1e-8)
    arg('--weight_decay', type=float, default=0.0)
    arg('--fold', type=int, default=0)
    arg('--warmup', type=float, default=0.05)
    arg('--limit', type=int)
    arg('--patience', type=int, default=1)
    arg('--clean', action='store_true')
    arg('--n-epochs', type=int, default=20)
    arg('--vocab-size', type=int, default=13318)
    arg('--multi-gpu', type=int, default=0)
    arg('--print-num', type=int, default=5)
    arg('--temperature', type=float)

    args = parser.parse_args()

    df = pd.read_table('../data/dialog-rewrite/corpus.txt',
                       sep="\t\t",
                       names=['a', 'b', 'current', 'label'],
                       dtype=str)
    df.dropna(how='any', inplace=True)
    train_length = int(len(df) * 0.9)

    train_df = df.iloc[:train_length].iloc[:, :]
    valid_df = df.iloc[train_length:]
    print(valid_df.head())
    if args.mode == 'predict':
        # valid_df['current'] = valid_df['label']
        valid_df = pd.read_table('../data/dialog-rewrite/test.csv',
                                 sep=",",
                                 names=['a', 'b', 'current', 'label'],
                                 dtype=str)
        print(valid_df.tail())
    valid_df['eval_label'] = valid_df['label'].apply(
        lambda x: ' '.join(list(x)))

    if args.limit:
        train_df = train_df.iloc[0:args.limit]
        valid_df = valid_df.iloc[0:args.limit]
    # train_df['len'] = train_df['content'].apply(lambda x: len(x))

    run_root = Path('../experiments/' + args.run_root)
    tokenizer = BertTokenizer.from_pretrained("../rbt3")
    valid_set = TaggerRewriterDataset(valid_df, tokenizer, valid=True)
    valid_index = np.array(valid_set.valid_index)
    # np.save('index.npy', valid_index)
    valid_df = valid_df.reset_index().loc[valid_index, :]
    ner_index = np.array(valid_set.label_type) == 1
    valid_loader = DataLoader(valid_set,
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.workers,
                              collate_fn=tagger_collate_fn)

    config = BertConfig.from_json_file('../rbt3/config.json')
    config.num_labels = 5
    # # config.is_decoder = True
    # decoder = BertModel.from_pretrained("../rbt3", config=config)
    # encoder = BertModel.from_pretrained("../rbt3")
    # args.vocab_size = config.vocab_size
    bert_path = '../rbt3'
    model = TaggerRewriteModel(config, bert_path)
    model.cuda()

    if args.mode == 'train':
        if run_root.exists() and args.clean:
            shutil.rmtree(run_root)
        run_root.mkdir(exist_ok=True, parents=True)
        (run_root / 'params.json').write_text(
            json.dumps(vars(args), indent=4, sort_keys=True))

        train_set = TaggerRewriterDataset(train_df, tokenizer)

        # np.save('index.npy', train_set.valid_index)

        train_loader = DataLoader(train_set,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.workers,
                                  collate_fn=tagger_collate_fn)

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                args.weight_decay,
            },
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0
            },
        ]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.lr,
                          eps=args.adam_epsilon)
        t_total = int(len(train_df) * args.n_epochs / args.batch_size)
        warmup_steps = int(t_total * args.warmup)
        # scheduler = get_linear_schedule_with_warmup(
        # optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
        # )
        scheduler = get_constant_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps)
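        # get_constant_schedule_with_warmup ramps the learning rate linearly from 0
        # to args.lr over warmup_steps optimizer steps and then keeps it constant.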
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level='O2',
                                          verbosity=0)

        train(args,
              model,
              optimizer,
              scheduler,
              tokenizer,
              ner_index,
              train_loader=train_loader,
              valid_df=valid_df,
              valid_loader=valid_loader,
              epoch_length=len(train_df))

    elif args.mode == 'validate':
        model_path = run_root / ('tagger_model-%d.pt' % args.fold)
        load_model(model, model_path)
        valid_metrics = validate(model,
                                 valid_loader,
                                 valid_df,
                                 args,
                                 tokenizer,
                                 ner_index,
                                 decode_mode='beam_search')

    elif args.mode == 'predict':
        model_path = run_root / ('tagger_model-%d.pt' % args.fold)
        load_model(model, model_path)
        valid_metrics = validate(model,
                                 valid_loader,
                                 valid_df,
                                 args,
                                 tokenizer,
                                 ner_index,
                                 decode_mode='beam_search')
Exemple #27
0
    def __init__(self, model_path, use_gpu=True):
        self.model = BertForTokenClassification.from_pretrained(model_path)
        self.tokenizer = BertTokenizer.from_pretrained(model_path)
        self.labels_map = self.model.config.id2label
        self.use_gpu = use_gpu
def main():

    #global variables to be used in script
    PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
    class_names = ['negative', 'positive']
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

    class Model(nn.Module):
        def __init__(self, *args, **kwargs):
            super(Model, self).__init__()

    #develop a class for the Sentiment Classifier
    class SentimentClassifier(nn.Module):
        def __init__(self, n_classes):
            super(SentimentClassifier, self).__init__()
            self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
            self.drop = nn.Dropout(p=0.3)
            self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

        def forward(self, input_ids, attention_mask):
            _, pooled_output = self.bert(input_ids=input_ids,
                                         attention_mask=attention_mask)
            output = self.drop(pooled_output)
            return self.out(output)
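
    # Usage sketch (illustrative, not part of the app flow): for a batch of N
    # encoded reviews,
    #   clf = SentimentClassifier(len(class_names))
    #   logits = clf(input_ids, attention_mask)   # tensor of shape [N, len(class_names)]
    # and the class with the larger logit is taken as the predicted sentiment.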

    #Generate a title for our webpage
    st.title('Sentiment analysis and product reviews.')
    #creating a sidebar for our webpage
    st.sidebar.title("Sentiment Analysis Web App")
    #little comment for our sidebar section
    st.sidebar.markdown("😃Is your review positive or negative?😞")

    #Here we will load the data into a cache to prevent repeated work
    @st.cache
    def load_data():
        #Function to pull in data from our Amazon s3 Bucket
        data = pd.read_csv(
            'https://amazonproductdata.s3-us-west-1.amazonaws.com/train.csv')
        return data

    #let's ingest our raw data here
    df = load_data()

    @st.cache
    def get_model():
        gdown.download(
            "https://drive.google.com/uc?id=1cz41bp4tf37Mky_R31T41qiSN6ucMjGi",
            "./assets/model_state_dict.bin",
            quiet=False)

    get_model()

    #A function for loading models in case we include other models later
    device = torch.device('cpu')  # defined here so BERT_inference below can reuse it

    def load_model(filepath):
        model = SentimentClassifier(len(class_names))
        model.load_state_dict(torch.load(filepath, map_location=device))
        return model

    #loading model into memory - works locally
    #model = load_model('./model/BERT_trained_model')   #This one works locally!
    model = load_model('./assets/model_state_dict.bin')

    #here we have the ability to plot data metrics
    def plot_metrics(metrics_list):
        if "Confusion Matrix" in metrics_list:
            st.subheader("Confusion Matrix")
            plot_confusion_matrix(model,
                                  x_test,
                                  y_test,
                                  display_labels=class_names)

    #function to provide inference from BERT model
    def BERT_inference(review_text):
        #tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        #Now we must encode the user's text
        encoded_review = tokenizer.encode_plus(
            review_text,
            max_length=300,
            add_special_tokens=True,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids = encoded_review['input_ids'].to(device)
        attention_mask = encoded_review['attention_mask'].to(device)

        output = model(input_ids, attention_mask)
        _, prediction = torch.max(output, dim=1)

        st.write(f'Review text: {review_text}')
        st.write(f'Sentiment  : {class_names[prediction]}')

    #sidebar options to add more rich features to our app
    if st.sidebar.checkbox("Show raw data", False):
        st.subheader(
            "Amazon Review Sentiment Analysis. (Polarity Classification)")
        st.table(df.head(10))
    #Generating a textbox for user input
    if st.sidebar.checkbox("Input text for inference", False):
        st.subheader(
            "Amazon Review Dataset for Sentiment Analysis. (Inference Demonstration.)"
        )
        user_input = st.text_area("Please provide a review here.")
        if user_input:
            #Let's process the user's input
            print(user_input)
            BERT_inference(user_input)
Exemple #29
0
print('model_name: ', model_name)

if (('RoBerta' in model_name) or ('roberta' in model_name)):
    from transformers import RobertaTokenizer, RobertaModel
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base',
                                                 do_lower_case=False)
    from multi_label_fns import RoBerta_clf
    model = RoBerta_clf.from_pretrained(model_name,
                                        num_labels=NUM_LABELS,
                                        output_attentions=False,
                                        output_hidden_states=True)
    print('using RoBerta:', model_name)

elif (('Bert' in model_name) or ('bert' in model_name)):
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased',
                                              do_lower_case=False)
    from multi_label_fns import Bert_clf
    model = Bert_clf.from_pretrained(model_name,
                                     num_labels=NUM_LABELS,
                                     output_attentions=False,
                                     output_hidden_states=True)
    print('using Bert:', model_name)

elif (('XLM' in model_name) or ('xlm' in model_name)):
    from transformers import XLMTokenizer
    tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024',
                                             do_lower_case=False)
    from multi_label_fns import XLM_clf
    model = XLM_clf.from_pretrained(model_name,
                                    num_labels=NUM_LABELS,
                                    output_attentions=False,
                                    output_hidden_states=True)
    print('using XLM:', model_name)
Exemple #30
0
    def __init__(self):
        super(NLI, self).__init__()
        self.model = BertForSequenceClassification.from_pretrained('./data/model_en/bert_fine_tuning').cuda()
        self.tokenizer = BertTokenizer.from_pretrained('./data/model_en/bert_fine_tuning')