Example #1
 def __init__(self, n_classes: int):
     super(SentimentClassifier, self).__init__()
     self.bert = BertModel.from_pretrained(config.PRETRAINED_MODEL_NAME)
     self.dropout = nn.Dropout(p=0.3)
     self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
     self.softmax = nn.Softmax(dim=1)
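
The example above only shows the constructor; a forward pass consistent with these layers might look like the following sketch (the method signature and the use of the pooled [CLS] output are assumptions, not part of the original example):

 def forward(self, input_ids, attention_mask):
     # assumed continuation: pooled [CLS] representation -> dropout -> linear -> softmax
     outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
     pooled_output = outputs[1]  # pooler output (index works for both tuple and ModelOutput returns)
     output = self.dropout(pooled_output)
     return self.softmax(self.out(output))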
Example #2
 def __init__(self, config, num_labels=3):
     super(BertForABSA, self).__init__(config)
     self.num_labels = num_labels
     self.bert = BertModel(config)
     self.hsum = HSUM(4, config, num_labels)
     self.init_weights()
Example #3
    ["This is a sample", "This is another longer sample text"],
    pad_to_max_length=
    True  # First sentence will have some PADDED tokens to match second sequence length
)

for i in range(2):
    print("Tokens (int)      : {}".format(tokens['input_ids'][i]))
    print("Tokens (str)      : {}".format(
        [tokenizer.convert_ids_to_tokens(s) for s in tokens['input_ids'][i]]))
    print("Tokens (attn_mask): {}".format(tokens['attention_mask'][i]))
    print()

from transformers import TFBertModel, BertModel

# Let's load a BERT model for TensorFlow and PyTorch
model_tf = TFBertModel.from_pretrained('bert-base-cased')
model_pt = BertModel.from_pretrained('bert-base-cased')

# transformers generates a ready-to-use dictionary with all the required inputs for the specific framework.
input_tf = tokenizer.encode_plus("This is a sample input", return_tensors="tf")
input_pt = tokenizer.encode_plus("This is a sample input", return_tensors="pt")

# Let's compare the outputs
output_tf, output_pt = model_tf(input_tf), model_pt(**input_pt)

# The model outputs 2 values (the hidden state for each token, and the pooled representation of the input sentence)
# Here we compare the output differences between PyTorch and TensorFlow.
for name, o_tf, o_pt in zip(["output", "pooled"], output_tf, output_pt):
    print("{} differences: {}".format(name,
                                      (o_tf.numpy() - o_pt.numpy()).sum()))
Example #4
import argparse

# The snippet is truncated above; the parser setup is assumed, and the argument name
# --df_path is inferred from the use of args.df_path further below.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--df_path',
    type=str,
    help='must have the following columns: seqs, num_seqs, and note_id either as a column or index'
)
parser.add_argument('--model_path', type=str)
parser.add_argument('--output_path', type=str)
parser.add_argument('--emb_method',
                    default='last',
                    const='last',
                    nargs='?',
                    choices=['last', 'sum4', 'cat4'],
                    help='how to extract embeddings from BERT output')
args = parser.parse_args()

tokenizer = BertTokenizer.from_pretrained(args.model_path)
config = BertConfig.from_pretrained(args.model_path, output_hidden_states=True)
model = BertModel.from_pretrained(args.model_path, config=config)
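
The three --emb_method choices above are not implemented in this fragment; a minimal sketch of what they could mean is given below (the helper name and the mean-pooling over tokens are assumptions):

def extract_embeddings(hidden_states, method='last'):
    # hidden_states: tuple of per-layer tensors of shape [batch, seq_len, hidden],
    # available because the config above sets output_hidden_states=True
    if method == 'last':
        emb = hidden_states[-1]
    elif method == 'sum4':
        emb = torch.stack(hidden_states[-4:], dim=0).sum(dim=0)
    elif method == 'cat4':
        emb = torch.cat(hidden_states[-4:], dim=-1)
    else:
        raise ValueError(f'unknown emb_method: {method}')
    return emb.mean(dim=1)  # mean-pool over the sequence dimension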

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print(f'Using {device} with {n_gpu} GPUs')

# if n_gpu > 1:
#     model = torch.nn.DataParallel(model)

print('Reading dataframe...')
df = pd.read_pickle(args.df_path)
if 'note_id' in df.columns:
    df = df.set_index('note_id')


def convert_input_example(note_id, text, seqIdx, subj_id, gender):
Example #5
# datasets['test'] = TensorDataset(*[t[:100] for t in datasets['test'].tensors])
train_dataloader = DataLoader(datasets['train'], batch_size=1, shuffle=False)
dev_dataloader = DataLoader(datasets['dev'], batch_size=100)
test_dataloader = DataLoader(datasets['test'], batch_size=100)

# Language Model
bert_folder_path = Path(
    f'./experiments/transformers/{bert_model_name}/{bert_model_size_type}/{bert_tokenizer_type}/{bert_version}'
)
if bert_tokenizer_type == 'wordpiece_roots':
    bert_folder_path = Path(
        f'./experiments/transformers/{bert_model_name}/{bert_model_size_type}/wordpiece/{bert_version}'
    )
    logging.info(f'Loading roots tokenizer BERT from: {str(bert_folder_path)}')
    bert_tokenizer = AlefBERTRootTokenizer(str(bert_folder_path / 'vocab.txt'))
    bert = BertModel.from_pretrained(str(bert_folder_path))
elif bert_model_name == 'mBERT':
    logging.info(f'Loading {bert_model_name}')
    bert_tokenizer = BertTokenizerFast.from_pretrained(
        'bert-base-multilingual')
    bert = BertModel.from_pretrained('bert-base-multilingual')
elif bert_model_name == 'heBERT':
    logging.info(f'Loading {bert_model_name}')
    bert_tokenizer = BertTokenizerFast.from_pretrained(
        f'avichr/{bert_model_name}')
    bert = BertModel.from_pretrained(f'avichr/{bert_model_name}')
else:
    logging.info(f'Loading BERT from: {str(bert_folder_path)}')
    bert_tokenizer = BertTokenizerFast.from_pretrained(str(bert_folder_path))
    bert = BertModel.from_pretrained(str(bert_folder_path))
logging.info('BERT model and tokenizer loaded')
Example #6
# The snippet is truncated above; the logging.basicConfig call is assumed from the arguments below.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Some global variables
train_batch_size = 40
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
learning_rate = 1e-5
num_epoch = 10

# define student and teacher model
# Teacher Model
bert_config = BertConfig(num_hidden_layers=12,
                         hidden_size=60,
                         intermediate_size=60,
                         output_hidden_states=True,
                         output_attentions=True)
teacher_model = BertModel(bert_config)
# Student Model
bert_config = BertConfig(num_hidden_layers=3,
                         hidden_size=60,
                         intermediate_size=60,
                         output_hidden_states=True,
                         output_attentions=True)
student_model = BertModel(bert_config)

### Train data loader
input_ids = torch.LongTensor(np.random.randint(100, 1000, (100000, 50)))
attention_mask = torch.LongTensor(np.ones((100000, 50)))
token_type_ids = torch.LongTensor(np.zeros((100000, 50)))
train_data = TensorDataset(input_ids, attention_mask, token_type_ids)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data,
Example #7
 def __init__(self, config):
     super(BertForQuestionAnswering, self).__init__(config)
     self.bert = BertModel(config)
     # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version
     self.apply(self.init_bert_weights)
Example #8
 def __init__(self, label_size, bert_model):
     super().__init__()
     self.label_size = label_size
     self.hidden_size = 768 * 2
     self.bert = BertModel.from_pretrained(bert_model)
     self.linear = nn.Linear(self.hidden_size, self.label_size)
Example #9
def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str,
                                     model_name: str):
    """
    Args:
        model: BertModel Pytorch model instance to be converted
        ckpt_dir: Tensorflow model directory
        model_name: model name

    Currently supported HF models:

        - Y BertModel
        - N BertForMaskedLM
        - N BertForPreTraining
        - N BertForMultipleChoice
        - N BertForNextSentencePrediction
        - N BertForSequenceClassification
        - N BertForQuestionAnswering
    """

    tensors_to_transpose = ("dense.weight", "attention.self.query",
                            "attention.self.key", "attention.self.value")

    var_map = (
        ("layer.", "layer_"),
        ("word_embeddings.weight", "word_embeddings"),
        ("position_embeddings.weight", "position_embeddings"),
        ("token_type_embeddings.weight", "token_type_embeddings"),
        (".", "/"),
        ("LayerNorm/weight", "LayerNorm/gamma"),
        ("LayerNorm/bias", "LayerNorm/beta"),
        ("weight", "kernel"),
    )

    if not os.path.isdir(ckpt_dir):
        os.makedirs(ckpt_dir)

    state_dict = model.state_dict()

    def to_tf_var_name(name: str):
        for patt, repl in iter(var_map):
            name = name.replace(patt, repl)
        return "bert/{}".format(name)

    def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session):
        tf_dtype = tf.dtypes.as_dtype(tensor.dtype)
        tf_var = tf.get_variable(dtype=tf_dtype,
                                 shape=tensor.shape,
                                 name=name,
                                 initializer=tf.zeros_initializer())
        session.run(tf.variables_initializer([tf_var]))
        session.run(tf_var)
        return tf_var

    tf.reset_default_graph()
    with tf.Session() as session:
        for var_name in state_dict:
            tf_name = to_tf_var_name(var_name)
            torch_tensor = state_dict[var_name].numpy()
            if any([x in var_name for x in tensors_to_transpose]):
                torch_tensor = torch_tensor.T
            tf_var = create_tf_var(tensor=torch_tensor,
                                   name=tf_name,
                                   session=session)
            tf.keras.backend.set_value(tf_var, torch_tensor)
            tf_weight = session.run(tf_var)
            print("Successfully created {}: {}".format(
                tf_name, np.allclose(tf_weight, torch_tensor)))

        saver = tf.train.Saver(tf.trainable_variables())
        saver.save(
            session,
            os.path.join(ckpt_dir,
                         model_name.replace("-", "_") + ".ckpt"))
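
A hypothetical invocation of this converter (the model name and paths below are placeholders, not from the original):

# convert a pretrained PyTorch BertModel into a TF 1.x checkpoint
bert = BertModel.from_pretrained("bert-base-uncased")
convert_pytorch_checkpoint_to_tf(model=bert,
                                 ckpt_dir="/tmp/bert_tf_ckpt",
                                 model_name="bert-base-uncased")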
Example #10
    def bert_embedding(self, bert_path, text_data):
        tokenizer = BertTokenizer(vocab_file=bert_path + "vocab.txt")  # initialize the tokenizer

        # If the first document has 10 segments, this records [0, 10]; the list has one more element
        # than the tags, and tag[i] corresponds to the text span separated_points[i]:separated_points[i+1]
        separated_texts = []
        separated_points = [0]
        max_text_length = 512
        for slices in text_data:
            for text in slices:
                separated_texts.append(text)
            separated_points.append(len(separated_texts))

        max_len = max([len(single) for single in separated_texts])  # maximum sentence length
        self.logger.info("data_size: %d" % len(text_data))
        self.logger.info("max_seq_len: %d" % max_len)
        self.logger.info(
            "avg_seq_len: %d " % np.mean([len(single) for single in separated_texts])
        )

        bert_model = BertModel.from_pretrained(bert_path).to(self.device)
        bert_model.eval()
        batch_size = config.bert_batch_size
        n_batch = math.ceil(len(separated_texts) / batch_size)
        embeds = []
        for i in range(n_batch):
            if i % 100 == 0:
                self.logger.info("Embedding, %d / %d" % (i, n_batch))
            # if i != 0 and i % max_save_size == 0:
            sta = i * batch_size
            end = (i + 1) * batch_size
            tokens, segments, input_masks = [], [], []
            for text in separated_texts[sta:end]:
                indexed_tokens = tokenizer.encode(text)  # list of token ids
                if len(indexed_tokens) > max_text_length:
                    indexed_tokens = indexed_tokens[:max_text_length]
                tokens.append(indexed_tokens)
                segments.append([0] * len(indexed_tokens))
                input_masks.append([1] * len(indexed_tokens))

            max_len = max([len(single) for single in tokens])  # maximum sentence length in this batch

            for j in range(len(tokens)):
                padding = [0] * (max_len - len(tokens[j]))
                tokens[j] += padding
                segments[j] += padding
                input_masks[j] += padding
            # segments is all zeros because there is only a single sentence (no sentence B)
            # in input_masks, 1 marks real tokens and the trailing 0s are padding that only keeps the inputs aligned,
            # i.e. it tells BertModel not to attend to the padded positions

            # Convert to PyTorch tensors
            tokens_tensor = torch.tensor(tokens).to(self.device)
            segments_tensors = torch.tensor(segments).to(self.device)
            input_masks_tensors = torch.tensor(input_masks).to(self.device)

            output = bert_model(
                tokens_tensor,
                token_type_ids=segments_tensors,
                attention_mask=input_masks_tensors,
            )
            last_encode = output[0]
            output_mask = (
                input_masks_tensors
                .unsqueeze(-1)
                .repeat(1, 1, last_encode.shape[-1])
            )
            masked_output = last_encode * output_mask
            self.logger.debug(masked_output.shape)
            pooled_output = torch.mean(masked_output, dim=1)
            self.logger.debug(pooled_output.shape)
            embed = pooled_output.cpu().detach().tolist()
            self.logger.debug(len(embed))
            embeds.extend(embed)
            torch.cuda.empty_cache()

        # Regroup the embeddings according to the original text segmentation
        embeds_per_text = []
        for i in range(len(separated_points) - 1):
            embeds_per_text.append(embeds[separated_points[i]:separated_points[i+1]])
        return embeds_per_text
Example #11
def trainBaseline(trainPath,
                  vocabPath,
                  labelPath,
                  epoch=1,
                  useBert=0,
                  load=0,
                  modelPath='',
                  saveName="baseline"):
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    dl = DataLoader(vocabPath, labelPath)

    if useBert == 0:
        trainArticles = dl.readData_ub(trainPath)
    else:
        trainArticles = dl.readData_bert(trainPath)
    sample = list(
        torch.utils.data.WeightedRandomSampler(dl.indices,
                                               dl.noSample,
                                               replacement=False))
    lenArt = dl.lenArt
    #print('epoch: ', i)

    baseline = Baseline(64, dl.voc.keysize, 32, 2, useBert=useBert)
    #    print('7')
    docLoss = nn.CrossEntropyLoss()
    #    print('5')
    opt = torch.optim.Adam(baseline.parameters(), lr=0.001)

    if load == 1:
        checkpoint = torch.load(modelPath)

        baseline.load_state_dict(checkpoint['baseline_model'])
        #DGAT.load_state_dict(checkpoint['document_model'])
        opt.load_state_dict(checkpoint['optimizer'])

    if useBert == 1:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        bert = BertModel.from_pretrained('bert-base-uncased').to(device)
    ##Iterate over articles in dataset
    for e in range(0, epoch):
        truelabels = []
        predictions = []

        for n, ind in enumerate(sample):

            a = trainArticles[ind]
            id, article, l = a
            aTemp = torch.tensor([])
            if l > 0:
                l = torch.tensor([1], dtype=torch.long)  #.to('cuda')
                truelabels.append(1)
            else:
                l = torch.tensor([0], dtype=torch.long)  #.to('cuda')
                truelabels.append(0)

            for i, s in enumerate(article):
                if useBert == 1:
                    sTemp = bert(
                        torch.tensor(
                            tokenizer(
                                s[:510] if len(s) > 510 else s)['input_ids'],
                            device=device).unsqueeze(0))[0].detach().squeeze(
                                dim=0).to('cpu')
                else:
                    sTemp = s
                aTemp = torch.cat((aTemp, sTemp.float()), dim=0)

        #if i == 0:
        #h = torch.zeros((1, 32))
            label = baseline(aTemp.unsqueeze(0))

            loss = docLoss(label, l)

            if l.item() == label.argmax().item():
                if l.item() == 0:
                    #           tneg += 1
                    predictions.append(0)
                else:
                    #          tpos += 1
                    predictions.append(1)
            else:
                if l.item() == 0:
                    #          fpos += 1
                    predictions.append(1)
                else:
                    #          fneg += 1
                    predictions.append(0)
            opt.zero_grad()
            loss.backward()
            opt.step()

        print('f1 score: ', f1_score(truelabels, predictions, average=None))

# if debugMode == 0:
    torch.save(
        {
            'baseline_model': baseline.state_dict(),
            'optimizer': opt.state_dict()
        }, 'dataset/{}.tar'.format(saveName))
Example #12
assert tokenized_text == [
    '[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]',
    'was', 'a', 'puppet', '##eer', '[SEP]'
]

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated with the 1st and 2nd sentences (see paper)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-large-uncased')

# Set the model in evaluation mode to deactivate the DropOut modules
# This is IMPORTANT to have reproducible results during evaluation!
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to('cuda')
segments_tensors = segments_tensors.to('cuda')
model.to('cuda')

# Predict hidden states features for each layer
with torch.no_grad():
    # See the models docstrings for the detail of the inputs
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    # Transformers models always output tuples.
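    # A typical continuation (assumed, not shown in the original snippet): unpack the tuple.
    # For BertModel, the first element is the per-token hidden states and the second the pooled [CLS] output.
    encoded_layers, pooled_output = outputs[:2]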
Example #13
    def __init__(self, n_class):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_class)
Example #14
 def __init__(self, config):
     super().__init__()
     self.num_labels = config.num_labels
     self.bert = BertModel(config)
     self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
Example #15
 def get_model(self):
     from transformers import BertModel
     _model = BertModel.from_pretrained(self.tmp_model_path)
     self.to_device(_model)
     return _model
Example #16
        hidden_states = self.decoder(hidden_states)
        return hidden_states


"""------"""

# class Bert(nn.Module):
#     def __init__(self):
#         super(Bert, self).__init__()
#         self.model = BertModel.from_pretrained('hfl/chinese-bert-wwm', config=config)

#     def forward(self, input_ids=None, attention_mask=None, token_type_ids=None):
#         output = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
#         return output

BERT = BertModel.from_pretrained('hfl/chinese-bert-wwm', config=config)


class dotdict(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__


#%%
class Seq2Seq(nn.Module):
    def __init__(self, mode):
        super(Seq2Seq, self).__init__()
        assert mode in ['G', 'R']
        # pull BERT in here
Example #17
 def __init__(self):
     super().__init__()
     config = BertConfig.from_pretrained("bert-base-uncased")
     self.model = BertModel(config)
Example #18
class SpERT(BertPreTrainedModel):
    """ Span-based model to jointly extract entities and relations """

    VERSION = '1.1'

    def __init__(self, config: BertConfig, cls_token: int, relation_types: int, entity_types: int,
                 size_embedding: int, prop_drop: float, freeze_transformer: bool, max_pairs: int = 100):
        super(SpERT, self).__init__(config)

        # BERT model
        self.bert = BertModel(config)

        # layers
        self.rel_classifier = nn.Linear(config.hidden_size * 3 + size_embedding * 2, relation_types)
        self.entity_classifier = nn.Linear(config.hidden_size * 2 + size_embedding, entity_types)
        self.size_embeddings = nn.Embedding(100, size_embedding)
        self.dropout = nn.Dropout(prop_drop)

        self._cls_token = cls_token
        self._relation_types = relation_types
        self._entity_types = entity_types
        self._max_pairs = max_pairs

        # weight initialization
        self.init_weights()

        if freeze_transformer:
            print("Freeze transformer weights")

            # freeze all transformer weights
            for param in self.bert.parameters():
                param.requires_grad = False

    def _forward_train(self, encodings: torch.tensor, context_masks: torch.tensor, entity_masks: torch.tensor,
                       entity_sizes: torch.tensor, relations: torch.tensor, rel_masks: torch.tensor):
        # get contextualized token embeddings from last transformer layer
        context_masks = context_masks.float()
        h = self.bert(input_ids=encodings, attention_mask=context_masks)['last_hidden_state']

        batch_size = encodings.shape[0]

        # classify entities
        size_embeddings = self.size_embeddings(entity_sizes)  # embed entity candidate sizes
        entity_clf, entity_spans_pool = self._classify_entities(encodings, h, entity_masks, size_embeddings)

        # classify relations
        h_large = h.unsqueeze(1).repeat(1, max(min(relations.shape[1], self._max_pairs), 1), 1, 1)
        rel_clf = torch.zeros([batch_size, relations.shape[1], self._relation_types]).to(
            self.rel_classifier.weight.device)

        # obtain relation logits
        # chunk processing to reduce memory usage
        for i in range(0, relations.shape[1], self._max_pairs):
            # classify relation candidates
            chunk_rel_logits = self._classify_relations(entity_spans_pool, size_embeddings,
                                                        relations, rel_masks, h_large, i)
            rel_clf[:, i:i + self._max_pairs, :] = chunk_rel_logits

        return entity_clf, rel_clf

    def _forward_eval(self, encodings: torch.tensor, context_masks: torch.tensor, entity_masks: torch.tensor,
                      entity_sizes: torch.tensor, entity_spans: torch.tensor, entity_sample_masks: torch.tensor):
        # get contextualized token embeddings from last transformer layer
        context_masks = context_masks.float()
        h = self.bert(input_ids=encodings, attention_mask=context_masks)['last_hidden_state']

        batch_size = encodings.shape[0]
        ctx_size = context_masks.shape[-1]

        # classify entities
        size_embeddings = self.size_embeddings(entity_sizes)  # embed entity candidate sizes
        entity_clf, entity_spans_pool = self._classify_entities(encodings, h, entity_masks, size_embeddings)

        # ignore entity candidates that do not constitute an actual entity for relations (based on classifier)
        relations, rel_masks, rel_sample_masks = self._filter_spans(entity_clf, entity_spans,
                                                                    entity_sample_masks, ctx_size)

        rel_sample_masks = rel_sample_masks.float().unsqueeze(-1)
        h_large = h.unsqueeze(1).repeat(1, max(min(relations.shape[1], self._max_pairs), 1), 1, 1)
        rel_clf = torch.zeros([batch_size, relations.shape[1], self._relation_types]).to(
            self.rel_classifier.weight.device)

        # obtain relation logits
        # chunk processing to reduce memory usage
        for i in range(0, relations.shape[1], self._max_pairs):
            # classify relation candidates
            chunk_rel_logits = self._classify_relations(entity_spans_pool, size_embeddings,
                                                        relations, rel_masks, h_large, i)
            # apply sigmoid
            chunk_rel_clf = torch.sigmoid(chunk_rel_logits)
            rel_clf[:, i:i + self._max_pairs, :] = chunk_rel_clf

        rel_clf = rel_clf * rel_sample_masks  # mask

        # apply softmax
        entity_clf = torch.softmax(entity_clf, dim=2)

        return entity_clf, rel_clf, relations

    def _classify_entities(self, encodings, h, entity_masks, size_embeddings):
        # max pool entity candidate spans
        m = (entity_masks.unsqueeze(-1) == 0).float() * (-1e30)
        entity_spans_pool = m + h.unsqueeze(1).repeat(1, entity_masks.shape[1], 1, 1)
        entity_spans_pool = entity_spans_pool.max(dim=2)[0]

        # get cls token as candidate context representation
        entity_ctx = get_token(h, encodings, self._cls_token)

        # create candidate representations including context, max pooled span and size embedding
        entity_repr = torch.cat([entity_ctx.unsqueeze(1).repeat(1, entity_spans_pool.shape[1], 1),
                                 entity_spans_pool, size_embeddings], dim=2)
        entity_repr = self.dropout(entity_repr)

        # classify entity candidates
        entity_clf = self.entity_classifier(entity_repr)

        return entity_clf, entity_spans_pool

    def _classify_relations(self, entity_spans, size_embeddings, relations, rel_masks, h, chunk_start):
        batch_size = relations.shape[0]

        # create chunks if necessary
        if relations.shape[1] > self._max_pairs:
            relations = relations[:, chunk_start:chunk_start + self._max_pairs]
            rel_masks = rel_masks[:, chunk_start:chunk_start + self._max_pairs]
            h = h[:, :relations.shape[1], :]

        # get pairs of entity candidate representations
        entity_pairs = util.batch_index(entity_spans, relations)
        entity_pairs = entity_pairs.view(batch_size, entity_pairs.shape[1], -1)

        # get corresponding size embeddings
        size_pair_embeddings = util.batch_index(size_embeddings, relations)
        size_pair_embeddings = size_pair_embeddings.view(batch_size, size_pair_embeddings.shape[1], -1)

        # relation context (context between entity candidate pair)
        # mask non entity candidate tokens
        m = ((rel_masks == 0).float() * (-1e30)).unsqueeze(-1)
        rel_ctx = m + h
        # max pooling
        rel_ctx = rel_ctx.max(dim=2)[0]
        # set the context vector of neighboring or adjacent entity candidates to zero
        rel_ctx[rel_masks.to(torch.uint8).any(-1) == 0] = 0

        # create relation candidate representations including context, max pooled entity candidate pairs
        # and corresponding size embeddings
        rel_repr = torch.cat([rel_ctx, entity_pairs, size_pair_embeddings], dim=2)
        rel_repr = self.dropout(rel_repr)

        # classify relation candidates
        chunk_rel_logits = self.rel_classifier(rel_repr)
        return chunk_rel_logits

    def _filter_spans(self, entity_clf, entity_spans, entity_sample_masks, ctx_size):
        batch_size = entity_clf.shape[0]
        entity_logits_max = entity_clf.argmax(dim=-1) * entity_sample_masks.long()  # get entity type (including none)
        batch_relations = []
        batch_rel_masks = []
        batch_rel_sample_masks = []

        for i in range(batch_size):
            rels = []
            rel_masks = []
            sample_masks = []

            # get spans classified as entities
            non_zero_indices = (entity_logits_max[i] != 0).nonzero().view(-1)
            non_zero_spans = entity_spans[i][non_zero_indices].tolist()
            non_zero_indices = non_zero_indices.tolist()

            # create relations and masks
            for i1, s1 in zip(non_zero_indices, non_zero_spans):
                for i2, s2 in zip(non_zero_indices, non_zero_spans):
                    if i1 != i2:
                        rels.append((i1, i2))
                        rel_masks.append(sampling.create_rel_mask(s1, s2, ctx_size))
                        sample_masks.append(1)

            if not rels:
                # case: fewer than two spans classified as entities
                batch_relations.append(torch.tensor([[0, 0]], dtype=torch.long))
                batch_rel_masks.append(torch.tensor([[0] * ctx_size], dtype=torch.bool))
                batch_rel_sample_masks.append(torch.tensor([0], dtype=torch.bool))
            else:
                # case: at least two spans classified as entities
                batch_relations.append(torch.tensor(rels, dtype=torch.long))
                batch_rel_masks.append(torch.stack(rel_masks))
                batch_rel_sample_masks.append(torch.tensor(sample_masks, dtype=torch.bool))

        # stack
        device = self.rel_classifier.weight.device
        batch_relations = util.padded_stack(batch_relations).to(device)
        batch_rel_masks = util.padded_stack(batch_rel_masks).to(device)
        batch_rel_sample_masks = util.padded_stack(batch_rel_sample_masks).to(device)

        return batch_relations, batch_rel_masks, batch_rel_sample_masks

    def forward(self, *args, evaluate=False, **kwargs):
        if not evaluate:
            return self._forward_train(*args, **kwargs)
        else:
            return self._forward_eval(*args, **kwargs)
Example #19
 def get_vision_text_model(self, vision_config, text_config):
     vision_model = CLIPVisionModel(vision_config).eval()
     text_model = BertModel(text_config).eval()
     return vision_model, text_model
Example #20
    if args.seed == 0:
        args.seed = random.randint(0, 100)
        set_seed(args)

    helper = DataHelper(gz=True, config=args)
    args.n_type = helper.n_type  # 2

    # Set datasets
    Full_Loader = helper.train_loader
    # Subset_Loader = helper.train_sub_loader
    dev_example_dict = helper.dev_example_dict
    dev_feature_dict = helper.dev_feature_dict
    eval_dataset = helper.dev_loader

    roberta_config = BC.from_pretrained(args.bert_model)
    encoder = BertModel.from_pretrained(args.bert_model)
    args.input_dim = roberta_config.hidden_size
    model = BertSupportNet(config=args, encoder=encoder)
    if args.trained_weight is not None:
        model.load_state_dict(torch.load(args.trained_weight))
    if args.n_gpu > 0 and args.model_gpu != '-1':
        model.to('cuda')

    # Initialize optimizer and criterions
    lr = args.lr
    t_total = len(
        Full_Loader) * args.epochs // args.gradient_accumulation_steps
    warmup_steps = 0.1 * t_total
    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
Example #21
 def __init__(self):
     super(BertEncoder, self).__init__()
     self.model_config = BertConfig.from_pretrained('MTBERT',
                                                    output_hidden_states=True)
     self.bert = BertModel.from_pretrained('MTBERT',
                                           config=self.model_config)
Example #22
 def __init__(self, state_path, top_model=CLSTopModel(), use_loc_ids=False):
     super(EndToEnd, self).__init__()
     self.bert = BertModel.from_pretrained(state_path)
     self.top_model = top_model
     self.clip_param_grad = None
     self.use_loc_ids = use_loc_ids
Example #23
 def __init__(self, opt):
     super().__init__()
     self.bert = BertModel.from_pretrained(opt.bert_path)
     self.linear = nn.Linear(opt.bert_hid_size, 1)
Example #24
def AE(df):

    model_type = 'bert-base-uncased'

    tokenizer = BertTokenizer.from_pretrained(model_type)
    model = BertModel.from_pretrained(model_type, return_dict=True)
    mask_model = BertForMaskedLM.from_pretrained(model_type, return_dict=True)

    sep_token = '[SEP]'
    mask_token = '[MASK]'

    mask_id = tokenizer(mask_token)['input_ids'][1]
    sep_id = tokenizer(sep_token)['input_ids'][1]

    optimizer = AdamW(model.parameters())
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    mask_model.to(device)

    auxiliary_tokens = ['the', 'aspect', 'term', 'is']

    df['mask_tokens'] = 0
    df['auxiliary_tokens'] = 0
    df = df.astype('object')

    for i in range(len(df)):

        #for j in range(len(df['aspect_terms'].iloc[i])):
        auxiliary_sents = []
        for j in range(len(df['aspect_terms'].iloc[i])):
            aspect_terms = df['aspect_terms'].iloc[i][j]
            auxiliary_sent = auxiliary_tokens + [aspect_terms] + [
                sep_token
            ] + df['tokens'].iloc[i]
            auxiliary_sents.append(auxiliary_sent)

        mask_sent = auxiliary_tokens + [mask_token] + [sep_token
                                                       ] + df['tokens'].iloc[i]
        df['mask_tokens'].iloc[i] = mask_sent
        df['auxiliary_tokens'].iloc[i] = auxiliary_sents

    df['distance'] = 0
    df = df.astype('object')

    for i in range(len(df)):

        tokenized = tokenizer.encode(df['mask_tokens'].iloc[i])

        sep_index = tokenized.index(sep_id)
        mask_index = tokenized.index(mask_id)

        tokenized = pd.Series([tokenized])

        padded = pad_sequences(tokenized,
                               maxlen=MAX_LEN,
                               dtype="long",
                               value=0,
                               truncating="post",
                               padding="post")

        attention_mask = np.where(padded != 0, 1, 0)

        input_ids = torch.tensor(padded).to(device)
        attention_mask = torch.tensor(attention_mask).to(device)

        with torch.no_grad():
            last_hidden_states = model(input_ids,
                                       attention_mask=attention_mask)

        original_mask_embedding = last_hidden_states[0][:, mask_index, :].cpu(
        ).numpy()

        distance = []

        for pertubed_index in range(sep_index + 1, MAX_LEN):
            padded = pad_sequences(tokenized,
                                   maxlen=MAX_LEN,
                                   dtype="long",
                                   value=0,
                                   truncating="post",
                                   padding="post")
            if padded[0][pertubed_index] != 0 and padded[0][
                    pertubed_index] != sep_id:
                #print(padded.shape)
                cur_id = padded[0][pertubed_index]
                padded[0][pertubed_index] = mask_id

                cur_embedding = mask_embedding(model, padded, mask_index)
                d = dist(original_mask_embedding, cur_embedding)
                distance.append((cur_id, d))

        df['distance'].iloc[i] = distance

    df['perturbed_mask_index'] = 0
    df = df.astype('object')

    for i in range(len(df)):
        perturbed_mask_index = []
        mask_threshold = calculate_threshold(
            np.array(df['distance'].iloc[i])[:, 1], std_strength)
        for dis_index in range(len(df['distance'].iloc[i])):
            if df['distance'].iloc[i][dis_index][1] < mask_threshold and df[
                    'labels'].iloc[i][dis_index] != 'B' and df['labels'].iloc[
                        i][dis_index] != 'I':
                perturbed_mask_index.append(dis_index)

        df['perturbed_mask_index'].iloc[i] = perturbed_mask_index

    df['augment_token_id'] = 0
    df = df.astype('object')

    for i in range(len(df)):

        augment_tokenizeds = []

        for j in range(len(df['aspect_terms'].iloc[i])):

            tokenized = tokenizer.encode(df['auxiliary_tokens'].iloc[i][j])
            tokenized = torch.Tensor(tokenized).unsqueeze(0).to(
                torch.int64).to(device)
            augment_tokenized = tokenizer.encode(
                df['auxiliary_tokens'].iloc[i][j])

            for k in range(len(df['perturbed_mask_index'].iloc[i])):
                mask_tokenized = tokenizer.encode(
                    df['auxiliary_tokens'].iloc[i][j])
                sep_index = mask_tokenized.index(sep_id)
                perturbed_mask_index = df['perturbed_mask_index'].iloc[i][
                    k] + sep_index + 1
                mask_tokenized[perturbed_mask_index] = mask_id

                mask_tokenized = torch.Tensor(mask_tokenized).unsqueeze(0).to(
                    torch.int64).to(device)

                outputs = mask_model(mask_tokenized, labels=tokenized)
                augment_tokenized[perturbed_mask_index] = int(
                    outputs.logits[:, perturbed_mask_index, :].argmax().cpu(
                    ).numpy())

            augment_tokenizeds.append(augment_tokenized)

        df['augment_token_id'].iloc[i] = augment_tokenizeds

    df['augment_tokens'] = 0
    df = df.astype('object')

    for i in range(len(df)):

        tokens_lists = []

        for j in range(len(df['aspect_terms'].iloc[i])):

            tokens_list = []

            for k in range(1, len(df['augment_token_id'].iloc[i][j]) - 1):

                tokens_list.append(
                    tokenizer.decode([df['augment_token_id'].iloc[i][j][k]]))

            sep_index = tokens_list.index(sep_token)
            tokens_list = tokens_list[sep_index + 1:]
            tokens_lists.append(tokens_list)

        df['augment_tokens'].iloc[i] = tokens_lists

    return df
Example #25
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1

    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(
                    histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(
            list(
                itertools.chain.from_iterable(
                    fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
                    fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
                    fold_trn_df.answer.apply(lambda x: x.split(' '))))
        ).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=True,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode='tq_a',
            TBSEP='[TBSEP]',
            pos_id_type='all_one',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=True,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode='tq_a',
            TBSEP='[TBSEP]',
            pos_id_type='all_one',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        fobj = BCEWithLogitsLoss()
        state_dict = BertModel.from_pretrained(MODEL_PRETRAIN).state_dict()
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=len(LABEL_COL),
            config_path=MODEL_CONFIG_PATH,
            state_dict=state_dict,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            if epoch < 1:
                model.freeze_unfreeze_bert(freeze=True, logger=logger)
            else:
                model.freeze_unfreeze_bert(freeze=False, logger=logger)
            model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader,
                                       DEVICE)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader, DEVICE, mode='valid')

            scheduler.step()
            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [
                    trn_loss,
                ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(val_loss)
            else:
                histories['val_loss'][fold] = [
                    val_loss,
                ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(val_metric)
            else:
                histories['val_metric'][fold] = [
                    val_metric,
                ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(val_metric_raws)
            else:
                histories['val_metric_raws'][fold] = [
                    val_metric_raws,
                ]

            logging_val_metric_raws = ''
            for val_metric_raw in val_metric_raws:
                logging_val_metric_raws += f'{float(val_metric_raw):.4f}, '

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}', logger)
            model = model.to('cpu')
            model = model.module
            save_checkpoint(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', model,
                            optimizer, scheduler, histories, val_y_preds,
                            val_y_trues, val_qa_ids, fold, epoch, val_loss,
                            val_metric)
        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(
                histories["val_metric"][fold])])
        save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                                      trn_dataset.tokenizer,
                                      clean=False)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''
    for metric_stats_raw in fold_best_metrics_raws_mean:
        fold_raw_stats += f'{float(metric_stats_raw):.4f},'
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
Example #26
 def __init__(self, drop_rate, output_size):
     super().__init__()
     self.bert = BertModel.from_pretrained("bert-base-uncased")
     self.drop = torch.nn.Dropout(drop_rate)
     # 768 dimensions to match the BERT output size
     self.fc = torch.nn.Linear(768, output_size)
Example #27
import os
import sys
import json
from transformers import BertModel, TFBertModel

if __name__ == '__main__':
    path = sys.argv[1]
    print(path)
    conf_path = os.path.join(path, 'config.json')
    c = json.load(open(conf_path))
    c['model_type'] = 'bert'
    json.dump(c, open(conf_path, 'w'))
    model = TFBertModel.from_pretrained(path, from_pt=True)
    model.save_pretrained(path)
    # test loads
    model = BertModel.from_pretrained(path)
    model = TFBertModel.from_pretrained(path)
Example #28
# coding:utf-8
import sys

sys.path.append('../')
import torch
import numpy as np
import torch.nn as nn
from common.tree import head_to_adj
from common.transformer_encoder import TransformerEncoder
from common.RGAT import RGATEncoder
from transformers import BertModel, BertConfig

bert_config = BertConfig.from_pretrained("bert-base-uncased")
bert_config.output_hidden_states = True
bert_config.num_labels = 3
bert = BertModel.from_pretrained("bert-base-uncased", config=bert_config)


class RGATABSA(nn.Module):
    def __init__(self, args, emb_matrix=None):
        super().__init__()
        in_dim = args.hidden_dim + args.bert_out_dim
        self.args = args
        self.enc = ABSAEncoder(args)
        self.classifier = nn.Linear(in_dim, args.num_class)
        self.dropout = nn.Dropout(0.1)

    def forward(self, inputs):
        outputs = self.enc(inputs)
        outputs = self.dropout(outputs)
        logits = self.classifier(outputs)
Example #29
def create_data_loader(df, tokenizer, max_len, batch_size):
    # The opening of this snippet is truncated; a Dataset wrapping the dataframe is built here.
    # The class name ReviewDataset is a stand-in, not from the original.
    ds = ReviewDataset(
        targets=df.score.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=1
    )


train_data_loader = create_data_loader(
    df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)


# freeze all the parameters
for param in bert_model.parameters():
    param.requires_grad = False


class SentimentClassifier(nn.Module):

    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.relu = nn.ReLU()
        self.dense1 = nn.Linear(768, 512)  # 768 = BERT hidden size
Example #30
# -*- coding: utf-8 -*-
# @project:wholee_keyword
# @author:caojinlei
# @file: bert_embedding.py
# @time: 2021/05/07
import torch
from transformers import BertTokenizer, BertModel, BertConfig

model_name = 'uncased_L-12_H-768_A-12'
tokenizer = BertTokenizer.from_pretrained(model_name)
model_config = BertConfig.from_pretrained(model_name)
model_config.output_hidden_states = True
model_config.output_attentions = True
bert_model = BertModel.from_pretrained(model_name, config=model_config)

# s = 'i have a pen'
s = 'i have a apple'
sen_code = tokenizer.encode(s)
print(sen_code)
sen_word = tokenizer.convert_ids_to_tokens(sen_code)
tokens_tensor = torch.LongTensor([sen_code])
segments_tensors = torch.zeros(len(sen_code), dtype=int)

# Static word vectors (looked up directly from the embedding matrix, no context)
emb = bert_model.embeddings.word_embeddings.weight.data
sen_emb = []
for i in sen_code:
    sen_emb.append(emb[i])
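
The snippet stops at the static vectors; a possible continuation (assumed, not part of the original) contrasts them with contextual embeddings from a forward pass:

# assumed continuation: contextual embeddings for the same sentence
with torch.no_grad():
    outputs = bert_model(tokens_tensor)
contextual_emb = outputs[0][0]  # [seq_len, 768]: one vector per token, conditioned on the whole sentence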