Example no. 1
    def __init__(self, embedding_dir, model_name="bert-base-multilingual-cased", layer=-2):
        super(BertEncoder, self).__init__(embedding_dir)

        # Load pre-trained model (weights) and set to evaluation mode (no more training)
        self.model = BertModel.from_pretrained(model_name)
        self.model.eval()

        # Load word piece tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(model_name)

        # Layer from which to get the embeddings
        self.layer = layer
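Only the constructor is shown above. Below is a minimal standalone sketch (not the original class's method) of how such an encoder can extract embeddings from the configured layer, assuming the pytorch_pretrained_bert calling convention used throughout these examples.

import torch
from pytorch_pretrained_bert import BertModel, BertTokenizer

model_name = "bert-base-multilingual-cased"
layer = -2  # second-to-last encoder layer, as in the constructor above

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
model.eval()

token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("a short example sentence"))
tokens_tensor = torch.tensor([token_ids])

with torch.no_grad():
    # output_all_encoded_layers=True returns one tensor per encoder layer
    encoded_layers, _ = model(tokens_tensor, output_all_encoded_layers=True)

embeddings = encoded_layers[layer]  # shape: (1, num_tokens, 768)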
Example no. 2
    def __init__(self,
                 tag_size,
                 top_rnns=False,
                 device='cpu',
                 finetuning=False):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-chinese')

        self.top_rnns = top_rnns
        if top_rnns:
            self.rnn = nn.LSTM(bidirectional=True,
                               num_layers=2,
                               input_size=768,
                               hidden_size=768 // 2,
                               batch_first=True)
        self.fc = nn.Linear(768, tag_size)

        self.device = device
        self.finetuning = finetuning
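The forward pass is not included; the sketch below only illustrates the shape flow these layers imply (assumed batch size, sequence length, and tag_size, not the original forward method).

import torch
import torch.nn as nn

rnn = nn.LSTM(bidirectional=True, num_layers=2, input_size=768,
              hidden_size=768 // 2, batch_first=True)
fc = nn.Linear(768, 10)            # tag_size = 10, chosen for illustration

enc = torch.randn(8, 74, 768)      # stands in for BERT's last hidden states
enc, _ = rnn(enc)                  # (8, 74, 768): 2 directions * (768 // 2)
logits = fc(enc)                   # (8, 74, 10) per-token tag scores
y_hat = logits.argmax(-1)          # (8, 74) predicted tag ids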
Example no. 3
    def __init__(self, config, static=False):
        super(TextCNN, self).__init__()

        model_bert = BertModel.from_pretrained('bert-base-uncased')
        pre_trained_embed = model_bert.embeddings.word_embeddings.weight

        D = pre_trained_embed.shape[1]
        C = config.hidden_size
        Ci = 1
        Co = config.cnn_kernel_num
        Ks = config.cnn_kernel_sizes

        self.embed = nn.Embedding.from_pretrained(pre_trained_embed)
        self.convs = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])
        self.dropout = nn.Dropout(config.cnn_dropout_prob)
        self.fc1 = nn.Linear(len(Ks) * Co, C)

        if static:
            self.embed.weight.requires_grad = False
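The forward pass is not shown above; below is a standalone sketch of the standard Kim-style CNN wiring these layers suggest (illustrative dimensions, not the original config values).

import torch
import torch.nn as nn
import torch.nn.functional as F

D, Ci, Co, Ks, C = 768, 1, 100, [3, 4, 5], 256          # illustrative values
embed = nn.Embedding(30522, D)                          # stand-in for BERT's word embeddings
convs = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])
dropout = nn.Dropout(0.1)
fc1 = nn.Linear(len(Ks) * Co, C)

x = torch.randint(0, 30522, (8, 50))                    # (batch, seq_len) token ids
x = embed(x).unsqueeze(1)                               # (batch, 1, seq_len, D)
x = [F.relu(conv(x)).squeeze(3) for conv in convs]      # each: (batch, Co, seq_len - K + 1)
x = [F.max_pool1d(t, t.size(2)).squeeze(2) for t in x]  # each: (batch, Co)
out = fc1(dropout(torch.cat(x, dim=1)))                 # (batch, C)
print(out.shape)                                        # torch.Size([8, 256])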
Example no. 4
    def __init__(self,
                 top_rnns=False,
                 vocab_size=None,
                 device='cpu',
                 finetuning=False):
        super().__init__()
        self.bert = BertModel.from_pretrained(config.Config.bert_model)

        self.top_rnns = top_rnns
        if top_rnns:
            self.rnn = nn.LSTM(bidirectional=True,
                               num_layers=2,
                               input_size=768,
                               hidden_size=768 // 2,
                               batch_first=True)  #[128, 74, 768]
        self.fc = nn.Linear(768, vocab_size)

        self.device = device
        self.finetuning = finetuning
Example no. 5
def main():
    args = parser.parse_args()
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    model = BertModel.from_pretrained('bert-large-uncased')
    model.eval().cuda(args.gpu)

    datadir = args.data

    pir_dataset = product_image_retrieval(
        datadir,
        args.set,
    )

    loader = torch.utils.data.DataLoader(pir_dataset,
                                         batch_size=args.batch_size,
                                         shuffle=True,
                                         num_workers=args.workers,
                                         pin_memory=True,
                                         drop_last=False)

    length_dir = len(
        pd.read_csv(os.path.join(datadir, 'splitted', args.set) + '.csv'))
    feats = np.zeros((length_dir, 1024))

    text = 'pattern recognition is so hard for me'
    for i, (texts, index) in tqdm(enumerate(loader)):
        # if args.gpu is not None:
        #     text = text.cuda(args.gpu, non_blocking=True)
        tokens_lst = []
        for text in texts:
            tokenized_text = tokenizer.tokenize(text)
            indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
            tokens_lst.append(indexed_tokens)
        tokens_tensor = torch.tensor(tokens_lst).cuda(args.gpu,
                                                      non_blocking=True)

        feats_batch = model(tokens_tensor, output_all_encoded_layers=False)[1]
        feats_batch = feats_batch.detach().cpu().numpy()

        for idx in range(len(index)):
            feats[index[idx], :] = feats_batch[idx, :]

    np.save('expr/features_bert' + args.name + '_' + args.set + '.npy', feats)
Example no. 6
    def __init__(self,
                 pretrained_model_name_or_path,
                 num_labels,
                 Encoder1,
                 is_lock=False):

        super(ClassifyModel, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name_or_path)
        self.Encoder = Encoder1
        self.classifier = nn.Linear(768, num_labels)
        self.init_mBloSA()
        self.s2tSA = s2tSA(768)

        if is_lock:
            for name, param in self.bert.named_parameters():
                if name.startswith('pooler'):
                    continue
                else:
                    param.requires_grad_(False)
Example no. 7
 def __init__(self, tag_to_ix, hidden_dim=768):
     super(Bert_BiLSTM_CRF, self).__init__()
     self.tag_to_ix = tag_to_ix
     self.tagset_size = len(tag_to_ix)
     # self.hidden = self.init_hidden()
     self.lstm = nn.LSTM(bidirectional=True, num_layers=2, input_size=768, hidden_size=hidden_dim//2, batch_first=True)
     self.transitions = nn.Parameter(torch.randn(
         self.tagset_size, self.tagset_size
     ))
     self.hidden_dim = hidden_dim
     self.start_label_id = self.tag_to_ix['[CLS]']
     self.end_label_id = self.tag_to_ix['[SEP]']
     self.fc = nn.Linear(hidden_dim, self.tagset_size)
     self.bert = BertModel.from_pretrained('/root/workspace/qa_project/chinese_L-12_H-768_A-12')
     # self.bert.eval()  # only used to extract the BERT embeddings
     
     self.transitions.data[self.start_label_id, :] = -10000
     self.transitions.data[:, self.end_label_id] = -10000
     self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
Example no. 8
    def __init__(self,
                 max_seq_length,
                 max_q_length,
                 max_a_length,
                 embedding_dim=768,
                 prev_history=2,
                 use_gpu=False,
                 bert_model=None):
        super(BaseModel, self).__init__()

        self.bert_model = BertModel.from_pretrained('bert-base-uncased')
        if bert_model != None:
            self.bert_model.load_state_dict(bert_model)

        if True:
            for name, param in self.bert_model.named_parameters():
                if "encoder.layer.11" not in name:
                    param.requires_grad = False

        self.max_seq_length = max_seq_length
        self.max_q_length = max_q_length
        self.max_a_length = max_a_length
        self.embedding_dim = embedding_dim
        self.prev_history = prev_history
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bi_gru_layer1 = torch.nn.GRU(
            (prev_history * 2 + 1) * embedding_dim,
            embedding_dim,
            batch_first=True,
            bidirectional=True)
        self.linear_start = torch.nn.Linear(
            (prev_history * 2 + 3) * embedding_dim, 1)
        self.softmax = torch.nn.Softmax(dim=1)
        self.bi_gru_layer2 = torch.nn.GRU(2 * embedding_dim,
                                          embedding_dim,
                                          batch_first=True,
                                          bidirectional=True)
        self.linear_end = torch.nn.Linear(
            (prev_history * 2 + 3) * embedding_dim, 1)
        self.answer_type_layer = torch.nn.Linear(
            (prev_history * 2 + 3) * embedding_dim, 3)
        self.CUDA = torch.cuda.is_available() and use_gpu
Example no. 9
def BERT(textA, textB):
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

    # Tokenized input
    tokenized_textA = tokenizer.tokenize(textA)
    tokenized_textB = tokenizer.tokenize(textB)

    # Mask a token that we will try to predict back with `BertForMaskedLM`
    masked_index = len(tokenized_textA) + tokenized_textB.index('$')
    tokenized_text = tokenized_textA + tokenized_textB
    tokenized_text[masked_index] = '[MASK]'
    #print(tokenized_textB)

    # Convert token to vocabulary indices
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
    segments_ids = [0] * len(tokenized_textA) + [1] * len(tokenized_textB)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    
    # Load pre-trained model (weights)
    model = BertModel.from_pretrained('bert-large-uncased')
    model.eval()

    # Predict hidden states features for each layer
    encoded_layers, _ = model(tokens_tensor, segments_tensors)
    # We have one hidden state for each of the 24 layers in bert-large-uncased
    # Load pre-trained model (weights)
    
    model = BertForMaskedLM.from_pretrained('bert-large-uncased')
    model.eval()

    # Predict all tokens
    predictions = model(tokens_tensor, segments_tensors)

    # confirm we were able to predict 
    predicted_index = torch.argmax(predictions[0, masked_index]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    return predicted_token
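A hypothetical call to the function above (an assumption about intended usage: the `$` in textB marks the token to be masked and recovered):

guess = BERT("who was jim henson ?", "jim $ was a puppeteer")
print(guess)  # the model's prediction for the masked position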
Example no. 10
def customEmbeddingTest():
    global sortedWordSents

    bertModel = BertModel.from_pretrained('/media/yuan/Samsung_T5/Documents/BERT/bert-base-chinese')
    bertModel.eval()

    # sortedWordSents = sorted(origWordSents, key=lambda e: len(e), reverse=True)
    # print(sortedWordSents)
    # wordsSents = ["[CLS]" + sent + "[SEP]" for sent in origWordSents]
    # print(wordsSents)

    tokenizedSents = [tokenizer.tokenize(sent) for sent in origWordSents]
    origSentsLens = [len(sent) for sent in tokenizedSents]
    tokenizedSents = [["[CLS]"] + sent + ["[SEP]"] for sent in tokenizedSents]
    targetedWordIdxs = [sent.index('还') for sent in tokenizedSents]
    wordIdxs = [tokenizer.convert_tokens_to_ids(sent) for sent in tokenizedSents]

    #maxLen = len(wordIdxs[0])
    maxLen = max([len(sent) for sent in wordIdxs])
    paddedInputIds = [sent + [0] * (maxLen - len(sent)) for sent in wordIdxs]
    paddedInputIds = torch.tensor(paddedInputIds)

    attentionMask = torch.tensor([[float(i > 0) for i in sent] for sent in paddedInputIds])

    allLayerEmbeds, _ = bertModel(paddedInputIds, attention_mask=attentionMask,
                                  output_all_encoded_layers=True)
    with torch.no_grad():
        concatSentEmbeds = []
        layerIdxs = [-1, -2, -3, -4]
        for sentIdx, sent in enumerate(tokenizedSents):
            selectedLayersForSent = []
            for tokenIdx in range(maxLen):
                selectedLayersForToken = []
                if tokenIdx == 0 or tokenIdx == origSentsLens[sentIdx] + 1:
                    continue
                for layerIdx in layerIdxs:
                    layerEmbeds = allLayerEmbeds[layerIdx].detach().cpu()[sentIdx]
                    selectedLayersForToken.append(layerEmbeds[tokenIdx])
                selectedLayersForSent.append(torch.cat(selectedLayersForToken))
            concatSentEmbeds.append(torch.stack(selectedLayersForSent))

    return torch.stack(concatSentEmbeds)
Example no. 11
def prepare(params, samples):
    
    if params['cache'] is None: # check whether cache is already provided
        params['cache'] = load_cache(params.model_name, params.current_task, params.cache_path) # try to load cache

        if params['cache'] is None: # if there is no cache saved, then construct encoder model
            print("Constructing Encoder Model")
            params['cache'] = {}

            # ====== Construct Model ====== #
            model = BertModel.from_pretrained(args.model_name)
            model = torch.nn.DataParallel(model)
            tokenizer = BertTokenizer.from_pretrained(args.model_name, do_lower_case=True)

            params['model'] = model
            params['tokenizer'] = tokenizer
            params['flag_save'] = True
             
    # ====== Initialize Counter ====== #
    params['count'] = 0
Example no. 12
    def __init__(self, opt):
        super(bert_att_mis, self).__init__()

        self.opt = opt
        self.model_name = 'bert_att_mis'
        self.test_scale_p = 0.5

        self.bert_model = BertModel.from_pretrained(opt.bert_path)
        self.bert_model.cuda()

        self.bags_feature = []

        rel_dim = opt.rel_dim

        self.rel_embs = nn.Parameter(torch.randn(self.opt.rel_num, rel_dim))
        self.rel_bias = nn.Parameter(torch.randn(self.opt.rel_num))

        self.dropout = nn.Dropout(self.opt.drop_out)

        self.init_model_weight()
Example no. 13
    def __init__(self):
        super(Model4, self).__init__()
        self.bert = BertModel.from_pretrained(
            r'D:\bert_weight_Chinese\chinese_L-12_H-768_A-12\bert-base-chinese.tar'
        )
        for param in self.bert.parameters():
            param.requires_grad = False

        self.conv1 = nn.Conv2d(
            1, 100, kernel_size=(1, 16 * 768),
            stride=1)  # params: in_channels, out_channels (number of filters), kernel size (H, W), stride
        self.conv2 = nn.Conv2d(1, 100, kernel_size=(2, 16 * 768), stride=1)
        self.conv3 = nn.Conv2d(1, 100, kernel_size=(3, 16 * 768), stride=1)
        self.conv4 = nn.Conv2d(1, 100, kernel_size=(4, 16 * 768), stride=1)
        self.conv5 = nn.Conv2d(1, 100, kernel_size=(5, 16 * 768), stride=1)

        self.dp1 = nn.Dropout(0.1)
        self.dense1 = nn.Linear(500 * 4, 500 * 2)
        self.dense2 = nn.Linear(500 * 2, 200)
        self.dense3 = nn.Linear(200, 2)
Example no. 14
    def vectorize(self, sentence):

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        marked_text = "[CLS] " + sentence + " [SEP]"

        #print(marked_text)
        tokenized_text = tokenizer.tokenize(marked_text)
        #print(tokenized_text)
        segments_ids = [1] * len(tokenized_text)
        #print(segments_ids)
        segments_tensors = torch.tensor([segments_ids])
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens])
        #print(tokens_tensor)
        model = BertModel.from_pretrained('bert-base-uncased')
        with torch.no_grad():
            encoded_layers, _ = model(tokens_tensor, segments_tensors)
        sentence_embedding = torch.mean(encoded_layers[11], 1)
        #print(sentence_embedding)
        return sentence_embedding
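Assuming the method above belongs to some wrapper object (not shown), a call would look like the following; the result is the mean of the 12th encoder layer over all tokens.

emb = some_encoder.vectorize("pattern recognition is hard")  # some_encoder is hypothetical
print(emb.shape)  # torch.Size([1, 768])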
Example no. 15
def BERT_initializer(line1):

    tokenized_text = tokenizer.tokenize(line1)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    segments_ids = [1] * len(tokenized_text)

    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    model = BertModel.from_pretrained('bert-base-multilingual-cased')
    # Put the model in "evaluation" mode, meaning feed-forward operation.
    model.eval()
    # Predict hidden states features for each layer
    with torch.no_grad():
        encoded_layers, _ = model(tokens_tensor, segments_tensors)

    token_embedding = token_embeddings(encoded_layers)

    return token_embedding, tokenized_text, segments_ids
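The `token_embeddings` helper is not defined in this snippet; a plausible version (an assumption based on the common per-token embedding pattern) is sketched below.

import torch

def token_embeddings(encoded_layers):
    # encoded_layers: list of (1, num_tokens, hidden) tensors, one per layer
    stacked = torch.stack(encoded_layers, dim=0)   # (num_layers, 1, num_tokens, hidden)
    stacked = stacked.squeeze(1)                   # (num_layers, num_tokens, hidden)
    return stacked.permute(1, 0, 2)                # (num_tokens, num_layers, hidden)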
Example no. 16
def query_encow(query_id, sentences, N=5000):
    # GPU available?
    CUDA = torch.cuda.is_available()

    # initialize the bert model
    print(f"Initializing BERT model {'with' if CUDA else 'without'} CUDA...", end='')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    if CUDA:
        model = model.to('cuda')
    model.eval()
    print(" OK.")

    aggregated_pairs = []
    for i in range(80):
        vecs, sents = read_pickle(f'encow/encow_sent.txt.{str(i).zfill(3)}')
        aggregated_pairs += query(model, tokenizer, vecs, sents, sentences, CUDA)
        print("Querying " + f'({i + 1}/80)' + ('.' * ((i % 3) + 1)) + '      ', end='\r')

    return query_id, sorted([(sim, sent[6:-6]) for sim, sent in aggregated_pairs], reverse=True, key=lambda x: x[0])[:N]
Example no. 17
def main():
    df = pd.read_pickle("../data/contextual_similarity_df")
    print ("Initiating CURTIS!...") 
    pretrained_model = 'bert-base-uncased'
    tokenizer = initialize_tokenizer(pretrained_model)
    model = BertModel.from_pretrained(pretrained_model)
    # starting feed forward network
    model.eval()
    print ("CURTIS: Hey Kaushik, I am happy to have you here again!")
    _ = input("CURTIS: How are you?\nKaushik: ")
    user_question = input("CURTIS: You are having a rough time \nCURTIS: What makes you feel like this?\nKaushik: ")
    user_question_context = input("CURTIS: Please provide more context for your problem\nKaushik: ")
    user_vec = get_contextual_vector(model, tokenizer, user_question, user_question_context)
    for i in range(df.shape[0]):
        try:
            cs_dist = cosine(user_vec, df.loc[i, "contextual_vector"])
        except:
            cs_dist = 1
        df.loc[i, "similarity_score"] = 1 - cs_dist
    print ("CURTIS:", df[df.similarity_score == df.similarity_score.max()].reset_index().reflection[0])
Example no. 18
 def define_module(self):
     self.encoder = BertModel.from_pretrained('bert-base-uncased')
     for param in self.encoder.parameters():
         param.requires_grad = False
     self.bert_linear = nn.Linear(768, self.ninput)
     self.drop = nn.Dropout(self.drop_prob)
     if self.rnn_type == 'LSTM':
         # dropout: If non-zero, introduces a dropout layer on
         # the outputs of each RNN layer except the last layer
         self.rnn = nn.LSTM(self.ninput, self.nhidden,
                            self.nlayers, batch_first=True,
                            dropout=self.drop_prob,
                            bidirectional=self.bidirectional)
     elif self.rnn_type == 'GRU':
         self.rnn = nn.GRU(self.ninput, self.nhidden,
                           self.nlayers, batch_first=True,
                           dropout=self.drop_prob,
                           bidirectional=self.bidirectional)
     else:
         raise NotImplementedError
Example no. 19
    def __init__(self):
        with open('data/sent_example.pickle', 'rb') as handle:
            self.sent_example_map = pickle.load(handle)
        self.target_embedding_map = {}
        self.wikilinks_embedding_map = {}
        self.target_output_embedding_map = {}
        self.wikilinks_output_embedding_map = {}
        self.stop_sign = "STOP_SIGN_SIGNAL"
        self.db_loaded = False
        self.load_sqlite_db('data/bert_cache_2.db')
        self.server_mode = False

        # Load pre-trained model (weights)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.model = BertModel.from_pretrained('bert-base-uncased')
        self.model.eval()

        # If you have a GPU, put everything on cuda
        self.model.to('cuda')
Example no. 20
    def __init__(self, dropout, output_dim):
        """
        Args:
            embedding_matrix: Pre-trained word embeddings
            embedding_dim: Embedding dimension of the word embeddings
            vocab_size: Size of the vocabulary
            hidden_dim: Size hiddden state
            dropout: Dropout probability
            output_dim: Output classes (Subtask A: 2 = (OFF, NOT))
        """

        super(BertPooling, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        for param in self.bert.parameters():
            param.requires_grad = False

        self.classifier = nn.Linear(768, output_dim)
        self.dropout = nn.Dropout(dropout)
        nn.init.xavier_normal_(self.classifier.weight)
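The forward pass is not shown; the class name suggests pooling the frozen BERT outputs before classification. A standalone sketch of that step, under that assumption:

import torch
import torch.nn as nn

output_dim, dropout_p = 2, 0.3          # illustrative values
classifier = nn.Linear(768, output_dim)
drop = nn.Dropout(dropout_p)

enc = torch.randn(4, 32, 768)           # stands in for BERT's last encoder layer
pooled = enc.mean(dim=1)                # (4, 768) mean-pooled over tokens
logits = classifier(drop(pooled))       # (4, output_dim)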
Example no. 21
    def __init__(self,
                 top_rnns=False,
                 vocab_size=None,
                 device='cpu',
                 finetuning=False):
        super().__init__()
        self.bert = BertModel.from_pretrained(
            '/root/workspace/qa_project/chinese_L-12_H-768_A-12')

        self.top_rnns = top_rnns
        if top_rnns:
            self.rnn = nn.LSTM(bidirectional=True,
                               num_layers=2,
                               input_size=768,
                               hidden_size=768 // 2,
                               batch_first=True)  #[128, 74, 768]
        self.fc = nn.Linear(768, vocab_size)

        self.device = device
        self.finetuning = finetuning
Example no. 22
    def __init__(self, opt):
        self.opt = opt

        if 'bert' in opt.model_name:
            tokenizer = Tokenizer4Bert(opt.max_seq_len,
                                       opt.pretrained_bert_name)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name)
            self.model = opt.model_class(bert, opt).to(opt.device)
        else:
            tokenizer = build_tokenizer(
                fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
                max_seq_len=opt.max_seq_len,
                dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
            embedding_matrix = build_embedding_matrix(
                word2idx=tokenizer.word2idx,
                embed_dim=opt.embed_dim,
                dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                    str(opt.embed_dim), opt.dataset))
            self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

        self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
        self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
        if self.opt.do_predict is True:
            self.predictset = ABSADataset(opt.dataset_file['predict'],
                                          tokenizer)

        assert 0 <= opt.valset_ratio < 1
        if opt.valset_ratio > 0:
            valset_len = int(len(self.trainset) * opt.valset_ratio)
            self.trainset, self.valset = random_split(
                self.trainset, (len(self.trainset) - valset_len, valset_len))
        else:
            self.valset = self.testset

        # tmp setting
        self.testset = self.valset

        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(
                torch.cuda.memory_allocated(device=opt.device.index)))
        self._print_args()
Example no. 23
    def __init__(self,
                 emb_size,
                 hidden_size,
                 vocab_size,
                 nlayers=1,
                 bidirectional=True,
                 rec_unit='LSTM',
                 dropout=0.5):
        """
        Based on https://github.com/komiya-m/MirrorGAN/blob/master/model.py
        :param emb_size: size of word embeddings
        :param hidden_size: size of hidden state of the recurrent unit
        :param vocab_size: size of the vocabulary (output of the network)
        :param rec_unit: type of recurrent unit (default=gru)
        """
        self.dropout = dropout
        self.nlayers = nlayers
        self.bidirectional = bidirectional
        self.num_directions = 2 if self.bidirectional else 1
        __rec_units = {
            'GRU': nn.GRU,
            'LSTM': nn.LSTM,
        }
        assert rec_unit in __rec_units, 'Specified recurrent unit is not available'

        super().__init__(emb_size)

        self.hidden_linear = nn.Linear(emb_size, hidden_size)
        self.encoder = BertModel.from_pretrained('bert-base-uncased')
        for param in self.encoder.parameters():
            param.requires_grad = False

        self.bert_linear = nn.Linear(768, emb_size)
        self.rnn = __rec_units[rec_unit](emb_size,
                                         hidden_size,
                                         num_layers=self.nlayers,
                                         batch_first=True,
                                         dropout=self.dropout,
                                         bidirectional=self.bidirectional)

        self.out = nn.Linear(self.num_directions * hidden_size, vocab_size)
Example no. 24
 def __init__(self,
              model_path='bert-base-uncased',
              length=None,
              cased=False):
     self.length = length
     self.cased = cased
     if cased == True:
         model_path = 'bert-base-cased'
         self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased',
                                                        do_lower_case=False)
     else:
         self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
     self.model = BertForMaskedLM.from_pretrained(model_path)
     self.base_model = BertModel.from_pretrained(model_path)
     self.model.eval()
     self.base_model.eval()
     self.vocab = dict(self.tokenizer.vocab)
     sw_vocab = stop_words.intersection(set(self.vocab.keys()))
     self.sw_indecies = self.tokenizer.convert_tokens_to_ids(list(sw_vocab))
     puncs = list(string.punctuation)
     self.puncs_indecies = self.tokenizer.convert_tokens_to_ids(puncs)
Example no. 25
 def __init__(self, model_name, layer, use_cache=False):
     super().__init__()
     if 'bert' in globals():
         self.bert = globals()['bert']
     else:
         self.bert = BertModel.from_pretrained(model_name)
         globals()['bert'] = self.bert
     for p in self.bert.parameters():
         p.requires_grad = False
     self.layer = layer
     if 'large' in model_name:
         n_layer = 24
     else:
         n_layer = 12
     if self.layer == 'weighted_sum':
         self.weights = nn.Parameter(torch.ones(n_layer, dtype=torch.float))
         self.softmax = nn.Softmax(0)
     if use_cache:
         self._cache = {}
     else:
         self._cache = None
Example no. 26
def load_bert(base_version=True, lower_case=True, device=None):

    if base_version:
        embedding_dim = BERT_BASE_EMBEDDING_DIM
        if lower_case:
            bert_name = 'bert-base-uncased'
        else:
            bert_name = 'bert-base-cased'
    else:
        embedding_dim = BERT_LARGE_EMBEDDING_DIM
        if lower_case:
            bert_name = 'bert-large-uncased'
        else:
            bert_name = 'bert-large-cased'

    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained(bert_name)
    # Load pre-trained model (weights)
    model = BertModel.from_pretrained(bert_name)

    return tokenizer, model, device, embedding_dim
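A hypothetical usage of load_bert (it assumes the enclosing module also defines the BERT_*_EMBEDDING_DIM constants referenced above):

import torch

tokenizer, model, device, embedding_dim = load_bert(base_version=True, lower_case=True)
model.eval()

token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("hello world"))
with torch.no_grad():
    encoded_layers, pooled = model(torch.tensor([token_ids]))
print(encoded_layers[-1].shape)  # (1, num_tokens, 768) for the base model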
Example no. 27
    def __init__(self, config):
        super().__init__()
        self.batchSize = config['model']['batchSize']
        self.dropout = nn.Dropout(config['model']['dropout'])
        self.device = config['DEVICE']
        # number of selected features (number of convolution kernel sizes)
        self.featureLen = config['model']['featureLen']
        self.hiddenSize = config['model']['hiddenSize']
        self.embeddingSize = 768

        self.positionEncoding = PositionalEncoding(self.embeddingSize, dropout = 0.1)
        self.bertModel = BertModel.from_pretrained(config['model']['bert_base_chinese'])

        self.layer = nn.TransformerEncoderLayer(d_model = self.embeddingSize, nhead = 4)

        self.encoder = nn.TransformerEncoder(self.layer, num_layers=2)

        self.cnnArr = nn.ModuleList([nn.Conv2d(in_channels=1, out_channels=self.hiddenSize//self.featureLen, kernel_size=(i, self.embeddingSize))
            for i in range(2, 2+ self.featureLen)])

        self.fc = nn.Linear(self.hiddenSize, len(tagDict))
Example no. 28
    def __init__(self, config, ft, num_labels, H, device_num, C, c_adj, alpha):
        super(BertGCN_Cluster, self).__init__(config)
        self.device = torch.device('cuda:' + device_num)
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.5)
        self.ft = ft
        self.alpha = alpha
        self.H = get_tensor(H, self.device)  # m * 3072
        self.C = get_tensor(C, self.device)  # m * C
        self.c_adj = gen_adj(get_tensor(c_adj, self.device)).detach()  # C * C
        self.num_labels = num_labels
        self.FCN = nn.Linear(768, num_labels)
        self.FCN_gcn = nn.Linear(768, 768)
        self.FCN_H = nn.Linear(H.shape[1], 768)
        self.actv = nn.LeakyReLU(0.2)
        # self.actv = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)
        self.apply(self.init_bert_weights)

        self.W1 = Parameter(torch.Tensor(H.shape[1], 1536))
        self.W2 = Parameter(torch.Tensor(1536, 768))
Example no. 29
def main():
    path = os.path.join("data", "LJSpeech-1.1")
    preprocess_ljspeech(path)

    model_bert = BertModel.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    text_path = os.path.join(hp.dataset_path, "train.txt")
    texts = process_text(text_path)

    if not os.path.exists(hp.bert_embeddings_path):
        os.mkdir(hp.bert_embeddings_path)

    for ind, text in enumerate(texts):
        character = text[0:len(text)-1]
        bert_embedding = get_embedding(character, model_bert, tokenizer)
        np.save(os.path.join(hp.bert_embeddings_path, str(ind) + ".npy"),
                bert_embedding.numpy(), allow_pickle=False)

        if (ind+1) % 100 == 0:
            print("Done", (ind+1))
Example no. 30
    def __init__(self, trigger_size=None, entity_size=None, all_postags=None, postag_embedding_dim=50,
                 argument_size=None, entity_embedding_dim=50, device=torch.device("cpu")):
        super().__init__()
        # self.bert = BertModel.from_pretrained('bert-base-cased')
        self.bert = BertModel.from_pretrained(bert_model_path)

        # hidden_size = 768 + entity_embedding_dim + postag_embedding_dim
        # hidden_size = 768
        hidden_size = 768 * 3
        self.fc1 = nn.Sequential(
            # nn.Dropout(0.5),
            nn.Linear(hidden_size, hidden_size, bias=True),
            nn.ReLU(),
        )
        self.fc_trigger = nn.Sequential(
            nn.Linear(hidden_size, trigger_size),
        )
        self.fc_argument = nn.Sequential(
            nn.Linear(hidden_size * 2, argument_size),
        )
        self.device = device
Example no. 31
 def __init__(self, emb_size, hidden_size, out_rel, n_layers=1, dropout=0.1, emb_drop=0.2,
              gpu=False, pretrained_emb=None, train_emb=True):
     super(EncoderRNN, self).__init__()
     self.gpu = gpu
     # self.input_size = input_size
     self.hidden_size = hidden_size
     self.n_layers = n_layers
     self.dropout = dropout
     self.use_cuda = gpu
     # self.b_size = b_size
     self.emb_size = emb_size
     # self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=0)
     self.bert = BertModel.from_pretrained('bert-base-uncased')
     self.entity_classifier = nn.Linear(emb_size, out_rel)
     # self.bert.weight.requires_grad = False
     self.embedding_dropout = nn.Dropout(emb_drop)
     self.rnn = nn.LSTM(emb_size, hidden_size, n_layers, dropout=self.dropout, batch_first=True)
     if pretrained_emb is not None:
        self.embedding.weight.data.copy_(pretrained_emb)
     if train_emb == False:
        self.embedding.weight.requires_grad = False
Example no. 32
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
print(tokenizer.convert_tokens_to_ids('hello'))  # [1044, 1041, 1048, 1048, 1051]
print(tokenizer.convert_tokens_to_ids(['hello']))  # [7592]
print(tokenizer.convert_tokens_to_ids(['[hello]']))  # KeyError: '[hello]'; can not deal with OOV
print(indexed_tokens)  # [101, 2040, 2001, 3958, 27227, 1029, 102, 3958, 103, 2001, 1037, 13997, 11510, 102]
## Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]  # tokenized_text is split into two sentences: the first 7 tokens and the last 7 tokens

##################################################################
## BertModel
## Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens]); print(tokens_tensor.shape)  # torch.Size([1, 14])
segments_tensors = torch.tensor([segments_ids])

## Load pre-trained model (weights)
model = BertModel.from_pretrained(home + '/datasets/WordVec/pytorch_pretrained_bert/bert-large-uncased/')
model.eval()

## Predict hidden states features for each layer
print(tokens_tensor.shape)  # torch.Size([1, 14])
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, segments_tensors)
## We have one hidden state for each of the 24 layers in bert-large-uncased
print(len(encoded_layers))  # 24
print(encoded_layers[0].shape)  # torch.Size([1, 14, 1024])
x = torch.LongTensor([[1, 2], [3, 4]]); print(x.shape)  # torch.Size([2, 2])
print(model)

##################################################################
## BertForMaskedLM
model = BertForMaskedLM.from_pretrained('/Users/coder352/datasets/WordVec/pytorch_pretrained_bert/bert-large-uncased/')