Example #1
    def forward(self, x):
        #print('x:', str(x))
        batch_size = len(x)
        character_ids = batch_to_ids(x).to(tdevice)
        embeddings = self.elmo(character_ids)['elmo_representations']
        #print('elmo embeddings:', embeddings[0].size())
        X = embeddings[0].view(batch_size, -1, 1024)  # (N, W, D)

        # Pad to 10 words
        if X.size(1) > self.pad_size:
            X = X[:, 0:self.pad_size, :]
        elif X.size(1) < self.pad_size:
            pad = self.pad_size - X.size(1)
            zero_vec = torch.zeros(X.size(0), pad, X.size(2), device=tdevice)
            X = torch.cat((X, zero_vec), dim=1)

        if self.reduce_size > 0:
            X, hn = self.gru(X, None)

        x = X.unsqueeze(1)  # (N, Ci, W, D)
        print('x size:', x.size())
        x_list = []
        for conv in self.convs:
            x_list.append(self.conv_and_pool(x, conv))
        x = torch.cat(x_list, 1)
        x = self.dropout(x)  # (N, len(Ks)*Co)

        logit = self.fc1(x)  # (N, C)
        return logit
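For context, this forward pass assumes an Elmo module, a tdevice, and tokenized input were prepared elsewhere; none of that appears in the snippet. A minimal sketch of that assumed setup (the URLs are the standard AllenNLP large-model files, used here purely for illustration):

# Illustrative setup only; the names tdevice and elmo mirror the snippet's assumptions.
import torch
from allennlp.modules.elmo import Elmo, batch_to_ids

options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

tdevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
elmo = Elmo(options_file, weight_file, num_output_representations=1, dropout=0).to(tdevice)

sentences = [['This', 'is', 'a', 'test', '.'], ['Short', 'one']]
character_ids = batch_to_ids(sentences).to(tdevice)  # (batch, max_sent_len, 50) character ids
embeddings = elmo(character_ids)['elmo_representations'][0]
print(embeddings.size())  # torch.Size([2, 5, 1024]) for this model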
Example #2
def batch_preprocessing_elmo(batch, args):
    # 2D list containing all tokenized sentences from all the docs in the batch: [[sent1], [sent2], ...]
    all_batch_sentences = []
    # 1D numpy array containing the length of each sentence in each doc after padding
    modified_sentence_len = []
    # 1D list containing the length of each doc (number of sentences per doc)
    batch_docs_len = [len(batch[i]) for i in range(len(batch))]
    # 2D list containing the length of each sentence in each doc [doc -> len_sent]
    batch_sentences_len = batch_sentences_length(batch)

    max_doc_len = max(batch_docs_len)
    pad_token = [args.padding_symbol]
    for doc in batch:
        if len(doc) < max_doc_len:
            for _ in range(max_doc_len - len(doc)):
                doc.append(pad_token)
        for sent in doc:
            modified_sentence_len.append(len(sent))
            all_batch_sentences.append(sent)
    # Converts a batch of tokenized sentences to a tensor representing the sentences with encoded characters
    # (len(batch), max sentence length, max word length[char]).
    all_batch_sents_enc_char = batch_to_ids(all_batch_sentences).to(
        args.device)
    # Converting to batch-mode again. 4D tensor  [doc] -> [sent] -> [word] -> [char]
    d0, d1, d2 = all_batch_sents_enc_char.size()
    all_batch_sents_enc_char = all_batch_sents_enc_char.view(
        args.batch_size_train, -1, d1, d2)
    return all_batch_sents_enc_char, batch_docs_len, batch_sentences_len, np.asarray(
        modified_sentence_len).reshape(args.batch_size_train, -1)
Example #3
 def elmo_encode(data, __id2word=id2word):
     data_text = [glove_tokenizer(x, __id2word) for x in data]
     with torch.no_grad():
         character_ids = batch_to_ids(data_text).cuda()
         elmo_emb = elmo(character_ids)['elmo_representations']
         elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # avg of two layers
     return elmo_emb.cuda()
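Averaging elmo_emb[0] and elmo_emb[1] above only works if the module-level elmo was built with two output representations; a hedged sketch of that assumed setup (the URLs stand in for whatever config and weight files the original project used):

# Assumed module-level setup for the snippet above (illustrative, not from the original code).
from allennlp.modules.elmo import Elmo

options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

elmo = Elmo(options_file, weight_file, num_output_representations=2, dropout=0).cuda()
elmo.eval()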
Example #4
def get_scope_elmo(model,
                   ELMO_folder,
                   scope_text,
                   elmo_dim,
                   idx2id_dict,
                   id2idx_dict,
                   device=torch.device('cpu')):
    """ Get scope note ELMo embedding representation
    
    """
    with torch.no_grad():
        elmo_embeddings = [
            model(batch_to_ids(i).to(device)) for i in scope_text
        ]
        elmo_scope_embeddings = [
            i['elmo_representations'][0].view(-1, elmo_dim)
            for i in elmo_embeddings
        ]

    elmo_scope_embeddings = [
        torch.mean(item, dim=0) for item in elmo_scope_embeddings
    ]
    elmo_scope_embeddings = torch.stack(elmo_scope_embeddings)

    return elmo_scope_embeddings, idx2id_dict
Example #5
def load_data_with_diff_vocab(inputFile,
                              src_vocab,
                              tgt_vocab,
                              max_word=200,
                              max_char=50,
                              type='norm'):
    """
    Load data using separate source and target vocabularies; src_vocab covers the lexnorm2015 and aggressive datasets.
    :param inputFile:
    :param src_vocab:
    :param tgt_vocab:
    :param max_word:
    :param max_char:
    :param type:
    :return:
    """
    char_inputs = []
    word_inputs = []
    outputs = []
    max_output = 0
    df = pd.read_csv(inputFile, names=['src', 'tgt'])
    for src, tgt in zip(df['src'], df['tgt']):
        elmo_id = batch_to_ids([tokenize(str(src))])
        elmo_id = elmo_id.view(-1, 50)
        word_input = [
            src_vocab.word_to_id(word) for word in tokenize(str(src))
        ]
        if type == 'norm':
            output = [tgt_vocab.word_to_id(tgt_vocab.SYM_SOS)]
            output.extend([
                tgt_vocab.word_to_id(word) for word in tgt.strip().split(' ')
            ])
            output.append(tgt_vocab.word_to_id(tgt_vocab.SYM_EOS))
        else:
            output = [tgt_vocab.tag_to_id(tag) for tag in tgt.strip().split()]
        char_inputs.append(elmo_id)
        word_inputs.append(word_input)
        # print(str(idx), len(output))
        # idx += 1
        outputs.append(output)
    max_output = max([len(sent) for sent in outputs])
    # print(max_output)
    outputs = list(map(lambda d: d[:max_output], outputs))
    outputs = list(
        map(
            lambda d: d +
            (max_output - len(d)) * [tgt_vocab.word_to_id(tgt_vocab.W_PAD)],
            outputs))
    word_inputs = list(map(lambda d: d[:max_word], word_inputs))
    word_inputs = list(
        map(
            lambda d: d +
            (max_word - len(d)) * [src_vocab.word_to_id(src_vocab.W_PAD)],
            word_inputs))
    char_inputs = pad_sequence(char_inputs, True, max_word, max_char)

    # char_input = [[vocab.char_to_id(char) for char in word] for word in tokenize(str(src))]

    dataset = OurDataset(word_inputs, char_inputs, outputs)
    return dataset
Example #6
    def train(self, training_data):
        '''
        Must be passed the training data - list of questions from the QuizBowlDataset class
        '''
        print("train")

        # We want questions to store each question tokenized by word
        # and answers stored as a list
        questions = []
        for ques in training_data:
            tokens = self.tokenizer(' '.join(ques.sentences))
            tokens_list = [token.text for token in tokens]
            questions.append(tokens_list)
            self.answers.append(ques.page)

        print("chars to ids")
        character_ids = batch_to_ids(questions)
        print("elmo output")
        elmo_output = self.elmo(character_ids)

        # index at zero because we only have a single output representation
        word_embeddings = elmo_output['elmo_representations'][0]

        print("mean")
        # A matrix of size (num_train_questions * embed_length)
        self.question_matrix = word_embeddings.mean(1)

        print("train done")
Example #7
    def pointerEncoder(self, Xin_ELMo, lens):

        self.bn_inputdata = nn.BatchNorm1d(self.word_dim,
                                           affine=False,
                                           track_running_stats=False)

        batch_size = len(Xin_ELMo)

        # convert input tokens to ELMo character ids, then to embeddings
        character_ids = batch_to_ids(Xin_ELMo)
        if self.use_cuda:
            character_ids = character_ids.cuda()
        embeddings = elmo(character_ids)
        X_ELMo = embeddings['elmo_representations'][
            0]  # first output representation: [batch, length, d_elmo]
        if self.use_cuda:
            X_ELMo = X_ELMo.cuda()

        X = X_ELMo
        if self.isbanor:
            X = X.permute(0, 2, 1)  # N C L
            X = self.bn_inputdata(X)
            X = X.permute(0, 2, 1)  # N L C

        X = self.nnDropout(X)

        encoder_lstm_co_h_o = self.initHidden(self.hidden_dim, batch_size)
        output_encoder, hidden_states_encoder = self._run_rnn_packed(
            self.encoder_rnn, X, lens, encoder_lstm_co_h_o)  # batch_first=True
        output_encoder = output_encoder.contiguous()
        output_encoder = self.nnDropout(output_encoder)

        return output_encoder, hidden_states_encoder
Example #8
    def run_batch_lattice(self, batch, testing=False):
        if testing:
            self.lm.eval()
        else:
            self.lm.train()

        inputs, positions, prevs, rev_prevs, lm_labels, rev_lm_labels, lm_masks, rev_lm_masks = batch
        char_ids = batch_to_ids(inputs).to(self.device)
        lm_labels = torch.from_numpy(lm_labels).to(self.device)
        rev_lm_labels = torch.from_numpy(rev_lm_labels).to(self.device)
        lm_masks = torch.from_numpy(lm_masks).float().to(self.device).view(-1)
        rev_lm_masks = torch.from_numpy(rev_lm_masks).float().to(
            self.device).flip(dims=[1]).view(-1)

        logits_forward, logits_backward, hiddens = self.lm(
            char_ids, prevs, rev_prevs)

        bs, sl, vs = logits_forward.size()
        logits_forward = logits_forward.view(-1, vs).log_softmax(-1)
        logits_backward = logits_backward.view(-1, vs).log_softmax(-1)
        lm_labels = lm_labels.view(-1, vs).float()
        rev_lm_labels = rev_lm_labels.flip(dims=[1]).view(-1, vs).float()
        loss_for = F.kl_div(logits_forward, lm_labels, reduction='none')
        loss_for = (loss_for.sum(-1) * lm_masks).sum() / lm_masks.sum()
        loss_rev = F.kl_div(logits_backward, rev_lm_labels, reduction='none')
        loss_rev = (loss_rev.sum(-1) * rev_lm_masks).sum() / rev_lm_masks.sum()

        loss = (loss_for + loss_rev) / 2
        if not testing:
            self.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.parameters, 0.25)
            self.optimizer.step()

        return loss_for, loss_rev, torch.tensor(0.0), torch.tensor(0.0)
Example #9
 def convert_tokens_to_cids(self,
                            tokens,
                            pad_sequence=True,
                            min_seq_size=10):
     """
     Args:
       pad_sequence, min_seq_size:
         if pad_sequence is True, pad the sequence up to n_ctx(max_seq_size).
         else do not pad basically. however, since the sequence size should be larger than min_seq_size.
         we pad the sequence additionally.
     """
     from allennlp.modules.elmo import batch_to_ids
     pad_cids = [[self.pad_id] * self.char_n_ctx]
     ids = batch_to_ids([tokens])[0].detach().cpu().numpy().tolist()
     # padding
     if pad_sequence:
         padding_length = self.n_ctx - len(ids)
         if padding_length > 0:
             ids += pad_cids * padding_length
     else:
         padding_length = min_seq_size - len(ids)
         if padding_length > 0:
             ids += pad_cids * padding_length
     ids = ids[:self.n_ctx]
     return ids
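A quick standalone check of the convention this method relies on: batch_to_ids maps every token to a fixed row of 50 character ids, which is why the padding rows above must be the same width (self.char_n_ctx is presumably 50):

# Standalone sanity check (not part of the original class).
from allennlp.modules.elmo import batch_to_ids

ids = batch_to_ids([['Hello', 'world', '!']])[0].tolist()
print(len(ids), len(ids[0]))  # 3 tokens, each encoded as 50 character ids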
Example #10
    def run_batch_lattice(self, batch, testing=False):
        if testing:
            self.slu.eval()
        else:
            self.slu.train()

        inputs, words, positions, prevs, rev_inputs, rev_prevs, labels = batch

        inputs = torch.from_numpy(inputs).to(self.device)
        rev_inputs = torch.from_numpy(rev_inputs).to(self.device)
        labels = torch.from_numpy(labels).to(self.device)

        elmo_emb = None
        if self.use_elmo:
            char_ids = batch_to_ids(words).to(self.device)
            elmo_emb = self.elmo(
                char_ids, prevs=prevs,
                rev_prevs=rev_prevs)['elmo_representations'][0]

        logits = self.slu(inputs, positions, prevs, rev_inputs, rev_prevs,
                          elmo_emb)

        loss = F.cross_entropy(logits, labels)

        if not testing:
            start = time.time()
            self.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.parameters, 1.0)
            self.optimizer.step()
            # print(f"backward takes {time.time()-start}")

        return loss, logits
Example #11
    def run_batch(self, batch, testing=False):
        if testing:
            self.lm.eval()
        else:
            self.lm.train()

        inputs, outputs, outputs_rev, uids = batch
        char_ids = batch_to_ids(inputs).to(self.device)
        outputs = torch.from_numpy(outputs).to(self.device)
        outputs_rev = torch.from_numpy(outputs_rev).to(self.device)

        logits_forward, logits_backward, hiddens, mask = self.lm(char_ids)

        bs, sl, vs = logits_forward.size()
        logits_forward = logits_forward.view(-1, vs)
        logits_backward = logits_backward.view(-1, vs)
        outputs = outputs.view(-1)
        outputs_rev = outputs_rev.view(-1)
        loss_for = F.cross_entropy(logits_forward, outputs, ignore_index=PAD)
        loss_rev = F.cross_entropy(logits_backward,
                                   outputs_rev,
                                   ignore_index=PAD)

        loss = (loss_for + loss_rev) / 2
        if not testing:
            self.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.parameters, 0.25)
            self.optimizer.step()

        return loss_for, loss_rev, torch.tensor(0.0), torch.tensor(0.0)
Example #12
    def run_batch(self, batch, testing=False):
        if testing:
            self.slu.eval()
        else:
            self.slu.train()

        if len(batch) == 4:
            inputs, words, positions, labels = batch
        else:
            inputs, words, positions, _, _, _, labels = batch

        inputs = torch.from_numpy(inputs).to(self.device)
        labels = torch.from_numpy(labels).to(self.device)

        elmo_emb = None
        if self.use_elmo:
            char_ids = batch_to_ids(words).to(self.device)
            elmo_emb = self.elmo(char_ids)['elmo_representations'][0]

        logits = self.slu(inputs, positions, elmo_emb)

        loss = F.cross_entropy(logits, labels)

        if not testing:
            self.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.parameters, 1.0)
            self.optimizer.step()

        return loss, logits
Example #13
    def elmo_encode(self, data, __id2word):
        """
        Map ids back to words with id2word, then encode with ELMo
        (from allennlp.modules.elmo import Elmo, batch_to_ids).
        batch_to_ids pads only to the batch's max sentence length, which can be
        shorter than desired, so we pad further up to sent_pad_len.
        """
        data_text = [self.glove_tokenizer(x, __id2word) for x in data]

        with torch.no_grad():
            elmo = Elmo(options_file, weight_file, 2, dropout=0).cuda()
            elmo.eval()
            character_ids = batch_to_ids(data_text).cuda()

            row_num = character_ids.shape[0]
            elmo_dim = self.elmo_dim

            if torch.sum(character_ids) != 0:
                elmo_emb = elmo(character_ids)['elmo_representations']
                elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # avg of two layers
            else:
                elmo_emb = torch.zeros([row_num, self.sent_pad_len, elmo_dim],
                                       dtype=torch.float)

        sent_len = elmo_emb.shape[1]

        if sent_len < self.sent_pad_len:
            fill_sent_len = self.sent_pad_len - sent_len
            # create a bunch of 0's to fill it up
            filler = torch.zeros([row_num, fill_sent_len, elmo_dim],
                                 dtype=torch.float)
            elmo_emb = torch.cat((elmo_emb, filler.cuda()), dim=1)
        return elmo_emb.cuda()
Example #14
    def correct_string(self, line):

        premise = line[0].lower()
        Xtype = torch.FloatTensor
        ytype = torch.LongTensor
        is_cuda = torch.cuda.is_available()

        if is_cuda:
            self.model.cuda()
            Xtype = torch.cuda.FloatTensor
            ytype = torch.cuda.LongTensor
            if self.use_background: self.model_bg.cuda()

        X, _ = get_line_representation(premise)
        tx = Variable(torch.from_numpy(np.array([X]))).type(Xtype)

        if self.use_elmo or self.use_elmo_bg:
            tx_elmo = Variable(batch_to_ids([premise.split()])).type(ytype)

        print(premise)
        SEQ_LEN = len(premise.split())

        if self.use_elmo:
            ty_pred = self.model(tx, tx_elmo, [SEQ_LEN])
        else:
            ty_pred = self.model(tx, [234])

        y_pred = ty_pred.detach().cpu().numpy()
        y_pred = y_pred[0] # ypred now is NUM_CLASSES x SEQ_LEN

        if self.use_background:
            if self.use_elmo_bg:
                ty_pred_bg = self.model_bg(tx, tx_elmo, [SEQ_LEN])
            else:
                ty_pred_bg = self.model_bg(tx, [SEQ_LEN])
            y_pred_bg = ty_pred_bg.detach().cpu().numpy()
            y_pred_bg = y_pred_bg[0]

        output_words = []

        self.total_predictions += SEQ_LEN

        for idx in range(SEQ_LEN):
            pred_idx = np.argmax(y_pred[:, idx])
            if pred_idx == utils.WORD_LIMIT:
                word = premise.split()[idx]
                if self.use_background:
                    pred_idx_bg = np.argmax(y_pred_bg[:, idx])
                    if pred_idx_bg != self.vocab_size_bg:
                        word = utils.i2w_bg[pred_idx_bg]
                if self.unk_output:
                    word = "a" # choose a sentiment neutral word
                output_words.append(word)
                self.predicted_unks += 1.0
                if word in utils.w2i:
                    self.predicted_unks_in_vocab += 1.0
            else:
                output_words.append(utils.i2w[pred_idx])
        line[0] = " ".join(output_words)
        return line
Example #15
def load_data_with_diff_vocab(inputFile, src_vocab, tgt_vocab, max_word=200, max_char=50, data_type ='norm'):
    """
    Load data using separate source and target vocabularies; src_vocab covers the lexnorm2015 and aggressive datasets.
    :param inputFile:
    :param src_vocab:
    :param tgt_vocab:
    :param max_word:
    :param max_char:
    :param data_type:
    :return:
    """
    char_inputs = []
    word_inputs = []
    outputs = []
    src, tgt = pickle.load(open(inputFile, 'rb'))
    for s, t in zip(src, tgt):
        elmo_id = batch_to_ids(s)
        elmo_id = elmo_id.view(-1, 50)
        tokens = ['<sos>'] + s + ['<eos>']
        word_input = [src_vocab.word_to_id(word) for word in tokens]
        if data_type == 'norm':
            output = [tgt_vocab.word_to_id(tgt_vocab.SYM_SOS)]
            output.extend([tgt_vocab.word_to_id(word) for word in t])
            output.append(tgt_vocab.word_to_id(tgt_vocab.SYM_EOS))
        else:
            output = [tgt_vocab.tag_to_id(t)]
        char_inputs.append(elmo_id)
        word_inputs.append(word_input)
        outputs.append(output)
    dataset = OurDataset(word_inputs, char_inputs, outputs, max_word, max_char)
    return dataset
Example #16
def elmo_large(conll_file):
    dataset, textset = read_conll_corpus(conll_file)

    options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
    weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
    elmo = Elmo(options_file, weight_file, 1)
    sentences = [sent[1:-1] for sent in dataset]

    print(sentences)
    character_ids = batch_to_ids(sentences)
    embeddings = elmo(character_ids)['elmo_representations'][0].detach()
    vectors = []
    print(embeddings)
    for sent_i, sent in enumerate(embeddings):
        key = sentences[sent_i]
        if 'play' in key:
            i = key.index('play')

        elif 'bright' in key:
            i = key.index('bright')
        elif 'light' in key:
            i = key.index('light')
        elif 'smart' in key:
            i = key.index('smart')
        vectors.append(np.array(sent[i]))

    print(vectors)
    return np.stack(vectors,
                    axis=0), ['\t'.join(sentence) for sentence in sentences]
Example #17
    def transform(self, X, y=None):
        """Transform documents to document ids.

        Uses the vocabulary learned by fit.

        Args:
            X: iterable
            an iterable which yields either str, unicode or file objects.
            y: iterable, label string
        Returns:
            features: document id matrix.
            y: label id matrix.
        """
        word_ids = [self._word_vocab.doc2id(doc) for doc in X]
        word_ids = pad_sequences(word_ids, padding='post')

        char_ids = [[self._char_vocab.doc2id(w) for w in doc] for doc in X]
        char_ids = pad_nested_sequences(char_ids)

        character_ids = batch_to_ids(X)
        elmo_embeddings = self._elmo(character_ids)['elmo_representations'][1]
        elmo_embeddings = elmo_embeddings.detach().numpy()

        features = [word_ids, char_ids, elmo_embeddings]

        if y is not None:
            y = [self._label_vocab.doc2id(doc) for doc in y]
            y = pad_sequences(y, padding='post')
            y = to_categorical(y, self.label_size).astype(int)
            # categorical issues
            y = y if len(y.shape) == 3 else np.expand_dims(y, axis=0)
            return features, y
        else:
            return features
Example #18
    def __init__(self, config, path):
        from allennlp.modules.elmo import batch_to_ids
        pad_ids = [config['pad_token_id']] * config['char_n_ctx']
        all_token_ids = []
        all_pos_ids = []
        all_char_ids = []
        all_label_ids = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                items = line.split('\t')
                token_ids = [int(d) for d in items[1].split()]
                pos_ids = [int(d) for d in items[2].split()]
                # compute ELMo character ids
                tokens = items[3].split()
                char_ids = batch_to_ids([tokens
                                         ])[0].detach().cpu().numpy().tolist()
                for _ in range(len(token_ids) - len(char_ids)):
                    char_ids.append(pad_ids)
                label_ids = [int(d) for d in items[0].split()]
                all_token_ids.append(token_ids)
                all_pos_ids.append(pos_ids)
                all_char_ids.append(char_ids)
                all_label_ids.append(label_ids)
        all_token_ids = torch.tensor(all_token_ids, dtype=torch.long)
        all_pos_ids = torch.tensor(all_pos_ids, dtype=torch.long)
        all_char_ids = torch.tensor(all_char_ids, dtype=torch.long)
        all_label_ids = torch.tensor(all_label_ids, dtype=torch.long)

        self.x = TensorDataset(all_token_ids, all_pos_ids, all_char_ids)
        self.y = all_label_ids
Example #19
def elmoFromPair(pair, elmo):
    # choose the second ELMo output representation (index 1)
    layer = 1
    character_ids = batch_to_ids(pair)
    input_tensor, output_tensor = elmo(
        character_ids)['elmo_representations'][layer].data
    return (input_tensor, output_tensor)
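A hedged usage sketch for elmoFromPair: the tuple unpacking assumes the pair holds exactly two tokenized sentences, and indexing representation 1 assumes the Elmo module exposes at least two output representations (the construction below is illustrative, not the original project's):

from allennlp.modules.elmo import Elmo

options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
elmo = Elmo(options_file, weight_file, num_output_representations=2, dropout=0)

pair = [['how', 'are', 'you', '?'], ['fine', ',', 'thanks']]
input_tensor, output_tensor = elmoFromPair(pair, elmo)
print(input_tensor.shape, output_tensor.shape)  # each (4, 1024): padded length x ELMo dim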
Example #20
    def __init__(self, config, path):
        from allennlp.modules.elmo import batch_to_ids
        pad_ids = [config['pad_token_id']] * config['char_n_ctx']
        all_token_ids = []
        all_pos_ids = []
        all_char_ids = []
        all_label_ids = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                items = line.split('\t')
                token_ids = [int(d) for d in items[1].split()]
                pos_ids = [int(d) for d in items[2].split()]
                # using ELMo.batch_to_ids, compute character ids: ex) 'The' [259, 85, 105, 102, 260, 261, 261, ...]
                # (actually byte-based, char_vocab_size == 262, char_padding_idx == 261)
                tokens = items[3].split()
                char_ids = batch_to_ids([tokens
                                         ])[0].detach().cpu().numpy().tolist()
                for _ in range(len(token_ids) - len(char_ids)):
                    char_ids.append(pad_ids)
                label_ids = [int(d) for d in items[0].split()]
                all_token_ids.append(token_ids)
                all_pos_ids.append(pos_ids)
                all_char_ids.append(char_ids)
                all_label_ids.append(label_ids)
        all_token_ids = torch.tensor(all_token_ids, dtype=torch.long)
        all_pos_ids = torch.tensor(all_pos_ids, dtype=torch.long)
        all_char_ids = torch.tensor(all_char_ids, dtype=torch.long)
        all_label_ids = torch.tensor(all_label_ids, dtype=torch.long)

        self.x = TensorDataset(all_token_ids, all_pos_ids, all_char_ids)
        self.y = all_label_ids
Example #21
    def encode_sent_and_span_paral(self, text,  # batch, max_sent, max_word
                                   text_msk,  # batch, max_sent, max_word
                                   span,  # batch, max_sent_num, max_span_num, max_word
                                   sent_idx  # batch size
                                   ):
        this_text = two_dim_index_select(text['tokens'], sent_idx)  # batch, max_word
        from allennlp.modules.elmo import batch_to_ids
        if self.use_elmo:
            this_text_list: List = this_text.tolist()
            text_str_list = []
            for sample in this_text_list:
                s = [self.vocab.get_token_from_index(x) for x in sample]
                text_str_list.append(s)
            character_ids = batch_to_ids(text_str_list).to(self.device)
            this_context = self.elmo(character_ids)
            # print(this_context['elmo_representations'][0].size())
            this_context = this_context['elmo_representations'][0]
        else:
            this_text = {'tokens': this_text}
            this_context = self._text_field_embedder(this_text)

        num_doc, max_word, inp_dim = this_context.size()
        batch_size = sent_idx.size()[0]
        assert batch_size == num_doc

        # text is the original text of the selected sentence.
        # this_context = two_dim_index_select(context, sent_idx)  # batch, max_word, hdim
        this_context_mask = two_dim_index_select(text_msk, sent_idx)  # batch, max_word
        this_span = two_dim_index_select(span, sent_idx)  # batch , nspan, max_word

        concat_rep_of_compression, \
        span_msk, original_sent_rep = self.enc.forward(word_emb=this_context,
                                                       word_emb_msk=this_context_mask,
                                                       span=this_span)
        return concat_rep_of_compression, span_msk, original_sent_rep
Example #22
    def batch_to_embeddings(self, batch: List[List[str]]) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Parameters
        ----------
        batch : ``List[List[str]]``, required
            A list of tokenized sentences.

        Returns
        -------
            A tuple of tensors, the first representing activations (batch_size, 3, num_timesteps, 1024) and
        the second a mask (batch_size, num_timesteps).
        """
        character_ids = batch_to_ids(batch)
        if self.cuda_device >= 0:
            character_ids = character_ids.cuda(device=self.cuda_device)

        bilm_output = self.elmo_bilm(character_ids)
        layer_activations = bilm_output['activations']
        mask_with_bos_eos = bilm_output['mask']

        # without_bos_eos is a 3-element list of (activation, mask) tensor pairs,
        # each with size (batch_size, num_timesteps, dim) and (batch_size, num_timesteps)
        # respectively.
        without_bos_eos = [remove_sentence_boundaries(layer, mask_with_bos_eos)
                           for layer in layer_activations]
        # Converts a list of pairs (activation, mask) tensors to a single tensor of activations.
        activations = torch.cat([ele[0].unsqueeze(1) for ele in without_bos_eos], dim=1)
        # The mask is the same for each ELMo vector, so just take the first.
        mask = without_bos_eos[0][1]

        return activations, mask
Example #23
    def elmo_embeddings(self,
                        processed_sent,
                        number_sent,
                        lang='en',
                        args=None):
        print('complicated_layers_extraction')
        '''
        Converts each batch of tokenized sentences to character ids of shape
        (batch, sequence_length, 50) and returns the three biLM layer activations.
        '''

        first_layer_lst = []
        second_layer_lst = []
        third_layer_lst = []
        for index, sent in enumerate(processed_sent[:number_sent]):
            sentences = sent  # sent is already a list of tokenized sentences
            # print(sentences)
            character_ids = batch_to_ids(sentences).to(self.device)
            # embeddings = elmo._elmo_lstm._token_embedder(character_ids)
            bilm_output = self.elmo._elmo_lstm(character_ids)
            temp = bilm_output['activations']
            first_layer = temp[0].cpu()[:, 1:-1]  # shape = (bsz, seqlen, dim)
            second_layer = temp[1].cpu()[:, 1:-1]
            third_layer = temp[2].cpu()[:, 1:-1]

            first_layer_lst.append(first_layer)
            second_layer_lst.append(second_layer)
            third_layer_lst.append(third_layer)

            if index % 1000 == 0:
                sys.stdout.write('-')
                sys.stdout.flush()
        sys.stdout.write('\n')

        return first_layer_lst, second_layer_lst, third_layer_lst
Example #24
 def forward(self, sentences, device='cuda'):
     """
     sentences: list[str], len of list: B
     output sent_embs: Tensor B x OUT
     """
     sentences = [WordEncoder.tokenize(s) for s in sentences]
     # sentences = [['First', 'sentence', '.'], ['Another', '.']]
     # use batch_to_ids to convert sentences to character ids
     character_ids = batch_to_ids(sentences).to(device)
     embeddings = self.elmo(character_ids)
     # embeddings['elmo_representations'] is a length-two list of tensors.
     # Each element contains one layer of ELMo representations with shape
     # (B, max_l, 1024):
     #   B     - the batch size
     #   max_l - the max sequence length of the batch
     #   1024  - the length of each ELMo vector
     sent_embeds = embeddings['elmo_representations'][1]  # B x max_l x 1024
     sent_emb_list = list()
     for si in range(len(sentences)):
         sent_len = len(sentences[si])
         sent_embed = torch.mean(sent_embeds[si, :sent_len, :],
                                 dim=0)  # 1024
         sent_emb_list.append(sent_embed)
     sent_embs = torch.stack(sent_emb_list, dim=0)  # B x 1024
     return sent_embs
Example #25
    def batch_to_embeddings(self, batch: List[List[str]]) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Parameters
        ----------
        batch : ``List[List[str]]``, required
            A list of tokenized sentences.
        Returns
        -------
            A tuple of tensors, the first representing activations (batch_size, 3, num_timesteps, 1024) and
        the second a mask (batch_size, num_timesteps).
        """
        character_ids = batch_to_ids(batch)
        if self.cuda_device >= 0:
            character_ids = character_ids.cuda(device=self.cuda_device)

        bilm_output = self.elmo_bilm(character_ids, self.bias, self.num_bias, self.contraction)
        layer_activations = bilm_output['activations']
        mask_with_bos_eos = bilm_output['mask']

        # without_bos_eos is a 3-element list of (activation, mask) tensor pairs,
        # each with size (batch_size, num_timesteps, dim) and (batch_size, num_timesteps)
        # respectively.
        without_bos_eos = [remove_sentence_boundaries(layer, mask_with_bos_eos)
                           for layer in layer_activations]
        # Converts a list of pairs (activation, mask) tensors to a single tensor of activations.
        activations = torch.cat([ele[0].unsqueeze(1) for ele in without_bos_eos], dim=1)
        # The mask is the same for each ELMo vector, so just take the first.
        mask = without_bos_eos[0][1]

        return activations, mask
Example #26
def elmo_read_and_generate_vecs(korpus_path: str, size: int, test=False):
    """
    INPUT:
    korpus - name of the folder containing the corpus (e.g. "korpusGAZETA")
    size - size of the neighbourhood forming a single phrase (number of words before and after the surname)

    OUTPUT:
    vecs - vectors produced by ELMo, one vector per phrase built from the neighbourhood
    corpus_list, person_list, document_dict, person_dict - output of the 'read_corpus' function; see the comment on that function
    """

    elmo = Elmo(options_file, weight_file, 1, dropout=0)

    path_raw = korpus_path

    [corpus_list, person_list, document_dict, person_dict,
     profession_dict] = read_corpus(path_raw, size, test)

    vecs = []
    step = 1000

    for i in tqdm(range(0, len(corpus_list), step)):
        character_ids = batch_to_ids(corpus_list[i:i + step])

        embeddings = elmo(character_ids)

        vecs.extend(elmo_emb_2_vec(embeddings))

    return [
        vecs, corpus_list, person_list, document_dict, person_dict,
        profession_dict
    ]
Example #27
 def __call__(self, tokens):
     """Elmo representation is extracted for each candidate in a turn."""
     token_ids = batch_to_ids(tokens)
     if torch.cuda.is_available(): token_ids = token_ids.cuda()
     embeddings = self.encoder(
         token_ids)["elmo_representations"][0].detach().cpu().data
     return embeddings
Example #28
def create_embed_loaders():
    train, train_labs, test, test_labs, _ = get_data()
    ttr, tte = [], []
    for t in train:
        ttr.append(len(t))
    for t in test:
        tte.append(len(t))
    # train = train[:100]
    # test = test[:50]
    lentrain = len(train)
    data = copy(train)
    data.extend(test)
    elmo = get_elmo()
    # elmo = ElmoEmbedder()
    
    # te = process_sentences(elmo, test)
    # print(tr.size())
    # assert False
    da_ids = batch_to_ids(data)
    da = elmo(da_ids)['elmo_representations'][0]
    da = da.to(device)
    print(da.size())
    tr = da[:lentrain, :, :]
    te = da[lentrain:, :, :]
    trdata = ElmoDset(tr, train_labs, ttr)
    tedata = ElmoDset(te, test_labs, tte)
    trload = DataLoader(trdata, shuffle = True, batch_size = 2)
    teload = DataLoader(tedata, shuffle = True, batch_size = 2)
    return trload, teload
Example #29
    def _word_embed_elmo(self):
        print(':: Initializing ELMo')
        elmo = Elmo(ELMO_OPTIONS_FILE, ELMO_WEIGHT_FILE, 1, dropout=0)

        print(':: Encoding tokens')
        char_ids = batch_to_ids(self.sent_tokens)

        print(':: Calculating ELMo embeddings')
        sent_embeds, sent_lengths = [], []
        for i in range(0, len(char_ids), ELMO_BATCH_SIZE):
            print(' > {}/{} [{}%]'.format(i, len(char_ids),
                                          int(i / len(char_ids) * 100)),
                  end='\r')

            elmo_batch = char_ids[i:i + ELMO_BATCH_SIZE]
            sent_batch_embeds = elmo(elmo_batch)
            sent_embeds.append(sent_batch_embeds['elmo_representations'][0])
            sent_lengths.append(sent_batch_embeds['mask'].sum(dim=1))

            if len(sent_embeds) >= 8:
                if self.word_tensor is not None:
                    sent_embeds = [self.word_tensor] + sent_embeds
                    sent_lengths = [self.sent_lengths] + sent_lengths

                self.word_tensor = torch.cat(sent_embeds)
                self.sent_lengths = torch.cat(sent_lengths)
                sent_embeds, sent_lengths = [], []

        self.word_tensor = torch.cat([self.word_tensor] + sent_embeds)
        self.sent_lengths = torch.cat([self.sent_lengths] + sent_lengths)
        print(' > ELMo embeddings are calculated for all {} sentences'.format(
            len(char_ids)))
Example #30
    def forward(self, input, seq_lens, vocab, batchartoovs=None):
        strings = []
        for examp in input:
            copy = examp.clone().cpu().numpy().astype(int)
            converted = data.outputids2words(
                copy, vocab, (batchartoovs if config.pointer_gen else None))
            strings.append(converted)

        strings = batch_to_ids(strings).cuda()
        embedded = self.elmo_layer(strings)['elmo_representations']
        embedded = embedded[0]
        #[batch size, max enc steps, 1024]
        #embedded = self.embedding(input)

        packed = pack_padded_sequence(embedded, seq_lens, batch_first=True)
        output, hidden = self.lstm(packed)

        encoder_outputs, _ = pad_packed_sequence(
            output, batch_first=True)  # h dim = B x t_k x n
        encoder_outputs = encoder_outputs.contiguous()

        encoder_feature = encoder_outputs.view(
            -1, 2 * config.hidden_dim)  # B * t_k x 2*hidden_dim
        encoder_feature = self.W_h(encoder_feature)

        return encoder_outputs, encoder_feature, hidden
Example #31
    def forward(self, span, a_vec, raw_span):
        #span_vec = span
        span_vec = self.word_embeddings(span)
        character_ids = batch_to_ids(raw_span).cuda()

        elmo_embeddings = self.elmo(character_ids)
        elmo_representations = torch.cat(
            [span_vec] + elmo_embeddings['elmo_representations'], dim=2)
        generate_output = self.generative_decoder(elmo_representations, a_vec,
                                                  elmo_embeddings['mask'],
                                                  span)
        batch_size, target_iter = a_vec.shape
        gen_out = torch.zeros(batch_size,
                              target_iter).to(generate_output.device)
        for i in range(batch_size):
            gen_out[i, :] = generate_output[i, :, :].max(1)[1]

        print(generate_output.shape)
        generate_output = generate_output[:, 1:, :].contiguous()
        generate_output = generate_output.view(
            generate_output.shape[0] * generate_output.shape[1],
            generate_output.shape[2])
        print(generate_output.shape)

        generate_output = F.softmax(generate_output, dim=1)
        eps = 1e-8
        generate_output = (1 - eps) * generate_output + eps * torch.min(
            generate_output[generate_output != 0])
        generate_loss = self.gen_loss(torch.log(generate_output),
                                      a_vec[:, 1:].contiguous().view(-1))
        #print (generate_output.max(1)[1])
        #print (a_vec[:,1:].contiguous().view(-1))
        loss = generate_loss
        return loss, gen_out
Example #32
File: elmo.py (Project: Pinafore/qb)
def batchify(x_data, y_data, batch_size=128, shuffle=False):
    batches = []
    for i in range(0, len(x_data), batch_size):
        start, stop = i, i + batch_size
        x_batch = batch_to_ids(x_data[start:stop])
        lengths = Variable(torch.from_numpy(np.array([max(len(x), 1) for x in x_data[start:stop]])).float()).view(-1, 1)
        if CUDA:
            y_batch = Variable(torch.from_numpy(np.array(y_data[start:stop])).cuda())
        else:
            y_batch = Variable(torch.from_numpy(np.array(y_data[start:stop])))
        batches.append((x_batch, y_batch, lengths))

    if shuffle:
        random.shuffle(batches)

    return batches
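A hedged usage sketch with toy data, assuming the snippet's module-level imports (numpy as np, torch, Variable, batch_to_ids) and its CUDA flag are in scope:

# Illustrative call only; x_data and y_data here are made up.
x_data = [['what', 'is', 'elmo', '?'], ['who', 'wrote', 'hamlet', '?'], ['name', 'this', 'poet']]
y_data = [0, 1, 2]
batches = batchify(x_data, y_data, batch_size=2)
x_batch, y_batch, lengths = batches[0]
print(x_batch.shape)  # (2, 4, 50) character ids for the first two questions
print(y_batch.shape)  # (2,)
print(lengths.shape)  # (2, 1)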