Example #1
    def test_average_embedding_works(self):
        tempdir = tempfile.mkdtemp()
        sentences_path = os.path.join(tempdir, "sentences.txt")
        output_path = os.path.join(tempdir, "output.txt")

        sentence = "Michael went to the store to buy some eggs ."
        with open(sentences_path, 'w') as f:
            f.write(sentence)

        sys.argv = ["run.py",  # executable
                    "elmo",  # command
                    sentences_path,
                    output_path,
                    "--average",
                    "--options-file",
                    self.options_file,
                    "--weight-file",
                    self.weight_file]

        main()

        assert os.path.exists(output_path)

        embedder = ElmoEmbedder(options_file=self.options_file, weight_file=self.weight_file)
        expected_embedding = embedder.embed_sentence(sentence.split())
        expected_embedding = (expected_embedding[0] + expected_embedding[1] + expected_embedding[2]) / 3

        with h5py.File(output_path, 'r') as h5py_file:
            assert list(h5py_file.keys()) == [sentence]
            # The vectors in the test configuration are smaller (32 length)
            embedding = h5py_file.get(sentence)
            assert embedding.shape == (len(sentence.split()), 32)
            numpy.testing.assert_allclose(embedding, expected_embedding, rtol=1e-4)
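
# The test above drives the same command-line interface a user would invoke;
# a hedged equivalent (the paths here are placeholders):
#
#   python run.py elmo sentences.txt output.txt --average \
#       --options-file options.json --weight-file weights.hdf5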
Example #2
    def get_sentence_embeddings(self, sentences, name="test"):
        # Layer 0 holds the token representations, which are not sensitive to context
        # Layer 1 holds the representations from the first biLSTM
        # Layer 2 holds the representations from the second biLSTM

        # Load any preexisting embeddings for name. Careful, make sure that name is unique!
        embedding_file = self.embedding_dir + name + "sentence_embeddings.pickle"
        sentence_embeddings = self.load_embeddings(embedding_file)

        if len(sentence_embeddings) != len(sentences):
            if self.embedder is None:
                self.embedder = ElmoEmbedder()

            sentence_embeddings = self.embedder.embed_batch(sentences)

            if len(sentence_embeddings) != len(sentences):
                logging.warning("Something went wrong with the embedding. "
                                "Number of embeddings: %d, number of sentences: %d",
                                len(sentence_embeddings), len(sentences))

            self.save_embeddings(embedding_file, sentence_embeddings)

        single_layer_embeddings = [embedding[self.layer_id] for embedding in sentence_embeddings]

        if self.only_forward:
            forward_embeddings = []
            for sentence_embedding in single_layer_embeddings:
                forward_embeddings.append([token_embedding[0:512] for token_embedding in sentence_embedding])
            return forward_embeddings
        else:
            return single_layer_embeddings
Example #3
    def test_embeddings_are_as_expected(self):
        loaded_sentences, loaded_embeddings = self._load_sentences_embeddings()

        assert len(loaded_sentences) == len(loaded_embeddings)
        batch_size = len(loaded_sentences)

        # The sentences and embeddings are organized in the idiosyncratic way TensorFlow handles batching.
        # We are going to reorganize them linearly so they can be grouped into batches by AllenNLP.
        sentences = []
        expected_embeddings = []
        for batch_number in range(len(loaded_sentences[0])):
            for index in range(batch_size):
                sentences.append(loaded_sentences[index][batch_number].split())
                expected_embeddings.append(loaded_embeddings[index][batch_number])

        assert len(expected_embeddings) == len(sentences)

        embedder = ElmoEmbedder(options_file=self.options_file, weight_file=self.weight_file)
        embeddings = list(embedder.embed_sentences(sentences, batch_size))

        assert len(embeddings) == len(sentences)

        for tensor, expected in zip(embeddings, expected_embeddings):
            numpy.testing.assert_array_almost_equal(tensor[2], expected)
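
# A hedged illustration of the reorganization above: the nested loops flatten
# the TensorFlow layout batch-number-major, so that
#
#   sentences[batch_number * batch_size + index] == loaded_sentences[index][batch_number].split()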
Example #4
class ElmoEncoder(TextEncoder):

    def __init__(self, embedding_dir):
        super(ElmoEncoder, self).__init__(embedding_dir)
        self.layer_id = 1
        self.only_forward = False
        self.embedder = None

    # Word embeddings are just sentence embeddings with sentences consisting of a single word
    # This is most likely too naive.
    def get_word_embeddings(self, words, name="test"):
        return self.get_sentence_embeddings(words, name)

    # Takes a list of sentences and returns a list of embeddings
    def get_sentence_embeddings(self, sentences, name="test"):
        # Layer 0 holds the token representations, which are not sensitive to context
        # Layer 1 holds the representations from the first biLSTM
        # Layer 2 holds the representations from the second biLSTM

        # Load any preexisting embeddings for name. Careful, make sure that name is unique!
        embedding_file = self.embedding_dir + name + "sentence_embeddings.pickle"
        sentence_embeddings = self.load_embeddings(embedding_file)

        if len(sentence_embeddings) != len(sentences):
            if self.embedder is None:
                self.embedder = ElmoEmbedder()

            sentence_embeddings = self.embedder.embed_batch(sentences)

            if len(sentence_embeddings) != len(sentences):
                logging.warning("Something went wrong with the embedding. "
                                "Number of embeddings: %d, number of sentences: %d",
                                len(sentence_embeddings), len(sentences))

            self.save_embeddings(embedding_file, sentence_embeddings)

        single_layer_embeddings = [embedding[self.layer_id] for embedding in sentence_embeddings]

        if self.only_forward:
            forward_embeddings = []
            for sentence_embedding in single_layer_embeddings:
                forward_embeddings.append([token_embedding[0:512] for token_embedding in sentence_embedding])
            return forward_embeddings
        else:
            return single_layer_embeddings
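
# A minimal usage sketch for the class above (hedged: the directory and the
# sentence are made up; dimensions assume the default ELMo model, whose layers
# are 1024-d with the forward-LSTM half in the first 512 dimensions):
#
#   encoder = ElmoEncoder("embeddings/")
#   embs = encoder.get_sentence_embeddings([["Hello", "world"]], name="demo")
#   # embs[0] holds one 1024-d vector per token (layer 1, both directions);
#   # setting encoder.only_forward = True truncates each vector to 512 dims.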
Example #5
    def test_embed_batch_contains_empty_sentence(self):
        embedder = ElmoEmbedder(options_file=self.options_file, weight_file=self.weight_file)
        embeddings = list(embedder.embed_sentences(["This is a test".split(), []]))

        assert len(embeddings) == 2
Example #6
    def test_embed_batch_contains_empty_sentence(self):
        embedder = ElmoEmbedder(options_file=self.options_file, weight_file=self.weight_file)
        embeddings = list(embedder.embed_sentences([u"This is a test".split(), []]))

        assert len(embeddings) == 2
Example #7
parser = argparse.ArgumentParser(description='sow training')
parser.add_argument('--input_file',
                    default='../sample_test_sow_reap.txt',
                    help='input file')
parser.add_argument('--elmo_data_dir',
                    help='path to elmo weights and options file')
parser.add_argument('--output_folder',
                    default='sow_intermediate',
                    help='output folder')
args = parser.parse_args()

options_file = os.path.join(args.elmo_data_dir, 'options.json')
weight_file = os.path.join(args.elmo_data_dir, 'weights.hdf5')

elmo = ElmoEmbedder(options_file, weight_file)
batch_size = 200
input_file = open(args.input_file)
input_file.readline()  # skip the two extraneous header lines of the file
input_file.readline()
sentences = []
while True:
    line = input_file.readline()
    if line == "":  ### REACHED END OF FILE
        break
    else:
        sentence = get_phrase_list.get_next_sentence(input_file)
        sentences.append(sentence.sent.split(' '))

embeddings = []
for i in range(int(len(sentences) / batch_size) + 1):
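    # (Hedged completion: the source snippet is truncated at this loop. A
    # plausible body embeds each slice with ElmoEmbedder.embed_batch, which
    # returns one (3, num_tokens, 1024) array per sentence.)
    batch = sentences[i * batch_size:(i + 1) * batch_size]
    if batch:
        embeddings.extend(elmo.embed_batch(batch))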
Example #8
class ElmoEncoder(object):
    def __init__(self):
        self.elmo = ElmoEmbedder()

    # return: numpy array
    def encode_batch(self, sents):
        vec_seq = self.elmo.embed_sentences(sents)
        vecs = []
        for vec in vec_seq:
            vecs.append(self.collapse_vec(vec))
        # vecs = torch.stack(vecs)
        vecs = np.stack(vecs)
        return vecs

    def collapse_vec(self,
                     vec_seq,
                     time_combine_method="max",
                     layer_combine_method="add"):
        if time_combine_method == "max":
            vec = vec_seq.max(axis=1)
        elif time_combine_method == "mean":
            vec = vec_seq.mean(axis=1)
        elif time_combine_method == "concat":
            vec = np.concatenate(vec_seq, axis=1)
        elif time_combine_method == "last":
            vec = vec_seq[:, -1]
        else:
            raise NotImplementedError

        if layer_combine_method == "add":
            vec = vec.sum(axis=0)
        elif layer_combine_method == "mean":
            vec = vec.mean(axis=0)
        elif layer_combine_method == "concat":
            vec = np.concatenate(vec, axis=0)
        elif layer_combine_method == "last":
            vec = vec[-1]
        else:
            raise NotImplementedError

        return vec

    def encode(self,
               sents,
               time_combine_method="max",
               layer_combine_method="add"):
        """ Load ELMo and encode sents """
        vecs = {}
        for sent in sents:
            vec_seq = self.elmo.embed_sentence(sent)
            if time_combine_method == "max":
                vec = vec_seq.max(axis=1)
            elif time_combine_method == "mean":
                vec = vec_seq.mean(axis=1)
            elif time_combine_method == "concat":
                vec = np.concatenate(vec_seq, axis=1)
            elif time_combine_method == "last":
                vec = vec_seq[:, -1]
            else:
                raise NotImplementedError

            if layer_combine_method == "add":
                vec = vec.sum(axis=0)
            elif layer_combine_method == "mean":
                vec = vec.mean(axis=0)
            elif layer_combine_method == "concat":
                vec = np.concatenate(vec, axis=0)
            elif layer_combine_method == "last":
                vec = vec[-1]
            else:
                raise NotImplementedError
            vecs[' '.join(sent)] = vec
        return vecs
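
# A minimal usage sketch (hedged: the sentences are made up; with the default
# model each collapsed vector is 1024-d, since "max" pools over time and "add"
# sums the three layers):
#
#   encoder = ElmoEncoder()
#   vecs = encoder.encode_batch([["a", "test"], ["another", "one"]])
#   # vecs.shape == (2, 1024)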
Example #9
"""
requirements
pip install numpy
pip install scikit-learn
pip install allennlp
pip install allennlp-models
"""

#import scipy
import logging
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from allennlp.commands.elmo import ElmoEmbedder

logging.info("Loading of ELMo...")
elmo = ElmoEmbedder()   # by default
# or you can use another model
#options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
#weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
#elmo = ElmoEmbedder(options_file=options_file, weight_file=weight_file)
#tokens = ["I", "ate", "an", "apple", "for", "breakfast"]
#vectors = elmo.embed_sentence(tokens)
#scipy.spatial.distance.cosine(vecs1[2][0], vecs2[2][0])


def similarity_matrix(corpus):
    """
    corpus = [["First", "name"], ["Second", "name"], ["Given", "name"], ["Last", "name"]]
    or corpus = [["First name"], ["Second name"], ["Given name"], ["Last name"]]
    """
    layer = 2   # the output layer 
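    # (Hedged completion: the source is cut off here. A plausible body embeds
    # each document, averages the top-layer token vectors into one vector per
    # document, and compares all pairs with cosine_similarity.)
    doc_vectors = []
    for tokens in corpus:
        layers = elmo.embed_sentence(tokens)        # (3, num_tokens, 1024)
        doc_vectors.append(layers[layer].mean(axis=0))
    return cosine_similarity(np.array(doc_vectors))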
Example #10
class SeqVecEmbedder(EmbedderWithFallback):
    """SeqVec Embedder

    Heinzinger, Michael, et al. "Modeling aspects of the language of life
    through transfer-learning protein sequences." BMC bioinformatics 20.1 (2019): 723.
    https://doi.org/10.1186/s12859-019-3220-8
    """
    name = "seqvec"
    embedding_dimension = 1024
    number_of_layers = 3

    _weights_file: str
    _options_file: str
    _model: ElmoEmbedder
    # The fallback model running on the cpu, which will be initialized if needed
    _model_fallback: Optional[ElmoEmbedder] = None
    _necessary_files = ["weights_file", "options_file"]

    def __init__(self, warmup_rounds: int = 4, **kwargs):
        """
        Initialize Elmo embedder. Can define non-positional arguments for paths of files and other settings.

        :param warmup_rounds: A sample sequence will be embedded this often to
            work around elmo's non-determinism (https://github.com/allenai/allennlp/blob/v0.9.0/tutorials/how_to/elmo.md#notes-on-statefulness-and-non-determinism)
        :param weights_file: path of weights file
        :param options_file: path of options file
        :param model_directory: Alternative of weights_file/options_file
        :param max_amino_acids: max # of amino acids to include in embed_many batches. Default: 15k AA
        """
        super().__init__(**kwargs)

        # Get file locations from kwargs
        if "model_directory" in self._options:
            self._weights_file = str(
                Path(
                    self._options["model_directory"]).joinpath("weights_file"))
            self._options_file = str(
                Path(
                    self._options["model_directory"]).joinpath("options_file"))
        else:
            self._weights_file = self._options["weights_file"]
            self._options_file = self._options["options_file"]

        if self._device.type == "cuda":
            logger.info("CUDA available, using the GPU")
            cuda_device = self._device.index or 0
        else:
            logger.info("CUDA NOT available, using the CPU. This is slow")
            cuda_device = -1

        self._model = ElmoEmbedder(
            weight_file=self._weights_file,
            options_file=self._options_file,
            cuda_device=cuda_device,
        )

        self.warmup_rounds = warmup_rounds
        if self.warmup_rounds > 0:
            logger.info("Running ELMo warmup")
            for _ in range(self.warmup_rounds):
                self.embed(_warmup_seq)

    def embed(self, sequence: str) -> ndarray:
        return self._model.embed_sentence(list(sequence))

    def _get_fallback_model(self) -> ElmoEmbedder:
        if not self._model_fallback:
            logger.warning(
                "Loading model for CPU into RAM. Embedding on the CPU is very slow and you should avoid it."
            )
            self._model_fallback = ElmoEmbedder(
                weight_file=self._weights_file,
                options_file=self._options_file,
                cuda_device=-1,
            )
            if self.warmup_rounds > 0:
                logger.info("Running CPU ELMo warmup")
                for _ in range(self.warmup_rounds):
                    self._model_fallback.embed_sentence(list(_warmup_seq))
        return self._model_fallback

    def _embed_batch_impl(
            self, batch: List[str],
            model: ElmoEmbedder) -> Generator[ndarray, None, None]:
        # ELMo expects a `List[str]`, since it was designed for tokens/words with more than one character.
        yield from model.embed_batch([list(seq) for seq in batch])

    @staticmethod
    def reduce_per_protein(embedding):
        return embedding.sum(0).mean(0)
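
# A minimal usage sketch (hedged: the file paths are placeholders; per the
# class constants, embed() returns a (3, sequence_length, 1024) array and
# reduce_per_protein() collapses it to a single 1024-d vector):
#
#   embedder = SeqVecEmbedder(weights_file="weights.hdf5",
#                             options_file="options.json")
#   per_residue = embedder.embed("SEQWENCE")
#   per_protein = SeqVecEmbedder.reduce_per_protein(per_residue)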
Example #11
parser.add_argument('--subsetfile',
                    default='labels/train_split_Depression_AVEC2017.csv',
                    type=str)
parser.add_argument('--transcriptdir', type=str, default='labels_processed')
parser.add_argument('-ip', type=str, default=None)
parser.add_argument('-o',
                    '--output',
                    type=str,
                    default='train_elmo.ark',
                    help='feature output')
parser.add_argument('-w', type=int, default=4, help="Worker count")
parser.add_argument('--filterlen', default=0, type=int)
parser.add_argument('--filterby', type=str, default='Participant')
args = parser.parse_args()

elmo = ElmoEmbedder()

# Extracting features for the Participant IDs
subset_df = pd.read_csv(args.subsetfile)
speakers = subset_df['Participant_ID'].values

with open(args.output, 'wb') as fd:
    for speaker in tqdm(speakers):
        # Process the transcript first to get the start/end segments
        transcript_file = glob(
            os.path.join(args.transcriptdir, str(speaker)) +
            '*TRANSCRIPT.csv')[0]
        transcript_df = pd.read_csv(transcript_file, sep='\t')
        transcript_df.value = transcript_df.value.str.strip()
        transcript_df.dropna(inplace=True)
        transcript_df = transcript_df[
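            # (hedged completion: the source is truncated mid-expression; a
            # plausible filter keeps only rows whose speaker matches
            # --filterby, a column present in the AVEC transcripts)
            transcript_df.speaker == args.filterby]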
Example #12
class Model(BaseModel):
    def __init__(self, vocab, config):
        word2id = vocab.word2idx
        super(Model, self).__init__()
        vocab_num = len(word2id)
        self.word2id = word2id
        self.config = config
        self.char_dict = preprocess.get_char_dict('data/char_vocab.english.txt')
        self.genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}
        self.device = torch.device("cuda:" + config.cuda)

        self.emb = nn.Embedding(vocab_num, 350)

        emb1 = EmbedLoader().load_with_vocab(config.glove, vocab, normalize=False)
        emb2 = EmbedLoader().load_with_vocab(config.turian, vocab, normalize=False)
        pre_emb = np.concatenate((emb1, emb2), axis=1)
        pre_emb /= (np.linalg.norm(pre_emb, axis=1, keepdims=True) + 1e-12)

        if pre_emb is not None:
            self.emb.weight = nn.Parameter(torch.from_numpy(pre_emb).float())
            for param in self.emb.parameters():
                param.requires_grad = False
        self.emb_dropout = nn.Dropout(inplace=True)


        if config.use_elmo:
            self.elmo = ElmoEmbedder(options_file='data/elmo/elmo_2x4096_512_2048cnn_2xhighway_options.json',
                                     weight_file='data/elmo/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5',
                                     cuda_device=int(config.cuda))
            print("elmo load over.")
            self.elmo_args = torch.randn((3), requires_grad=True).to(self.device)

        self.char_emb = nn.Embedding(len(self.char_dict), config.char_emb_size)
        self.conv1 = nn.Conv1d(config.char_emb_size, 50, 3)
        self.conv2 = nn.Conv1d(config.char_emb_size, 50, 4)
        self.conv3 = nn.Conv1d(config.char_emb_size, 50, 5)

        self.feature_emb = nn.Embedding(config.span_width, config.feature_size)
        self.feature_emb_dropout = nn.Dropout(p=0.2, inplace=True)

        self.mention_distance_emb = nn.Embedding(10, config.feature_size)
        self.distance_drop = nn.Dropout(p=0.2, inplace=True)

        self.genre_emb = nn.Embedding(7, config.feature_size)
        self.speaker_emb = nn.Embedding(2, config.feature_size)

        self.bilstm = VarLSTM(input_size=350 + 150 * config.use_CNN + config.use_elmo * 1024,
                              hidden_size=200, bidirectional=True, batch_first=True,
                              hidden_dropout=0.2)
        # self.bilstm = nn.LSTM(input_size=500, hidden_size=200, bidirectional=True, batch_first=True)
        self.h0 = nn.init.orthogonal_(torch.empty(2, 1, 200)).to(self.device)
        self.c0 = nn.init.orthogonal_(torch.empty(2, 1, 200)).to(self.device)
        self.bilstm_drop = nn.Dropout(p=0.2, inplace=True)

        self.atten = ffnn(input_size=400, hidden_size=config.atten_hidden_size, output_size=1)
        self.mention_score = ffnn(input_size=1320, hidden_size=config.mention_hidden_size, output_size=1)
        self.sa = ffnn(input_size=3980+40*config.use_metadata, hidden_size=config.sa_hidden_size, output_size=1)
        self.mention_start_np = None
        self.mention_end_np = None

    def _reorder_lstm(self, word_emb, seq_lens):
        sort_ind = sorted(range(len(seq_lens)), key=lambda i: seq_lens[i], reverse=True)
        seq_lens_re = [seq_lens[i] for i in sort_ind]
        emb_seq = self.reorder_sequence(word_emb, sort_ind, batch_first=True)
        packed_seq = nn.utils.rnn.pack_padded_sequence(emb_seq, seq_lens_re, batch_first=True)

        h0 = self.h0.repeat(1, len(seq_lens), 1)
        c0 = self.c0.repeat(1, len(seq_lens), 1)
        packed_out, final_states = self.bilstm(packed_seq, (h0, c0))

        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        back_map = {ind: i for i, ind in enumerate(sort_ind)}
        reorder_ind = [back_map[i] for i in range(len(seq_lens_re))]
        lstm_out = self.reorder_sequence(lstm_out, reorder_ind, batch_first=True)
        return lstm_out

    def reorder_sequence(self, sequence_emb, order, batch_first=True):
        """
        sequence_emb: [T, B, D] if not batch_first
        order: permutation of batch indices
        """
        batch_dim = 0 if batch_first else 1
        assert len(order) == sequence_emb.size()[batch_dim]

        order = torch.LongTensor(order)
        order = order.to(sequence_emb).long()

        sorted_ = sequence_emb.index_select(index=order, dim=batch_dim)

        del order
        return sorted_

    def flat_lstm(self, lstm_out, seq_lens):
        batch = lstm_out.shape[0]
        seq = lstm_out.shape[1]
        dim = lstm_out.shape[2]
        l = [j + i * seq for i, seq_len in enumerate(seq_lens) for j in range(seq_len)]
        flatted = torch.index_select(lstm_out.view(batch * seq, dim), 0, torch.LongTensor(l).to(self.device))
        return flatted

    def potential_mention_index(self, word_index, max_sent_len):
        # get mention indices; [3,2] means the first sentence has 3 words and the second has 2
        # [0,0,0,1,1] --> [[0, 0], [0, 1], [1, 1], [1, 2], [2, 2], [3, 3], [3, 4], [4, 4]] (max =2)
        potential_mention = []
        for i in range(len(word_index)):
            for j in range(i, i + max_sent_len):
                if (j < len(word_index) and word_index[i] == word_index[j]):
                    potential_mention.append([i, j])
        return potential_mention

    def get_mention_start_end(self, seq_lens):
        # convert sequence lengths to per-word sentence indices
        # [3,2] --> [0,0,0,1,1]
        word_index = [0] * sum(seq_lens)
        sent_index = 0
        index = 0
        for length in seq_lens:
            for l in range(length):
                word_index[index] = sent_index
                index += 1
            sent_index += 1

        # [0,0,0,1,1]-->[[0,0],[0,1],[0,2]....]
        mention_id = self.potential_mention_index(word_index, self.config.span_width)
        mention_start = np.array(mention_id, dtype=int)[:, 0]
        mention_end = np.array(mention_id, dtype=int)[:, 1]
        return mention_start, mention_end

    def get_mention_emb(self, flatten_lstm, mention_start, mention_end):
        mention_start_tensor = torch.from_numpy(mention_start).to(self.device)
        mention_end_tensor = torch.from_numpy(mention_end).to(self.device)
        emb_start = flatten_lstm.index_select(dim=0, index=mention_start_tensor)  # [mention_num,embed]
        emb_end = flatten_lstm.index_select(dim=0, index=mention_end_tensor)  # [mention_num,embed]
        return emb_start, emb_end

    def get_mask(self, mention_start, mention_end):
        # big mask for attention
        mention_num = mention_start.shape[0]
        mask = np.zeros((mention_num, self.config.span_width))  # [mention_num,span_width]
        for i in range(mention_num):
            start = mention_start[i]
            end = mention_end[i]
            # the loop bound is actually the span width
            for j in range(end - start + 1):
                mask[i][j] = 1
        mask = torch.from_numpy(mask)  # [mention_num,max_mention]
        # 0-->-inf  1-->0
        log_mask = torch.log(mask)
        return log_mask

    def get_mention_index(self, mention_start, max_mention):
        # TODO: may need to change later
        assert len(mention_start.shape) == 1
        mention_start_tensor = torch.from_numpy(mention_start)
        num_mention = mention_start_tensor.shape[0]
        mention_index = mention_start_tensor.expand(max_mention, num_mention).transpose(0,
                                                                                        1)  # [num_mention,max_mention]
        assert mention_index.shape[0] == num_mention
        assert mention_index.shape[1] == max_mention
        range_add = torch.arange(0, max_mention).expand(num_mention, max_mention).long()  # [num_mention,max_mention]
        mention_index = mention_index + range_add
        mention_index = torch.min(mention_index, torch.LongTensor([mention_start[-1]]).expand(num_mention, max_mention))
        return mention_index.to(self.device)

    def sort_mention(self, mention_start, mention_end, candidate_mention_emb, candidate_mention_score, seq_lens):
        # sort the mentions so the highest-scoring ones come first
        mention_score, mention_ids = torch.sort(candidate_mention_score, descending=True)
        preserve_mention_num = int(self.config.mention_ratio * sum(seq_lens))
        mention_ids = mention_ids[0:preserve_mention_num]
        mention_score = mention_score[0:preserve_mention_num]

        mention_start_tensor = torch.from_numpy(mention_start).to(self.device).index_select(dim=0,
                                                                                            index=mention_ids)  # [lamda*word_num]
        mention_end_tensor = torch.from_numpy(mention_end).to(self.device).index_select(dim=0,
                                                                                        index=mention_ids)  # [lamda*word_num]
        mention_emb = candidate_mention_emb.index_select(index=mention_ids, dim=0)  # [lamda*word_num,emb]
        assert mention_score.shape[0] == preserve_mention_num
        assert mention_start_tensor.shape[0] == preserve_mention_num
        assert mention_end_tensor.shape[0] == preserve_mention_num
        assert mention_emb.shape[0] == preserve_mention_num
        # TODO: crossing (overlapping) mentions are not handled

        # re-sort by start so mentions appear in document order
        # TODO: this only considers start, not end
        mention_start_tensor, temp_index = torch.sort(mention_start_tensor)
        mention_end_tensor = mention_end_tensor.index_select(dim=0, index=temp_index)
        mention_emb = mention_emb.index_select(dim=0, index=temp_index)
        mention_score = mention_score.index_select(dim=0, index=temp_index)
        return mention_start_tensor, mention_end_tensor, mention_score, mention_emb

    def get_antecedents(self, mention_starts, max_antecedents):
        num_mention = mention_starts.shape[0]
        max_antecedents = min(max_antecedents, num_mention)
        # mapping between each mention and the indices of its candidate antecedents
        antecedents = np.zeros((num_mention, max_antecedents), dtype=int)  # [num_mention,max_an]
        # record the number of antecedents per mention
        antecedents_len = [0] * num_mention
        for i in range(num_mention):
            ante_count = 0
            for j in range(max(0, i - max_antecedents), i):
                antecedents[i, ante_count] = j
                ante_count += 1
            # pad the remaining slots
            for j in range(ante_count, max_antecedents):
                antecedents[i, j] = 0
            antecedents_len[i] = ante_count
        assert antecedents.shape[1] == max_antecedents
        return antecedents, antecedents_len

    def get_antecedents_score(self, span_represent, mention_score, antecedents, antecedents_len, mention_speakers_ids,
                              genre):
        num_mention = mention_score.shape[0]
        max_antecedent = antecedents.shape[1]

        pair_emb = self.get_pair_emb(span_represent, antecedents, mention_speakers_ids, genre)  # [span_num,max_ant,emb]
        antecedent_scores = self.sa(pair_emb)
        mask01 = self.sequence_mask(antecedents_len, max_antecedent)
        maskinf = torch.log(mask01).to(self.device)
        assert maskinf.shape[1] <= max_antecedent
        assert antecedent_scores.shape[0] == num_mention
        antecedent_scores = antecedent_scores + maskinf
        antecedents = torch.from_numpy(antecedents).to(self.device)
        mention_scoreij = mention_score.unsqueeze(1) + torch.gather(
            mention_score.unsqueeze(0).expand(num_mention, num_mention), dim=1, index=antecedents)
        antecedent_scores += mention_scoreij

        antecedent_scores = torch.cat([torch.zeros([mention_score.shape[0], 1]).to(self.device), antecedent_scores],
                                      1)  # [num_mentions, max_ant + 1]
        return antecedent_scores

    ##############################
    def distance_bin(self, mention_distance):
        bins = torch.zeros(mention_distance.size()).byte().to(self.device)
        rg = [[1, 1], [2, 2], [3, 3], [4, 4], [5, 7], [8, 15], [16, 31], [32, 63], [64, 300]]
        for t, k in enumerate(rg):
            i, j = k[0], k[1]
            b = torch.LongTensor([i]).unsqueeze(-1).expand(mention_distance.size()).to(self.device)
            m1 = torch.ge(mention_distance, b)
            e = torch.LongTensor([j]).unsqueeze(-1).expand(mention_distance.size()).to(self.device)
            m2 = torch.le(mention_distance, e)
            bins = bins + (t + 1) * (m1 & m2)
        return bins.long()

    def get_distance_emb(self, antecedents_tensor):
        num_mention = antecedents_tensor.shape[0]
        max_ant = antecedents_tensor.shape[1]

        assert max_ant <= self.config.max_antecedents
        source = torch.arange(0, num_mention).expand(max_ant, num_mention).transpose(0,1).to(self.device)  # [num_mention,max_ant]
        mention_distance = source - antecedents_tensor
        mention_distance_bin = self.distance_bin(mention_distance)
        distance_emb = self.mention_distance_emb(mention_distance_bin)
        distance_emb = self.distance_drop(distance_emb)
        return distance_emb

    def get_pair_emb(self, span_emb, antecedents, mention_speakers_ids, genre):
        emb_dim = span_emb.shape[1]
        num_span = span_emb.shape[0]
        max_ant = antecedents.shape[1]
        assert span_emb.shape[0] == antecedents.shape[0]
        antecedents = torch.from_numpy(antecedents).to(self.device)

        # [num_span,max_ant,emb]
        antecedent_emb = torch.gather(span_emb.unsqueeze(0).expand(num_span, num_span, emb_dim), dim=1,
                                      index=antecedents.unsqueeze(2).expand(num_span, max_ant, emb_dim))
        # [num_span,max_ant,emb]
        target_emb_tiled = span_emb.expand((max_ant, num_span, emb_dim))
        target_emb_tiled = target_emb_tiled.transpose(0, 1)

        similarity_emb = antecedent_emb * target_emb_tiled

        pair_emb_list = [target_emb_tiled, antecedent_emb, similarity_emb]

        # get speakers and genre
        if self.config.use_metadata:
            antecedent_speaker_ids = mention_speakers_ids.unsqueeze(0).expand(num_span, num_span).gather(dim=1,
                                                                                                         index=antecedents)
            same_speaker = torch.eq(mention_speakers_ids.unsqueeze(1).expand(num_span, max_ant),
                                    antecedent_speaker_ids)  # [num_mention,max_ant]
            speaker_embedding = self.speaker_emb(same_speaker.long().to(self.device))  # [mention_num.max_ant,emb]
            genre_embedding = self.genre_emb(
                torch.LongTensor([genre]).expand(num_span, max_ant).to(self.device))  # [mention_num,max_ant,emb]
            pair_emb_list.append(speaker_embedding)
            pair_emb_list.append(genre_embedding)

        # get distance emb
        if self.config.use_distance:
            distance_emb = self.get_distance_emb(antecedents)
            pair_emb_list.append(distance_emb)

        pair_emb = torch.cat(pair_emb_list, 2)
        return pair_emb

    def sequence_mask(self, len_list, max_len):
        x = np.zeros((len(len_list), max_len))
        for i in range(len(len_list)):
            l = len_list[i]
            for j in range(l):
                x[i][j] = 1
        return torch.from_numpy(x).float()

    def logsumexp(self, value, dim=None, keepdim=False):
        """Numerically stable implementation of the operation

        value.exp().sum(dim, keepdim).log()
        """
        # TODO: torch.max(value, dim=None) threw an error at time of writing
        if dim is not None:
            m, _ = torch.max(value, dim=dim, keepdim=True)
            value0 = value - m
            if keepdim is False:
                m = m.squeeze(dim)
            return m + torch.log(torch.sum(torch.exp(value0),
                                           dim=dim, keepdim=keepdim))
        else:
            m = torch.max(value)
            sum_exp = torch.sum(torch.exp(value - m))

            return m + torch.log(sum_exp)

    def softmax_loss(self, antecedent_scores, antecedent_labels):
        antecedent_labels = torch.from_numpy(antecedent_labels * 1).to(self.device)
        gold_scores = antecedent_scores + torch.log(antecedent_labels.float())  # [num_mentions, max_ant + 1]
        marginalized_gold_scores = self.logsumexp(gold_scores, 1)  # [num_mentions]
        log_norm = self.logsumexp(antecedent_scores, 1)  # [num_mentions]
        return torch.sum(log_norm - marginalized_gold_scores)  # [num_mentions]reduce_logsumexp

    def get_predicted_antecedents(self, antecedents, antecedent_scores):
        predicted_antecedents = []
        for i, index in enumerate(np.argmax(antecedent_scores.detach(), axis=1) - 1):
            if index < 0:
                predicted_antecedents.append(-1)
            else:
                predicted_antecedents.append(antecedents[i, index])
        return predicted_antecedents

    def get_predicted_clusters(self, mention_starts, mention_ends, predicted_antecedents):
        mention_to_predicted = {}
        predicted_clusters = []
        for i, predicted_index in enumerate(predicted_antecedents):
            if predicted_index < 0:
                continue
            assert i > predicted_index
            predicted_antecedent = (int(mention_starts[predicted_index]), int(mention_ends[predicted_index]))
            if predicted_antecedent in mention_to_predicted:
                predicted_cluster = mention_to_predicted[predicted_antecedent]
            else:
                predicted_cluster = len(predicted_clusters)
                predicted_clusters.append([predicted_antecedent])
                mention_to_predicted[predicted_antecedent] = predicted_cluster

            mention = (int(mention_starts[i]), int(mention_ends[i]))
            predicted_clusters[predicted_cluster].append(mention)
            mention_to_predicted[mention] = predicted_cluster

        predicted_clusters = [tuple(pc) for pc in predicted_clusters]
        mention_to_predicted = {m: predicted_clusters[i] for m, i in mention_to_predicted.items()}

        return predicted_clusters, mention_to_predicted

    def evaluate_coref(self, mention_starts, mention_ends, predicted_antecedents, gold_clusters, evaluator):
        gold_clusters = [tuple(tuple(m) for m in gc) for gc in gold_clusters]
        mention_to_gold = {}
        for gc in gold_clusters:
            for mention in gc:
                mention_to_gold[mention] = gc
        predicted_clusters, mention_to_predicted = self.get_predicted_clusters(mention_starts, mention_ends,
                                                                               predicted_antecedents)
        evaluator.update(predicted_clusters, gold_clusters, mention_to_predicted, mention_to_gold)
        return predicted_clusters


    def forward(self, sentences, doc_np, speaker_ids_np, genre, char_index, seq_len):
        """
        All inputs are tensors in practice.
        :param sentences: the sentences, converted to numpy by fastNLP
        :param doc_np: converted to a Tensor by fastNLP
        :param speaker_ids_np: converted to a Tensor by fastNLP
        :param genre: converted to a Tensor by fastNLP
        :param char_index: converted to a Tensor by fastNLP
        :param seq_len: converted to a Tensor by fastNLP
        :return:
        """
        # change for fastNLP
        sentences = sentences[0].tolist()
        doc_tensor = doc_np[0]
        speakers_tensor = speaker_ids_np[0]
        genre = genre[0].item()
        char_index = char_index[0]
        seq_len = seq_len[0].cpu().numpy()

        # type conversions

        # doc_tensor = torch.from_numpy(doc_np).to(self.device)
        # speakers_tensor = torch.from_numpy(speaker_ids_np).to(self.device)
        mention_emb_list = []

        word_emb = self.emb(doc_tensor)
        word_emb_list = [word_emb]
        if self.config.use_CNN:
            # [batch, length, char_length, char_dim]
            char = self.char_emb(char_index)
            char_size = char.size()
            # first transform to [batch *length, char_length, char_dim]
            # then transpose to [batch * length, char_dim, char_length]
            char = char.view(char_size[0] * char_size[1], char_size[2], char_size[3]).transpose(1, 2)

            # put into cnn [batch*length, char_filters, char_length]
            # then put into maxpooling [batch * length, char_filters]
            char_over_cnn, _ = self.conv1(char).max(dim=2)
            # reshape to [batch, length, char_filters]
            char_over_cnn = torch.tanh(char_over_cnn).view(char_size[0], char_size[1], -1)
            word_emb_list.append(char_over_cnn)

            char_over_cnn, _ = self.conv2(char).max(dim=2)
            char_over_cnn = torch.tanh(char_over_cnn).view(char_size[0], char_size[1], -1)
            word_emb_list.append(char_over_cnn)

            char_over_cnn, _ = self.conv3(char).max(dim=2)
            char_over_cnn = torch.tanh(char_over_cnn).view(char_size[0], char_size[1], -1)
            word_emb_list.append(char_over_cnn)

        # word_emb = torch.cat(word_emb_list, dim=2)

        # use elmo or not
        if self.config.use_elmo:
            # if the document was actually truncated
            if doc_tensor.shape[0] == 50 and len(sentences) > 50:
                sentences = sentences[0:50]
            elmo_embedding, elmo_mask = self.elmo.batch_to_embeddings(sentences)
            elmo_embedding = elmo_embedding.to(
                self.device)  # [sentence_num,max_sent_len,3,1024]--[sentence_num,max_sent,1024]
            elmo_embedding = elmo_embedding[:, 0, :, :] * self.elmo_args[0] + elmo_embedding[:, 1, :, :] * \
                             self.elmo_args[1] + elmo_embedding[:, 2, :, :] * self.elmo_args[2]
            word_emb_list.append(elmo_embedding)
        # print(word_emb_list[0].shape)
        # print(word_emb_list[1].shape)
        # print(word_emb_list[2].shape)
        # print(word_emb_list[3].shape)
        # print(word_emb_list[4].shape)

        word_emb = torch.cat(word_emb_list, dim=2)

        word_emb = self.emb_dropout(word_emb)
        # word_emb_elmo = self.emb_dropout(word_emb_elmo)
        lstm_out = self._reorder_lstm(word_emb, seq_len)
        flatten_lstm = self.flat_lstm(lstm_out, seq_len)  # [word_num,emb]
        flatten_lstm = self.bilstm_drop(flatten_lstm)
        # TODO: not implemented exactly as in the paper
        flatten_word_emb = self.flat_lstm(word_emb, seq_len)  # [word_num,emb]

        mention_start, mention_end = self.get_mention_start_end(seq_len)  # [mention_num]
        self.mention_start_np = mention_start  # [mention_num] np
        self.mention_end_np = mention_end
        mention_num = mention_start.shape[0]
        emb_start, emb_end = self.get_mention_emb(flatten_lstm, mention_start, mention_end)  # [mention_num,emb]

        # list
        mention_emb_list.append(emb_start)
        mention_emb_list.append(emb_end)

        if self.config.use_width:
            mention_width_index = mention_end - mention_start
            mention_width_tensor = torch.from_numpy(mention_width_index).to(self.device)  # [mention_num]
            mention_width_emb = self.feature_emb(mention_width_tensor)
            mention_width_emb = self.feature_emb_dropout(mention_width_emb)
            mention_emb_list.append(mention_width_emb)

        if self.config.model_heads:
            mention_index = self.get_mention_index(mention_start, self.config.span_width)  # [mention_num,max_mention]
            log_mask_tensor = self.get_mask(mention_start, mention_end).float().to(
                self.device)  # [mention_num,max_mention]
            alpha = self.atten(flatten_lstm).to(self.device)  # [word_num]

            # compute the attention weights
            mention_head_score = torch.gather(alpha.expand(mention_num, -1), 1,
                                              mention_index).float().to(self.device)  # [mention_num,max_mention]
            mention_attention = F.softmax(mention_head_score + log_mask_tensor, dim=1)  # [mention_num,max_mention]

            # TODO: flatten lstm
            word_num = flatten_lstm.shape[0]
            lstm_emb = flatten_lstm.shape[1]
            emb_num = flatten_word_emb.shape[1]

            # [num_mentions, max_mention_width, emb]
            mention_text_emb = torch.gather(
                flatten_word_emb.unsqueeze(1).expand(word_num, self.config.span_width, emb_num),
                0, mention_index.unsqueeze(2).expand(mention_num, self.config.span_width,
                                                     emb_num))
            # [mention_num,emb]
            mention_head_emb = torch.sum(
                mention_attention.unsqueeze(2).expand(mention_num, self.config.span_width, emb_num) * mention_text_emb,
                dim=1)
            mention_emb_list.append(mention_head_emb)

        candidate_mention_emb = torch.cat(mention_emb_list, 1)  # [candidate_mention_num,emb]
        candidate_mention_score = self.mention_score(candidate_mention_emb)  # [candidate_mention_num]

        antecedent_scores, antecedents, mention_start_tensor, mention_end_tensor = (None, None, None, None)
        mention_start_tensor, mention_end_tensor, mention_score, mention_emb = \
            self.sort_mention(mention_start, mention_end, candidate_mention_emb, candidate_mention_score, seq_len)
        mention_speakers_ids = speakers_tensor.index_select(dim=0, index=mention_start_tensor)  # num_mention

        antecedents, antecedents_len = self.get_antecedents(mention_start_tensor, self.config.max_antecedents)
        antecedent_scores = self.get_antecedents_score(mention_emb, mention_score, antecedents, antecedents_len,
                                                       mention_speakers_ids, genre)

        ans = {"candidate_mention_score": candidate_mention_score, "antecedent_scores": antecedent_scores,
               "antecedents": antecedents, "mention_start_tensor": mention_start_tensor,
               "mention_end_tensor": mention_end_tensor}

        return ans

    def predict(self, sentences, doc_np, speaker_ids_np, genre, char_index, seq_len):
        ans = self(sentences,
                   doc_np,
                   speaker_ids_np,
                   genre,
                   char_index,
                   seq_len)

        predicted_antecedents = self.get_predicted_antecedents(ans["antecedents"], ans["antecedent_scores"])
        predicted_clusters, mention_to_predicted = self.get_predicted_clusters(ans["mention_start_tensor"],
                                                                               ans["mention_end_tensor"],
                                                                               predicted_antecedents)

        return {'predicted': predicted_clusters, 'mention_to_predicted': mention_to_predicted}
Example #13
    args, extra_args = argparser.parse_known_args()
    config = Configurable(args.config_file, extra_args)
    torch.set_num_threads(args.thread)

    vocab = creatVocab(config.train_file, config.min_occur_count)
    pickle.dump(vocab, open(config.save_vocab_path, 'wb'))

    config.use_cuda = False
    gpu_id = -1
    if torch.cuda.is_available() and args.gpu >= 0:
        torch.cuda.set_device(args.gpu)
        config.use_cuda = True
        print("GPU ID: ", args.gpu)
        gpu_id = args.gpu

    elmo = ElmoEmbedder(config.elmo_option_file, config.elmo_weight_file,
                        gpu_id)

    elmo_layers = elmo.elmo_bilm.num_layers
    elmo_dims = elmo.elmo_bilm.get_output_dim()

    model = BiLSTMModel(vocab, config, (elmo_layers, elmo_dims))

    if config.use_cuda:
        #torch.backends.cudnn.enabled = True
        model = model.cuda()

    classifier = SentenceClassifier(model, elmo, vocab)

    data = read_corpus(config.train_file)
    dev_data = read_corpus(config.dev_file)
    test_data = read_corpus(config.test_file)
Example #14
class ElmoExtractor(DirectSentenceExtractor):

    _options_file = 'elmo_2x2048_256_2048cnn_1xhighway_options.json'
    _weight_file = 'elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5'
    _method = 'elmo'

    def __init__(self, layer=None):

        super(ElmoExtractor, self).__init__()

        self._options_file = os.path.join(self._embedding_model_path,
                                          self._method, self._options_file)
        self._weight_file = os.path.join(self._embedding_model_path,
                                         self._method, self._weight_file)
        # first check whether the weight/options files exist
        if not os.path.exists(self._options_file) or \
                not os.path.exists(self._weight_file):
            raise ValueError('ELMo model file(s) are missing. Please download '
                             'the model files via the following command: '
                             "python download.py('elmo')")



        self._elmoObj = ElmoEmbedder(options_file=self._options_file,
                                     weight_file=self._weight_file)

        if layer is None:
            self._layer = 'default'
        else:
            self._layer = layer

    def getVector(self, stim, cbow=False):

        if not isinstance(stim, list):
            stim = [stim]

        if cbow:
            stim_cbows = self.generateCbows(stim)
            stim = stim_cbows

        embeddings = []
        for s in stim:
            tokens = s.split()
            embedding_layers = self._elmoObj.embed_sentence(tokens)
            if self._layer == 'default':
                '''we take the average from
                the three layers of Elmo'''
                embedding_avg = np.average(embedding_layers, axis=0)
                sentence_op = np.average(embedding_avg, axis=0)
            elif self._layer == 'top':
                embedding_top = embedding_layers[-1]
                sentence_op = np.average(embedding_top, axis=0)
            elif self._layer == 'bottom':
                embedding_bottom = embedding_layers[0]  # layer 0 is the bottom (token) layer
                sentence_op = np.average(embedding_bottom, axis=0)

            embeddings.append(sentence_op)

        num_dims = embeddings[0].shape[0]
        features = ['%s%d' % (self.prefix, i) for i in range(num_dims)]

        return ExtractorResult(embeddings, stim, self, features=features)
Example #15
class DataGenerator():
    def __init__(self, configs):
        self.configs = configs

        self.elmo = ElmoEmbedder(options_file=self.configs['elmo_option_file'],
                                 weight_file=self.configs['elmo_weight_file'],
                                 cuda_device=0)

        self.train_c_r, self.train_label = self.load_train_data()
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
              ' : Finish Loading Training Data...')

        self.dev_c_r, self.dev_label = self.load_dev_data()
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
              ' : Finish Loading Dev Data...')

        self.train_data_size = len(self.train_label)
        print('Train set size: ', self.train_data_size)
        self.dev_data_size = len(self.dev_label)
        print('Dev set size: ', self.dev_data_size)

    def train_data_generator(self, batch_num):

        train_size = self.train_data_size
        start = batch_num * self.configs['batch_size'] % train_size
        end = (batch_num * self.configs['batch_size'] +
               self.configs['batch_size']) % train_size

        # shuffle data at the beginning of every epoch
        if batch_num == 0:
            self.train_c_r, self.train_label, _ = self.unison_shuffled_copies(
                self.train_c_r, self.train_label)
            print(
                time.strftime('%Y-%m-%d %H:%M:%S',
                              time.localtime(time.time())),
                ' : Finish Shuffling Data...')

        if start < end:
            batches_label = self.train_label[start:end]
            batches_c_r = self.train_c_r[start:end]
        else:
            batches_label = self.train_label[train_size - self.
                                             configs['batch_size']:train_size]
            batches_c_r = self.train_c_r[train_size -
                                         self.configs['batch_size']:train_size]

        turns, turn_num, turn_len, response, response_len, label = self.batch2placeholder(
            batches_c_r, batches_label)

        return turns, turn_num, turn_len, response, response_len, label

    def dev_data_generator(self, batch_num):
        """
           This function return training/validation/test data for classifier. batch_num*batch_size is start point of the batch.
           :param batch_size: int. the size of each batch
           :return: [[[float32,],],]. [[[wordembedding]element,]batch,]
           """

        dev_size = self.dev_data_size
        start = batch_num * self.configs['batch_size'] % dev_size
        end = (batch_num * self.configs['batch_size'] +
               self.configs['batch_size']) % dev_size
        if start < end:
            batches_label = self.dev_label[start:end]
            batches_c_r = self.dev_c_r[start:end]
        else:
            batches_label = self.dev_label[start:]
            batches_c_r = self.dev_c_r[start:]

        turns, turn_num, turn_len, response, response_len, label = self.batch2placeholder(
            batches_c_r, batches_label)

        return turns, turn_num, turn_len, response, response_len, label

    def batch2placeholder(self, batches_c_r, batches_label):

        tmp = list(zip(*batches_c_r))
        example_id_c_r, turns, turn_num, turn_len, candidate, candidate_len = tmp[
            0], tmp[1], tmp[2], tmp[3], tmp[4], tmp[5]

        tmp = list(zip(*batches_label))
        example_id_y, label = tmp[0], tmp[1]

        assert example_id_c_r == example_id_y

        # shuffle the response order within each example
        candidate, candidate_len, label = self.shuffle_response(
            candidate, candidate_len, label)

        # generate elmo embedding
        turns, turn_len, candidate = self.elmo_emb(turns, turn_len, candidate)

        return turns, turn_num, turn_len, candidate, candidate_len, label

    def unison_shuffled_copies(self, a, b):
        assert len(a) == len(b)
        p = np.random.permutation(len(a))
        return a[p], b[p], p

    def shuffle_response(self, response, response_len, label):
        """
        Shuffle the responses; the label is remapped to the ground truth's new position.
        :param response: (batch_size, options_num, max_turn_len)
        :param response_len: (batch_size, options_num)
        :param label: (batch_size)
        :return:
        """
        tmp_response = np.zeros_like(response)
        tmp_response_len = np.zeros_like(response_len)
        tmp_label = np.zeros_like(label)
        for i in range(len(response)):
            tmp_response[i], tmp_response_len[
                i], shuffle_id = self.unison_shuffled_copies(
                    np.array(response[i]), np.array(response_len[i]))
            tmp_label[i] = np.argwhere(shuffle_id == label[i])

        return tmp_response, tmp_response_len, tmp_label

    def get_context_response(self, data):
        """

        :param data:
        :param eos_idx:
        :param max_turn_num:
        :param max_turn_len:
        :return: array of tuple, tuple:(sent_list, example_turn_num, example_turn_len)
        """

        saver = []

        for c in range(data.shape[0]):
            turn_num = data['turn_num'][c]
            turn_len = data['turn_len'][c]
            c_s = data['context'][c]
            if len(c_s) > self.configs['max_turn_num']:
                c_s = c_s[-self.configs['max_turn_num']:]
                turn_num = self.configs['max_turn_num']
                turn_len = turn_len[-self.configs['max_turn_num']:]

            r_s = data['candidate'][c]

            res = np.array([
                data['id'][c], c_s, turn_num, turn_len, r_s,
                data['candidate_len'][c]
            ],
                           dtype=object)
            saver.append(res)

        return np.array(saver)

    def get_label(self, data):

        saver = []
        for e in range(data.shape[0]):
            res = np.array([data['id'][e], 0], dtype=object)
            saver.append(res)

        return np.array(saver)

    def load_train_data(self):

        if os.path.exists(
                self.configs['process_train_data']) and os.path.getsize(
                    self.configs['process_train_data']) > 0:
            with open(self.configs['process_train_data'], 'rb') as f:
                train_c_r, train_label = pickle.load(f)
        else:
            with open(self.configs['train_data'], 'rb') as f:
                train_data = pickle.load(f)

            train_c_r = self.get_context_response(train_data)
            train_label = self.get_label(train_data)

            with open(self.configs['process_train_data'], 'wb') as f:
                pickle.dump((train_c_r, train_label), f)

        return train_c_r, train_label

    def load_dev_data(self):

        if os.path.exists(
                self.configs['process_dev_data']) and os.path.getsize(
                    self.configs['process_dev_data']) > 0:
            with open(self.configs['process_dev_data'], 'rb') as f:
                dev_c_r, dev_label = pickle.load(f)
        else:
            with open(self.configs['dev_data'], 'rb') as f:
                dev_data = pickle.load(f)

            dev_c_r = self.get_context_response(dev_data)
            dev_label = self.get_label(dev_data)

            with open(self.configs['process_dev_data'], 'wb') as f:
                pickle.dump((dev_c_r, dev_label), f)

        return dev_c_r, dev_label

    def elmo_emb(self, turns, turn_len, candidate):

        _turns = []
        _candidate = []
        _turns_len = []
        for idx in range(self.configs['batch_size']):
            turns_emb = self.elmo.embed_batch(turns[idx])
            candidate_emb = self.elmo.embed_batch(candidate[idx])
            pad_len = np.zeros(shape=[self.configs['max_turn_num']])
            pad_len[:len(turn_len[idx])] = turn_len[idx]
            _turns_len.append(pad_len)

            # Padding turns embedding
            turns_emb_pad = []
            for i, emb in enumerate(turns_emb):
                pad_emb = np.zeros(shape=[
                    self.configs['elmo_layer'], self.configs['max_turn_len'],
                    self.configs['emb_size']
                ],
                                   dtype=np.float32)
                pad_emb[:emb.shape[0], :emb.shape[1], :emb.shape[2]] = emb
                turns_emb_pad.append(pad_emb)

            turns_emb_pad = np.array(turns_emb_pad)
            turns_pad = np.zeros(shape=[
                self.configs['max_turn_num'], self.configs['elmo_layer'],
                self.configs['max_turn_len'], self.configs['emb_size']
            ],
                                 dtype=np.float32)
            turns_pad[:turns_emb_pad.shape[0], :, :, :] = turns_emb_pad
            # Padding candidate embedding
            candidate_emb_pad = []
            for emb in candidate_emb:
                pad_emb = np.zeros(shape=[
                    self.configs['elmo_layer'], self.configs['max_turn_len'],
                    self.configs['emb_size']
                ],
                                   dtype=np.float32)
                pad_emb[:emb.shape[0], :emb.shape[1], :emb.shape[2]] = emb
                candidate_emb_pad.append(pad_emb)
            candidate_emb_pad = np.array(candidate_emb_pad)

            _turns.append(turns_pad)
            _candidate.append(candidate_emb_pad)

        _turns = np.array(_turns)
        _candidate = np.array(_candidate)
        _turns_len = np.array(_turns_len)

        return _turns, _turns_len, _candidate
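
# Shapes produced by elmo_emb above (hedged summary; the config keys are the
# ones the class reads):
#   _turns:     (batch_size, max_turn_num, elmo_layer, max_turn_len, emb_size)
#   _candidate: (batch_size, options_num, elmo_layer, max_turn_len, emb_size)
#   _turns_len: (batch_size, max_turn_num)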
Example #16
def main():
    parser = argparse.ArgumentParser(description='Tuning with bi-directional RNN-CNN-CRF')
    parser.add_argument('--mode', choices=['RNN', 'LSTM', 'GRU'], help='architecture of rnn', required=True)
    parser.add_argument('--num_epochs', type=int, default=100, help='Number of training epochs')
    parser.add_argument('--batch_size', type=int, default=16, help='Number of sentences in each batch')
    parser.add_argument('--hidden_size', type=int, default=128, help='Number of hidden units in RNN')
    parser.add_argument('--tag_space', type=int, default=0, help='Dimension of tag space')
    parser.add_argument('--num_filters', type=int, default=30, help='Number of filters in CNN')
    parser.add_argument('--char_dim', type=int, default=30, help='Dimension of Character embeddings')
    parser.add_argument('--learning_rate', type=float, default=0.015, help='Learning rate')
    parser.add_argument('--alpha', type=float, default=0.1, help='alpha of rmsprop')
    parser.add_argument('--momentum', type=float, default=0, help='momentum')
    parser.add_argument('--lr_decay', type=float, default=0, help='Decay rate of learning rate')
    parser.add_argument('--gamma', type=float, default=0.0, help='weight for regularization')
    parser.add_argument('--dropout', choices=['std', 'variational'], help='type of dropout', required=True)
    parser.add_argument('--p', type=float, default=0.5, help='dropout rate')
    parser.add_argument('--bigram', action='store_true', help='bi-gram parameter for CRF')
    parser.add_argument('--schedule', type=int, help='schedule for learning rate decay')
    parser.add_argument('--unk_replace', type=float, default=0., help='The rate to replace a singleton word with UNK')
    parser.add_argument('--embedding', choices=['word2vec', 'glove', 'senna', 'sskip', 'polyglot', 'elmo'], help='Embedding for words', required=True)
    parser.add_argument('--embedding_dict', help='path for embedding dict')
    parser.add_argument('--elmo_option', help='path for ELMo option file')
    parser.add_argument('--elmo_weight', help='path for ELMo weight file')
    parser.add_argument('--elmo_cuda', help='assign GPU for ELMo embedding task')
    parser.add_argument('--attention', choices=['none', 'mlp', 'fine'], help='attention mode', required=True)
    parser.add_argument('--data_reduce', help='data size reduce, value is keeping rate', default=1.0)
    parser.add_argument('--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument('--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument('--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"

    args = parser.parse_args()

    logger = get_logger("NERCRF")

    mode = args.mode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    num_filters = args.num_filters
    learning_rate = args.learning_rate
    alpha = args.alpha
    momentum = args.momentum
    lr_decay = args.lr_decay
    gamma = args.gamma
    schedule = args.schedule
    p = args.p
    unk_replace = args.unk_replace
    bigram = args.bigram
    embedding = args.embedding
    embedding_path = args.embedding_dict
    elmo_option = args.elmo_option
    elmo_weight = args.elmo_weight
    elmo_cuda = int(args.elmo_cuda) if args.elmo_cuda is not None else -1  # default to CPU when unset
    attention_mode = args.attention
    data_reduce = float(args.data_reduce)

    embedd_dict, embedd_dim = utils.load_embedding_dict(embedding, embedding_path)

    logger.info("Creating Alphabets")
    word_alphabet, char_alphabet, pos_alphabet, \
        chunk_alphabet, ner_alphabet = bionlp_data.create_alphabets(
            os.path.join(Path(train_path).parent.abspath(), "alphabets"), train_path,
            data_paths=[dev_path, test_path], embedd_dict=embedd_dict, max_vocabulary_size=50000)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())

    if embedding == 'elmo':
        logger.info("Loading ELMo Embedder")
        ee = ElmoEmbedder(options_file=elmo_option, weight_file=elmo_weight, cuda_device=elmo_cuda)
    else:
        ee = None

    logger.info("Reading Data")
    use_gpu = torch.cuda.is_available()

    data_train = bionlp_data.read_data_to_variable(train_path, word_alphabet, char_alphabet, pos_alphabet,
                                                    chunk_alphabet, ner_alphabet, use_gpu=use_gpu, 
                                                    elmo_ee=ee, data_reduce=data_reduce)
    num_data = sum(data_train[1])
    num_labels = ner_alphabet.size()

    data_dev = bionlp_data.read_data_to_variable(dev_path, word_alphabet, char_alphabet, pos_alphabet,
                                                  chunk_alphabet, ner_alphabet, use_gpu=use_gpu, volatile=True,
                                                  elmo_ee=ee)

    data_test = bionlp_data.read_data_to_variable(test_path, word_alphabet, char_alphabet, pos_alphabet,
                                                   chunk_alphabet, ner_alphabet, use_gpu=use_gpu, volatile=True,
                                                   elmo_ee=ee)

    writer = BioNLPWriter(word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet)

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[bionlp_data.UNK_ID, :] = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if embedd_dict is not None and word in embedd_dict:
                embedding = embedd_dict[word]
            elif embedd_dict is not None and word.lower() in embedd_dict:
                embedding = embedd_dict[word.lower()]
            else:
                embedding = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    logger.info("constructing network...")

    char_dim = args.char_dim
    window = 3
    num_layers = 1
    tag_space = args.tag_space
    if args.dropout == 'std':
        if attention_mode == 'none':
            network = BiRecurrentConvCRF(embedd_dim, word_alphabet.size(),
                                     char_dim, char_alphabet.size(),
                                     num_filters, window,
                                     mode, hidden_size, num_layers, num_labels,
                                     tag_space=tag_space, embedd_word=word_table, p_in=p, p_rnn=p, bigram=bigram, 
                                     elmo=(embedding == 'elmo'))
        else:
            network = BiRecurrentConvAttentionCRF(embedd_dim, word_alphabet.size(),
                                     char_dim, char_alphabet.size(),
                                     num_filters, window,
                                     mode, hidden_size, num_layers, num_labels,
                                     tag_space=tag_space, embedd_word=word_table, p_in=p, p_rnn=p, bigram=bigram,
                                     elmo=(embedding == 'elmo'), attention_mode=attention_mode)

    else:
        raise NotImplementedError

    if use_gpu:
        network.cuda()

    lr = learning_rate
    # optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True)
    optim = RMSprop(network.parameters(), lr=lr, alpha=alpha, momentum=momentum, weight_decay=gamma)
    logger.info("Network: %s, num_layer=%d, hidden=%d, filter=%d, tag_space=%d, crf=%s" % (
        mode, num_layers, hidden_size, num_filters, tag_space, 'bigram' if bigram else 'unigram'))
    logger.info("training: l2: %f, (#training data: %d, batch: %d, dropout: %.2f, unk replace: %.2f)" % (
        gamma, num_data, batch_size, p, unk_replace))

    num_batches = num_data // batch_size + 1
    dev_f1 = 0.0
    dev_acc = 0.0
    dev_precision = 0.0
    dev_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0
    for epoch in range(1, num_epochs + 1):
        print('Epoch %d (%s(%s), learning rate=%.4f, decay rate=%.4f (schedule=%d)): ' % (
            epoch, mode, args.dropout, lr, lr_decay, schedule))
        train_err = 0.
        train_total = 0.

        start_time = time.time()
        num_back = 0
        network.train()
        for batch in range(1, num_batches + 1):
            word, char, _, _, labels, masks, lengths, elmo_embedding = bionlp_data.get_batch_variable(data_train, batch_size,
                                                                                       unk_replace=unk_replace)

            optim.zero_grad()
            loss = network.loss(word, char, labels, mask=masks, elmo_word=elmo_embedding)
            loss.backward()
            clip_grad_norm(network.parameters(), 5.0)
            optim.step()

            num_inst = word.size(0)
            train_err += loss.data[0] * num_inst
            train_total += num_inst

            time_ave = (time.time() - start_time) / batch
            time_left = (num_batches - batch) * time_ave

            # update log
            if batch % 100 == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss: %.4f, time left (estimated): %.2fs' % (
                    batch, num_batches, train_err / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print('train: %d loss: %.4f, time: %.2fs' % (num_batches, train_err / train_total, time.time() - start_time))

        # evaluate performance on dev data
        network.eval()
        if not os.path.exists('tmp'):
            os.mkdir('tmp')  # ensure the directory for temporary output files exists
        tmp_filename = 'tmp/%s_dev%d' % (str(uid), epoch)
        writer.start(tmp_filename)

        for batch in bionlp_data.iterate_batch_variable(data_dev, batch_size):
            word, char, pos, chunk, labels, masks, lengths, elmo_embedding = batch
            preds, _ = network.decode(word, char, target=labels, mask=masks,
                                         leading_symbolic=bionlp_data.NUM_SYMBOLIC_TAGS, elmo_word=elmo_embedding)
            writer.write(word.data.cpu().numpy(), pos.data.cpu().numpy(), chunk.data.cpu().numpy(),
                         preds.cpu().numpy(), labels.data.cpu().numpy(), lengths.cpu().numpy())
        writer.close()
        acc, precision, recall, f1 = evaluate(tmp_filename)
        print('dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (acc, precision, recall, f1))

        if dev_f1 < f1:
            dev_f1 = f1
            dev_acc = acc
            dev_precision = precision
            dev_recall = recall
            best_epoch = epoch

            # evaluate on test data when better performance detected
            tmp_filename = 'tmp/%s_test%d' % (str(uid), epoch)
            writer.start(tmp_filename)

            for batch in bionlp_data.iterate_batch_variable(data_test, batch_size):
                word, char, pos, chunk, labels, masks, lengths, elmo_embedding = batch
                preds, _ = network.decode(word, char, target=labels, mask=masks,
                                          leading_symbolic=bionlp_data.NUM_SYMBOLIC_TAGS, elmo_word=elmo_embedding)
                writer.write(word.data.cpu().numpy(), pos.data.cpu().numpy(), chunk.data.cpu().numpy(),
                             preds.cpu().numpy(), labels.data.cpu().numpy(), lengths.cpu().numpy())
            writer.close()
            test_acc, test_precision, test_recall, test_f1 = evaluate(tmp_filename)

        print("best dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (
            dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
        print("best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (
            test_acc, test_precision, test_recall, test_f1, best_epoch))

        if epoch % schedule == 0:
            # lr = learning_rate / (1.0 + epoch * lr_decay)
            # optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True)
            lr = lr * lr_decay
            optim.param_groups[0]['lr'] = lr
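
A hedged example invocation of this script; the script name and file paths are placeholders that only illustrate the argparse interface defined above:

import sys

sys.argv = ["ner_crf.py",  # hypothetical script name
            "--mode", "LSTM", "--dropout", "std", "--attention", "none",
            "--embedding", "elmo",
            "--elmo_option", "elmo_options.json", "--elmo_weight", "elmo_weights.hdf5",
            "--elmo_cuda", "0", "--schedule", "10",
            "--train", "train.tsv", "--dev", "dev.tsv", "--test", "test.tsv"]
main()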
Example #17
def main():
    parser = argparse.ArgumentParser(
        description='Tuning with Multitask bi-directional RNN-CNN-CRF')
    parser.add_argument('--config',
                        help='Config file (Python file format)',
                        default="config_multitask.py")
    parser.add_argument('--grid', help='Grid Search Options', default="{}")
    args = parser.parse_args()
    logger = get_logger("Multi-Task")
    use_gpu = torch.cuda.is_available()

    # Config Tensorboard Writer
    log_writer = SummaryWriter()

    # Load from config file
    spec = importlib.util.spec_from_file_location("config", args.config)
    config_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(config_module)
    config = config_module.entries

    # Load options from grid search
    options = eval(args.grid)
    for k, v in options.items():
        if isinstance(v, six.string_types):
            cmd = "%s = \"%s\"" % (k, v)
        else:
            cmd = "%s = %s" % (k, v)
            log_writer.add_scalar(k, v, 1)
        exec(cmd)

    # Load embedding dict
    embedding = config.embedding.embedding_type
    embedding_path = config.embedding.embedding_dict
    embedd_dict, embedd_dim = utils.load_embedding_dict(
        embedding, embedding_path)

    # Collect data path
    data_dir = config.data.data_dir
    data_names = config.data.data_names
    train_paths = [
        os.path.join(data_dir, data_name, "train.tsv")
        for data_name in data_names
    ]
    dev_paths = [
        os.path.join(data_dir, data_name, "devel.tsv")
        for data_name in data_names
    ]
    test_paths = [
        os.path.join(data_dir, data_name, "test.tsv")
        for data_name in data_names
    ]

    # Create alphabets
    logger.info("Creating Alphabets")
    if not os.path.exists('tmp'):
        os.mkdir('tmp')
    word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet, ner_alphabet_task, label_reflect  = \
            bionlp_data.create_alphabets(os.path.join(Path(data_dir).abspath(), "alphabets", "_".join(data_names)), train_paths,
                    data_paths=dev_paths + test_paths, use_cache=True,
                    embedd_dict=embedd_dict, max_vocabulary_size=50000)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())
    logger.info(
        "NER Alphabet Size per Task: %s",
        str([task_alphabet.size() for task_alphabet in ner_alphabet_task]))

    #task_reflects = torch.LongTensor(reverse_reflect(label_reflect, ner_alphabet.size()))
    #if use_gpu:
    #    task_reflects = task_reflects.cuda()

    if embedding == 'elmo':
        logger.info("Loading ELMo Embedder")
        ee = ElmoEmbedder(options_file=config.embedding.elmo_option,
                          weight_file=config.embedding.elmo_weight,
                          cuda_device=config.embedding.elmo_cuda)
    else:
        ee = None

    logger.info("Reading Data")

    # Prepare dataset
    data_trains = [
        bionlp_data.read_data_to_variable(train_path,
                                          word_alphabet,
                                          char_alphabet,
                                          pos_alphabet,
                                          chunk_alphabet,
                                          ner_alphabet_task[task_id],
                                          use_gpu=use_gpu,
                                          elmo_ee=ee)
        for task_id, train_path in enumerate(train_paths)
    ]
    num_data = [sum(data_train[1]) for data_train in data_trains]
    num_labels = ner_alphabet.size()
    num_labels_task = [task_item.size() for task_item in ner_alphabet_task]

    data_devs = [
        bionlp_data.read_data_to_variable(dev_path,
                                          word_alphabet,
                                          char_alphabet,
                                          pos_alphabet,
                                          chunk_alphabet,
                                          ner_alphabet_task[task_id],
                                          use_gpu=use_gpu,
                                          volatile=True,
                                          elmo_ee=ee)
        for task_id, dev_path in enumerate(dev_paths)
    ]

    data_tests = [
        bionlp_data.read_data_to_variable(test_path,
                                          word_alphabet,
                                          char_alphabet,
                                          pos_alphabet,
                                          chunk_alphabet,
                                          ner_alphabet_task[task_id],
                                          use_gpu=use_gpu,
                                          volatile=True,
                                          elmo_ee=ee)
        for task_id, test_path in enumerate(test_paths)
    ]

    writer = BioNLPWriter(word_alphabet, char_alphabet, pos_alphabet,
                          chunk_alphabet, ner_alphabet)

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[bionlp_data.UNK_ID, :] = np.random.uniform(
            -scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if embedd_dict is not None and word in embedd_dict:
                embedding = embedd_dict[word]
            elif embedd_dict is not None and word.lower() in embedd_dict:
                embedding = embedd_dict[word.lower()]
            else:
                embedding = np.random.uniform(
                    -scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    logger.info("constructing network...")

    # Construct network
    window = 3
    num_layers = 1
    mode = config.rnn.mode
    hidden_size = config.rnn.hidden_size
    char_dim = config.rnn.char_dim
    num_filters = config.rnn.num_filters
    tag_space = config.rnn.tag_space
    bigram = config.rnn.bigram
    attention_mode = config.rnn.attention
    if config.rnn.dropout == 'std':
        network = FullySharedBiRecurrentCRF(
            len(data_trains),
            embedd_dim,
            word_alphabet.size(),
            char_dim,
            char_alphabet.size(),
            num_filters,
            window,
            mode,
            hidden_size,
            num_layers,
            num_labels,
            num_labels_task=num_labels_task,
            tag_space=tag_space,
            embedd_word=word_table,
            p_in=config.rnn.p,
            p_rnn=config.rnn.p,
            bigram=bigram,
            elmo=(embedding == 'elmo'),
            attention_mode=attention_mode,
            adv_loss_coef=config.multitask.adv_loss_coef,
            diff_loss_coef=config.multitask.diff_loss_coef,
            char_level_rnn=config.rnn.char_level_rnn)
    else:
        raise NotImplementedError

    if use_gpu:
        network.cuda()

    # Prepare training
    unk_replace = config.embedding.unk_replace
    num_epochs = config.training.num_epochs
    batch_size = config.training.batch_size
    lr = config.training.learning_rate
    momentum = config.training.momentum
    alpha = config.training.alpha
    lr_decay = config.training.lr_decay
    schedule = config.training.schedule
    gamma = config.training.gamma

    # optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True)
    optim = RMSprop(network.parameters(),
                    lr=lr,
                    alpha=alpha,
                    momentum=momentum,
                    weight_decay=gamma)
    logger.info(
        "Network: %s, num_layer=%d, hidden=%d, filter=%d, tag_space=%d, crf=%s"
        % (mode, num_layers, hidden_size, num_filters, tag_space,
           'bigram' if bigram else 'unigram'))
    logger.info(
        "training: l2: %f, (#training data: %s, batch: %d, dropout: %.2f, unk replace: %.2f)"
        % (gamma, num_data, batch_size, config.rnn.p, unk_replace))

    num_batches = [x // batch_size + 1 for x in num_data]
    dev_f1 = [0.0 for x in num_data]
    dev_acc = [0.0 for x in num_data]
    dev_precision = [0.0 for x in num_data]
    dev_recall = [0.0 for x in num_data]
    test_f1 = [0.0 for x in num_data]
    test_acc = [0.0 for x in num_data]
    test_precision = [0.0 for x in num_data]
    test_recall = [0.0 for x in num_data]
    best_epoch = [0 for x in num_data]

    # Training procedure
    for epoch in range(1, num_epochs + 1):
        print(
            'Epoch %d (%s(%s), learning rate=%.4f, decay rate=%.4f (schedule=%d)): '
            % (epoch, mode, config.rnn.dropout, lr, lr_decay, schedule))
        train_err = 0.
        train_total = 0.

        # Gradient decent on training data
        start_time = time.time()
        num_back = 0
        network.train()
        batch_count = 0
        for batch in range(1, 2 * num_batches[0] + 1):
            # draw the main task (id 0) half of the time, otherwise a random auxiliary task
            r = random.random()
            task_id = 0 if r <= 0.5 else random.randint(1, len(num_data) - 1)
            batch_count += 1
            word, char, _, _, labels, masks, lengths, elmo_embedding = bionlp_data.get_batch_variable(
                data_trains[task_id], batch_size, unk_replace=unk_replace)

            optim.zero_grad()
            loss, task_loss, adv_loss, diff_loss = network.loss(
                task_id,
                word,
                char,
                labels,
                mask=masks,
                elmo_word=elmo_embedding)
            #log_writer.add_scalars(
            #        'train_loss_task' + str(task_id),
            #        {'all_loss': loss, 'task_loss': task_loss, 'adv_loss': adv_loss, 'diff_loss': diff_loss},
            #        (epoch - 1) * (num_batches[task_id] + 1) + batch
            #)
            #log_writer.add_scalars(
            #        'train_loss_overview',
            #        {'all_loss': loss, 'task_loss': task_loss, 'adv_loss': adv_loss, 'diff_loss': diff_loss},
            #        (epoch - 1) * (sum(num_batches) + 1) + batch_count
            #)
            loss.backward()
            clip_grad_norm(network.parameters(), 5.0)
            optim.step()

            num_inst = word.size(0)
            train_err += loss.data[0] * num_inst
            train_total += num_inst

            time_ave = (time.time() - start_time) / batch
            time_left = (2 * num_batches[0] - batch) * time_ave

            # update log
            if batch % 100 == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss: %.4f, time left (estimated): %.2fs' % (
                    batch, 2 * num_batches[0], train_err / train_total,
                    time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print('train: %d loss: %.4f, time: %.2fs' %
              (2 * num_batches[0], train_err / train_total,
               time.time() - start_time))

        # Evaluate performance on dev data
        network.eval()
        for task_id in range(len(num_batches)):
            tmp_filename = 'tmp/%s_dev%d%d' % (str(uid), epoch, task_id)
            writer.start(tmp_filename)

            for batch in bionlp_data.iterate_batch_variable(
                    data_devs[task_id], batch_size):
                word, char, pos, chunk, labels, masks, lengths, elmo_embedding = batch
                preds, _ = network.decode(
                    task_id,
                    word,
                    char,
                    target=labels,
                    mask=masks,
                    leading_symbolic=bionlp_data.NUM_SYMBOLIC_TAGS,
                    elmo_word=elmo_embedding)
                writer.write(word.data.cpu().numpy(),
                             pos.data.cpu().numpy(),
                             chunk.data.cpu().numpy(),
                             preds.cpu().numpy(),
                             labels.data.cpu().numpy(),
                             lengths.cpu().numpy())
            writer.close()
            acc, precision, recall, f1 = evaluate(tmp_filename)
            log_writer.add_scalars(
                'dev_task' + str(task_id), {
                    'accuracy': acc,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1
                }, epoch)
            print(
                'dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                % (acc, precision, recall, f1))

            if dev_f1[task_id] < f1:
                dev_f1[task_id] = f1
                dev_acc[task_id] = acc
                dev_precision[task_id] = precision
                dev_recall[task_id] = recall
                best_epoch[task_id] = epoch

                # Evaluate on test data when better performance detected
                tmp_filename = 'tmp/%s_test%d%d' % (str(uid), epoch, task_id)
                writer.start(tmp_filename)

                for batch in bionlp_data.iterate_batch_variable(
                        data_tests[task_id], batch_size):
                    word, char, pos, chunk, labels, masks, lengths, elmo_embedding = batch
                    preds, _ = network.decode(
                        task_id,
                        word,
                        char,
                        target=labels,
                        mask=masks,
                        leading_symbolic=bionlp_data.NUM_SYMBOLIC_TAGS,
                        elmo_word=elmo_embedding)
                    writer.write(word.data.cpu().numpy(),
                                 pos.data.cpu().numpy(),
                                 chunk.data.cpu().numpy(),
                                 preds.cpu().numpy(),
                                 labels.data.cpu().numpy(),
                                 lengths.cpu().numpy())
                writer.close()
                test_acc[task_id], test_precision[task_id], test_recall[
                    task_id], test_f1[task_id] = evaluate(tmp_filename)
                log_writer.add_scalars(
                    'test_task' + str(task_id), {
                        'accuracy': test_acc[task_id],
                        'precision': test_precision[task_id],
                        'recall': test_recall[task_id],
                        'f1': test_f1[task_id]
                    }, epoch)

            print(
                "================================================================================"
            )
            print("dataset: %s" % data_names[task_id])
            print(
                "best dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
                % (dev_acc[task_id], dev_precision[task_id],
                   dev_recall[task_id], dev_f1[task_id], best_epoch[task_id]))
            print(
                "best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
                %
                (test_acc[task_id], test_precision[task_id],
                 test_recall[task_id], test_f1[task_id], best_epoch[task_id]))
            print(
                "================================================================================\n"
            )

            if epoch % schedule == 0:
                # lr = learning_rate / (1.0 + epoch * lr_decay)
                # optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True)
                lr = lr * lr_decay
                optim.param_groups[0]['lr'] = lr

    # log_writer.export_scalars_to_json("./all_scalars.json")
    log_writer.close()
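
For reference, a hedged sketch of what a config file consumed by this script might contain; the nested attribute names are inferred from the accesses in main() above, and every value shown is an illustrative assumption:

# config_multitask.py - hypothetical sketch; only attributes read by main() are shown
from types import SimpleNamespace

entries = SimpleNamespace(
    embedding=SimpleNamespace(embedding_type='glove', embedding_dict='glove.txt',
                              elmo_option=None, elmo_weight=None, elmo_cuda=-1,
                              unk_replace=0.0),
    data=SimpleNamespace(data_dir='data', data_names=['task_a', 'task_b']),
    rnn=SimpleNamespace(mode='LSTM', hidden_size=128, char_dim=30, num_filters=30,
                        tag_space=0, bigram=False, attention='none', dropout='std',
                        p=0.5, char_level_rnn=False),
    multitask=SimpleNamespace(adv_loss_coef=0.0, diff_loss_coef=0.0),
    training=SimpleNamespace(num_epochs=100, batch_size=16, learning_rate=0.015,
                             momentum=0.0, alpha=0.1, lr_decay=0.9, schedule=10,
                             gamma=0.0),
)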
Example #18
    def __init__(self, args, word_vocab):
        super().__init__(args, word_vocab)

        # import ElmoEmbedder here so that CUDA_VISIBLE_DEVICES can take effect
        from allennlp.commands.elmo import ElmoEmbedder
        self.elmo = ElmoEmbedder(cuda_device=0 if args.gpu is not None else -1)
Example #19
class ELMOCRFSegModel(LSTMCRFSegModel):
    def __init__(self, args, word_vocab):
        super().__init__(args, word_vocab)

        # import ElmoEmbedder here so that CUDA_VISIBLE_DEVICES can take effect
        from allennlp.commands.elmo import ElmoEmbedder
        self.elmo = ElmoEmbedder(cuda_device=0 if args.gpu is not None else -1)

    def _setup_placeholders(self):
        self.placeholders = {
            'input_words': tf.placeholder(tf.int32, shape=[None, None]),
            'input_length': tf.placeholder(tf.int32, shape=[None]),
            'elmo_vectors': tf.placeholder(tf.float32,
                                           shape=[None, 3, None, 1024]),
            'seg_labels': tf.placeholder(tf.float32, shape=[None, None]),
            'dropout_keep_prob': tf.placeholder(tf.float32)
        }

    def _embed(self):
        with tf.device('/cpu:0'):
            word_emb_init = tf.constant_initializer(self.word_vocab.embeddings) if self.word_vocab.embeddings is not None \
                else tf.random_normal_initializer()
            self.word_embeddings = tf.get_variable(
                'word_embeddings',
                shape=(self.word_vocab.size(), self.word_vocab.embed_dim),
                initializer=word_emb_init,
                trainable=False)
            self.embedded_words = tf.nn.embedding_lookup(
                self.word_embeddings, self.placeholders['input_words'])
        self.elmo_weights = tf.nn.softmax(
            tf.get_variable('elmo_weights', [3],
                            dtype=tf.float32,
                            trainable=True))
        self.scale_para = tf.get_variable('scale_para', [1],
                                          dtype=tf.float32,
                                          trainable=True)
        self.elmo_vectors = self.scale_para * (
            self.elmo_weights[0] *
            self.placeholders['elmo_vectors'][:, 0, :, :] +
            self.elmo_weights[1] *
            self.placeholders['elmo_vectors'][:, 1, :, :] +
            self.elmo_weights[2] *
            self.placeholders['elmo_vectors'][:, 2, :, :])
        self.embedded_inputs = tf.concat(
            [self.embedded_words, self.elmo_vectors], -1)
        self.embedded_inputs = tf.nn.dropout(
            self.embedded_inputs, self.placeholders['dropout_keep_prob'])

    def _compute_loss(self):
        self.loss = tf.reduce_mean(-self.log_likelyhood, 0)
        if self.weight_decay > 0:
            with tf.variable_scope('l2_loss'):
                l2_loss = tf.add_n([
                    tf.nn.l2_loss(v) for v in tf.trainable_variables()
                    if 'bias' not in v.name
                ])
            self.loss += self.weight_decay * l2_loss

    def _train_epoch(self, train_batches, print_every_n_batch):
        total_loss, total_batch_num = 0, 0
        for bitx, batch in enumerate(train_batches):
            feed_dict = {
                self.placeholders['input_words']: batch['word_ids'],
                self.placeholders['input_length']: batch['length'],
                self.placeholders['seg_labels']: batch['seg_labels']
            }
            elmo_vectors, mask = self.elmo.batch_to_embeddings(
                [sample['words'] for sample in batch['raw_data']])
            feed_dict[self.placeholders['elmo_vectors']] = np.asarray(
                elmo_vectors.cpu().data)
            feed_dict[self.placeholders[
                'dropout_keep_prob']] = self.dropout_keep_prob

            _, loss, grad_norm = self.sess.run(
                [self.train_op, self.loss, self.grad_norm], feed_dict)

            if bitx != 0 and print_every_n_batch > 0 and bitx % print_every_n_batch == 0:
                self.logger.info('bitx: {}, loss: {}, grad: {}'.format(
                    bitx, loss, grad_norm))
            total_loss += loss
            total_batch_num += 1
        return total_loss / total_batch_num

    def segment(self, batch):
        feed_dict = {
            self.placeholders['input_words']: batch['word_ids'],
            self.placeholders['input_length']: batch['length']
        }
        elmo_vectors, mask = self.elmo.batch_to_embeddings(
            [sample['words'] for sample in batch['raw_data']])
        feed_dict[self.placeholders['elmo_vectors']] = np.asarray(
            elmo_vectors.data.cpu())
        feed_dict[self.placeholders['dropout_keep_prob']] = 1.0

        scores, trans_params = self.sess.run([self.scores, self.trans_params],
                                             feed_dict)

        batch_pred_segs = []
        # log_likes = []
        for sample_idx in range(len(batch['raw_data'])):
            length = batch['length'][sample_idx]
            viterbi_seq, viterbi_score = tc.crf.viterbi_decode(
                scores[sample_idx][:length], trans_params)

            # with tf.Graph().as_default(), tf.Session() as session:
            #     length_tensor = tf.expand_dims(c2t(length), axis=0)
            #     viterbi_seq_tensor = tf.expand_dims(c2t(viterbi_seq), axis=0)
            #     scores_tensor = c2t(scores)
            #     trans_params_tensor = c2t(trans_params)
            #     log_likelihood, tparams = tc.crf.crf_log_likelihood(scores_tensor, viterbi_seq_tensor, length_tensor, trans_params_tensor)
            #     log_like_numpy = session.run(log_likelihood)
            # log_likes.append(log_like_numpy)

            # tf.get_default_graph().finalize()
            pred_segs = []
            for word_idx, label in enumerate(viterbi_seq):
                if label == 1:
                    pred_segs.append(word_idx)
            batch_pred_segs.append(pred_segs)
        return batch_pred_segs  # , log_likes
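
A hedged sketch of the tensor that feeds the 'elmo_vectors' placeholder above, using the same batch_to_embeddings call as _train_epoch (the sentences are made up):

from allennlp.commands.elmo import ElmoEmbedder
import numpy as np

elmo = ElmoEmbedder()  # default model; pass cuda_device=0 to use a GPU
vectors, mask = elmo.batch_to_embeddings([['hello', 'world'], ['hi']])
print(vectors.shape)  # (batch, 3 layers, padded length, 1024), here torch.Size([2, 3, 2, 1024])
feed = np.asarray(vectors.cpu().data)  # same conversion used for the feed_dict above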
Example #20
"""
@author: OHyic
"""

from allennlp.commands.elmo import ElmoEmbedder
import numpy as np

#define max token length
max_tokens = 60

#input sentences
sentences = [
    "how are you doing", "what is your name", "can you subscribe to my channel"
]

#create a pretrained elmo model (requires internet connection)
elmo = ElmoEmbedder(cuda_device=0)
embeddings = []

#loop through the input sentences
for index, elmo_embedding in enumerate(elmo.embed_sentences(sentences)):
    print("elmo:", index)
    # Average the 3 layers returned from Elmo
    avg_elmo_embedding = np.average(elmo_embedding, axis=0)
    padding_length = max_tokens - avg_elmo_embedding.shape[0]
    # pad with zero rows up to max_tokens, or truncate longer sentences
    if padding_length > 0:
        avg_elmo_embedding = np.append(avg_elmo_embedding,
                                       np.zeros((padding_length,
                                                 avg_elmo_embedding.shape[1])),
                                       axis=0)
    else:
        avg_elmo_embedding = avg_elmo_embedding[:max_tokens]
    embeddings.append(avg_elmo_embedding)  # collect the fixed-size sentence embedding
Example #21
import numpy as np
from allennlp.commands.elmo import ElmoEmbedder
import time

### ELMo embedding on training data
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"

start = time.time()
print("Downloading elmo model...")
elmo = ElmoEmbedder(options_file, weight_file)
print("Downloaded in %fs" % (time.time()-start))

start = time.time()
sentences = ["First sentence .".split(), "Another one".split()]
X = elmo.embed_sentence(sentences[1])
print(X.shape)
print('Type: ', type(elmo))
print("Embedding done in %fs." % (time.time()-start))

Example #22
class EntityEmbedder:
    def __init__(self):
        """
        Initialize the value for constants which are useful to drive the behaviour 
        """
        # list of model names
        self.ELMO_NAME = 'Elmo'

        # list of extraction modes for Elmo
        self.LAYER_2 = 'layer_2'
        self.LAYER_1 = 'layer_1'
        self.LAYER_0 = 'layer_0'
        self.MEAN = 'mean'

        # list of word phrase aggregation names
        self.VECTOR_MEAN = 'vector_mean'

    def initialize_embedder_model(self, model_name, corpus):
        """
        setup the variables which will determine how words will be translated in vectors
        :param 
            model_name: the string which identifies the model used to embed
                values: 
                    ELMO_NAME: the allennlp's ElmoEmbedder is used, this take one sentence at time
                               a sentence is a list of single words (['this', 'is', 'a', 'sentence']) 
            corpus: a corpus in a format in accord with the model specifications:
                ELMO_NAME:
                    a list of lists, each sublist is a sentence in the format ['this', 'is', 'a', 'sentence'] 
        """

        if model_name == self.ELMO_NAME:
            self.model = ElmoEmbedder(cuda_device=0)
            self.model_name = model_name
            self.corpus = corpus

    def setup(self,
              model_name,
              extraction_mode,
              occurrences_of_entities_path,
              aggregation_method,
              corpus,
              verbose=False):
        """
        setup the values to drive the behaviour and setup the resources
        :param
            model_name: the name of the embedding model (ELMO_NAME)
            extraction_mode: the modality to extract vectors for word:
                if model_name == ELMO_NAME then extraction_mode can take these values:
                    [LAYER_0, LAYER_1, LAYER_2]: the vector returned comes from layer 0 / 1 / 2 of ELMO
                    MEAN: the mean of layers 0, 1 and 2 is returned
            occurrences_of_entities_path: the path to the file which contains the occurrences of the entities (the output of CorpusManager.check_composite_words())
            aggregation_method: the method used to aggregate token vectors in word phrases (for 'new york' there will be two vectors, we want only one)
                values:
                    VECTOR_MEAN: the mean of all token vectors is returned
            corpus: a corpus in a format in accord with the model specifications (see inizialize_embedder_model for more specific description)
        :return: a list of indexes which are the row in which word appear
        """
        print('setting up the embedder')
        self.initialize_embedder_model(model_name=model_name, corpus=corpus)
        self.extraction_mode = extraction_mode
        self.OCCURRENCE_OF_ENTITIES_PATH = occurrences_of_entities_path
        self.verbose = verbose
        self.aggregation_method = aggregation_method

    def set_extraction_mode(self, mode):
        """
        setup the name of the extraction mode, which will be used to drive the other functions in the class
        :param 
            mode: the string which identifies the mode used to extract vectors of the sentences 
        """
        self.extraction_mode = mode

    def extract_embedding(self, model_output):
        """
        returns the embeddings starting from the output of the model
        :param 
            model_output: the desired output of self.model 
        """

        if self.extraction_mode == self.LAYER_2 and self.model_name == self.ELMO_NAME:
            return model_output[2]
        if self.extraction_mode == self.LAYER_1 and self.model_name == self.ELMO_NAME:
            return model_output[1]
        if self.extraction_mode == self.LAYER_0 and self.model_name == self.ELMO_NAME:
            return model_output[0]
        if self.extraction_mode == self.MEAN and self.model_name == self.ELMO_NAME:
            return (model_output[0] + model_output[1] + model_output[2]) / 3

    def embed_sentence(self, sentence):
        """
        returns the embedding of the input sentence based on the instantiated model
        :param 
            sentence: if model_name == ELMO_NAME a sentence in this format: ['this', 'is', 'a', 'sentence']
        """
        if self.model_name == self.ELMO_NAME:
            return self.extract_embedding(self.model.embed_sentence(sentence))

    def create_embedding_data_structure(self):
        """
        creates the data structure useful to retrieve embeddings
        needs the output of the function 'check_composite_words' of the CorpusManager Class
        """

        print('creating data structures')
        all_occurrences = load_data_with_pickle(
            self.OCCURRENCE_OF_ENTITIES_PATH)
        all_occurrences = [(k, v) if type(v[0]) == tuple else (k, v[0])
                           for k, v in all_occurrences.items() if len(k) > 2]
        all_occurrences = {x[0]: x[1] for x in all_occurrences}

        sentences_to_embed = [
            v[0] for values in all_occurrences.values() for v in values
        ]

        if self.verbose:
            print('total found entity mentions: {}'.format(
                len(sentences_to_embed)))
            print(
                'fraction of sentences with entity mentions: {:.2f} ({} on {})'
                .format(
                    len(set(sentences_to_embed)) / len(self.corpus),
                    len(set(sentences_to_embed)), len(self.corpus)))
            print('{:.2f} average entity mentions per sentence'.format(
                len(sentences_to_embed) / len(set(sentences_to_embed))))

        embedding_data_structure = {index: [] for index in sentences_to_embed}

        for entity_mention, occurrences in all_occurrences.items():
            for couple in occurrences:
                embedding_data_structure[couple[0]].append(
                    (couple[1], entity_mention))

        embedding_data_structure = {
            k: v
            for k, v in embedding_data_structure.items() if v
        }
        self.ordered_embedding_data_structure = OrderedDict(
            sorted(embedding_data_structure.items()))

    def extract_vectors_of_occurrences_in_corpus(self):
        """
        returns the embedding of all input sentences based on the instantiated model
        :param 
            sentences: if model_name == ELMO_NAME a list of sentence in this format: ['this', 'is', 'a', 'sentence']
        """
        print('generate vectors')
        self.vectors_dict = defaultdict(list)

        for row_index, occurrences in tqdm(
                self.ordered_embedding_data_structure.items()):
            vectors = self.embed_sentence(self.corpus[row_index])
            for occ in occurrences:
                for word_index in occ[0]:
                    if len(occ[1].split(' ')) == 1:
                        self.vectors_dict[occ[1]].append(vectors[word_index])
                    else:
                        vecs = [
                            vectors[w_i] for w_i in range(
                                word_index, word_index +
                                len(occ[1].split(' ')))
                        ]
                        self.vectors_dict[occ[1]].append(
                            self.word_phrase_aggregation_method(vecs=vecs))

    def word_phrase_aggregation_method(self, vecs):
        """
        aggregates a list of vectors in accord to the aggregation method 
        (extracts a single vector for the word phrase 'New York' starting from the vectors of 'New' and 'York')
        :param 
            vecs: the list of vector to be aggregated
        """
        if self.aggregation_method == self.VECTOR_MEAN:
            return np.mean(vecs, axis=0)

    def create_dataset(self, entity_dict, X_PATH, Y_PATH, entities_PATH):
        """
        creates a dataset composed of: a list of vectors (X), a list of labels (Y), the entities names which order corresponds to values in X and Y (entities)
        :param 
            entity_dict: a dict of entities which is in the format: {concept: [list of entities]}, used to set the Y values and the entities values
            X_PATH: the filepath in which save the list of vectors
            Y_PATH: the filepath in which save the list of labels
            entities_PATH: the filepath in which save the list of entities names

        """
        print('creating dataset')

        reverse_dict = defaultdict(list)

        for k, words in entity_dict.items():
            for w in words:
                reverse_dict[w].append(k)
        X = []
        Y = []
        entities = []

        for label, label_vectors in self.vectors_dict.items():
            if label in reverse_dict:
                for v in label_vectors:
                    X.append(v)
                    Y.append(reverse_dict[label][0])
                    entities.append(label)

        save_data_with_pickle(X_PATH, X)
        save_data_with_pickle(Y_PATH, Y)
        save_data_with_pickle(entities_PATH, entities)
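
A hedged end-to-end sketch of driving the class above; the corpus, the pickle path, and the output format of CorpusManager.check_composite_words() are all hypothetical:

embedder = EntityEmbedder()
embedder.setup(model_name=embedder.ELMO_NAME,
               extraction_mode=embedder.MEAN,
               occurrences_of_entities_path='occurrences.pickle',  # hypothetical file
               aggregation_method=embedder.VECTOR_MEAN,
               corpus=[['new', 'york', 'is', 'a', 'city']],
               verbose=True)
embedder.create_embedding_data_structure()
embedder.extract_vectors_of_occurrences_in_corpus()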
Example #23
 def _embed_batch_impl(
         self, batch: List[str],
         model: ElmoEmbedder) -> Generator[ndarray, None, None]:
      # ELMo expects a `List[str]`; it was designed for tokens/words with more than one
      # character, so here each character of the sequence is passed as its own token.
      yield from model.embed_batch([list(seq) for seq in batch])
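
A tiny illustration of the per-character tokenization used above (the sequence is a made-up example):

seq = "MKTAYIAK"  # hypothetical protein sequence
print(list(seq))  # ['M', 'K', 'T', 'A', 'Y', 'I', 'A', 'K'] - one "token" per character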
Example #24
 def __init__(self, options_path: str, weights_path: str, device: int = 0):
     self.model = ElmoEmbedder(options_path,
                               weights_path,
                               cuda_device=device)
Example #25
class BilmElmo(Bilm):

    def __init__(self, cuda_device, weights_path, options_path, vocab_path, batch_size=40,
                 cutoff_elmo_vocab=50000):
        super().__init__()
        logging.info(
            'creating elmo in device %d. weight path %s, vocab_path %s '
            ' batch_size: %d' % (
                cuda_device, weights_path, vocab_path,
                batch_size))
        self.elmo = ElmoEmbedder(cuda_device=cuda_device, weight_file=weights_path, options_file=options_path)

        self.batch_size = batch_size

        logging.info('warming up elmo')
        self._warm_up_elmo()

        logging.info('reading elmo weights')
        with h5py.File(weights_path, 'r', libver='latest', swmr=True) as fin:
            self.elmo_softmax_w = fin['softmax/W'][:cutoff_elmo_vocab, :].transpose()
            # self.elmo_softmax_b=fin['softmax/b'][:cutoff_elmo_vocab]
        self.elmo_word_vocab = []
        self.elmo_word_vocab_lemmatized = []

        # we prevent the prediction of these by removing their weights and their vocabulary altogether
        stop_words = {'<UNK>', '<S>', '</S>', '--', '..', '...', '....'}

        logging.info('reading elmo vocabulary')

        lines_to_remove = set()
        with open(vocab_path, encoding="utf-8") as fin:
            for idx, line in enumerate(fin):
                if idx == cutoff_elmo_vocab:
                    break
                word = line.strip()
                if len(word) == 1 or word in stop_words:
                    lines_to_remove.add(idx)
                self.elmo_word_vocab.append(word)

        with open(vocab_path + '.lemmatized', encoding="utf-8") as fin:
            for idx, line in enumerate(fin):
                if idx == cutoff_elmo_vocab:
                    break
                word = line.strip()
                if len(word) == 1 or word in stop_words:
                    lines_to_remove.add(idx)
                self.elmo_word_vocab_lemmatized.append(word)

        # remove stopwords
        self.elmo_word_vocab = [x for i, x in enumerate(self.elmo_word_vocab) if i not in lines_to_remove]
        self.elmo_word_vocab_lemmatized = [x for i, x in enumerate(self.elmo_word_vocab_lemmatized) if
                                           i not in lines_to_remove]
        self.elmo_softmax_w = np.delete(self.elmo_softmax_w, list(lines_to_remove), 1)
        # self.elmo_softmax_b = np.delete(self.elmo_softmax_b, list(lines_to_remove))
        # logging.info('caching cnn embeddings')
        # self.elmo.elmo_bilm.create_cached_cnn_embeddings(self.elmo_word_vocab)
        # self.elmo.elmo_bilm._has_cached_vocab = True

    @staticmethod
    def create_lemmatized_vocabulary_if_needed(vocab_path):
        """
        this creates a new voabulary file in the same directory as ELMo vocab where words has been lemmatized
        :param vocab_path: path to ELMo vocabulary
        :return:
        """
        if not os.path.isfile(vocab_path + '.lemmatized'):
            # if there is no lemmatized vocabulary, create it
            with open(vocab_path, encoding="utf-8") as fin:
                unlem = [x.strip() for x in fin.readlines()]
            logging.info('lemmatizing ELMo vocabulary')
            print('lemmatizing ELMo vocabulary')
            import spacy
            nlp = spacy.load("es", disable=['ner', 'parser']) #RL
            new_vocab = []
            for spacyed in tqdm(
                    nlp.pipe(unlem, batch_size=1000, n_threads=multiprocessing.cpu_count()),
                    total=len(unlem)):
                new_vocab.append(spacyed[0].lemma_ if spacyed[0].lemma_ != '-PRON-' else spacyed[0].lower_)
            with open(vocab_path + '.lemmatized', 'w', encoding="utf-8") as fout:
                for word in new_vocab:
                    fout.write('%s\n' % word)
            logging.info('lemmatization done and cached to file')
            print('lemmatization done and cached to file')

    def _warm_up_elmo(self):
        # running a few sentences in elmo will set it to a better state than initial zeros
        warm_up_sent = "En efecto , rematado ya su juicio , vino a dar en el más " \
                       "extraño pensamiento que jamás dio loco en el mundo ; y fue que " \
                       "le pareció convenible y necesario , así para el aumento de su honra " \
                       "como para el servicio de su república , hacerse caballero andante , e irse " \
                       "por todo el mundo con sus armas y caballo a buscar las " \
                       "aventuras y a ejercitarse en todo aquello que él había leído que " \
                       "los caballeros andantes se ejercitaban , deshaciendo todo género de agravio , y poniéndose " \
                       "en ocasiones y peligros donde , acabándolos , cobrase eterno nombre y fama .".split()
        for _ in range(3):
            _ = list(self.elmo.embed_sentences([warm_up_sent] * self.batch_size, self.batch_size))

    def _get_top_words_dist(self, state, cutoff):
        # note: self.elmo_softmax_b is deliberately NOT added here - ignoring the bias
        # vector prevents unconditionally probable substitute predictions
        log_probs = np.matmul(state, self.elmo_softmax_w)
        top_k_log_probs = np.argpartition(-log_probs, cutoff)[: cutoff]
        top_k_log_probs_vals = log_probs[top_k_log_probs]
        e_x = np.exp(top_k_log_probs_vals - np.max(top_k_log_probs_vals))
        probs = e_x / e_x.sum(axis=0)
        return top_k_log_probs, probs

    def _embed_sentences(self, inst_id_to_sentence: Dict[str, Tuple[List[str], int]], disable_symmetric_patterns) -> \
            Tuple[List, List]:
        inst_id_sent_tuples = list(inst_id_to_sentence.items())
        target = inst_id_sent_tuples[0][0].rsplit('.', 1)[0]
        to_embed = []

        if disable_symmetric_patterns:
            # w/o sym. patterns - predict for the blanked-out word.
            # if the target word is first or last in the sentence, get an empty prediction by embedding '.'
            for _, (tokens, target_idx) in inst_id_sent_tuples:
                forward = tokens[:target_idx]
                backward = tokens[target_idx + 1:]
                if not forward:
                    forward = ['.']
                if not backward:
                    backward = ['.']
                to_embed.append(forward)
                to_embed.append(backward)
        else:

            # w/ sym. patterns - include target word + "and" afterwards in both directions
            for _, (tokens, target_idx) in inst_id_sent_tuples:
                # forward sentence
                to_embed.append(tokens[:target_idx + 1] + ['y']) #RL

                # backward sentence
                to_embed.append(['y'] + tokens[target_idx:]) #RL

        logging.info('embedding %d sentences for target %s' % (len(to_embed), target))
        embedded = list(self.elmo.embed_sentences(to_embed, self.batch_size))

        return inst_id_sent_tuples, embedded

    def predict_sent_substitute_representatives(self, inst_id_to_sentence: Dict[str, Tuple[List[str], int]],
                                                n_represent: int,
                                                n_samples_side: int, disable_symmetric_patterns: bool,
                                                disable_lemmatiziation: bool, prediction_cutoff: int) \
            -> Dict[str, List[Dict[str, int]]]:
        """
        a representative is a dictionary made out of samples from both sides of the BiLM, predicting substitutes
        for a contextualized token.
        an example might look like:
        {'forward_jump':2,'backward_leap':1, 'backward_climb':1} (n_samples_side=2)
        we return a list of n_representatives of those

        :param inst_id_to_sentence: dictionary instance_id -> (sentence tokens list, target word index in tokens)
        :param n_represent: number of representatives
        :param n_samples_side: number of samples to draw from each side
        :param disable_symmetric_patterns: if true words are predicted from context only
        :param disable_lemmatiziation: if true predictions are not lemmatized
        :param prediction_cutoff: only the top prediction_cutoff LM predictions are considered
        :return: map from instance id to list of representatives
        """
        inst_id_sent_tuples, embedded = self._embed_sentences(inst_id_to_sentence, disable_symmetric_patterns)
        lemma = inst_id_sent_tuples[0][0].split('.')[0]

        vocabulary_used = self.elmo_word_vocab if disable_lemmatiziation else self.elmo_word_vocab_lemmatized

        results = {}
        for i in range(len(inst_id_sent_tuples)):
            inst_id, (tokens, target_idx) = inst_id_sent_tuples[i]
            target_word_lower = tokens[target_idx].lower()

            sentence = ' '.join([t if i != target_idx else '***%s***' % t for i, t in enumerate(tokens)])
            logging.info('instance %s sentence: %s' % (inst_id, sentence))

            # these will be multiplied by ELMo's output matrix, [layer-number,token-index, state dims]
            # (first 512 state dims in elmo are the forward LM, 512:1024 are the backward LM)
            forward_out_em = embedded[i * 2][2, -1, :512]
            backward_out_em = embedded[i * 2 + 1][2, 0, 512:]

            forward_idxs, forward_dist = self._get_top_words_dist(forward_out_em, prediction_cutoff)
            backward_idxs, backward_dist = self._get_top_words_dist(backward_out_em, prediction_cutoff)

            forward_samples = []

            # after removing samples equal to disamb. target,
            # we might end up with not enough samples, so repeat until we have enough samples
            while len(forward_samples) < n_represent * n_samples_side:
                new_samples = list(
                    np.random.choice(forward_idxs, n_represent * n_samples_side * 2,
                                     p=forward_dist))
                new_samples = [vocabulary_used[x] for x in new_samples if
                               vocabulary_used[x].lower() != lemma and vocabulary_used[x].lower() != target_word_lower]
                forward_samples += new_samples

            backward_samples = []
            while len(backward_samples) < n_represent * n_samples_side:
                new_samples = list(
                    np.random.choice(backward_idxs, n_represent * n_samples_side * 2,
                                     p=backward_dist))
                new_samples = [vocabulary_used[x] for x in new_samples if
                               vocabulary_used[x].lower() != lemma and vocabulary_used[x].lower() != target_word_lower]
                backward_samples += new_samples
            logging.info('some forward samples: %s' % [x for x in forward_samples[:5]])
            logging.info('some backward samples: %s' % [x for x in backward_samples[:5]])
            representatives = []
            for _ in range(n_represent):
                representative = dict()
                for _ in range(n_samples_side):
                    for sample_src in forward_samples, backward_samples:
                        sample_word = sample_src.pop()
                        representative[sample_word] = representative.get(sample_word, 0) + 1
                representatives.append(representative)
            logging.info('first 3 representatives out of %d:\n%s' % (n_represent, representatives[:3]))
            results[inst_id] = representatives
        return results
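A minimal sketch of the state slicing used above, assuming an ElmoEmbedder instance named embedder: embed_sentence returns a [3, n_tokens, 1024] array, and (per the comment in the loop) dims 0:512 of each state are the forward LM half, 512:1024 the backward LM half.

states = embedder.embed_sentence(['Michael', 'went', 'home'])
forward_state_at_last_token = states[2, -1, :512]     # top layer, last token, forward LM
backward_state_at_first_token = states[2, 0, 512:]    # top layer, first token, backward LM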
Example #26
# Imports added for self-containment; abc() below is a project-local helper (not shown).
import sys
from pathlib import Path

import pandas as pd
import torch
from allennlp.commands.elmo import ElmoEmbedder

test_tcr = pd.read_csv('TCR.csv')
test_ac1, test_ac2, test_bc1, test_bc2 = abc('test_tracdr.txt',
                                             'test_trbcdr.txt')
test_ac3 = []
test_bc3 = []

for ind, out in test_tcr.iterrows():
    test_ac3.append(out.iloc[1])
    test_bc3.append(out.iloc[4])

model_dir = Path('uniref50_v2')
weights = model_dir / 'weights.hdf5'
options = model_dir / 'options.json'
seqvec = ElmoEmbedder(options, weights, cuda_device=-1)


def s2v(seq):
    embed1 = seqvec.embed_sentence(list(seq))
    protein_embd1 = torch.tensor(embed1).sum(dim=0).mean(dim=0)
    return list(protein_embd1.detach().numpy())
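s2v collapses the (3, len(seq), 1024) per-residue SeqVec output into a single 1024-d protein vector: sum over the three ELMo layers, then average over residues. A toy usage with a made-up sequence:

example_vec = s2v('MKTAYIAKQR')  # hypothetical sequence; returns a list of 1024 floats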


def embed(l):
    value = []
    uni = list(set(l))
    for i, seq in enumerate(uni):
        sys.stdout.write('%d\r' % i)
        sys.stdout.flush()
        value.append(s2v(seq))
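    # Assumed completion: the scraped snippet is truncated here; returning the
    # embeddings keyed by their unique sequence is one plausible intent.
    return dict(zip(uni, value))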
Example #27
    def __init__(self):
        self.elmo = ElmoEmbedder()
Example #28
import os
import sys
import time

import nltk
import torch
def loadModel(args):
    # Load model according to the choice in "embed".
    print("Begin loading model...")
    embed = args.embedding
    start = time.time()
    if embed == "glove":
        from GloVe import loadAndCreateModel
        if args.path:
            path_to_glove = args.path  # "./../../../../Perso/Pretrained-Embedding/GloVe/"
            print('GloVe path: ' + path_to_glove +
                  '.\nWarning: in GloVe case, it must be the FOLDER path.')
        else:
            print('You need to give GloVe FOLDER path')
            sys.exit()
        if args.dimension:
            dim = args.dimension
            if dim not in [50, 100, 200, 300]:
                print(
                    "Available GloVe dimension: 50, 100, 200 or 300. You chose %d !"
                    % (dim))
                sys.exit()
        else:
            dim = 50
        print('Chosen dimension for GloVe: ', dim)
        start = time.time()
        model = loadAndCreateModel(dim, path_to_glove)
        vocab_size = len(model.keys())
        d = len(model['hello'])
    elif embed == "numberBatch":
        from numberbatch import loadAndCreateNumberBatchModel
        start = time.time()
        dim = 300
        model = loadAndCreateNumberBatchModel()
        vocab_size = len(model.keys())
        d = len(model['hello'])
    elif embed == "miniNumberbatch":
        from miniNumberbatch import loadMiniNumberbatch
        if args.path:
            mNb_path = args.path  #"./../17.06/mini.h5"
            print(
                'Conceptnet model path: ' + mNb_path +
                '. Warning: in miniNumberbatch case, it must be the FILE.h5 path.'
            )
        else:
            print('You need to give ConceptNet miniNumberBatch FILE.h5 path')
            sys.exit()
        start = time.time()
        model = loadMiniNumberbatch(mNb_path)
        vocab_size = len(model.keys())
        d = len(model['hello'])
    elif embed == "elmo":
        from allennlp.commands.elmo import ElmoEmbedder

        ### ELMo embedding on training data
        if args.which_elmo:
            which_elmo = args.which_elmo  # "small"
            print("Chosen ELMo option: ", which_elmo)
        else:
            which_elmo = "small"
            print('Default ELMo chosen: small.')
        if which_elmo == "small":
            weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
            options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
        elif which_elmo == "medium":
            weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"
            options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
        elif which_elmo == "original":
            weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
            options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
        else:
            print('This option is not available...')
            sys.exit()
        start = time.time()
        print("Downloading elmo model...")
        model = ElmoEmbedder(options_file, weight_file)
        dim = model.embed_sentence(['Hello']).shape[2]
        d = dim
        vocab_size = 0
        print("Downloaded in %fs" % (time.time() - start))
    elif embed == 'infersent':
        from models import InferSent
        nltk.download('punkt')
        if args.path:
            inferSent_path = args.path
            print(
                'InferSent model path: ' + inferSent_path +
                '. Warning: in InferSent case, it must be InferSent FOLDER path.'
            )
        else:
            print('You need to give InferSent FOLDER path')
            sys.exit()
        if args.version == 1 or args.version == 2:
            model_version = args.version
        else:
            print(
                'You need to choose InferSent version between 1 (Word2Vec input) or 2 (FastText input).'
            )
            sys.exit()
        if args.embedding_path:
            W2V_PATH = args.embedding_path
            print(
                'InferSent pretrained embedding path: ' + W2V_PATH +
                '. Warning: in this case, it must be "model.txt" or "model.vec" path.'
            )
        else:
            print(
                'You need to give the InferSent "model.txt" or "model.vec" path.'
            )
            sys.exit()
        MODEL_PATH = os.path.join(inferSent_path,
                                  "./encoder/infersent%s.pkl" % model_version)
        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': model_version
        }
        model = InferSent(params_model)
        model.load_state_dict(torch.load(MODEL_PATH))

        # If infersent1 -> use GloVe embeddings. If infersent2 -> use InferSent embeddings.
        # W2V_PATH = './../../../../Perso/Pretrained-Embedding/GloVe/glove.840B.300d.txt' if model_version == 1 else '../dataset/fastText/crawl-300d-2M.vec'
        model.set_w2v_path(W2V_PATH)
        # Load embeddings of K most frequent words
        vocab_size = 100000
        model.build_vocab_k_words(K=vocab_size)
        d = model.encode(['hello guys']).shape[1]

    print('Model ' + embed.upper() + ' loaded in %fs.' % (time.time() - start))
    print("Vocabulary size: %d" % vocab_size)
    print("Vector dimension: %d" % d)

    return model, d
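A minimal sketch of how loadModel might be driven; the argument names below (embedding, path, dimension, which_elmo, version, embedding_path) are assumed from the attribute accesses inside the function:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--embedding', default='elmo',
                    choices=['glove', 'numberBatch', 'miniNumberbatch', 'elmo', 'infersent'])
parser.add_argument('--path', default=None)
parser.add_argument('--dimension', type=int, default=None)
parser.add_argument('--which_elmo', default='small')
parser.add_argument('--version', type=int, default=1)
parser.add_argument('--embedding_path', default=None)

model, d = loadModel(parser.parse_args())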
Example #29
    def test_embed_batch_is_empty_sentence(self):
        embedder = ElmoEmbedder(options_file=self.options_file, weight_file=self.weight_file)
        embeddings = embedder.embed_sentence([])

        assert embeddings.shape == (3, 0, 1024)
Example #30
import pickle
import torch
import random

# set deterministic results
'''
SEED = 1234
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
'''

from allennlp.commands.elmo import ElmoEmbedder
elmo = ElmoEmbedder()

from graph_lstm import *
from decoder import *
from graph2seq_model import *

# get the decoder vocab
with open('./data/vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)
    print("Size of vocab: {}".format(vocab.idx))

# get the graph lstm synset vocab
with open('./data/synset_vocab.pkl', 'rb') as f:
    synset_vocab = pickle.load(f)
print("Size of synset vocab: {}".format(synset_vocab.idx))
Example #31
import sys

sys.path.append('src')
import new_data_io, SIF_embedding_lib
import csv
from allennlp.commands.elmo import ElmoEmbedder
from nltk import word_tokenize
import numpy as np
""" This area should be placed in the main call """
elmo = ElmoEmbedder()


class params(object):
    def __init__(self):
        self.LW = 1e-5
        self.LC = 1e-5
        self.eta = 0.05

    def __str__(self):
        t = "LW", self.LW, ", LC", self.LC, ", eta", self.eta
        t = map(str, t)
        return ' '.join(t)


cleanfile = "test.txt"
wordfile = 'glove.6B.100d.txt'  # word vector file, can be downloaded from GloVe website
weightfile = 'enwiki_vocab_min200.txt'

(words, We) = new_data_io.getWordmap(wordfile)
weightpara = 1e-3
word2weight = new_data_io.getWordWeight(weightfile, weightpara)  # assumed completion; the scraped snippet was truncated here
Example #32
import sys

import torch
import torch.nn.functional as F
from torch.nn import Module, ModuleDict, ModuleList, Linear, Dropout
from allennlp.commands.elmo import ElmoEmbedder

class MLPRegression(Module):
    def __init__(self,
                 embed_params,
                 attention_type,
                 all_attributes,
                 output_size,
                 layers,
                 hand_feat_dim,
                 device="cpu",
                 embedding_dim=1024,
                 turn_on_hand_feats=False,
                 turn_on_embeddings=False):
        '''
            Super class for training
        '''
        super(MLPRegression, self).__init__()

        # Set model constants and embeddings
        self.device = device
        self.layers = layers
        self.embedding_dim = embedding_dim
        self.output_size = output_size
        self.attention_type = attention_type
        self.all_attributes = all_attributes
        self.is_hand_feats_on = turn_on_hand_feats
        self.is_embeds_on = turn_on_embeddings

        # Initialise embeddings
        if self.is_embeds_on:
            self._init_embeddings(embed_params)
        else:
            self.reduced_embedding_dim = 0
        if self.is_hand_feats_on:
            self.hand_feat_dim = hand_feat_dim
        else:
            self.hand_feat_dim = 0

        # Initialise regression layers and parameters
        self._init_regression()

        # Initialise attention parameters
        self._init_attention()

    def _init_embeddings(self, embedding_params):
        '''
            Initialise embeddings
        '''
        if isinstance(embedding_params[0], str):
            self.vocab = None
            options_file = embedding_params[0]
            weight_file = embedding_params[1]
            self.embeddings = ElmoEmbedder(options_file,
                                           weight_file,
                                           cuda_device=0)
            # self.embeddings = Elmo(options_file, weight_file, 3, dropout=0)
            self.reduced_embedding_dim = 256

            # ELMO tuning parameters
            self.embed_linmap_argpred_lower = Linear(
                self.embedding_dim, self.reduced_embedding_dim)
            self.embed_linmap_argpred_mid = Linear(self.embedding_dim,
                                                   self.reduced_embedding_dim,
                                                   bias=False)
            self.embed_linmap_argpred_top = Linear(self.embedding_dim,
                                                   self.reduced_embedding_dim,
                                                   bias=False)

        else:
            # GloVe embeddings
            glove_embeds = embedding_params[0]
            self.vocab = embedding_params[1]
            self.num_embeddings = len(self.vocab)
            self.embeddings = torch.nn.Embedding(self.num_embeddings,
                                                 self.embedding_dim,
                                                 max_norm=None,
                                                 norm_type=2,
                                                 scale_grad_by_freq=False,
                                                 sparse=False)
            self.reduced_embedding_dim = 300

            self.embeddings.weight.data.copy_(
                torch.from_numpy(glove_embeds.values))
            self.embeddings.weight.requires_grad = False
            self.vocab_hash = {w: i for i, w in enumerate(self.vocab)}
            # self.embed_linmap = Linear(self.embedding_dim, self.reduced_embedding_dim)

    def _init_regression(self):
        '''
            Define the linear maps
        '''

        # Output regression parameters
        self.linmaps = ModuleDict(
            {prot: ModuleList([])
             for prot in self.all_attributes.keys()})

        for prot in self.all_attributes.keys():
            last_size = self.reduced_embedding_dim
            # Handle varying size of dimension depending on representation
            if self.attention_type[prot]['repr'] == "root":
                if self.attention_type[prot]['context'] != "none":
                    last_size *= 2
            else:
                if self.attention_type[prot]['context'] == "none":
                    last_size *= 2
                else:
                    last_size *= 3
            # self.layer_norm[prot] = torch.nn.LayerNorm(last_size)
            last_size += self.hand_feat_dim
            for out_size in self.layers:
                linmap = Linear(last_size, out_size)
                self.linmaps[prot].append(linmap)
                last_size = out_size
            final_linmap = Linear(last_size, self.output_size)
            self.linmaps[prot].append(final_linmap)

        # Dropout layer
        self.dropout = Dropout()

    def _regression_nonlinearity(self, x):
        return F.relu(x)

    def _init_attention(self):
        '''
            Initialises the attention map vector/matrix

            Uses attention_type (span / sentence / span-param / sentence-param)
            to decide the size of the attention matrix
        '''

        self.att_map_repr = ModuleDict({})
        self.att_map_W = ModuleDict({})
        self.att_map_V = ModuleDict({})
        self.att_map_context = ModuleDict({})
        for prot in self.attention_type.keys():
            # Token representation
            if self.attention_type[prot]['repr'] == "span":
                repr_dim = 2 * self.reduced_embedding_dim
                self.att_map_repr[prot] = Linear(self.reduced_embedding_dim,
                                                 1,
                                                 bias=False)
                self.att_map_W[prot] = Linear(self.reduced_embedding_dim,
                                              self.reduced_embedding_dim)
                self.att_map_V[prot] = Linear(self.reduced_embedding_dim,
                                              1,
                                              bias=False)
            elif self.attention_type[prot]['repr'] == "param":
                repr_dim = 2 * self.reduced_embedding_dim
                self.att_map_repr[prot] = Linear(self.reduced_embedding_dim,
                                                 self.reduced_embedding_dim,
                                                 bias=False)
                self.att_map_W[prot] = Linear(2 * self.reduced_embedding_dim,
                                              self.reduced_embedding_dim)
                self.att_map_V[prot] = Linear(self.reduced_embedding_dim,
                                              1,
                                              bias=False)
            else:
                repr_dim = self.reduced_embedding_dim

            # Context representation
            # There is no attention for argument davidsonian
            if self.attention_type[prot]['context'] == 'param':
                self.att_map_context[prot] = Linear(repr_dim,
                                                    self.reduced_embedding_dim,
                                                    bias=False)
            elif self.attention_type[prot][
                    'context'] == 'david' and prot == 'arg':
                self.att_map_context[prot] = Linear(repr_dim,
                                                    self.reduced_embedding_dim,
                                                    bias=False)

    def _choose_tokens(self, batch, lengths):
        '''
            Extracts token embeddings from a batch at the specified positions
            batch - batch_size x max_sent_length x embed_dim
            lengths - batch_size x max_span_length (token indices into each sentence)
        '''
        idx = lengths.unsqueeze(2).expand(-1, -1, batch.shape[2])
        return batch.gather(1, idx).squeeze()

    def _get_inputs(self, words):
        '''
           Return (reduced) ELMo or GloVe embeddings for a batch of sentences,
           together with the corresponding masks
        '''
        if not self.vocab:
            raw_embeds, masks = self.embeddings.batch_to_embeddings(words)
            # raw_ = self.embeddings(batch_to_ids(words).to(self.device))
            # raw_embeds, masks = torch.cat([x.unsqueeze(1) for x in raw_['elmo_representations']], dim=1), raw_['mask']
            masks = masks.unsqueeze(2).repeat(
                1, 1, self.reduced_embedding_dim).byte()
            embedded_inputs = (self.embed_linmap_argpred_lower(
                raw_embeds[:, 0, :, :].squeeze()) +
                               self.embed_linmap_argpred_mid(
                                   raw_embeds[:, 1, :, :].squeeze()) +
                               self.embed_linmap_argpred_top(
                                   raw_embeds[:, 2, :, :].squeeze()))
            masked_embedded_inputs = embedded_inputs * masks.float()
            return masked_embedded_inputs, masks
        else:
            # Glove embeddings
            indices = [[self.vocab_hash[word] for word in sent]
                       for sent in words]
            indices = torch.tensor(indices,
                                   dtype=torch.long,
                                   device=self.device)
            embeddings = self.embeddings(indices)
            masks = (embeddings != 0)[:, :, :self.reduced_embedding_dim].byte()
            # reduced_embeddings = self.embed_linmap(embeddings) * masks.float()
            return embeddings, masks

    def _get_representation(self,
                            prot,
                            embeddings,
                            roots,
                            spans,
                            context=False):
        '''
            Returns the requested token representation, running span
            attention when the configuration calls for it
        '''

        # Get token(pred/arg) representation
        rep_type = self.attention_type[prot]['repr']

        roots_rep_raw = self._choose_tokens(embeddings, roots)
        if len(roots_rep_raw.shape) == 1:
            roots_rep_raw = roots_rep_raw.unsqueeze(0)

        if rep_type == "root":
            token_rep = roots_rep_raw
        else:
            masks_spans = (spans == -1)
            spans[spans == -1] = 0
            spans_rep_raw = self._choose_tokens(embeddings, spans)

            if len(spans_rep_raw.shape) == 1:
                spans_rep_raw = spans_rep_raw.unsqueeze(0).unsqueeze(1)
            elif len(spans_rep_raw.shape) == 2:
                if spans.shape[0] == 1:
                    spans_rep_raw = spans_rep_raw.unsqueeze(0)
                elif spans.shape[1] == 1:
                    spans_rep_raw = spans_rep_raw.unsqueeze(1)

            if rep_type == "span":
                att_raw = self.att_map_repr[prot](spans_rep_raw).squeeze()
                # additive attention
                # att_raw_w = torch.relu(self.att_map_W[prot](for_att))
                # att_raw = self.att_map_V[prot](att_raw_w).squeeze()
            elif rep_type == "param":
                # att_param = torch.relu(self.att_map_repr[prot](roots_rep_raw)).unsqueeze(2)
                # att_raw = torch.matmul(spans_rep_raw, att_param).squeeze()
                # additive attention
                for_att = torch.cat(
                    (spans_rep_raw, roots_rep_raw.unsqueeze(1).repeat(
                        1, spans_rep_raw.shape[1], 1)),
                    dim=2)
                att_raw_w = torch.relu(self.att_map_W[prot](for_att))
                att_raw = self.att_map_V[prot](att_raw_w).squeeze()

            att_raw = att_raw.masked_fill(masks_spans, -1e9)
            att = F.softmax(att_raw, dim=1)
            att = self.dropout(att)
            pure_token_rep = torch.matmul(
                att.unsqueeze(2).permute(0, 2, 1), spans_rep_raw).squeeze()
            if not context:
                token_rep = torch.cat((roots_rep_raw, pure_token_rep), dim=1)
            else:
                token_rep = pure_token_rep

        return token_rep

    def _run_attention(self, prot, embeddings, roots, spans, context_roots,
                       context_spans, masks):
        '''
            Various attention mechanisms implemented
        '''

        # Get the required representation for pred/arg
        token_rep = self._get_representation(prot=prot,
                                             embeddings=embeddings,
                                             roots=roots,
                                             spans=spans)

        # Get the required representation for context of pred/arg
        context_type = self.attention_type[prot]['context']

        if context_type == "none":
            context_rep = None

        elif context_type == "param":
            # Sentence level attention
            att_param = torch.relu(
                self.att_map_context[prot](token_rep)).unsqueeze(1)
            att_raw = torch.matmul(embeddings, att_param.permute(0, 2, 1))
            att_raw = att_raw.masked_fill(masks[:, :, 0:1] == 0, -1e9)
            att = F.softmax(att_raw, dim=1)
            att = self.dropout(att)
            context_rep = torch.matmul(att.permute(0, 2, 1),
                                       embeddings).squeeze()

        elif context_type == "david":
            if prot == "arg":
                prot_context = 'pred'
                context_roots = torch.tensor(context_roots,
                                             dtype=torch.long,
                                             device=self.device).unsqueeze(1)
                max_span = max([len(a) for a in context_spans])
                context_spans = torch.tensor([
                    a + [-1 for i in range(max_span - len(a))]
                    for a in context_spans
                ],
                                             dtype=torch.long,
                                             device=self.device)
                context_rep = self._get_representation(context=True,
                                                       prot=prot_context,
                                                       embeddings=embeddings,
                                                       roots=context_roots,
                                                       spans=context_spans)
            else:
                prot_context = 'arg'
                context_rep = None
                for i, ctx_root in enumerate(context_roots):
                    ctx_root = torch.tensor(ctx_root,
                                            dtype=torch.long,
                                            device=self.device).unsqueeze(1)
                    max_span = max([len(a) for a in context_spans[i]])
                    ctx_span = torch.tensor([
                        a + [-1 for i in range(max_span - len(a))]
                        for a in context_spans[i]
                    ],
                                            dtype=torch.long,
                                            device=self.device)
                    sentence = embeddings[i, :, :].unsqueeze(0).repeat(
                        len(ctx_span), 1, 1)
                    ctx_reps = self._get_representation(context=True,
                                                        prot=prot_context,
                                                        embeddings=sentence,
                                                        roots=ctx_root,
                                                        spans=ctx_span)

                    if len(ctx_reps.shape) == 1:
                        ctx_reps = ctx_reps.unsqueeze(0)
                    # Attention over arguments
                    att_nd_param = torch.relu(self.att_map_context[prot](
                        token_rep[i, :].unsqueeze(0)))
                    att_raw = torch.matmul(att_nd_param,
                                           ctx_reps.permute(1, 0))
                    att = F.softmax(att_raw, dim=1)
                    ctx_rep_final = torch.matmul(att, ctx_reps)
                    if i:
                        context_rep = torch.cat((context_rep, ctx_rep_final),
                                                dim=0).squeeze()
                    else:
                        context_rep = ctx_rep_final

        if context_rep is not None:
            inputs_for_regression = torch.cat((token_rep, context_rep), dim=1)
        else:
            inputs_for_regression = token_rep

        return inputs_for_regression

    def _run_regression(self, prot, x):
        '''
            Run regression to get 3 attribute vector
        '''
        for i, lin_map in enumerate(self.linmaps[prot]):
            if i:
                x = self._regression_nonlinearity(x)
                x = self.dropout(x)

            x = lin_map(x)

        return torch.sigmoid(x)

    def forward(self, prot, words, roots, spans, context_roots, context_spans,
                hand_feats):
        """
            Forward propagation of activations
        """

        if self.is_embeds_on:
            inputs_for_attention, masks = self._get_inputs(words)
            inputs_for_regression = self._run_attention(
                prot=prot,
                embeddings=inputs_for_attention,
                roots=roots,
                spans=spans,
                context_roots=context_roots,
                context_spans=context_spans,
                masks=masks)
            if self.is_hand_feats_on:
                inputs_for_regression = torch.cat(
                    (inputs_for_regression, hand_feats), dim=1)
        elif self.is_hand_feats_on:
            inputs_for_regression = hand_feats
        else:
            sys.exit('You need some word representation!!')

        outputs = self._run_regression(prot=prot, x=inputs_for_regression)
        return outputs
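A small, self-contained illustration (shapes assumed) of the gather trick that _choose_tokens relies on:

import torch

batch = torch.randn(2, 5, 4)            # batch_size x max_sent_length x embed_dim
positions = torch.tensor([[1], [3]])    # batch_size x 1 target index per sentence
idx = positions.unsqueeze(2).expand(-1, -1, batch.shape[2])
picked = batch.gather(1, idx).squeeze() # batch_size x embed_dim
assert torch.equal(picked[0], batch[0, 1]) and torch.equal(picked[1], batch[1, 3])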
Example #34
from allennlp.commands.elmo import ElmoEmbedder
import pickle
from utils import Config, safe_pickle_dump
import gensim
elmo = ElmoEmbedder(
    options_file='https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json',
    weight_file='https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
)

db = pickle.load(open(Config.db_path, 'rb'))
summary_tokens = []
for pid, j in db.items():
    # idvv = '%sv%d' % (j['_rawid'], j['_version'])
    summary = j['summary'].replace('\n', ' ')
    summary = gensim.utils.simple_preprocess(summary)
    summary_tokens.append(summary)
print(len(summary_tokens))
elmo_embed = elmo.embed_batch(summary_tokens)
safe_pickle_dump(elmo_embed, 'elmo_embed.p')
Example #35
    def embedder(self):
        # Lazily construct and cache the ElmoEmbedder on first access;
        # options_file / weight_file are module-level constants defined elsewhere.
        if self._memo_embedder is None:
            from allennlp.commands.elmo import ElmoEmbedder
            self._memo_embedder = ElmoEmbedder(options_file=options_file,
                                               weight_file=weight_file)
        return self._memo_embedder
Example #36
import logging

import h5py
import numpy as np
from allennlp.commands.elmo import ElmoEmbedder
    def __init__(self,
                 cuda_device,
                 weights_path,
                 vocab_path,
                 batch_size=40,
                 cutoff_elmo_vocab=50000):
        super().__init__()
        logging.info(
            'creating elmo on device %d. weights path: %s, vocab path: %s, '
            'batch_size: %d' %
            (cuda_device, weights_path, vocab_path, batch_size))
        self.elmo = ElmoEmbedder(cuda_device=cuda_device)

        self.batch_size = batch_size

        logging.info('warming up elmo')
        self._warm_up_elmo()

        logging.info('reading elmo weights')
        with h5py.File(weights_path, 'r', libver='latest', swmr=True) as fin:
            self.elmo_softmax_w = fin[
                'softmax/W'][:cutoff_elmo_vocab, :].transpose()

        self.elmo_word_vocab = []
        self.elmo_word_vocab_lemmatized = []

        # we prevent the prediction of these by removing their weights and their vocabulary altogether
        stop_words = {'<UNK>', '<S>', '</S>', '--', '..', '...', '....'}

        logging.info('reading elmo vocabulary')

        lines_to_remove = set()
        with open(vocab_path, encoding="utf-8") as fin:
            for idx, line in enumerate(fin):
                if idx == cutoff_elmo_vocab:
                    break
                word = line.strip()
                if len(word) == 1 or word in stop_words:
                    lines_to_remove.add(idx)
                self.elmo_word_vocab.append(word)

        with open(vocab_path + '.lemmatized', encoding="utf-8") as fin:
            for idx, line in enumerate(fin):
                if idx == cutoff_elmo_vocab:
                    break
                word = line.strip()
                if len(word) == 1 or word in stop_words:
                    lines_to_remove.add(idx)
                self.elmo_word_vocab_lemmatized.append(word)

        # remove stopwords
        self.elmo_word_vocab = [
            x for i, x in enumerate(self.elmo_word_vocab)
            if i not in lines_to_remove
        ]
        self.elmo_word_vocab_lemmatized = [
            x for i, x in enumerate(self.elmo_word_vocab_lemmatized)
            if i not in lines_to_remove
        ]
        self.elmo_softmax_w = np.delete(self.elmo_softmax_w,
                                        list(lines_to_remove), 1)
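A hedged sketch of how the pruned elmo_softmax_w could be used downstream to turn a 512-d LM state into a word distribution over the remaining vocabulary (mirroring the _get_top_words_dist call in the earlier example; the function name here is illustrative):

    def lm_word_distribution(lm_state, elmo_softmax_w):
        # lm_state: (512,) forward or backward half of an ELMo state
        # elmo_softmax_w: (512, vocab) after the transpose and pruning above
        logits = lm_state @ elmo_softmax_w
        exp = np.exp(logits - logits.max())  # numerically stable softmax
        return exp / exp.sum()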