Example 1
    def __init__(self, configs):
        BaseModel.__init__(self, configs)

        self.encoder = TransformerEncoder(configs)
        self.pair_scorer = ScoreModule(self.get_pair_embs_size(),
                                       [configs['ffnn_size']] *
                                       configs['ffnn_depth'],
                                       configs['dropout_rate'])

        # GENE embeddings (if use_gene_features enabled)
        if configs['use_gene_features']:
            self.gene_dim = GENE2DIM.get(self.configs['gene_variant'],
                                         GENE_DIM)
            self.event2emb = get_event2geneemb(configs['gene_variant'])
            for e in self.event2emb:
                self.event2emb[e] = self.event2emb[e].to(self.device)
            self.defaultgene = nn.Embedding(1, self.gene_dim)

        # Initialize embeddings
        for name, param in self.named_parameters():
            if ('transformer' not in name.lower()
                    and 'embedding' in name.lower()):
                print('Re-initialize embedding {}'.format(name))
                param.data.uniform_(-0.1, 0.1)

        # Move model to device
        self.to(self.device)
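
For context, a minimal sketch of what a ScoreModule-like pair scorer could look like, given only the constructor call above (an input size, a list of hidden-layer sizes, a dropout rate). FFNNScorer is a hypothetical stand-in; the real ScoreModule, TransformerEncoder, GENE2DIM, and get_event2geneemb live in the surrounding project and are not reproduced here.

import torch
import torch.nn as nn

class FFNNScorer(nn.Module):
    """Hypothetical stand-in for ScoreModule: a feed-forward scorer that
    maps a pair embedding to a single score, matching the constructor
    call above (input size, list of hidden sizes, dropout rate)."""

    def __init__(self, input_size, hidden_sizes, dropout_rate):
        super().__init__()
        layers, prev = [], input_size
        for size in hidden_sizes:
            layers += [nn.Linear(prev, size), nn.ReLU(),
                       nn.Dropout(dropout_rate)]
            prev = size
        layers.append(nn.Linear(prev, 1))  # scalar score per pair
        self.ffnn = nn.Sequential(*layers)

    def forward(self, pair_embs):
        # (batch, input_size) -> (batch,)
        return self.ffnn(pair_embs).squeeze(-1)

# Mirrors the call above, e.g. FFNNScorer(1024, [150] * 2, 0.2)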
Example 2
    def __init__(self, name, tokenizer, optimizer):
        BaseModel.__init__(self, name, tokenizer, optimizer)
Example 3
    def __init__(self, name, tokenizer, optimizer):
        BaseModel.__init__(self, name, tokenizer, optimizer)

        # Vectorize the data.
        self.input_texts = []
        self.target_texts = []
        self.input_characters = set()
        self.target_characters = set()

        for ch in self.CHARS_BASIC:
            self.input_characters.add(ch)
            self.target_characters.add(ch)

        lines = data.load_clean_sentences('both')

        for line in lines:
            input_text = line[1]  # Swedish
            target_text = line[0]  # English
            # We use "tab" as the "start sequence" character
            # for the targets, and "\n" as "end sequence" character.
            target_text = self.CH_START + target_text + self.CH_END
            self.input_texts.append(input_text)
            self.target_texts.append(target_text)
            # Sets already de-duplicate, so no membership check is needed.
            self.input_characters.update(input_text)
            self.target_characters.update(target_text)

        self.input_characters = sorted(self.input_characters)
        self.target_characters = sorted(self.target_characters)
        self.num_encoder_tokens = len(self.input_characters)
        self.num_decoder_tokens = len(self.target_characters)
        self.max_encoder_seq_length = max(
            len(txt) for txt in self.input_texts)
        self.max_decoder_seq_length = max(
            len(txt) for txt in self.target_texts)

        print('Number of samples:', len(self.input_texts))
        print('Number of unique input tokens:', self.num_encoder_tokens)
        print('Number of unique output tokens:', self.num_decoder_tokens)
        print('Max sequence length for inputs:', self.max_encoder_seq_length)
        print('Max sequence length for outputs:', self.max_decoder_seq_length)

        self.input_token_index = {
            char: i for i, char in enumerate(self.input_characters)
        }
        self.target_token_index = {
            char: i for i, char in enumerate(self.target_characters)
        }

        self.encoder_input_data = np.zeros(
            (len(self.input_texts), self.max_encoder_seq_length,
             self.num_encoder_tokens),
            dtype='float32')
        self.decoder_input_data = np.zeros(
            (len(self.input_texts), self.max_decoder_seq_length,
             self.num_decoder_tokens),
            dtype='float32')
        self.decoder_target_data = np.zeros(
            (len(self.input_texts), self.max_decoder_seq_length,
             self.num_decoder_tokens),
            dtype='float32')

        for i, (input_text, target_text) in enumerate(
                zip(self.input_texts, self.target_texts)):
            for t, char in enumerate(input_text):
                self.encoder_input_data[i, t,
                                        self.input_token_index[char]] = 1.
            for t, char in enumerate(target_text):
                # decoder_target_data is ahead of decoder_input_data by one timestep
                self.decoder_input_data[i, t,
                                        self.target_token_index[char]] = 1.
                if t > 0:
                    # decoder_target_data will be ahead by one timestep
                    # and will not include the start character.
                    self.decoder_target_data[
                        i, t - 1, self.target_token_index[char]] = 1.

        # Reverse-lookup token index to decode sequences back to
        # something readable.
        self.reverse_input_char_index = {
            i: char for char, i in self.input_token_index.items()}
        self.reverse_target_char_index = {
            i: char for char, i in self.target_token_index.items()}
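
The three arrays built above match the input contract of the classic Keras character-level seq2seq model. The following is a minimal sketch of a model that would consume them; the Keras imports, latent_dim, and the placeholder token counts are assumptions, not part of the snippet above.

from keras.models import Model
from keras.layers import Input, LSTM, Dense

latent_dim = 256          # assumed hidden size
num_encoder_tokens = 64   # placeholder; use self.num_encoder_tokens
num_decoder_tokens = 64   # placeholder; use self.num_decoder_tokens

# Encoder: read the one-hot inputs, keep only the final LSTM states.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
_, state_h, state_c = LSTM(latent_dim, return_state=True)(encoder_inputs)

# Decoder: teacher-forced on decoder_input_data, trained against
# decoder_target_data, which is shifted one timestep ahead.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=[state_h, state_c])
decoder_outputs = Dense(num_decoder_tokens,
                        activation='softmax')(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
# model.fit([encoder_input_data, decoder_input_data], decoder_target_data, ...)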
Example 4
    def __init__(self,
                 name,
                 tokenizer,
                 optimizer,
                 include_dropout=False,
                 latent_dim=256,
                 reverse_order=False,
                 bidi=False):
        """
        :param reverse_order: If True, reverse the order of input tokens to ease training
        """
        BaseModel.__init__(self, name, tokenizer, optimizer)

        # Model configuration flags
        self.include_dropout = include_dropout
        self.latent_dim = latent_dim
        self.reverse_order = reverse_order
        self.bidi = bidi  # If true, use a Bidirectional wrapper around the encoder LSTM
        self.other_tokens = set()  # input
        self.eng_tokens = {self.CH_START, self.CH_END}  # target

        # Wrap targets with start/end markers and collect all tokens
        for idx in range(len(self.eng_texts)):
            self.eng_texts[idx] = (self.CH_START + self.eng_texts[idx]
                                   + self.CH_END)
            self.eng_tokenized[idx] = ([self.CH_START]
                                       + self.eng_tokenized[idx]
                                       + [self.CH_END])
            self.other_tokens.update(self.other_tokenized[idx])
            self.eng_tokens.update(self.eng_tokenized[idx])

        self.other_tokens = sorted(self.other_tokens)
        self.eng_tokens = sorted(self.eng_tokens)
        self.num_encoder_tokens = len(self.other_tokens)
        self.num_decoder_tokens = len(self.eng_tokens)
        self.max_encoder_seq_length = max(
            len(txt) for txt in self.other_tokenized)
        self.max_decoder_seq_length = max(
            len(txt) for txt in self.eng_tokenized)

        print('Number of samples:', self.num_samples)
        print('Number of unique input tokens:', self.num_encoder_tokens)
        print('Number of unique output tokens:', self.num_decoder_tokens)
        print('Max sequence length for inputs:', self.max_encoder_seq_length)
        print('Max sequence length for outputs:', self.max_decoder_seq_length)

        self.input_token_index = {
            token: i for i, token in enumerate(self.other_tokens)
        }
        self.target_token_index = {
            token: i for i, token in enumerate(self.eng_tokens)
        }

        self.encoder_input_data = numpy.zeros(
            (self.num_samples, self.max_encoder_seq_length,
             self.num_encoder_tokens),
            dtype='uint8')
        self.decoder_input_data = numpy.zeros(
            (self.num_samples, self.max_decoder_seq_length,
             self.num_decoder_tokens),
            dtype='uint8')
        self.decoder_target_data = numpy.zeros(
            (self.num_samples, self.max_decoder_seq_length,
             self.num_decoder_tokens),
            dtype='uint8')

        # Create one-hot encoded values directly
        for i, (input_text, target_text) in enumerate(
                zip(self.other_tokenized, self.eng_tokenized)):
            for t, token in enumerate(input_text):
                self.encoder_input_data[i, t,
                                        self.input_token_index[token]] = 1
            for t, token in enumerate(target_text):
                # decoder_target_data is ahead of decoder_input_data by one timestep
                self.decoder_input_data[i, t,
                                        self.target_token_index[token]] = 1
                if t > 0:
                    # decoder_target_data will be ahead by one timestep
                    # and will not include the start character.
                    self.decoder_target_data[
                        i, t - 1, self.target_token_index[token]] = 1

        if reverse_order:
            # Flip the time axis once, after the loop. The original flip ran
            # once per sample inside the loop, cancelling itself out on every
            # other iteration instead of reversing each encoded sequence.
            self.encoder_input_data = numpy.flip(self.encoder_input_data, 1)

        # Reverse-lookup token index to decode sequences back to something readable.
        self.reverse_input_token_index = {
            i: token for token, i in self.input_token_index.items()}
        self.reverse_target_token_index = {
            i: token for token, i in self.target_token_index.items()}
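
Two design notes on the flags above. Reversing the source sequence (reverse_order) is the trick popularized by Sutskever et al. (2014): it shortens the distance between the beginning of the source and the beginning of the target, which can ease optimization. The bidi flag implies wrapping the encoder LSTM in a Bidirectional layer; one common way to realize that in Keras is sketched below. This is a hypothetical sketch, reusing the same assumed latent_dim and placeholder token count as before; note the decoder's state size must double to accept the concatenated states.

from keras.layers import Bidirectional, Concatenate, Input, LSTM

latent_dim = 256
num_encoder_tokens = 64  # placeholder; use self.num_encoder_tokens

encoder_inputs = Input(shape=(None, num_encoder_tokens))
bidi_encoder = Bidirectional(LSTM(latent_dim, return_state=True))
_, fwd_h, fwd_c, bwd_h, bwd_c = bidi_encoder(encoder_inputs)

# Concatenate forward/backward states; the decoder LSTM must then use
# latent_dim * 2 units to accept these as its initial state.
state_h = Concatenate()([fwd_h, bwd_h])
state_c = Concatenate()([fwd_c, bwd_c])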
Example 5
    def __init__(self):
        BaseModel.__init__(self)
        self.name = map_collections["scope"]
Example 6
    def __init__(self):
        BaseModel.__init__(self)
        self.name = map_collections["rol_relation"]