def __init__(self, configs):
    BaseModel.__init__(self, configs)
    self.encoder = TransformerEncoder(configs)
    self.pair_scorer = ScoreModule(self.get_pair_embs_size(),
                                   [configs['ffnn_size']] * configs['ffnn_depth'],
                                   configs['dropout_rate'])

    # GENE embeddings (if use_gene_features enabled)
    if configs['use_gene_features']:
        self.gene_dim = GENE2DIM.get(self.configs['gene_variant'], GENE_DIM)
        self.event2emb = get_event2geneemb(configs['gene_variant'])
        for e in self.event2emb:
            self.event2emb[e] = self.event2emb[e].to(self.device)
        self.defaultgene = nn.Embedding(1, self.gene_dim)

    # Initialize embeddings (only those outside the transformer encoder)
    for name, param in self.named_parameters():
        if ('transformer' not in name.lower()) and 'embedding' in name.lower():
            print('Re-initialize embedding {}'.format(name))
            param.data.uniform_(-0.1, 0.1)

    # Move model to device
    self.to(self.device)
def __init__(self, name, tokenizer, optimizer):
    BaseModel.__init__(self, name, tokenizer, optimizer)
def __init__(self, name, tokenizer, optimizer):
    BaseModel.__init__(self, name, tokenizer, optimizer)

    # Vectorize the data.
    self.input_texts = []
    self.target_texts = []
    self.input_characters = set()
    self.target_characters = set()
    for ch in self.CHARS_BASIC:
        self.input_characters.add(ch)
        self.target_characters.add(ch)

    lines = data.load_clean_sentences('both')
    for line in lines:
        input_text = line[1]   # Swedish
        target_text = line[0]  # English
        # We use "tab" as the "start sequence" character
        # for the targets, and "\n" as "end sequence" character.
        target_text = self.CH_START + target_text + self.CH_END
        self.input_texts.append(input_text)
        self.target_texts.append(target_text)
        for char in input_text:
            if char not in self.input_characters:
                self.input_characters.add(char)
        for char in target_text:
            if char not in self.target_characters:
                self.target_characters.add(char)

    self.input_characters = sorted(list(self.input_characters))
    self.target_characters = sorted(list(self.target_characters))
    self.num_encoder_tokens = len(self.input_characters)
    self.num_decoder_tokens = len(self.target_characters)
    self.max_encoder_seq_length = max(
        [len(txt) for txt in self.input_texts])
    self.max_decoder_seq_length = max(
        [len(txt) for txt in self.target_texts])

    print('Number of samples:', len(self.input_texts))
    print('Number of unique input tokens:', self.num_encoder_tokens)
    print('Number of unique output tokens:', self.num_decoder_tokens)
    print('Max sequence length for inputs:', self.max_encoder_seq_length)
    print('Max sequence length for outputs:', self.max_decoder_seq_length)

    self.input_token_index = dict(
        [(char, i) for i, char in enumerate(self.input_characters)])
    self.target_token_index = dict(
        [(char, i) for i, char in enumerate(self.target_characters)])

    self.encoder_input_data = np.zeros(
        (len(self.input_texts), self.max_encoder_seq_length,
         self.num_encoder_tokens),
        dtype='float32')
    self.decoder_input_data = np.zeros(
        (len(self.input_texts), self.max_decoder_seq_length,
         self.num_decoder_tokens),
        dtype='float32')
    self.decoder_target_data = np.zeros(
        (len(self.input_texts), self.max_decoder_seq_length,
         self.num_decoder_tokens),
        dtype='float32')

    for i, (input_text, target_text) in enumerate(
            zip(self.input_texts, self.target_texts)):
        for t, char in enumerate(input_text):
            self.encoder_input_data[i, t, self.input_token_index[char]] = 1.
        for t, char in enumerate(target_text):
            # decoder_target_data is ahead of decoder_input_data by one timestep.
            self.decoder_input_data[i, t, self.target_token_index[char]] = 1.
            if t > 0:
                # decoder_target_data will be ahead by one timestep
                # and will not include the start character.
                self.decoder_target_data[
                    i, t - 1, self.target_token_index[char]] = 1.

    # Reverse-lookup token index to decode sequences back to
    # something readable.
    self.reverse_input_char_index = dict(
        (i, char) for char, i in self.input_token_index.items())
    self.reverse_target_char_index = dict(
        (i, char) for char, i in self.target_token_index.items())
def __init__(self, name, tokenizer, optimizer, include_dropout=False,
             latent_dim=256, reverse_order=False, bidi=False):
    """
    :param reverse_order: If True, reverse the order of input tokens to ease training
    """
    BaseModel.__init__(self, name, tokenizer, optimizer)

    self.include_dropout = include_dropout
    self.latent_dim = latent_dim
    self.reverse_order = reverse_order
    self.bidi = bidi  # If True, use a Bidirectional wrapper around the encoder LSTM

    self.other_tokens = set()  # input
    self.eng_tokens = {self.CH_START, self.CH_END}  # target

    # Collect all tokens across all input lines
    for idx, line in enumerate(self.eng_texts):
        self.eng_texts[idx] = self.CH_START + self.eng_texts[idx] + self.CH_END
        self.eng_tokenized[idx] = ([self.CH_START] + self.eng_tokenized[idx]
                                   + [self.CH_END])
        for token in self.other_tokenized[idx]:
            self.other_tokens.add(token)
        for token in self.eng_tokenized[idx]:
            self.eng_tokens.add(token)

    self.other_tokens = sorted(list(self.other_tokens))
    self.eng_tokens = sorted(list(self.eng_tokens))
    self.num_encoder_tokens = len(self.other_tokens)
    self.num_decoder_tokens = len(self.eng_tokens)
    self.max_encoder_seq_length = max(
        [len(txt) for txt in self.other_tokenized])
    self.max_decoder_seq_length = max(
        [len(txt) for txt in self.eng_tokenized])

    print('Number of samples:', self.num_samples)
    print('Number of unique input tokens:', self.num_encoder_tokens)
    print('Number of unique output tokens:', self.num_decoder_tokens)
    print('Max sequence length for inputs:', self.max_encoder_seq_length)
    print('Max sequence length for outputs:', self.max_decoder_seq_length)

    self.input_token_index = dict(
        [(token, i) for i, token in enumerate(self.other_tokens)])
    self.target_token_index = dict(
        [(token, i) for i, token in enumerate(self.eng_tokens)])

    self.encoder_input_data = numpy.zeros(
        (self.num_samples, self.max_encoder_seq_length,
         self.num_encoder_tokens),
        dtype='uint8')
    self.decoder_input_data = numpy.zeros(
        (self.num_samples, self.max_decoder_seq_length,
         self.num_decoder_tokens),
        dtype='uint8')
    self.decoder_target_data = numpy.zeros(
        (self.num_samples, self.max_decoder_seq_length,
         self.num_decoder_tokens),
        dtype='uint8')

    # Create one-hot encoded values directly
    for i, (input_text, target_text) in enumerate(
            zip(self.other_tokenized, self.eng_tokenized)):
        for t, token in enumerate(input_text):
            self.encoder_input_data[i, t, self.input_token_index[token]] = 1.
        for t, token in enumerate(target_text):
            # decoder_target_data is ahead of decoder_input_data by one timestep.
            self.decoder_input_data[i, t, self.target_token_index[token]] = 1.
            if t > 0:
                # decoder_target_data will be ahead by one timestep
                # and will not include the start character.
                self.decoder_target_data[
                    i, t - 1, self.target_token_index[token]] = 1.

    if reverse_order:
        # Flip the time axis once, after all samples are filled; flipping the
        # whole array inside the sample loop would re-reverse rows filled earlier.
        self.encoder_input_data = numpy.flip(self.encoder_input_data, 1)

    # Reverse-lookup token index to decode sequences back to something readable.
    self.reverse_input_token_index = dict(
        (i, token) for token, i in self.input_token_index.items())
    self.reverse_target_token_index = dict(
        (i, token) for token, i in self.target_token_index.items())
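# A minimal, self-contained sketch (hypothetical toy example, not part of the
# classes above) of the teacher-forcing offset used in both seq2seq
# constructors: the decoder input keeps the start token, while the target is
# the same one-hot sequence shifted one step left, so the value at step t of
# the target matches the token read at step t + 1 of the input.
import numpy as np

tokens = ['\t', 'h', 'i', '\n']  # start token, "h", "i", end token
token_index = {tok: i for i, tok in enumerate(tokens)}

decoder_input = np.zeros((1, len(tokens), len(tokens)), dtype='uint8')
decoder_target = np.zeros((1, len(tokens), len(tokens)), dtype='uint8')
for t, tok in enumerate(tokens):
    decoder_input[0, t, token_index[tok]] = 1
    if t > 0:
        # Shift left by one: the target at step t - 1 is the token seen at
        # step t, so the start token never appears in the target.
        decoder_target[0, t - 1, token_index[tok]] = 1

# decoder_target now asks the model to predict 'h', 'i', '\n' at steps 0, 1, 2.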
def __init__(self):
    BaseModel.__init__(self)
    self.name = map_collections["scope"]
def __init__(self):
    BaseModel.__init__(self)
    self.name = map_collections["rol_relation"]