def make(self, theano_kwargs=None): """Construct the Fergus-Recurrent model Model: Input at time t: - Soft attention over embedded lexemes of children of node_t - Embedded lexeme of node_t Compute: - Inputs are fed into a recurrent tree s.t. hidden states travel down branches - node_t's supertag embeddings are retrieved - output of recurrent tree at time t is aligned with each supertag vector - a vectorized probability function computes a distribution Output: - Distribution over supertags for node_t """ if self.igor.embedding_type == "convolutional": make_convolutional_embedding(self.igor) elif self.igor.embedding_type == "token": make_token_embedding(self.igor) elif self.igor.embedding_type == "shallowconv": make_shallow_convolutional_embedding(self.igor) elif self.igor.embedding_type == "minimaltoken": make_minimal_token_embedding(self.igor) else: raise Exception("Incorrect embedding type") spine_input_shape = (self.igor.batch_size, self.igor.max_sequence, self.igor.max_num_supertags) node_input_shape = (self.igor.batch_size, self.igor.max_sequence) dctx_input_shape = (self.igor.batch_size, self.igor.max_sequence, self.igor.max_daughter_size) E, V = self.igor.word_embedding_size, self.igor.word_vocab_size # for word embeddings repeat_N = self.igor.max_num_supertags # for lex repeat_D = self.igor.max_daughter_size mlp_size = self.igor.mlp_size ## dropout parameters p_emb = self.igor.p_emb_dropout p_W = self.igor.p_W_dropout p_U = self.igor.p_U_dropout w_decay = self.igor.weight_decay p_mlp = self.igor.p_mlp_dropout #### make layer inputs spineset_in = Input(batch_shape=spine_input_shape, name='parent_spineset_in', dtype='int32') phead_in = Input(batch_shape=node_input_shape, name='parent_head_input', dtype='int32') dctx_in = Input(batch_shape=dctx_input_shape, name='daughter_context_input', dtype='int32') topology_in = Input(batch_shape=node_input_shape, name='node_topology', dtype='int32') ##### params def predict_params(): return { 'output_dim': 1, 'W_regularizer': l2(w_decay), 'activation': 'relu', 'b_regularizer': l2(w_decay) } ### Layer functions ############# Convert the word indices to vectors F_embedword = Embedding(input_dim=V, output_dim=E, mask_zero=True, W_regularizer=l2(w_decay), dropout=p_emb, name='embedword') if self.igor.saved_embeddings is not None: print("Loading saved embeddings....") F_embedword.initial_weights = [self.igor.saved_embeddings] F_probability = ProbabilityTensor( name='predictions', dense_function=Dense(**predict_params())) ### composition functions F_softdaughters = compose( LambdaMask(lambda x, mask: None, name='remove_attention_mask'), Distribute(SoftAttention(name='softdaughter'), name='distribute_softdaughter'), F_embedword) F_align = compose(Distribute(Dropout(p_mlp)), Distribute(Dense(mlp_size, activation='relu')), concat) F_rtn = compose( RepeatVector(repeat_N, axis=2, name='repeattree'), BranchLSTM(self.igor.rtn_size, name='recurrent_tree1', return_sequences=True)) F_predict = compose( Distribute(F_probability, name='distribute_probability'), Distribute( Dropout(p_mlp) ), ### need a separate one because the 'concat' is different for the two situations LastDimDistribute(Dense(mlp_size, activation='relu')), concat) ############################ new ########################### dctx = F_softdaughters(dctx_in) parent = F_embedword(phead_in) #node_context = F_align([parent, dctx]) #import pdb #pdb.set_trace() ### put into tree aligned_node = F_align([parent, dctx]) node_context = F_rtn([aligned_node, topology_in]) parent_spines = 
self.igor.F_embedspine(spineset_in) ### get probability predictions = F_predict([node_context, parent_spines]) ################## ### make model ################## self.model = Model(input=[dctx_in, phead_in, topology_in, spineset_in], output=predictions, preloaded_data=self.igor.preloaded_data) ################## ### compile model ################## optimizer = Adam(self.igor.LR, clipnorm=self.igor.max_grad_norm, clipvalue=self.igor.grad_clip_threshold) theano_kwargs = theano_kwargs or {} self.model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'], **theano_kwargs) if self.igor.from_checkpoint: self.load_checkpoint_weights() elif not self.igor.in_training: raise Exception("No point in running this without trained weights")
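# ---------------------------------------------------------------------------
# Note: `compose` is a project helper imported elsewhere in this module. A
# minimal sketch consistent with its usage above (right-to-left application,
# so the last argument runs first) might look like the following; the name
# `_compose_sketch` is illustrative, not the project's actual implementation:

from functools import reduce

def _compose_sketch(*functions):
    """Sketch: _compose_sketch(f, g, h)(x) == f(g(h(x)))."""
    return reduce(lambda f, g: lambda *a, **kw: f(g(*a, **kw)), functions)

# Under this reading, F_softdaughters above first embeds the word indices,
# then distributes soft attention over the daughter axis, and finally strips
# the attention mask.
# ---------------------------------------------------------------------------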
def make(self, theano_kwargs=None):
    '''Make the model and compile it.

    Igor's config options control everything.

    Arg:
        theano_kwargs: dict, for debugging theano or submitting something
            custom
    '''
    if self.igor.embedding_type == "convolutional":
        make_convolutional_embedding(self.igor)
    elif self.igor.embedding_type == "token":
        make_token_embedding(self.igor)
    elif self.igor.embedding_type == "shallowconv":
        make_shallow_convolutional_embedding(self.igor)
    elif self.igor.embedding_type == "minimaltoken":
        make_minimal_token_embedding(self.igor)
    else:
        raise ValueError("Unknown embedding type: {}".format(
            self.igor.embedding_type))

    B = self.igor.batch_size
    spine_input_shape = (B, self.igor.max_num_supertags)
    child_input_shape = (B, 1)
    parent_input_shape = (B, 1)

    E, V = self.igor.word_embedding_size, self.igor.word_vocab_size  # for word embeddings
    repeat_N = self.igor.max_num_supertags  # for lex
    mlp_size = self.igor.mlp_size

    ## dropout parameters
    p_emb = self.igor.p_emb_dropout
    p_W = self.igor.p_W_dropout
    p_U = self.igor.p_U_dropout
    w_decay = self.igor.weight_decay
    p_mlp = self.igor.p_mlp_dropout

    def predict_params():
        return {'output_dim': 1,
                'W_regularizer': l2(w_decay),
                'activation': 'relu',
                'b_regularizer': l2(w_decay)}

    dspineset_in = Input(batch_shape=spine_input_shape,
                         name='daughter_spineset_in', dtype='int32')
    pspineset_in = Input(batch_shape=spine_input_shape,
                         name='parent_spineset_in', dtype='int32')
    dhead_in = Input(batch_shape=child_input_shape,
                     name='daughter_head_input', dtype='int32')
    phead_in = Input(batch_shape=parent_input_shape,
                     name='parent_head_input', dtype='int32')
    dspine_in = Input(batch_shape=child_input_shape,
                      name='daughter_spine_input', dtype='int32')
    inputs = [dspineset_in, pspineset_in, dhead_in, phead_in, dspine_in]

    ### Layer functions
    ############# Convert the word indices to vectors
    F_embedword = Embedding(input_dim=V, output_dim=E, mask_zero=True,
                            W_regularizer=l2(w_decay), dropout=p_emb)
    if self.igor.saved_embeddings is not None:
        self.logger.info("+ Cached embeddings loaded")
        F_embedword.initial_weights = [self.igor.saved_embeddings]

    ###### Prediction functions
    ## these functions learn a vector which turns a tensor into a matrix
    ## of probabilities

    ### P(Parent supertag | Child, Context)
    F_parent_predict = ProbabilityTensor(
        name='parent_predictions',
        dense_function=Dense(**predict_params()))
    ### P(Leaf supertag)
    F_leaf_predict = ProbabilityTensor(
        name='leaf_predictions',
        dense_function=Dense(**predict_params()))

    ###### Network functions
    ##### Input word, correct its dimensions (basically squash in a certain way)
    F_singleword = compose(Fix(), F_embedword)
    ##### Input spine, correct dimensions, broadcast across 1st dimension
    F_singlespine = compose(RepeatVector(repeat_N), Fix(),
                            self.igor.F_embedspine)
    ##### Concatenate and map to a single space
    F_alignlex = compose(
        RepeatVector(repeat_N),
        Dropout(p_mlp),
        Dense(mlp_size, activation='relu', name='dense_align_lex'),
        concat)
    F_alignall = compose(
        Distribute(Dropout(p_mlp), name='distribute_align_all_dropout'),
        Distribute(Dense(mlp_size, activation='relu', name='align_all_dense'),
                   name='distribute_align_all_dense'),
        concat)
    ### the leaf pipeline needs a separate stack because its 'concat' is
    ### different from the one in F_alignall
    F_alignleaf = compose(
        Distribute(Dropout(p_mlp * 0.66), name='distribute_leaf_dropout'),
        Distribute(Dense(mlp_size, activation='relu', name='leaf_dense'),
                   name='distribute_leaf_dense'),
        concat)

    ### embed and form all of the inputs into their components
    ### note: spines == supertags; early word choice, not yet refactored
    leaf_spines = self.igor.F_embedspine(dspineset_in)
    pspine_context = self.igor.F_embedspine(pspineset_in)
    dspine_single = F_singlespine(dspine_in)

    dhead = F_singleword(dhead_in)
    phead = F_singleword(phead_in)

    ### combine the lexical material
    lexical_context = F_alignlex([dhead, phead])

    #### P(Parent Supertag | Daughter Supertag, Lexical Context)
    ### we know the daughter spine and want the parent spine;
    ### size is (batch, num_supertags)
    parent_problem = F_alignall(
        [lexical_context, dspine_single, pspine_context])

    ### we don't have the parent, we just have a leaf
    leaf_problem = F_alignleaf([lexical_context, leaf_spines])

    parent_predictions = F_parent_predict(parent_problem)
    leaf_predictions = F_leaf_predict(leaf_problem)
    predictions = [parent_predictions, leaf_predictions]

    theano_kwargs = theano_kwargs or {}
    ## build the model now so checkpoint weights can be loaded
    self.model = Model(input=inputs, output=predictions,
                       preloaded_data=self.igor.preloaded_data,
                       **theano_kwargs)

    if self.igor.from_checkpoint:
        self.load_checkpoint_weights()
    elif not self.igor.in_training:
        raise Exception("No point in running this without trained weights")

    if not self.igor.in_training:
        ### inference-time graph: score every (child, parent) supertag pair
        expanded_children = RepeatVector(repeat_N, axis=2)(leaf_spines)
        expanded_parent = RepeatVector(repeat_N, axis=1)(pspine_context)
        # axis here is arbitrary; it repeats on 1 and 2, but lexical_context
        # has already been repeated once
        expanded_lex = RepeatVector(repeat_N, axis=1)(lexical_context)
        huge_tensor = concat(
            [expanded_lex, expanded_children, expanded_parent])
        densely_aligned = LastDimDistribute(
            F_alignall.get(1).layer)(huge_tensor)
        output_predictions = Distribute(
            F_parent_predict, force_reshape=True)(densely_aligned)

        primary_inputs = [phead_in, dhead_in, pspineset_in, dspineset_in]
        leaf_inputs = [phead_in, dhead_in, dspineset_in]

        self.logger.info("+ Compiling prediction functions")
        self.inner_func = K.Function(primary_inputs + [K.learning_phase()],
                                     output_predictions)
        self.leaf_func = K.Function(leaf_inputs + [K.learning_phase()],
                                    leaf_predictions)
        self.get_ptensor = K.function(primary_inputs + [K.learning_phase()],
                                      [output_predictions])
    else:
        optimizer = Adam(self.igor.LR, clipnorm=self.igor.max_grad_norm,
                         clipvalue=self.igor.grad_clip_threshold)
        self.model.compile(loss="categorical_crossentropy",
                           optimizer=optimizer,
                           metrics=['accuracy'], **theano_kwargs)
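# ---------------------------------------------------------------------------
# Usage sketch (hypothetical; the wrapper class name, the config loader, and
# the training call are assumptions, not shown in this file):
#
#     igor = Igor.from_file('config.json')     # hypothetical config loader
#     tagger = FergusModel(igor)               # hypothetical wrapper class
#     tagger.make(theano_kwargs={'mode': 'FAST_RUN'})
#     tagger.model.fit(...)                    # standard Keras 1.x training
#
# `theano_kwargs` is threaded through to the Keras compile call, so Theano
# options (e.g. compilation mode) can be passed without editing this method.
# ---------------------------------------------------------------------------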