def __init__(self, input_dim, hidden_dim, output_dim, trans_num, diffusion_num, duration, bias=True, rnn_type='GRU', model_type='C', trans_activate_type='L'):
    super(CTGCN, self).__init__()
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.output_dim = output_dim
    self.rnn_type = rnn_type
    self.model_type = model_type
    self.trans_activate_type = trans_activate_type
    self.method_name = 'CTGCN' + '-' + model_type

    assert self.model_type in ['C', 'S']
    assert self.trans_activate_type in ['L', 'N']

    self.duration = duration
    self.trans_num = trans_num
    self.diffusion_num = diffusion_num
    self.bias = bias

    self.mlp_list = nn.ModuleList()
    self.duffision_list = nn.ModuleList()
    for i in range(self.duration):
        if self.model_type == 'C':
            self.mlp_list.append(MLP(input_dim, hidden_dim, hidden_dim, trans_num, bias=bias, activate_type=trans_activate_type))
            self.duffision_list.append(CDN(hidden_dim, output_dim, output_dim, diffusion_num, rnn_type=rnn_type))
        else:  # model_type == 'S'
            self.mlp_list.append(MLP(input_dim, hidden_dim, output_dim, trans_num, bias=bias, activate_type=trans_activate_type))
            self.duffision_list.append(CDN(output_dim, output_dim, output_dim, diffusion_num, rnn_type=rnn_type))

    assert self.rnn_type in ['LSTM', 'GRU']
    if self.rnn_type == 'LSTM':
        self.rnn = nn.LSTM(output_dim, output_dim, num_layers=1, bias=bias, batch_first=True)
    else:
        self.rnn = nn.GRU(output_dim, output_dim, num_layers=1, bias=bias, batch_first=True)
    self.norm = nn.LayerNorm(output_dim)
def __init__(self, name='ra', nimg=2048, nnh=512, na=512, nh=512, nw=512, nout=8843, npatch=30, model_file=None):
    self.name = name
    if model_file is not None:
        with h5py.File(model_file, 'r') as f:
            nimg = f.attrs['nimg']
            nnh = f.attrs['nnh']
            na = f.attrs['na']
            nh = f.attrs['nh']
            nw = f.attrs['nw']
            nout = f.attrs['nout']
            # npatch = f.attrs['npatch']
    self.config = {'nimg': nimg, 'nnh': nnh, 'na': na, 'nh': nh, 'nw': nw, 'nout': nout, 'npatch': npatch}

    # word embedding layer
    self.embedding = Embedding(n_emb=nout, dim_emb=nw, name=self.name+'@embedding')
    # initialization mlp layer
    self.init_mlp = MLP(layer_sizes=[na, 2*nh], output_type='tanh', name=self.name+'@init_mlp')
    self.proj_mlp = MLP(layer_sizes=[nimg, na], output_type='tanh', name=self.name+'@proj_mlp')
    # lstm
    self.lstm = BasicLSTM(dim_x=na+nw, dim_h=nh, name=self.name+'@lstm')
    # prediction mlp
    self.pred_mlp = MLP(layer_sizes=[na+nh+nw, nout], output_type='softmax', name=self.name+'@pred_mlp')
    # attention layer
    self.attention = Attention(dim_item=na, dim_context=na+nw+nh, hsize=nnh, name=self.name+'@attention')

    # inputs
    cap = T.imatrix('cap')
    img = T.tensor3('img')
    self.inputs = [cap, img]

    # go through sequence
    feat = self.proj_mlp.compute(img)
    init_e = feat.mean(axis=1)
    init_state = T.concatenate([init_e, self.init_mlp.compute(init_e)], axis=-1)
    (state, self.p, loss, self.alpha), _ = theano.scan(fn=self.scan_func,
                                                       sequences=[cap[0:-1, :], cap[1:, :]],
                                                       outputs_info=[init_state, None, None, None],
                                                       non_sequences=[feat])

    # loss function
    loss = T.mean(loss)
    self.costs = [loss]

    # layers and parameters
    self.layers = [self.embedding, self.init_mlp, self.proj_mlp, self.attention, self.lstm, self.pred_mlp]
    self.params = sum([l.params for l in self.layers], [])

    # load weights from file, if model_file is not None
    if model_file is not None:
        self.load_weights(model_file)

    # these functions and variables are used in test stage
    self._init_func = None
    self._step_func = None
    self._proj_func = None
    self._feat_shared = theano.shared(np.zeros((1, npatch, nimg)).astype(theano.config.floatX))
def __init__(self, args):
    super(BaseRN, self).__init__()
    self.init_encoders(args)
    self.g_theta = MLP(args.cv_filter + 2 + args.te_hidden, args.basern_gt_hidden, args.basern_gt_hidden, args.basern_gt_layer)
    self.f_phi = MLP(args.basern_gt_hidden, args.basern_fp_hidden, args.a_size, args.basern_fp_layer, args.basern_fp_dropout, last=True)
def __init__(self, name='scene_mlp', layer_sizes=(2048, 1024, 1024, 80), model_file=None):
    self.name = name
    if model_file is not None:
        with h5py.File(model_file, 'r') as f:
            layer_sizes = f.attrs['layer_sizes']
    self.config = {'layer_sizes': layer_sizes}

    # define inputs
    x = T.matrix('x')
    y = T.matrix('y')
    self.inputs = [x, y]

    # define computation graph
    self.mlp = MLP(layer_sizes=layer_sizes, name='mlp', output_type='softmax')
    self.proba = self.mlp.compute(x)
    self.log_proba = T.log(self.proba)

    # define costs
    def kl_divergence(p, q):
        kl = T.mean(T.sum(p * T.log((p + 1e-30) / (q + 1e-30)), axis=1))
        kl += T.mean(T.sum(q * T.log((q + 1e-30) / (p + 1e-30)), axis=1))
        return kl
    kl = kl_divergence(self.proba, y)
    acc = T.mean(T.eq(self.proba.argmax(axis=1), y.argmax(axis=1)))
    self.costs = [kl, acc]

    # layers and parameters
    self.layers = [self.mlp]
    self.params = sum([l.params for l in self.layers], [])

    # load weights from file, if model_file is not None
    if model_file is not None:
        self.load_weights(model_file)
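# A small NumPy check of the symmetric KL cost defined above. Illustrative only
# (not part of the original model code); it mirrors the Theano expression so the
# cost can be sanity-checked on toy distributions.
import numpy as np

def symmetric_kl(p, q, eps=1e-30):
    """Symmetric KL averaged over the batch, matching kl_divergence above."""
    kl = np.mean(np.sum(p * np.log((p + eps) / (q + eps)), axis=1))
    kl += np.mean(np.sum(q * np.log((q + eps) / (p + eps)), axis=1))
    return kl

p = np.array([[0.7, 0.2, 0.1]])
q = np.array([[0.6, 0.3, 0.1]])
print(symmetric_kl(p, q))  # small positive value; exactly 0.0 when p == q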
def fit_mlp(image_size=(28, 28), datasets='../data/mnist.pkl.gz',
            outpath='../output/mnist_lenet.params', n_hidden=500,
            learning_rate=0.01, L1_reg=0.00, L2_reg=0.001, n_epochs=1000,
            batch_size=20, patience=10000, patience_increase=2,
            improvement_threshold=0.995):
    index = T.lscalar()
    x = T.matrix('x')
    y = T.ivector('y')
    classifier = MLP(rng=rng.RandomState(SEED),
                     input=x,
                     n_in=reduce(np.multiply, image_size),
                     n_hidden=n_hidden,
                     n_out=10)
    cost = (classifier.negative_log_likelihood(y)
            + L1_reg * classifier.L1
            + L2_reg * classifier.L2)
    learner = SupervisedMSGD(index, x, y, batch_size, learning_rate,
                             load_data(datasets), outpath, classifier, cost)
    best_validation_loss, best_iter, epoch, elapsed_time = learner.fit(
        n_epochs=n_epochs,
        patience=patience,
        patience_increase=patience_increase,
        improvement_threshold=improvement_threshold)
    display_results(best_validation_loss, elapsed_time, epoch)
    return learner
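# Hypothetical invocation of fit_mlp above; the paths and hyperparameter values
# are placeholders for illustration, not values taken from the original project.
learner = fit_mlp(
    datasets='../data/mnist.pkl.gz',        # gzipped MNIST pickle expected by load_data
    outpath='../output/mnist_mlp.params',   # where the trained parameters are written
    n_hidden=500,
    learning_rate=0.01,
    n_epochs=50,
)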
def __init__(self, in_dims, n_enc, enc_strides, encoder_type):
    """The shared encoder function, mapping input x to hiddens.

    Args:
      in_dims: int, number of input dims (channels for 'conv', features for 'multi').
      n_enc: list, number of hidden units per layer in the encoder.
      enc_strides: list, stride in each layer (only for 'conv' encoder_type).
      encoder_type: str, type of encoder, either 'conv' or 'multi'.
    """
    super(SharedEncoder, self).__init__()
    self._encoder_type = encoder_type

    if encoder_type == 'conv':
        self.encoder = SharedConvModule(in_channels=in_dims,
                                        layers_out_channels=n_enc,
                                        strides=enc_strides,
                                        kernel_size=3,
                                        activation=nn.ReLU())
    elif encoder_type == 'multi':
        self.encoder = MLP(input_dim=in_dims,
                           hidden_dims=n_enc,
                           activation=nn.ReLU(),
                           activate_final=True)
    else:
        raise ValueError('Unknown encoder_type {}'.format(encoder_type))
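# Usage sketch for SharedEncoder above, assuming the 'multi' branch; the layer
# sizes are illustrative, and the MLP / SharedConvModule helpers are defined
# elsewhere in that project.
encoder = SharedEncoder(in_dims=784, n_enc=[512, 256, 128],
                        enc_strides=[], encoder_type='multi')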
def __init__(self, feature_size, field_size, embedding_size, deep_layers_dim,
             dropout_fm, dropout_deep, act_function, batch_norm, l2):
    super(DeepFM, self).__init__()

    self.feature_size = feature_size
    self.field_size = field_size
    self.embedding_size = embedding_size
    self.dropout_fm = dropout_fm
    self.deep_layers_dim = deep_layers_dim
    self.dropout_deep = dropout_deep
    self.act_function = act_function
    self.batch_norm = batch_norm
    self.l2 = l2

    self.embeddings = nn.Embedding(self.feature_size, self.embedding_size)
    self.biases = nn.Embedding(self.feature_size, 1)

    # register as a ModuleList so train/eval mode propagates to the dropout layers
    self.dropout_fm_layers = nn.ModuleList([
        nn.Dropout(dropout_fm[0]),
        nn.Dropout(dropout_fm[1])
    ])

    # deep layers
    in_dim = self.field_size * self.embedding_size
    self.deep_layers = MLP(in_dim, self.deep_layers_dim, self.dropout_deep,
                           self.act_function, self.batch_norm)

    self.predict_layer = nn.Linear(self.deep_layers_dim[-1] + 2, 1, bias=True)

    self.weight_list = [self.predict_layer.weight] + self.deep_layers.weight_list

    self.reset_parameters()
def load_pretrain_weight(self):
    """Load weights from the pre-trained MLP and GMF models."""
    config = self.config
    config['latent_dim'] = config['latent_dim_mlp']
    mlp_model = MLP(config)
    if config['use_cuda']:
        mlp_model.cuda()

    self.embedding_user_mlp.weight.data = mlp_model.embedding_user.weight.data
    self.embedding_item_mlp.weight.data = mlp_model.embedding_item.weight.data
    for i in range(len(self.fc_layers)):
        self.fc_layers[i].weight.data = mlp_model.fc_layers[i].weight.data

    config['latent_dim'] = config['latent_dim_mf']
    gmf_model = GMF(config)
    if config['use_cuda']:
        gmf_model.cuda()
    self.embedding_user_mf.weight.data = gmf_model.embedding_user.weight.data
    self.embedding_item_mf.weight.data = gmf_model.embedding_item.weight.data

    # fuse the two output layers, weighted by alpha
    self.affine_output.weight.data = torch.cat([
        config['alpha'] * mlp_model.affine_output.weight.data,
        (1 - config['alpha']) * gmf_model.affine_output.weight.data
    ], dim=0)
    self.affine_output.bias.data = (config['alpha'] * mlp_model.affine_output.bias.data
                                    + (1 - config['alpha']) * gmf_model.affine_output.bias.data)
def __init__(self, args):
    super(Sarn, self).__init__()
    self.init_encoders(args)
    self.h_psi = MLP(args.cv_filter + 2 + args.te_hidden, args.sarn_hp_hidden, 1, args.sarn_hp_layer, last=True)
    self.g_theta = MLP((args.cv_filter + 2) * 2 + args.te_hidden, args.sarn_gt_hidden, args.sarn_gt_hidden, args.sarn_gt_layer)
    self.f_phi = MLP(args.sarn_gt_hidden, args.sarn_fp_hidden, args.a_size, args.sarn_fp_layer, args.sarn_fp_dropout, last=True)
def main(BERT_MODEL='bert-base-uncased',
         model_file='./models/bert-base-uncased.bin',
         data_file='./data/hotpot_dev_distractor_v1.json',
         max_new_nodes=5, sys2='xattn', attn_layers=1):
    setting = 'distractor' if data_file.find('distractor') >= 0 else 'fullwiki'
    with open(data_file, 'r') as fin:
        dataset = json.load(fin)
    tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)
    device = torch.device('cpu') if not torch.cuda.is_available() else torch.device('cuda')

    print('Loading model from {}'.format(model_file))
    model_state_dict = torch.load(model_file)
    model1 = BertForMultiHopQuestionAnswering.from_pretrained(
        BERT_MODEL, state_dict=model_state_dict['params1'])
    hidden_size = model1.config.hidden_size
    model2 = CognitiveGNN(hidden_size, model1.config, sys2)
    # use the function arguments (sys2, attn_layers) rather than a global args object
    if sys2 == 'xattn':
        from model import XAttn
        model2.gcn = XAttn(hidden_size, model1.config, n_layers=attn_layers)
    elif sys2 == 'mlp':
        from layers import MLP
        model2.gcn = MLP((hidden_size, hidden_size, 1))
    model2.load_state_dict(model_state_dict['params2'])

    sp, answer, graphs = {}, {}, {}
    print('Start inference... on {} GPUs'.format(torch.cuda.device_count()))
    model1 = torch.nn.DataParallel(model1, device_ids=range(torch.cuda.device_count()))
    model1.to(device).eval()
    model2.to(device).eval()

    with torch.no_grad():
        for data in tqdm(dataset):
            gold, ans, graph_ret, ans_nodes = cognitive_graph_propagate(
                tokenizer, data, model1, model2, device,
                setting=setting, max_new_nodes=max_new_nodes)
            sp[data['_id']] = list(gold)
            answer[data['_id']] = ans
            graphs[data['_id']] = graph_ret + ['answer_nodes: ' + ', '.join(ans_nodes)]

    pred_file = data_file.replace('.json', '_pred.json')
    with open(pred_file, 'w') as fout:
        json.dump({'answer': answer, 'sp': sp, 'graphs': graphs}, fout)
def __init__(self, input_dim, hidden_dim, output_dim, layer_num, duration, bias=True, activate_type='N'):
    super(MLPClassifier, self).__init__()
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.output_dim = output_dim
    self.layer_num = layer_num
    self.duration = duration
    self.bias = bias
    self.activate_type = activate_type

    self.mlp_list = nn.ModuleList()
    for i in range(self.duration):
        self.mlp_list.append(MLP(input_dim, hidden_dim, output_dim, layer_num, bias=bias, activate_type=activate_type))
def __init__(self, args):
    super(RelationalNetwork, self).__init__()
    self.init_encoders(args)
    self.g_theta = MLP((args.cv_filter + 2) * 2 + args.te_hidden, args.rn_gt_hidden, args.rn_gt_hidden, args.rn_gt_layer)
    self.f_phi = MLP(args.rn_gt_hidden, args.rn_fp_hidden, args.a_size, args.rn_fp_layer, args.rn_fp_dropout, last=True)
    if args.cv_pretrained:
        self.visual_encoder = nn.Sequential(
            nn.Conv2d(1024, args.cv_filter, 3, 2, padding=1),
            nn.BatchNorm2d(args.cv_filter),
            nn.ReLU()
            # nn.Conv2d(args.cv_filter, args.cv_filter, 3, 2, padding=1),
            # nn.BatchNorm2d(args.cv_filter),
            # nn.ReLU()
        )
    self.init()
def __init__(self, input_dim, hidden_dim, output_dim, trans_num, diffusion_num, bias=True, rnn_type='GRU', model_type='C', trans_activate_type='L'):
    super(CGCN, self).__init__()
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.output_dim = output_dim
    self.trans_num = trans_num
    self.diffusion_num = diffusion_num
    self.bias = bias
    self.rnn_type = rnn_type
    self.model_type = model_type
    self.trans_activate_type = trans_activate_type
    self.method_name = 'CGCN' + '-' + model_type

    assert self.model_type in ['C', 'S']
    assert self.trans_activate_type in ['L', 'N']

    if self.model_type == 'C':
        # self.mlp = nn.Linear(input_dim, hidden_dim, bias=bias)
        self.mlp = MLP(input_dim, hidden_dim, hidden_dim, trans_num, bias=bias, activate_type=trans_activate_type)
        self.duffision = CDN(hidden_dim, output_dim, output_dim, diffusion_num, rnn_type=rnn_type)
    else:
        self.mlp = MLP(input_dim, hidden_dim, output_dim, trans_num, bias=bias, activate_type=trans_activate_type)
        self.duffision = CDN(output_dim, output_dim, output_dim, diffusion_num, rnn_type=rnn_type)
def __init__(self, sequence_length, n_hidden_rnn, n_in_mlp, n_hidden_mlp, n_out,
             L1_reg, L2_reg, learning_rate, word_embedding, non_static):
    """
    question-answer rnn model init and definition.
    :param sequence_length: sequence length
    :param n_hidden_rnn: rnn hidden units
    :param n_in_mlp: mlp input size
    :param n_hidden_mlp: mlp hidden size
    :param n_out: mlp output size
    :param L1_reg: mlp L1 regularization weight
    :param L2_reg: mlp L2 regularization weight
    :param learning_rate: learning rate for update
    :param word_embedding: word embedding matrix
    :param non_static: bool, update embedding or not
    """
    self.lr = learning_rate
    self.word_embedding = word_embedding

    # define the placeholders
    with tf.name_scope('placeholder'):
        self.q_input = tf.placeholder(tf.int64, shape=[None, sequence_length], name='query_input')
        self.a_input = tf.placeholder(tf.int64, shape=[None, sequence_length], name='answer_input')
        self.l_input = tf.placeholder(tf.int64, shape=[None], name='label_input')  # one-hot -> [batch_size, n_out]
        self.keep_prop = tf.placeholder(tf.float32, name='keep_prop')

    # transfer input to vec with embedding.
    with tf.name_scope("embedding"):
        _word_embedding = tf.get_variable(name='word_emb', shape=self.word_embedding.shape, dtype=tf.float32,
                                          initializer=tf.constant_initializer(self.word_embedding),
                                          trainable=non_static)
        q_embedding = tf.nn.embedding_lookup(_word_embedding, self.q_input)
        a_embedding = tf.nn.embedding_lookup(_word_embedding, self.a_input)
        print("input shape(embedding): ", q_embedding.get_shape())

    # define rnn model.
    with tf.variable_scope("RNN"):
        # rnn layer
        rnn_layer = RNNModule(n_hidden_rnn, cell="GRU")
        q_sentence_vec, a_sentence_vec = rnn_layer(q_embedding, a_embedding)

    # define classifier.
    with tf.name_scope("MLPDrop"):
        interact_layer = InteractLayer(n_hidden_rnn, n_hidden_rnn, dim=n_in_mlp)
        qa_vec = interact_layer(q_sentence_vec, a_sentence_vec)
        bn_layer = BatchNormLayer(n_in=n_in_mlp, inputs=qa_vec)
        classifier = MLP(bn_layer.out, n_in_mlp, n_hidden_mlp, n_out)
        # classifier = MLPDropout(bn_layer.out, n_in_mlp, n_hidden_mlp, n_out, keep_prop=self.keep_prop)

    # define cost, optimizer and output.
    self.pred_prob = classifier.pred_prob()
    self.error = classifier.errors(self.l_input)
    self.cost = classifier.cross_entropy(self.l_input) + L1_reg * classifier.L1 + L2_reg * classifier.L2_sqr
    self.optimizer = tf.train.RMSPropOptimizer(self.lr, 0.9).minimize(self.cost)
def __init__(self, num_features, num_factors, act_function, layers, batch_norm, drop_prob, l2, pre_trained_FM=None):
    """
    num_features: number of features,
    num_factors: number of hidden factors,
    act_function: activation function for the MLP layers,
    layers: list of dimensions of the deep layers,
    batch_norm: bool type, whether to use batch norm or not,
    drop_prob: list of dropout rates for the FM and MLP parts,
    pre_trained_FM: the pre-trained FM weights.
    """
    super(NFM, self).__init__()
    self.num_features = num_features
    self.num_factors = num_factors
    self.act_function = act_function
    self.layers = layers
    self.batch_norm = batch_norm
    self.drop_prob = drop_prob
    self.l2 = l2
    self.pre_trained_FM = pre_trained_FM

    self.embeddings = nn.Embedding(num_features, num_factors)
    self.biases = nn.Embedding(num_features, 1)
    self.global_bias = nn.Parameter(torch.tensor([0.0]))

    fm_modules = []
    if self.batch_norm:
        fm_modules.append(nn.BatchNorm1d(num_factors))
    fm_modules.append(nn.Dropout(drop_prob[0]))
    self.FM_layers = nn.Sequential(*fm_modules)

    # deep layers
    self.deep_layers = MLP(num_factors, self.layers, self.drop_prob, self.act_function, self.batch_norm)

    predict_size = layers[-1] if layers else num_factors
    self.prediction = nn.Linear(predict_size, 1, bias=False)

    self.reset_parameters()
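# In the standard NFM formulation, the FM_layers block above is applied to the
# second-order bi-interaction pooling of the feature embeddings. The forward pass
# of this class is not shown here; the following is a minimal sketch of that term.
import torch

def bi_interaction(feature_embeddings: torch.Tensor) -> torch.Tensor:
    """Second-order FM pooling: 0.5 * ((sum_i v_i)^2 - sum_i v_i^2).

    feature_embeddings: [batch_size, num_fields, num_factors]
    returns:            [batch_size, num_factors]
    """
    sum_then_square = feature_embeddings.sum(dim=1) ** 2
    square_then_sum = (feature_embeddings ** 2).sum(dim=1)
    return 0.5 * (sum_then_square - square_then_sum)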
def main(rnn_type='simple_rnn'):
    '''train a language model on character data'''
    assert rnn_type in ('simple_rnn', 'lstm')

    model = Model()
    lookup = model.add_lookup_parameters((VOCAB_SIZE, INPUT_DIM))
    rnn = rnn_types[rnn_type](model, INPUT_DIM, HIDDEN_DIM)
    mlp = MLP(model, HIDDEN_DIM, HIDDEN_DIM, VOCAB_SIZE,
              output_nonlinearity='softmax', num_layers=NUMBER_OF_LAYERS)

    # our single training example
    sentence = TRAINING_SENTENCE  # "a quick brown fox jumped over the lazy dog"

    train(model, rnn, mlp, lookup, sentence)
def __init__(self, feature_size, field_size, embedding_size, deep_layers_dim,
             cin_layers_size, cin_split_half, dropout_deep, deep_act, cin_act,
             batch_norm, l2):
    super(XDeepFM, self).__init__()

    self.feature_size = feature_size
    self.field_size = field_size
    self.embedding_size = embedding_size
    self.deep_layers_dim = deep_layers_dim
    self.cin_layers_size = cin_layers_size
    self.cin_split_half = cin_split_half
    self.dropout_deep = dropout_deep
    self.l2 = l2
    self.deep_act = deep_act
    self.cin_act = cin_act
    self.batch_norm = batch_norm

    self.embeddings = nn.Embedding(self.feature_size, self.embedding_size)
    self.biases = nn.Embedding(self.feature_size, 1)
    self.global_bias = nn.Parameter(torch.tensor([0.0]))
    self.weight_list = []

    # deep layers
    in_dim = self.field_size * self.embedding_size
    self.deep_layers = MLP(in_dim, self.deep_layers_dim, self.dropout_deep, self.deep_act, self.batch_norm)
    self.deep_linear = nn.Linear(self.deep_layers_dim[-1], 1, bias=False)

    # CIN
    if cin_split_half:
        self.feature_map_num = sum(cin_layers_size[:-1]) // 2 + cin_layers_size[-1]
    else:
        self.feature_map_num = sum(cin_layers_size)
    self.cin = CIN(self.field_size, self.cin_layers_size, self.cin_act, cin_split_half)
    self.cin_linear = nn.Linear(self.feature_map_num, 1, bias=False)

    # construct weight list
    self.weight_list.append(self.biases.weight)
    self.weight_list += self.deep_layers.weight_list
    self.weight_list.append(self.deep_linear.weight)
    self.weight_list += self.cin.weight_list
    self.weight_list.append(self.cin_linear.weight)

    self.reset_parameters()
def __init__(self, hps):
    super(MixPoetAUS, self).__init__()
    self.hps = hps

    self.vocab_size = hps.vocab_size
    self.n_class1 = hps.n_class1
    self.n_class2 = hps.n_class2
    self.emb_size = hps.emb_size
    self.hidden_size = hps.hidden_size
    self.factor_emb_size = hps.factor_emb_size
    self.latent_size = hps.latent_size
    self.context_size = hps.context_size
    self.poem_len = hps.poem_len
    self.sens_num = hps.sens_num
    self.sen_len = hps.sen_len

    self.pad_idx = hps.pad_idx
    self.bos_idx = hps.bos_idx
    self.bos_tensor = torch.tensor(hps.bos_idx, dtype=torch.long, device=device).view(1, 1)

    self.gumbel_tool = GumbelSampler()

    # build positional inputs to distinguish lines at different positions
    # [sens_num, sens_num], each line is a one-hot input
    self.pos_inps = F.one_hot(torch.arange(0, self.sens_num), self.sens_num)
    self.pos_inps = self.pos_inps.type(torch.FloatTensor).to(device)

    # ----------------------------
    # build components
    self.layers = nn.ModuleDict()
    self.layers['embed'] = nn.Embedding(self.vocab_size, self.emb_size, padding_idx=self.pad_idx)

    self.layers['encoder'] = BidirEncoder(self.emb_size, self.hidden_size, drop_ratio=hps.drop_ratio)

    # p(x|z, w, y)
    self.layers['decoder'] = Decoder(self.hidden_size, self.hidden_size, drop_ratio=hps.drop_ratio)

    # RNN to combine characters to form the representation of a word
    self.layers['word_encoder'] = BidirEncoder(self.emb_size, self.emb_size, cell='Elman', drop_ratio=hps.drop_ratio)

    # p(y_1|x,w), p(y_2|x,w)
    self.layers['cl_xw1'] = MLP(self.hidden_size*2+self.emb_size*2,
                                layer_sizes=[self.hidden_size, 128, self.n_class1],
                                activs=['relu', 'relu', None], drop_ratio=hps.drop_ratio)
    self.layers['cl_xw2'] = MLP(self.hidden_size*2+self.emb_size*2,
                                layer_sizes=[self.hidden_size, 128, self.n_class2],
                                activs=['relu', 'relu', None], drop_ratio=hps.drop_ratio)

    # p(y_1|w), p(y_2|w)
    self.layers['cl_w1'] = MLP(self.emb_size*2,
                               layer_sizes=[self.emb_size, 64, self.n_class1],
                               activs=['relu', 'relu', None], drop_ratio=hps.drop_ratio)
    self.layers['cl_w2'] = MLP(self.emb_size*2,
                               layer_sizes=[self.emb_size, 64, self.n_class2],
                               activs=['relu', 'relu', None], drop_ratio=hps.drop_ratio)

    # factor embedding
    self.layers['factor_embed1'] = nn.Embedding(self.n_class1, self.factor_emb_size)
    self.layers['factor_embed2'] = nn.Embedding(self.n_class2, self.factor_emb_size)

    # posteriori and prior
    self.layers['prior'] = PriorGenerator(
        self.emb_size*2+int(self.latent_size//2),
        self.latent_size, self.n_class1, self.n_class2, self.factor_emb_size)
    self.layers['posteriori'] = PosterioriGenerator(
        self.hidden_size*2+self.emb_size*2, self.latent_size,
        self.n_class1, self.n_class2, self.factor_emb_size)

    # for adversarial training
    self.layers['discriminator'] = Discriminator(self.n_class1, self.n_class2,
                                                 self.factor_emb_size, self.latent_size,
                                                 drop_ratio=hps.drop_ratio)

    # --------------
    # project the decoder hidden state to a vocabulary-size output logit
    self.layers['out_proj'] = nn.Linear(hps.hidden_size, hps.vocab_size)

    # MLP for calculating the initial decoder state
    # NOTE: Here we use a two-dimensional one-hot vector as the input length embedding o_i,
    #   since there are only two kinds of line length, 5 chars and 7 chars, for Chinese
    #   classical quatrains.
    self.layers['dec_init'] = MLP(self.latent_size+self.emb_size*2+self.factor_emb_size*2,
                                  layer_sizes=[self.hidden_size-6],
                                  activs=['tanh'], drop_ratio=hps.drop_ratio)

    self.layers['map_x'] = MLP(self.context_size+self.emb_size,
                               layer_sizes=[self.hidden_size],
                               activs=['tanh'], drop_ratio=hps.drop_ratio)

    # update the context vector
    self.layers['context'] = ContextLayer(self.hidden_size, self.context_size)

    # two annealing parameters
    self.__tau = 1.0
    self.__teach_ratio = 1.0

    # only for pre-training
    self.layers['dec_init_pre'] = MLP(self.hidden_size*2+self.emb_size*2,
                                      layer_sizes=[self.hidden_size-6],
                                      activs=['tanh'], drop_ratio=hps.drop_ratio)
adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
adj = adj_train
adj_label = adj_train + sp.eye(adj_train.shape[0])

adj_norm = torch.from_numpy(preprocess_graph(adj))
adj_label = torch.from_numpy(adj_label.todense().astype(np.float32))
feat = torch.from_numpy(feat.todense().astype(np.float32))

############## init model ##############
gcn_vae = GraphAE(features_dim, hidden_dim, out_dim, bias=False, dropout=0.0)
optimizer_vae = torch.optim.Adam(gcn_vae.parameters(), lr=1e-2)
mlp = MLP(features_dim, hidden_dim, out_dim, dropout=0.0)
optimizer_mlp = torch.optim.Adam(mlp.parameters(), lr=1e-2)

for batch_idx in range(num_iters):
    # train GCN
    optimizer_vae.zero_grad()
    gcn_vae.train()
    z = gcn_vae(adj_norm, feat)
    adj_h = torch.mm(z, z.t())
    vae_train_loss = reconstruction_loss(adj_label, adj_h, norm)
    vae_train_loss.backward()
    optimizer_vae.step()

    # train MLP
    optimizer_mlp.zero_grad()
    mlp.train()
        adj_train = adj_select[:cut_idx, :cut_idx]
        X_train = X_select[:cut_idx, :]
        return adj_train, X_train, adj_select, X_select
    else:
        return adj_select, X_select


g_adj, X, g_adj_all, X_all = read_citation_dat(cite_data, with_test=True)
features_dim = X.shape[1]

params = {'batch_size': 1, 'shuffle': True, 'num_workers': 2}

############### Init models ###############
gcn_vae = GraphVae(features_dim, hidden_dim, out_dim, dropout=dropout)
mlp = MLP(features_dim, hidden_dim, out_dim)
optimizer_vae = torch.optim.Adam(gcn_vae.parameters(), lr=1e-2)
optimizer_mlp = torch.optim.Adam(mlp.parameters(), lr=1e-2)

train_loss = 0
cache = None

################ training loop #####################
adj = torch.from_numpy(g_adj)
feat = torch.from_numpy(X)

for batch_idx in range(num_permutation):
    if adj.size()[0] <= size_update:
        print("sample size {} too small, skipped!".format(adj.size()[0]))
        continue
def __init__(self, concept_num, hidden_dim, embedding_dim, edge_type_num, graph_type,
             graph=None, graph_model=None, dropout=0.5, bias=True):
    super(GKT, self).__init__()
    self.concept_num = concept_num
    self.hidden_dim = hidden_dim
    self.embedding_dim = embedding_dim
    self.edge_type_num = edge_type_num

    assert graph_type in ['Dense', 'Transition', 'DKT', 'PAM', 'MHA', 'VAE']
    self.graph_type = graph_type
    if graph_type in ['Dense', 'Transition', 'DKT']:
        assert edge_type_num == 2
        assert graph is not None and graph_model is None
        self.graph = nn.Parameter(graph)  # [concept_num, concept_num]
        self.graph.requires_grad = False  # fix parameter
        self.graph_model = graph_model
    else:  # ['PAM', 'MHA', 'VAE']
        assert graph is None
        self.graph = graph  # None
        if graph_type == 'PAM':
            assert graph_model is None
            self.graph = nn.Parameter(torch.rand(concept_num, concept_num))
        else:
            assert graph_model is not None
        self.graph_model = graph_model

    # one-hot feature and question
    self.one_hot_feat = torch.eye(2 * self.concept_num)
    self.one_hot_q = torch.eye(self.concept_num)
    self.one_hot_q = torch.cat((self.one_hot_q, torch.zeros(1, self.concept_num)), dim=0)
    # concept and concept & response embeddings
    self.emb_x = nn.Embedding(2 * concept_num, embedding_dim)
    # the last embedding is used for padding, so dim + 1
    self.emb_c = nn.Embedding(concept_num + 1, embedding_dim, padding_idx=-1)

    # f_self function and f_neighbor functions
    mlp_input_dim = hidden_dim + embedding_dim
    self.f_self = MLP(mlp_input_dim, hidden_dim, hidden_dim, dropout=dropout, bias=bias)
    self.f_neighbor_list = nn.ModuleList()
    if graph_type in ['Dense', 'Transition', 'DKT', 'PAM']:
        # f_in and f_out functions
        self.f_neighbor_list.append(MLP(2 * mlp_input_dim, hidden_dim, hidden_dim, dropout=dropout, bias=bias))
        self.f_neighbor_list.append(MLP(2 * mlp_input_dim, hidden_dim, hidden_dim, dropout=dropout, bias=bias))
    else:  # ['MHA', 'VAE']
        for i in range(edge_type_num):
            self.f_neighbor_list.append(MLP(2 * mlp_input_dim, hidden_dim, hidden_dim, dropout=dropout, bias=bias))

    # Erase & Add Gate
    self.erase_add_gate = EraseAddGate(hidden_dim, concept_num)
    # Gated Recurrent Unit
    self.gru = nn.GRUCell(hidden_dim, hidden_dim, bias=bias)
    # prediction layer
    self.predict = nn.Linear(hidden_dim, 1, bias=bias)
def __init__(self, input_size, config, n_layers=1):
    super(XAttn, self).__init__()
    layer = MPLayer(input_size, config)
    self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(n_layers)])
    self.predict = MLP(input_sizes=(input_size, input_size, 1))
def __init__(self, filter, last_filter, hidden, last, layer):
    super(FilmClassifier, self).__init__()
    self.conv = nn.Conv2d(filter + 2, last_filter, 1, 1, 0)
    # self.pool = nn.MaxPool2d((input_h, input_w))
    self.mlp = MLP(last_filter, hidden, last, layer, last=True)
    self.init()
def __init__(self, z_shape, output_shape, decoder_type, n_dec, dec_up_strides, n_x, n_y,
             shared_encoder_conv_shapes=None):
    """Module initialization.

    Args:
      z_shape: int, dimensionality of the latent z (input to the MLP decoders).
      output_shape: list, shape of output (not including batch dimension).
      decoder_type: str, 'single', 'multi', or 'deconv'.
      n_dec: list, number of hidden units per layer in the decoder.
      dec_up_strides: list, stride in each layer (only for 'deconv' decoder_type).
      n_x: int, number of dims of x.
      n_y: int, number of dims of y.
      shared_encoder_conv_shapes: the shapes of the activations of the intermediate
        layers of the encoder.

    Returns:
      Instance of the LatentDecoder.
    """
    super(LatentDecoder, self).__init__()

    self.decoder_type = decoder_type
    self.n_y = n_y
    n_out_factor = 1
    self.out_shape = list(output_shape)

    # Upsample layer (deconvolutional, bilinear, ...).
    if decoder_type == 'deconv':
        # First, check that the encoder is convolutional too (needed for batchnorm).
        if shared_encoder_conv_shapes is None:
            raise ValueError('Shared encoder does not contain conv_shapes.')

        num_output_channels = output_shape[-1]
        self.decoder = ConvDecoder(
            output_dims=n_dec,
            kernel_size=3,
            activation=nn.ReLU(),
            dec_up_strides=dec_up_strides,
            enc_conv_shapes=shared_encoder_conv_shapes,
            n_c=num_output_channels * n_out_factor,
            method=decoder_type)

    # Multiple MLP decoders, one for each component.
    # NOTE: the 'multi' option is not in working condition and probably never will be.
    elif decoder_type == 'multi':
        self.decoder = []
        for k in range(n_y):
            mlp_decoding = MLP(
                input_dim=z_shape,
                hidden_dims=n_dec + [n_x * n_out_factor],
                activation=nn.ReLU(),
                activate_final=False)
            self.decoder.append(mlp_decoding)

    # Single (shared among components) MLP decoder.
    elif decoder_type == 'single':
        self.decoder = MLP(
            input_dim=z_shape,
            hidden_dims=n_dec + [n_x * n_out_factor],
            activation=nn.ReLU(),
            activate_final=False)
    else:
        raise ValueError(f'Unknown decoder_type {decoder_type}')
def __init__(self, hps, device):
    super(WorkingMemoryModel, self).__init__()
    self.hps = hps
    self.device = device

    self.global_trace_size = hps.global_trace_size
    self.topic_trace_size = hps.topic_trace_size
    self.topic_slots = hps.topic_slots
    self.his_mem_slots = hps.his_mem_slots

    self.vocab_size = hps.vocab_size
    self.mem_size = hps.mem_size
    self.sens_num = hps.sens_num

    self.pad_idx = hps.pad_idx
    self.bos_tensor = torch.tensor(hps.bos_idx, dtype=torch.long, device=device)

    # ----------------------------
    # build components
    self.layers = nn.ModuleDict()
    self.layers['word_embed'] = nn.Embedding(hps.vocab_size, hps.word_emb_size, padding_idx=hps.pad_idx)

    # NOTE: we use 33 fixed phonology categories: 0~32
    #   please refer to preprocess.py for more details
    self.layers['ph_embed'] = nn.Embedding(33, hps.ph_emb_size)
    self.layers['len_embed'] = nn.Embedding(hps.sen_len, hps.len_emb_size)

    self.layers['encoder'] = BidirEncoder(hps.word_emb_size, hps.hidden_size, drop_ratio=hps.drop_ratio)
    self.layers['decoder'] = Decoder(hps.hidden_size, hps.hidden_size, drop_ratio=hps.drop_ratio)

    # project the decoder hidden state to a vocabulary-size output logit
    self.layers['out_proj'] = nn.Linear(hps.hidden_size, hps.vocab_size)

    # update the context vector
    self.layers['global_trace_updater'] = ContextLayer(hps.hidden_size, hps.global_trace_size)
    self.layers['topic_trace_updater'] = MLP(self.mem_size+self.topic_trace_size,
                                             layer_sizes=[self.topic_trace_size],
                                             activs=['tanh'], drop_ratio=hps.drop_ratio)

    # MLPs for calculating the initial decoder state
    self.layers['dec_init'] = MLP(hps.hidden_size*2, layer_sizes=[hps.hidden_size],
                                  activs=['tanh'], drop_ratio=hps.drop_ratio)
    self.layers['key_init'] = MLP(hps.hidden_size*2, layer_sizes=[hps.hidden_size],
                                  activs=['tanh'], drop_ratio=hps.drop_ratio)

    # history memory reading and writing layers
    # query: concatenation of hidden state, global_trace and topic_trace
    self.layers['memory_read'] = AttentionReader(
        d_q=hps.hidden_size+self.global_trace_size+self.topic_trace_size+self.topic_slots,
        d_v=hps.mem_size, drop_ratio=hps.attn_drop_ratio)
    self.layers['memory_write'] = AttentionWriter(hps.mem_size+self.global_trace_size, hps.mem_size)

    # NOTE: a layer to compress the encoder hidden states to a smaller size,
    #   allowing a larger number of slots
    self.layers['mem_compress'] = MLP(hps.hidden_size*2, layer_sizes=[hps.mem_size],
                                      activs=['tanh'], drop_ratio=hps.drop_ratio)

    # [inp, attns, ph_inp, len_inp, global_trace]
    self.layers['merge_x'] = MLP(
        hps.word_emb_size+hps.ph_emb_size+hps.len_emb_size+hps.global_trace_size+hps.mem_size,
        layer_sizes=[hps.hidden_size], activs=['tanh'], drop_ratio=hps.drop_ratio)

    # two annealing parameters
    self._tau = 1.0
    self._teach_ratio = 0.8

    # ---------------------------------------------------------
    # only used for pre-training
    self.layers['dec_init_pre'] = MLP(hps.hidden_size*2, layer_sizes=[hps.hidden_size],
                                      activs=['tanh'], drop_ratio=hps.drop_ratio)
    self.layers['merge_x_pre'] = MLP(
        hps.word_emb_size+hps.ph_emb_size+hps.len_emb_size,
        layer_sizes=[hps.hidden_size], activs=['tanh'], drop_ratio=hps.drop_ratio)
def __init__(self, name='gnic', nimg=2048, nh=512, nw=512, nout=8843, model_file=None):
    self.name = name
    if model_file is not None:
        with h5py.File(model_file, 'r') as f:
            nimg = f.attrs['nimg']
            nh = f.attrs['nh']
            nw = f.attrs['nw']
            nout = f.attrs['nout']
    self.config = {'nimg': nimg, 'nh': nh, 'nw': nw, 'nout': nout}

    # word embedding layer
    self.embedding = Embedding(n_emb=nout, dim_emb=nw, name=self.name + '@embedding')
    # initialization mlp layer
    self.proj_mlp = MLP(layer_sizes=[nimg, 2 * nh], output_type='tanh', name=self.name + '@proj_mlp')
    # lstm
    self.lstm = BasicLSTM(dim_x=nw, dim_h=nh, name=self.name + '@lstm')
    # prediction mlp
    self.pred_mlp = MLP(layer_sizes=[nh + nw, nout], output_type='softmax', name=self.name + '@pred_mlp')

    # inputs
    cap = T.imatrix('cap')
    img = T.matrix('img')
    self.inputs = [cap, img]

    # go through sequence
    init_state = self.proj_mlp.compute(img)
    (state, self.p, loss), _ = theano.scan(fn=self.scan_func,
                                           sequences=[cap[0:-1, :], cap[1:, :]],
                                           outputs_info=[init_state, None, None])

    # loss function
    loss = T.mean(loss)
    self.costs = [loss]

    # layers and parameters
    self.layers = [self.embedding, self.proj_mlp, self.lstm, self.pred_mlp]
    self.params = sum([l.params for l in self.layers], [])

    # load weights from file, if model_file is not None
    if model_file is not None:
        self.load_weights(model_file)

    # these functions are used in test stage
    self._init_func = None
    self._step_func = None
def __init__(self, word_embedding, img_h, img_w, filter_windows, feature_maps,
             n_in, n_hidden, n_out, L1_reg, L2_reg, learning_rate, non_static=False):
    """
    question-answer cnn model init and definition.
    :param word_embedding: word embedding matrix
    :param img_h: max sentence length.
    :param img_w: embedding dim.
    :param filter_windows: filter heights, e.g. [1, 2, 3]
    :param feature_maps: number of filters per window size.
    :param n_in: mlp input size.
    :param n_hidden: mlp hidden size.
    :param n_out: mlp output size.
    :param L1_reg: mlp L1 regularization weight.
    :param L2_reg: mlp L2 regularization weight.
    :param learning_rate: learning rate for update.
    :param non_static: bool, update embedding or not.
    """
    self.lr = learning_rate
    self.word_embedding = word_embedding
    self.num_feature_maps = feature_maps * len(filter_windows)

    # define the placeholders
    with tf.name_scope('placeholder'):
        self.q_input = tf.placeholder(tf.int64, shape=[None, img_h], name='query_input')
        self.a_input = tf.placeholder(tf.int64, shape=[None, img_h], name='answer_input')
        self.l_input = tf.placeholder(tf.int64, shape=[None], name='label_input')  # one-hot -> [batch_size, n_out]
        self.keep_prop = tf.placeholder(tf.float32, name="keep_prop")  # dropout keep probability

    # transfer input to vec with embedding.
    with tf.name_scope("embedding"):
        _word_embedding = tf.get_variable(name='word_emb', shape=self.word_embedding.shape, dtype=tf.float32,
                                          initializer=tf.constant_initializer(self.word_embedding),
                                          trainable=non_static)
        q_embedding = tf.nn.embedding_lookup(_word_embedding, self.q_input)
        a_embedding = tf.nn.embedding_lookup(_word_embedding, self.a_input)
        q_embedding_expanded = tf.expand_dims(q_embedding, -1)
        a_embedding_expanded = tf.expand_dims(a_embedding, -1)
        print("input shape(embedding expanded): ", q_embedding_expanded.get_shape())

    # define cnn model for qa.
    with tf.variable_scope("model_layers"):
        inception_module = InceptionModule(img_h, img_w, filter_windows, feature_maps)
        q_sentence_vec, a_sentence_vec = inception_module(q_embedding_expanded, a_embedding_expanded)
        interact_layer = InteractLayer(self.num_feature_maps, self.num_feature_maps, dim=n_in)
        qa_vec = interact_layer(q_sentence_vec, a_sentence_vec)
        bn_layer = BatchNormLayer(n_in=n_in, inputs=qa_vec)

    # define the classifier.
    with tf.name_scope("mlp"):
        classifier = MLP(bn_layer.out, n_in, n_hidden, n_out)
        # classifier = MLPDropout(bn_layer.out, n_in, n_hidden, n_out, keep_prop=self.keep_prop)

    # define cost, optimizer and output.
    self.pred_prob = classifier.pred_prob()
    self.error = classifier.errors(self.l_input)
    self.cost = classifier.cross_entropy(self.l_input) + L1_reg * classifier.L1 + L2_reg * classifier.L2_sqr
    self.optimizer = tf.train.RMSPropOptimizer(self.lr, 0.9).minimize(self.cost)
# Task #2
print("Loading model from {}".format(args.load_path))
model_state_dict = torch.load(args.load_path)
model1 = BertForMultiHopQuestionAnswering.from_pretrained(
    args.bert_model, state_dict=model_state_dict["params1"])
hidden_size = model1.config.hidden_size
model2 = CognitiveGNN(hidden_size, model1.config, args.sys2)
model2.load_state_dict(model_state_dict["params2"])
if args.sys2 == "xattn":
    from model import XAttn
    model2.gcn = XAttn(model1.config.hidden_size, model1.config, n_layers=args.xattn_layers)
elif args.sys2 == "mlp":
    from layers import MLP
    model2.gcn = MLP((hidden_size, hidden_size, 1))

model1 = torch.nn.DataParallel(model1, device_ids=range(torch.cuda.device_count()))
print(model1, model2)
model1, model2 = train(
    train_bundles,
    valid_bundles,
    model1=model1,
    mode=args.mode,
    model2=model2,
    batch_size=args.batch_size,
    num_epochs=args.num_epochs,
    gradient_accumulation_steps=args.gradient_accumulation_steps,
    lr1=args.lr1,
    lr2=args.lr2,
def __init__(self, word_V, dep_V, word_d=100, pos_d=25, mlp_d=100, mlp_label_d=100,
             num_lstm_layers=2, lstm_d=125, embeddings_init=None, pos_V=None,
             seed=0, verbose=False):
    '''
    word_V - size of word vocab
    dep_V - size of relation label vocab
    word_d - dimension of word embeddings
    pos_d - dimension of POS embeddings
    mlp_d - dimension of hidden layer for arc prediction MLP
    mlp_label_d - dimension of hidden layer for label prediction MLP
    num_lstm_layers - number of bi-directional LSTM layers to stack
    lstm_d - dimension of hidden state in the LSTM
    embeddings_init - use pre-trained embeddings
    pos_V - size of POS vocab
    seed - random seed for initialization
    verbose - whether to print information about these parameters
    '''
    if verbose:
        print('Word vocabulary size: {}'.format(word_V))
        print('Dependency relation vocabulary size: {}'.format(dep_V))
        print('POS vocabulary size: {}'.format(pos_V))

    self.word_V = word_V
    self.dep_V = dep_V
    self.pos_V = pos_V
    self.word_d = word_d
    self.pos_d = pos_d
    self.mlp_d = mlp_d
    self.mlp_label_d = mlp_label_d
    self.lstm_layers = num_lstm_layers
    self.lstm_d = lstm_d

    np.random.seed(seed)
    self.model = dynet.Model()

    # embedding layers for words and POS
    self.embeddings = self.model.add_lookup_parameters((self.word_V, self.word_d))
    if pos_V is not None:
        self.pos_embeddings = self.model.add_lookup_parameters((self.pos_V, self.pos_d))

    # bi-directional LSTM layers
    # embeddings -> layer1 -> layer2
    lstm_layers = []
    for i in range(num_lstm_layers):
        input_d = word_d
        if i:
            input_d = 2 * lstm_d
        elif pos_V is not None:
            input_d += pos_d
        fwd_lstm_layer = LSTM(self.model, input_d, lstm_d)
        rev_lstm_layer = LSTM(self.model, input_d, lstm_d, reverse=True)
        lstm_layers.append((fwd_lstm_layer, rev_lstm_layer))

    # arc prediction MLP
    # layer2(i), layer2(j) -> concatenate -> score
    mlp_layer = MLP(self.model, lstm_d * 4, mlp_d, 1)

    # label prediction MLP
    if mlp_label_d:
        mlp_label_layer = MLP(self.model, lstm_d * 4, mlp_label_d, dep_V)
    else:
        mlp_label_layer = None

    # train the model using the Adam optimizer
    self.trainer = dynet.AdamTrainer(self.model)

    # take in word and POS indices, return the output of the 2nd layer
    def get_lstm_output(indices, pos_indices=None):
        embeddings_out = [self.embeddings[w] for w in indices]
        x = embeddings_out
        if pos_V is not None and pos_indices is not None:
            x = []
            for i, input in enumerate(embeddings_out):
                x.append(dynet.concatenate([input, self.pos_embeddings[pos_indices[i]]]))
        for i in range(num_lstm_layers):
            x_1 = lstm_layers[i][0].get_output(x)[0]
            x_2 = lstm_layers[i][1].get_output(x)[0]
            x = [dynet.concatenate([x_1[i], x_2[i]]) for i in range(len(indices))]
        return x
    self.states = get_lstm_output

    # score all arcs from i to j using the arc prediction MLP
    def score_arcs(states, value=True):
        length = len(states)
        scores = [[None for i in range(length)] for j in range(length)]
        for i in range(length):
            for j in range(length):
                score = mlp_layer.get_output(dynet.concatenate([states[i], states[j]]))
                if value:
                    scores[i][j] = score.scalar_value()
                else:
                    scores[i][j] = score
        return scores
    self.score_arcs = score_arcs

    # score all labels at i using the label prediction MLP
    def score_labels(states, arcs, value=True):
        scores = []
        for i in range(len(states)):
            score = mlp_label_layer.get_output(dynet.concatenate([states[i], states[arcs[i]]]))
            if value:
                scores.append(score.value())
            else:
                scores.append(score)
        return scores
    self.score_labels = score_labels
def __init__(self, hidden_size, config, module_type):
    super(CognitiveGNN, self).__init__()
    self.gcn = GCN(hidden_size)
    self.both_net = MLP((hidden_size, hidden_size, 1))
    self.select_net = MLP((hidden_size, hidden_size, 1))
    self.module_type = module_type
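# The MLP((hidden_size, hidden_size, 1)) calls above (and in the CogQA snippets
# earlier) assume a helper that builds a feed-forward stack from a tuple of layer
# sizes. The actual layers.MLP is not shown here; this is a minimal PyTorch sketch
# of that assumed interface, with a hypothetical choice of activation.
import torch
import torch.nn as nn

class MLP(nn.Module):
    """Sketch only: Linear layers from a tuple of sizes, e.g. (768, 768, 1)."""
    def __init__(self, input_sizes, activation=nn.ReLU()):
        super().__init__()
        self.layers = nn.ModuleList(
            nn.Linear(d_in, d_out)
            for d_in, d_out in zip(input_sizes[:-1], input_sizes[1:]))
        self.activation = activation

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = layer(x)
            if i < len(self.layers) - 1:  # no activation after the output layer
                x = self.activation(x)
        return x

# Example: a scoring head mapping hidden_size -> hidden_size -> 1
scorer = MLP((768, 768, 1))
print(scorer(torch.randn(4, 768)).shape)  # torch.Size([4, 1])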