Beispiel #1
0
    def load_pretrain_weights(self):
        """Initialise this model from separately trained MLP and GMF checkpoints.

        An ``MLP`` and a ``GMF`` model are built from ``self.config``, restored
        with ``resume_checkpoint`` and used as weight donors:

        * the MLP user/item embeddings and the ``weight`` of every layer in
          ``fc_layers`` are copied into this model's MLP branch;
        * the GMF user/item embeddings are copied into the MF branch;
        * the output layer receives ``0.5 *`` the MLP and GMF output weights
          concatenated along the last dimension, and ``0.5 *`` the sum of
          their output biases.

        ``self.config`` is modified in place: ``config['latent_dim']`` is set
        to ``config['latent_dim_mlp']`` before the MLP is built and to
        ``config['latent_dim_mf']`` before the GMF is built, and keeps the
        latter value afterwards.
        """
        cfg = self.config

        def _restore(model, checkpoint_key):
            # Move the donor to the GPU when requested, then load its checkpoint.
            if cfg['use_cuda'] is True:
                model.cuda()
            resume_checkpoint(model,
                              model_dir=cfg[checkpoint_key],
                              device_id=cfg['device_id'])
            return model

        # ---- MLP branch ----
        cfg['latent_dim'] = cfg['latent_dim_mlp']
        pretrained_mlp = _restore(MLP(cfg), 'pretrain_mlp')

        self.embedding_user_mlp.weight.data = pretrained_mlp.embedding_user.weight.data
        self.embedding_item_mlp.weight.data = pretrained_mlp.embedding_item.weight.data
        # NOTE: only the layer weights are copied here; layer biases are left as they are.
        for idx, layer in enumerate(self.fc_layers):
            layer.weight.data = pretrained_mlp.fc_layers[idx].weight.data

        # ---- GMF branch ----
        cfg['latent_dim'] = cfg['latent_dim_mf']
        pretrained_gmf = _restore(GMF(cfg), 'pretrain_mf')

        self.embedding_user_mf.weight.data = pretrained_gmf.embedding_user.weight.data
        self.embedding_item_mf.weight.data = pretrained_gmf.embedding_item.weight.data

        # ---- output layer: equal-weight blend of both donors ----
        stacked_weights = torch.cat(
            [
                pretrained_mlp.affine_output.weight.data,
                pretrained_gmf.affine_output.weight.data,
            ],
            dim=-1,
        )
        self.affine_output.weight.data = 0.5 * stacked_weights

        summed_bias = (pretrained_mlp.affine_output.bias.data +
                       pretrained_gmf.affine_output.bias.data)
        self.affine_output.bias.data = 0.5 * summed_bias
    def create_model(self):
        """Build the GMF model and place it on ``self.device``.

        Reads ``self.n_v`` and ``self.n_h`` for the ``num_virus`` /
        ``num_human`` config entries, uses a ``latent_dim`` of 10, stores the
        resulting model in ``self.model`` and prints it between two banner
        lines.
        """
        rule = '-' * 15
        print()
        print(rule, "Creating model", rule)
        print()

        # 'sparse' is disabled for now because some optimizers don't work
        # with sparse gradients.
        settings = dict(
            num_virus=self.n_v,
            num_human=self.n_h,
            latent_dim=10,
            sparse=False,
        )

        self.model = GMF(settings)
        self.model.to(self.device)

        print(self.model)
        print(rule, "Done with model", rule)
        print()
Beispiel #3
0
 def __init__(self, args, num_users, num_items):
     """Initialise the base model and create the GMF and MLP sub-models.

     ``args.layers`` and ``args.reg_layers`` are strings that are evaluated
     into Python objects; ``args.num_factors`` is stored unchanged.
     """
     BaseModel.__init__(self, args, num_users, num_items)

     # NOTE(review): eval() executes arbitrary expressions. This is only safe
     # while args.layers / args.reg_layers come from a trusted source.
     layer_spec = eval(args.layers)
     self.layers = layer_spec
     reg_spec = eval(args.reg_layers)
     self.lambda_layers = reg_spec
     self.num_factors = args.num_factors

     # Both sub-models are built from the same arguments as this model.
     shared_args = (args, num_users, num_items)
     self.model_GMF = GMF(*shared_args)
     self.model_MLP = MLP(*shared_args)
    def load_pretrain_weights(self):
        """Load pretrained MLP and GMF weights and freeze the embeddings.

        An ``MLP`` and a ``GMF`` model are built from ``self.config`` and
        restored with ``resume_checkpoint``. Their account/location embedding
        weights are copied into this model, together with the ``weight`` of
        every layer in ``fc_layers`` (layer biases are not copied).

        After copying, the four embedding tables are excluded from gradient
        updates. The previous code assigned ``<embedding>.require = False``,
        which only creates an unused attribute on the module and freezes
        nothing; ``weight.requires_grad`` is the flag autograd consults.

        ``self.config`` is modified in place: ``config['latent_dim']`` is set
        to ``config['latent_dim_mf']`` before the GMF model is built.
        """
        config = self.config
        mlp_model = MLP(config)

        # -1 is what resume_checkpoint receives when CUDA is disabled
        # (presumably meaning "CPU" -- confirm against resume_checkpoint).
        device_id = -1
        if config['use_cuda'] is True:
            mlp_model.cuda()
            device_id = config['device_id']
        resume_checkpoint(mlp_model,
                          model_dir=config['pretrain_mlp'],
                          device_id=device_id)

        self.embedding_account_mlp.weight.data = mlp_model.embedding_account.weight.data
        self.embedding_location_mlp.weight.data = mlp_model.embedding_location.weight.data

        for idx, layer in enumerate(self.fc_layers):
            layer.weight.data = mlp_model.fc_layers[idx].weight.data

        config['latent_dim'] = config['latent_dim_mf']
        gmf_model = GMF(config)
        if config['use_cuda'] is True:
            gmf_model.cuda()
        resume_checkpoint(gmf_model,
                          model_dir=config['pretrain_mf'],
                          device_id=device_id)
        self.embedding_account_mf.weight.data = gmf_model.embedding_account.weight.data
        self.embedding_location_mf.weight.data = gmf_model.embedding_location.weight.data

        # Freeze the pretrained embeddings so that training leaves them untouched.
        frozen_embeddings = (
            self.embedding_account_mlp,
            self.embedding_location_mlp,
            self.embedding_account_mf,
            self.embedding_location_mf,
        )
        for embedding in frozen_embeddings:
            embedding.weight.requires_grad = False
 def load_pretrain_weights(self):
     """Copy the user and item embedding weights from a trained GMF checkpoint.

     A ``GMF`` model is built from ``self.config``, moved to the GPU when
     ``config['use_cuda'] is True``, restored from ``config['pretrain_mf']``
     and then used as the source of both embedding tables.
     """
     cfg = self.config
     pretrained = GMF(cfg)
     if cfg['use_cuda'] is True:
         pretrained.cuda()
     resume_checkpoint(
         pretrained,
         model_dir=cfg['pretrain_mf'],
         device_id=cfg['device_id'],
     )
     # Both embedding tables are taken over unchanged, user first, then item.
     for table_name in ('embedding_user', 'embedding_item'):
         getattr(self, table_name).weight.data = getattr(pretrained, table_name).weight.data
Beispiel #6
0
def train_gmf():
    """Train a GMF model with Adam and an MSE criterion and return it.

    The model is built from the module-level ``gmf_config`` and moved to
    ``device``. Each of the ``gmf_config['num_epoch']`` epochs runs
    ``epoch_run`` once on ``train_generator`` in "train" mode and once on
    ``val_generator`` in "val" mode, then prints both results.
    """
    net = GMF(gmf_config).to(device)
    adam = optim.Adam(
        net.parameters(),
        lr=0.001,
        betas=(0.9, 0.999),
        weight_decay=1e-4,
    )
    mse = torch.nn.MSELoss()

    n_epochs = gmf_config['num_epoch']
    for epoch_idx in range(n_epochs):
        print("running epoch ", epoch_idx)
        train_loss = epoch_run(net, train_generator, adam, mse, "train")
        val_loss = epoch_run(net, val_generator, adam, mse, "val")
        print("train mse loss => ", train_loss, "val mse loss => ", val_loss)

    return net
    def create_model(self):
        """Build the GMF model and place it on ``self.device``.

        The config passed to ``GMF`` takes ``num_virus`` / ``num_human`` from
        ``self.n_v`` / ``self.n_h`` and uses a ``latent_dim`` of 2799. The
        model is stored in ``self.model`` and printed between banner lines.
        """

        def banner(text):
            # Same framing as the other progress messages: 15 dashes each side.
            print('-' * 15, text, '-' * 15)

        banner("Creating model")

        model_config = {}
        model_config['num_virus'] = self.n_v
        model_config['num_human'] = self.n_h
        model_config['latent_dim'] = 2799
        # Kept False for now because some optimizers don't work with sparse.
        model_config['sparse'] = False

        self.model = GMF(model_config)
        self.model.to(self.device)

        print(self.model)
        banner("Done with model")
        print()
Beispiel #8
0
    def load_pretrain_weights(self):
        """Take over the embedding weights of a trained GMF model.

        Builds a ``GMF`` from ``self.config``, optionally moves it to the GPU,
        restores it from the checkpoint at ``config['pretrain_mf']`` and
        assigns its user and item embedding weights to this model.
        """
        settings = self.config
        donor = GMF(settings)
        if settings['use_cuda'] is True:
            donor.cuda()
        resume_checkpoint(
            donor,
            model_dir=settings['pretrain_mf'],
            device_id=settings['device_id'],
        )

        user_weights = donor.embedding_user.weight.data
        self.embedding_user.weight.data = user_weights

        item_weights = donor.embedding_item.weight.data
        self.embedding_item.weight.data = item_weights


# class MLPEngine(Engine):
#     """Engine for training & evaluating GMF model"""
#     def __init__(self, config):
#         self.model = MLP(config)
#         if config['use_cuda'] is True:
#             use_cuda(True, config['device_id'])
#             self.model.cuda()
#         super(MLPEngine, self).__init__(config)
#         print(self.model)

#         if config['pretrain']:
#             self.model.load_pretrain_weights()
Beispiel #9
0
class NeuMF(BaseModel):
    """Model whose core vector is the concatenation of a GMF and an MLP model.

    Both sub-models are created from the same constructor arguments and are
    queried through their ``build_core_model`` methods.
    """

    def __init__(self, args, num_users, num_items):
        """Initialise the base model and create the GMF and MLP sub-models."""
        BaseModel.__init__(self, args, num_users, num_items)
        # NOTE(review): eval() executes arbitrary expressions. This is only
        # safe while args.layers / args.reg_layers come from a trusted source.
        self.layers = eval(args.layers)
        self.lambda_layers = eval(args.reg_layers)
        self.num_factors = args.num_factors
        self.model_GMF = GMF(args, num_users, num_items)
        self.model_MLP = MLP(args, num_users, num_items)

    def build_core_model(self, user_indices, item_indices):
        """Combine the core outputs of the GMF and MLP sub-models.

        Returns a 3-tuple ``(model_vector, model_len, model_params)``:
        the two sub-model vectors concatenated along axis 1 (GMF first), the
        sum of the two reported lengths, and a list holding the GMF
        parameters followed by the MLP parameters.
        """
        vector_GMF, len_GMF, params_GMF = self.model_GMF.build_core_model(
            user_indices, item_indices)

        vector_MLP, len_MLP, params_MLP = self.model_MLP.build_core_model(
            user_indices, item_indices)

        model_vector = tf.concat([vector_GMF, vector_MLP], 1)
        model_len = len_GMF + len_MLP

        model_params = []
        model_params.extend(params_GMF)
        model_params.extend(params_MLP)

        return model_vector, model_len, model_params

    def build_model(self, user_indices=None, item_indices=None):
        """Create the input placeholders and the training graph.

        ``user_indices`` / ``item_indices`` may be supplied by the caller;
        when left as ``None`` an int32 placeholder named ``"user_indices"`` /
        ``"item_indices"`` is created instead. The results of
        ``self.build_train_model`` are stored in ``self.output``,
        ``self.loss``, ``self.error``, ``self.raw_error`` and
        ``self.train_step``.
        """
        # Compare against None explicitly: the truth value of a tf.Tensor
        # cannot be taken in graph mode, so ``not user_indices`` fails as
        # soon as a caller actually passes a tensor.
        if user_indices is None:
            user_indices = tf.placeholder(tf.int32, [None],
                                          name="user_indices")
        self.user_indices = user_indices

        if item_indices is None:
            item_indices = tf.placeholder(tf.int32, [None],
                                          name="item_indices")
        self.item_indices = item_indices

        self.ratings = tf.placeholder(tf.float32, [None])

        model_vector, model_len, model_params = self.build_core_model(
            user_indices, item_indices)

        self.output, self.loss, self.error, self.raw_error, self.train_step = \
            self.build_train_model(model_vector, model_len,
                                   self.ratings, model_params)
Beispiel #10
0
def train(args, train_data_path):
    """Train the model selected by ``args`` and save it after every epoch.

    Exactly one network is built, chosen in the order ``args.GMF``,
    ``args.MLP``, ``args.NeuMF``. It is optimised with Adam at learning rate
    ``args.lr`` for ``args.epochs`` epochs; the loss of every batch is logged
    and an inference model is written to ``<args.model_dir>/epoch_<epoch>``
    at the end of each epoch.

    Args:
        args: parsed options (``use_gpu``, ``GMF``, ``MLP``, ``NeuMF``,
            ``epochs``, ``batch_size``, ``num_factors``, ``num_neg``, ``lr``,
            ``model_dir``, ``layers``, ``path``, ``dataset``, ``num_users``,
            ``num_items``).
        train_data_path: location handed to the training data generator.

    Raises:
        ValueError: if none of ``args.GMF``, ``args.MLP`` or ``args.NeuMF``
            is set. Previously this surfaced later as an unrelated
            ``NameError`` on ``loss``.
    """
    print("use_gpu:{}, NeuMF:{}, epochs:{}, batch_size:{}, num_factors:{}, num_neg:{}, lr:{}, model_dir:{}, layers:{}".format(
        args.use_gpu, args.NeuMF, args.epochs, args.batch_size, args.num_factors, args.num_neg, args.lr, args.model_dir, args.layers))
    dataset = Dataset(args.path + args.dataset)
    # NOTE(review): these two values are not used below; the lines are kept
    # so that dataset loading problems still surface before the graph is built.
    testRatings, testNegatives = dataset.testRatings, dataset.testNegatives

    train_data_generator = utils.Dataset()
    train_reader = fluid.io.batch(train_data_generator.train(train_data_path, True), batch_size=args.batch_size)

    inputs = utils.input_data(True)
    if args.GMF:
        model = GMF()
        loss, pred = model.net(inputs, args.num_users, args.num_items, args.num_factors)
    elif args.MLP:
        model = MLP()
        loss, pred = model.net(inputs, args.num_users, args.num_items, args.layers)
    elif args.NeuMF:
        model = NeuMF()
        loss, pred = model.net(inputs, args.num_users, args.num_items, args.num_factors, args.layers)
    else:
        # Without a selected network neither ``loss`` nor ``pred`` exists.
        raise ValueError("one of args.GMF, args.MLP or args.NeuMF must be set")

    optimizer = fluid.optimizer.AdamOptimizer(args.lr)
    optimizer.minimize(loss)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    loader = fluid.io.DataLoader.from_generator(
        feed_list=inputs, capacity=args.batch_size, iterable=True)
    loader.set_sample_list_generator(train_reader, places=place)

    for epoch in range(args.epochs):

        for batch_id, data in enumerate(loader()):
            begin = time.time()
            loss_val = exe.run(program=fluid.default_main_program(),
                    feed=data,
                    fetch_list=[loss.name],
                    return_numpy=True)
            end = time.time()
            logger.info("epoch: {}, batch_id: {}, batch_time: {:.5f}s, loss: {:.5f}".format(epoch, batch_id, end - begin, np.array(loss_val)[0][0]))

        # One inference model per epoch, fed by the user and item inputs.
        save_dir = "%s/epoch_%d" % (args.model_dir, epoch)
        feed_var_names = ["user_input", "item_input"]
        fetch_vars = [pred]
        fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
Beispiel #11
0
    def load_pretrain_weights(self):
        """Initialise this model from trained MLP and GMF checkpoints.

        Donor ``MLP`` and ``GMF`` models are built from ``self.config`` and
        restored with ``resume_checkpoint`` using only a ``model_dir``; GPU
        placement and ``device_id`` handling are not used in this variant.

        Copied into this model: the MLP user/item embeddings, the ``weight``
        of every layer in ``fc_layers`` (biases are not copied), and the GMF
        user/item embeddings. The output layer is set to ``0.5 *`` the MLP and
        GMF output weights concatenated along the last dimension, and
        ``0.5 *`` the sum of their output biases.

        ``self.config`` is modified in place: ``config['latent_dim']`` is
        switched to the MLP size and then to the MF size.
        """
        settings = self.config

        # ---- MLP donor ----
        settings['latent_dim'] = settings['latent_dim_mlp']
        trained_mlp = MLP(settings)
        resume_checkpoint(trained_mlp, model_dir=settings['pretrain_mlp'])

        mlp_embedding_pairs = (
            (self.embedding_user_mlp, trained_mlp.embedding_user),
            (self.embedding_item_mlp, trained_mlp.embedding_item),
        )
        for target, source in mlp_embedding_pairs:
            target.weight.data = source.weight.data

        for position, layer in enumerate(self.fc_layers):
            layer.weight.data = trained_mlp.fc_layers[position].weight.data

        # ---- GMF donor ----
        settings['latent_dim'] = settings['latent_dim_mf']
        trained_gmf = GMF(settings)
        resume_checkpoint(trained_gmf, model_dir=settings['pretrain_mf'])

        gmf_embedding_pairs = (
            (self.embedding_user_mf, trained_gmf.embedding_user),
            (self.embedding_item_mf, trained_gmf.embedding_item),
        )
        for target, source in gmf_embedding_pairs:
            target.weight.data = source.weight.data

        # ---- output layer: equal-weight blend of both donors ----
        mlp_out = trained_mlp.affine_output
        gmf_out = trained_gmf.affine_output
        self.affine_output.weight.data = 0.5 * torch.cat(
            [mlp_out.weight.data, gmf_out.weight.data], dim=-1)
        self.affine_output.bias.data = 0.5 * (
            mlp_out.bias.data + gmf_out.bias.data)
Beispiel #12
0
    # NOTE(review): this is the body of a function whose signature is not
    # visible here; modeldir, modelfname, datadir, dataname, train_matrix,
    # n_emb, layers, dropouts, mf_pretrain, mlp_pretrain and freeze are
    # assumed to be defined before this point.

    # Paths inside modeldir for the model file and for 'results_df.p'.
    modelpath = os.path.join(modeldir, modelfname)
    resultsdfpath = os.path.join(modeldir, 'results_df.p')

    # Load the dataset archive and the training matrix (converted with .todok()).
    dataset = np.load(os.path.join(datadir, dataname))
    train_ratings = load_npz(os.path.join(datadir, train_matrix)).todok()
    test_ratings, negatives = dataset['test_negative'], dataset['negatives']
    n_users, n_items = dataset['n_users'].item(), dataset['n_items'].item()

    # Test data is served in fixed order, 1000 entries per batch.
    test_loader = DataLoader(dataset=test_ratings,
        batch_size=1000,
        shuffle=False
        )

    model = NeuMF(n_users, n_items, n_emb, layers, dropouts)
    # Pretrained weights are only used when BOTH checkpoint files exist.
    if os.path.isfile(mf_pretrain) and os.path.isfile(mlp_pretrain):
        gmf_model = GMF(n_users, n_items, n_emb)
        gmf_model.load_state_dict(torch.load(mf_pretrain))
        mlp_model = MLP(n_users, n_items, layers, dropouts)
        mlp_model.load_state_dict(torch.load(mlp_pretrain))
        model = load_pretrain_model(model, gmf_model, mlp_model)
        print("Load pretrained GMF {} and MLP {} models done. ".format(mf_pretrain, mlp_pretrain))

    # Run on the GPU whenever one is available.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.cuda()

    # With freeze set, gradients are disabled for every parameter whose name
    # does not contain "out".
    if freeze:
        for name, layer in model.named_parameters():
            if not ("out" in name):
                layer.requires_grad = False
Beispiel #13
0
def parse(path):
    """Yield one evaluated record for every line of the gzip file at *path*.

    The file is opened in binary mode and closed again once the generator is
    exhausted or closed. Each line is passed to ``eval`` and the resulting
    object is yielded.
    """
    with gzip.open(path, 'rb') as lines:
        for line in lines:
            # NOTE(review): eval() executes arbitrary code contained in the
            # file. Only use this on trusted data, or switch to
            # ast.literal_eval / json.loads if the file format allows it.
            yield eval(line)

def getDF(path):
    """Load every record yielded by ``parse(path)`` into a DataFrame.

    Records are numbered 0, 1, 2, ... in the order they are yielded; that
    number becomes the row index (``orient='index'``).
    """
    records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records, orient='index')

DATA_PATH = Path(".")
MODEL_DIR = "models"

# asin -> item id mapping and its inverse. The file is opened with ``with`` so
# the handle is closed right after loading.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(DATA_PATH/'item_mappings.p', 'rb') as mappings_file:
    asin2id_map = pickle.load(mappings_file)
id2asin_map = {k:v for v,k in asin2id_map.items()}

# Metadata reduced to (asin, title) rows that actually have a title.
df_movies_meta_data = getDF(DATA_PATH/'meta_Movies_and_TV.json.gz')
keep_cols = ['asin', 'title']
df_movies_meta_data = df_movies_meta_data[keep_cols]
df_movies_meta_data = df_movies_meta_data[~df_movies_meta_data.title.isna()]
asin2title_map = dict(df_movies_meta_data.values)

# Number of mapped items without a title; computed once and reused below.
n_missing_titles = np.setdiff1d(
    list(id2asin_map.values()), list(asin2title_map.keys())).shape[0]
print("number of items with missing title in the core dataset: {}".format(
    n_missing_titles))
print("number of items with non missing titles in the core dataset: {}".format(
    len(id2asin_map) - n_missing_titles))

# item id -> title; items whose asin has no title are left out.
id2title_map = {
    item_id: asin2title_map[asin]
    for item_id, asin in id2asin_map.items()
    if asin in asin2title_map
}

# Pick the GMF run with the highest 'best_hr' from the results table.
df_results = pd.read_pickle(DATA_PATH/MODEL_DIR/'results_df.p')
best_gmf = (df_results[df_results.modelname.str.contains('GMF')]
    .sort_values('best_hr', ascending=False)
    .reset_index(drop=True)
    ).modelname[0]

# The embedding size is the "_"-separated token that follows 'emb' in the
# model name. list.index replaces int(np.where(...)[0]), which relied on
# converting a 1-d array to a Python int. When 'emb' occurs more than once
# the first occurrence is used.
n_emb_i = best_gmf.split("_").index('emb') + 1
n_emb = int(best_gmf.split("_")[n_emb_i])

dataset = np.load(DATA_PATH/'neuralcf_split.npz')
n_users, n_items = dataset['n_users'].item(), dataset['n_items'].item()

gmf_model = GMF(n_users, n_items, n_emb)
gmf_model.load_state_dict(torch.load(DATA_PATH/MODEL_DIR/best_gmf))
item_embeddings = gmf_model.embeddings_item.weight.data.numpy()

# Brute-force cosine nearest neighbours over the item embeddings.
knn_model = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
knn_model.fit(item_embeddings)

def get_movie_titles(input_id, n=20):
    """Return the titles of the *n* nearest neighbours of item *input_id*.

    The neighbours are looked up with ``knn_model`` on the item's row of
    ``item_embeddings``, so the query item itself is normally the first
    neighbour and the remaining n-1 are the similar items. "Similar" is
    defined by the model, i.e. closeness in embedding space.

    Neighbours without an entry in ``id2title_map`` are skipped, so the
    returned list can contain fewer than *n* titles.
    """
    _, nnidx = knn_model.kneighbors(
        item_embeddings[input_id].reshape(1, -1),
        n_neighbors=n)
    # Explicit membership test instead of a bare ``except:``, which would
    # also have hidden unrelated errors.
    return [id2title_map[idx] for idx in nnidx[0] if idx in id2title_map]

# Example query: titles of the nearest neighbours of item id 1234 (default n=20).
similar_movies = get_movie_titles(1234)
class GMFConfig_dbg:
    """Debug setup: a GMF model plus a data generator on a random bipartite graph.

    Construction first generates the synthetic interaction data
    (``create_generator``) and then builds the model (``create_model``), which
    needs the node counts determined by the first step.
    """

    def __init__(self, device, n=150, m=150, prob=.5):
        """Create the generator and the model on *device*.

        NOTE(review): ``n`` and ``m`` are passed positionally to
        ``create_generator(m, n, prob)``, whose parameters are named in the
        opposite order. With the equal defaults this makes no difference;
        confirm the intended meaning before using different sizes.
        """
        self.device = device
        self.create_generator(n, m, prob)
        self.create_model()

    def create_model(self):
        """Build a GMF model with ``latent_dim`` 10 and move it to ``self.device``.

        Requires ``self.n_v`` and ``self.n_h`` (set by ``create_generator``).
        The model is stored in ``self.model`` and printed.
        """

        print('-' * 15, "Creating model", '-' * 15)

        latent_dim = 10
        config = {
            'num_virus': self.n_v,
            'num_human': self.n_h,
            'latent_dim': latent_dim,
            # set false for now because some optimizers dont work with sparse
            'sparse': False,
        }

        self.model = GMF(config)
        self.model.to(self.device)

        print(self.model)
        print('-' * 15, "Done with model", '-' * 15)
        print()

    def create_generator(self, m, n, prob):
        """Generate a random bipartite graph and wrap it in a data generator.

        ``nx.bipartite.random_graph(n, m, prob)`` is sampled and turned into a
        dense table with one row for every pair ``(i, j)``, ``i`` in
        ``range(n)`` and ``j`` in ``range(n, m + n)``; ``edge`` is 1.0 when
        the pair is an edge of the graph and 0.0 otherwise.

        Sets ``self.n_v``, ``self.n_h`` (number of distinct virus / human
        ids) and ``self.gen`` (a ``ProteinInteractionGenerator``).
        """
        ############################
        ##  generate bipartite
        ###########################
        print('-' * 15, "Generating graph", '-' * 15)
        G = nx.bipartite.random_graph(n, m, prob)
        # A set gives constant-time membership tests; the former list made
        # every one of the n*m lookups below scan all edges.
        observed = set(G.edges())
        virusUprot = []
        humanUprot = []
        edges = []

        for i in tqdm(range(n)):
            for j in tqdm(range(n, m + n)):
                virusUprot.append(i)
                humanUprot.append(j)
                edges.append(1.0 if (i, j) in observed else 0.0)

        M = pd.DataFrame({
            'virusUprot': virusUprot,
            'humanUprot': humanUprot,
            'edge': edges
        })

        # Map every distinct id to a consecutive index, in order of appearance.
        htoi = {v: k for k, v in enumerate(M['humanUprot'].unique())}
        vtoi = {v: k for k, v in enumerate(M['virusUprot'].unique())}
        print('-' * 15, "Dataframe created", '-' * 15)
        print()

        ############################
        ##   Prepare data (dataloader)
        ############################
        print('-' * 15, "Creating data loaders", '-' * 15)

        data_config = {
            'interactions': M,
            'htoi': htoi,
            'vtoi': vtoi,
            'pct_test': .10,
            'device': self.device
        }

        self.n_v = len(vtoi)
        self.n_h = len(htoi)
        self.gen = ProteinInteractionGenerator(data_config)

        print('-' * 15, "Generator done", '-' * 15)
        print()

    def get_generator(self):
        """Return the data generator created by ``create_generator``."""
        return self.gen

    def get_model(self):
        """Return the model created by ``create_model``."""
        return self.model
class GMFConfig:
    """Bundle of a GMF model and the data generator built from a training CSV.

    Construction loads the data first (``create_generator``) and then builds
    the model (``create_model``), which needs the id counts from the data.
    """

    def __init__(self, path, debug, device):
        """Load the data found under *path* and build the model on *device*."""
        self.device = device
        self.create_generator(path, debug)
        self.create_model()

    def create_model(self):
        """Build a GMF model with ``latent_dim`` 2799 and move it to ``self.device``.

        Requires ``self.n_v`` and ``self.n_h`` (set by ``create_generator``).
        The model is stored in ``self.model`` and printed.
        """
        rule = '-' * 15
        print(rule, "Creating model", rule)

        # 'sparse' stays False for now because some optimizers don't work
        # with sparse.
        model_config = dict(
            num_virus=self.n_v,
            num_human=self.n_h,
            latent_dim=2799,
            sparse=False,
        )

        self.model = GMF(model_config)
        self.model.to(self.device)

        print(self.model)
        print(rule, "Done with model", rule)
        print()

    def create_generator(self, path, debug):
        """Read ``<path>full_train.csv`` and wrap it in a data generator.

        With *debug* set, the table is reduced to all rows with ``edge > 0``
        plus an equal number of shuffled rows with ``edge == 0``, and then
        shuffled again.

        Sets ``self.n_v``, ``self.n_h`` (number of distinct virus / human
        ids) and ``self.gen`` (a ``ProteinInteractionGenerator``).
        """
        rule = '-' * 15

        # ---- paths ----
        train_csv = f'{path}full_train.csv'

        # ---- load data ----
        print(rule, "Loading data", rule)
        print("loading traning matrix at: ", train_csv)
        M = pd.read_csv(train_csv)

        if debug:
            print("Making debug dataset.....")
            positives = M.loc[M['edge'] > 0].sample(frac=1)
            negatives = M.loc[M['edge'] == 0].sample(frac=1)
            balanced = pd.concat(
                [positives, negatives[:len(positives)]], ignore_index=True)
            M = balanced.sample(frac=1)

        # Map every distinct id to a consecutive index, in order of appearance.
        htoi = {
            uprot: idx for idx, uprot in enumerate(M['humanUprot'].unique())
        }
        vtoi = {
            uprot: idx for idx, uprot in enumerate(M['virusUprot'].unique())
        }

        # ---- prepare data (dataloader) ----
        print(rule, "Creating data loaders", rule)

        data_config = dict(
            interactions=M,
            htoi=htoi,
            vtoi=vtoi,
            pct_test=.10,
            device=self.device,
        )

        self.n_v = len(vtoi)
        self.n_h = len(htoi)
        self.gen = ProteinInteractionGenerator(data_config)

        print(rule, "Generator done", rule)
        print()

    def get_generator(self):
        """Return the data generator created by ``create_generator``."""
        return self.gen

    def get_model(self):
        """Return the model created by ``create_model``."""
        return self.model