def init_data(self):
    """Prepare embeddings and datasets.

    Either restores the cached embedding matrix / vocabulary from pickle
    files (when ``self.load`` is set) or parses the raw embedding text file
    and caches both artifacts.  Then converts the train set — and the dev /
    test sets when their paths are given — into padded id sequences stored
    on ``self``.
    """
    if self.load:
        # Fast path: reuse the previously pickled artifacts.
        self.we = tools.load_params(self.we_file_pkl)
        vocab = tools.load_params(self.vocab2id_file_pkl)
    else:
        # Slow path: parse the raw embedding file, then cache both pieces.
        self.we, vocab = load_embed_from_text(self.word_embed_file, self.word_dim)
        tools.save_params(self.we, self.we_file_pkl)
        tools.save_params(vocab, self.vocab2id_file_pkl)
    print("vocab size: %d" % len(vocab), "we shape: ", self.we.shape)

    def _to_ids(path):
        # Map one corpus file to (ids, labels, lengths) with shared vocab/padding.
        return sentence2id_and_pad(path, vocab, self.max_seq_len)

    self.train_x, self.train_y, self.train_seq_len = _to_ids(self.train_file)
    print("train_x: %d " % len(self.train_x), "train_y: %d" % len(self.train_y))

    if self.dev_file is not None:
        self.dev_x, self.dev_y, self.dev_seq_len = _to_ids(self.dev_file)
        print("dev_x: %d " % len(self.dev_x), "dev_y: %d" % len(self.dev_y))

    if self.test_file is not None:
        self.test_x, self.test_y, self.test_seq_len = _to_ids(self.test_file)
        print("test_x: %d " % len(self.test_x), "test_y: %d" % len(self.test_y))
def to2d_pca(fname="samples_vector"):
    """Project cached sample vectors down to 2-D with PCA.

    Loads ``[x, y]`` from ``../log/<fname>.pkl``, reduces ``x`` to two
    components, pickles the reduced pair to ``../log/pca_<fname>_2d.pkl``,
    and returns it.
    """
    vectors, labels = tools.load_params("../log/{}.pkl".format(fname))
    print(vectors.shape)
    projected = PCA(n_components=2).fit_transform(vectors)
    print(projected.shape)
    result = [projected, labels]
    tools.save_params(result, "../log/pca_{}_2d.pkl".format(fname))
    return result
def to2d(fname="samples_vector"):
    """Embed cached sample vectors into 2-D with t-SNE.

    Loads ``[x, y]`` from ``../log/<fname>.pkl``, runs t-SNE on ``x``,
    pickles the embedded pair to ``../log/<fname>_2d.pkl``, and returns it.
    """
    vectors, labels = tools.load_params("../log/{}.pkl".format(fname))
    print(vectors.shape)
    embedded = TSNE().fit_transform(vectors)
    print(embedded.shape)
    pair = [embedded, labels]
    tools.save_params(pair, "../log/{}_2d.pkl".format(fname))
    return pair
@author: dby_freedom
"""
import os
import pickle

import numpy as np
import tensorflow as tf

from tools import load_config, load_params
from model_trainer import train_fn

# Directory holding the pickled preprocessing artifacts (written by the
# preprocessing / training pipeline).
ProcessedDataDir = './processed_data'

# Restore the saved model directory; if the params file is missing, run
# training once to generate it, then retry the load.
try:
    load_dir = load_params()
except FileNotFoundError:
    train_fn()
    load_dir = load_params()

# Unpickle the preprocessed dataset bundle produced earlier.
# NOTE(review): file handle is never closed; presumably fine for a script,
# but a `with open(...)` would be cleaner — confirm before changing.
title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig = pickle.load(
    open(ProcessedDataDir + os.sep + 'preprocess.p', mode='rb'))

# Hyper-parameters / feature-dimension settings saved by the training run.
embed_dim, uid_max, gender_max, age_max, job_max, movie_id_max, movie_categories_max, \
movie_title_max, combiner, sentences_size, window_sizes, filter_num = load_config()

# Map movie ID -> row index.  Movie IDs in the dataset do not match row
# positions (e.g. the movie ID on row 5 is not necessarily 5).
movieid2idx = {val[0]: i for i, val in enumerate(movies.values)}


def get_tensors(loaded_graph):
    # Look up tensors by name in the restored TF graph (definition continues
    # beyond this chunk).
    uid = loaded_graph.get_tensor_by_name("uid:0")