def load_sparse_trainingData_memory(train_file, col_num):
    """Load training data into a sparse matrix entirely in memory.

    Joins per-user and per-shop tf-idf feature vectors for each
    (user, shop, rating) line of the residual file into one sparse row,
    and collects the rating as the regression target.

    NOTE(review): `train_file` is currently unused — all paths are
    hard-coded below; confirm whether it should replace `data_path`.

    Parameters
    ----------
    train_file : str
        Unused (see note above); kept for interface compatibility.
    col_num : int
        Number of columns (feature dimension) of the output matrix.

    Returns
    -------
    (scipy.sparse.coo_matrix, numpy.ndarray)
        The (row_num, col_num) feature matrix and the float target vector.
    """
    data_directory = "../../paper/data/dianping/corpus/"
    vector_directory = "../../paper/data/dianping/tfidf/vector"
    user_vector = os.path.join(vector_directory, "comment.keyword.train.user.vector.1000")
    shop_vector = os.path.join(vector_directory, "comment.keyword.train.shop.vector.1000")
    data_path = os.path.join(data_directory, "comment.keyword.train.residual")
    user_vec = load_vec(user_vector)
    shop_vec = load_vec(shop_vector)
    logging("jointing vector")
    index = 0
    train_y = []
    rows = []
    cols = []
    data = []
    row_num = 0
    with open(data_path) as f:
        for line in f:
            index += 1
            if index % 200 == 0:
                # periodic progress report every 200 input lines
                logging("%d cases, data size:%d" % (int(index), len(data)))
            arr = line.strip().split("\t")
            if len(arr) != 3:
                # malformed line: expect exactly user \t shop \t rating
                continue
            # FIX: dict.has_key() was removed in Python 3 — use the
            # `in` operator, which is also the Python 2 idiom.
            if arr[0] not in user_vec or arr[1] not in shop_vec:
                continue
            # each entry is assumed to be a (column_index, value) pair
            # — TODO confirm against load_vec's output format.
            u_vec = user_vec[arr[0]]
            for each in u_vec:
                rows.append(row_num)
                cols.append(each[0])
                data.append(each[1])
            s_vec = shop_vec[arr[1]]
            for each in s_vec:
                rows.append(row_num)
                cols.append(each[0])
                data.append(each[1])
            row_num += 1
            train_y.append(float(arr[2]))
    return sp.coo_matrix((np.array(data), (np.array(rows), np.array(cols))),
                         shape=(row_num, col_num)), np.array(train_y)
def load_sparse_trainingData_memory(train_file, col_num):
    """Load training data into a sparse matrix entirely in memory.

    For every valid (user, shop, rating) line in the residual file,
    concatenate the user's and the shop's tf-idf feature entries into a
    single sparse row and record the rating as the target value.

    NOTE(review): this re-defines the identical function above and
    shadows it; one of the two copies is presumably redundant — verify.
    NOTE(review): `train_file` is unused; paths are hard-coded — confirm.

    Parameters
    ----------
    train_file : str
        Unused (see note above); kept for interface compatibility.
    col_num : int
        Number of columns (feature dimension) of the output matrix.

    Returns
    -------
    (scipy.sparse.coo_matrix, numpy.ndarray)
        The (row_num, col_num) feature matrix and the float target vector.
    """
    data_directory = "../../paper/data/dianping/corpus/"
    vector_directory = "../../paper/data/dianping/tfidf/vector"
    user_vector = os.path.join(vector_directory, "comment.keyword.train.user.vector.1000")
    shop_vector = os.path.join(vector_directory, "comment.keyword.train.shop.vector.1000")
    data_path = os.path.join(data_directory, "comment.keyword.train.residual")
    user_vec = load_vec(user_vector)
    shop_vec = load_vec(shop_vector)
    logging("jointing vector")
    index = 0
    train_y = []
    rows = []
    cols = []
    data = []
    row_num = 0
    with open(data_path) as f:
        for line in f:
            index += 1
            if index % 200 == 0:
                # periodic progress report every 200 input lines
                logging("%d cases, data size:%d" % (int(index), len(data)))
            arr = line.strip().split("\t")
            if len(arr) != 3:
                # malformed line: expect exactly user \t shop \t rating
                continue
            # FIX: dict.has_key() was removed in Python 3 — use the
            # `in` operator, which is also the Python 2 idiom.
            if arr[0] not in user_vec or arr[1] not in shop_vec:
                continue
            # each entry is assumed to be a (column_index, value) pair
            # — TODO confirm against load_vec's output format.
            u_vec = user_vec[arr[0]]
            for each in u_vec:
                rows.append(row_num)
                cols.append(each[0])
                data.append(each[1])
            s_vec = shop_vec[arr[1]]
            for each in s_vec:
                rows.append(row_num)
                cols.append(each[0])
                data.append(each[1])
            row_num += 1
            train_y.append(float(arr[2]))
    return sp.coo_matrix((np.array(data), (np.array(rows), np.array(cols))),
                         shape=(row_num, col_num)), np.array(train_y)
starttime = datetime.now() with open(filename) as f: for line in f: arr = line.strip().split("\t") vec[arr[0]] = arr[1:] logging("loading %s vector, eplased time:%s" % (filename, str(datetime.now() - starttime))) return vec vector_file_directory = "../../paper/data/dianping/w2v/vector/" train_user = os.path.join(vector_file_directory, "comment.keyword.train.user.vector") train_shop = os.path.join(vector_file_directory, "comment.keyword.train.shop.vector") user_vec = load_vec(train_user) shop_vec = load_vec(train_shop) def load_trainingData(train_file): starttime = datetime.now() train_x = [] train_y = [] index = 0 logging("loading training data") with open(train_file) as f: for line in f: index += 1 if index % 2000 == 0: logging("%d cases" % index)
def load_vec(filename): vec = {} logging("loading %s vector" % filename) starttime = datetime.now() with open(filename) as f: for line in f: arr = line.strip().split("\t") vec[arr[0]] = arr[1:] logging("loading %s vector, eplased time:%s" % (filename, str(datetime.now() - starttime))) return vec vector_file_directory = "../../paper/data/dianping/w2v/vector/" train_user = os.path.join(vector_file_directory, "comment.keyword.train.user.vector") train_shop = os.path.join(vector_file_directory, "comment.keyword.train.shop.vector") user_vec = load_vec(train_user) shop_vec = load_vec(train_shop) def load_trainingData(train_file): starttime = datetime.now() train_x = [] train_y = [] index = 0 logging("loading training data") with open(train_file) as f: for line in f: index += 1 if index % 2000 == 0: logging("%d cases" % index) arr = line.strip().split("\t")