Beispiel #1
0
def load_sparse_trainingData_memory(train_file, col_num):
    """Assemble a sparse feature matrix and a target array in memory.

    The user and shop vectors are loaded with ``load_vec`` from hard-coded
    paths, then the hard-coded residual file is read line by line.  Every
    line with exactly three tab-separated fields whose first field is a key
    of the user vectors and whose second field is a key of the shop vectors
    yields one matrix row: the entries of the user vector followed by the
    entries of the shop vector.  All other lines are skipped.

    Args:
        train_file: Not used by this function; the input paths are
            hard-coded in the body.
        col_num: Number of columns of the returned matrix.

    Returns:
        A tuple ``(matrix, targets)`` where ``matrix`` is an
        ``sp.coo_matrix`` of shape ``(kept_lines, col_num)`` and ``targets``
        is a NumPy array with the third field of each kept line as a float.

    Raises:
        ValueError: If the third field of a kept line is not a valid float.
    """
    data_directory = "../../paper/data/dianping/corpus/"
    vector_directory = "../../paper/data/dianping/tfidf/vector"
    user_vector = os.path.join(vector_directory,
                               "comment.keyword.train.user.vector.1000")
    shop_vector = os.path.join(vector_directory,
                               "comment.keyword.train.shop.vector.1000")
    data_path = os.path.join(data_directory, "comment.keyword.train.residual")
    user_vec = load_vec(user_vector)
    shop_vec = load_vec(shop_vector)
    logging("jointing vector")

    train_y = []
    rows = []
    cols = []
    data = []
    row_num = 0

    with open(data_path) as f:
        # Count every line read (1-based), including the ones skipped below.
        for index, line in enumerate(f, 1):
            if index % 200 == 0:
                logging("%d cases, data size:%d" % (index, len(data)))
            arr = line.strip().split("\t")
            if len(arr) != 3:
                continue
            # Membership test with `in`: dict.has_key() no longer exists in
            # Python 3, while `in` behaves the same on Python 2 and 3.
            if arr[0] not in user_vec or arr[1] not in shop_vec:
                continue
            # User entries first, then shop entries, all on the same row.
            # Assumes each entry is an indexable (column, value) pair --
            # TODO confirm against the load_vec used by this module.
            for vector in (user_vec[arr[0]], shop_vec[arr[1]]):
                for each in vector:
                    rows.append(row_num)
                    cols.append(each[0])
                    data.append(each[1])
            row_num += 1
            train_y.append(float(arr[2]))
    return sp.coo_matrix((np.array(data), (np.array(rows), np.array(cols))),
                         shape=(row_num, col_num)), np.array(train_y)
def load_sparse_trainingData_memory(train_file, col_num):
    """Assemble a sparse feature matrix and a target array in memory.

    The user and shop vectors are loaded with ``load_vec`` from hard-coded
    paths, then the hard-coded residual file is read line by line.  Every
    line with exactly three tab-separated fields whose first field is a key
    of the user vectors and whose second field is a key of the shop vectors
    yields one matrix row: the entries of the user vector followed by the
    entries of the shop vector.  All other lines are skipped.

    Args:
        train_file: Not used by this function; the input paths are
            hard-coded in the body.
        col_num: Number of columns of the returned matrix.

    Returns:
        A tuple ``(matrix, targets)`` where ``matrix`` is an
        ``sp.coo_matrix`` of shape ``(kept_lines, col_num)`` and ``targets``
        is a NumPy array with the third field of each kept line as a float.

    Raises:
        ValueError: If the third field of a kept line is not a valid float.
    """
    data_directory = "../../paper/data/dianping/corpus/"
    vector_directory = "../../paper/data/dianping/tfidf/vector"
    user_vector = os.path.join(vector_directory, "comment.keyword.train.user.vector.1000")
    shop_vector = os.path.join(vector_directory, "comment.keyword.train.shop.vector.1000")
    data_path = os.path.join(data_directory, "comment.keyword.train.residual")
    user_vec = load_vec(user_vector)
    shop_vec = load_vec(shop_vector)
    logging("jointing vector")

    train_y = []
    rows = []
    cols = []
    data = []
    row_num = 0

    with open(data_path) as f:
        # Count every line read (1-based), including the ones skipped below.
        for index, line in enumerate(f, 1):
            if index % 200 == 0:
                logging("%d cases, data size:%d" % (index, len(data)))
            arr = line.strip().split("\t")
            if len(arr) != 3:
                continue
            # Membership test with `in`: dict.has_key() no longer exists in
            # Python 3, while `in` behaves the same on Python 2 and 3.
            if arr[0] not in user_vec or arr[1] not in shop_vec:
                continue
            # User entries first, then shop entries, all on the same row.
            # Assumes each entry is an indexable (column, value) pair --
            # TODO confirm against the load_vec used by this module.
            for vector in (user_vec[arr[0]], shop_vec[arr[1]]):
                for each in vector:
                    rows.append(row_num)
                    cols.append(each[0])
                    data.append(each[1])
            row_num += 1
            train_y.append(float(arr[2]))
    return sp.coo_matrix((np.array(data), (np.array(rows), np.array(cols))), shape=(row_num, col_num)), np.array(train_y)
Beispiel #3
0
    starttime = datetime.now()
    with open(filename) as f:
        for line in f:
            arr = line.strip().split("\t")
            vec[arr[0]] = arr[1:]
    logging("loading %s vector, eplased time:%s" %
            (filename, str(datetime.now() - starttime)))
    return vec


# Module-level setup: build the paths of the training user and shop vector
# files and load both with load_vec().  These statements sit at the top level,
# so they execute (and read both files) as soon as the module is run/imported.
vector_file_directory = "../../paper/data/dianping/w2v/vector/"
train_user = os.path.join(vector_file_directory,
                          "comment.keyword.train.user.vector")
train_shop = os.path.join(vector_file_directory,
                          "comment.keyword.train.shop.vector")
# Module-level results of load_vec(), one for users and one for shops.
user_vec = load_vec(train_user)
shop_vec = load_vec(train_shop)


def load_trainingData(train_file):
    """Iterate over ``train_file`` line by line, logging progress.

    A progress message is logged every 2000 lines.

    NOTE(review): this listing looks truncated -- the visible body sets up
    ``starttime``, ``train_x`` and ``train_y`` but never fills or returns
    them.  Confirm against the complete source before relying on it.

    Args:
        train_file: Path of the file to read.
    """
    starttime = datetime.now()
    train_x = []
    train_y = []

    # 1-based count of the lines read so far.
    index = 0
    logging("loading training data")
    with open(train_file) as f:
        for line in f:
            index += 1
            if index % 2000 == 0:
                logging("%d cases" % index)
def load_vec(filename):
    """Load a tab-separated vector file into a dictionary.

    Each line is stripped of surrounding whitespace and split on tabs; the
    first field becomes the key and the list of remaining fields (kept as
    strings) becomes the value.  If a key occurs more than once, the last
    line wins.  A message is logged before loading and another one, with the
    elapsed time, after loading.

    Args:
        filename: Path of the vector file to read.

    Returns:
        A dict mapping the first field of every line to the list of its
        remaining fields.
    """
    logging("loading %s vector" % filename)
    began = datetime.now()
    table = {}
    with open(filename) as handle:
        for raw in handle:
            fields = raw.strip().split("\t")
            key, values = fields[0], fields[1:]
            table[key] = values
    took = datetime.now() - began
    logging("loading %s vector, eplased time:%s" % (filename, str(took)))
    return table

# Module-level setup: build the paths of the training user and shop vector
# files and load both with load_vec().  These statements sit at the top level,
# so they execute (and read both files) as soon as the module is run/imported.
vector_file_directory = "../../paper/data/dianping/w2v/vector/"
train_user = os.path.join(vector_file_directory, "comment.keyword.train.user.vector")
train_shop = os.path.join(vector_file_directory, "comment.keyword.train.shop.vector")
# Each dict maps the first tab-separated field of a line to the list of the
# remaining fields, as built by load_vec().
user_vec = load_vec(train_user)
shop_vec = load_vec(train_shop)

def load_trainingData(train_file):
    starttime = datetime.now()
    train_x = []
    train_y = []

    index = 0
    logging("loading training data")
    with open(train_file) as f:
        for line in f:
            index += 1
            if index  % 2000 == 0:
                logging("%d cases" % index)
            arr = line.strip().split("\t")