Code Example #1
def get_data():
    df = readers.read_file('./data/work_experiences.dat', sep="::")
    rows = len(df)
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
    split_index = int(rows * 0.9)
    df_train = df[0:split_index]
    df_test = df[split_index:].reset_index(drop=True)
    return df_train, df_test
Code Example #2
File: news_server.py  Project: zyq11223/CapNews
def get_data():
    # Prepare training and testing data
    df = readers.read_file('../trainer/ml-1m/ratings.dat')
    rows = len(df)
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
    split_index = int(rows * 0.9)
    train_df = df[0:split_index]
    test_df = df[split_index:].reset_index(drop=True)
    return train_df, test_df
Code Example #3
def create_df(ratings_df=readers.read_file(FLAGS.data_file, sep="::")):
    """
    Use to create a trained DataFrame,all missing values in user-item table
    is filled here using SVD trained model
    INPUTS :
        ratings_df : rating dataframe, store all users rating for respective movies

    OUTPUT:
        Filled rating dataframe where user is row and item is col
    """
    if os.path.isfile("./user_item_table.pkl"):
        df=pd.read_pickle("user_item_table.pkl")
    else:
        df = ratings_df.pivot(index = 'item', columns ='user', values = 'rate').fillna(0)
        df.to_pickle("user_item_table.pkl")
    users = []
    items = []
    start = time.time()
    print("Start creating user-item dense table")
    total_movies = list(ratings_df.item.unique())
    for index in df.columns.tolist():
        # Collect every (user, movie) pair this user has not rated yet.
        rated_movies = list(ratings_df[ratings_df['user'] == index]
                            .drop(['st', 'user'], axis=1)['item'].values)
        unseen_movies = list(set(total_movies) - set(rated_movies))
        for movie in unseen_movies:
            users.append(index)
            items.append(movie)
    end = time.time()
    print("Found in %.2f seconds" % (end - start))
    del df
    # Variables come from the restored checkpoint, so no initializer is needed.
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    with tf.Session(config=session_conf) as sess:
        print("prediction started ...")
        # Restore the trained SVD model from its checkpoint; the with-block
        # closes the session automatically, so no explicit close is needed.
        new_saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_prefix))
        new_saver.restore(sess, tf.train.latest_checkpoint(FLAGS.checkpoint_dir))
        # Predict a rating for every unseen (user, item) pair in one batch.
        rated_list = sess.run(infer, feed_dict={user_batch: users,
                                                item_batch: items})
        rated_list = clip(rated_list)
        print("Done !!!")

    df_dict = {'user': users, 'item': items, 'rate': rated_list}
    df = (ratings_df.drop(['st'], axis=1)
          .append(pd.DataFrame(df_dict))
          .pivot(index='user', columns='item', values='rate')
          .fillna(0))
    df.to_pickle("user_item_table_train.pkl")
    return df
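
The clip helper used above is defined elsewhere in the project. Below is a minimal sketch of what it plausibly does, assuming predictions are bounded to the MovieLens 1-5 rating range (the bounds are an assumption, not shown in the source):

import numpy as np

def clip(x):
    # Assumed behavior: bound predicted ratings to the valid
    # MovieLens rating range [1.0, 5.0].
    return np.clip(x, 1.0, 5.0)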
Code Example #4
def get_data():
    # Reads the ratings file using the delimiter ::
    # Columns are user ID, item ID, rating, and timestamp
    # Sample data - 3::1196::4::978297539
    df = readers.read_file("./ml-1m/ratings.dat", sep="::")
    rows = len(df)
    # Purely integer-location based indexing for selection by position
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
    # Separate data into train and test, 90% for train and 10% for test
    split_index = int(rows * 0.9)
    # Use indices to separate the data
    df_train = df[0:split_index]
    df_test = df[split_index:].reset_index(drop=True)

    return df_train, df_test
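
These examples all rely on the project-local readers module, whose source is not shown here. A minimal sketch of what read_file might look like with pandas, based on the comments above; the column names user, item, rate, st match how the other examples index the DataFrame, but the exact signature and default separator are assumptions:

import pandas as pd

def read_file(filename, sep="::"):
    # Assumed implementation: parse a MovieLens-style ratings file into
    # the columns user, item, rate, st (timestamp) used in these examples.
    # engine='python' because pandas' C parser cannot handle the
    # multi-character separator "::".
    col_names = ["user", "item", "rate", "st"]
    return pd.read_csv(filename, sep=sep, header=None, names=col_names,
                       engine='python')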
Code Example #5
def get_data():
    # Reads the ratings file using the delimiter ::
    # Download movie lens data from: http://files.grouplens.org/datasets/movielens/ml-1m.zip
    # Columns are user ID, item ID, rating, and timestamp
    # Sample data - 3::1196::4::978297539
    print("Inside get data ...")
    df = readers.read_file(FLAGS.data_file, sep="::")
    rows = len(df)
    # Purely integer-location based indexing for selection by position
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
    # Separate data into train and test, 90% for train and 10% for test
    split_index = int(rows * 0.9)
    # Use indices to separate the data
    df_train = df[0:split_index]
    df_test = df[split_index:].reset_index(drop=True)
    # Build and cache the dense user-item table for later reuse
    df = df.pivot(index='item', columns='user', values='rate').fillna(0)
    df.to_pickle("user_item_table.pkl")
    print("Done !!!")
    return df_train, df_test, df.shape[0], df.shape[1]
Code Example #6
def get_data():
    # Reads the ratings file using the delimiter ::
    # Download movie lens data from: http://files.grouplens.org/datasets/movielens/ml-1m.zip
    # Columns are user ID, item ID, rating, and timestamp
    # Sample data - 3::1196::4::978297539
    print("Inside get data ...")
    df = readers.read_file(FLAGS.data_file, sep="::")
    rows = len(df)

    # Purely integer-location based indexing for selection by position
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)

    # Split data into train and test, 75% for train and 25% for test
    split_index = int(rows * 0.75)

    # Use indices to separate the data
    df_train = df[0:split_index]
    df_test = df[split_index:].reset_index(drop=True)

    print("Done !!!")
    print(df.shape)
    return df_train, df_test, df['user'].max(), df['item'].max()
Code Example #7

    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    with tf.Session(config=session_conf) as sess:
        print("prediction started ...")
        # Restore the trained model from its checkpoint; the with-block
        # closes the session automatically, so no explicit close is needed.
        new_saver = tf.train.import_meta_graph(
            "{}.meta".format(checkpoint_prefix))
        new_saver.restore(sess,
                          tf.train.latest_checkpoint(FLAGS.checkpoint_dir))
        # Predict a rating for every unseen (user, item) pair in one batch.
        rated_list = sess.run(infer,
                              feed_dict={
                                  user_batch: users,
                                  item_batch: items
                              })
        rated_list = clip(rated_list)
        print("Done !!!")

    df_dict = {'user': users, 'item': items, 'rate': rated_list}
    df = ratings_df.drop(['st'], axis=1).append(pd.DataFrame(df_dict)).pivot(
        index='user', columns='item', values='rate').fillna(0)
    df.to_pickle("user_item_table_train.pkl")
    return df


create_df(ratings_df=readers.read_file(FLAGS.data_file, sep="::"))
Code Example #8
import tensorflow as tf
import pandas as pd
import readers
import main
import kmean as km

df = pd.read_pickle("user_item_table_train.pkl")

ratings_df = readers.read_file("Input/ratings.dat", sep="::")

clusters, movies = km.k_mean_clustering(ratings_df=ratings_df, TRAINED=False)
cluster_df = pd.DataFrame({'movies': movies, 'clusters': clusters})
cluster_df.head(10)

main.top_k_similar_items(9, ratings_df=ratings_df, k=10, TRAINED=False)
cluster_df[cluster_df['movies'] == 1721]
cluster_df[cluster_df['movies'] == 1369]
cluster_df[cluster_df['movies'] == 164]
cluster_df[cluster_df['movies'] == 3081]
cluster_df[cluster_df['movies'] == 732]
cluster_df[cluster_df['movies'] == 348]
cluster_df[cluster_df['movies'] == 647]

# Pearson correlation between users. The first call to this user-similarity function takes a while; after that it responds in real time.
main.user_similarity(1, 345, ratings_df)

# Actual rating of a user vs. the expected (predicted) rating for that user
ratings_df.head()

main.user_rating(0, 1192)
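
main.user_similarity is part of the project and is not shown here. As a rough illustration of the Pearson correlation mentioned above, here is a self-contained sketch over the items two users have both rated; the function name and signature are hypothetical, not the project's actual API:

import numpy as np

def pearson_user_similarity(user_a, user_b, ratings_df):
    # Each user's ratings, indexed by item.
    a = ratings_df[ratings_df['user'] == user_a].set_index('item')['rate']
    b = ratings_df[ratings_df['user'] == user_b].set_index('item')['rate']
    # Correlate only over the items both users have rated.
    common = a.index.intersection(b.index)
    if len(common) < 2:
        return 0.0
    return np.corrcoef(a[common], b[common])[0, 1]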
Code Example #9
    changed = True
    iters = 0

    # Iterate until cluster assignments stop changing or MAX_ITERS is reached.
    while changed and iters < MAX_ITERS:
        iters += 1
        [changed, _] = sess.run([did_assignments_change, do_updates])

    [centers, assignments] = sess.run([centroids, cluster_assignments])
    end = time.time()
    print("Found in %.2f seconds" % (end - start), iters, "iterations")
    cluster_df = pd.DataFrame({'movies': df.index.values, 'clusters': assignments})
    cluster_df.to_csv("clusters.csv", index=True)
    return assignments, df.index.values

# Read the main file i.e. ratings.dat
ratings_df = readers.read_file(data_file, sep="::")
clusters, movies = k_mean_clustering(ratings_df, K=K, MAX_ITERS=MAX_ITERS, TRAINED=TRAINED)

user_item = pd.read_pickle("user_item_table.pkl")
cluster = pd.read_csv("clusters.csv", index_col=False)

user_item = user_item.T

# Project the user-item rows to 2D with PCA for plotting; fit once and
# reuse both components.
pcs = PCA(n_components=2, svd_solver='full')
coords = pcs.fit_transform(user_item)
cluster['x'] = coords[:, 0]
cluster['y'] = coords[:, 1]

fig = plt.figure()
ax = plt.subplot(111)
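
Code Example #9 enters mid-function, so the graph ops it runs (centroids, cluster_assignments, did_assignments_change, do_updates) are defined earlier in the project's kmean module. The sketch below shows how such a TF1 k-means graph is commonly wired, reusing the same op names; it is an illustration under those assumptions, not the project's actual code:

import tensorflow as tf

def build_kmeans_graph(points_np, K):
    # points_np: (N, D) array, e.g. the rows of the dense user-item table.
    points = tf.constant(points_np, dtype=tf.float32)
    N = points_np.shape[0]
    # Start from K randomly chosen points as the initial centroids.
    centroids = tf.Variable(tf.slice(tf.random_shuffle(points), [0, 0], [K, -1]))
    cluster_assignments = tf.Variable(tf.zeros([N], dtype=tf.int64))

    # Squared distance from every point to every centroid: shape (N, K).
    sq_dist = tf.reduce_sum(
        tf.square(tf.expand_dims(points, 1) - tf.expand_dims(centroids, 0)), 2)
    best = tf.argmin(sq_dist, 1)

    # True if any point switched cluster this iteration.
    did_assignments_change = tf.reduce_any(
        tf.not_equal(best, cluster_assignments))

    # New centroid = mean of the points currently assigned to it.
    means = tf.concat([
        tf.reduce_mean(
            tf.gather(points, tf.reshape(tf.where(tf.equal(best, c)), [-1])),
            0, keepdims=True)
        for c in range(K)], 0)
    with tf.control_dependencies([did_assignments_change]):
        do_updates = tf.group(centroids.assign(means),
                              cluster_assignments.assign(best))
    return centroids, cluster_assignments, did_assignments_change, do_updates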