Esempio n. 1
0
def main(argv):
    """Load the full Yelp dataset and dump a filtered subset for each city.

    Args:
        argv: command-line style argument list; every element after the
            first is treated as a target city and handed to
            ``filter_and_dump`` together with the loaded data.
    """
    target_cities = argv[1:]
    dirname = "yelp_dataset_challenge_academic_dataset/"
    users_filepath = dirname + "yelp_academic_dataset_user.json"
    businesses_filepath = dirname + "yelp_academic_dataset_business.json"
    reviews_filepath = dirname + "yelp_academic_dataset_review.json"

    # Each progress message names the file that is loaded right after it.
    print("loading " + businesses_filepath)
    business = loadData(businesses_filepath)
    print("loading " + users_filepath)
    user = loadData(users_filepath)
    print("loading " + reviews_filepath)
    review = loadData(reviews_filepath)

    # Category names passed to filter_and_dump as the restaurant filter.
    restaurant_category = set(['Food', 'Restaurants', 'Pizza', 'Coffee & Tea',
        'Sandwiches', 'Breakfast & Brunch', 'Fast Food', 'Bakeries'])
    for target_city in target_cities:
        filter_and_dump(business, user, review, target_city, restaurant_category, dirname)
Esempio n. 2
0
def main(argv):
    """Print all reviews of one hard-coded business, ordered by review date.

    Args:
        argv: command-line style argument list (currently unused).
    """
    reviews_filepath = "../data/yelp_academic_dataset_review.json"
    reviews = loadData(reviews_filepath)

    business_id = "vcNAWiLM4dR7D2nwwJ7nCA"
    target_reviews = [r for r in reviews if r["business_id"] == business_id]

    # Sort in place: sorted() returns a new list, so calling it without using
    # the result leaves the list unordered. Review records carry their
    # timestamp under "date" (NOTE(review): confirm against the dataset).
    target_reviews.sort(key=lambda r: r["date"])
    print(target_reviews)
def predict_stars_rw_ubcw(input_file_path, input_file_path1, output_file_path, output_file_path1, sign, threshold=None):
    """Predict business stars with personalized PageRank on a
    user / business / category / word graph.

    A directed graph is built from the Phoenix subset of the Yelp dataset and
    the word counts in "Phoenix.csv". For every test business a copy of the
    graph is made, review and word edges of that business are hidden (fully,
    or all but the first ``threshold`` reviews), PageRank is run with the
    whole restart mass on the test business, and the predicted rating is the
    PageRank-weighted mean of the stars of all other businesses.

    Args:
        input_file_path: text file with one test business id per line.
        input_file_path1: CSV with a "business_id" column and one count
            column per word. Always loaded, but only used when ``threshold``
            is not None (presumably counts from the first k reviews --
            TODO confirm with whoever produces the file).
        output_file_path: receives one "id<TAB>actual stars<TAB>predicted
            stars" line per test business.
        output_file_path1: CSV receiving one row (test node id, other
            business node id, PageRank score) per other business, highest
            score first.
        sign: which reviews become user<->business edges: "pos" (rating above
            the user's average), "neg" (below it) or "all".
        threshold: None to hide every review edge of the test business, or
            the number k of reviews (in ``compare_date`` order) to keep.

    Returns:
        None.
    """
    ##########################################
    ### Load data (business, user, word)   ###
    ##########################################
    print("load yelp dataset...")
    dirname = "yelp_dataset_challenge_academic_dataset/"
    business = loadData(dirname + "yelp_academic_dataset_business_Phoenix.json")
    user = loadData(dirname + "yelp_academic_dataset_user_Phoenix.json")
    review = loadData(dirname + "yelp_academic_dataset_review_Phoenix.json")

    print("load business features...")
    word = {}
    with open("Phoenix.csv", "rb") as csvfile:
        for row in csv.DictReader(csvfile):
            business_id = row.pop("business_id")
            word[business_id] = {k: int(v) for k, v in row.items()}

    word_topk = {}
    with open(input_file_path1, "rb") as csvfile:
        for row in csv.DictReader(csvfile):
            business_id = row.pop("business_id")
            counts = {}
            for k, v in row.items():
                # Keep only the part before a decimal point; blank cells count as 0.
                integer_part = v.strip().split(".")[0]
                counts[k] = int(integer_part) if integer_part else 0
            word_topk[business_id] = counts

    ##########################################
    ###           Build graph              ###
    ##########################################
    print("build graph...")
    G = nx.DiGraph()

    print("    add nodes...")
    # Node ids are prefixed with their kind: USER#, BUSINESS#, CATEGORY#, WORD#.
    G.add_nodes_from([("USER#" + u["user_id"], {"type": "user"}) for u in user])
    G.add_nodes_from([("BUSINESS#" + b["business_id"], {"type": "business", "stars": b["stars"]})
                      for b in business])

    category_set = set()
    for b in business:
        category_set.update(b["categories"])
    G.add_nodes_from([("CATEGORY#" + c, {"type": "category"}) for c in category_set])

    # The word vocabulary is taken from the columns of the first Phoenix.csv row.
    word_list = list(next(iter(word.values())))
    G.add_nodes_from([("WORD#" + w, {"type": "word"}) for w in word_list])

    print("    add edges...")
    # NOTE: all weights below use float division explicitly. Under Python 2
    # without `from __future__ import division`, `1 / n` would truncate every
    # weight with n > 1 to 0.

    # user<->business review edges, filtered by `sign`.
    # A user can write more than one review on a business; the duplicates are
    # counted in the normalisation and the last review's date wins.
    user_average_stars = {}
    for u in user:
        user_average_stars["USER#" + u["user_id"]] = u["average_stars"]

    user_business_edges = {}
    business_user_edges = {}
    for r in review:
        business_id = "BUSINESS#" + r["business_id"]
        user_id = "USER#" + r["user_id"]
        if ((sign == "pos" and r["stars"] > user_average_stars[user_id]) or
                (sign == "neg" and r["stars"] < user_average_stars[user_id]) or
                sign == "all"):
            user_business_edges.setdefault(user_id, []).append((business_id, r['date']))
            business_user_edges.setdefault(business_id, []).append((user_id, r['date']))

    for user_id, targets in user_business_edges.items():
        weight = 1.0 / len(targets)
        for business_id, date in targets:
            G.add_edge(user_id, business_id, weight=weight, date=date)

    for business_id, targets in business_user_edges.items():
        weight = 1.0 / len(targets)
        for user_id, date in targets:
            G.add_edge(business_id, user_id, weight=weight, date=date)

    # business->word (share of the business's word count), word->business (1).
    for b in business:
        business_id = "BUSINESS#" + b["business_id"]
        counts = word[b["business_id"]]
        total = sum(counts.values())
        if total <= 0:
            # No words recorded for this business: nothing to link.
            continue
        for k, v in counts.items():
            w = float(v) / total
            if w > 0:
                word_id = "WORD#" + k
                G.add_edge(business_id, word_id, weight=w)
                G.add_edge(word_id, business_id, weight=1)

    # business->category and category->business, each normalised by out-degree.
    category_business_edges = {}
    for b in business:
        business_id = "BUSINESS#" + b["business_id"]
        categories = b["categories"]
        for c in categories:
            category_id = "CATEGORY#" + c
            G.add_edge(business_id, category_id, weight=1.0 / len(categories))
            category_business_edges.setdefault(category_id, []).append(business_id)

    for category_id, members in category_business_edges.items():
        weight = 1.0 / len(members)
        for business_id in members:
            G.add_edge(category_id, business_id, weight=weight)

    # user1->user2 friendship edges, normalised by number of friends.
    for u in user:
        user1_id = "USER#" + u["user_id"]
        friends = u["friends"]
        for f in friends:
            G.add_edge(user1_id, "USER#" + f, weight=1.0 / len(friends))

    #################################################
    ###   Test (Set a business to "cold start")   ###
    #################################################
    print("read business stars...")
    business_stars = {}
    for b in business:
        business_stars["BUSINESS#" + b["business_id"]] = b["stars"]

    print("read test business list from " + input_file_path + "...")
    test_business_list = []
    with open(input_file_path) as f_r:
        for line in f_r:
            # Strip only the line terminator so a final line without a
            # trailing newline keeps its last character; skip blank lines.
            original_id = line.rstrip("\r\n")
            if original_id:
                test_business_list.append(original_id)

    # Run PageRank for test businesses; `with` closes both outputs on error too.
    with open(output_file_path1, 'w') as f_pr, open(output_file_path, "w") as f_w:
        csv_writer = csv.writer(f_pr)
        for test_business_original_id in test_business_list:
            test_business_id = "BUSINESS#" + test_business_original_id
            print("\ntest business: " + test_business_original_id)

            print("    create graph and remove review edges on test business...")
            G_new = nx.DiGraph(G)

            if threshold is not None:
                print("        keep only business-user edges for the first k reviews...")
                reviewers = [
                    (node_id, attrs['date'])
                    for node_id, attrs in G_new[test_business_id].items()
                    if node_id.startswith("USER#")
                ]
                reviewers = sorted(reviewers, cmp=lambda x, y: compare_date(x[1], y[1]))
                remove_edge_list = []
                # Everything after the first `threshold` entries is dropped.
                for user_id, _ in reviewers[max(threshold, 0):]:
                    remove_edge_list.append((user_id, test_business_id))
                    remove_edge_list.append((test_business_id, user_id))
                G_new.remove_edges_from(remove_edge_list)
            else:
                print("        remove all business-user edges on test business...")
                remove_edge_list = []
                for r in review:
                    if r["business_id"] == test_business_original_id:
                        user_id = "USER#" + r["user_id"]
                        remove_edge_list.append((user_id, test_business_id))
                        remove_edge_list.append((test_business_id, user_id))
                G_new.remove_edges_from(remove_edge_list)

            # Hide every business<->word edge of the test business. The
            # vocabulary is iterated once (not once per review, and not via a
            # leftover loop variable); edges that do not exist are ignored by
            # remove_edges_from.
            print("        remove all business-word edges on test business...")
            remove_edge_list = []
            for w in word_list:
                word_id = "WORD#" + w
                remove_edge_list.append((test_business_id, word_id))
                remove_edge_list.append((word_id, test_business_id))
            G_new.remove_edges_from(remove_edge_list)

            if threshold is not None:
                print("        add business-word edges for the first k reviews...")
                topk_counts = word_topk[test_business_original_id]
                total = sum(topk_counts.values())
                for k, v in topk_counts.items():
                    w = float(v) / total if total > 0 else 0
                    if w > 0:
                        word_id = "WORD#" + k
                        G_new.add_edge(test_business_id, word_id, weight=w)
                        G_new.add_edge(word_id, test_business_id, weight=1)

            # Personalization vector: all restart mass on the test business.
            personalization_dict = dict.fromkeys(G_new.nodes(), 0)
            personalization_dict[test_business_id] = 1

            print("    run PageRank...")
            pr = nx.pagerank(G_new, alpha=0.85, personalization=personalization_dict,
                             max_iter=300, tol=1e-06, nstart=None, weight='weight', dangling=None)

            print("    calculate and output results to " + output_file_path + "...")
            weighted_stars_sum = 0
            weight_sum = 0
            for node_id, score in pr.items():
                if node_id.startswith('BUSINESS#') and node_id != test_business_id:
                    weighted_stars_sum += score * business_stars[node_id]
                    weight_sum += score
            # If no other business received any PageRank mass, report NaN
            # instead of aborting the whole batch with ZeroDivisionError.
            if weight_sum > 0:
                predicted_stars = weighted_stars_sum / weight_sum
            else:
                predicted_stars = float("nan")

            result_line = (test_business_original_id + "\t" +
                           str(business_stars[test_business_id]) + "\t" +
                           str(predicted_stars))
            print(result_line)
            f_w.write(result_line + "\n")

            # Other businesses ranked by PageRank score, highest first.
            pr_list = sorted(pr.items(), key=lambda x: -x[1])
            for node_id, score in pr_list:
                if node_id.startswith('BUSINESS#') and node_id != test_business_id:
                    csv_writer.writerow([test_business_id, node_id, score])

    return
Esempio n. 4
0
k = 10000

# Per-star-rating weights of the form 1 / N.
# NOTE(review): the denominators look like per-rating business counts -- confirm.
# Float numerators force true division: under Python 2 without
# `from __future__ import division`, `1 / 11` would evaluate to 0.
w = {
    1.0: 1.0 / 11,
    1.5: 1.0 / 66,
    2.0: 1.0 / 146,
    2.5: 1.0 / 332,
    3.0: 1.0 / 564,
    3.5: 1.0 / 868,
    4.0: 1.0 / 880,
    4.5: 1.0 / 425,
    5.0: 1.0 / 97,
}

dirname = "yelp_dataset_challenge_academic_dataset/"
business_filename = dirname + "yelp_academic_dataset_business_Phoenix.json"
business = loadData(business_filename)

# Map "BUSINESS#<id>" node ids to the business's star rating.
business_stars = {}
for b in business:
    business_id = "BUSINESS#" + b["business_id"]
    business_stars[business_id] = b["stars"]

# Group the CSV rows (first column, second column, third column) by the first
# column. The third column is kept as the string read from the file.
# NOTE(review): input_file_path is not defined in this snippet; it must be
# set before this point.
test_business = {}
with open(input_file_path, 'r') as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
        b1 = row[0]
        b2 = row[1]
        pr = row[2]
        test_business.setdefault(b1, []).append((b2, pr))
Esempio n. 5
0
def predict_stars_rw_ubc(input_file_path, output_file_path, output_file_path1, sign, threshold=None, normalize_edge=True):
    """Predict business stars with personalized PageRank on a
    user / business / category graph.

    A directed graph is built from the Phoenix subset of the Yelp dataset.
    For every test business a copy of the graph is made, the review edges of
    that business are hidden (all of them, or all but the first ``threshold``
    reviews), PageRank is run with the whole restart mass on the test
    business, and the predicted rating is the PageRank-weighted mean of the
    stars of all other businesses.

    With ``normalize_edge`` the outgoing weight of each node is split by
    edge kind:
        business: 0.5 spread over its reviewers, 0.5 over its categories
        user:     0.5 spread over reviewed businesses, 0.5 over friends
        category: 1.0 spread over its businesses
    Without it every edge has weight 1.

    Args:
        input_file_path: text file with one test business id per line.
        output_file_path: receives one "id<TAB>actual stars<TAB>predicted
            stars" line per test business.
        output_file_path1: CSV receiving one row (test node id, other
            business node id, PageRank score) per other business, highest
            score first.
        sign: which reviews become user<->business edges: "pos" (rating above
            the user's average), "neg" (below it) or "all".
        threshold: None to hide every review edge of the test business, or
            the number k of reviews (in ``compare_date`` order) to keep.
        normalize_edge: whether to normalise edge weights as described above.
            Defaults to True, which was previously the only reachable mode.
    """
    ##########################################
    ### Load data (business, user, review) ###
    ##########################################
    print("load yelp dataset...")
    dirname = "yelp_dataset_challenge_academic_dataset/"
    business = loadData(dirname + "yelp_academic_dataset_business_Phoenix.json")
    user = loadData(dirname + "yelp_academic_dataset_user_Phoenix.json")
    review = loadData(dirname + "yelp_academic_dataset_review_Phoenix.json")

    ##########################################
    ###           Build graph              ###
    ##########################################
    print("build graph...")
    G = nx.DiGraph()

    print("    add nodes...")
    # Node ids are prefixed with their kind: USER#, BUSINESS#, CATEGORY#.
    G.add_nodes_from([("USER#" + u["user_id"], {"type": "user"}) for u in user])
    G.add_nodes_from([("BUSINESS#" + b["business_id"], {"type": "business", "stars": b["stars"]})
                      for b in business])

    category_set = set()
    for b in business:
        category_set.update(b["categories"])
    G.add_nodes_from([("CATEGORY#" + c, {"type": "category"}) for c in category_set])

    print("    add edges...")
    if normalize_edge:
        print("    (normalize edges: YES)")
    else:
        print("    (normalize edges: NO)")

    # business->category, category->business
    category_business_edges = {}
    for b in business:
        business_id = "BUSINESS#" + b["business_id"]
        categories = b["categories"]
        for c in categories:
            category_id = "CATEGORY#" + c
            weight = 0.5 / len(categories) if normalize_edge else 1
            G.add_edge(business_id, category_id, weight=weight)
            category_business_edges.setdefault(category_id, []).append(business_id)

    for category_id, members in category_business_edges.items():
        weight = 1.0 / len(members) if normalize_edge else 1
        for business_id in members:
            G.add_edge(category_id, business_id, weight=weight)

    # user<->business review edges, filtered by `sign`.
    # A user can write more than one review on a business; the duplicates are
    # counted in the normalisation and the last review's date wins.
    user_average_stars = {}
    for u in user:
        user_average_stars["USER#" + u["user_id"]] = u["average_stars"]

    user_business_edges = {}
    business_user_edges = {}
    for r in review:
        user_id = "USER#" + r["user_id"]
        business_id = "BUSINESS#" + r["business_id"]
        if ((sign == 'pos' and r["stars"] > user_average_stars[user_id])
                or (sign == 'neg' and r["stars"] < user_average_stars[user_id])
                or sign == 'all'):
            user_business_edges.setdefault(user_id, []).append((business_id, r['date']))
            business_user_edges.setdefault(business_id, []).append((user_id, r['date']))

    for user_id, targets in user_business_edges.items():
        weight = 0.5 / len(targets) if normalize_edge else 1
        for business_id, date in targets:
            G.add_edge(user_id, business_id, weight=weight, date=date)

    for business_id, targets in business_user_edges.items():
        weight = 0.5 / len(targets) if normalize_edge else 1
        for user_id, date in targets:
            G.add_edge(business_id, user_id, weight=weight, date=date)

    # user1->user2 friendship edges
    for u in user:
        user1_id = "USER#" + u["user_id"]
        friends = u["friends"]
        for f in friends:
            weight = 0.5 / len(friends) if normalize_edge else 1
            G.add_edge(user1_id, "USER#" + f, weight=weight)

    #################################################
    ###   Test (Set a business to "cold start")   ###
    #################################################
    print("read business stars...")
    business_stars = {}
    for b in business:
        business_stars["BUSINESS#" + b["business_id"]] = b["stars"]

    print("read test business list from " + input_file_path + "...")
    test_business_list = []
    with open(input_file_path) as f_r:
        for line in f_r:
            # Strip only the line terminator so a final line without a
            # trailing newline keeps its last character; skip blank lines.
            original_id = line.rstrip("\r\n")
            if original_id:
                test_business_list.append(original_id)

    # Run PageRank for test businesses; `with` closes both outputs on error too.
    with open(output_file_path1, 'w') as f_pr, open(output_file_path, "w") as f_w:
        csv_writer = csv.writer(f_pr)
        for test_business_original_id in test_business_list:
            test_business_id = "BUSINESS#" + test_business_original_id
            print("\ntest business: " + test_business_original_id)

            print("    create new graph")
            G_new = nx.DiGraph(G)

            remove_edge_list = []
            if threshold is not None:
                print("        keep the first k reviews on test business...")
                reviewers = [
                    (node_id, attrs['date'])
                    for node_id, attrs in G_new[test_business_id].items()
                    if node_id.startswith("USER#")
                ]
                reviewers = sorted(reviewers, cmp=lambda x, y: compare_date(x[1], y[1]))
                # Everything after the first `threshold` entries is dropped.
                for user_id, _ in reviewers[max(threshold, 0):]:
                    remove_edge_list.append((user_id, test_business_id))
                    remove_edge_list.append((test_business_id, user_id))
            else:
                print("        remove all review edges on test business...")
                for r in review:
                    if r["business_id"] == test_business_original_id:
                        user_id = "USER#" + r["user_id"]
                        remove_edge_list.append((user_id, test_business_id))
                        remove_edge_list.append((test_business_id, user_id))
            G_new.remove_edges_from(remove_edge_list)

            # Personalization vector: all restart mass on the test business.
            personalization_dict = dict.fromkeys(G_new.nodes(), 0)
            personalization_dict[test_business_id] = 1

            print("    run PageRank...")
            pr = nx.pagerank(G_new, alpha=0.85, personalization=personalization_dict,
                             max_iter=300, tol=1e-06, nstart=None, weight='weight', dangling=None)

            print("    calculate and output results to " + output_file_path + "...")
            weighted_stars_sum = 0
            weight_sum = 0
            for node_id, score in pr.items():
                if node_id.startswith('BUSINESS#') and node_id != test_business_id:
                    weighted_stars_sum += score * business_stars[node_id]
                    weight_sum += score
            # If no other business received any PageRank mass, report NaN
            # instead of aborting the whole batch with ZeroDivisionError.
            if weight_sum > 0:
                predicted_stars = weighted_stars_sum / weight_sum
            else:
                predicted_stars = float("nan")

            result_line = (test_business_original_id + "\t" +
                           str(business_stars[test_business_id]) + "\t" +
                           str(predicted_stars))
            print(result_line)
            f_w.write(result_line + "\n")

            # Other businesses ranked by PageRank score, highest first.
            pr_list = sorted(pr.items(), key=lambda x: -x[1])
            for node_id, score in pr_list:
                if node_id.startswith('BUSINESS#') and node_id != test_business_id:
                    csv_writer.writerow([test_business_id, node_id, score])
Esempio n. 6
0
def predict_stars_rw_ubw(input_file_path, output_file_path, output_file_path1='output.phoenix.dev.ubw.pr.txt'):
    """Predict business stars with personalized PageRank on a
    user / business / word graph.

    A directed graph is built from the Phoenix subset of the Yelp dataset and
    the word counts in "Phoenix.csv". For every test business a copy of the
    graph is made, all review edges of that business are removed, PageRank is
    run with the whole restart mass on the test business, and the predicted
    rating is the PageRank-weighted mean of the stars of all other businesses.

    In addition to the two output files named by the arguments, a ranking
    file "output_word/output.<business id>.txt" is written per test business
    with "business id, PageRank score, stars" lines; the "output_word"
    directory is not created here.

    Args:
        input_file_path: text file with one test business id per line.
        output_file_path: receives one "id<TAB>actual stars<TAB>predicted
            stars" line per test business.
        output_file_path1: CSV receiving one row (test node id, other
            business node id, PageRank score) per other business, highest
            score first. Defaults to the path that used to be hard-coded.

    Returns:
        None.
    """
    ##########################################
    ### Load data (business, user, word)   ###
    ##########################################
    print("load yelp dataset...")
    dirname = "yelp_dataset_challenge_academic_dataset/"
    business = loadData(dirname + "yelp_academic_dataset_business_Phoenix.json")
    user = loadData(dirname + "yelp_academic_dataset_user_Phoenix.json")
    review = loadData(dirname + "yelp_academic_dataset_review_Phoenix.json")

    word = {}
    with open("Phoenix.csv", "rb") as csvfile:
        for row in csv.DictReader(csvfile):
            business_id = row.pop("business_id")
            word[business_id] = {k: int(v) for k, v in row.items()}

    ##########################################
    ###           Build graph              ###
    ##########################################
    print("build graph...")
    G = nx.DiGraph()

    print("    add nodes...")
    # Node ids are prefixed with their kind: USER#, BUSINESS#, WORD#.
    G.add_nodes_from([("USER#" + u["user_id"], {"type": "user"}) for u in user])
    G.add_nodes_from([("BUSINESS#" + b["business_id"], {"type": "business", "stars": b["stars"]})
                      for b in business])

    # The word vocabulary is taken from the columns of the first Phoenix.csv row.
    word_list = list(next(iter(word.values())))
    G.add_nodes_from([("WORD#" + w, {"type": "word"}) for w in word_list])

    print("    add edges...")
    # NOTE: all weights below use float division explicitly. Under Python 2
    # without `from __future__ import division`, `1 / n` would truncate every
    # weight with n > 1 to 0.

    # Review counts per business and per user. Every review is counted, also
    # those that do not become an edge below.
    business_review_number = {}
    user_review_number = {}
    for r in review:
        business_id = "BUSINESS#" + r["business_id"]
        user_id = "USER#" + r["user_id"]
        business_review_number[business_id] = business_review_number.get(business_id, 0) + 1
        user_review_number[user_id] = user_review_number.get(user_id, 0) + 1

    user_average_stars = {}
    for u in user:
        user_average_stars["USER#" + u["user_id"]] = u["average_stars"]

    # Positive user<->business review edges (rating above the user's average).
    # A user can write more than one review on a business.
    for r in review:
        user_id = "USER#" + r["user_id"]
        business_id = "BUSINESS#" + r["business_id"]
        if r["stars"] > user_average_stars[user_id]:
            G.add_edge(user_id, business_id, weight=1.0 / user_review_number[user_id])
            G.add_edge(business_id, user_id, weight=1.0 / business_review_number[business_id])

    # business->word (share of the business's word count), word->business (1).
    for b in business:
        business_id = "BUSINESS#" + b["business_id"]
        counts = word[b["business_id"]]
        total = sum(counts.values())
        if total <= 0:
            # No words recorded for this business: nothing to link.
            continue
        for k, v in counts.items():
            w = float(v) / total
            if w > 0:
                word_id = "WORD#" + k
                G.add_edge(business_id, word_id, weight=w)
                G.add_edge(word_id, business_id, weight=1)

    # user1->user2 friendship edges, normalised by number of friends.
    for u in user:
        user1_id = "USER#" + u["user_id"]
        friends = u["friends"]
        for f in friends:
            G.add_edge(user1_id, "USER#" + f, weight=1.0 / len(friends))

    #################################################
    ###   Test (Set a business to "cold start")   ###
    #################################################
    print("read business stars...")
    business_stars = {}
    for b in business:
        business_stars["BUSINESS#" + b["business_id"]] = b["stars"]

    print("read test business list from " + input_file_path + "...")
    test_business_list = []
    with open(input_file_path) as f_r:
        for line in f_r:
            # Strip only the line terminator so a final line without a
            # trailing newline keeps its last character; skip blank lines.
            original_id = line.rstrip("\r\n")
            if original_id:
                test_business_list.append(original_id)

    # Run PageRank for test businesses; `with` closes both outputs on error too.
    with open(output_file_path1, 'w') as f_pr, open(output_file_path, "w") as f_w:
        csv_writer = csv.writer(f_pr)
        for test_business_original_id in test_business_list:
            test_business_id = "BUSINESS#" + test_business_original_id
            print("\ntest business: " + test_business_original_id)

            print("    create graph and remove review edges on test business...")
            G_new = nx.DiGraph(G)
            remove_edge_list = []
            for r in review:
                if r["business_id"] == test_business_original_id:
                    user_id = "USER#" + r["user_id"]
                    remove_edge_list.append((user_id, test_business_id))
                    remove_edge_list.append((test_business_id, user_id))
            G_new.remove_edges_from(remove_edge_list)

            # Personalization vector: all restart mass on the test business.
            personalization_dict = dict.fromkeys(G_new.nodes(), 0)
            personalization_dict[test_business_id] = 1

            print("    run PageRank...")
            pr = nx.pagerank(G_new, alpha=0.85, personalization=personalization_dict,
                             max_iter=300, tol=1e-06, nstart=None, weight='weight', dangling=None)

            print("    calculate and output results to " + output_file_path + "...")
            # Other businesses ranked by PageRank score, highest first; the
            # ranking is computed once and reused for both ranking outputs.
            ranked_businesses = [
                (node_id, score)
                for node_id, score in sorted(pr.items(), key=lambda x: -x[1])
                if node_id.startswith('BUSINESS#') and node_id != test_business_id
            ]

            with open("output_word/output." + test_business_original_id + ".txt", "w") as outfile:
                for node_id, score in ranked_businesses:
                    outfile.write(node_id.replace('BUSINESS#', '') + ", " + str(score) + ", " +
                                  str(business_stars[node_id]) + '\n')

            weighted_stars_sum = 0
            weight_sum = 0
            for node_id, score in ranked_businesses:
                weighted_stars_sum += score * business_stars[node_id]
                weight_sum += score
            # If no other business received any PageRank mass, report NaN
            # instead of aborting the whole batch with ZeroDivisionError.
            if weight_sum > 0:
                predicted_stars = weighted_stars_sum / weight_sum
            else:
                predicted_stars = float("nan")

            result_line = (test_business_original_id + "\t" +
                           str(business_stars[test_business_id]) + "\t" +
                           str(predicted_stars))
            print(result_line)
            f_w.write(result_line + "\n")

            for node_id, score in ranked_businesses:
                csv_writer.writerow([test_business_id, node_id, score])

    return