Esempio n. 1
0
def get_bipartite_graph(NUM_REVIEWS=3000):
    """Build a directed business -> user graph from review data.

    For every loaded review whose user and business are both known, an edge
    is added from the business object to the user object.

    Args:
        NUM_REVIEWS: Passed to ``load_data.load_objects("review", ...)``;
            presumably caps how many reviews are loaded (confirm in
            ``load_data``).

    Returns:
        An ``nx.DiGraph`` whose nodes are the business and user objects.
    """
    # NOTE(review): the timing output relies on a module-level `start` value
    # and on time.clock(), neither of which is defined in this function.
    businesses = load_data.load_objects("business")
    print("businesses loaded: " + str(time.clock() - start))
    users = load_data.load_objects("user")
    print("users loaded: " + str(time.clock() - start))
    reviews = load_data.load_objects("review", NUM_REVIEWS)
    print("reviews loaded: " + str(time.clock() - start))

    business_dict = dict((b.business_id, b) for b in businesses)
    user_dict = dict((u.user_id, u) for u in users)

    print("dicts loaded: " + str(time.clock() - start))

    G = nx.DiGraph()

    for index, r in enumerate(reviews):
        if index % 100 == 0:
            print(index)
        # Both lookups must use the ids of the review itself; reviews that
        # point at an unknown user or business are skipped.
        if r.user_id not in user_dict or r.business_id not in business_dict:
            continue
        G.add_edge(business_dict[r.business_id], user_dict[r.user_id])

    print("graph fully loaded: " + str(time.clock() - start))
    return G
Esempio n. 2
0
def get_temporal_business_graph():
    """Build a directed graph of consecutive restaurant reviews per user.

    Reviews of businesses whose categories include "Restaurants" are grouped
    by user and ordered by their ``date`` attribute.  For each pair of
    consecutive reviews by the same user, the edge from the earlier business
    to the later business has its "weight" incremented by one.

    Returns:
        An ``nx.DiGraph`` whose nodes are business objects and whose edges
        carry an integer "weight" attribute.
    """
    businesses = load_data.load_objects("business")
    reviews = load_data.load_objects("review", 10000)

    business_dict = dict((b.business_id, b) for b in businesses)

    # user_id -> [review, ...], restricted to restaurant reviews
    review_dict = {}
    for review in reviews:
        business = business_dict.get(review.business_id)
        # Reviews of unknown businesses are skipped instead of raising.
        if business is None or "Restaurants" not in business.categories:
            continue
        review_dict.setdefault(review.user_id, []).append(review)

    G = nx.DiGraph()

    for review_list in review_dict.values():
        # NOTE(review): sorts on the raw `date` attribute; this is
        # chronological only if dates compare in time order - confirm.
        review_list.sort(key=lambda review: review.date)
        for earlier, later in zip(review_list, review_list[1:]):
            b1 = business_dict[earlier.business_id]
            b2 = business_dict[later.business_id]
            # G[u][v] is the edge-attribute dict in every NetworkX version.
            if G.has_edge(b1, b2):
                G[b1][b2]["weight"] += 1
            else:
                G.add_edge(b1, b2, weight=1)

    print("graph created")
    return G
Esempio n. 3
0
def get_temporal_business_graph():
    """Return a weighted digraph of which restaurant users review next.

    Only reviews of businesses with 'Restaurants' among their categories are
    used.  Each user's reviews are ordered by ``date``; every adjacent pair
    (previous, next) adds 1 to the 'weight' of the edge previous -> next.

    Returns:
        An ``nx.DiGraph`` with business objects as nodes and an integer
        'weight' attribute on each edge.
    """
    businesses = load_data.load_objects("business")
    reviews = load_data.load_objects("review", 10000)

    business_dict = {}
    for b in businesses:
        business_dict[b.business_id] = b

    # create dictionary of user_id -> [review, ...]
    review_dict = {}
    for review in reviews:
        # A review whose business is not loaded cannot be categorised; skip.
        if review.business_id not in business_dict:
            continue
        if 'Restaurants' not in business_dict[review.business_id].categories:
            continue
        review_dict.setdefault(review.user_id, []).append(review)

    G = nx.DiGraph()

    for user_reviews in review_dict.values():
        # NOTE(review): ordering uses the raw `date` values; presumably these
        # sort chronologically - verify against the data format.
        ordered = sorted(user_reviews, key=lambda review: review.date)
        for i in range(len(ordered) - 1):
            b1 = business_dict[ordered[i].business_id]
            b2 = business_dict[ordered[i + 1].business_id]
            if G.has_edge(b1, b2):
                # Subscript access works where the old G.edge attribute
                # is no longer available.
                G[b1][b2]['weight'] += 1
            else:
                G.add_edge(b1, b2, weight=1)

    print("graph created")
    return G
def run():
    """Rank words by how strongly they indicate a star rating.

    Builds a directed word -> rating graph from restaurant reviews, where the
    edge weight counts the reviews with that rating that contain the word
    (each word counted at most once per review).  For every word, the rating
    with the largest weight is selected and an indicator score is computed:
    that weight minus, for every rating the word occurs with,
    ``abs(rating - best_rating) * weight``.

    Returns:
        A list of ``(word, indicator, max_stars)`` tuples sorted by
        ``indicator`` in descending order.
    """
    businesses = load_data.load_objects("business")
    business_dict = dict((b.business_id, b) for b in businesses)
    reviews = load_data.load_objects("review", 25000)

    G = nx.DiGraph()

    for review in reviews:
        b = business_dict.get(review.business_id)
        # Skip reviews of unknown or non-restaurant businesses.
        if b is None or 'Restaurants' not in b.categories:
            continue
        text = re.sub('[^a-zA-Z0-9\n]', ' ', review.text)
        seen = set()  # only count one word occurrence per review
        # split() without arguments breaks on any whitespace run, so empty
        # strings and newline-joined tokens never become "words".
        for word in text.split():
            word = word.lower()
            if word in seen:
                continue
            seen.add(word)
            if G.has_edge(word, review.stars):
                G[word][review.stars]['weight'] += 1
            else:
                G.add_edge(word, review.stars, weight=1)

    indicators = []
    for word in G:
        if G.out_degree(word) == 0:  # rating nodes have no outgoing edges
            continue
        weights = [(stars, G[word][stars]['weight'])
                   for stars in G.neighbors(word)]
        # find which rating this word is most indicative of
        # (max() keeps the first entry on ties)
        max_stars, max_weight = max(weights, key=lambda pair: pair[1])
        # find how indicative this word is of that rating (and ratings nearby)
        indicator = max_weight
        for stars, weight in weights:
            indicator -= abs(stars - max_stars) * weight
        indicators.append((word, indicator, max_stars))

    # sort by best indicators
    indicators.sort(key=lambda entry: entry[1], reverse=True)

    return indicators
def run():
    """Score each review word by the star rating it most often appears with.

    A directed graph is built with an edge word -> rating for every
    restaurant review containing the word; the edge 'weight' is the number of
    such reviews (a word is counted once per review).  The score of a word is
    the weight of its heaviest rating edge, reduced by
    ``abs(rating - heaviest_rating) * weight`` over all of its rating edges.

    Returns:
        List of ``(word, indicator, max_stars)`` tuples, highest indicator
        first.
    """
    businesses = load_data.load_objects("business")
    business_dict = {}
    for b in businesses:
        business_dict[b.business_id] = b
    reviews = load_data.load_objects("review", 25000)

    G = nx.DiGraph()

    for review in reviews:
        # Reviews of businesses that were not loaded are ignored.
        if review.business_id not in business_dict:
            continue
        if 'Restaurants' not in business_dict[review.business_id].categories:
            continue
        text = re.sub('[^a-zA-Z0-9\n]', ' ', review.text)
        words_used = set()  # only count one word occurrence per review
        # Whitespace split: no empty tokens, and newlines separate words too.
        for token in text.split():
            word = token.lower()
            if word in words_used:
                continue
            words_used.add(word)
            if not G.has_edge(word, review.stars):
                G.add_edge(word, review.stars, weight=1)
            else:
                G[word][review.stars]['weight'] += 1

    indicators = []
    for word in G:
        if G.out_degree(word) > 0:  # is a word (ratings have no out-edges)
            max_stars, max_weight = 0, 0
            rated = list(G.neighbors(word))
            # find which rating this word is most indicative of
            for stars in rated:
                weight = G[word][stars]['weight']
                if weight > max_weight:
                    max_stars, max_weight = stars, weight
            # find how indicative this word is of that rating (and ratings nearby)
            indicator = max_weight
            for stars in rated:
                indicator -= abs(stars - max_stars) * G[word][stars]['weight']
            indicators.append((word, indicator, max_stars))

    # sort by best indicators
    return sorted(indicators, key=lambda item: item[1], reverse=True)
Esempio n. 6
0
def get_graph(NUM_REVIEWS=30000):
    """Build an undirected graph linking restaurants that share a reviewer.

    Two businesses are connected when at least one user has reviewed both.
    Only reviews of businesses with 'Restaurants' among their categories are
    considered.

    Args:
        NUM_REVIEWS: Passed to ``load_data.load_objects("review", ...)``;
            presumably the number of reviews to load (confirm in
            ``load_data``).

    Returns:
        An ``nx.Graph`` whose nodes are business objects.
    """
    businesses = load_data.load_objects("business")
    business_dict = dict((b.business_id, b) for b in businesses)
    reviews = load_data.load_objects("review", NUM_REVIEWS)

    # first: user_id -> set of restaurant business ids that user reviewed.
    # A set ensures a user who reviews a business twice counts once.
    first = {}
    for index, review in enumerate(reviews):
        if index % 1000 == 0:
            print(index)
        business = business_dict.get(review.business_id)
        if business is None or 'Restaurants' not in business.categories:
            continue
        first.setdefault(review.user_id, set()).add(review.business_id)

    print("number of users: " + str(len(first)))

    # second: (b1, b2) with b1 < b2 -> number of users who reviewed both.
    # Ordered tuple keys make (a, b) and (b, a) the same pair.
    second = {}
    for rated in first.values():
        ordered = sorted(rated)
        for i, b1 in enumerate(ordered):
            for b2 in ordered[i + 1:]:
                pair = (b1, b2)
                second[pair] = second.get(pair, 0) + 1

    print("number of business pairs: " + str(len(second)))

    G = nx.Graph()

    for (b1_id, b2_id), shared_users in second.items():
        if shared_users > 1:
            print(b1_id + '_AND_' + b2_id)
        G.add_edge(business_dict[b1_id], business_dict[b2_id])

    return G
Esempio n. 7
0
def get_graph(NUM_REVIEWS=30000):
    """Return a graph of restaurants connected by common reviewers.

    An edge joins two business objects whenever some user reviewed both of
    them; only businesses whose categories include 'Restaurants' take part.

    Args:
        NUM_REVIEWS: Forwarded as the second argument of
            ``load_data.load_objects("review", ...)`` - presumably a limit on
            the reviews read; verify against ``load_data``.

    Returns:
        An undirected ``nx.Graph`` with business objects as nodes.
    """
    businesses = load_data.load_objects("business")
    business_dict = {}
    for b in businesses:
        business_dict[b.business_id] = b
    reviews = load_data.load_objects("review", NUM_REVIEWS)

    # first, create a mapping of user -> {businesses rated}
    first = {}
    for index, review in enumerate(reviews):
        if index % 1000 == 0:
            print(index)
        if review.business_id not in business_dict:
            continue  # business was not loaded; nothing to link
        if 'Restaurants' not in business_dict[review.business_id].categories:
            continue
        if review.user_id not in first:
            first[review.user_id] = set()
        # set semantics: repeated reviews of one business count once
        first[review.user_id].add(review.business_id)

    print("number of users: " + str(len(first)))

    # second, create a mapping of (b1, b2) -> number of users rating both;
    # the tuple is kept in sorted order so each pair has exactly one key
    second = {}
    for user_id in first:
        business_ids = sorted(first[user_id])
        for position, b1 in enumerate(business_ids):
            for b2 in business_ids[position + 1:]:
                second[(b1, b2)] = second.get((b1, b2), 0) + 1

    print("number of business pairs: " + str(len(second)))

    G = nx.Graph()

    for pair, num_users in second.items():
        if num_users > 1:
            print('_AND_'.join(pair))
        b1_id, b2_id = pair
        G.add_edge(business_dict[b1_id], business_dict[b2_id])

    return G
Esempio n. 8
0
def run():

    businesses = load_data.load_objects("business")
    users = load_data.load_objects("user")
    reviews = load_data.load_objects("review", 100000)

    business_dict = {}
    for b in businesses:
        business_dict[b.business_id] = b
    user_dict = {}
    for u in users:
        user_dict[u.user_id] = u

    # user gave 4.5, usually gives 3
    b_dict = {}  # business -> (total_reviewer_plus_or_minus, num_reviews_seen)
    for review in reviews:
        if review.business_id not in business_dict or review.user_id not in user_dict:
            continue
        b = business_dict[review.business_id]
        u = user_dict[review.user_id]
        diff = review.stars - u.average_stars
        if b in b_dict:
            b_dict[b] = (b_dict[b][0] + diff, b_dict[b][1] + 1)
        else:
            b_dict[b] = (diff, 1)

    normalized_businesses = []
    for b in b_dict:
        diff, count = b_dict[b]
        if count > 1:
            new_rating = b.stars + diff
            normalized_businesses += [(b, new_rating, count)]

    normalized_businesses = sorted(normalized_businesses,
                                   key=lambda t: t[1],
                                   reverse=True)

    print "\n\nunderrated businesses"
    for b, rating, count in normalized_businesses[:20]:
        print '{:<80}'.format(
            str(b)
        ), "\t\t\t", rating, "\t\t\t", count, "/", b.review_count, "\t\t\t", b.stars

    print "\n\noverrated businesses"
    for b, rating, count in normalized_businesses[-20:]:
        print '{:<80}'.format(
            str(b)
        ), "\t\t\t", rating, "\t\t\t", count, "/", b.review_count, "\t\t\t", b.stars
Esempio n. 9
0
def run():

    businesses = load_data.load_objects("business")
    users = load_data.load_objects("user")
    reviews = load_data.load_objects("review", 100000)

    business_dict = {}
    for b in businesses:
        business_dict[b.business_id] = b
    user_dict = {}
    for u in users:
        user_dict[u.user_id] = u

    # user gave 4.5, usually gives 3
    b_dict = {} # business -> (total_reviewer_plus_or_minus, num_reviews_seen)
    for review in reviews:
        if review.business_id not in business_dict or review.user_id not in user_dict:
            continue
        b = business_dict[review.business_id]
        u = user_dict[review.user_id]
        diff = review.stars - u.average_stars
        if b in b_dict:
            b_dict[b] = (b_dict[b][0] + diff, b_dict[b][1] + 1)
        else:
            b_dict[b] = (diff, 1)

    normalized_businesses = []
    for b in b_dict:
        diff, count = b_dict[b]
        if count > 1:
            new_rating = b.stars + diff
            normalized_businesses += [(b, new_rating, count)]

    normalized_businesses = sorted(normalized_businesses, key=lambda t: t[1], reverse=True)


    print "\n\nunderrated businesses"
    for b, rating, count in normalized_businesses[:20]:
        print '{:<80}'.format(str(b)), "\t\t\t", rating, "\t\t\t", count, "/", b.review_count, "\t\t\t", b.stars

    print "\n\noverrated businesses"
    for b, rating, count in normalized_businesses[-20:]:
        print '{:<80}'.format(str(b)), "\t\t\t", rating, "\t\t\t", count, "/", b.review_count, "\t\t\t", b.stars
def aggregate_objects(data_flags, skip_missing=False):
    """
    Aggregate CS objects from separate .pklz files to a single .pklz file.

    For each data flag, every index combination of the iterated variables
    listed in the specs file is loaded in turn and the results are saved as
    one list.

    Args:
        data_flags: Identifier, or list of identifiers, for saving and
            loading.
        skip_missing: If true, an index whose load raises IOError/OSError is
            stored as ``None`` instead of aborting the aggregation.
    """

    if skip_missing:
        print("Skipping missing files...will populate with `None`")

    if isinstance(data_flags, str):
        data_flags = [data_flags]

    for data_flag in data_flags:
        list_dict = read_specs_file(data_flag)
        iter_vars = list_dict['iter_vars']
        iter_vars_dims = [len(iter_vars[name]) for name in iter_vars]
        # The zeros array is only a vehicle for walking every multi-index.
        it = sp.nditer(sp.zeros(iter_vars_dims), flags=['multi_index'])

        obj_list = []
        while not it.finished:
            index = list(it.multi_index)
            sys.stdout.flush()
            print(it.multi_index)
            try:
                CS_obj = load_objects(index, data_flag)
            except (IOError, OSError):
                if not skip_missing:
                    raise
                print('Skipping item %s...' % index)
                CS_obj = None

            obj_list.append(CS_obj)
            it.iternext()

        save_aggregated_object_list(obj_list, data_flag)
'''
generate reviews for a given 
'''

from objects import *
from networkx import *
from helpers import *
import load_data
import re, random, unicodedata

NUM_STATES = 2 # number of previous words to use as state
NUM_REVIEWS = 100000
PUNCTUATION = ['.', '!', '?']

# Load the raw objects once at import time.
businesses = load_data.load_objects("business")
users = load_data.load_objects("user")
reviews = load_data.load_objects("review", NUM_REVIEWS)
print(" ".join([str(NUM_REVIEWS), "reviews loaded"]))

# Index businesses and users by their ids for direct lookup.
business_dict = dict((b.business_id, b) for b in businesses)
user_dict = dict((u.user_id, u) for u in users)

# creates a text file with the appropriate reviews:
def make_reviews_file(stars=None, min_stars=None, max_stars=None, category=None):
    F_NAME = 'reviews_file'
    f = open(F_NAME, 'w')
    num_reviews = 0
Esempio n. 12
0
'''
A matrix of full connectivity of businesses where
the edge weights are the distance between two businesses
'''

import load_data
import math, numpy as np
from helpers import *

#1 degree of latitude is approx 69 miles
n = 500
pre_businesses = load_data.load_objects("business", n)

businesses=[]
for b in pre_businesses:
  if 'Restaurants' in b.categories:
    businesses += [b]

n = len(businesses)
for b in businesses:
  print "rating for ",str(b), ":", b.stars, ", # reviews: ", b.review_count

#users = load_data.load_objects("user", 5000)
#reviews = load_data.load_objects("review", 50000)
A = []
print 'starting script'
for i in range(n):
  bus = businesses[i]
  dists = []
  for j in range(n):
    if (i==j):
def aggregate_temporal_entropy_objects(data_flags):
    """
    Aggregate CS objects from separate .pklz files of temporal runs to a
    single .pklz object.

    For each data flag, one array per temporal structure (currently only
    'entropy') is filled with shape
    (num timesteps, iterated var ranges, variable shape), and the object at
    time 0 of every run is kept under 'init_objs'.

    Args:
        data_flags: Identifier, or list of identifiers, for saving and
            loading.
    """

    temporal_structs_to_save = ['entropy']

    if isinstance(data_flags, str):
        data_flags = [data_flags]

    for data_flag in data_flags:
        list_dict = read_specs_file(data_flag)
        iter_vars = list_dict['iter_vars']
        iter_vars_dims = [len(iter_vars[name]) for name in iter_vars]
        it = sp.nditer(sp.zeros(iter_vars_dims), flags=['multi_index'])

        # The run at the iterator's starting index serves as the template
        # for the number of timesteps and the shape of each structure.
        CS_init_array = load_objects(list(it.multi_index), data_flag)

        # Dictionary to save all object at time 0; this will contain all
        # non-temporal info for each iterated variable.
        data = dict()
        data['init_objs'] = []
        nT = len(CS_init_array[0].signal_trace_Tt)

        # Assign data structures of appropriate shape for the temporal
        # variable; only names found on the template are aggregated, so a
        # missing attribute is reported once and then left out entirely.
        available_structs = []
        for struct_name in temporal_structs_to_save:
            try:
                template = getattr(CS_init_array[0], struct_name)
            except AttributeError:
                print('%s not an attribute of the CS object' % struct_name)
                continue

            # shape is (num timesteps, iterated var ranges, variable shape);
            # if a float or integer, shape is just time and iter vars.
            struct_shape = (nT, ) + tuple(iter_vars_dims)
            if hasattr(template, 'shape'):
                struct_shape += tuple(template.shape)
            data[struct_name] = sp.zeros(struct_shape)
            available_structs.append(struct_name)

        # Iterate over all objects to be aggregated
        while not it.finished:

            print('Loading index:', it.multi_index)
            temporal_CS_array = load_objects(list(it.multi_index), data_flag)

            # Save full object at time 0, contains non-temporal data.
            data['init_objs'].append(temporal_CS_array[0])

            # Grab all the temporal structures, timepoint-by-timepoint
            for iT in range(nT):
                full_idx = (iT, ) + it.multi_index
                for struct_name in available_structs:
                    data[struct_name][full_idx] = getattr(
                        temporal_CS_array[iT], struct_name)

            it.iternext()

        save_aggregated_temporal_objects(data, data_flag)
Esempio n. 14
0
'''
A matrix of full connectivity of businesses where
the edge weights are the distance between two businesses
'''

import load_data
import math, numpy as np
from helpers import *

#1 degree of latitude is approx 69 miles
n = 500
pre_businesses = load_data.load_objects("business", n)

businesses = []
for b in pre_businesses:
    if 'Restaurants' in b.categories:
        businesses += [b]

n = len(businesses)
for b in businesses:
    print "rating for ", str(b), ":", b.stars, ", # reviews: ", b.review_count

#users = load_data.load_objects("user", 5000)
#reviews = load_data.load_objects("review", 50000)
A = []
print 'starting script'
for i in range(n):
    bus = businesses[i]
    dists = []
    for j in range(n):
        if (i == j):
Esempio n. 15
0
'''
Create a bipartite graph linking words in review text to the reviews that contain them.
'''

from objects import *
from networkx import *
from helpers import *
import load_data
import re

reviews = load_data.load_objects("review", 1000)

G = nx.Graph()

init_weights = {}
for review in reviews:
    # Raw string: the regex escapes reach `re` unchanged and the literal
    # contains no invalid string escape sequences.
    for word in re.split(r" |\. |\! ", review.text):
        word = word.lower()
        # Each word-review edge carries the review's star rating as weight;
        # passing it to add_edge avoids the G.edge attribute, which newer
        # NetworkX releases no longer provide.
        G.add_edge(word, review, weight=review.stars)
        #init_weights[review] = review.stars
        #init_weights[word] = 1

# pick a centrality
#centrality = nx.degree_centrality(G)
#centrality = nx.betweenness_centrality(G)
#centrality = nx.closeness_centrality(G, distance=True)
#centrality = nx.eigenvector_centrality(G, tol=.01)
#centrality = nx.pagerank(G, personalization=init_weights)

just_words = {}
Esempio n. 16
0
'''
Create a bipartite graph linking words in review text to the reviews that contain them.
'''

from objects import *
from networkx import *
from helpers import *
import load_data
import re

reviews = load_data.load_objects("review", 1000)

G = nx.Graph()

init_weights = {}
for review in reviews:
    # Split on a space, ". " or "! " (raw string keeps the backslashes as
    # regex escapes rather than invalid string escapes).
    for word in re.split(r" |\. |\! ", review.text):
        word = word.lower()
        # Store the star rating as the edge weight in the add_edge call;
        # this works whether or not the graph exposes the old G.edge mapping.
        G.add_edge(word, review, weight=review.stars)
        #init_weights[review] = review.stars
        #init_weights[word] = 1

# pick a centrality
#centrality = nx.degree_centrality(G)     
#centrality = nx.betweenness_centrality(G)
#centrality = nx.closeness_centrality(G, distance=True)    
#centrality = nx.eigenvector_centrality(G, tol=.01)
#centrality = nx.pagerank(G, personalization=init_weights) 

just_words = {}