def insert_reviews(self, fpath=REVIEW_DATA_PATH):
    """Load the review rows read from ``fpath`` into the ``reviews`` table.

    For every row produced by ``yelprows(fpath)``:

    * the ``type`` and ``votes`` fields are not copied as columns;
    * ``user_id`` and ``business_id`` are replaced by whatever
      ``self.get_id`` returns for the ``users`` / ``businesses`` tables
      (presumably the internal row id -- confirm against ``get_id``);
    * the entries of the nested ``votes`` mapping are appended as extra
      columns, one per vote key;
    * the ``text`` column is stored under the name ``review``.

    A row that raises while being prepared or inserted is skipped so that
    one bad record does not abort the whole load, but skipped rows are
    counted and reported at the end instead of vanishing silently.

    The connection is committed every 1000 rows and once more after the
    last row, so the trailing partial batch is not left uncommitted.

    :param fpath: path handed to ``yelprows``; defaults to
        ``REVIEW_DATA_PATH``.
    """
    EXCLUDE = ['type', 'votes']
    # Columns holding external ids, mapped to the table used to resolve them.
    FOREIGN_TABLES = {'user_id': 'users', 'business_id': 'businesses'}

    skipped = 0
    last_error = None
    for i, row in enumerate(yelprows(fpath)):
        keys = [k for k in row.keys() if k not in EXCLUDE]
        values = []
        try:
            for k in keys:
                if k in FOREIGN_TABLES:
                    values.append(self.get_id(FOREIGN_TABLES[k], k, row[k]))
                else:
                    values.append(row[k])

            # Flatten the nested votes mapping into additional columns.
            for vote_name, vote_count in row['votes'].items():
                keys.append(vote_name)
                values.append(vote_count)

            # The JSON field is called 'text' but the column is 'review'.
            keys = ['review' if k == 'text' else k for k in keys]
            self._insert("reviews", keys, values)
        except Exception as e:
            # Best-effort load: keep going, but remember what went wrong.
            skipped += 1
            last_error = e

        if i % 1000 == 0:
            self.conn.commit()
            print("%d finished" % i)

    # Commit the rows inserted since the last multiple of 1000.
    self.conn.commit()
    if skipped:
        print("%d reviews skipped (last error: %s)" % (skipped, last_error))
def insert_users(self, fpath=USER_DATA_PATH):
    """Load the user rows read from ``fpath`` into the ``users`` table.

    Only the fields named in ``INCLUDE`` are copied; each value is passed
    through ``transform`` before insertion. A row that fails is reported
    by printing the exception and the load continues with the next row.

    Progress is printed and the connection committed every 1000 rows, and
    a final commit after the loop makes sure the trailing partial batch
    is persisted as well.

    :param fpath: path handed to ``yelprows``; defaults to
        ``USER_DATA_PATH``.
    """
    INCLUDE = ['name', 'yelping_since', 'review_count', 'user_id',
               'average_stars', 'fans']
    for i, row in enumerate(yelprows(fpath)):
        try:
            keys = [k for k in row.keys() if k in INCLUDE]
            values = [transform(row[k]) for k in keys]
            self._insert("users", keys, values)
        except Exception as e:
            print(e)

        if i % 1000 == 0:
            print("%d finished" % i)
            self.conn.commit()

    # Persist whatever was inserted after the last periodic commit.
    self.conn.commit()
def insert_businesses(self, fpath=BUSINESS_DATA_PATH):
    """Load the business rows read from ``fpath`` into ``businesses``.

    The columns to fill are taken from ``SCHEMA["businesses"]`` minus the
    names in ``IGNORE``; each value is passed through ``transform``. A
    missing column in a row is not caught here and propagates to the
    caller.

    After a business is inserted, one ``categories_businesses`` row is
    written per id returned by ``self.get_category_ids(row['categories'])``,
    pairing it with the value returned by ``self._insert`` (presumably the
    new business row id -- confirm against ``_insert``). The connection is
    committed once per business.

    An error raised while inserting a business or its category links is
    printed and the load continues with the next row. Progress is printed
    every 100 rows.

    :param fpath: path handed to ``yelprows``; defaults to
        ``BUSINESS_DATA_PATH``.
    """
    IGNORE = ['id', 'neighborhoods', 'type']
    table = "businesses"
    for i, row in enumerate(yelprows(fpath)):
        keys = [k for k in SCHEMA[table] if k not in IGNORE]
        values = [transform(row[k]) for k in keys]
        try:
            _id = self._insert(table, keys, values)
            for category_id in self.get_category_ids(row['categories']):
                self.cursor.execute(
                    "insert into categories_businesses values(?,?)",
                    [category_id, _id])
            self.conn.commit()
        except Exception as e:
            print("Error, inserting same key mutiple times? -- %s" % str(e))

        if i % 100 == 0:
            print("%d finished" % i)
#!/usr/bin/env python
import sqlite3
import simplejson
import operator
from yelputils import yelprows

fpath = "../../yelpdata/yelp_academic_dataset_business.json"

# First pass: collect every column name that occurs in the business data.
# Can we create a properly normalized database from this data?
k = set()
for row in yelprows(fpath):
    k.update(row.keys())

print("Number of unique columns: %d" % len(k))
print(repr(k))
print("")

# Second pass: count the occurrences of each category across all rows,
# then list the categories from most to least frequent.
cats = {}
for row in yelprows(fpath):
    for item in row["categories"]:
        cats[item] = cats.get(item, 0) + 1

# Sort ascending by count and flip the result, rather than sorting with
# reverse=True, so that ties come out in the same order as before.
cats = sorted(cats.items(), key=operator.itemgetter(1))[::-1]

print("Num of unique categories: %d" % len(cats))
print(cats[:10])
print("")