Esempio n. 1
0
    def insert_reviews(self, fpath=REVIEW_DATA_PATH):
        """Insert every review row read from ``fpath`` into the "reviews" table.

        For each row yielded by ``yelprows(fpath)``:

        * the 'type' and 'votes' fields are left out of the direct column list;
        * 'user_id' and 'business_id' values are replaced by the result of
          ``self.get_id`` on the 'users' / 'businesses' table (presumably the
          local primary key -- confirm against ``get_id``);
        * the entries of the nested 'votes' mapping are appended as extra
          columns;
        * the 'text' column is stored under the name 'review'.

        A row that fails is reported and skipped, so one bad record does not
        abort the whole import. The connection is committed every 1000 rows
        and once more after the last row.

        :param fpath: path handed to ``yelprows``.
        """
        EXCLUDE = ['type', 'votes']
        for i, row in enumerate(yelprows(fpath)):
            keys = [k for k in row.keys() if k not in EXCLUDE]
            values = []

            try:
                for k in keys:
                    if k == 'user_id':
                        values.append(self.get_id('users', k, row[k]))
                    elif k == 'business_id':
                        values.append(self.get_id('businesses', k, row[k]))
                    else:
                        values.append(row[k])

                # Flatten the nested votes mapping into additional columns.
                votes = row['votes']
                vkeys = list(votes.keys())
                keys.extend(vkeys)
                values.extend(votes[vk] for vk in vkeys)

                # The source field 'text' is stored in the 'review' column.
                # Done after the vote keys are appended, as before, so the
                # rename applies to the complete column list.
                keys = ['review' if k == 'text' else k for k in keys]

                self._insert("reviews", keys, values)
            except Exception as e:
                # Best effort: keep importing, but do not hide the failure.
                print("Error inserting review %d: %s" % (i, e))

            if i % 1000 == 0:
                self.conn.commit()
                print("%d finished" % i)

        # Rows inserted after the last 1000-row checkpoint would otherwise
        # never be committed.
        self.conn.commit()
Esempio n. 2
0
 def insert_users(self, fpath=USER_DATA_PATH):
     """Insert every user row read from ``fpath`` into the "users" table.

     Only the fields listed in ``INCLUDE`` are stored; each value is passed
     through ``transform`` before insertion. A row that fails is printed and
     skipped. The connection is committed every 1000 rows and once more
     after the last row.

     :param fpath: path handed to ``yelprows``.
     """
     INCLUDE = ['name', 'yelping_since', 'review_count', 'user_id', 'average_stars', 'fans']
     for i, row in enumerate(yelprows(fpath)):
         try:
             keys = [k for k in row.keys() if k in INCLUDE]
             values = [transform(row[k]) for k in keys]
             self._insert("users", keys, values)
         except Exception as e:
             # Best effort: report the bad row and continue with the next.
             print(e)
         if i % 1000 == 0:
             print("%d finished" % i)
             self.conn.commit()
     # Rows inserted after the last 1000-row checkpoint would otherwise
     # never be committed.
     self.conn.commit()
Esempio n. 3
0
 def insert_businesses(self, fpath=BUSINESS_DATA_PATH):
     """Insert every business row read from ``fpath`` and link its categories.

     The inserted columns are the entries of ``SCHEMA["businesses"]`` minus
     those in ``IGNORE``; each value is passed through ``transform``. After a
     business is inserted, one row per id returned by
     ``self.get_category_ids(row['categories'])`` is added to
     ``categories_businesses``, and the connection is committed.

     A failure while inserting a business or its category links is printed
     and the import continues with the next row. Any other error (for
     example a row missing one of the schema columns) propagates to the
     caller.

     :param fpath: path handed to ``yelprows``.
     """
     IGNORE = ['id', 'neighborhoods', 'type']
     table = "businesses"
     # The column list does not depend on the row, so build it once.
     keys = [k for k in SCHEMA[table] if k not in IGNORE]
     for i, row in enumerate(yelprows(fpath)):
         values = [transform(row[k]) for k in keys]
         try:
             _id = self._insert(table, keys, values)
             for category_id in self.get_category_ids(row['categories']):
                 self.cursor.execute(
                     "insert into categories_businesses values(?,?)",
                     [category_id, _id])
             self.conn.commit()
         except Exception as e:
             # NOTE(review): nothing is rolled back here, so statements that
             # succeeded before the failure presumably stay pending until
             # the next commit -- confirm whether that is intended.
             print("Error, inserting same key mutiple times? -- %s" % str(e))
         if i % 100 == 0:
             print("%d finished" % i)
Esempio n. 4
0
#!/usr/bin/env python

import sqlite3
import simplejson
import operator
from yelputils import yelprows

fpath = "../../yelpdata/yelp_academic_dataset_business.json"

# Survey the business dataset in a single pass: collect every column name
# that appears in any row and count how many rows use each category.
# Can we create a properly normalized database from this data?
k = set()
cats = {}
for row in yelprows(fpath):
    k.update(row.keys())
    for item in row["categories"]:
        cats[item] = cats.get(item, 0) + 1

# print all the columns for business data
print("Number of unique columns: %d" % len(k))
print(repr(k))
print("")

# print unique categories, most frequent first.
# An ascending sort followed by reverse() (rather than reverse=True) is kept
# on purpose: it preserves the existing order among categories whose counts
# are equal.
cats = sorted(cats.items(), key=operator.itemgetter(1))
cats.reverse()

print("Num of unique categories: %d" % len(cats))
print(cats[:10])
print("")