# Build a two-level category tree from the parsed site map and store it in
# the `tree` table.

# Every <li> found in `section_content`.  Not read again in this fragment;
# the module-level name is kept in case code outside this view uses it.
x = section_content.findAll('li')

# Category names are the text of the site map's <h2> headings.
catY = [catX.text for catX in siteMap.findAll('h2')]

# One list of <li> elements per <ul>, then reduced to (text, href) pairs.
subCat = [ss.findAll('li') for ss in siteMap.findAll('ul')]
ssubCat = [[(y.text, y.a['href']) for y in q] for q in subCat]

# Headings and link lists are paired purely by position.
# NOTE(review): this assumes the n-th <h2> belongs to the n-th <ul> -- confirm
# against the page markup.
haha = zip(catY, ssubCat)

db = mydb.MyDb()
# Adjacent literals are concatenated into the exact single-line statement.
sql = (
    "CREATE TABLE tree( "
    "id INTEGER PRIMARY KEY AUTOINCREMENT, "
    "pid INTEGER, "
    "name VARCHAR(50), "
    "value VARCHAR(50) "
    ")"
)
db.execSQL(sql)

# A plain loop rather than a list comprehension: the calls are made only for
# their side effects, so there is no result list worth building.
# NOTE(review): parentNKids presumably inserts the parent row and then its
# children under it -- verify in mydb.
for category, links in haha:
    db.parentNKids('tree', ['pid', 'name', 'value'], [('', category, '')], links)
class Crunch:
    """Command-line driven loader / queue worker for the CrunchBase v1 API.

    Constructing an instance does all the work: it parses ``sys.argv``,
    reads the ini file, opens the database and then runs either the worker
    loop (``-w <type>``) or the list load (``-l <type>``).
    """

    # Comma-separated entity types accepted as the value of -l / -w.
    ops = 'companies, people, products, financial-organizations, service-providers'
    # Current operation; overwritten in __init__ with the type from argv.
    op = 'dryrun'

    def __init__(self, ini_file="crunch.ini", db_file="test.db",
                 crunch_out_dir='/tmp'):
        """Parse the command line, load configuration and run the action.

        Things to check:
        - do we have a readable ini file?
        - do we have an existing DB?
        - does it have queue management tables / correct schema version?

        Args:
            ini_file: path of the ini file; a missing file or missing
                section is tolerated and the defaults below are used.
            db_file: database path used when the ini file does not set one.
            crunch_out_dir: base directory used when the ini file does not
                set ``[Common] CrunchOutDir``.

        Exits the process with status 1 on an unparsable command line, when
        no valid entity type was given with ``-l``/``-w``, and whenever the
        worker loop stops.

        Note: ``self.raw_data_dir``, ``self.list_file`` and ``self.singular``
        are only assigned when the matching ini section exists.
        """
        oplist = [i.strip() for i in self.ops.split(',')]
        try:
            options = gnu_getopt(sys.argv[1:], 'l:w:', oplist)
            print(options)
        except GetoptError as e:
            print(str(e))
            sys.exit(1)

        self.worker = '%d/%d' % (os.getuid(), os.getpid())

        # Only reported below.  Needs a value even when the ini file or its
        # sections are missing, otherwise the report raises NameError.
        # 0 stands for "not configured".
        rate_limit = 0

        config = ConfigParser.ConfigParser()
        config.read(ini_file)
        try:
            crunch_out_dir = config.get("Common", "CrunchOutDir")
            db_file = os.path.join(crunch_out_dir,
                                   config.get("Config", "DBFile"))
            self.raw_data_dir = os.path.join(
                crunch_out_dir, config.get("Config", "RawDataDir"))
            rate_limit = int(config.get("Config", "RateLimit"))
        except ConfigParser.NoSectionError:
            # config file or section does not exist
            pass  # using default values

        # options is (parsed_opts, leftover_args).  With no -l/-w given the
        # first list is empty, so check that before indexing into it.
        parsed_opts = options[0]
        if parsed_opts and parsed_opts[0][1] in oplist:
            opflag, self.op = parsed_opts[0]
            try:
                # A section named after the entity type overrides [Config].
                db_file = os.path.join(crunch_out_dir,
                                       config.get(self.op, "DBFile"))
                self.raw_data_dir = os.path.join(
                    crunch_out_dir, config.get(self.op, "RawDataDir"))
                self.list_file = config.get(self.op, "ListFile")
                self.singular = config.get(self.op, "Singular")
            except ConfigParser.NoSectionError:
                # config file or section does not exist
                pass
        else:
            print("must specify a valid action:\n%s" % self.ops)
            sys.exit(1)

        print("Pricessing: %s" % self.op)
        print("Using DB: %s" % db_file)
        print("Rate Limit: %d" % rate_limit)

        # Only a notice: execution continues and MyDb is still opened.
        if not os.path.isfile(db_file):
            print("%s not found" % db_file)

        import mydb
        self.db = mydb.MyDb(db_file)
        self.db.debug_flag = False

        self.create_api_tables()
        self.admin_tasks()

        if opflag == '-w' and self.op != 'dryrun':
            load_completed = True
            print("activating worker mode")
        else:
            load_completed = False

        # Worker mode: keep taking jobs.  The loop only ends by exiting the
        # process, either when no job URL is handed out or a job fails.
        while load_completed:
            self.register_worker()
            # NOTE(review): assumes assign_job() always returns at least one
            # row -- confirm; an empty result would raise IndexError here.
            _job_id, url, keyref = self.assign_job()[0]
            if url:
                load_completed = self.execute_job(url, keyref)
                if load_completed:
                    s = 1
                    print("sleeping for %d seconds" % s)
                    time.sleep(s)
                else:
                    sys.exit(1)
            else:
                sys.exit(1)

        if opflag == '-l' and self.op not in ['dryrun', 'worker']:
            print('click')
            # The trailing "%s" is left in place as a placeholder in the mask.
            url_mask = "http://api.crunchbase.com/v/1/%s" % self.singular + "/%s.js"
            self.load_tc_list(self.list_file, url_mask)
        else:
            print('no data load requested')

        # Leftover notes from the original author:
        # os.getpid(), os.getuid(), os.uname()
        # select * from mgmt_api where sess_lastupdate < datetime('now', '-10 seconds');
        # sql = "SELECT tbl_name FROM sqlite_master WHERE tbl_name = 'mgmt_api'"
        # print self.db.runQuery(sql)