Exemple #1
0
# Collect category headings and their link lists from the parsed page and
# store them as a parent/children hierarchy in the `tree` table.
# NOTE(review): `section_content` and `siteMap` are presumably BeautifulSoup
# nodes created earlier in the script -- confirm.
x = section_content.findAll('li')

# print [(y.text, y.a['href']) for y in x]

# Text of every <h2> heading under siteMap, in document order.
catY = [heading.text for heading in siteMap.findAll('h2')]

# One list of <li> elements per <ul> under siteMap.
subCat = [ul.findAll('li') for ul in siteMap.findAll('ul')]

# (link text, href) pairs for every <li>, grouped per <ul>.
# NOTE(review): assumes every <li> contains an <a href=...>; an item without
# one would presumably fail on `item.a['href']` -- confirm against the markup.
ssubCat = [[(item.text, item.a['href']) for item in items] for items in subCat]

# Pair each heading with the link list at the same position. zip() stops at
# the shorter input, so unmatched headings or lists are dropped silently.
haha = zip(catY, ssubCat)

# print "\n".join(["Category: %s, %d" % (q[0], len(q[1])) for q in haha])

db = mydb.MyDb()
sql = """CREATE TABLE tree(
id INTEGER PRIMARY KEY AUTOINCREMENT, 
pid INTEGER, 
name VARCHAR(50), 
value VARCHAR(50)
)"""
db.execSQL(sql)

# print json.dumps(haha)
# db.insertData('tree', ['pid', 'name', 'value'], testdata)

# One parentNKids() call per category: the category is passed as the parent
# row (empty pid and value) and its links as the child rows. A plain loop is
# used because the calls are made only for their side effects.
for category, links in haha:
    db.parentNKids('tree', ['pid', 'name', 'value'], [('', category, '')], links)

# print db.getData("tree")
Exemple #2
0
class Crunch:
    """Command-line driver for loading api.crunchbase.com listings.

    The action is taken from the first command-line option:

    * ``-l ACTION`` loads the list file configured for ACTION.
    * ``-w ACTION`` repeatedly registers as a worker, takes a job and
      executes it.

    ACTION must be one of the names in ``ops``.
    """

    # Comma-separated names of the valid actions.
    ops = 'companies, people, products, financial-organizations, service-providers'
    # Action in effect; replaced by the command-line choice in __init__.
    op = 'dryrun'

    def __init__(self,
                 ini_file="crunch.ini",
                 db_file="test.db",
                 crunch_out_dir='/tmp'):
        """Parse the command line, read the ini file and run the action.

        ini_file       -- path of the ini file; a missing file or section
                          leaves the defaults in place.
        db_file        -- database file used when the ini file names none.
        crunch_out_dir -- output directory used when the ini file has no
                          [Common] section.

        Exits with status 1 when the command line is invalid, when no valid
        action is given, or when a job cannot be assigned or executed.

        Things still to check (from the original notes):
        - do we have a readable ini file?
        - do we have an existing DB?
        - does it have queue management tables / correct schema version?
        """
        oplist = [name.strip() for name in self.ops.split(',')]
        try:
            options = gnu_getopt(sys.argv[1:], 'l:w:', oplist)
            print(options)
        except GetoptError as e:
            print(str(e))
            sys.exit(1)

        self.worker = '%d/%d' % (os.getuid(), os.getpid())

        # Defaults, so a missing ini file or section cannot leave these names
        # unbound later on.
        rate_limit = None
        self.raw_data_dir = crunch_out_dir
        self.list_file = None
        self.singular = None

        config = ConfigParser.ConfigParser()
        config.read(ini_file)
        try:
            crunch_out_dir = config.get("Common", "CrunchOutDir")
            db_file = os.path.join(crunch_out_dir,
                                   config.get("Config", "DBFile"))
            self.raw_data_dir = os.path.join(
                crunch_out_dir, config.get("Config", "RawDataDir"))
            rate_limit = int(config.get("Config", "RateLimit"))
        except ConfigParser.NoSectionError:  # config file or section does not exist
            pass  # using default values

        # gnu_getopt() returns (option_pairs, remaining_args). Only the first
        # option pair is considered, and its value names the action. An empty
        # option list is rejected the same way as an unknown action.
        parsed = options[0]
        if not parsed or parsed[0][1] not in oplist:
            print("must specify a valid action:\n%s" % self.ops)
            sys.exit(1)

        opflag, self.op = parsed[0]
        try:
            # The per-action section overrides the general settings.
            db_file = os.path.join(crunch_out_dir,
                                   config.get(self.op, "DBFile"))
            self.raw_data_dir = os.path.join(
                crunch_out_dir, config.get(self.op, "RawDataDir"))
            self.list_file = config.get(self.op, "ListFile")
            self.singular = config.get(self.op, "Singular")
        except ConfigParser.NoSectionError:  # config file or section does not exist
            pass

        print("Pricessing: %s" % self.op)
        print("Using DB: %s" % db_file)
        if rate_limit is None:
            print("Rate Limit: not configured")
        else:
            print("Rate Limit: %d" % rate_limit)

        if not os.path.isfile(db_file):
            # Reported only; MyDb() is still called with this path.
            print("%s not found" % db_file)

        import mydb
        self.db = mydb.MyDb(db_file)
        self.db.debug_flag = False
        self.create_api_tables()
        self.admin_tasks()

        load_completed = opflag == '-w' and self.op != 'dryrun'
        if load_completed:
            print("activating worker mode")

        # Worker mode: keep taking jobs until none is assigned or one fails;
        # both cases end the process with status 1.
        while load_completed:
            self.register_worker()
            job_id, url, keyref = self.assign_job()[0]
            if not url:
                sys.exit(1)
            load_completed = self.execute_job(url, keyref)
            if not load_completed:
                sys.exit(1)
            pause = 1
            print("sleeping for %d seconds" % pause)
            time.sleep(pause)

        if opflag == '-l' and self.op not in ['dryrun', 'worker']:
            print('click')
            if self.list_file is None or self.singular is None:
                # The action has no section in the ini file, so there is
                # nothing to build the request URL from.
                print("section [%s] not found in %s" % (self.op, ini_file))
                sys.exit(1)
            url_mask = "http://api.crunchbase.com/v/1/%s" % self.singular + "/%s.js"
            self.load_tc_list(self.list_file, url_mask)
        else:
            print('no data load requested')

        # os.getpid(), os.getuid(), os.uname()
        # select * from mgmt_api where sess_lastupdate < datetime('now', '-10 seconds');

        # sql = "SELECT tbl_name FROM sqlite_master WHERE tbl_name = 'mgmt_api'"
        # print self.db.runQuery(sql)