def cli(args, dbm):
    """CLI entry: stream CSV resources for one or all portals.

    Creates ``args.dir/<snapshot>`` if needed, builds one task per portal
    (or just the portal named by ``args.portalid``), and fans the tasks out
    to ``streamCSVs`` over a worker pool of ``args.processors`` processes.

    :param args: parsed argparse namespace (dir, portalid, processors, config)
    :param dbm:  DB connection parameters passed to :class:`DBClient`
    """
    sn = getCurrentSnapshot()
    dbConf = readDBConfFromFile(args.config)
    db = DBClient(dbm)

    # Ensure the per-snapshot output directory exists.
    rdf_dir = args.dir
    if not os.path.exists(rdf_dir):
        os.mkdir(rdf_dir)
    sn_dir = os.path.join(rdf_dir, str(sn))
    if not os.path.exists(sn_dir):
        os.mkdir(sn_dir)

    tasks = []
    if args.portalid:
        # BUG FIX: Query.one() raises NoResultFound when no row matches,
        # so the original `if P is None` branch was unreachable and a bad
        # portal id crashed instead of logging. first() returns None for
        # a missing portal, which is what the guard expects.
        P = db.Session.query(Portal).filter(Portal.id == args.portalid).first()
        if P is None:
            log.warn("PORTAL NOT IN DB", portalid=args.portalid)
            return
        else:
            tasks.append((P, dbConf, sn, sn_dir))
    else:
        for P in db.Session.query(Portal):
            tasks.append((P, dbConf, sn, sn_dir))

    log.info("START FETCH", processors=args.processors, dbConf=dbConf,
             portals=len(tasks))
    pool = Pool(args.processors)
    for x in pool.imap(streamCSVs, tasks):
        pid, sn = x[0].id, x[1]
        log.info("RECEIVED RESULT", portalid=pid, snapshot=sn)
def cli(args, dbm):
    """CLI entry: compute change/freshness history for one or all portals.

    Uses ``args.snapshot`` when given, otherwise the current snapshot, and
    runs ``change_history`` for each selected portal in a process pool.

    :param args: parsed argparse namespace (snapshot, portalid, processors, config)
    :param dbm:  DB connection parameters passed to :class:`DBClient`
    """
    if args.snapshot:
        sn = args.snapshot
    else:
        sn = getCurrentSnapshot()
    dbConf = readDBConfFromFile(args.config)
    db = DBClient(dbm)

    tasks = []
    if args.portalid:
        # BUG FIX: Query.one() raises NoResultFound for a missing portal,
        # making the `P is None` check dead code; first() returns None so
        # the "PORTAL NOT IN DB" path actually runs.
        P = db.Session.query(Portal).filter(Portal.id == args.portalid).first()
        if P is None:
            log.warn("PORTAL NOT IN DB", portalid=args.portalid)
            return
        else:
            tasks.append((P, dbConf, sn))
    else:
        for P in db.Session.query(Portal):
            tasks.append((P, dbConf, sn))

    log.info("START FRESHNESS", processors=args.processors, dbConf=dbConf,
             portals=len(tasks))
    pool = Pool(args.processors)
    for x in pool.imap(change_history, tasks):
        pid, sn = x[0].id, x[1]
        log.info("RECEIVED RESULT", portalid=pid, snapshot=sn)
def cli(args, dbm):
    """CLI entry: generate schema.org files plus a sitemap index.

    Runs ``generate_schemadotorg_files`` per portal in a process pool,
    collecting ``(portalid, lastmod)`` pairs, then writes the combined
    sitemap index into ``args.directory``.

    :param args: parsed argparse namespace (sn, directory, portalid, processors, config)
    :param dbm:  DB connection parameters passed to :class:`DBClient`
    """
    dbConf = readDBConfFromFile(args.config)
    db = DBClient(dbm)
    if not args.sn:
        sn = getCurrentSnapshot()
    else:
        sn = args.sn
    directory = args.directory

    tasks = []
    if args.portalid:
        # BUG FIX: Query.one() raises NoResultFound instead of returning
        # None, so the original `P is None` guard never fired; first()
        # matches the guard's expectation.
        P = db.Session.query(Portal).filter(Portal.id == args.portalid).first()
        if P is None:
            log.warn("PORTAL NOT IN DB", portalid=args.portalid)
            return
        else:
            tasks.append((P, dbConf, sn, directory))
    else:
        for P in db.Session.query(Portal):
            tasks.append((P, dbConf, sn, directory))

    log.info("START FETCH", processors=args.processors, dbConf=dbConf,
             portals=len(tasks))
    portals = []
    pool = Pool(args.processors)
    for x in pool.imap(generate_schemadotorg_files, tasks):
        pid, lastmod, sn = x[0].id, x[1], x[2]
        portals.append((pid, lastmod))
        log.info("RECEIVED RESULT", portalid=pid)
    create_portal_sitemapindex(portals, directory)
def cli(args, dbm):
    """CLI entry: aggregate the format distribution for the current snapshot.

    :param args: parsed argparse namespace (config)
    :param dbm:  DB connection parameters passed to :class:`DBClient`
    """
    snapshot = getCurrentSnapshot()
    # Read the DB config for its side effect of validating the config file;
    # the aggregation itself only needs the live client below.
    db_conf = readDBConfFromFile(args.config)
    client = DBClient(dbm)
    aggregateFormatDist(client, snapshot)
def cli(args, dbm):
    """CLI entry: fetch portal metadata over HTTP for the current snapshot.

    Optionally reads a ``git.datadir`` path from the YAML config for local
    storage. With ``--repair``, deletes incomplete (non-200) snapshot rows
    and re-fetches only those portals; otherwise fetches every portal.
    Work is fanned out to ``fetchHttp`` in a process pool.

    :param args: parsed argparse namespace (config, portalid, repair, processors)
    :param dbm:  DB connection parameters passed to :class:`DBClient`
    """
    sn = getCurrentSnapshot()
    dbConf = readDBConfFromFile(args.config)
    db = DBClient(dbm)

    store_local = None
    if args.config:
        with open(args.config) as f:
            # safe_load: plain yaml.load without a Loader is deprecated and
            # can execute arbitrary constructors; this file only needs data.
            config = yaml.safe_load(f)
        if 'git' in config and 'datadir' in config['git']:
            store_local = config['git']['datadir']

    tasks = []
    if args.portalid:
        # BUG FIX: Query.one() raises NoResultFound when the portal does not
        # exist, so the `P is None` branch below was unreachable; first()
        # returns None and lets the intended warning-and-return happen.
        P = db.Session.query(Portal).filter(Portal.id == args.portalid).first()
        if P is None:
            log.warn("PORTAL NOT IN DB", portalid=args.portalid)
            return
        else:
            tasks.append((P, dbConf, sn, store_local))
    else:
        if args.repair:
            # Portals that already have a successful (HTTP 200) snapshot row.
            valid = db.Session.query(PortalSnapshot.portalid).filter(
                PortalSnapshot.snapshot == sn).filter(
                PortalSnapshot.status == 200).subquery()
            # Re-fetch everything not in `valid`, wiping partial rows first.
            for P in db.Session.query(Portal).filter(Portal.id.notin_(valid)):
                PS = db.Session.query(PortalSnapshot).filter(
                    PortalSnapshot.snapshot == sn).filter(
                    PortalSnapshot.portalid == P.id)
                PS.delete(synchronize_session=False)
                PSQ = db.Session.query(PortalSnapshotQuality).filter(
                    PortalSnapshotQuality.snapshot == sn).filter(
                    PortalSnapshotQuality.portalid == P.id)
                PSQ.delete(synchronize_session=False)
                tasks.append((P, dbConf, sn, store_local))
        else:
            for P in db.Session.query(Portal):
                tasks.append((P, dbConf, sn, store_local))

    log.info("START FETCH", processors=args.processors, dbConf=dbConf,
             portals=len(tasks))
    pool = Pool(args.processors)
    for x in pool.imap(fetchHttp, tasks):
        pid, sn = x[0].id, x[1]
        log.info("RECEIVED RESULT", portalid=pid, snapshot=sn)
def _load_default_logging():
    """Configure logging from the bundled default logging.yaml resource."""
    logconf = os.path.join(odpw.__path__[0], 'resources/logging', 'logging.yaml')
    with open(logconf) as f:
        # safe_load: the logging config is plain data, no need for the
        # full (unsafe) YAML loader.
        logging.config.dictConfig(yaml.safe_load(f))


def start(argv):
    """Main CLI entry point for the Open Portal Watch toolset.

    Builds the argument parser (one sub-command per registered submodule),
    configures logging from the optional YAML config file (falling back to
    the bundled default), then dispatches to the selected submodule's
    ``cli`` function with a :class:`DBManager`.

    :param argv: command-line argument list (without the program name)
    """
    print(argv)
    # BUG FIX: the original assigned `start = time.time()`, shadowing this
    # function's own name inside its body.
    start_time = time.time()

    pa = argparse.ArgumentParser(description='Open Portal Watch toolset.',
                                 prog='odpw')
    logg = pa.add_argument_group("Logging")
    logg.add_argument('-d', '--debug',
                      help="Print lots of debugging statements",
                      action="store_const", dest="loglevel",
                      const=logging.DEBUG, default=logging.WARNING)
    logg.add_argument('-v', '--verbose',
                      help="Be verbose",
                      action="store_const", dest="loglevel",
                      const=logging.INFO, default=logging.WARNING)
    config_grp = pa.add_argument_group("Config")
    config_grp.add_argument('-c', '--config', help="config file", dest='config')

    sp = pa.add_subparsers(title='Modules', description="Available sub modules")
    for sm in submodules:
        smpa = sp.add_parser(sm.name(), help=sm.help())
        sm.setupCLI(smpa)
        smpa.set_defaults(func=sm.cli)

    # NOTE: removed the dead computation of top-level module names
    # (`m = set(...)` over sys.modules) — its only consumer was
    # commented-out debug printing.

    args = pa.parse_args(args=argv)
    db = readDBConfFromFile(args.config)

    if args.config:
        try:
            with open(args.config) as f_conf:
                config = yaml.safe_load(f_conf)
            if 'logging' in config:
                print("setup logging")
                logging.config.dictConfig(config['logging'])
            else:
                _load_default_logging()
        except Exception as e:
            print("Exception during config initialisation", e)
            return
    else:
        _load_default_logging()

    logging.basicConfig(level=args.loglevel)
    # Configure structlog on top of stdlib logging.
    config_logging()
    log = structlog.get_logger()

    try:
        log.info("CMD ARGS", args=str(args))
        dbm = DBManager(**db)
        args.func(args, dbm)
    except OperationalError as e:
        # str(e) instead of the deprecated e.message attribute.
        log.fatal("DB Connection Exception: ", msg=str(e))
    except Exception:
        log.fatal("Uncaught exception", exc_info=True)

    # Elapsed wall-clock time in milliseconds.
    msecs = (time.time() - start_time) * 1000
    # The original log string was garbled across lines ("END \n MAIN");
    # reconstructed as a single event name — TODO confirm against git history.
    log.info("END MAIN", time_elapsed=msecs)
    Timer.printStats()
    ErrorHandler.printStats()