def __init__(self):
    """Initialise the ObjBase parent, a node-specific logger and the worker table."""
    ObjBase.__init__(self)
    # Maps worker name -> worker object.
    self.workers = {}
    logger_name = 'WorkerService_%s' % getnodename()
    self.logger = get_logger(logger_name)
def create_multi(options):
    """Build a pycurl multi handle together with its pool of easy handles.

    ``options`` maps lower-cased option names to lists of values; only the
    first element of each list is read.  ``maxconnections`` sets the pool
    size (default 5).  Every key of ``settings.CRAWL_OPTIONS`` is applied to
    each easy handle, using the caller-supplied value when a non-empty one is
    present and the settings value otherwise.

    Returns ``(multi, freelist)`` where ``freelist`` is a shallow copy of
    ``multi.handles``.  Returns ``None`` when a handle has to be configured
    but ``settings`` has no ``CRAWL_OPTIONS`` attribute.
    """
    logger = get_logger('MULTIHANDLER')
    m = pycurl.CurlMulti()
    m.handles = []
    # Default number of connections is 5.  Some options are hardcoded;
    # options that are not provided might be problematic when settings.py
    # is used.
    connection_count = int(options.get('maxconnections', [5])[0])
    logger.debug('maxconnections %d' % connection_count)
    for i in range(connection_count):
        # TODO: a warning should be added here for the options that are required
        # Checked before the handle is created so that no stray Curl object
        # is left behind on the early return.
        if not hasattr(settings, 'CRAWL_OPTIONS'):
            logger.error('No options for crawler found')
            return None
        c = pycurl.Curl()
        c.name = 'curl%s' % i
        for opt, val in settings.CRAWL_OPTIONS.items():
            # Use the user-provided option when present, else the default.
            # Values arrive wrapped in a list; only the first item is used.
            supplied = options.get(opt.lower())
            if supplied and supplied[0]:
                raw = supplied[0]
                # `t` is defined elsewhere in the module; presumably a
                # pattern recognising integer strings -- TODO confirm.
                if t.match(raw):
                    optval = int(raw)
                else:
                    optval = raw
            else:
                optval = val
            # Apply the resolved value (previously the settings default was
            # always applied and the caller's override was discarded).
            c.setopt(getattr(pycurl, opt), optval)
        m.handles.append(c)
    return m, m.handles[:]  # multi, freelist
def create_multi(options):
    """Build a pycurl multi handle together with its pool of easy handles.

    ``options`` maps lower-cased option names to lists of values; only the
    first element of each list is read.  ``maxconnections`` sets the pool
    size (default 5).  Every key of ``settings.CRAWL_OPTIONS`` is applied to
    each easy handle, using the caller-supplied value when a non-empty one is
    present and the settings value otherwise.

    Returns ``(multi, freelist)`` where ``freelist`` is a shallow copy of
    ``multi.handles``.  Returns ``None`` when a handle has to be configured
    but ``settings`` has no ``CRAWL_OPTIONS`` attribute.
    """
    logger = get_logger("MULTIHANDLER")
    m = pycurl.CurlMulti()
    m.handles = []
    # Default number of connections is 5.  Some options are hardcoded;
    # options that are not provided might be problematic when settings.py
    # is used.
    connection_count = int(options.get("maxconnections", [5])[0])
    logger.debug("maxconnections %d" % connection_count)
    for i in range(connection_count):
        # TODO: a warning should be added here for the options that are required
        # Checked before the handle is created so that no stray Curl object
        # is left behind on the early return.
        if not hasattr(settings, "CRAWL_OPTIONS"):
            logger.error("No options for crawler found")
            return None
        c = pycurl.Curl()
        c.name = "curl%s" % i
        for opt, val in settings.CRAWL_OPTIONS.items():
            # Use the user-provided option when present, else the default.
            # Values arrive wrapped in a list; only the first item is used.
            supplied = options.get(opt.lower())
            if supplied and supplied[0]:
                raw = supplied[0]
                # `t` is defined elsewhere in the module; presumably a
                # pattern recognising integer strings -- TODO confirm.
                if t.match(raw):
                    optval = int(raw)
                else:
                    optval = raw
            else:
                optval = val
            # Apply the resolved value (previously the settings default was
            # always applied and the caller's override was discarded).
            c.setopt(getattr(pycurl, opt), optval)
        m.handles.append(c)
    return m, m.handles[:]  # multi, freelist
def __init__(self, filename):
    """Open the gzip archive ``filename`` for reading.

    When opening raises IOError the failure is logged and ``self.file`` is
    set to None instead of letting the exception propagate.
    """
    self.logger = get_logger('DArcReader')
    try:
        archive = gzip.open(filename)
    except IOError:
        self.logger.error('File not found: %s' % filename)
        archive = None
    self.file = archive
def __init__(self, filename):
    """Open ``<base>_temp`` as a gzip file for binary writing.

    ``<base>`` is the part of ``filename`` before its first ``.``; it is
    kept on ``self.filename``.
    """
    # Everything from the first '.' onward is dropped (extension trimming).
    base_name = filename.split('.', 1)[0]
    self.filename = base_name
    temp_path = '%s_temp' % base_name
    self.file = gzip.GzipFile(temp_path, 'wb')
    # Captures the target of a "Location:" header line; per the original
    # note, a redirected URL is to be rewritten according to regex rules.
    location_header = r'Location: (.*)\r\n'
    self.regx = re.compile(location_header)
    self.logger = get_logger('DArcWriter')
def __init__(self, config, report=None, status=None):
    """Store the configuration and shared objects and create a named logger.

    The instance name is the first element of ``config['options']['name']``
    and is also used as the logger name.
    """
    # TODO: format configuration items, everything is in list form for now
    options = config.get('options')
    self.name = options.get('name')[0]
    self.logger = get_logger(self.name)
    self.config = config
    # Objects shared with the caller.
    self.status = status
    self.report = report
def listprofiles(*args):
    """Log the names of all stored profiles, then exit with status 0.

    Positional arguments are accepted and ignored.
    """
    logger = get_logger('ListProfiles')
    session = Session()
    found = session.query(Profile).all()
    if found:
        names = [entry.name for entry in found]
        logger.info('Profiles found: %s' % ', '.join(names))
    else:
        logger.error('No profiles found')
    sys.exit(0)
def __init__(self, config):
    """Set up the URL sets and load extractor/filter/transformer plugins.

    ``config['options']['seeds'][0]`` is a comma-separated string of seed
    URLs; each is stripped, UTF-8 encoded and placed in ``self.queue``.
    ``config['plugins']`` maps plugin names to parameter lists.  A plugin is
    filed under extractors, filters or transformers according to its name
    suffix ("Extractor", "Filter", "Transform"); other names are ignored.
    A failure while loading one plugin is logged and does not stop the rest.
    """
    if not hasattr(self, 'logger'):
        self.logger = get_logger('UrlsContainer')
    self.options = config.get('options')
    self.plugins = config.get('plugins')
    self.done = Set([])
    # TODO: format configuration objects, everything is in list form
    seeds = self.options.get('seeds')[0].split(',')
    self.queue = Set([item.strip().encode('utf-8') for item in seeds])
    self.temp = Set([])
    self.failed = Set([])
    self.retries = dict()
    self.extractors = []
    self.filters = []
    self.transformers = []

    # Name suffix -> list that receives plugins of that kind; checked in
    # this order, first match wins.
    buckets = (
        ("Extractor", self.extractors),
        ("Filter", self.filters),
        ("Transform", self.transformers),
    )

    def bucket_for(plugin_name):
        # Return the list a plugin belongs to, or None for unknown suffixes.
        for suffix, bucket in buckets:
            if plugin_name.endswith(suffix):
                return bucket
        return None

    # TODO: rewrite
    # plugins = {plug: [params], plug2: [params2]}
    # Plugins that require arguments but were given none are discarded.
    for name, parms in self.plugins.items():
        try:
            cls = registry.get(name)
            # An empty parameter list counts as "no arguments" and is skipped
            # quietly rather than failing on parms[0].
            if cls.opts.get('args_required') and not (parms and parms[0]):
                continue
            bucket = bucket_for(name)
            if bucket is not None:
                bucket.append(cls(parms))
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate.
            self.logger.error('Loading of plugin %s failed\n\n %s' % (name, traceback.format_exc()))

    # Also add enabled but non-visible plugins.
    for name, cls in registry.items():
        try:
            if cls.enabled and not cls.visible:
                bucket = bucket_for(name)
                if bucket is not None:
                    bucket.append(cls())
        except Exception:
            self.logger.error('Loading of plugin %s failed\n\n %s' % (name, traceback.format_exc()))
def listnodes(*args):
    """Log the node names listed under ``:Default.<domain>``, then exit with status 0.

    Positional arguments are accepted and ignored.
    """
    logger = get_logger('Listnodes')
    nameserver = Pyro.naming.NameServerLocator().getNS()
    entries = nameserver.list(':Default.%s' % domain)
    if entries:
        # The first item of each listed entry is used as the node name.
        names = [entry[0] for entry in entries]
        logger.info('Nodes found: %s' % ", ".join(names))
    else:
        logger.error('No nodes found')
    sys.exit(0)
def __init__(self, config):
    """Set up the URL sets and load extractor/filter/transformer plugins.

    ``config['options']['seeds'][0]`` is a comma-separated string of seed
    URLs; each is stripped, UTF-8 encoded and placed in ``self.queue``.
    ``config['plugins']`` maps plugin names to parameter lists.  A plugin is
    filed under extractors, filters or transformers according to its name
    suffix ("Extractor", "Filter", "Transform"); other names are ignored.
    A failure while loading one plugin is logged and does not stop the rest.
    """
    if not hasattr(self, 'logger'):
        self.logger = get_logger('UrlsContainer')
    self.options = config.get('options')
    self.plugins = config.get('plugins')
    self.done = Set([])
    # TODO: format configuration objects, everything is in list form
    seeds = self.options.get('seeds')[0].split(',')
    self.queue = Set([item.strip().encode('utf-8') for item in seeds])
    self.temp = Set([])
    self.failed = Set([])
    self.retries = dict()
    self.extractors = []
    self.filters = []
    self.transformers = []

    # Name suffix -> list that receives plugins of that kind; checked in
    # this order, first match wins.
    buckets = (
        ("Extractor", self.extractors),
        ("Filter", self.filters),
        ("Transform", self.transformers),
    )

    def bucket_for(plugin_name):
        # Return the list a plugin belongs to, or None for unknown suffixes.
        for suffix, bucket in buckets:
            if plugin_name.endswith(suffix):
                return bucket
        return None

    # TODO: rewrite
    # plugins = {plug: [params], plug2: [params2]}
    # Plugins that require arguments but were given none are discarded.
    for name, parms in self.plugins.items():
        try:
            cls = registry.get(name)
            # An empty parameter list counts as "no arguments" and is skipped
            # quietly rather than failing on parms[0].
            if cls.opts.get('args_required') and not (parms and parms[0]):
                continue
            bucket = bucket_for(name)
            if bucket is not None:
                bucket.append(cls(parms))
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate.
            self.logger.error('Loading of plugin %s failed\n\n %s' % (name, traceback.format_exc()))

    # Also add enabled but non-visible plugins.
    for name, cls in registry.items():
        try:
            if cls.enabled and not cls.visible:
                bucket = bucket_for(name)
                if bucket is not None:
                    bucket.append(cls())
        except Exception:
            self.logger.error('Loading of plugin %s failed\n\n %s' % (name, traceback.format_exc()))
def __init__(self, *args, **kwargs):
    """Create a class-named logger and record the plugin's arguments.

    When ``self.args_required`` is true and positional arguments are given,
    ``self.args`` becomes a shallow copy of the argument.  When it is true
    and none are given, the instance is disabled (``self.enabled = False``).
    Otherwise positional arguments are ignored.  ``self.args`` is an empty
    list whenever no arguments are recorded.  Keyword arguments are
    accepted and ignored.
    """
    self.logger = get_logger(self.__class__.__name__)
    # Assigned on every path so that a disabled instance still has the
    # attribute and later reads of self.args cannot raise AttributeError.
    self.args = []
    if self.args_required:
        if not args:
            self.logger.info('%s requires arguments, but none given. therefore this module is disabled' % self.__class__.__name__)
            self.enabled = False
        else:
            self.args = copy(*args)  # always in ['blah blah'] form
def __init__(self, *args, **kwargs):
    """Create a class-named logger and record the plugin's arguments.

    When ``self.args_required`` is true and positional arguments are given,
    ``self.args`` becomes a shallow copy of the argument.  When it is true
    and none are given, the instance is disabled (``self.enabled = False``).
    Otherwise positional arguments are ignored.  ``self.args`` is an empty
    list whenever no arguments are recorded.  Keyword arguments are
    accepted and ignored.
    """
    self.logger = get_logger(self.__class__.__name__)
    # Assigned on every path so that a disabled instance still has the
    # attribute and later reads of self.args cannot raise AttributeError.
    self.args = []
    if self.args_required:
        if not args:
            self.logger.info(
                "%s requires arguments, but none given. therefore this module is disabled"
                % self.__class__.__name__
            )
            self.enabled = False
        else:
            self.args = copy(*args)  # always in ['blah blah'] form
def runcommand(options, parser):
    """Run the command named in ``options.command`` against the job service.

    Only ``create`` is handled here: it looks up the profile named
    ``options.profile``, asks the job service to create a worker from the
    profile's configuration and, on success, records a 'paused' Job for it.

    Returns the result of ``parser.print_help()`` when the command or node is
    missing, the result of ``logger.error`` when ``create`` is called without
    a profile, 0 when the worker was created and -1 on any failure.  Other
    commands fall through and return None.
    """
    import sys  # local import: used only to fetch the active exception below

    logger = get_logger('RunCommand')
    if not options.command or not options.node:
        return parser.print_help()
    # Name-server based lookup of the node's job service, currently disabled
    # in favour of the fixed local URI below:
    #ns = Pyro.naming.NameServerLocator().getNS()
    #uri = ns.resolve('%s.%s.jobservice' % (domain, options.node))
    #js = Pyro.core.getProxyForURI(uri)
    js = Pyro.core.getProxyForURI("PYROLOC://localhost:7766/jobservice")
    if options.command == 'create':
        if not options.profile:
            return logger.error(
                'create should be called with profile name: -n nodename -c create -p profile'
            )
        sess = Session()
        # try/finally so the session is closed on every exit path.
        try:
            try:
                pf = sess.query(Profile).filter(
                    profile.c.name == options.profile).first()
                if pf is None:
                    # Report a missing profile explicitly instead of relying
                    # on the attribute access on None failing.
                    logger.error('No profile found with name: %s' % options.profile)
                    return -1
                status, workername = js.create(pf.configuration)
            except Exception:
                # sys.exc_info() keeps this valid on both old and new
                # Python syntax for binding the caught exception.
                e = sys.exc_info()[1]
                logger.error('Could not create worker for profile: %s' % options.profile)
                logger.error("".join(getPyroTraceback(e)))
                return -1
            if status:
                j = Job(workername, 'paused')
                pf.jobs.append(j)
                sess.add(j)
                sess.commit()
                logger.info("worker with name %s created" % workername)
                return 0
            logger.error('no worker created: %s' % workername)
            return -1
        finally:
            sess.close()
def runcommand(options, parser):
    """Run the command named in ``options.command`` against the job service.

    Only ``create`` is handled here: it looks up the profile named
    ``options.profile``, asks the job service to create a worker from the
    profile's configuration and, on success, records a 'paused' Job for it.

    Returns the result of ``parser.print_help()`` when the command or node is
    missing, the result of ``logger.error`` when ``create`` is called without
    a profile, 0 when the worker was created and -1 on any failure.  Other
    commands fall through and return None.
    """
    import sys  # local import: used only to fetch the active exception below

    logger = get_logger('RunCommand')
    if not options.command or not options.node:
        return parser.print_help()
    # Name-server based lookup of the node's job service, currently disabled
    # in favour of the fixed local URI below:
    #ns = Pyro.naming.NameServerLocator().getNS()
    #uri = ns.resolve('%s.%s.jobservice' % (domain, options.node))
    #js = Pyro.core.getProxyForURI(uri)
    js = Pyro.core.getProxyForURI("PYROLOC://localhost:7766/jobservice")
    if options.command == 'create':
        if not options.profile:
            return logger.error('create should be called with profile name: -n nodename -c create -p profile')
        sess = Session()
        # try/finally so the session is closed on every exit path.
        try:
            try:
                pf = sess.query(Profile).filter(profile.c.name == options.profile).first()
                if pf is None:
                    # Report a missing profile explicitly instead of relying
                    # on the attribute access on None failing.
                    logger.error('No profile found with name: %s' % options.profile)
                    return -1
                status, workername = js.create(pf.configuration)
            except Exception:
                # sys.exc_info() keeps this valid on both old and new
                # Python syntax for binding the caught exception.
                e = sys.exc_info()[1]
                logger.error('Could not create worker for profile: %s' % options.profile)
                logger.error("".join(getPyroTraceback(e)))
                return -1
            if status:
                j = Job(workername, 'paused')
                pf.jobs.append(j)
                sess.add(j)
                sess.commit()
                logger.info("worker with name %s created" % workername)
                return 0
            logger.error('no worker created: %s' % workername)
            return -1
        finally:
            sess.close()
def indexer(filename, max_count=100):
    """Store one Index row per chunk of the archive ``filename``.

    Each row records the file name, the timestamp taken from the file name,
    the first item of the chunk and the reader position at which the chunk
    started.  Rows are committed every ``max_count`` chunks and once more at
    the end.  The reader and every session are closed even when an error
    interrupts the run.
    """
    logger = get_logger('Indexer')
    count = offset = 0
    d = DArcReader('%s' % filename)
    try:
        logger.debug('Archive file %s opened with pointer %s' % (filename, d))
        timestamp = filename.split('_')[1]  # always profilename_timestamp_node.arc
        sess = Session()
        try:
            for chunk in d.chunks():
                # url : offset
                count += 1
                sess.add(Index(filename, timestamp, chunk[0], offset))
                if count % max_count == 0:
                    sess.commit()
                    # Close the finished batch's session before starting a
                    # new one so sessions do not pile up unreleased.
                    sess.close()
                    sess = Session()
                # Position after this chunk, i.e. where the next one starts.
                offset = d.tell()
            sess.commit()
        finally:
            sess.close()
    finally:
        d.close()
from Pyro.util import getPyroTraceback
from datetime import datetime
from domo import settings
from domo.interfaces.db import Site, Job, makelogsession
from optparse import OptionParser
from sqlalchemy.exceptions import InvalidRequestError
from domo.interfaces.logger import get_logger
from datetime import datetime, timedelta
import Pyro.core
import Pyro.naming
import sys
import traceback

# Domain name taken from settings; used to build the ':Default.<domain>' group name.
domain = settings.DOMAIN
# Module-wide logger shared by the functions below.
logger = get_logger('Cron service')


def listnodes(ns):
    """Return the first item of every entry listed under ':Default.<domain>'.

    ``ns`` must provide a ``list(group_name)`` method.  When the listing is
    empty an error is logged and None is returned.
    """
    nlist = ns.list(':Default.%s' % domain)
    if not nlist:
        logger.error('No nodes found, exiting')
        return None
    return [node[0] for node in nlist]


def listsites():
    """Query every Site row through a session obtained from makelogsession."""
    # makelogsession() returns a callable; calling that yields the session.
    session = makelogsession()()
    sites = session.query(Site).all()
    if not sites:
        logger.error('No sites found, exiting')
from Pyro.util import getPyroTraceback
from datetime import datetime
from domo import settings
from domo.interfaces.db import Site, Job, makelogsession
from optparse import OptionParser
from sqlalchemy.exceptions import InvalidRequestError
from domo.interfaces.logger import get_logger
from datetime import datetime, timedelta
import Pyro.core
import Pyro.naming
import sys
import traceback

# Domain name taken from settings; used to build the ':Default.<domain>' group name.
domain = settings.DOMAIN
# Module-wide logger shared by the functions below.
logger = get_logger('Cron service')


def listnodes(ns):
    """Return the first item of every entry listed under ':Default.<domain>'.

    ``ns`` must provide a ``list(group_name)`` method.  When the listing is
    empty an error is logged and None is returned.
    """
    nlist = ns.list(':Default.%s' % domain)
    if not nlist:
        logger.error('No nodes found, exiting')
        return None
    return [node[0] for node in nlist]


def listsites():
    """Query every Site row through a session obtained from makelogsession."""
    # makelogsession() returns a callable; calling that yields the session.
    session = makelogsession()()
    sites = session.query(Site).all()