Exemple #1
0
    def __init__(self):
        """Initialise the ObjBase part, the worker table and the logger."""
        ObjBase.__init__(self)

        # worker table, keyed by name: {name : workerObj}
        self.workers = {}

        # the logger name carries the value returned by getnodename()
        node_name = getnodename()
        self.logger = get_logger('WorkerService_%s' % node_name)
Exemple #2
0
def create_multi(options):
    """Build a ``pycurl.CurlMulti`` and the easy handles attached to it.

    ``options`` is a mapping whose values are indexed with ``[0]`` (they
    are expected to be lists); keys are looked up in lower case.
    ``options['maxconnections']`` gives the number of easy handles to
    create (default 5).

    Every key of ``settings.CRAWL_OPTIONS`` is set on each handle. The
    value supplied in ``options`` is used when present and non-empty,
    otherwise the default from ``settings.CRAWL_OPTIONS``.

    Returns ``(multi, freelist)`` where ``freelist`` is a shallow copy of
    ``multi.handles``. Returns ``None`` when at least one handle was
    requested but ``settings.CRAWL_OPTIONS`` is missing.
    """
    logger = get_logger('MULTIHANDLER')
    m = pycurl.CurlMulti()
    m.handles = []

    # default number of connections is 5
    # some options are hardcoded, nonprovided options
    # might be problematic when settings.py is used
    connection_count = int(options.get('maxconnections', [5])[0])
    logger.debug('maxconnections %d' % connection_count)

    # TODO: a warning for required options should be added here

    # Checked once, before any easy handle exists, so no handle is left
    # behind on the error path. The count guard keeps the old result for
    # a non-positive connection count (an empty handle list, not None).
    if connection_count > 0 and not hasattr(settings, 'CRAWL_OPTIONS'):
        logger.error('No options for crawler found')
        return None

    for i in range(connection_count):
        c = pycurl.Curl()
        c.name = 'curl%s' % i

        for opt, val in settings.CRAWL_OPTIONS.items():

            # use user provided options or defaults
            provided = options.get(opt.lower())
            if provided and provided[0]:
                raw = provided[0]
                # NOTE(review): `t` is a pattern object defined outside
                # this block; presumably it recognises numeric strings.
                if t.match(raw):
                    optval = int(raw)
                else:
                    optval = raw
            else:
                optval = val

            # apply the resolved value (previously the default `val` was
            # always passed and the user supplied value was ignored)
            c.setopt(getattr(pycurl, opt), optval)
        m.handles.append(c)

    return m, m.handles[:]  # multi, freelist
Exemple #3
0
def create_multi(options):
    """Build a ``pycurl.CurlMulti`` and the easy handles attached to it.

    ``options`` is a mapping whose values are indexed with ``[0]`` (they
    are expected to be lists); keys are looked up in lower case.
    ``options["maxconnections"]`` gives the number of easy handles to
    create (default 5).

    Every key of ``settings.CRAWL_OPTIONS`` is set on each handle. The
    value supplied in ``options`` is used when present and non-empty,
    otherwise the default from ``settings.CRAWL_OPTIONS``.

    Returns ``(multi, freelist)`` where ``freelist`` is a shallow copy of
    ``multi.handles``. Returns ``None`` when at least one handle was
    requested but ``settings.CRAWL_OPTIONS`` is missing.
    """
    logger = get_logger("MULTIHANDLER")
    m = pycurl.CurlMulti()
    m.handles = []

    # default number of connections is 5
    # some options are hardcoded, nonprovided options
    # might be problematic when settings.py is used
    connection_count = int(options.get("maxconnections", [5])[0])
    logger.debug("maxconnections %d" % connection_count)

    # TODO: a warning for required options should be added here

    # Checked once, before any easy handle exists, so no handle is left
    # behind on the error path. The count guard keeps the old result for
    # a non-positive connection count (an empty handle list, not None).
    if connection_count > 0 and not hasattr(settings, "CRAWL_OPTIONS"):
        logger.error("No options for crawler found")
        return None

    for i in range(connection_count):
        c = pycurl.Curl()
        c.name = "curl%s" % i

        for opt, val in settings.CRAWL_OPTIONS.items():

            # use user provided options or defaults
            provided = options.get(opt.lower())
            if provided and provided[0]:
                raw = provided[0]
                # NOTE(review): `t` is a pattern object defined outside
                # this block; presumably it recognises numeric strings.
                if t.match(raw):
                    optval = int(raw)
                else:
                    optval = raw
            else:
                optval = val

            # apply the resolved value (previously the default `val` was
            # always passed and the user supplied value was ignored)
            c.setopt(getattr(pycurl, opt), optval)
        m.handles.append(c)

    return m, m.handles[:]  # multi, freelist
Exemple #4
0
    def __init__(self, filename):
        """Open *filename* with gzip for reading.

        When opening raises IOError the error is logged and ``self.file``
        is left as None.
        """
        self.logger = get_logger('DArcReader')

        archive = None
        try:
            archive = gzip.open(filename)
        except IOError:
            self.logger.error('File not found: %s' % filename)
        self.file = archive
Exemple #5
0
    def __init__(self, filename):
        """Open a gzip file for writing whose name is derived from *filename*.

        Everything from the first '.' of *filename* onwards is dropped and
        '_temp' is appended to form the name of the file that is opened.
        """
        base_name = filename.split('.', 1)[0]  # trim extension
        self.filename = base_name
        self.file = gzip.GzipFile('%s_temp' % base_name, 'wb')

        # if redirected, new url should be rewritten according to regex rules
        location_header = re.compile(r'Location: (.*)\r\n')
        self.regx = location_header
        self.logger = get_logger('DArcWriter')
Exemple #6
0
    def __init__(self, filename):
        """Open *filename* with gzip for reading.

        When opening raises IOError the error is logged and ``self.file``
        is left as None.
        """
        self.logger = get_logger('DArcReader')

        archive = None
        try:
            archive = gzip.open(filename)
        except IOError:
            self.logger.error('File not found: %s' % filename)
        self.file = archive
Exemple #7
0
    def __init__(self, filename):
        """Open a gzip file for writing whose name is derived from *filename*.

        Everything from the first '.' of *filename* onwards is dropped and
        '_temp' is appended to form the name of the file that is opened.
        """
        base_name = filename.split('.', 1)[0]  # trim extension
        self.filename = base_name
        self.file = gzip.GzipFile('%s_temp' % base_name, 'wb')

        # if redirected, new url should be rewritten according to regex rules
        location_header = re.compile(r'Location: (.*)\r\n')
        self.regx = location_header
        self.logger = get_logger('DArcWriter')
Exemple #8
0
    def __init__(self, config, report=None, status=None):
        """Keep the configuration and shared objects and create a logger.

        The instance name is the first element of
        ``config['options']['name']``; the logger is created under that
        same name.
        """
        # TODO: format configuration items, everything is in list form for now
        options = config.get('options')
        self.name = options.get('name')[0]
        self.logger = get_logger(self.name)

        # shared objects
        self.report, self.status, self.config = report, status, config
Exemple #9
0
    def __init__(self, config, report=None, status=None):
        """Keep the configuration and shared objects and create a logger.

        The instance name is the first element of
        ``config['options']['name']``; the logger is created under that
        same name.
        """
        # TODO: format configuration items, everything is in list form for now
        options = config.get('options')
        self.name = options.get('name')[0]
        self.logger = get_logger(self.name)

        # shared objects
        self.report, self.status, self.config = report, status, config
Exemple #10
0
def listprofiles(*args):
    """Log the names of all stored profiles, then exit with status 0.

    Positional arguments are accepted but not used.
    """
    logger = get_logger('ListProfiles')
    profiles = Session().query(Profile).all()

    if profiles:
        names = [profile.name for profile in profiles]
        logger.info('Profiles found: %s' % ', '.join(names))
    else:
        logger.error('No profiles found')
    sys.exit(0)
Exemple #11
0
    def __init__(self, config):
        """Set up the URL bookkeeping containers and load the plugins.

        config -- mapping with an 'options' entry and a 'plugins' entry.
                  ``options['seeds'][0]`` is a comma separated string of
                  seed URLs; 'plugins' has the form
                  {plugin_name: [params]}.

        Plugins are sorted into extractors, filters and transformers by
        the suffix of their registry name; names matching none of the
        suffixes are ignored. A plugin that raises while loading is
        logged and skipped.
        """
        if not hasattr(self, 'logger'):
            self.logger = get_logger('UrlsContainer')

        self.options = config.get('options')
        self.plugins = config.get('plugins')
        self.done = Set([])
        # TODO: format configuration objects, everything is in list form
        seeds = self.options.get('seeds')[0].split(',')
        self.queue = Set([item.strip().encode('utf-8') for item in seeds])
        self.temp = Set([])
        self.failed = Set([])
        self.retries = dict()
        self.extractors = []
        self.filters = []
        self.transformers = []

        # (name suffix, destination list); the first matching suffix wins
        kinds = (
            ("Extractor", self.extractors),
            ("Filter", self.filters),
            ("Transform", self.transformers),
        )

        def bucket_for(plugin_name):
            # Return the list a plugin of this name belongs to, or None.
            for suffix, bucket in kinds:
                if plugin_name.endswith(suffix):
                    return bucket
            return None

        # TODO: rewrite
        # plugins = {plug: [params], plug2: [params2]}
        # plugins with no params where args_required will be discarded
        for name, parms in self.plugins.items():
            try:
                cls = registry.get(name)
                if cls.opts.get('args_required') and not parms[0]:
                    continue

                bucket = bucket_for(name)
                if bucket is not None:
                    bucket.append(cls(parms))
            # Exception rather than a bare except, so that
            # KeyboardInterrupt/SystemExit are not swallowed here.
            except Exception:
                self.logger.error('Loading of plugin %s failed\n\n %s' %
                                  (name, traceback.format_exc()))

        # also add enabled but non visible plugins to scene
        for name, cls in registry.items():
            try:
                if cls.enabled and not cls.visible:
                    bucket = bucket_for(name)
                    if bucket is not None:
                        bucket.append(cls())
            except Exception:
                self.logger.error('Loading of plugin %s failed\n\n %s' %
                                  (name, traceback.format_exc()))
Exemple #12
0
def listnodes(*args):
    """Log the node names listed by the Pyro name server, then exit with 0.

    The name server is asked for the ':Default.<domain>' group and the
    first element of every returned entry is logged. Positional arguments
    are accepted but not used.
    """
    logger = get_logger('Listnodes')

    locator = Pyro.naming.NameServerLocator()
    ns = locator.getNS()
    nlist = ns.list(':Default.%s' % domain)
    if nlist:
        names = [node[0] for node in nlist]
        logger.info('Nodes found: %s' % ", ".join(names))
    else:
        logger.error('No nodes found')

    sys.exit(0)
Exemple #13
0
def listprofiles(*args):
    """Log the names of all stored profiles, then exit with status 0.

    Positional arguments are accepted but not used.
    """
    logger = get_logger('ListProfiles')
    profiles = Session().query(Profile).all()

    if profiles:
        names = [profile.name for profile in profiles]
        logger.info('Profiles found: %s' % ', '.join(names))
    else:
        logger.error('No profiles found')
    sys.exit(0)
Exemple #14
0
def listnodes(*args):
    """Log the node names listed by the Pyro name server, then exit with 0.

    The name server is asked for the ':Default.<domain>' group and the
    first element of every returned entry is logged. Positional arguments
    are accepted but not used.
    """
    logger = get_logger('Listnodes')

    locator = Pyro.naming.NameServerLocator()
    ns = locator.getNS()
    nlist = ns.list(':Default.%s' % domain)
    if nlist:
        names = [node[0] for node in nlist]
        logger.info('Nodes found: %s' % ", ".join(names))
    else:
        logger.error('No nodes found')

    sys.exit(0)
Exemple #15
0
    def __init__(self, config):
        """Set up the URL bookkeeping containers and load the plugins.

        config -- mapping with an 'options' entry and a 'plugins' entry.
                  ``options['seeds'][0]`` is a comma separated string of
                  seed URLs; 'plugins' has the form
                  {plugin_name: [params]}.

        Plugins are sorted into extractors, filters and transformers by
        the suffix of their registry name; names matching none of the
        suffixes are ignored. A plugin that raises while loading is
        logged and skipped.
        """
        if not hasattr(self, 'logger'):
            self.logger = get_logger('UrlsContainer')

        self.options = config.get('options')
        self.plugins = config.get('plugins')
        self.done = Set([])
        # TODO: format configuration objects, everything is in list form
        seeds = self.options.get('seeds')[0].split(',')
        self.queue = Set([item.strip().encode('utf-8') for item in seeds])
        self.temp = Set([])
        self.failed = Set([])
        self.retries = dict()
        self.extractors = []
        self.filters = []
        self.transformers = []

        # (name suffix, destination list); the first matching suffix wins
        kinds = (
            ("Extractor", self.extractors),
            ("Filter", self.filters),
            ("Transform", self.transformers),
        )

        def bucket_for(plugin_name):
            # Return the list a plugin of this name belongs to, or None.
            for suffix, bucket in kinds:
                if plugin_name.endswith(suffix):
                    return bucket
            return None

        # TODO: rewrite
        # plugins = {plug: [params], plug2: [params2]}
        # plugins with no params where args_required will be discarded
        for name, parms in self.plugins.items():
            try:
                cls = registry.get(name)
                if cls.opts.get('args_required') and not parms[0]:
                    continue

                bucket = bucket_for(name)
                if bucket is not None:
                    bucket.append(cls(parms))
            # Exception rather than a bare except, so that
            # KeyboardInterrupt/SystemExit are not swallowed here.
            except Exception:
                self.logger.error('Loading of plugin %s failed\n\n %s' %
                                  (name, traceback.format_exc()))

        # also add enabled but non visible plugins to scene
        for name, cls in registry.items():
            try:
                if cls.enabled and not cls.visible:
                    bucket = bucket_for(name)
                    if bucket is not None:
                        bucket.append(cls())
            except Exception:
                self.logger.error('Loading of plugin %s failed\n\n %s' %
                                  (name, traceback.format_exc()))
Exemple #16
0
    def __init__(self, *args, **kwargs):
        """Create the logger and record the plugin arguments.

        When ``self.args_required`` is false, ``self.args`` becomes an
        empty list. When it is true and positional arguments were given,
        ``self.args`` is a copy made with ``copy(*args)``. When it is true
        and nothing was given, the plugin logs a notice and sets
        ``self.enabled`` to False.
        """
        plugin_name = self.__class__.__name__
        self.logger = get_logger(plugin_name)

        if not self.args_required:
            self.args = []
            return

        if args:
            self.args = copy(*args)  # always in ['blah blah'] form
            return

        self.logger.info('%s requires arguments, but none given. therefore this module is disabled' %
                         self.__class__.__name__)
        self.enabled = False
Exemple #17
0
    def __init__(self, *args, **kwargs):
        """Create the logger and record the plugin arguments.

        When ``self.args_required`` is false, ``self.args`` becomes an
        empty list. When it is true and positional arguments were given,
        ``self.args`` is a copy made with ``copy(*args)``. When it is true
        and nothing was given, the plugin logs a notice and sets
        ``self.enabled`` to False.
        """
        plugin_name = self.__class__.__name__
        self.logger = get_logger(plugin_name)

        if not self.args_required:
            self.args = []
            return

        if args:
            self.args = copy(*args)  # always in ['blah blah'] form
            return

        self.logger.info(
            "%s requires arguments, but none given. therefore this module is disabled" % self.__class__.__name__
        )
        self.enabled = False
Exemple #18
0
def runcommand(options, parser):
    """Run the command given on the command line against the job service.

    Only the 'create' command is handled here: the profile named by
    ``options.profile`` is loaded, a worker is created from its
    configuration through the remote job service, and a 'paused' Job row
    is stored for it.

    options -- parsed options; ``command``, ``node`` and ``profile`` are read
    parser  -- option parser, used to print the help text

    Returns 0 when a worker was created and -1 when creation failed. When
    ``command`` or ``node`` is missing, the result of
    ``parser.print_help()`` is returned; when the profile option is
    missing, the result of ``logger.error(...)`` is returned.
    """
    logger = get_logger('RunCommand')

    if not options.command or not options.node:
        return parser.print_help()

    # The name server lookup is disabled; the job service is reached
    # through a fixed local URI instead:
    #ns = Pyro.naming.NameServerLocator().getNS()
    #uri = ns.resolve('%s.%s.jobservice' % (domain, options.node))
    #js = Pyro.core.getProxyForURI(uri)
    js = Pyro.core.getProxyForURI("PYROLOC://localhost:7766/jobservice")

    if options.command == 'create':
        if not options.profile:
            return logger.error(
                'crate should be called with profile name: -n nodename -c create -p profile'
            )

        sess = Session()
        # try/finally so the session is released on every exit path
        try:
            try:
                pf = sess.query(Profile).filter(
                    profile.c.name == options.profile).first()
                if pf is None:
                    # a missing profile is reported as such, instead of
                    # through an AttributeError on None
                    logger.error('No profile found with name: %s' %
                                 options.profile)
                    return -1
                status, workername = js.create(pf.configuration)

            except Exception as e:
                # query or remote call failed; this is not a missing profile
                logger.error('Could not create worker for profile: %s' %
                             options.profile)
                logger.error("".join(getPyroTraceback(e)))
                return -1

            if status:
                j = Job(workername, 'paused')
                pf.jobs.append(j)
                #sess.save(j)
                sess.add(j)
                sess.commit()
                logger.info("worker with name %s created" % workername)
                return 0

            logger.error('no worker created: %s' % workername)
            return -1
        finally:
            sess.close()
Exemple #19
0
def runcommand(options, parser):
    """Run the command given on the command line against the job service.

    Only the 'create' command is handled here: the profile named by
    ``options.profile`` is loaded, a worker is created from its
    configuration through the remote job service, and a 'paused' Job row
    is stored for it.

    options -- parsed options; ``command``, ``node`` and ``profile`` are read
    parser  -- option parser, used to print the help text

    Returns 0 when a worker was created and -1 when creation failed. When
    ``command`` or ``node`` is missing, the result of
    ``parser.print_help()`` is returned; when the profile option is
    missing, the result of ``logger.error(...)`` is returned.
    """
    logger = get_logger('RunCommand')

    if not options.command or not options.node:
        return parser.print_help()

    # The name server lookup is disabled; the job service is reached
    # through a fixed local URI instead:
    #ns = Pyro.naming.NameServerLocator().getNS()
    #uri = ns.resolve('%s.%s.jobservice' % (domain, options.node))
    #js = Pyro.core.getProxyForURI(uri)
    js = Pyro.core.getProxyForURI("PYROLOC://localhost:7766/jobservice")

    if options.command == 'create':
        if not options.profile:
            return logger.error('crate should be called with profile name: -n nodename -c create -p profile')

        sess = Session()
        # try/finally so the session is released on every exit path
        try:
            try:
                pf = sess.query(Profile).filter(profile.c.name==options.profile).first()
                if pf is None:
                    # a missing profile is reported as such, instead of
                    # through an AttributeError on None
                    logger.error('No profile found with name: %s' % options.profile)
                    return -1
                status, workername = js.create(pf.configuration)

            except Exception as e:
                # query or remote call failed; this is not a missing profile
                logger.error('Could not create worker for profile: %s' % options.profile)
                logger.error("".join(getPyroTraceback(e)))
                return -1

            if status:
                j = Job(workername, 'paused')
                pf.jobs.append(j)
                #sess.save(j)
                sess.add(j)
                sess.commit()
                logger.info("worker with name %s created" % workername)
                return 0

            logger.error('no worker created: %s' % workername)
            return -1
        finally:
            sess.close()
Exemple #20
0
def indexer(filename, max_count=100):
    """Store an Index row for every chunk of an archive file.

    For each chunk yielded by ``DArcReader(filename).chunks()`` an
    ``Index`` is added built from the file name, the timestamp taken
    from the file name, the chunk's first element and the reader position
    recorded before that chunk was produced. Rows are committed every
    ``max_count`` chunks and once more at the end.

    filename  -- archive name; the timestamp is its second '_'-separated
                 field (always profilename_timestamp_node.arc)
    max_count -- number of rows added between two commits
    """
    logger = get_logger('Indexer')

    count = offset = 0
    sess = Session()
    d = DArcReader('%s' % filename)
    try:
        logger.debug('Archive file %s opened with pointer %s' % (filename, d))
        timestamp = filename.split('_')[1]  # always profilename_timestamp_node.arc

        for chunk in d.chunks():
            # url : offset
            count += 1

            index = Index(filename, timestamp, chunk[0], offset)
            #sess.save(index)
            sess.add(index)
            if count % max_count == 0:
                sess.commit()
                # release the finished session before starting a new one
                sess.close()
                sess = Session()

            offset = d.tell()
        sess.commit()
    finally:
        # always release the session and the archive, even on errors
        sess.close()
        d.close()
Exemple #21
0
def indexer(filename, max_count=100):
    """Store an Index row for every chunk of an archive file.

    For each chunk yielded by ``DArcReader(filename).chunks()`` an
    ``Index`` is added built from the file name, the timestamp taken
    from the file name, the chunk's first element and the reader position
    recorded before that chunk was produced. Rows are committed every
    ``max_count`` chunks and once more at the end.

    filename  -- archive name; the timestamp is its second '_'-separated
                 field (always profilename_timestamp_node.arc)
    max_count -- number of rows added between two commits
    """
    logger = get_logger('Indexer')

    count = offset = 0
    sess = Session()
    d = DArcReader('%s' % filename)
    try:
        logger.debug('Archive file %s opened with pointer %s' % (filename, d))
        timestamp = filename.split('_')[1]  # always profilename_timestamp_node.arc

        for chunk in d.chunks():
            # url : offset
            count += 1

            index = Index(filename, timestamp, chunk[0], offset)
            #sess.save(index)
            sess.add(index)
            if count % max_count == 0:
                sess.commit()
                # release the finished session before starting a new one
                sess.close()
                sess = Session()

            offset = d.tell()
        sess.commit()
    finally:
        # always release the session and the archive, even on errors
        sess.close()
        d.close()
Exemple #22
0
from Pyro.util import getPyroTraceback
from datetime import datetime
from domo import settings
from domo.interfaces.db import Site, Job, makelogsession
from optparse import OptionParser
from sqlalchemy.exceptions import InvalidRequestError
from domo.interfaces.logger import get_logger
from datetime import datetime, timedelta
import Pyro.core
import Pyro.naming
import sys
import traceback

# Module-wide logger for the cron service.
logger = get_logger('Cron service')
# Domain value from the project settings; listnodes() uses it to build the
# ':Default.<domain>' name passed to the name server.
domain = settings.DOMAIN

def listnodes(ns):
    """Return the first element of every entry in ':Default.<domain>'.

    ns -- name server object providing ``list(name)``

    Logs an error and returns None when the listing is empty.
    """
    entries = ns.list(':Default.%s' % domain)

    if entries:
        return [entry[0] for entry in entries]

    logger.error('No nodes found, exiting')
    return None

def listsites():
    """Query all Site rows through a newly made log session.

    NOTE(review): only the "no sites" branch is visible in this excerpt
    and the function looks cut off after it, so the return value is
    deliberately not documented here -- confirm against the full source.
    """
    # makelogsession() returns a callable; calling that yields the session
    session = makelogsession()()
    sites = session.query(Site).all()
    
    if not sites:
        logger.error('No sites found, exiting')
Exemple #23
0
from Pyro.util import getPyroTraceback
from datetime import datetime
from domo import settings
from domo.interfaces.db import Site, Job, makelogsession
from optparse import OptionParser
from sqlalchemy.exceptions import InvalidRequestError
from domo.interfaces.logger import get_logger
from datetime import datetime, timedelta
import Pyro.core
import Pyro.naming
import sys
import traceback

# Module-wide logger for the cron service.
logger = get_logger('Cron service')
# Domain value from the project settings; listnodes() uses it to build the
# ':Default.<domain>' name passed to the name server.
domain = settings.DOMAIN


def listnodes(ns):
    """Return the first element of every entry in ':Default.<domain>'.

    ns -- name server object providing ``list(name)``

    Logs an error and returns None when the listing is empty.
    """
    entries = ns.list(':Default.%s' % domain)

    if entries:
        return [entry[0] for entry in entries]

    logger.error('No nodes found, exiting')
    return None


def listsites():
    session = makelogsession()()
    sites = session.query(Site).all()