Beispiel #1
0
    def __init__(self, base_url, conf_urls=None, verbosity=1, output_dir=None, ascend=True, **kwargs):
        """Set up crawl state, the request client, optional login and plugins.

        Args:
            base_url: URL the crawl starts from; it seeds ``not_crawled``.
            conf_urls: Mapping stored as ``self.conf_urls`` (presumably
                per-URL crawl configuration -- confirm against the code
                that reads it). ``None`` gives this instance its own
                empty dict.
            verbosity: Stored as ``self.verbosity``; not interpreted here.
            output_dir: Optional path of an existing directory. When
                given, its resolved real path is stored and logged;
                otherwise ``self.output_dir`` is ``None``.
            ascend: Stored as ``self.ascend``; not interpreted here.
            **kwargs: Only ``auth`` is read: a dict whose items are
                forwarded as keyword arguments to ``self.c.login``.
                Any other keys are ignored.

        Raises:
            AssertionError: If ``output_dir`` is given but is not an
                existing directory (only while assertions are enabled).
        """
        self.base_url = base_url
        # A ``{}`` default in the signature would be a single dict shared
        # by every instance created without ``conf_urls``; build a fresh
        # one per instance instead.
        self.conf_urls = {} if conf_urls is None else conf_urls
        self.verbosity = verbosity
        self.ascend = ascend

        auth = kwargs.get('auth')

        if output_dir:
            # NOTE(review): asserts are skipped under ``python -O``; kept
            # so callers still see AssertionError for a bad directory.
            assert os.path.isdir(output_dir), \
                "output_dir is not a directory: %r" % (output_dir,)
            self.output_dir = os.path.realpath(output_dir)
            LOG.info("Output will be saved to %s" % self.output_dir)
        else:
            self.output_dir = None

        # These two are what keep track of what to crawl and what has been.
        # Each pending entry is a 3-tuple; presumably
        # (depth, referring URL, URL to fetch) -- confirm against the
        # crawl loop.
        self.not_crawled = [(0, 'START', self.base_url)]
        self.crawled = {}

        # Client every request goes through, created with REMOTE_ADDR set
        # to the loopback address.
        self.c = Client(REMOTE_ADDR='127.0.0.1')

        if auth:
            # Log the credentials through cleanse_setting rather than raw
            # (presumably it masks sensitive values such as passwords --
            # confirm against its definition).
            printable_auth = ', '.join(
                '%s: %s' % (key, cleanse_setting(key.upper(), value))
                for key, value in auth.items())
            LOG.info('Log in with %s' % printable_auth)
            self.c.login(**auth)

        # Instantiate every direct Plugin subclass that has not opted out
        # by setting a falsy ``active`` attribute.
        self.plugins = []
        for plug in Plugin.__subclasses__():
            active = getattr(plug, 'active', True)
            if active:
                #TODO: Check if plugin supports writing CSV (or to a file in general?)
                self.plugins.append(plug())
Beispiel #2
0
    def __init__(self, base_url, conf_urls=None, verbosity=1, output_dir=None, ascend=True, **kwargs):
        """Set up crawl state, the request client and the active plugins.

        Args:
            base_url: URL the crawl starts from; it seeds ``not_crawled``.
            conf_urls: Mapping stored as ``self.conf_urls`` (presumably
                per-URL crawl configuration -- confirm against the code
                that reads it). ``None`` gives this instance its own
                empty dict.
            verbosity: Stored as ``self.verbosity``; not interpreted here.
            output_dir: Optional path of an existing directory. When
                given, its resolved real path is stored and logged;
                otherwise ``self.output_dir`` is ``None``.
            ascend: Stored as ``self.ascend``; not interpreted here.
            **kwargs: Accepted but not read by this constructor.

        Raises:
            AssertionError: If ``output_dir`` is given but is not an
                existing directory (only while assertions are enabled).
        """
        self.base_url = base_url
        # A ``{}`` default in the signature would be a single dict shared
        # by every instance created without ``conf_urls``; build a fresh
        # one per instance instead.
        self.conf_urls = {} if conf_urls is None else conf_urls
        self.verbosity = verbosity
        self.ascend = ascend

        if output_dir:
            # NOTE(review): asserts are skipped under ``python -O``; kept
            # so callers still see AssertionError for a bad directory.
            assert os.path.isdir(output_dir), \
                "output_dir is not a directory: %r" % (output_dir,)
            self.output_dir = os.path.realpath(output_dir)
            LOG.info("Output will be saved to %s" % self.output_dir)
        else:
            self.output_dir = None

        # These two are what keep track of what to crawl and what has been.
        # Each pending entry is a 3-tuple; presumably
        # (depth, referring URL, URL to fetch) -- confirm against the
        # crawl loop.
        self.not_crawled = [(0, "START", self.base_url)]
        self.crawled = {}

        # Client every request goes through, created with REMOTE_ADDR set
        # to the loopback address.
        self.c = Client(REMOTE_ADDR="127.0.0.1")

        # Instantiate every direct Plugin subclass that has not opted out
        # by setting a falsy ``active`` attribute.
        self.plugins = []
        for plug in Plugin.__subclasses__():
            active = getattr(plug, "active", True)
            if active:
                # TODO: Check if plugin supports writing CSV (or to a file in general?)
                self.plugins.append(plug())
Beispiel #3
0
    def __init__(self, base_url, conf_urls=None, verbosity=1, **kwargs):
        """Set up crawl state, the request client and the active plugins.

        Args:
            base_url: URL the crawl starts from; it seeds ``not_crawled``.
            conf_urls: Mapping stored as ``self.conf_urls`` (presumably
                per-URL crawl configuration -- confirm against the code
                that reads it). ``None`` gives this instance its own
                empty dict.
            verbosity: Stored as ``self.verbosity``; not interpreted here.
            **kwargs: Accepted but not read by this constructor.
        """
        self.base_url = base_url
        # A ``{}`` default in the signature would be a single dict shared
        # by every instance created without ``conf_urls``; build a fresh
        # one per instance instead.
        self.conf_urls = {} if conf_urls is None else conf_urls
        self.verbosity = verbosity

        # These two are what keep track of what to crawl and what has been.
        # Each pending entry is a 2-tuple; presumably
        # (referring URL, URL to fetch) -- confirm against the crawl loop.
        self.not_crawled = [('START', self.base_url)]
        self.crawled = {}

        # Client every request goes through, created with REMOTE_ADDR set
        # to the loopback address.
        self.c = Client(REMOTE_ADDR='127.0.0.1')

        # Instantiate every direct Plugin subclass that has not opted out
        # by setting a falsy ``active`` attribute.
        self.plugins = []
        for plug in Plugin.__subclasses__():
            active = getattr(plug, 'active', True)
            if active:
                self.plugins.append(plug())
Beispiel #4
0
    def __init__(self, base_url, conf_urls=None, verbosity=1, **kwargs):
        """Initialise the crawl queue, the request client and the plugins.

        Args:
            base_url: Starting URL; becomes the only initial entry of
                ``not_crawled``.
            conf_urls: Mapping kept as ``self.conf_urls`` (presumably
                per-URL crawl settings -- verify against its readers).
                When omitted or ``None`` a new empty dict is created for
                this instance.
            verbosity: Kept as ``self.verbosity``; not used here.
            **kwargs: Accepted but not read by this constructor.
        """
        self.base_url = base_url
        # Never share one default dict between instances: a ``{}`` literal
        # in the signature is evaluated once and reused on every call.
        self.conf_urls = {} if conf_urls is None else conf_urls
        self.verbosity = verbosity

        # These two are what keep track of what to crawl and what has been.
        # Pending entries are 2-tuples; presumably
        # (referring URL, URL to fetch) -- verify against the crawl loop.
        self.not_crawled = [('START', self.base_url)]
        self.crawled = {}

        # Request client, created with REMOTE_ADDR set to the loopback
        # address.
        self.c = Client(REMOTE_ADDR='127.0.0.1')

        # One instance of each direct Plugin subclass, skipping classes
        # whose ``active`` attribute is falsy (missing means active).
        self.plugins = []
        for plug in Plugin.__subclasses__():
            active = getattr(plug, 'active', True)
            if active:
                self.plugins.append(plug())
    def __init__(self,
                 base_url,
                 conf_urls=None,
                 verbosity=1,
                 output_dir=None,
                 ascend=True,
                 **kwargs):
        """Initialise crawl state, the request client, login and plugins.

        Args:
            base_url: Starting URL; becomes the only initial entry of
                ``not_crawled``.
            conf_urls: Mapping kept as ``self.conf_urls`` (presumably
                per-URL crawl settings -- verify against its readers).
                When omitted or ``None`` a new empty dict is created for
                this instance.
            verbosity: Kept as ``self.verbosity``; not used here.
            output_dir: Optional path of an existing directory. If
                truthy, its real path is stored in ``self.output_dir``
                and logged; otherwise ``self.output_dir`` is ``None``.
            ascend: Kept as ``self.ascend``; not used here.
            **kwargs: Only the ``auth`` key is consulted: a dict that is
                unpacked into ``self.c.login(**auth)``. Other keys are
                ignored.

        Raises:
            AssertionError: If ``output_dir`` is truthy but not an
                existing directory (only while assertions are enabled).
        """
        self.base_url = base_url
        # Never share one default dict between instances: a ``{}`` literal
        # in the signature is evaluated once and reused on every call.
        self.conf_urls = {} if conf_urls is None else conf_urls
        self.verbosity = verbosity
        self.ascend = ascend

        auth = kwargs.get('auth')

        if output_dir:
            # NOTE(review): this check disappears under ``python -O``; the
            # assert is kept so the raised exception type is unchanged.
            assert os.path.isdir(output_dir), \
                "output_dir is not a directory: %r" % (output_dir,)
            self.output_dir = os.path.realpath(output_dir)
            LOG.info("Output will be saved to %s" % self.output_dir)
        else:
            self.output_dir = None

        # These two are what keep track of what to crawl and what has been.
        # Pending entries are 3-tuples; presumably
        # (depth, referring URL, URL to fetch) -- verify against the
        # crawl loop.
        self.not_crawled = [(0, 'START', self.base_url)]
        self.crawled = {}

        # Request client, created with REMOTE_ADDR set to the loopback
        # address.
        self.c = Client(REMOTE_ADDR='127.0.0.1')

        if auth:
            # Values go through cleanse_setting before being logged
            # (presumably to mask secrets such as passwords -- verify
            # against its definition).
            printable_auth = ', '.join(
                '%s: %s' % (key, cleanse_setting(key.upper(), value))
                for key, value in auth.items())
            LOG.info('Log in with %s' % printable_auth)
            self.c.login(**auth)

        # One instance of each direct Plugin subclass, skipping classes
        # whose ``active`` attribute is falsy (missing means active).
        self.plugins = []
        for plug in Plugin.__subclasses__():
            active = getattr(plug, 'active', True)
            if active:
                #TODO: Check if plugin supports writing CSV (or to a file in general?)
                self.plugins.append(plug())