Example #1
import numpy as np
import pandas as pd


def map_file_apply(file_list, apply_func, apply_func_args):
    """ Main controller for the crawl job, applies the provided func and args to each file

    :param file_list: list of files that will be operated on by @apply_func
    :param apply_func: function to be applied to each file
    :param apply_func_args: dict of keyword args needed for @apply_func
    :return: DataFrame aggregated in crawler.query_return
    """

    crawler = Crawler()
    crawler.query_return = []

    # np.vectorize broadcasts apply_func across file_list; each call also
    # receives the shared crawler instance and the keyword arguments.
    vapply_func = np.vectorize(apply_func)
    vapply_func(file_list, crawler, **apply_func_args)

    if not crawler.query_return:
        return None
    return pd.concat(crawler.query_return)
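
A minimal usage sketch (the collect function and file names below are hypothetical; only map_file_apply comes from the example):

import pandas as pd

def collect(path, crawler):
    # Hypothetical per-file function: append one DataFrame per file to the
    # shared crawler; map_file_apply concatenates them at the end.
    crawler.query_return.append(pd.DataFrame({'file': [path]}))

combined = map_file_apply(['file1', 'file2'], collect, {})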
Example #2
    def test_connect(self, mock_sftp, mock_transport):
        mock_creds = {
            'host': 'host',
            'user': '******',
            'password': '******',
            'port': '22'
        }

        crawl = Crawler(mock_creds)
        crawl.transport.connect.assert_called_with(username='******', password='******')
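
The mock.patch decorators were evidently dropped when this snippet was extracted. A minimal sketch of how such a test is usually wired up with unittest.mock, assuming the Crawler connects through paramiko (both patch targets are assumptions):

from unittest import mock

# Decorators apply bottom-up: the innermost patch fills the first mock
# argument (mock_sftp), the outermost the second (mock_transport).
@mock.patch('paramiko.Transport')
@mock.patch('paramiko.SFTPClient')
def test_connect(self, mock_sftp, mock_transport):
    ...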
Example #3
# Pyrogram imports (the raw-type path assumes Pyrogram >= 1.0); Storage,
# Worker and Crawler are project-local classes.
from pyrogram import Client
from pyrogram.raw.types import InputMessagesFilterUrl


def main():
    base_url = "https://t.me/BonOgood"
    depth = 2
    userbot = Client("userbot")
    messages_filter = InputMessagesFilterUrl()

    worker_arguments = {"userbot": userbot, "messages_filter": messages_filter}

    userbot.start()

    mongo_storage = Storage(
        base_url=base_url,
        db_name="crawlerDB",
        username="******",
        password="******",
        max_depth=depth,
    )

    crawler = Crawler(base_url=base_url,
                      depth=depth,
                      storage=mongo_storage,
                      worker_class=Worker,
                      workers_number=1,
                      **worker_arguments)
    crawler.create_workers()
    crawler.run_workers()
Example #4
def main():
    base_url = "en.wikipedia.org/wiki/Main_Page"
    depth = 2

    sqlite_storage = Storage(db_name="storage.db", base_url=base_url, max_depth=depth)
    crawler = Crawler(
        base_url=base_url,
        depth=depth,
        storage=sqlite_storage,
        worker_class=Worker,
        workers_number=2,
    )
    crawler.create_workers()
    crawler.run_workers()
    crawler.idle()
Example #5
def main():
    crawler = Crawler(base_url='kiryat4.org.il',
                      db_name='crawlerDB',
                      depth=5,
                      storage_class=Storage,
                      worker_class=Worker,
                      workers_number=5,
                      username='******',
                      password="******")
    crawler.create_workers()
    crawler.run_workers()
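
Examples #3, #4 and #5 all drive the same project-local interface: build a Crawler with a storage backend and a Worker class, then call create_workers() and run_workers(). A minimal sketch of what that interface implies (entirely an assumption; the project's real implementation is not shown in these examples):

class Crawler:
    # Hypothetical reconstruction of the interface used above.
    def __init__(self, base_url, depth, storage, worker_class,
                 workers_number, **worker_arguments):
        self.base_url = base_url
        self.depth = depth
        self.storage = storage
        self.worker_class = worker_class
        self.workers_number = workers_number
        self.worker_arguments = worker_arguments
        self.workers = []

    def create_workers(self):
        # One worker per slot, all sharing the same storage backend.
        self.workers = [
            self.worker_class(storage=self.storage, **self.worker_arguments)
            for _ in range(self.workers_number)
        ]

    def run_workers(self):
        for worker in self.workers:
            worker.start()

Example #5 differs slightly: it passes storage_class, db_name and credentials instead of a ready-made storage object, letting the Crawler construct its own Storage.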
Example #6
    def __init__(self):
        self.crawler = Crawler()
Example #7
from flask import request


class API:
    def __init__(self):
        self.crawler = Crawler()

    def parse(self, args):
        '''
        :param args: query string filled with parameters
        :return: initialized crawler
        '''

        if 'username' in args:
            self.crawler.username = request.args.get('username')
            if self.crawler.username is None:
                return {'username': '******'}
        else:
            return {'username': '******'}

        # TODO
        if 'pwd' in args:
            self.crawler.password = request.args.get('pwd')

        if 'method' in args:
            self.crawler.crawler_method = request.args.get('method')
            if self.crawler.crawler_method is None:
                return {
                    'method': 'Method {0} does not exist.'.format(
                        request.args.get('method'))
                }
        else:
            return {'method': 'Not defined'}

        if 'limit' in args:
            self.crawler.limit_mode = request.args.get('limit')
            if self.crawler.limit_mode is None:
                return {'limit': 'Limit is not an integer.'}

        # Not all browsers are supported
        if 'browser' in args:
            self.crawler.browser = request.args.get('browser')
            if self.crawler.browser is None:
                return {'browser': 'Browser has not been defined.'}

        if 'headless' in args:
            self.crawler.headless_mode = request.args.get('headless')
            if self.crawler.headless_mode is None:
                return {
                    'headless':
                    'Headless mode must be 0 or 1, or y or n.'
                }

    def fetch(self):
        '''
        :return: crawled Instagram account
        '''
        result, respond = self.crawler.crawl()

        return result, respond
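
A minimal sketch of how this API class might be mounted on a Flask route (the route, app setup and JSON shape are assumptions; parse returns an error dict on failure and None on success):

from flask import Flask, jsonify, request

app = Flask(__name__)
api = API()

@app.route('/crawl')
def crawl():
    # parse() returns an error dict when a parameter is missing or invalid.
    error = api.parse(request.args)
    if error is not None:
        return jsonify(error)
    result, respond = api.fetch()
    return jsonify({'result': result, 'respond': respond})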