Example #1
import numpy as np
import pandas as pd

# Crawler, used below, is assumed to come from the surrounding project.


def map_file_apply(file_list, apply_func, apply_func_args):
    """ Main controller for the crawl job, applies the provided func and args to each file

    :param file_list: list of files that will be operated on by @apply_func
    :param apply_func: function to be applied to each file
    :param apply_func_args: dict of keyword args needed for @apply_func
    :return: DataFrame aggregated in crawler.query_return
    """

    crawler = Crawler()
    crawler.query_return = []

    vapply_func = np.vectorize(apply_func)
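    # np.vectorize is a convenience loop, not true vectorization: apply_func is
    # called once per file, with crawler and the keyword args forwarded each time.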
    vapply_func(file_list, crawler, **apply_func_args)

    if not crawler.query_return:
        return
    else:
        return pd.concat(crawler.query_return)
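
A minimal usage sketch, assuming a hypothetical read_csv_file helper whose job
is to append one DataFrame per file to crawler.query_return:

def read_csv_file(path, crawler, sep=','):
    # Hypothetical per-file worker: parse the file and stash the result.
    crawler.query_return.append(pd.read_csv(path, sep=sep))

combined = map_file_apply(['a.csv', 'b.csv'], read_csv_file, {'sep': ','})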
Example #2
    def test_connect(self, mock_sftp, mock_transport):
        """Crawler should open its transport with the supplied credentials.

        mock_sftp and mock_transport are presumably injected by mock.patch
        decorators not shown in this snippet.
        """
        mock_creds = {
            'host': 'host',
            'user': '******',
            'password': '******',
            'port': '22'
        }

        crawl = Crawler(mock_creds)
        crawl.transport.connect.assert_called_with(username='******', password='******')
Example #3
from pyrogram import Client

# InputMessagesFilterUrl ships with pyrogram (its import path varies by
# version); Storage, Crawler and Worker are assumed to be project-local.


def main():
    base_url = "https://t.me/BonOgood"
    depth = 2
    userbot = Client("userbot")
    messages_filter = InputMessagesFilterUrl()

    worker_arguments = {"userbot": userbot, "messages_filter": messages_filter}

    userbot.start()  # connect and authorize the Telegram session

    mongo_storage = Storage(
        base_url=base_url,
        db_name="crawlerDB",
        username="******",
        password="******",
        max_depth=depth,
    )

    crawler = Crawler(base_url=base_url,
                      depth=depth,
                      storage=mongo_storage,
                      worker_class=Worker,
                      workers_number=1,
                      **worker_arguments)
    crawler.create_workers()
    crawler.run_workers()
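    # When the crawl finishes, the session could presumably be shut down with
    # userbot.stop() (standard pyrogram teardown, not part of the original snippet).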
Example #4
def main():
    base_url = "en.wikipedia.org/wiki/Main_Page"
    depth = 2

    sqlite_storage = Storage(db_name="storage.db", base_url=base_url, max_depth=depth)
    crawler = Crawler(
        base_url=base_url,
        depth=depth,
        storage=sqlite_storage,
        worker_class=Worker,
        workers_number=2,
    )
    crawler.create_workers()
    crawler.run_workers()
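    # idle() presumably blocks the main thread until the workers finish
    # (project-specific API, not shown here).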
    crawler.idle()
Example #5
def main():
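    # Unlike Examples #3 and #4, this variant hands the Storage class itself to
    # Crawler (storage_class=Storage), which presumably builds the backend internally.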
    crawler = Crawler(base_url='kiryat4.org.il',
                      db_name='crawlerDB',
                      depth=5,
                      storage_class=Storage,
                      worker_class=Worker,
                      workers_number=5,
                      username='******',
                      password="******")
    crawler.create_workers()
    crawler.run_workers()
Example #6
    def __init__(self):

        self.crawler = Crawler()
Example #7
from flask import request  # 'request' is assumed to be Flask's request proxy


class API:
    def __init__(self):
        self.crawler = Crawler()

    def parse(self, args):
        '''
        :param args: query string filled with parameters
        :return: an error dict for the first invalid parameter, or None when
                 every supplied parameter is accepted
        '''

        if 'username' in args:
            self.crawler.username = request.args.get('username')

            if self.crawler.username is None:
                return {'username': '******'}

        else:
            return {'username': '******'}

        # TODO
        if 'pwd' in args:
            self.crawler.password = request.args.get('pwd')

        if 'method' in args:
            self.crawler.crawler_method = request.args.get('method')

            if self.crawler.crawler_method is None:
                return {
                    'method': 'Method {0} does not exist.'.format(
                        request.args.get('method'))
                }

        else:
            return {'method': 'Not defined'}

        if 'limit' in args:
            self.crawler.limit_mode = request.args.get('limit')

            if self.crawler.limit_mode is None:
                return {'limit': 'Limit is not an integer.'}

        # Not every browser is supported yet
        if 'browser' in args:
            self.crawler.browser = request.args.get('browser')

            if self.crawler.browser is None:
                return {'browser': 'Browser has not been defined.'}

        if 'headless' in args:
            self.crawler.headless_mode = request.args.get('headless')

            if self.crawler.headless_mode is None:
                return {
                    'headless':
                    'Headless mode must be an integer (0 or 1) or a flag (y or n).'
                }

    def fetch(self):
        '''
        :return: result and response of crawling the configured Instagram account
        '''
        return self.crawler.crawl()
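
A minimal sketch of how this class might be wired into a Flask app (the route,
app object and JSON handling are assumptions, not part of the original):

from flask import Flask, jsonify, request

app = Flask(__name__)
api = API()

@app.route('/crawl')
def crawl():
    # parse() returns an error dict on bad input and None on success
    error = api.parse(request.args)
    if error is not None:
        return jsonify(error), 400
    result, respond = api.fetch()
    return jsonify(result)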