def map_file_apply(file_list, apply_func, apply_func_args):
    """
    Main controller for the crawl job; applies the provided function and args to each file.

    :param file_list: list of files that will be operated on by @apply_func
    :param apply_func: function to be applied to each file
    :param apply_func_args: dict of keyword args needed for @apply_func
    :return: DataFrame aggregated from crawler.query_return, or None if nothing was collected
    """
    crawler = Crawler()
    crawler.query_return = []
    vapply_func = np.vectorize(apply_func)
    vapply_func(file_list, crawler, **apply_func_args)
    if not crawler.query_return:
        return None
    return pd.concat(crawler.query_return)
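A minimal sketch of an apply_func that fits this contract: it accepts (file_path, crawler, **kwargs) and appends a DataFrame to crawler.query_return so map_file_apply can concatenate the results. The read_csv_rows name, the source_file column, and the encoding kwarg are illustrative assumptions, not part of the original code.

import pandas as pd

def read_csv_rows(file_path, crawler, encoding="utf-8"):
    """Read one file and stash its rows on the shared crawler object."""
    frame = pd.read_csv(file_path, encoding=encoding)
    frame["source_file"] = file_path  # hypothetical provenance column
    crawler.query_return.append(frame)

# Usage sketch:
# combined = map_file_apply(["a.csv", "b.csv"], read_csv_rows, {"encoding": "utf-8"})
#
# Caveat: np.vectorize (without otypes/cache) calls the function an extra time on the
# first element to infer the output type, so side-effect-based collection like this
# can append the first file's result twice.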
def test_connect(self, mock_sftp, mock_transport):
    mock_creds = {
        'host': 'host',
        'user': '******',
        'password': '******',
        'port': '22'
    }
    crawl = Crawler(mock_creds)
    crawl.transport.connect.assert_called_with(username='******', password='******')
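The mock_sftp and mock_transport parameters suggest this test is wrapped in unittest.mock.patch decorators that are not part of the excerpt. A sketch of the assumed setup; the paramiko patch targets are a guess based on the SFTP-style credentials, not confirmed by the original code.

from unittest import TestCase, mock

class TestCrawler(TestCase):

    @mock.patch("paramiko.Transport")   # outermost decorator -> last mock argument
    @mock.patch("paramiko.SFTPClient")  # innermost decorator -> first mock argument
    def test_connect(self, mock_sftp, mock_transport):
        ...  # test body as above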
def main():
    base_url = "https://t.me/BonOgood"
    depth = 2
    userbot = Client("userbot")
    messages_filter = InputMessagesFilterUrl()
    worker_arguments = {"userbot": userbot, "messages_filter": messages_filter}
    userbot.start()
    mongo_storage = Storage(
        base_url=base_url,
        db_name="crawlerDB",
        username="******",
        password="******",
        max_depth=depth,
    )
    crawler = Crawler(
        base_url=base_url,
        depth=depth,
        storage=mongo_storage,
        worker_class=Worker,
        workers_number=1,
        **worker_arguments,
    )
    crawler.create_workers()
    crawler.run_workers()
def main():
    base_url = "en.wikipedia.org/wiki/Main_Page"
    depth = 2
    sqlite_storage = Storage(db_name="storage.db", base_url=base_url, max_depth=depth)
    crawler = Crawler(
        base_url=base_url,
        depth=depth,
        storage=sqlite_storage,
        worker_class=Worker,
        workers_number=2,
    )
    crawler.create_workers()
    crawler.run_workers()
    crawler.idle()
def main():
    crawler = Crawler(
        base_url='kiryat4.org.il',
        db_name='crawlerDB',
        depth=5,
        storage_class=Storage,
        worker_class=Worker,
        workers_number=5,
        username='******',
        password='******',
    )
    crawler.create_workers()
    crawler.run_workers()
def __init__(self):
    self.crawler = Crawler()
class API:

    def __init__(self):
        self.crawler = Crawler()

    def parse(self, args):
        '''
        :param args: query string filled with parameters
        :return: None if all parameters are valid; otherwise a dict describing the invalid parameter
        '''
        if 'username' in args:
            self.crawler.username = request.args.get('username')
            if self.crawler.username is None:
                return {'username': '******'}
            else:
                return {'username': '******'}  # TODO
        if 'pwd' in args:
            self.crawler.password = request.args.get('pwd')
        if 'method' in args:
            self.crawler.crawler_method = request.args.get('method')
            if self.crawler.crawler_method is None:
                return {
                    'method': '{0} {1} {2}'.format(
                        'Method', request.args.get('method'), 'does not exist.'
                    )
                }
            else:
                return {'method': 'Not defined'}
        if 'limit' in args:
            self.crawler.limit_mode = request.args.get('limit')
            if self.crawler.limit_mode is None:
                return {'limit': 'Limit is not integer type.'}
        # Not all browsers are supported yet
        if 'browser' in args:
            self.crawler.browser = request.args.get('browser')
            if self.crawler.browser is None:
                return {'browser': 'Browser has not been defined.'}
        if 'headless' in args:
            self.crawler.headless_mode = request.args.get('headless')
            if self.crawler.headless_mode is None:
                return {
                    'headless': 'Headless mode should be an integer (0 or 1) or a flag (y or n).'
                }

    def fetch(self):
        '''
        :return: crawled Instagram account
        '''
        result, respond = self.crawler.crawl()
        return result, respond
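A minimal wiring sketch for the API class, assuming it sits behind a Flask view (parse already reads flask.request). The /crawl route and the JSON response shape are assumptions, not part of the original code.

from flask import Flask, jsonify, request

app = Flask(__name__)
api = API()

@app.route("/crawl")
def crawl():
    error = api.parse(request.args)  # parse returns a dict only when a parameter is invalid
    if error is not None:
        return jsonify(error), 400
    result, respond = api.fetch()
    return jsonify({"result": result, "respond": respond})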