def phantomjs(ctx, phantomjs_path, port):
    """
    Run phantomjs fetcher if phantomjs is installed.

    Spawns the bundled ``fetcher/phantomjs_fetcher.js`` as a subprocess
    listening on *port*.  Returns None when the phantomjs binary cannot be
    launched.  In testing mode returns an ObjectDict handle exposing ``port``
    and a ``quit`` callback; otherwise blocks until the subprocess exits.
    """
    import subprocess
    g = ctx.obj
    phantomjs_fetcher = os.path.join(
        os.path.dirname(pyspider.__file__), 'fetcher/phantomjs_fetcher.js')
    try:
        _phantomjs = subprocess.Popen(
            [phantomjs_path, phantomjs_fetcher, str(port)])
    except OSError:
        # BUGFIX: was a silent `return None`; log why the component is
        # missing, consistent with the other run-helpers in this file.
        logging.warning('phantomjs not found, continue running without it.')
        return None

    def quit(*args, **kwargs):
        # Terminate the subprocess and reap it so no zombie is left behind.
        _phantomjs.kill()
        _phantomjs.wait()
        # BUGFIX: typo "existed" -> "exited".
        logging.info('phantomjs exited.')

    phantomjs = utils.ObjectDict(port=port, quit=quit)
    g.instances.append(phantomjs)
    if g.get('testing_mode'):
        return phantomjs

    # Block the calling thread until phantomjs terminates.
    _phantomjs.wait()
def setUpClass(self):
    """Boot a pyspider webui in testing mode and prepare webdav clients."""
    import easywebdav

    # Start from a clean scratch directory for the sqlite databases.
    shutil.rmtree('./data/tests', ignore_errors=True)
    os.makedirs('./data/tests')

    # Build and invoke the root `cli` command with sqlite-backed databases.
    cli_ctx = run.cli.make_context('test', [
        '--taskdb', 'sqlite+taskdb:///data/tests/task.db',
        '--projectdb', 'sqlite+projectdb:///data/tests/projectdb.db',
        '--resultdb', 'sqlite+resultdb:///data/tests/resultdb.db',
    ], None, obj=utils.ObjectDict(testing_mode=True))
    self.ctx = run.cli.invoke(cli_ctx)

    # Build and invoke the `webui` subcommand on top of the cli context.
    webui_ctx = run.webui.make_context('webui', [
        '--username', 'binux',
        '--password', '4321',
    ], self.ctx)
    self.app = run.webui.invoke(webui_ctx)

    # Serve the app from a background thread and give it time to come up.
    self.app_thread = utils.run_in_thread(self.app.run)
    time.sleep(5)

    # Anonymous and authenticated webdav clients against the running server.
    self.webdav = easywebdav.connect('localhost', port=5000, path='dav')
    self.webdav_up = easywebdav.connect(
        'localhost', port=5000, path='dav',
        username='******', password='******')
def puppeteer(ctx, port, auto_restart, args):
    """Start the node-based puppeteer fetcher subprocess and supervise it."""
    import subprocess

    g = ctx.obj
    stopped = []
    fetcher_script = os.path.join(
        os.path.dirname(pyspider.__file__), 'fetcher/puppeteer_fetcher.js')
    command = ['node', fetcher_script, str(port)]

    try:
        proc = subprocess.Popen(command)
    except OSError:
        logging.warning('puppeteer not found, continue running without it.')
        return None

    def quit(*args, **kwargs):
        # Flag the supervisor loop as done, then terminate and reap the child.
        stopped.append(1)
        proc.kill()
        proc.wait()
        logging.info('puppeteer exited.')

    # Advertise this fetcher unless a proxy address was configured already.
    if not g.get('puppeteer_proxy'):
        g['puppeteer_proxy'] = '127.0.0.1:%s' % port

    handle = utils.ObjectDict(port=port, quit=quit)
    g.instances.append(handle)
    if g.get('testing_mode'):
        return handle

    # Supervise: respawn the child whenever it dies, until quit() is called
    # or auto-restart is disabled.
    while True:
        proc.wait()
        if stopped or not auto_restart:
            return
        proc = subprocess.Popen(command)
def phantomjs(ctx, phantomjs_path, port, auto_restart, args):
    """
    Run phantomjs fetcher if phantomjs is installed.

    Spawns the bundled fetcher script under phantomjs and, unless in testing
    mode, supervises it (restarting on exit when auto_restart is set).
    """
    # Fall back to the click default map for extra phantomjs arguments.
    if not args and ctx.default_map:
        args = ctx.default_map.get('args', [])

    import subprocess

    g = ctx.obj
    stopped = []
    fetcher_script = os.path.join(
        os.path.dirname(pyspider.__file__), 'fetcher/phantomjs_fetcher.js')
    # Command-line options: http://phantomjs.org/api/command-line.html
    # ('--load-images=false' deliberately not passed; it may leak memory,
    # see https://github.com/ariya/phantomjs/issues/12903)
    command = [phantomjs_path, '--ssl-protocol=any', '--disk-cache=true']
    command += list(args or [])
    command += [fetcher_script, str(port)]

    try:
        proc = subprocess.Popen(command)
    except OSError:
        logging.warning('phantomjs not found, continue running without it.')
        return None

    def quit(*args, **kwargs):
        # Flag the supervisor loop as done, then terminate and reap the child.
        stopped.append(1)
        proc.kill()
        proc.wait()
        logging.info('phantomjs exited.')

    # Advertise this fetcher unless a proxy address was configured already.
    if not g.get('phantomjs_proxy'):
        g['phantomjs_proxy'] = '127.0.0.1:%s' % port

    handle = utils.ObjectDict(port=port, quit=quit)
    g.instances.append(handle)
    if g.get('testing_mode'):
        return handle

    # Supervise: respawn the child whenever it dies, until quit() is called
    # or auto-restart is disabled.
    while True:
        proc.wait()
        if stopped or not auto_restart:
            return
        proc = subprocess.Popen(command)
def phantomjs(ctx, phantomjs_path, port, auto_restart):
    """
    Run phantomjs fetcher if phantomjs is installed.

    Spawns the bundled ``fetcher/phantomjs_fetcher.js`` under phantomjs on
    *port* and publishes ``phantomjs_proxy`` on ``ctx.obj`` when unset.
    In testing mode returns an ObjectDict handle with a ``quit`` callback;
    otherwise supervises the subprocess, restarting it on exit while
    *auto_restart* is true.  Returns None when phantomjs cannot be launched.
    """
    import subprocess
    g = ctx.obj
    _quit = []
    phantomjs_fetcher = os.path.join(
        os.path.dirname(pyspider.__file__), 'fetcher/phantomjs_fetcher.js')
    cmd = [phantomjs_path,
           '--ssl-protocol=any',
           '--disk-cache=true',
           # this may cause memory leak: https://github.com/ariya/phantomjs/issues/12903
           #'--load-images=false',
           phantomjs_fetcher, str(port)]
    try:
        _phantomjs = subprocess.Popen(cmd)
    except OSError:
        # BUGFIX: was a silent `return None`; log why the component is
        # missing, consistent with the other run-helpers in this file.
        logging.warning('phantomjs not found, continue running without it.')
        return None

    def quit(*args, **kwargs):
        # Stop the supervisor loop, then terminate and reap the subprocess.
        _quit.append(1)
        _phantomjs.kill()
        _phantomjs.wait()
        # BUGFIX: typo "existed" -> "exited".
        logging.info('phantomjs exited.')

    if not g.get('phantomjs_proxy'):
        g['phantomjs_proxy'] = 'localhost:%s' % port

    phantomjs = utils.ObjectDict(port=port, quit=quit)
    g.instances.append(phantomjs)
    if g.get('testing_mode'):
        return phantomjs

    # Supervise: restart phantomjs whenever it dies, until quit() is called
    # or auto-restart is disabled.
    while True:
        _phantomjs.wait()
        if _quit or not auto_restart:
            break
        _phantomjs = subprocess.Popen(cmd)
def chromium(ctx, nodejs_path, port, auto_restart, args):
    """
    Run chromium fetcher if nodejs and puppeteer is installed.

    Launches the bundled fetcher script under nodejs and, unless in testing
    mode, supervises it (restarting on exit when auto_restart is set).
    """
    # Fall back to the click default map for extra nodejs arguments.
    if not args and ctx.default_map:
        args = ctx.default_map.get('args', [])

    import subprocess

    g = ctx.obj
    stopped = []
    fetcher_script = os.path.join(
        os.path.dirname(pyspider.__file__), 'fetcher/chromium_fetcher.js')
    command = [nodejs_path] + list(args or []) + [fetcher_script, str(port)]

    try:
        proc = subprocess.Popen(command)
    except OSError:
        logging.warning('nodejs not found, continue running without it.')
        return None

    def quit(*args, **kwargs):
        # Flag the supervisor loop as done, then terminate and reap the child.
        stopped.append(1)
        proc.kill()
        proc.wait()
        logging.info('chromium exited.')

    # Advertise this fetcher unless a proxy address was configured already.
    if not g.get('chromium_proxy'):
        g['chromium_proxy'] = '127.0.0.1:%s' % port

    handle = utils.ObjectDict(port=port, quit=quit)
    g.instances.append(handle)
    if g.get('testing_mode'):
        return handle

    # Supervise: respawn the child whenever it dies, until quit() is called
    # or auto-restart is disabled.
    while True:
        proc.wait()
        if stopped or not auto_restart:
            return
        proc = subprocess.Popen(command)
def phantomjs(ctx, phantomjs_path, port, auto_restart, args):
    """Run the phantomjs fetcher subprocess if phantomjs is installed."""
    # Fall back to the click default map for extra phantomjs arguments.
    if not args and ctx.default_map:
        args = ctx.default_map.get('args', [])

    import subprocess

    g = ctx.obj
    stopped = []
    fetcher_script = os.path.join(
        os.path.dirname(pyspider.__file__), 'fetcher/phantomjs_fetcher.js')
    command = [phantomjs_path, '--ssl-protocol=any', '--disk-cache=true']
    command += list(args or [])
    command += [fetcher_script, str(port)]

    try:
        proc = subprocess.Popen(command)
    except OSError:
        logging.warning('phantomjs not found, continue running without it.')
        return None

    def quit(*args, **kwargs):
        # Flag the supervisor loop as done, then terminate and reap the child.
        stopped.append(1)
        proc.kill()
        proc.wait()
        logging.info('phantomjs exited.')

    # Advertise this fetcher unless a proxy address was configured already.
    if not g.get('phantomjs_proxy'):
        g['phantomjs_proxy'] = '127.0.0.1:%s' % port

    handle = utils.ObjectDict(port=port, quit=quit)
    g.instances.append(handle)
    if g.get('testing_mode'):
        return handle

    # Supervise: respawn the child whenever it dies, until quit() is called
    # or auto-restart is disabled.
    while True:
        proc.wait()
        if stopped or not auto_restart:
            return
        proc = subprocess.Popen(command)
def cli(ctx, **kwargs):
    """
    A powerful spider system in python.

    Root click command group: resolves databases, message queues and fetcher
    proxies from CLI options and docker-link environment variables, stores
    everything on ``ctx.obj`` for subcommands, and invokes ``all`` when run
    without a subcommand (unless in testing mode).
    """
    if kwargs['add_sys_path']:
        # allow modules in the current working directory to be imported
        sys.path.append(os.getcwd())

    logging.config.fileConfig(kwargs['logging_config'])

    # get db from env
    for db in ('taskdb', 'projectdb', 'resultdb'):
        if kwargs[db] is not None:
            continue
        # NOTE: the *_PORT_*_TCP_* variables follow the docker link convention
        if os.environ.get('MYSQL_NAME'):
            # utils.Get defers the connect until first attribute access;
            # `db=db` binds the loop variable at lambda definition time
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'sqlalchemy+mysql+%s://%s:%s/%s' % (
                    db, os.environ['MYSQL_PORT_3306_TCP_ADDR'],
                    os.environ['MYSQL_PORT_3306_TCP_PORT'], db)))
        elif os.environ.get('MONGODB_NAME'):
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'mongodb+%s://%s:%s/%s' % (
                    db, os.environ['MONGODB_PORT_27017_TCP_ADDR'],
                    os.environ['MONGODB_PORT_27017_TCP_PORT'], db)))
        elif ctx.invoked_subcommand == 'bench':
            # benchmark mode: use throw-away databases under data_path/bench
            if kwargs['data_path'] == './data':
                kwargs['data_path'] += '/bench'
                shutil.rmtree(kwargs['data_path'], ignore_errors=True)
                os.mkdir(kwargs['data_path'])
            if db in ('taskdb', 'resultdb'):
                # in-memory sqlite for task/result storage
                kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s://' % (db)))
            elif db in ('projectdb', ):
                # projectdb backed by the bundled bench project file
                kwargs[db] = utils.Get(lambda db=db: connect_database('local+%s://%s' % (
                    db, os.path.join(os.path.dirname(__file__), 'libs/bench.py'))))
        else:
            # default: per-db sqlite files under data_path (e.g. task.db)
            if not os.path.exists(kwargs['data_path']):
                os.mkdir(kwargs['data_path'])
            kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % (
                db, kwargs['data_path'], db[:-2])))
            # remember that this database was defaulted
            kwargs['is_%s_default' % db] = True

    # create folder for counter.dump
    if not os.path.exists(kwargs['data_path']):
        os.mkdir(kwargs['data_path'])

    # message queue, compatible with old version
    if kwargs.get('message_queue'):
        pass
    elif kwargs.get('amqp_url'):
        kwargs['message_queue'] = kwargs['amqp_url']
    elif os.environ.get('RABBITMQ_NAME'):
        kwargs['message_queue'] = ("amqp://*****:*****@%(RABBITMQ_PORT_5672_TCP_ADDR)s"
                                   ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ)
    elif kwargs.get('beanstalk'):
        kwargs['message_queue'] = "beanstalk://%s/" % kwargs['beanstalk']

    for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                 'fetcher2processor', 'processor2result'):
        if kwargs.get('message_queue'):
            # remote queue: connect lazily on first use
            kwargs[name] = utils.Get(lambda name=name: connect_message_queue(
                name, kwargs.get('message_queue'), kwargs['queue_maxsize']))
        else:
            # local in-process queue: create eagerly
            kwargs[name] = connect_message_queue(name, kwargs.get('message_queue'),
                                                 kwargs['queue_maxsize'])

    # phantomjs-proxy
    if kwargs.get('phantomjs_proxy'):
        pass
    elif os.environ.get('PHANTOMJS_NAME'):
        # strip the "tcp://" scheme from the docker link address
        kwargs['phantomjs_proxy'] = os.environ['PHANTOMJS_PORT_25555_TCP'][len('tcp://'):]

    # puppeteer-proxy
    if kwargs.get('puppeteer_proxy'):
        pass
    elif os.environ.get('PUPPETEER_NAME'):
        kwargs['puppeteer_proxy'] = os.environ['PUPPETEER_PORT_22222_TCP'][len('tcp://'):]

    ctx.obj = utils.ObjectDict(ctx.obj or {})
    ctx.obj['instances'] = []  # components started by subcommands register here
    ctx.obj.update(kwargs)

    if ctx.invoked_subcommand is None and not ctx.obj.get('testing_mode'):
        # no subcommand given: launch every component (`all` is the sibling
        # click command of that name, not the builtin)
        ctx.invoke(all)
    return ctx
def cli(ctx, **kwargs):
    """
    A powerful spider system in python.

    Root click command group (older variant): resolves databases and queues
    from CLI options and docker-link environment variables, stores the
    result on ``ctx.obj`` for subcommands, and invokes ``all`` when run
    without a subcommand (unless in testing mode).
    """
    logging.config.fileConfig(kwargs['logging_config'])

    # get db from env
    for db in ('taskdb', 'projectdb', 'resultdb'):
        if kwargs[db] is not None:
            continue
        # NOTE: the *_PORT_*_TCP_* variables follow the docker link convention
        if os.environ.get('MYSQL_NAME'):
            # utils.Get defers the connect until first attribute access;
            # `db=db` binds the loop variable at lambda definition time
            kwargs[db] = utils.Get(
                lambda db=db: connect_database('mysql+%s://%s:%s/%s' % (
                    db, os.environ['MYSQL_PORT_3306_TCP_ADDR'], os.environ[
                        'MYSQL_PORT_3306_TCP_PORT'], db)))
        elif os.environ.get('MONGODB_NAME'):
            kwargs[db] = utils.Get(
                lambda db=db: connect_database('mongodb+%s://%s:%s/%s' % (
                    db, os.environ['MONGODB_PORT_27017_TCP_ADDR'], os.environ[
                        'MONGODB_PORT_27017_TCP_PORT'], db)))
        elif ctx.invoked_subcommand == 'bench':
            # benchmark mode: use throw-away databases under data_path/bench
            if kwargs['data_path'] == './data':
                kwargs['data_path'] += '/bench'
                shutil.rmtree(kwargs['data_path'], ignore_errors=True)
                os.mkdir(kwargs['data_path'])
            if db in ('taskdb', 'resultdb'):
                # in-memory sqlite for task/result storage
                kwargs[db] = utils.Get(
                    lambda db=db: connect_database('sqlite+%s://' % (db)))
            else:
                kwargs[db] = utils.Get(
                    lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % (
                        db, kwargs['data_path'], db[:-2])))
        else:
            # default: per-db sqlite files under data_path (e.g. task.db)
            if not os.path.exists(kwargs['data_path']):
                os.mkdir(kwargs['data_path'])
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'sqlite+%s:///%s/%s.db' % (db, kwargs['data_path'], db[:-2])))
            # remember that this database was defaulted
            kwargs['is_%s_default' % db] = True

    # create folder for counter.dump
    if not os.path.exists(kwargs['data_path']):
        os.mkdir(kwargs['data_path'])

    # queue
    if kwargs.get('amqp_url'):
        from pyspider.libs.rabbitmq import Queue
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            # lazily created rabbitmq-backed queues
            kwargs[name] = utils.Get(
                lambda name=name: Queue(name, amqp_url=kwargs['amqp_url'],
                                        maxsize=kwargs['queue_maxsize']))
    elif os.environ.get('RABBITMQ_NAME'):
        from pyspider.libs.rabbitmq import Queue
        # build the amqp url from the docker link environment
        amqp_url = ("amqp://*****:*****@%(RABBITMQ_PORT_5672_TCP_ADDR)s"
                    ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ)
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = utils.Get(lambda name=name: Queue(
                name, amqp_url=amqp_url, maxsize=kwargs['queue_maxsize']))
    elif kwargs.get('beanstalk'):
        from pyspider.libs.beanstalk import Queue
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = utils.Get(
                lambda name=name: Queue(name, host=kwargs.get('beanstalk'),
                                        maxsize=kwargs['queue_maxsize']))
    else:
        # no broker configured: plain in-process multiprocessing queues
        from multiprocessing import Queue
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = Queue(kwargs['queue_maxsize'])

    # phantomjs-proxy
    if kwargs.get('phantomjs_proxy'):
        pass
    elif os.environ.get('PHANTOMJS_NAME'):
        # strip the "tcp://" scheme from the docker link address
        kwargs['phantomjs_proxy'] = os.environ['PHANTOMJS_PORT_25555_TCP'][
            len('tcp://'):]

    ctx.obj = utils.ObjectDict(ctx.obj or {})
    ctx.obj['instances'] = []  # components started by subcommands register here
    ctx.obj.update(kwargs)

    if ctx.invoked_subcommand is None and not ctx.obj.get('testing_mode'):
        # no subcommand given: launch every component (`all` is the sibling
        # click command of that name, not the builtin)
        ctx.invoke(all)
    return ctx
def phantomjs(ctx, phantomjs_path, ports, auto_restart, args):
    """
    Run a pool of phantomjs fetchers, one per port, if phantomjs is installed.

    *ports* is a string "start,end"; one fetcher subprocess is spawned for
    every port in the inclusive range.  Unless already configured,
    ``g['phantomjs_proxy']`` becomes a comma-separated list of
    "127.0.0.1:<port>" endpoints.  Returns None if phantomjs cannot be
    launched; in testing mode returns an ObjectDict handle with a ``quit``
    callback, otherwise supervises the pool until quit/exit.
    """
    args = args or ctx.default_map and ctx.default_map.get('args', [])

    # Expand the "start,end" specification into the inclusive port list.
    portsarea = ports.split(',')
    ports = list(range(int(portsarea[0]), int(portsarea[1]) + 1))

    import subprocess
    g = ctx.obj
    _quit = []
    phantomjs_fetcher = os.path.join(
        os.path.dirname(pyspider.__file__), 'fetcher/phantomjs_fetcher.js')

    # Extra command-line arguments forwarded to phantomjs, if any.
    list_arges = []
    if len(args) != 0:
        list_arges.append(args.encode('utf-8'))

    def _make_cmd(port):
        # Command-line options: http://phantomjs.org/api/command-line.html
        return [phantomjs_path,
                # this may cause memory leak: https://github.com/ariya/phantomjs/issues/12903
                '--load-images=false',
                '--ssl-protocol=any',
                '--disk-cache=true'] + list_arges + [phantomjs_fetcher, str(port)]

    _phantomjs = {}
    for port in ports:
        cmd = _make_cmd(port)
        try:
            _phantomjs[str(port)] = subprocess.Popen(cmd)
        except OSError:
            logging.warning('phantomjs not found, continue running without it. '
                            'and cmd is ' + str(cmd))
            return None

    def quit(*args, **kwargs):
        _quit.append(1)
        for key in _phantomjs.keys():
            # BUGFIX: was `phantomjs[key]` (the ObjectDict handle), which
            # raised at shutdown; kill/reap the actual subprocess handles.
            _phantomjs[key].kill()
            _phantomjs[key].wait()
        logging.info('phantomjs exited.')

    if not g.get('phantomjs_proxy'):
        # BUGFIX: the first port used to be appended twice; list each once.
        g['phantomjs_proxy'] = ','.join('127.0.0.1:%s' % p for p in ports)

    phantomjs = utils.ObjectDict(port=ports, quit=quit)
    g.instances.append(phantomjs)
    if g.get('testing_mode'):
        return phantomjs

    while True:
        # Wait for every fetcher to terminate before deciding what to do.
        for key in _phantomjs.keys():
            _phantomjs[key].wait()
        if _quit or not auto_restart:
            break
        # BUGFIX: respawn every fetcher (the old code only restarted the
        # last dict key leaked out of the for-loop).
        for key in _phantomjs.keys():
            _phantomjs[key] = subprocess.Popen(_make_cmd(key))