Ejemplo n.º 1
0
def phantomjs(ctx, phantomjs_path, port):
    """
    Run phantomjs fetcher if phantomjs is installed.
    """
    # NOTE(review): this looks like a click command, whose docstring is shown
    # as --help text, so the extended documentation lives in comments.
    #
    # Starts `phantomjs_path` on the bundled fetcher/phantomjs_fetcher.js
    # script, passing `port` as the script's only argument.
    #
    # Returns None when the executable cannot be started, the
    # ObjectDict(port=..., quit=...) handle when ctx.obj has `testing_mode`
    # set, and otherwise blocks until the child process exits.
    import subprocess
    g = ctx.obj

    phantomjs_fetcher = os.path.join(os.path.dirname(pyspider.__file__),
                                     'fetcher/phantomjs_fetcher.js')
    try:
        _phantomjs = subprocess.Popen(
            [phantomjs_path, phantomjs_fetcher,
             str(port)])
    except OSError:
        # Still best effort (the caller keeps running without phantomjs), but
        # say so instead of failing silently.
        logging.warning('phantomjs not found, continue running without it.')
        return None

    def quit(*args, **kwargs):
        # Kill the child and reap it so no zombie process is left behind.
        _phantomjs.kill()
        _phantomjs.wait()
        logging.info('phantomjs exited.')

    phantomjs = utils.ObjectDict(port=port, quit=quit)
    # Register the handle on the shared context object's `instances` list.
    g.instances.append(phantomjs)
    if g.get('testing_mode'):
        return phantomjs

    _phantomjs.wait()
Ejemplo n.º 2
0
    def setUpClass(self):
        """Boot a webui on fresh sqlite test databases and open two WebDAV clients."""
        import easywebdav

        # Start from an empty ./data/tests directory.
        data_dir = './data/tests'
        shutil.rmtree(data_dir, ignore_errors=True)
        os.makedirs(data_dir)

        # Build and invoke the top-level command in testing mode.
        cli_args = [
            '--taskdb',
            'sqlite+taskdb:///data/tests/task.db',
            '--projectdb',
            'sqlite+projectdb:///data/tests/projectdb.db',
            '--resultdb',
            'sqlite+resultdb:///data/tests/resultdb.db',
        ]
        cli_ctx = run.cli.make_context(
            'test', cli_args, None, obj=utils.ObjectDict(testing_mode=True))
        self.ctx = run.cli.invoke(cli_ctx)

        # Build the webui command with credentials on top of the parent context.
        webui_args = ['--username', 'binux', '--password', '4321']
        webui_ctx = run.webui.make_context('webui', webui_args, self.ctx)
        self.app = run.webui.invoke(webui_ctx)
        self.app_thread = utils.run_in_thread(self.app.run)
        # Presumably gives the server thread time to start listening.
        time.sleep(5)

        # One client without credentials, one with.
        dav_target = dict(port=5000, path='dav')
        self.webdav = easywebdav.connect('localhost', **dav_target)
        self.webdav_up = easywebdav.connect('localhost',
                                            username='******',
                                            password='******',
                                            **dav_target)
Ejemplo n.º 3
0
def puppeteer(ctx, port, auto_restart, args):
    # Run the bundled fetcher/puppeteer_fetcher.js script under `node`,
    # passing `port` as its argument.
    #
    # Returns None if `node` cannot be started, the
    # ObjectDict(port=..., quit=...) handle when ctx.obj has `testing_mode`
    # set, and otherwise blocks, relaunching the child after each exit while
    # `auto_restart` is truthy and quit() has not been called.
    # `args` is accepted but not used here.
    #
    # NOTE(review): kept as comments rather than a docstring, since a
    # docstring would presumably show up as click --help text.
    import subprocess

    shared = ctx.obj
    stop_requested = []  # becomes non-empty once quit() has been called
    script_path = os.path.join(os.path.dirname(pyspider.__file__),
                               'fetcher/puppeteer_fetcher.js')
    command = ['node', script_path, str(port)]

    try:
        _puppeteer = subprocess.Popen(command)
    except OSError:
        logging.warning('puppeteer not found, continue running without it.')
        return None

    def quit(*args, **kwargs):
        # Record the stop request first so the supervising loop below does
        # not relaunch the process being killed.
        stop_requested.append(1)
        _puppeteer.kill()
        _puppeteer.wait()
        logging.info('puppeteer exited.')

    # Publish the proxy address only if none is configured yet.
    if not shared.get('puppeteer_proxy'):
        shared['puppeteer_proxy'] = '127.0.0.1:%s' % port

    handle = utils.ObjectDict(port=port, quit=quit)
    shared.instances.append(handle)
    if shared.get('testing_mode'):
        return handle

    # Supervise: wait for the child, relaunch while restarts are wanted.
    _puppeteer.wait()
    while auto_restart and not stop_requested:
        _puppeteer = subprocess.Popen(command)
        _puppeteer.wait()
Ejemplo n.º 4
0
def phantomjs(ctx, phantomjs_path, port, auto_restart, args):
    """
    Run phantomjs fetcher if phantomjs is installed.
    """
    # Returns None if the executable cannot be started, the
    # ObjectDict(port=..., quit=...) handle when ctx.obj has `testing_mode`
    # set, and otherwise blocks, relaunching phantomjs after each exit while
    # `auto_restart` is truthy and quit() has not been called.
    import subprocess

    # Fall back to the `args` entry of the context's default map when no
    # extra arguments were passed explicitly.
    if not args:
        args = ctx.default_map and ctx.default_map.get('args', [])

    g = ctx.obj
    _quit = []  # becomes non-empty once quit() has been called
    fetcher_script = os.path.join(os.path.dirname(pyspider.__file__),
                                  'fetcher/phantomjs_fetcher.js')

    # Command-line Options: http://phantomjs.org/api/command-line.html
    # '--load-images=false' is left out on purpose, it may cause a memory
    # leak: https://github.com/ariya/phantomjs/issues/12903
    cmd = [phantomjs_path, '--ssl-protocol=any', '--disk-cache=true']
    cmd.extend(args or [])
    cmd.extend([fetcher_script, str(port)])

    try:
        _phantomjs = subprocess.Popen(cmd)
    except OSError:
        logging.warning('phantomjs not found, continue running without it.')
        return None

    def quit(*args, **kwargs):
        # Flag the shutdown before killing so the loop below does not
        # relaunch the process.
        _quit.append(1)
        _phantomjs.kill()
        _phantomjs.wait()
        logging.info('phantomjs exited.')

    # Publish the proxy address only if none is configured yet.
    if not g.get('phantomjs_proxy'):
        g['phantomjs_proxy'] = '127.0.0.1:%s' % port

    instance = utils.ObjectDict(port=port, quit=quit)
    g.instances.append(instance)
    if g.get('testing_mode'):
        return instance

    while True:
        _phantomjs.wait()
        if auto_restart and not _quit:
            _phantomjs = subprocess.Popen(cmd)
        else:
            break
Ejemplo n.º 5
0
def phantomjs(ctx, phantomjs_path, port, auto_restart):
    """
    Run phantomjs fetcher if phantomjs is installed.
    """
    # NOTE(review): this looks like a click command, whose docstring is shown
    # as --help text, so the extended documentation lives in comments.
    #
    # Returns None if the executable cannot be started, the
    # ObjectDict(port=..., quit=...) handle when ctx.obj has `testing_mode`
    # set, and otherwise blocks, relaunching phantomjs after each exit while
    # `auto_restart` is truthy and quit() has not been called.
    import subprocess
    g = ctx.obj
    _quit = []  # becomes non-empty once quit() has been called
    phantomjs_fetcher = os.path.join(os.path.dirname(pyspider.__file__),
                                     'fetcher/phantomjs_fetcher.js')
    cmd = [
        phantomjs_path,
        '--ssl-protocol=any',
        '--disk-cache=true',
        # this may cause memory leak: https://github.com/ariya/phantomjs/issues/12903
        #'--load-images=false',
        phantomjs_fetcher,
        str(port)
    ]

    try:
        _phantomjs = subprocess.Popen(cmd)
    except OSError:
        # Still best effort (the caller keeps running without phantomjs), but
        # say so instead of failing silently.
        logging.warning('phantomjs not found, continue running without it.')
        return None

    def quit(*args, **kwargs):
        # Flag the shutdown before killing so the loop below does not
        # relaunch the process.
        _quit.append(1)
        _phantomjs.kill()
        _phantomjs.wait()
        logging.info('phantomjs exited.')

    # Publish the proxy address only if none is configured yet.
    if not g.get('phantomjs_proxy'):
        g['phantomjs_proxy'] = 'localhost:%s' % port

    phantomjs = utils.ObjectDict(port=port, quit=quit)
    g.instances.append(phantomjs)
    if g.get('testing_mode'):
        return phantomjs

    while True:
        _phantomjs.wait()
        if _quit or not auto_restart:
            break
        _phantomjs = subprocess.Popen(cmd)
Ejemplo n.º 6
0
def chromium(ctx, nodejs_path, port, auto_restart, args):
    """
    Run chromium fetcher if nodejs and puppeteer is installed.
    """
    # Returns None if `nodejs_path` cannot be started, the
    # ObjectDict(port=..., quit=...) handle when ctx.obj has `testing_mode`
    # set, and otherwise blocks, relaunching the child after each exit while
    # `auto_restart` is truthy and quit() has not been called.
    import subprocess

    # Explicit args win; otherwise use the `args` entry of the default map.
    args = args or ctx.default_map and ctx.default_map.get('args', [])
    extra_args = list(args or [])

    g = ctx.obj
    _quit = []  # becomes non-empty once quit() has been called
    fetcher_dir = os.path.dirname(pyspider.__file__)
    chromium_fetcher = os.path.join(fetcher_dir, 'fetcher/chromium_fetcher.js')
    # Extra args go between the node binary and the script path.
    cmd = [nodejs_path] + extra_args + [chromium_fetcher, str(port)]

    try:
        _chromium = subprocess.Popen(cmd)
    except OSError:
        logging.warning('nodejs not found, continue running without it.')
        return None

    def quit(*args, **kwargs):
        # Flag the shutdown before killing so the loop below does not
        # relaunch the process.
        _quit.append(1)
        _chromium.kill()
        _chromium.wait()
        logging.info('chromium exited.')

    # Publish the proxy address only if none is configured yet.
    if not g.get('chromium_proxy'):
        g['chromium_proxy'] = '127.0.0.1:%s' % port

    instance = utils.ObjectDict(port=port, quit=quit)
    g.instances.append(instance)
    if g.get('testing_mode'):
        return instance

    while True:
        _chromium.wait()
        if auto_restart and not _quit:
            _chromium = subprocess.Popen(cmd)
            continue
        break
Ejemplo n.º 7
0
def phantomjs(ctx, phantomjs_path, port, auto_restart, args):
    # Run phantomjs on the bundled fetcher/phantomjs_fetcher.js script,
    # passing `port` as the script argument.
    #
    # Returns None if the executable cannot be started, the
    # ObjectDict(port=..., quit=...) handle when ctx.obj has `testing_mode`
    # set, and otherwise blocks, relaunching phantomjs after each exit while
    # `auto_restart` is truthy and quit() has not been called.
    #
    # NOTE(review): kept as comments rather than a docstring, since a
    # docstring would presumably show up as click --help text.
    import subprocess

    # Explicit args win; otherwise use the `args` entry of the default map.
    args = args or ctx.default_map and ctx.default_map.get('args', [])

    g = ctx.obj
    _quit = []  # becomes non-empty once quit() has been called
    phantomjs_fetcher = os.path.join(os.path.dirname(pyspider.__file__),
                                     'fetcher/phantomjs_fetcher.js')

    fixed_options = ['--ssl-protocol=any', '--disk-cache=true']
    script_and_port = [phantomjs_fetcher, str(port)]
    cmd = [phantomjs_path] + fixed_options + list(args or []) + script_and_port

    def launch():
        # Start one phantomjs child with the prepared command line.
        return subprocess.Popen(cmd)

    try:
        _phantomjs = launch()
    except OSError:
        logging.warning('phantomjs not found, continue running without it.')
        return None

    def quit(*args, **kwargs):
        # Flag the shutdown before killing so the loop below does not
        # relaunch the process.
        _quit.append(1)
        _phantomjs.kill()
        _phantomjs.wait()
        logging.info('phantomjs exited.')

    # Publish the proxy address only if none is configured yet.
    if not g.get('phantomjs_proxy'):
        g['phantomjs_proxy'] = '127.0.0.1:%s' % port

    handle = utils.ObjectDict(port=port, quit=quit)
    g.instances.append(handle)
    if g.get('testing_mode'):
        return handle

    _phantomjs.wait()
    while not (_quit or not auto_restart):
        _phantomjs = launch()
        _phantomjs.wait()
Ejemplo n.º 8
0
def cli(ctx, **kwargs):
    """
    A powerful spider system in python.
    """
    # NOTE(review): this looks like the click group entry point, whose
    # docstring is shown as --help text, so the extended documentation lives
    # in comments.
    #
    # Fills in database, message-queue and fetcher-proxy settings that were
    # not given explicitly (from environment variables or defaults), stores
    # everything on ctx.obj and returns ctx. When no subcommand was invoked
    # and testing_mode is off, the `all` command is invoked.

    def ensure_dir(path):
        # os.makedirs, unlike os.mkdir, also creates missing parents, so a
        # nested data path (e.g. ./data/bench without ./data) works.
        if not os.path.exists(path):
            os.makedirs(path)

    if kwargs['add_sys_path']:
        sys.path.append(os.getcwd())

    logging.config.fileConfig(kwargs['logging_config'])

    # get db from env
    for db in ('taskdb', 'projectdb', 'resultdb'):
        if kwargs[db] is not None:
            continue
        # `db=db` binds the current loop value as a lambda default; a plain
        # closure would see only the last value of `db`.
        if os.environ.get('MYSQL_NAME'):
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'sqlalchemy+mysql+%s://%s:%s/%s' % (
                    db, os.environ['MYSQL_PORT_3306_TCP_ADDR'],
                    os.environ['MYSQL_PORT_3306_TCP_PORT'], db)))
        elif os.environ.get('MONGODB_NAME'):
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'mongodb+%s://%s:%s/%s' % (
                    db, os.environ['MONGODB_PORT_27017_TCP_ADDR'],
                    os.environ['MONGODB_PORT_27017_TCP_PORT'], db)))
        elif ctx.invoked_subcommand == 'bench':
            # Benchmarks get a fresh ./data/bench unless a custom data path
            # was given.
            if kwargs['data_path'] == './data':
                kwargs['data_path'] += '/bench'
                shutil.rmtree(kwargs['data_path'], ignore_errors=True)
                ensure_dir(kwargs['data_path'])
            if db in ('taskdb', 'resultdb'):
                kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s://' % (db)))
            elif db in ('projectdb', ):
                kwargs[db] = utils.Get(lambda db=db: connect_database('local+%s://%s' % (
                    db, os.path.join(os.path.dirname(__file__), 'libs/bench.py'))))
        else:
            ensure_dir(kwargs['data_path'])
            kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % (
                db, kwargs['data_path'], db[:-2])))
            kwargs['is_%s_default' % db] = True

    # create folder for counter.dump
    ensure_dir(kwargs['data_path'])

    # message queue, compatible with old version
    if kwargs.get('message_queue'):
        pass
    elif kwargs.get('amqp_url'):
        kwargs['message_queue'] = kwargs['amqp_url']
    elif os.environ.get('RABBITMQ_NAME'):
        kwargs['message_queue'] = ("amqp://*****:*****@%(RABBITMQ_PORT_5672_TCP_ADDR)s"
                                   ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ)
    elif kwargs.get('beanstalk'):
        kwargs['message_queue'] = "beanstalk://%s/" % kwargs['beanstalk']

    for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                 'fetcher2processor', 'processor2result'):
        # With a queue URL the connection is wrapped in utils.Get; without
        # one the queue object is created right away.
        if kwargs.get('message_queue'):
            kwargs[name] = utils.Get(lambda name=name: connect_message_queue(
                name, kwargs.get('message_queue'), kwargs['queue_maxsize']))
        else:
            kwargs[name] = connect_message_queue(name, kwargs.get('message_queue'),
                                                 kwargs['queue_maxsize'])

    # phantomjs-proxy
    if kwargs.get('phantomjs_proxy'):
        pass
    elif os.environ.get('PHANTOMJS_NAME'):
        # Strip the leading 'tcp://' to keep only host:port.
        kwargs['phantomjs_proxy'] = os.environ['PHANTOMJS_PORT_25555_TCP'][len('tcp://'):]

    # puppeteer-proxy
    if kwargs.get('puppeteer_proxy'):
        pass
    elif os.environ.get('PUPPETEER_NAME'):
        kwargs['puppeteer_proxy'] = os.environ['PUPPETEER_PORT_22222_TCP'][len('tcp://'):]

    ctx.obj = utils.ObjectDict(ctx.obj or {})
    ctx.obj['instances'] = []
    ctx.obj.update(kwargs)

    if ctx.invoked_subcommand is None and not ctx.obj.get('testing_mode'):
        ctx.invoke(all)
    return ctx
Ejemplo n.º 9
0
def cli(ctx, **kwargs):
    """
    A powerful spider system in python.
    """
    # NOTE(review): this looks like the click group entry point, whose
    # docstring is shown as --help text, so the extended documentation lives
    # in comments.
    #
    # Fills in database, queue and phantomjs-proxy settings that were not
    # given explicitly (from environment variables or defaults), stores
    # everything on ctx.obj and returns ctx. When no subcommand was invoked
    # and testing_mode is off, the `all` command is invoked.

    def ensure_dir(path):
        # os.makedirs, unlike os.mkdir, also creates missing parents, so a
        # nested data path (e.g. ./data/bench without ./data) works.
        if not os.path.exists(path):
            os.makedirs(path)

    logging.config.fileConfig(kwargs['logging_config'])

    # get db from env
    for db in ('taskdb', 'projectdb', 'resultdb'):
        if kwargs[db] is not None:
            continue
        # `db=db` binds the current loop value as a lambda default; a plain
        # closure would see only the last value of `db`.
        if os.environ.get('MYSQL_NAME'):
            kwargs[db] = utils.Get(
                lambda db=db: connect_database('mysql+%s://%s:%s/%s' % (
                    db, os.environ['MYSQL_PORT_3306_TCP_ADDR'], os.environ[
                        'MYSQL_PORT_3306_TCP_PORT'], db)))
        elif os.environ.get('MONGODB_NAME'):
            kwargs[db] = utils.Get(
                lambda db=db: connect_database('mongodb+%s://%s:%s/%s' % (
                    db, os.environ['MONGODB_PORT_27017_TCP_ADDR'], os.environ[
                        'MONGODB_PORT_27017_TCP_PORT'], db)))
        elif ctx.invoked_subcommand == 'bench':
            # Benchmarks get a fresh ./data/bench unless a custom data path
            # was given.
            if kwargs['data_path'] == './data':
                kwargs['data_path'] += '/bench'
                shutil.rmtree(kwargs['data_path'], ignore_errors=True)
                ensure_dir(kwargs['data_path'])
            if db in ('taskdb', 'resultdb'):
                kwargs[db] = utils.Get(
                    lambda db=db: connect_database('sqlite+%s://' % (db)))
            else:
                kwargs[db] = utils.Get(
                    lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % (
                        db, kwargs['data_path'], db[:-2])))
        else:
            ensure_dir(kwargs['data_path'])
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'sqlite+%s:///%s/%s.db' % (db, kwargs['data_path'], db[:-2])))
            kwargs['is_%s_default' % db] = True

    # create folder for counter.dump
    ensure_dir(kwargs['data_path'])

    # queue
    queue_names = ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                   'fetcher2processor', 'processor2result')
    # The backend modules are imported inside their branch so only the
    # selected backend has to be importable.
    if kwargs.get('amqp_url'):
        from pyspider.libs.rabbitmq import Queue
        for name in queue_names:
            kwargs[name] = utils.Get(
                lambda name=name: Queue(name,
                                        amqp_url=kwargs['amqp_url'],
                                        maxsize=kwargs['queue_maxsize']))
    elif os.environ.get('RABBITMQ_NAME'):
        from pyspider.libs.rabbitmq import Queue
        amqp_url = ("amqp://*****:*****@%(RABBITMQ_PORT_5672_TCP_ADDR)s"
                    ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ)
        for name in queue_names:
            kwargs[name] = utils.Get(lambda name=name: Queue(
                name, amqp_url=amqp_url, maxsize=kwargs['queue_maxsize']))
    elif kwargs.get('beanstalk'):
        from pyspider.libs.beanstalk import Queue
        for name in queue_names:
            kwargs[name] = utils.Get(
                lambda name=name: Queue(name,
                                        host=kwargs.get('beanstalk'),
                                        maxsize=kwargs['queue_maxsize']))
    else:
        # No external broker configured: use multiprocessing queues, created
        # right away.
        from multiprocessing import Queue
        for name in queue_names:
            kwargs[name] = Queue(kwargs['queue_maxsize'])

    # phantomjs-proxy
    if kwargs.get('phantomjs_proxy'):
        pass
    elif os.environ.get('PHANTOMJS_NAME'):
        # Strip the leading 'tcp://' to keep only host:port.
        kwargs['phantomjs_proxy'] = os.environ['PHANTOMJS_PORT_25555_TCP'][
            len('tcp://'):]

    ctx.obj = utils.ObjectDict(ctx.obj or {})
    ctx.obj['instances'] = []
    ctx.obj.update(kwargs)

    if ctx.invoked_subcommand is None and not ctx.obj.get('testing_mode'):
        ctx.invoke(all)
    return ctx
Ejemplo n.º 10
0
def phantomjs(ctx, phantomjs_path, ports, auto_restart, args):
    """
    Run phantomjs fetcher if phantomjs is installed.
    """
    # NOTE(review): this looks like a click command, whose docstring is shown
    # as --help text, so the extended documentation lives in comments.
    #
    # Starts one phantomjs process per port.
    #
    # ports: "first,last" (inclusive range) or a single port such as "25555".
    # args:  extra phantomjs options, either a sequence or a single string.
    #
    # Returns None if phantomjs cannot be started (children already started
    # are stopped again), the ObjectDict(port=[...], quit=...) handle when
    # ctx.obj has `testing_mode` set, and otherwise blocks while supervising
    # the children: with `auto_restart` exited children are relaunched until
    # quit() is called, without it the call returns once all have exited.
    import subprocess
    import time

    args = args or ctx.default_map and ctx.default_map.get('args', [])
    # Accept both a sequence of options and one option string. Nothing is
    # encoded by hand; subprocess takes native strings.
    if isinstance(args, (list, tuple)):
        extra_args = list(args)
    elif args:
        extra_args = [args]
    else:
        extra_args = []

    bounds = [int(part) for part in ports.split(',')]
    first_port = bounds[0]
    # A lone port is a range of one instead of an IndexError.
    last_port = bounds[1] if len(bounds) > 1 else first_port
    port_list = list(range(first_port, last_port + 1))

    g = ctx.obj
    _quit = []  # becomes non-empty once quit() has been called
    phantomjs_fetcher = os.path.join(os.path.dirname(pyspider.__file__),
                                     'fetcher/phantomjs_fetcher.js')

    def build_cmd(port):
        # One command builder for first launch and restart, so both always
        # run phantomjs with the same options.
        return [
            phantomjs_path,
            # NOTE: disabling image loading may cause a memory leak:
            # https://github.com/ariya/phantomjs/issues/12903
            '--load-images=false',
            '--ssl-protocol=any',
            '--disk-cache=true'
        ] + extra_args + [phantomjs_fetcher, str(port)]

    _phantomjs = {}  # str(port) -> running Popen
    for port in port_list:
        cmd = build_cmd(port)
        try:
            _phantomjs[str(port)] = subprocess.Popen(cmd)
        except OSError:
            logging.warning(
                'phantomjs not found, continue running without it. and cmd is '
                + str(cmd))
            # Do not leak the children that did start.
            for started in _phantomjs.values():
                started.kill()
                started.wait()
            return None

    def quit(*args, **kwargs):
        # Flag the shutdown before killing so the supervising loop below
        # does not relaunch the children.
        _quit.append(1)
        for child in list(_phantomjs.values()):
            child.kill()
            child.wait()
        logging.info('phantomjs exited.')

    # Publish the comma-separated proxy list only if none is configured yet.
    if not g.get('phantomjs_proxy'):
        g['phantomjs_proxy'] = ','.join(
            '127.0.0.1:%s' % port for port in port_list)

    phantomjs = utils.ObjectDict(port=port_list, quit=quit)
    g.instances.append(phantomjs)
    if g.get('testing_mode'):
        return phantomjs

    # Poll instead of blocking on one child's wait(), so a crash of any
    # child is noticed while the others keep running.
    while not _quit:
        running = False
        for key in list(_phantomjs):
            if _phantomjs[key].poll() is None:
                running = True
            elif auto_restart and not _quit:
                _phantomjs[key] = subprocess.Popen(build_cmd(key))
                running = True
        if not running:
            break
        time.sleep(1)