Example #1
def resultdb_migrating(project, from_connection, to_connection):
    logging.info("resultdb: %s", project)
    f = connect_database(from_connection)
    t = connect_database(to_connection)
    t.drop(project)
    for result in f.select(project):
        t.save(project, result['taskid'], result['url'], result['result'])
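For context, a minimal sketch of how this might be invoked; the project name and connection URLs below are illustrative, following the URL scheme used throughout these examples:

# Copy the results of one project from MySQL into a SQLite file.
resultdb_migrating(
    'my_project',
    'mysql+resultdb://localhost:3306/resultdb',
    'sqlite+resultdb:///data/result.db',
)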
Example #2
def taskdb_migrating(project, from_connection, to_connection):
    logging.info("taskdb: %s", project)
    f = connect_database(from_connection)
    t = connect_database(to_connection)
    t.drop(project)
    for status in range(1, 5):
        for task in f.load_tasks(status, project=project):
            t.insert(project, task['taskid'], task)
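The range(1, 5) loop walks pyspider's four task states; assuming the constants defined in pyspider.database.base.taskdb, they line up as:

# Assumed task status codes (pyspider.database.base.taskdb);
# range(1, 5) covers exactly these four values.
ACTIVE, SUCCESS, FAILED, BAD = 1, 2, 3, 4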
Example #3
def cli(ctx, **kwargs):
    """
    A powerful spider system in Python.
    """
    logging.config.fileConfig(os.path.join(os.path.dirname(__file__), "logging.conf"))

    # get db from env
    for db in ('taskdb', 'projectdb', 'resultdb'):
        if kwargs[db] is not None:
            continue
        if os.environ.get('MYSQL_NAME'):
            kwargs[db] = Get(lambda db=db: connect_database('mysql+%s://%s:%s/%s' % (
                db, os.environ['MYSQL_PORT_3306_TCP_ADDR'],
                os.environ['MYSQL_PORT_3306_TCP_PORT'], db)))
        elif os.environ.get('MONGODB_NAME'):
            kwargs[db] = Get(lambda db=db: connect_database('mongodb+%s://%s:%s/%s' % (
                db, os.environ['MONGODB_PORT_27017_TCP_ADDR'],
                os.environ['MONGODB_PORT_27017_TCP_PORT'], db)))
        else:
            if not os.path.exists(kwargs['data_path']):
                os.mkdir(kwargs['data_path'])
            kwargs[db] = Get(lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % (
                db, kwargs['data_path'], db[:-2])))

    # queue
    if kwargs.get('amqp_url'):
        from pyspider.libs.rabbitmq import Queue
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = Get(lambda name=name: Queue(name, amqp_url=kwargs['amqp_url'],
                                                       maxsize=kwargs['queue_maxsize']))
    elif os.environ.get('RABBITMQ_NAME'):
        from pyspider.libs.rabbitmq import Queue
        amqp_url = ("amqp://*****:*****@%(RABBITMQ_PORT_5672_TCP_ADDR)s"
                    ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ)
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = Get(lambda name=name: Queue(name, amqp_url=amqp_url,
                                                       maxsize=kwargs['queue_maxsize']))
    else:
        from multiprocessing import Queue
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = Queue(kwargs['queue_maxsize'])

    # phantomjs-proxy
    if kwargs.get('phantomjs_proxy'):
        pass
    elif os.environ.get('PHANTOMJS_NAME'):
        kwargs['phantomjs_proxy'] = os.environ['PHANTOMJS_PORT'][len('tcp://'):]

    ctx.obj['instances'] = []
    ctx.obj.update(kwargs)

    if ctx.invoked_subcommand is None and not ctx.obj.get('testing_mode'):
        ctx.invoke(all)
    return ctx
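Every variant above and below funnels into the same connect_database entry point, whose URL shape (inferred from these examples) is <engine>+<dbtype>://[user:password@]host[:port]/<database>, with dbtype one of taskdb, projectdb, resultdb. A few illustrative calls:

from pyspider.database import connect_database

# Illustrative URLs, mirroring the patterns used in these examples.
taskdb = connect_database('sqlite+taskdb:///data/task.db')
resultdb = connect_database('mongodb+resultdb://localhost:27017/pyspider')
projectdb = connect_database('sqlalchemy+mysql+projectdb://root@localhost/pyspider_projectdb')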
Example #4
class g(object):
    taskdb = Get(
        lambda: connect_database('sqlite+taskdb:///data/tests/task.db'))
    projectdb = Get(
        lambda: connect_database('sqlite+projectdb:///data/tests/project.db'))
    resultdb = Get(
        lambda: connect_database('sqlite+resultdb:///data/tests/result.db'))

    newtask_queue = Queue(100)
    status_queue = Queue(100)
    scheduler2fetcher = Queue(100)
    fetcher2processor = Queue(100)
    processor2result = Queue(100)
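Get defers opening each connection until the attribute is first read. pyspider ships it as pyspider.libs.utils.Get; a minimal sketch of such a descriptor follows (an illustration, not the exact library source — pyspider's ObjectDict also invokes __get__ on dict lookups, which is how Get values stored in kwargs stay lazy):

class Get(object):
    """Descriptor that calls the wrapped factory on each attribute access."""

    def __init__(self, getter):
        self.getter = getter

    def __get__(self, instance, owner):
        # Nothing connects until the attribute is actually read.
        return self.getter()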
Example #5
@classmethod
def setUpClass(self):
    # create a test admin user
    import requests
    requests.put('http://localhost:5984/_node/_local/_config/admins/test',
                 data='"password"')
    os.environ["COUCHDB_USER"] = "******"
    os.environ["COUCHDB_PASSWORD"] = "******"
    self.taskdb = database.connect_database(
        'couchdb+taskdb://localhost:5984/'
    )
    # unittest idiom used in pyspider's tests: the class object stands in for
    # the TestCase instance, so this asserts that taskdb is not None
    self.assertIsNotNone(self, self.taskdb)
Example #6
def migrate(pool, from_connection, to_connection):
    """
    Migration tool for pyspider
    """
    f = connect_database(from_connection)
    t = connect_database(to_connection)

    if isinstance(f, ProjectDB):
        for each in f.get_all():
            each = unicode_obj(each)
            logging.info("projectdb: %s", each['name'])
            t.drop(each['name'])
            t.insert(each['name'], each)
    elif isinstance(f, TaskDB):
        pool = Pool(pool)
        pool.map(lambda x, f=from_connection, t=to_connection:
                 taskdb_migrating(x, f, t),
                 f.projects)
    elif isinstance(f, ResultDB):
        pool = Pool(pool)
        pool.map(lambda x, f=from_connection, t=to_connection:
                 resultdb_migrating(x, f, t),
                 f.projects)
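A direct call might look like the sketch below; the pool size and URLs are illustrative, and the dbtype segment of the URL decides which isinstance branch runs:

# Hypothetical: migrate every project's results from MongoDB to MySQL
# using four worker processes.
migrate(
    4,
    'mongodb+resultdb://localhost:27017/resultdb',
    'mysql+resultdb://localhost:3306/resultdb',
)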
Example #7
def connect_db(ctx, param, value):
    if value is None:
        return
    return Get(lambda: connect_database(value))
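connect_db has the (ctx, param, value) signature of a click option callback; a sketch of how it could be wired up (the option name and command below are assumptions for illustration):

import click

@click.command()
@click.option('--taskdb', callback=connect_db, help='database url for taskdb')
def serve(taskdb):
    # taskdb arrives as the lazy Get(...) wrapper, or None when the option is unset
    pass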
Example #8
@classmethod
def setUpClass(self):
    self.resultdb = database.connect_database(
        'sqlalchemy+postgresql+resultdb://postgres@127.0.0.1/pyspider_test_resultdb'
    )
Example #9
@classmethod
def setUpClass(self):
    self.resultdb = database.connect_database(
        'sqlalchemy+sqlite+resultdb://'
    )
Example #10
@classmethod
def setUpClass(self):
    self.taskdb = database.connect_database(
        'sqlalchemy+sqlite+taskdb://'
    )
Example #11
@classmethod
def setUpClass(self):
    self.resultdb = database.connect_database(
        'mongodb+resultdb://localhost/pyspider_test_resultdb'
    )
Example #12
@classmethod
def setUpClass(self):
    self.taskdb = database.connect_database(
        'mongodb+taskdb://localhost:27017/pyspider_test_taskdb'
    )
Example #13
@classmethod
def setUpClass(self):
    self.resultdb = database.connect_database(
        'mysql+resultdb://localhost/pyspider_test_resultdb'
    )
    self.assertIsNotNone(self, self.resultdb)
Example #14
@classmethod
def setUpClass(self):
    self.projectdb = database.connect_database('sqlite+projectdb://')
    self.assertIsNotNone(self, self.projectdb)
Example #15
def one(ctx, interactive, enable_phantomjs, scripts):
    """
    One mode not only means all-in-one: it runs everything in one process over
    tornado.ioloop, for debugging purposes
    """

    ctx.obj["debug"] = False
    g = ctx.obj
    g["testing_mode"] = True

    if scripts:
        from pyspider.database.local.projectdb import ProjectDB

        g["projectdb"] = ProjectDB(scripts)
        if g.get("is_taskdb_default"):
            g["taskdb"] = connect_database("sqlite+taskdb://")
        if g.get("is_resultdb_default"):
            g["resultdb"] = None

    if enable_phantomjs:
        phantomjs_config = g.config.get("phantomjs", {})
        phantomjs_obj = ctx.invoke(phantomjs, **phantomjs_config)
        if phantomjs_obj:
            g.setdefault("phantomjs_proxy", "127.0.0.1:%s" % phantomjs_obj.port)
    else:
        phantomjs_obj = None

    result_worker_config = g.config.get("result_worker", {})
    if g.resultdb is None:
        result_worker_config.setdefault("result_cls", "pyspider.result.OneResultWorker")
    result_worker_obj = ctx.invoke(result_worker, **result_worker_config)

    processor_config = g.config.get("processor", {})
    processor_config.setdefault("enable_stdout_capture", False)
    processor_obj = ctx.invoke(processor, **processor_config)

    fetcher_config = g.config.get("fetcher", {})
    fetcher_config.setdefault("xmlrpc", False)
    fetcher_obj = ctx.invoke(fetcher, **fetcher_config)

    scheduler_config = g.config.get("scheduler", {})
    scheduler_config.setdefault("xmlrpc", False)
    scheduler_config.setdefault("scheduler_cls", "pyspider.scheduler.OneScheduler")
    scheduler_obj = ctx.invoke(scheduler, **scheduler_config)

    scheduler_obj.init_one(
        ioloop=fetcher_obj.ioloop,
        fetcher=fetcher_obj,
        processor=processor_obj,
        result_worker=result_worker_obj,
        interactive=interactive,
    )
    if scripts:
        for project in g.projectdb.projects:
            scheduler_obj.trigger_on_start(project)

    try:
        scheduler_obj.run()
    finally:
        scheduler_obj.quit()
        if phantomjs_obj:
            phantomjs_obj.quit()
Example #16
def cli(ctx, **kwargs):
    """
    A powerful spider system in Python.
    """
    if kwargs["add_sys_path"]:
        sys.path.append(os.getcwd())

    logging.config.fileConfig(kwargs["logging_config"])

    # get db from env
    for db in ("taskdb", "projectdb", "resultdb"):
        if kwargs[db] is not None:
            continue
        if os.environ.get("MYSQL_NAME"):
            kwargs[db] = utils.Get(
                lambda db=db: connect_database(
                    "sqlalchemy+mysql+%s://%s:%s/%s"
                    % (db, os.environ["MYSQL_PORT_3306_TCP_ADDR"], os.environ["MYSQL_PORT_3306_TCP_PORT"], db)
                )
            )
        elif os.environ.get("MONGODB_NAME"):
            kwargs[db] = utils.Get(
                lambda db=db: connect_database(
                    "mongodb+%s://%s:%s/%s"
                    % (db, os.environ["MONGODB_PORT_27017_TCP_ADDR"], os.environ["MONGODB_PORT_27017_TCP_PORT"], db)
                )
            )
        elif ctx.invoked_subcommand == "bench":
            if kwargs["data_path"] == "./data":
                kwargs["data_path"] += "/bench"
                shutil.rmtree(kwargs["data_path"], ignore_errors=True)
                os.mkdir(kwargs["data_path"])
            if db in ("taskdb", "resultdb"):
                kwargs[db] = utils.Get(lambda db=db: connect_database("sqlite+%s://" % (db)))
            else:
                kwargs[db] = utils.Get(
                    lambda db=db: connect_database("sqlite+%s:///%s/%s.db" % (db, kwargs["data_path"], db[:-2]))
                )
        else:
            if not os.path.exists(kwargs["data_path"]):
                os.mkdir(kwargs["data_path"])
            kwargs[db] = utils.Get(
                lambda db=db: connect_database("sqlite+%s:///%s/%s.db" % (db, kwargs["data_path"], db[:-2]))
            )
            kwargs["is_%s_default" % db] = True

    # create folder for counter.dump
    if not os.path.exists(kwargs["data_path"]):
        os.mkdir(kwargs["data_path"])

    # message queue, compatible with old version
    if kwargs.get("message_queue"):
        pass
    elif kwargs.get("amqp_url"):
        kwargs["message_queue"] = kwargs["amqp_url"]
    elif os.environ.get("RABBITMQ_NAME"):
        kwargs["message_queue"] = (
            "amqp://*****:*****@%(RABBITMQ_PORT_5672_TCP_ADDR)s" ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ
        )
    elif kwargs.get("beanstalk"):
        kwargs["message_queue"] = "beanstalk://%s/" % kwargs["beanstalk"]

    for name in ("newtask_queue", "status_queue", "scheduler2fetcher", "fetcher2processor", "processor2result"):
        if kwargs.get("message_queue"):
            kwargs[name] = utils.Get(
                lambda name=name: connect_message_queue(name, kwargs.get("message_queue"), kwargs["queue_maxsize"])
            )
        else:
            kwargs[name] = connect_message_queue(name, kwargs.get("message_queue"), kwargs["queue_maxsize"])

    # phantomjs-proxy
    if kwargs.get("phantomjs_proxy"):
        pass
    elif os.environ.get("PHANTOMJS_NAME"):
        kwargs["phantomjs_proxy"] = os.environ["PHANTOMJS_PORT_25555_TCP"][len("tcp://") :]

    ctx.obj = utils.ObjectDict(ctx.obj or {})
    ctx.obj["instances"] = []
    ctx.obj.update(kwargs)

    if ctx.invoked_subcommand is None and not ctx.obj.get("testing_mode"):
        ctx.invoke(all)
    return ctx
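Here the five queues go through connect_message_queue, so a single URL picks the backend. Standalone calls would look roughly like this (same positional signature as used above; the amqp URL is illustrative):

from pyspider.message_queue import connect_message_queue

# No URL: a local in-process queue. With an amqp:// URL: RabbitMQ.
local_q = connect_message_queue('newtask_queue', None, 100)
rabbit_q = connect_message_queue('newtask_queue',
                                 'amqp://guest:guest@localhost:5672/%2F', 100)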
Example #17
import psycopg2
import requests
from bs4 import BeautifulSoup

from pyspider.database import connect_database

conn = psycopg2.connect(database="resultdb", user="******", password="", host="", port="")
cur = conn.cursor()
resultdb = connect_database("sqlalchemy+postgresql+resultdb://postgres:@10.1.36.183:5432/resultdb")
#result=resultdb.select('test6').next()
#row_result = result['result']
#url=row_result['wages_and_employment_content']
#print type(url),url

## Scrape the data


## Scrape the relevant data and store it in the database

## Follow further links and return a list of dicts (with the related content)
def get_more_touchs(list_content, types):
    for each in list_content:
        url = each[0]
        try:
            r = requests.get(url, headers=header)  # 'header' is defined elsewhere in the original script
            soup = BeautifulSoup(r.text)
            websites = {
                "detailed_work_activities": "/search/dwa/compare/.*?g=Continue",
                "work_context": "^/find/descriptor/result/.*?",
                "work_values_content": "^/explore/workvalues/.*?",
                "work_styles_content": "^/find/descriptor/result/.*?",
                "work_activities": "^/find/descriptor/result/.*?",
                "skills_content": "^/find/descriptor/result/.*?",
                "knowledge_content": "^/find/descriptor/result/.*?",
                "interests": "^/explore/interests/.*?",
                "abilities": "^/explore/interests/.*?",
            }  # the original example is truncated here, mid-function
Example #18
def one(ctx, interactive, enable_phantomjs, scripts):
    """
    One mode not only means all-in-one: it runs everything in one process over
    tornado.ioloop, for debugging purposes

    * webui is not running in one mode.
    * SCRIPTS is the script file path of the project
        - when set, taskdb and resultdb will use an in-memory sqlite db by default
        - when set, the on_start callback will be triggered on start
    * the status of the project is always RUNNING.
    * rate and burst can be set in the script with comments like:
        # rate: 1.0
        # burst: 3
    """

    ctx.obj['debug'] = False
    g = ctx.obj
    g['testing_mode'] = True

    if scripts:
        from pyspider.database.local.projectdb import ProjectDB
        g['projectdb'] = ProjectDB(scripts)
        if g.get('is_taskdb_default'):
            g['taskdb'] = connect_database('sqlite+taskdb://')
        if g.get('is_resultdb_default'):
            g['resultdb'] = connect_database('sqlite+resultdb://')

    if enable_phantomjs:
        phantomjs_config = g.config.get('phantomjs', {})
        phantomjs_obj = ctx.invoke(phantomjs, **phantomjs_config)
        if phantomjs_obj:
            g.setdefault('phantomjs_proxy', 'localhost:%s' % phantomjs_obj.port)
    else:
        phantomjs_obj = None

    result_worker_config = g.config.get('result_worker', {})
    result_worker_obj = ctx.invoke(result_worker, **result_worker_config)

    processor_config = g.config.get('processor', {})
    processor_obj = ctx.invoke(processor, **processor_config)

    fetcher_config = g.config.get('fetcher', {})
    fetcher_config.setdefault('xmlrpc', False)
    fetcher_obj = ctx.invoke(fetcher, **fetcher_config)

    scheduler_config = g.config.get('scheduler', {})
    scheduler_config.setdefault('xmlrpc', False)
    scheduler_config.setdefault('scheduler_cls',
                                'pyspider.scheduler.scheduler.OneScheduler')
    scheduler_obj = ctx.invoke(scheduler, **scheduler_config)

    scheduler_obj.init_one(ioloop=fetcher_obj.ioloop,
                           fetcher=fetcher_obj,
                           processor=processor_obj,
                           result_worker=result_worker_obj,
                           interactive=interactive)
    if scripts:
        for project in g.projectdb.projects:
            scheduler_obj.trigger_on_start(project)
    try:
        scheduler_obj.run()
    except KeyboardInterrupt:
        scheduler_obj.quit()
        if phantomjs_obj:
            phantomjs_obj.quit()
        raise
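The rate/burst comments the docstring mentions sit at the top of a project script; a minimal, illustrative script of that shape (the URL is a placeholder):

# rate: 1.0
# burst: 3
from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('http://example.com/', callback=self.index_page)

    def index_page(self, response):
        return {'title': response.doc('title').text()}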
Example #19
def cli(ctx, **kwargs):
    """
    A powerful spider system in Python.
    """
    logging.config.fileConfig(os.path.join(os.path.dirname(__file__), "logging.conf"))

    # get db from env
    for db in ("taskdb", "projectdb", "resultdb"):
        if kwargs[db] is not None:
            continue
        if os.environ.get("MYSQL_NAME"):
            kwargs[db] = utils.Get(
                lambda db=db: connect_database(
                    "mysql+%s://%s:%s/%s"
                    % (db, os.environ["MYSQL_PORT_3306_TCP_ADDR"], os.environ["MYSQL_PORT_3306_TCP_PORT"], db)
                )
            )
        elif os.environ.get("MONGODB_NAME"):
            kwargs[db] = utils.Get(
                lambda db=db: connect_database(
                    "mongodb+%s://%s:%s/%s"
                    % (db, os.environ["MONGODB_PORT_27017_TCP_ADDR"], os.environ["MONGODB_PORT_27017_TCP_PORT"], db)
                )
            )
        elif ctx.invoked_subcommand == "bench":
            if kwargs["data_path"] == "./data":
                kwargs["data_path"] += "/bench"
                shutil.rmtree(kwargs["data_path"], ignore_errors=True)
                os.mkdir(kwargs["data_path"])
            if db in ("taskdb", "resultdb"):
                kwargs[db] = utils.Get(lambda db=db: connect_database("sqlite+%s://" % (db)))
            else:
                kwargs[db] = utils.Get(
                    lambda db=db: connect_database("sqlite+%s:///%s/%s.db" % (db, kwargs["data_path"], db[:-2]))
                )
        else:
            if not os.path.exists(kwargs["data_path"]):
                os.mkdir(kwargs["data_path"])
            kwargs[db] = utils.Get(
                lambda db=db: connect_database("sqlite+%s:///%s/%s.db" % (db, kwargs["data_path"], db[:-2]))
            )

    # queue
    if kwargs.get("amqp_url"):
        from pyspider.libs.rabbitmq import Queue

        for name in ("newtask_queue", "status_queue", "scheduler2fetcher", "fetcher2processor", "processor2result"):
            kwargs[name] = utils.Get(
                lambda name=name: Queue(name, amqp_url=kwargs["amqp_url"], maxsize=kwargs["queue_maxsize"])
            )
    elif os.environ.get("RABBITMQ_NAME"):
        from pyspider.libs.rabbitmq import Queue

        amqp_url = (
            "amqp://*****:*****@%(RABBITMQ_PORT_5672_TCP_ADDR)s" ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ
        )
        for name in ("newtask_queue", "status_queue", "scheduler2fetcher", "fetcher2processor", "processor2result"):
            kwargs[name] = utils.Get(lambda name=name: Queue(name, amqp_url=amqp_url, maxsize=kwargs["queue_maxsize"]))
    else:
        from multiprocessing import Queue

        for name in ("newtask_queue", "status_queue", "scheduler2fetcher", "fetcher2processor", "processor2result"):
            kwargs[name] = Queue(kwargs["queue_maxsize"])

    # phantomjs-proxy
    if kwargs.get("phantomjs_proxy"):
        pass
    elif os.environ.get("PHANTOMJS_NAME"):
        kwargs["phantomjs_proxy"] = os.environ["PHANTOMJS_PORT"][len("tcp://") :]

    ctx.obj = utils.ObjectDict(ctx.obj or {})
    ctx.obj["instances"] = []
    ctx.obj.update(kwargs)

    if ctx.invoked_subcommand is None and not ctx.obj.get("testing_mode"):
        ctx.invoke(all)
    return ctx
Example #20
@classmethod
def setUpClass(self):
    self.projectdb = database.connect_database(
        'sqlalchemy+mysql+mysqlconnector+projectdb://root@localhost/pyspider_test_projectdb'
    )
    self.assertIsNotNone(self, self.projectdb)
Example #21
@classmethod
def setUpClass(self):
    self.taskdb = database.connect_database('mysql+taskdb://localhost/pyspider_test_taskdb')
Example #22
@classmethod
def setUpClass(self):
    self.resultdb = database.connect_database(
        'sqlalchemy+sqlite+resultdb://'
    )
    self.assertIsNotNone(self, self.resultdb)
Example #23
@classmethod
def setUpClass(self):
    self.projectdb = database.connect_database(
        'mongodb+projectdb://localhost/pyspider_test_projectdb'
    )
Example #24
class Handler(BaseHandler):
    crawl_config = {}

    author_projectdb = database.connect_database(
        'elasticsearch+projectdb://127.0.0.1:9200/?index=author')
    note_projectdb = database.connect_database(
        'elasticsearch+projectdb://127.0.0.1:9200/?index=note')
    flower_projectdb = database.connect_database(
        'elasticsearch+projectdb://127.0.0.1:9200/?index=flower')

    # Handle a post's content: extract the reply's target_id, the post body, and the posting client
    def handle_content(self, content_area, building_id):
        hl_md5 = hashlib.md5()
        # Raw post content. A quoted reply looks like: 引用回帖:4楼:Originallypostedby含笑木香at2018-04-0321:08:49
        # followed by the reply text and a client suffix such as 发自小木虫IOS客户端
        raw_content = content_area("div[class='t_fsz']").find(
            "td:eq(0)").text().replace("\n", "").replace(" ", "")
        if (raw_content == ""):
            target_id = ""
            content = ""
            device = ""
            return target_id, content, device
        else:
            # Handle posts that quote an earlier reply
            if raw_content.startswith("引用回帖"):
                # Extract the date inside the quoted reply
                maybe_date = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\d{1,2}:\d{1,2}:\d{1,2})",
                    raw_content)
                if not maybe_date:
                    target_id = ""
                    content = ""
                    device = ""
                    return target_id, content, device
                founded_date = maybe_date.group(0)
                # Slice the quoted section out, between key1=引用回帖: and key2=at
                reference_str = raw_content[raw_content.find("引用回帖:") +
                                            5:raw_content.find(founded_date) -
                                            2]
                # Join the member name and floor, plus the date above, into raw_id
                raw_id = reference_str[reference_str.find("Originallypostedby"
                                                          ) +
                                       18:] + founded_date + reference_str[:1]
                hl_md5.update(raw_id.replace(" ", "").encode(encoding='utf-8'))
                target_id = hl_md5.hexdigest()
                # Extract the post body
                content = raw_content[raw_content.find(founded_date) +
                                      18:raw_content.find("发自小木虫")]
                # Extract the posting client
                if raw_content.find("发自小木虫") != -1:
                    device = raw_content[raw_content.find("发自小木虫") +
                                         5:raw_content.find("客户端")]
                else:
                    device = "PC"
                return target_id, content, device
            else:
                target_id = building_id
                content = raw_content[:raw_content.find("发自小木虫")]
                if raw_content.find("发自小木虫") != -1:
                    device = raw_content[raw_content.find("发自小木虫") +
                                         5:raw_content.find("客户端")]
                else:
                    device = "PC"
                return target_id, content, device

    # Entry point
    @every(minutes=2.5 * 24 * 60)
    def on_start(self):
        self.crawl('http://muchong.com/bbs',
                   callback=self.index_page,
                   cookies={
                       "Hm_lpvt_2207ecfb7b2633a3bc5c4968feb58569":
                       "1522564279",
                       "Hm_lvt_2207ecfb7b2633a3bc5c4968feb58569": "1522564172",
                       "_discuz_pw": "9a1449a8990d49a6",
                       "_discuz_uid": "3302227",
                       "_emuch_index": "1",
                       "_ga": "GA1.2.1902872401.1522564172",
                       "_gat": "1"
                   })

    # age is in seconds
    @config(age=36 * 60 * 60)
    def index_page(self, response):
        context = response.doc
        for each_area in context.find(
                "div.forum_Box.bg_global.xmc_line_lr.xmc_line_bno").items():
            self.handle_first_area(
                each_area("h2 strong").text(), each_area("table"))

    # Handle a first-level board, e.g. 网络生活区
    def handle_first_area(self, first_area_name, second_area_table):
        second_area = second_area_table.find("td")
        for each_second_area in second_area.items():
            second_area_link = each_second_area.find(
                "div.xmc_fl.xmc_forum_width h4.xmc_blue a")
            second_area_name = second_area_link.text()
            second_area_href = second_area_link.attr("href")
            self.handle_second_area(first_area_name, second_area_name,
                                    second_area_href)

    # Handle a second-level board, e.g. 休闲灌水; this enters the first page of its listing, url: http://muchong.com/f-6-1
    def handle_second_area(self, first_area_name, second_area_name,
                           second_area_href):
        if first_area_name != "" and second_area_name != "" and second_area_href != "":
            self.crawl(second_area_href,
                       callback=self.second_index_page,
                       cookies={
                           "Hm_lpvt_2207ecfb7b2633a3bc5c4968feb58569":
                           "1522564279",
                           "Hm_lvt_2207ecfb7b2633a3bc5c4968feb58569":
                           "1522564172",
                           "_discuz_pw": "9a1449a8990d49a6",
                           "_discuz_uid": "3302227",
                           "_emuch_index": "1",
                           "_ga": "GA1.2.1902872401.1522564172",
                           "_gat": "1"
                       })

    # Gather all threads under the second-level board, crawling page by page (get the total page count first, then build each page's url)
    def second_index_page(self, response):
        context = response.doc
        total_page = context.find("td.header:eq(1)").text()
        total_page = total_page[total_page.find("/") + 1:]
        basic_url = response.url
        if not total_page == "":
            total_page = int(total_page)
            if total_page > 200:
                total_page = 200
            # Iterate over every page
            for page in range(total_page):
                each_page_url = basic_url[:basic_url.rfind("-") +
                                          1] + str(page + 1)
                self.crawl(each_page_url,
                           callback=self.handle_each_second_index_page,
                           cookies={
                               "Hm_lpvt_2207ecfb7b2633a3bc5c4968feb58569":
                               "1522564279",
                               "Hm_lvt_2207ecfb7b2633a3bc5c4968feb58569":
                               "1522564172",
                               "_discuz_pw": "9a1449a8990d49a6",
                               "_discuz_uid": "3302227",
                               "_emuch_index": "1",
                               "_ga": "GA1.2.1902872401.1522564172",
                               "_gat": "1"
                           })

    # Crawl every thread under the second-level board
    def handle_each_second_index_page(self, response):
        context = response.doc
        notes_titles = context.find("th.thread-name")
        for each_note in notes_titles.items():
            if each_note is not None:
                self.crawl(each_note("a.a_subject").attr("href"),
                           callback=self.note_index,
                           cookies={
                               "Hm_lpvt_2207ecfb7b2633a3bc5c4968feb58569":
                               "1522564279",
                               "Hm_lvt_2207ecfb7b2633a3bc5c4968feb58569":
                               "1522564172",
                               "_discuz_pw": "9a1449a8990d49a6",
                               "_discuz_uid": "3302227",
                               "_emuch_index": "1",
                               "_ga": "GA1.2.1902872401.1522564172",
                               "_gat": "1"
                           })

    # Get the thread's total page count and crawl it page by page
    def note_index(self, response):
        context = response.doc
        total_page = context.find("td.header:eq(1)").text()
        total_page = total_page[total_page.find("/") + 1:]
        basic_url = response.url
        basic_url = basic_url[:basic_url.rfind("-") + 1]
        if not total_page == "":
            total_page = int(total_page)
            if total_page > 200:
                total_page = 200
            # Iterate over every page
            for page in range(total_page):
                each_page_url = basic_url[:basic_url.rfind("-") +
                                          1] + str(page + 1)
                self.crawl(each_page_url,
                           callback=self.handle_note,
                           cookies={
                               "Hm_lpvt_2207ecfb7b2633a3bc5c4968feb58569":
                               "1522564279",
                               "Hm_lvt_2207ecfb7b2633a3bc5c4968feb58569":
                               "1522564172",
                               "_discuz_pw": "9a1449a8990d49a6",
                               "_discuz_uid": "3302227",
                               "_emuch_index": "1",
                               "_ga": "GA1.2.1902872401.1522564172",
                               "_gat": "1"
                           })

    def handle_note(self, response):
        request_url = response.url
        building_id = request_url[request_url.rfind("/") +
                                  1:request_url.rfind("-") + 1]
        page_sign = request_url[request_url.rfind("-") + 1:]
        # Get the whole doc
        context = response.doc
        for each_note in context("tbody[id^='pid']").items():
            # A fresh md5 object is needed on every iteration, otherwise the digests come out wrong
            hl_md5 = hashlib.md5()
            note = {}
            author = each_note.find("div.pls_user h3 a")
            # Author profile link
            author_link = author.attr("href")
            note["author_id"] = author_link[author_link.find("uid=") + 4:]
            author_actual_link = author_link.replace("muchong.com",
                                                     "muchong.com/bbs")
            # Floor and creation-time block
            floor_time_area = each_note.find("div[class='pls_info']")
            create_time = floor_time_area("em").text()
            note["create_time"] = create_time
            # Floor 1 shows as 1楼, floor 2 as 沙发, floor 3 as 板凳, floor 4 onward as N楼
            raw_floor = floor_time_area("span a").text()
            floor = "2楼" if raw_floor == "沙发" else (
                "3楼" if raw_floor == "板凳" else raw_floor)
            note["floor"] = floor[:-1]
            raw_id = author.text() + create_time + floor[:-1]
            hl_md5.update(raw_id.replace(" ", "").encode(encoding='utf-8'))
            note["id"] = hl_md5.hexdigest()
            # Post content area
            content_area = each_note.find(
                "td[class='plc_mind'] div[class='plc_Con']")
            # Record the thread starter's post id as the global id
            if note["floor"] == "1":
                note["title"] = content_area("h1").text()
                # On the first page of the thread, the first floor uses the thread id
                if int(page_sign) == 1:
                    note["id"] = building_id
            if content_area:
                target_id, content, device = self.handle_content(
                    content_area, building_id)
                note["target_id"] = target_id
                note["content"] = content
                note["device"] = device
            category_names = context.find("span.breadcrumb")
            # First-level category name
            note["first_category_name"] = category_names("a:eq(1)").text()
            # Second-level category name
            note["second_category_name"] = category_names("a:eq(2)").text()
            # Third-level category name
            note["third_category_name"] = category_names("a:eq(3)").text()
            note["building_id"] = building_id
            self.note_projectdb.es.index("note", "project", note, note["id"])
            # Crawl the author's profile
            self.crawl(author_actual_link,
                       callback=self.handle_author,
                       cookies={
                           "Hm_lpvt_2207ecfb7b2633a3bc5c4968feb58569":
                           "1522564279",
                           "Hm_lvt_2207ecfb7b2633a3bc5c4968feb58569":
                           "1522564172",
                           "_discuz_pw": "9a1449a8990d49a6",
                           "_discuz_uid": "3302227",
                           "_emuch_index": "1",
                           "_ga": "GA1.2.1902872401.1522564172",
                           "_gat": "1"
                       })

    def handle_author(self, response):
        hl_md5 = hashlib.md5()
        # Author info dict
        author = {}
        # Get the whole doc
        context = response.doc
        # Find the basic info: the table with class 'userinfo base'
        basic_information = context("table.userinfo.base")
        # Registration time
        register_time = basic_information("td:eq(0)").text()
        if register_time:
            author["register_time"] = register_time
        # Other basic info (there are three tables with class 'userinfo'; take the second)
        basic_information = context("table.userinfo:eq(1)")
        author["id"] = basic_information("tr:eq(0) td:eq(0)").text()
        author["name"] = context("div.space_index").find("a:eq(0)").text()
        author["sex"] = basic_information("tr:eq(4) td:eq(0)").text()
        birthday_time = basic_information("tr:eq(4) td:eq(2)").text()
        if (not birthday_time == "0000-00-00") and (not birthday_time == ""):
            author["birthday_time"] = birthday_time
        author["coin_num"] = basic_information("tr:eq(1) td:eq(1)").text()
        author["major"] = basic_information("tr:eq(3) td:eq(2)").text()
        author["help_num"] = basic_information("tr:eq(0) td:eq(2)").text()
        author["grant_num"] = basic_information("tr:eq(1) td:eq(2)").text()
        note_num_src = basic_information("tr:eq(2) td:eq(1)").text()
        note_num_desc = basic_information("tr:eq(2) td:eq(1) font").text()
        author["note_num"] = note_num_src.replace(note_num_desc, "").replace(
            "\n", "").replace(" ", "")
        composite_info = context("div.space_index table tr").find(
            "div:last").text()
        # Cut and slice the string (from 听众 onward)
        composite_info = composite_info[composite_info.find("听众"):].split(
            "\xa0")
        # Store by group
        # if len(composite_info) > 0 and not composite_info[0] == "":
        #    author["fans_num"] = composite_info[0][composite_info[0].find(":")+1:].replace(" ", "")
        #    print(author["fans_num"])
        # if len(composite_info) > 1 and not composite_info[1] == "":
        #    author["flower_num"] = composite_info[1][composite_info[1].find(":")+1:].replace(" ","")
        #    print(author["flower_num"])
        # Inspect the red flowers received
        flowers = context("table.userinfo:eq(2)").find("table")("tr td")
        # flower_num = 0
        for flower_row in flowers.items():
            flower = {}
            flower["owner_id"] = author["id"]
            flower["owner_name"] = author["name"]
            flower["sender_name"] = flower_row("a").text()
            flower_num = flower_row("font").text()[1:-1]
            flower["flower_num"] = "1" if flower_num == "" else flower_num
            # flower_num = flower_num+ int(flower["flower_num"])
            self.flower_projectdb.es.index("flower", "project", flower)
        # author["flower_num"] = flower_num
        hl_md5.update(author["id"].encode(encoding='utf-8'))
        self.author_projectdb.es.index("author", "project", author,
                                       hl_md5.hexdigest())
Example #25
@classmethod
def setUpClass(self):
    self.resultdb = database.connect_database(
        'sqlalchemy+mysql+mysqlconnector+resultdb://root@localhost/pyspider_test_resultdb'
    )
Example #26
def connect_db(ctx, param, value):
    if not value:
        return
    return utils.Get(lambda: connect_database(value))
Example #27
@classmethod
def setUpClass(self):
    self.projectdb = database.connect_database(
        'sqlalchemy+sqlite+projectdb://'
    )
Example #28
def cli(ctx, **kwargs):
    """
    A powerful spider system in Python.
    """
    logging.config.fileConfig(kwargs['logging_config'])

    # get db from env
    for db in ('taskdb', 'projectdb', 'resultdb'):
        if kwargs[db] is not None:
            continue
        if os.environ.get('MYSQL_NAME'):
            kwargs[db] = utils.Get(lambda db=db: connect_database('mysql+%s://%s:%s/%s' % (
                db, os.environ['MYSQL_PORT_3306_TCP_ADDR'],
                os.environ['MYSQL_PORT_3306_TCP_PORT'], db)))
        elif os.environ.get('MONGODB_NAME'):
            kwargs[db] = utils.Get(lambda db=db: connect_database('mongodb+%s://%s:%s/%s' % (
                db, os.environ['MONGODB_PORT_27017_TCP_ADDR'],
                os.environ['MONGODB_PORT_27017_TCP_PORT'], db)))
        elif ctx.invoked_subcommand == 'bench':
            if kwargs['data_path'] == './data':
                kwargs['data_path'] += '/bench'
                shutil.rmtree(kwargs['data_path'], ignore_errors=True)
                os.mkdir(kwargs['data_path'])
            if db in ('taskdb', 'resultdb'):
                kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s://' % (db)))
            else:
                kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % (
                    db, kwargs['data_path'], db[:-2])))
        else:
            if not os.path.exists(kwargs['data_path']):
                os.mkdir(kwargs['data_path'])
            kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % (
                db, kwargs['data_path'], db[:-2])))
            kwargs['is_%s_default' % db] = True

    # create folder for counter.dump
    if not os.path.exists(kwargs['data_path']):
        os.mkdir(kwargs['data_path'])

    # queue
    if kwargs.get('amqp_url'):
        from pyspider.libs.rabbitmq import Queue
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = utils.Get(lambda name=name: Queue(name, amqp_url=kwargs['amqp_url'],
                                                             maxsize=kwargs['queue_maxsize']))
    elif os.environ.get('RABBITMQ_NAME'):
        from pyspider.libs.rabbitmq import Queue
        amqp_url = ("amqp://*****:*****@%(RABBITMQ_PORT_5672_TCP_ADDR)s"
                    ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ)
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = utils.Get(lambda name=name: Queue(name, amqp_url=amqp_url,
                                                             maxsize=kwargs['queue_maxsize']))
    elif kwargs.get('beanstalk'):
        from pyspider.libs.beanstalk import Queue
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = utils.Get(lambda name=name: Queue(name, host=kwargs.get('beanstalk'),
                                                             maxsize=kwargs['queue_maxsize']))
    else:
        from multiprocessing import Queue
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = Queue(kwargs['queue_maxsize'])

    # phantomjs-proxy
    if kwargs.get('phantomjs_proxy'):
        pass
    elif os.environ.get('PHANTOMJS_NAME'):
        kwargs['phantomjs_proxy'] = os.environ['PHANTOMJS_PORT_25555_TCP'][len('tcp://'):]

    ctx.obj = utils.ObjectDict(ctx.obj or {})
    ctx.obj['instances'] = []
    ctx.obj.update(kwargs)

    if ctx.invoked_subcommand is None and not ctx.obj.get('testing_mode'):
        ctx.invoke(all)
    return ctx
Example #29
@classmethod
def setUpClass(self):
    self.projectdb = database.connect_database(
        'sqlalchemy+postgresql+projectdb://postgres@127.0.0.1:5432/pyspider_test_projectdb'
    )
Example #30
def one(ctx, interactive, enable_phantomjs, enable_puppeteer, scripts):
    """
    One mode not only means all-in-one: it runs everything in one process over
    tornado.ioloop, for debugging purposes
    """

    ctx.obj['debug'] = False
    g = ctx.obj
    g['testing_mode'] = True

    if scripts:
        from pyspider.database.local.projectdb import ProjectDB
        g['projectdb'] = ProjectDB(scripts)
        if g.get('is_taskdb_default'):
            g['taskdb'] = connect_database('sqlite+taskdb://')
        if g.get('is_resultdb_default'):
            g['resultdb'] = None

    if enable_phantomjs:
        phantomjs_config = g.config.get('phantomjs', {})
        phantomjs_obj = ctx.invoke(phantomjs, **phantomjs_config)
        if phantomjs_obj:
            g.setdefault('phantomjs_proxy', '127.0.0.1:%s' % phantomjs_obj.port)
    else:
        phantomjs_obj = None

    if enable_puppeteer:
        puppeteer_config = g.config.get('puppeteer', {})
        puppeteer_obj = ctx.invoke(puppeteer, **puppeteer_config)
        if puppeteer_obj:
            g.setdefault('puppeteer_proxy', '127.0.0.1:%s' % puppeteer_obj.port)
    else:
        puppeteer_obj = None

    result_worker_config = g.config.get('result_worker', {})
    if g.resultdb is None:
        result_worker_config.setdefault('result_cls',
                                        'pyspider.result.OneResultWorker')
    result_worker_obj = ctx.invoke(result_worker, **result_worker_config)

    processor_config = g.config.get('processor', {})
    processor_config.setdefault('enable_stdout_capture', False)
    processor_obj = ctx.invoke(processor, **processor_config)

    fetcher_config = g.config.get('fetcher', {})
    fetcher_config.setdefault('xmlrpc', False)
    fetcher_obj = ctx.invoke(fetcher, **fetcher_config)

    scheduler_config = g.config.get('scheduler', {})
    scheduler_config.setdefault('xmlrpc', False)
    scheduler_config.setdefault('scheduler_cls',
                                'pyspider.scheduler.OneScheduler')
    scheduler_obj = ctx.invoke(scheduler, **scheduler_config)

    scheduler_obj.init_one(ioloop=fetcher_obj.ioloop,
                           fetcher=fetcher_obj,
                           processor=processor_obj,
                           result_worker=result_worker_obj,
                           interactive=interactive)
    if scripts:
        for project in g.projectdb.projects:
            scheduler_obj.trigger_on_start(project)

    try:
        scheduler_obj.run()
    finally:
        scheduler_obj.quit()
        if phantomjs_obj:
            phantomjs_obj.quit()
        if puppeteer_obj:
            puppeteer_obj.quit()
Example #31
@classmethod
def setUpClass(self):
    self.taskdb = database.connect_database('redis+taskdb://localhost:6379/15')
    self.taskdb.__prefix__ = 'testtaskdb_'
Example #32
@classmethod
def setUpClass(self):
    self.resultdb = database.connect_database(
        'elasticsearch+resultdb://127.0.0.1:9200/?index=test_pyspider_resultdb'
    )
    assert self.resultdb.index == 'test_pyspider_resultdb'
Example #33
@classmethod
def setUpClass(self):
    self.resultdb = database.connect_database(
        'sqlalchemy+postgresql+resultdb://postgres@127.0.0.1/pyspider_test_resultdb'
    )
    self.assertIsNotNone(self, self.resultdb)
    self.tearDownClass()
Example #34
class Handler(BaseHandler):
    crawl_config = {}

    author_projectdb = database.connect_database(
        'elasticsearch+projectdb://127.0.0.1:9200/?index=author')
    flower_projectdb = database.connect_database(
        'elasticsearch+projectdb://127.0.0.1:9200/?index=flower')

    @every(minutes=1)
    def on_start(self):
        self.crawl('http://muchong.com/bbs/space.php?uid=3583297',
                   callback=self.handle_author,
                   cookies={
                       "Hm_lpvt_2207ecfb7b2633a3bc5c4968feb58569":
                       "1522564279",
                       "Hm_lvt_2207ecfb7b2633a3bc5c4968feb58569": "1522564172",
                       "_discuz_pw": "9a1449a8990d49a6",
                       "_discuz_uid": "3302227",
                       "_emuch_index": "1",
                       "_ga": "GA1.2.1902872401.1522564172",
                       "_gat": "1"
                   })

    @config(age=1)
    def handle_author(self, response):
        hl_md5 = hashlib.md5()
        # Author info dict
        author = {}
        # Get the whole doc
        context = response.doc
        # Find the basic info: the table with class 'userinfo base'
        basic_information = context("table.userinfo.base")
        # Registration time
        register_time = basic_information("td:eq(0)").text()
        if register_time:
            author["register_time"] = register_time
        # Other basic info (there are three tables with class 'userinfo'; take the second)
        basic_information = context("table.userinfo:eq(1)")
        author["id"] = basic_information("tr:eq(0) td:eq(0)").text()
        author["name"] = context("div.space_index").find("a:eq(0)").text()
        author["sex"] = basic_information("tr:eq(4) td:eq(0)").text()
        birthday_time = basic_information("tr:eq(4) td:eq(2)").text()
        if (not birthday_time == "0000-00-00") and (not birthday_time == ""):
            author["birthday_time"] = birthday_time
        author["coin_num"] = basic_information("tr:eq(1) td:eq(1)").text()
        author["major"] = basic_information("tr:eq(3) td:eq(2)").text()
        author["help_num"] = basic_information("tr:eq(0) td:eq(2)").text()
        author["grant_num"] = basic_information("tr:eq(1) td:eq(2)").text()
        note_num_src = basic_information("tr:eq(2) td:eq(1)").text()
        note_num_desc = basic_information("tr:eq(2) td:eq(1) font").text()
        author["note_num"] = note_num_src.replace(note_num_desc, "").replace(
            "\n", "").replace(" ", "")
        hl_md5.update(author["id"].encode(encoding='utf-8'))
        es_author_id = hl_md5.hexdigest()
        composite_info = context("div.space_index table tr").find(
            "div:last").text()
        # Cut and slice the string (from 听众 onward)
        composite_info = composite_info[composite_info.find("听众"):].split(
            "\xa0")
        # Store by group
        #if len(composite_info) > 0 and not composite_info[0] == "":
        #    author["fans_num"] = composite_info[0][composite_info[0].find(":")+1:].replace(" ", "")
        #    print(author["fans_num"])
        #if len(composite_info) > 1 and not composite_info[1] == "":
        #    author["flower_num"] = composite_info[1][composite_info[1].find(":")+1:].replace(" ","")
        #    print(author["flower_num"])
        # Inspect the red flowers received
        flowers = context("table.userinfo:eq(2)").find("table")("tr td")
        # flower_num = 0
        for flower_row in flowers.items():
            hl_md5_flower = hashlib.md5()
            flower = {}
            flower["owner_id"] = author["id"]
            flower["owner_name"] = author["name"]
            flower["sender_name"] = flower_row("a").text()
            flower_num = flower_row("font").text()[1:-1]
            flower["flower_num"] = "1" if flower_num == "" else flower_num
            raw_index_id = flower["owner_id"] + flower["sender_name"]
            hl_md5_flower.update(raw_index_id.encode(encoding='utf-8'))
            flower_es_id = hl_md5_flower.hexdigest()
            #flower_num = flower_num+ int(flower["flower_num"])
            print(flower_es_id)
            print(flower)
            self.flower_projectdb.es.index("flower", "project", flower,
                                           flower_es_id)
        #author["flower_num"] = flower_num
        self.author_projectdb.es.index("author", "project", author,
                                       es_author_id)
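The es attribute the handlers write through is the raw elasticsearch-py client behind pyspider's elasticsearch backend. Assuming the 2.x client, whose signature is index(index, doc_type, body, id=None), the positional calls above map onto the keyword form sketched here:

# db stands for one of the handler's *_projectdb connections, e.g.
# db = database.connect_database('elasticsearch+projectdb://127.0.0.1:9200/?index=author')
def save_author(db, author, es_author_id):
    # Same call as above, with the positional arguments named.
    db.es.index(index='author', doc_type='project', body=author, id=es_author_id)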
Example #35
def cli(ctx, **kwargs):
    """
    A powerful spider system in Python.
    """
    if kwargs['add_sys_path']:
        sys.path.append(os.getcwd())

    logging.config.fileConfig(kwargs['logging_config'])

    # get db from env
    for db in ('taskdb', 'projectdb', 'resultdb'):
        if kwargs[db] is not None:
            continue
        if os.environ.get('MYSQL_NAME'):
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'sqlalchemy+mysql+{0!s}://{1!s}:{2!s}/{3!s}'.format(
                    db, os.environ['MYSQL_PORT_3306_TCP_ADDR'],
                    os.environ['MYSQL_PORT_3306_TCP_PORT'], db)))
        elif os.environ.get('MONGODB_NAME'):
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'mongodb+{0!s}://{1!s}:{2!s}/{3!s}'.format(
                    db, os.environ['MONGODB_PORT_27017_TCP_ADDR'],
                    os.environ['MONGODB_PORT_27017_TCP_PORT'], db)))
        elif ctx.invoked_subcommand == 'bench':
            if kwargs['data_path'] == './data':
                kwargs['data_path'] += '/bench'
                shutil.rmtree(kwargs['data_path'], ignore_errors=True)
                os.mkdir(kwargs['data_path'])
            if db in ('taskdb', 'resultdb'):
                kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+{0!s}://'.format((db))))
            else:
                kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+{0!s}:///{1!s}/{2!s}.db'.format(
                    db, kwargs['data_path'], db[:-2])))
        else:
            if not os.path.exists(kwargs['data_path']):
                os.mkdir(kwargs['data_path'])
            kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+{0!s}:///{1!s}/{2!s}.db'.format(
                db, kwargs['data_path'], db[:-2])))
            kwargs['is_{0!s}_default'.format(db)] = True

    # create folder for counter.dump
    if not os.path.exists(kwargs['data_path']):
        os.mkdir(kwargs['data_path'])

    # message queue, compatible with old version
    if kwargs.get('message_queue'):
        pass
    elif kwargs.get('amqp_url'):
        kwargs['message_queue'] = kwargs['amqp_url']
    elif os.environ.get('RABBITMQ_NAME'):
        kwargs['message_queue'] = ("amqp://*****:*****@%(RABBITMQ_PORT_5672_TCP_ADDR)s"
                                   ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ)
    elif kwargs.get('beanstalk'):
        kwargs['message_queue'] = "beanstalk://{0!s}/".format(kwargs['beanstalk'])

    for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                 'fetcher2processor', 'processor2result'):
        if kwargs.get('message_queue'):
            kwargs[name] = utils.Get(lambda name=name: connect_message_queue(
                name, kwargs.get('message_queue'), kwargs['queue_maxsize']))
        else:
            kwargs[name] = connect_message_queue(name, kwargs.get('message_queue'),
                                                 kwargs['queue_maxsize'])

    # phantomjs-proxy
    if kwargs.get('phantomjs_proxy'):
        pass
    elif os.environ.get('PHANTOMJS_NAME'):
        kwargs['phantomjs_proxy'] = os.environ['PHANTOMJS_PORT_25555_TCP'][len('tcp://'):]

    ctx.obj = utils.ObjectDict(ctx.obj or {})
    ctx.obj['instances'] = []
    ctx.obj.update(kwargs)

    if ctx.invoked_subcommand is None and not ctx.obj.get('testing_mode'):
        ctx.invoke(all)
    return ctx
Example #36
@classmethod
def setUpClass(self):
    self.taskdb = database.connect_database(
        'elasticsearch+taskdb://127.0.0.1:9200/?index=test_pyspider_taskdb'
    )
    self.assertIsNotNone(self, self.taskdb)
    assert self.taskdb.index == 'test_pyspider_taskdb'
Example #37
def one(ctx, interactive, enable_phantomjs, scripts):
    """
    One mode not only means all-in-one: it runs everything in one process over
    tornado.ioloop, for debugging purposes
    """

    ctx.obj['debug'] = False
    g = ctx.obj
    g['testing_mode'] = True

    if scripts:
        from pyspider.database.local.projectdb import ProjectDB
        g['projectdb'] = ProjectDB(scripts)
        if g.get('is_taskdb_default'):
            g['taskdb'] = connect_database('sqlite+taskdb://')
        if g.get('is_resultdb_default'):
            g['resultdb'] = None

    if enable_phantomjs:
        phantomjs_config = g.config.get('phantomjs', {})
        phantomjs_obj = ctx.invoke(phantomjs, **phantomjs_config)
        if phantomjs_obj:
            g.setdefault('phantomjs_proxy', 'localhost:%s' % phantomjs_obj.port)
    else:
        phantomjs_obj = None

    result_worker_config = g.config.get('result_worker', {})
    if g.resultdb is None:
        result_worker_config.setdefault('result_cls',
                                        'pyspider.result.OneResultWorker')
    result_worker_obj = ctx.invoke(result_worker, **result_worker_config)

    processor_config = g.config.get('processor', {})
    processor_config.setdefault('enable_stdout_capture', False)
    processor_obj = ctx.invoke(processor, **processor_config)

    fetcher_config = g.config.get('fetcher', {})
    fetcher_config.setdefault('xmlrpc', False)
    fetcher_obj = ctx.invoke(fetcher, **fetcher_config)

    scheduler_config = g.config.get('scheduler', {})
    scheduler_config.setdefault('xmlrpc', False)
    scheduler_config.setdefault('scheduler_cls',
                                'pyspider.scheduler.OneScheduler')
    scheduler_obj = ctx.invoke(scheduler, **scheduler_config)

    scheduler_obj.init_one(ioloop=fetcher_obj.ioloop,
                           fetcher=fetcher_obj,
                           processor=processor_obj,
                           result_worker=result_worker_obj,
                           interactive=interactive)
    if scripts:
        for project in g.projectdb.projects:
            scheduler_obj.trigger_on_start(project)

    try:
        scheduler_obj.run()
    finally:
        scheduler_obj.quit()
        if phantomjs_obj:
            phantomjs_obj.quit()
Example #38
@classmethod
def setUpClass(self):
    self.taskdb = database.connect_database(
        'sqlalchemy+postgresql+taskdb://postgres@127.0.0.1:5432/pyspider_test_taskdb'
    )
    self.tearDownClass()
Example #39
def connect_db(ctx, param, value):
    if not value:
        return
    return utils.Get(lambda: connect_database(value))
Example #40
@classmethod
def setUpClass(self):
    self.taskdb = database.connect_database(
        'elasticsearch+taskdb://127.0.0.1:9200/?index=test_pyspider'
    )
Example #41
def cli(ctx, **kwargs):
    """
    A powerful spider system in Python.
    """
    if kwargs['add_sys_path']:
        sys.path.append(os.getcwd())

    logging.config.fileConfig(kwargs['logging_config'])

    # get db from env
    for db in ('taskdb', 'projectdb', 'resultdb'):
        if kwargs[db] is not None:
            continue
        if os.environ.get('MYSQL_NAME'):
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'sqlalchemy+mysql+%s://%s:%s/%s' % (
                    db, os.environ['MYSQL_PORT_3306_TCP_ADDR'],
                    os.environ['MYSQL_PORT_3306_TCP_PORT'], db)))
        elif os.environ.get('MONGODB_NAME'):
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'mongodb+%s://%s:%s/%s' % (
                    db, os.environ['MONGODB_PORT_27017_TCP_ADDR'],
                    os.environ['MONGODB_PORT_27017_TCP_PORT'], db)))
        elif ctx.invoked_subcommand == 'bench':
            if kwargs['data_path'] == './data':
                kwargs['data_path'] += '/bench'
                shutil.rmtree(kwargs['data_path'], ignore_errors=True)
                os.mkdir(kwargs['data_path'])
            if db in ('taskdb', 'resultdb'):
                kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s://' % (db)))
            elif db in ('projectdb', ):
                kwargs[db] = utils.Get(lambda db=db: connect_database('local+%s://%s' % (
                    db, os.path.join(os.path.dirname(__file__), 'libs/bench.py'))))
        else:
            if not os.path.exists(kwargs['data_path']):
                os.mkdir(kwargs['data_path'])
            kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % (
                db, kwargs['data_path'], db[:-2])))
            kwargs['is_%s_default' % db] = True

    # create folder for counter.dump
    if not os.path.exists(kwargs['data_path']):
        os.mkdir(kwargs['data_path'])

    # message queue, compatible with old version
    if kwargs.get('message_queue'):
        pass
    elif kwargs.get('amqp_url'):
        kwargs['message_queue'] = kwargs['amqp_url']
    elif os.environ.get('RABBITMQ_NAME'):
        kwargs['message_queue'] = ("amqp://*****:*****@%(RABBITMQ_PORT_5672_TCP_ADDR)s"
                                   ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ)
    elif kwargs.get('beanstalk'):
        kwargs['message_queue'] = "beanstalk://%s/" % kwargs['beanstalk']

    for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                 'fetcher2processor', 'processor2result'):
        if kwargs.get('message_queue'):
            kwargs[name] = utils.Get(lambda name=name: connect_message_queue(
                name, kwargs.get('message_queue'), kwargs['queue_maxsize']))
        else:
            kwargs[name] = connect_message_queue(name, kwargs.get('message_queue'),
                                                 kwargs['queue_maxsize'])

    # phantomjs-proxy
    if kwargs.get('phantomjs_proxy'):
        pass
    elif os.environ.get('PHANTOMJS_NAME'):
        kwargs['phantomjs_proxy'] = os.environ['PHANTOMJS_PORT_25555_TCP'][len('tcp://'):]

    # puppeteer-proxy
    if kwargs.get('puppeteer_proxy'):
        pass
    elif os.environ.get('PUPPETEER_NAME'):
        kwargs['puppeteer_proxy'] = os.environ['PUPPETEER_PORT_22222_TCP'][len('tcp://'):]

    ctx.obj = utils.ObjectDict(ctx.obj or {})
    ctx.obj['instances'] = []
    ctx.obj.update(kwargs)

    if ctx.invoked_subcommand is None and not ctx.obj.get('testing_mode'):
        ctx.invoke(all)
    return ctx
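The PHANTOMJS_PORT_25555_TCP / PUPPETEER_PORT_22222_TCP values follow Docker's legacy container-link format, tcp://<addr>:<port>, so slicing off the scheme leaves a ready host:port pair:

import os

# Illustrative Docker legacy-link value.
os.environ['PUPPETEER_PORT_22222_TCP'] = 'tcp://172.17.0.3:22222'
proxy = os.environ['PUPPETEER_PORT_22222_TCP'][len('tcp://'):]
assert proxy == '172.17.0.3:22222'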
Example #42
def cli(ctx, **kwargs):
    """
    A powerful spider system in Python.
    """
    logging.config.fileConfig(kwargs['logging_config'])

    # get db from env
    for db in ('taskdb', 'projectdb', 'resultdb'):
        if kwargs[db] is not None:
            continue
        if os.environ.get('MYSQL_NAME'):
            kwargs[db] = utils.Get(
                lambda db=db: connect_database('mysql+%s://%s:%s/%s' % (
                    db, os.environ['MYSQL_PORT_3306_TCP_ADDR'], os.environ[
                        'MYSQL_PORT_3306_TCP_PORT'], db)))
        elif os.environ.get('MONGODB_NAME'):
            kwargs[db] = utils.Get(
                lambda db=db: connect_database('mongodb+%s://%s:%s/%s' % (
                    db, os.environ['MONGODB_PORT_27017_TCP_ADDR'], os.environ[
                        'MONGODB_PORT_27017_TCP_PORT'], db)))
        elif ctx.invoked_subcommand == 'bench':
            if kwargs['data_path'] == './data':
                kwargs['data_path'] += '/bench'
                shutil.rmtree(kwargs['data_path'], ignore_errors=True)
                os.mkdir(kwargs['data_path'])
            if db in ('taskdb', 'resultdb'):
                kwargs[db] = utils.Get(
                    lambda db=db: connect_database('sqlite+%s://' % (db)))
            else:
                kwargs[db] = utils.Get(
                    lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % (
                        db, kwargs['data_path'], db[:-2])))
        else:
            if not os.path.exists(kwargs['data_path']):
                os.mkdir(kwargs['data_path'])
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'sqlite+%s:///%s/%s.db' % (db, kwargs['data_path'], db[:-2])))
            kwargs['is_%s_default' % db] = True

    # create folder for counter.dump
    if not os.path.exists(kwargs['data_path']):
        os.mkdir(kwargs['data_path'])

    # queue
    if kwargs.get('amqp_url'):
        from pyspider.libs.rabbitmq import Queue
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = utils.Get(
                lambda name=name: Queue(name,
                                        amqp_url=kwargs['amqp_url'],
                                        maxsize=kwargs['queue_maxsize']))
    elif os.environ.get('RABBITMQ_NAME'):
        from pyspider.libs.rabbitmq import Queue
        amqp_url = ("amqp://*****:*****@%(RABBITMQ_PORT_5672_TCP_ADDR)s"
                    ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ)
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = utils.Get(lambda name=name: Queue(
                name, amqp_url=amqp_url, maxsize=kwargs['queue_maxsize']))
    elif kwargs.get('beanstalk'):
        from pyspider.libs.beanstalk import Queue
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = utils.Get(
                lambda name=name: Queue(name,
                                        host=kwargs.get('beanstalk'),
                                        maxsize=kwargs['queue_maxsize']))
    else:
        from multiprocessing import Queue
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = Queue(kwargs['queue_maxsize'])

    # phantomjs-proxy
    if kwargs.get('phantomjs_proxy'):
        pass
    elif os.environ.get('PHANTOMJS_NAME'):
        kwargs['phantomjs_proxy'] = os.environ['PHANTOMJS_PORT_25555_TCP'][
            len('tcp://'):]

    ctx.obj = utils.ObjectDict(ctx.obj or {})
    ctx.obj['instances'] = []
    ctx.obj.update(kwargs)

    if ctx.invoked_subcommand is None and not ctx.obj.get('testing_mode'):
        ctx.invoke(all)
    return ctx