def resultdb_migrating(project, from_connection, to_connection):
    logging.info("resultdb: %s", project)
    f = connect_database(from_connection)
    t = connect_database(to_connection)
    t.drop(project)
    for result in f.select(project):
        t.save(project, result['taskid'], result['url'], result['result'])
def taskdb_migrating(project, from_connection, to_connection):
    logging.info("taskdb: %s", project)
    f = connect_database(from_connection)
    t = connect_database(to_connection)
    t.drop(project)
    for status in range(1, 5):
        for task in f.load_tasks(status, project=project):
            t.insert(project, task['taskid'], task)
def cli(ctx, **kwargs):
    """
    A powerful spider system in python.
    """
    logging.config.fileConfig(os.path.join(os.path.dirname(__file__), "logging.conf"))

    # get db from env
    for db in ('taskdb', 'projectdb', 'resultdb'):
        if kwargs[db] is not None:
            continue
        if os.environ.get('MYSQL_NAME'):
            kwargs[db] = Get(lambda db=db: connect_database('mysql+%s://%s:%s/%s' % (
                db, os.environ['MYSQL_PORT_3306_TCP_ADDR'],
                os.environ['MYSQL_PORT_3306_TCP_PORT'], db)))
        elif os.environ.get('MONGODB_NAME'):
            kwargs[db] = Get(lambda db=db: connect_database('mongodb+%s://%s:%s/%s' % (
                db, os.environ['MONGODB_PORT_27017_TCP_ADDR'],
                os.environ['MONGODB_PORT_27017_TCP_PORT'], db)))
        else:
            if not os.path.exists(kwargs['data_path']):
                os.mkdir(kwargs['data_path'])
            kwargs[db] = Get(lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % (
                db, kwargs['data_path'], db[:-2])))

    # queue
    if kwargs.get('amqp_url'):
        from pyspider.libs.rabbitmq import Queue
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = Get(lambda name=name: Queue(
                name, amqp_url=kwargs['amqp_url'], maxsize=kwargs['queue_maxsize']))
    elif os.environ.get('RABBITMQ_NAME'):
        from pyspider.libs.rabbitmq import Queue
        amqp_url = ("amqp://*****:*****@%(RABBITMQ_PORT_5672_TCP_ADDR)s"
                    ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ)
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = Get(lambda name=name: Queue(
                name, amqp_url=amqp_url, maxsize=kwargs['queue_maxsize']))
    else:
        from multiprocessing import Queue
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = Queue(kwargs['queue_maxsize'])

    # phantomjs-proxy
    if kwargs.get('phantomjs_proxy'):
        pass
    elif os.environ.get('PHANTOMJS_NAME'):
        kwargs['phantomjs_proxy'] = os.environ['PHANTOMJS_PORT'][len('tcp://'):]

    ctx.obj['instances'] = []
    ctx.obj.update(kwargs)

    if ctx.invoked_subcommand is None and not ctx.obj.get('testing_mode'):
        ctx.invoke(all)
    return ctx
class g(object):
    taskdb = Get(
        lambda: connect_database('sqlite+taskdb:///data/tests/task.db'))
    projectdb = Get(
        lambda: connect_database('sqlite+projectdb:///data/tests/project.db'))
    resultdb = Get(
        lambda: connect_database('sqlite+resultdb:///data/tests/result.db'))
    newtask_queue = Queue(100)
    status_queue = Queue(100)
    scheduler2fetcher = Queue(100)
    fetcher2processor = Queue(100)
    processor2result = Queue(100)
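The Get(...) wrappers used throughout these snippets defer the connect_database call until the attribute is actually read. A minimal sketch of such a lazy wrapper, assuming pyspider's utils.Get behaves roughly like a non-data descriptor (an illustration, not necessarily the project's exact implementation):

class Get(object):
    # Lazy attribute: re-evaluate the wrapped getter on every access, so the
    # database connection is only opened when (and each time) it is used.
    def __init__(self, getter):
        self.getter = getter

    def __get__(self, instance, owner):
        return self.getter()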
def setUpClass(self):
    # create a test admin user
    import requests
    requests.put('http://localhost:5984/_node/_local/_config/admins/test',
                 data='"password"')
    os.environ["COUCHDB_USER"] = "******"
    os.environ["COUCHDB_PASSWORD"] = "******"

    self.taskdb = database.connect_database(
        'couchdb+taskdb://localhost:5984/'
    )
    self.assertIsNotNone(self, self.taskdb)
def migrate(pool, from_connection, to_connection):
    """
    Migrate tool for pyspider
    """
    f = connect_database(from_connection)
    t = connect_database(to_connection)

    if isinstance(f, ProjectDB):
        for each in f.get_all():
            each = unicode_obj(each)
            logging.info("projectdb: %s", each['name'])
            t.drop(each['name'])
            t.insert(each['name'], each)
    elif isinstance(f, TaskDB):
        pool = Pool(pool)
        pool.map(
            lambda x, f=from_connection, t=to_connection: taskdb_migrating(x, f, t),
            f.projects)
    elif isinstance(f, ResultDB):
        pool = Pool(pool)
        pool.map(
            lambda x, f=from_connection, t=to_connection: resultdb_migrating(x, f, t),
            f.projects)
def connect_db(ctx, param, value):
    if value is None:
        return
    return Get(lambda: connect_database(value))
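The (ctx, param, value) signature above matches a click option callback, which is how the cli() snippets in this section appear to wire their database options. A minimal usage sketch, assuming connect_db, Get and connect_database are in scope as defined here (the command and option names are illustrative):

import click

@click.command()
@click.option('--taskdb', callback=connect_db,
              help='database url, e.g. sqlite+taskdb:///data/task.db')
def main(taskdb):
    # taskdb is None when the option is omitted, otherwise the lazy wrapper
    # returned by connect_db
    print(taskdb)

if __name__ == '__main__':
    main()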
def setUpClass(self):
    self.resultdb = database.connect_database(
        'sqlalchemy+postgresql+resultdb://[email protected]/pyspider_test_resultdb'
    )
def setUpClass(self):
    self.resultdb = database.connect_database(
        'sqlalchemy+sqlite+resultdb://'
    )
def setUpClass(self):
    self.taskdb = database.connect_database(
        'sqlalchemy+sqlite+taskdb://'
    )
def setUpClass(self):
    self.resultdb = database.connect_database(
        'mongodb+resultdb://localhost/pyspider_test_resultdb'
    )
def setUpClass(self):
    self.taskdb = database.connect_database(
        'mongodb+taskdb://localhost:27017/pyspider_test_taskdb'
    )
def setUpClass(self):
    self.resultdb = database.connect_database(
        'mysql+resultdb://localhost/pyspider_test_resultdb'
    )
    self.assertIsNotNone(self, self.resultdb)
def setUpClass(self):
    self.projectdb = database.connect_database('sqlite+projectdb://')
    self.assertIsNotNone(self, self.projectdb)
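The in-memory sqlite URLs used by these tests are handy for quick experiments; a minimal sketch of driving a resultdb the same way resultdb_migrating() at the top of this section does (the project name and payload are illustrative, and save() is assumed to create the project's table on first use):

from pyspider.database import connect_database

resultdb = connect_database('sqlite+resultdb://')   # in-memory, as in the tests above

# save(project, taskid, url, result) / select(project) mirror the calls in resultdb_migrating()
resultdb.save('demo_project', 'task-1', 'http://example.com/', {'title': 'demo'})
for row in resultdb.select('demo_project'):
    print(row['taskid'], row['url'], row['result'])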
def one(ctx, interactive, enable_phantomjs, scripts):
    """
    One mode not only means all-in-one, it runs everything in one process over
    tornado.ioloop, for debug purpose
    """
    ctx.obj["debug"] = False
    g = ctx.obj
    g["testing_mode"] = True

    if scripts:
        from pyspider.database.local.projectdb import ProjectDB
        g["projectdb"] = ProjectDB(scripts)
        if g.get("is_taskdb_default"):
            g["taskdb"] = connect_database("sqlite+taskdb://")
        if g.get("is_resultdb_default"):
            g["resultdb"] = None

    if enable_phantomjs:
        phantomjs_config = g.config.get("phantomjs", {})
        phantomjs_obj = ctx.invoke(phantomjs, **phantomjs_config)
        if phantomjs_obj:
            g.setdefault("phantomjs_proxy", "127.0.0.1:%s" % phantomjs_obj.port)
    else:
        phantomjs_obj = None

    result_worker_config = g.config.get("result_worker", {})
    if g.resultdb is None:
        result_worker_config.setdefault("result_cls", "pyspider.result.OneResultWorker")
    result_worker_obj = ctx.invoke(result_worker, **result_worker_config)

    processor_config = g.config.get("processor", {})
    processor_config.setdefault("enable_stdout_capture", False)
    processor_obj = ctx.invoke(processor, **processor_config)

    fetcher_config = g.config.get("fetcher", {})
    fetcher_config.setdefault("xmlrpc", False)
    fetcher_obj = ctx.invoke(fetcher, **fetcher_config)

    scheduler_config = g.config.get("scheduler", {})
    scheduler_config.setdefault("xmlrpc", False)
    scheduler_config.setdefault("scheduler_cls", "pyspider.scheduler.OneScheduler")
    scheduler_obj = ctx.invoke(scheduler, **scheduler_config)

    scheduler_obj.init_one(
        ioloop=fetcher_obj.ioloop,
        fetcher=fetcher_obj,
        processor=processor_obj,
        result_worker=result_worker_obj,
        interactive=interactive,
    )
    if scripts:
        for project in g.projectdb.projects:
            scheduler_obj.trigger_on_start(project)

    try:
        scheduler_obj.run()
    finally:
        scheduler_obj.quit()
        if phantomjs_obj:
            phantomjs_obj.quit()
def cli(ctx, **kwargs):
    """
    A powerful spider system in python.
    """
    if kwargs["add_sys_path"]:
        sys.path.append(os.getcwd())

    logging.config.fileConfig(kwargs["logging_config"])

    # get db from env
    for db in ("taskdb", "projectdb", "resultdb"):
        if kwargs[db] is not None:
            continue
        if os.environ.get("MYSQL_NAME"):
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                "sqlalchemy+mysql+%s://%s:%s/%s" % (
                    db, os.environ["MYSQL_PORT_3306_TCP_ADDR"],
                    os.environ["MYSQL_PORT_3306_TCP_PORT"], db)))
        elif os.environ.get("MONGODB_NAME"):
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                "mongodb+%s://%s:%s/%s" % (
                    db, os.environ["MONGODB_PORT_27017_TCP_ADDR"],
                    os.environ["MONGODB_PORT_27017_TCP_PORT"], db)))
        elif ctx.invoked_subcommand == "bench":
            if kwargs["data_path"] == "./data":
                kwargs["data_path"] += "/bench"
                shutil.rmtree(kwargs["data_path"], ignore_errors=True)
                os.mkdir(kwargs["data_path"])
            if db in ("taskdb", "resultdb"):
                kwargs[db] = utils.Get(lambda db=db: connect_database("sqlite+%s://" % (db)))
            else:
                kwargs[db] = utils.Get(lambda db=db: connect_database(
                    "sqlite+%s:///%s/%s.db" % (db, kwargs["data_path"], db[:-2])))
        else:
            if not os.path.exists(kwargs["data_path"]):
                os.mkdir(kwargs["data_path"])
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                "sqlite+%s:///%s/%s.db" % (db, kwargs["data_path"], db[:-2])))
            kwargs["is_%s_default" % db] = True

    # create folder for counter.dump
    if not os.path.exists(kwargs["data_path"]):
        os.mkdir(kwargs["data_path"])

    # message queue, compatible with old version
    if kwargs.get("message_queue"):
        pass
    elif kwargs.get("amqp_url"):
        kwargs["message_queue"] = kwargs["amqp_url"]
    elif os.environ.get("RABBITMQ_NAME"):
        kwargs["message_queue"] = ("amqp://*****:*****@%(RABBITMQ_PORT_5672_TCP_ADDR)s"
                                   ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ)
    elif kwargs.get("beanstalk"):
        kwargs["message_queue"] = "beanstalk://%s/" % kwargs["beanstalk"]

    for name in ("newtask_queue", "status_queue", "scheduler2fetcher",
                 "fetcher2processor", "processor2result"):
        if kwargs.get("message_queue"):
            kwargs[name] = utils.Get(lambda name=name: connect_message_queue(
                name, kwargs.get("message_queue"), kwargs["queue_maxsize"]))
        else:
            kwargs[name] = connect_message_queue(name, kwargs.get("message_queue"),
                                                 kwargs["queue_maxsize"])

    # phantomjs-proxy
    if kwargs.get("phantomjs_proxy"):
        pass
    elif os.environ.get("PHANTOMJS_NAME"):
        kwargs["phantomjs_proxy"] = os.environ["PHANTOMJS_PORT_25555_TCP"][len("tcp://"):]

    ctx.obj = utils.ObjectDict(ctx.obj or {})
    ctx.obj["instances"] = []
    ctx.obj.update(kwargs)

    if ctx.invoked_subcommand is None and not ctx.obj.get("testing_mode"):
        ctx.invoke(all)
    return ctx
conn = psycopg2.connect(database="resultdb", user="******", password="", host="", port="")
cur = conn.cursor()

from pyspider.database import connect_database
resultdb = connect_database("sqlalchemy+postgresql+resultdb://postgres:@10.1.36.183:5432/resultdb")

# result = resultdb.select('test6').next()
# row_result = result['result']
# url = row_result['wages_and_employment_content']
# print type(url), url


# Scrape the data
# Scrape the related data and store it in the database
# Follow the further links and return a list; each item is a dict (with the related content)
def get_more_touchs(list_content, types):
    for each in list_content:
        url = each[0]
        try:
            r = requests.get(url, headers=header)
            soup = BeautifulSoup(r.text)
            websites = {
                "detailed_work_activities": "/search/dwa/compare/.*?g=Continue",
                "work_context": "^/find/descriptor/result/.*?",
                "work_values_content": "^/explore/workvalues/.*?",
                "work_styles_content": "^/find/descriptor/result/.*?",
                "work_activities": "^/find/descriptor/result/.*?",
                "skills_content": "^/find/descriptor/result/.*?",
                "knowledge_content": "^/find/descriptor/result/.*?",
                "interests": "^/explore/interests/.*?",
                "abilities": "^/explore/interests/.*?"
def one(ctx, interactive, enable_phantomjs, scripts):
    """
    One mode not only means all-in-one, it runs everything in one process over
    tornado.ioloop, for debug purpose

    * webui is not running in one mode.
    * SCRIPTS is the script file path of project
      - when set, taskdb and resultdb will use an in-memory sqlite db by default
      - when set, on_start callback will be triggered on start
    * the status of project is always RUNNING.
    * rate and burst can be set in script with comments like:

      # rate: 1.0
      # burst: 3
    """
    ctx.obj['debug'] = False
    g = ctx.obj
    g['testing_mode'] = True

    if scripts:
        from pyspider.database.local.projectdb import ProjectDB
        g['projectdb'] = ProjectDB(scripts)
        if g.get('is_taskdb_default'):
            g['taskdb'] = connect_database('sqlite+taskdb://')
        if g.get('is_resultdb_default'):
            g['resultdb'] = connect_database('sqlite+resultdb://')

    if enable_phantomjs:
        phantomjs_config = g.config.get('phantomjs', {})
        phantomjs_obj = ctx.invoke(phantomjs, **phantomjs_config)
        if phantomjs_obj:
            g.setdefault('phantomjs_proxy', 'localhost:%s' % phantomjs_obj.port)
    else:
        phantomjs_obj = None

    result_worker_config = g.config.get('result_worker', {})
    result_worker_obj = ctx.invoke(result_worker, **result_worker_config)

    processor_config = g.config.get('processor', {})
    processor_obj = ctx.invoke(processor, **processor_config)

    fetcher_config = g.config.get('fetcher', {})
    fetcher_config.setdefault('xmlrpc', False)
    fetcher_obj = ctx.invoke(fetcher, **fetcher_config)

    scheduler_config = g.config.get('scheduler', {})
    scheduler_config.setdefault('xmlrpc', False)
    scheduler_config.setdefault('scheduler_cls', 'pyspider.scheduler.scheduler.OneScheduler')
    scheduler_obj = ctx.invoke(scheduler, **scheduler_config)

    scheduler_obj.init_one(ioloop=fetcher_obj.ioloop,
                           fetcher=fetcher_obj,
                           processor=processor_obj,
                           result_worker=result_worker_obj,
                           interactive=interactive)
    if scripts:
        for project in g.projectdb.projects:
            scheduler_obj.trigger_on_start(project)

    try:
        scheduler_obj.run()
    except KeyboardInterrupt:
        scheduler_obj.quit()
        if phantomjs_obj:
            phantomjs_obj.quit()
        raise
def cli(ctx, **kwargs):
    """
    A powerful spider system in python.
    """
    logging.config.fileConfig(os.path.join(os.path.dirname(__file__), "logging.conf"))

    # get db from env
    for db in ("taskdb", "projectdb", "resultdb"):
        if kwargs[db] is not None:
            continue
        if os.environ.get("MYSQL_NAME"):
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                "mysql+%s://%s:%s/%s" % (
                    db, os.environ["MYSQL_PORT_3306_TCP_ADDR"],
                    os.environ["MYSQL_PORT_3306_TCP_PORT"], db)))
        elif os.environ.get("MONGODB_NAME"):
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                "mongodb+%s://%s:%s/%s" % (
                    db, os.environ["MONGODB_PORT_27017_TCP_ADDR"],
                    os.environ["MONGODB_PORT_27017_TCP_PORT"], db)))
        elif ctx.invoked_subcommand == "bench":
            if kwargs["data_path"] == "./data":
                kwargs["data_path"] += "/bench"
                shutil.rmtree(kwargs["data_path"], ignore_errors=True)
                os.mkdir(kwargs["data_path"])
            if db in ("taskdb", "resultdb"):
                kwargs[db] = utils.Get(lambda db=db: connect_database("sqlite+%s://" % (db)))
            else:
                kwargs[db] = utils.Get(lambda db=db: connect_database(
                    "sqlite+%s:///%s/%s.db" % (db, kwargs["data_path"], db[:-2])))
        else:
            if not os.path.exists(kwargs["data_path"]):
                os.mkdir(kwargs["data_path"])
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                "sqlite+%s:///%s/%s.db" % (db, kwargs["data_path"], db[:-2])))

    # queue
    if kwargs.get("amqp_url"):
        from pyspider.libs.rabbitmq import Queue
        for name in ("newtask_queue", "status_queue", "scheduler2fetcher",
                     "fetcher2processor", "processor2result"):
            kwargs[name] = utils.Get(lambda name=name: Queue(
                name, amqp_url=kwargs["amqp_url"], maxsize=kwargs["queue_maxsize"]))
    elif os.environ.get("RABBITMQ_NAME"):
        from pyspider.libs.rabbitmq import Queue
        amqp_url = ("amqp://*****:*****@%(RABBITMQ_PORT_5672_TCP_ADDR)s"
                    ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ)
        for name in ("newtask_queue", "status_queue", "scheduler2fetcher",
                     "fetcher2processor", "processor2result"):
            kwargs[name] = utils.Get(lambda name=name: Queue(
                name, amqp_url=amqp_url, maxsize=kwargs["queue_maxsize"]))
    else:
        from multiprocessing import Queue
        for name in ("newtask_queue", "status_queue", "scheduler2fetcher",
                     "fetcher2processor", "processor2result"):
            kwargs[name] = Queue(kwargs["queue_maxsize"])

    # phantomjs-proxy
    if kwargs.get("phantomjs_proxy"):
        pass
    elif os.environ.get("PHANTOMJS_NAME"):
        kwargs["phantomjs_proxy"] = os.environ["PHANTOMJS_PORT"][len("tcp://"):]

    ctx.obj = utils.ObjectDict(ctx.obj or {})
    ctx.obj["instances"] = []
    ctx.obj.update(kwargs)

    if ctx.invoked_subcommand is None and not ctx.obj.get("testing_mode"):
        ctx.invoke(all)
    return ctx
def setUpClass(self):
    self.projectdb = database.connect_database(
        'sqlalchemy+mysql+mysqlconnector+projectdb://root@localhost/pyspider_test_projectdb'
    )
    self.assertIsNotNone(self, self.projectdb)
def setUpClass(self):
    self.taskdb = database.connect_database('mysql+taskdb://localhost/pyspider_test_taskdb')
def setUpClass(self):
    self.resultdb = database.connect_database(
        'sqlalchemy+sqlite+resultdb://'
    )
    self.assertIsNotNone(self, self.resultdb)
def setUpClass(self):
    self.projectdb = database.connect_database(
        'mongodb+projectdb://localhost/pyspider_test_projectdb'
    )
class Handler(BaseHandler):
    crawl_config = {}
    author_projectdb = database.connect_database(
        'elasticsearch+projectdb://127.0.0.1:9200/?index=author')
    note_projectdb = database.connect_database(
        'elasticsearch+projectdb://127.0.0.1:9200/?index=note')
    flower_projectdb = database.connect_database(
        'elasticsearch+projectdb://127.0.0.1:9200/?index=flower')

    # Parse a post's content area: extract the quoted reply's target_id, the post
    # content, and the client the post was sent from
    def handle_content(self, content_area, building_id):
        hl_md5 = hashlib.md5()
        # Raw post content; a quoted reply looks like:
        # "引用回帖:4楼:Originallypostedby含笑木香at2018-04-0321:08:49 ... 发自小木虫IOS客户端"
        raw_content = content_area("div[class='t_fsz']").find(
            "td:eq(0)").text().replace("\n", "").replace(" ", "")
        if (raw_content == ""):
            target_id = ""
            content = ""
            device = ""
            return target_id, content, device
        else:
            # Handle posts that quote an earlier reply
            if raw_content.startswith("引用回帖"):
                # Extract the date inside the quoted reply
                maybe_date = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\d{1,2}:\d{1,2}:\d{1,2})", raw_content)
                if not maybe_date:
                    target_id = ""
                    content = ""
                    device = ""
                    return target_id, content, device
                founded_date = maybe_date.group(0)
                # Slice out the quoted part; key1 = "引用回帖:", key2 = "at"
                reference_str = raw_content[raw_content.find("引用回帖:") + 5:
                                            raw_content.find(founded_date) - 2]
                # Concatenate the quoted author's name, the floor and the date into raw_id
                raw_id = reference_str[reference_str.find("Originallypostedby") + 18:] \
                    + founded_date + reference_str[:1]
                hl_md5.update(raw_id.replace(" ", "").encode(encoding='utf-8'))
                target_id = hl_md5.hexdigest()
                # Post content
                content = raw_content[raw_content.find(founded_date) + 18:
                                      raw_content.find("发自小木虫")]
                # Client the post was sent from
                if raw_content.find("发自小木虫") != -1:
                    device = raw_content[raw_content.find("发自小木虫") + 5:
                                         raw_content.find("客户端")]
                else:
                    device = "PC"
                return target_id, content, device
            else:
                target_id = building_id
                content = raw_content[:raw_content.find("发自小木虫")]
                if raw_content.find("发自小木虫") != -1:
                    device = raw_content[raw_content.find("发自小木虫") + 5:
                                         raw_content.find("客户端")]
                else:
                    device = "PC"
                return target_id, content, device

    # Entry point
    @every(minutes=2.5 * 24 * 60)
    def on_start(self):
        self.crawl('http://muchong.com/bbs', callback=self.index_page,
                   cookies={"Hm_lpvt_2207ecfb7b2633a3bc5c4968feb58569": "1522564279",
                            "Hm_lvt_2207ecfb7b2633a3bc5c4968feb58569": "1522564172",
                            "_discuz_pw": "9a1449a8990d49a6", "_discuz_uid": "3302227",
                            "_emuch_index": "1", "_ga": "GA1.2.1902872401.1522564172",
                            "_gat": "1"})

    # age is in seconds
    @config(age=36 * 60 * 60)
    def index_page(self, response):
        context = response.doc
        for each_area in context.find(
                "div.forum_Box.bg_global.xmc_line_lr.xmc_line_bno").items():
            self.handle_first_area(
                each_area("h2 strong").text(), each_area("table"))

    # Handle a first-level board, e.g. 网络生活区
    def handle_first_area(self, first_area_name, second_area_table):
        second_area = second_area_table.find("td")
        for each_second_area in second_area.items():
            second_area_link = each_second_area.find(
                "div.xmc_fl.xmc_forum_width h4.xmc_blue a")
            second_area_name = second_area_link.text()
            second_area_href = second_area_link.attr("href")
            self.handle_second_area(first_area_name, second_area_name,
                                    second_area_href)

    # Handle a second-level board, e.g. 休闲灌水; this lands on the first page of
    # the paginated listing, url: http://muchong.com/f-6-1
    def handle_second_area(self, first_area_name, second_area_name, second_area_href):
        if first_area_name != "" and second_area_name != "" and second_area_href != "":
            self.crawl(second_area_href, callback=self.second_index_page,
                       cookies={"Hm_lpvt_2207ecfb7b2633a3bc5c4968feb58569": "1522564279",
                                "Hm_lvt_2207ecfb7b2633a3bc5c4968feb58569": "1522564172",
                                "_discuz_pw": "9a1449a8990d49a6", "_discuz_uid": "3302227",
                                "_emuch_index": "1", "_ga": "GA1.2.1902872401.1522564172",
                                "_gat": "1"})

    # Collect all threads under a second-level board, page by page (read the total
    # page count first, then build the url of every page to crawl)
    def second_index_page(self, response):
        context = response.doc
        total_page = context.find("td.header:eq(1)").text()
        total_page = total_page[total_page.find("/") + 1:]
        basic_url = response.url
        if not total_page == "":
            total_page = int(total_page)
            if total_page > 200:
                total_page = 200
            # Loop over every page
            for page in range(total_page):
                each_page_url = basic_url[:basic_url.rfind("-") + 1] + str(page + 1)
                self.crawl(each_page_url, callback=self.handle_each_second_index_page,
                           cookies={"Hm_lpvt_2207ecfb7b2633a3bc5c4968feb58569": "1522564279",
                                    "Hm_lvt_2207ecfb7b2633a3bc5c4968feb58569": "1522564172",
                                    "_discuz_pw": "9a1449a8990d49a6", "_discuz_uid": "3302227",
                                    "_emuch_index": "1", "_ga": "GA1.2.1902872401.1522564172",
                                    "_gat": "1"})

    # Crawl every thread under the second-level board
    def handle_each_second_index_page(self, response):
        context = response.doc
        notes_titles = context.find("th.thread-name")
        for each_note in notes_titles.items():
            if each_note is not None:
                self.crawl(each_note("a.a_subject").attr("href"), callback=self.note_index,
                           cookies={"Hm_lpvt_2207ecfb7b2633a3bc5c4968feb58569": "1522564279",
                                    "Hm_lvt_2207ecfb7b2633a3bc5c4968feb58569": "1522564172",
                                    "_discuz_pw": "9a1449a8990d49a6", "_discuz_uid": "3302227",
                                    "_emuch_index": "1", "_ga": "GA1.2.1902872401.1522564172",
                                    "_gat": "1"})

    # Read a thread's total page count and crawl it page by page
    def note_index(self, response):
        context = response.doc
        total_page = context.find("td.header:eq(1)").text()
        total_page = total_page[total_page.find("/") + 1:]
        basic_url = response.url
        basic_url = basic_url[:basic_url.rfind("-") + 1]
        if not total_page == "":
            total_page = int(total_page)
            if total_page > 200:
                total_page = 200
            # Loop over every page
            for page in range(total_page):
                each_page_url = basic_url[:basic_url.rfind("-") + 1] + str(page + 1)
                self.crawl(each_page_url, callback=self.handle_note,
                           cookies={"Hm_lpvt_2207ecfb7b2633a3bc5c4968feb58569": "1522564279",
                                    "Hm_lvt_2207ecfb7b2633a3bc5c4968feb58569": "1522564172",
                                    "_discuz_pw": "9a1449a8990d49a6", "_discuz_uid": "3302227",
                                    "_emuch_index": "1", "_ga": "GA1.2.1902872401.1522564172",
                                    "_gat": "1"})

    def handle_note(self, response):
        request_url = response.url
        building_id = request_url[request_url.rfind("/") + 1:request_url.rfind("-") + 1]
        page_sign = request_url[request_url.rfind("-") + 1:]
        # The whole parsed document
        context = response.doc
        for each_note in context("tbody[id^='pid']").items():
            # A fresh md5 object is needed on every iteration, otherwise the
            # digests would be wrong
            hl_md5 = hashlib.md5()
            note = {}
            author = each_note.find("div.pls_user h3 a")
            # Author link
            author_link = author.attr("href")
            note["author_id"] = author_link[author_link.find("uid=") + 4:]
            author_actual_link = author_link.replace("muchong.com", "muchong.com/bbs")
            # Floor number and creation-time block
            floor_time_area = each_note.find("div[class='pls_info']")
            create_time = floor_time_area("em").text()
            note["create_time"] = create_time
            # Floor 1 is shown as "1楼", floor 2 as "沙发", floor 3 as "板凳",
            # floor 4 onwards as "4楼" etc.
            raw_floor = floor_time_area("span a").text()
            floor = "2楼" if raw_floor == "沙发" else (
                "3楼" if raw_floor == "板凳" else raw_floor)
            note["floor"] = floor[:-1]
            raw_id = author.text() + create_time + floor[:-1]
            hl_md5.update(raw_id.replace(" ", "").encode(encoding='utf-8'))
            note["id"] = hl_md5.hexdigest()
            # Post content area
            content_area = each_note.find(
                "td[class='plc_mind'] div[class='plc_Con']")
            # The first floor carries the thread title
            if note["floor"] == "1":
                note["title"] = content_area("h1").text()
                # On the thread's first page, the first floor uses the thread-wide id
                if int(page_sign) == 1:
                    note["id"] = building_id
            if content_area:
                target_id, content, device = self.handle_content(
                    content_area, building_id)
                note["target_id"] = target_id
                note["content"] = content
                note["device"] = device
            category_names = context.find("span.breadcrumb")
            # First-level category name
            note["first_category_name"] = category_names("a:eq(1)").text()
            # Second-level category name
            note["second_category_name"] = category_names("a:eq(2)").text()
            # Third-level category name
            note["third_category_name"] = category_names("a:eq(3)").text()
            note["building_id"] = building_id
            self.note_projectdb.es.index("note", "project", note, note["id"])
            # Crawl the author's profile
            self.crawl(author_actual_link, callback=self.handle_author,
                       cookies={"Hm_lpvt_2207ecfb7b2633a3bc5c4968feb58569": "1522564279",
                                "Hm_lvt_2207ecfb7b2633a3bc5c4968feb58569": "1522564172",
                                "_discuz_pw": "9a1449a8990d49a6", "_discuz_uid": "3302227",
                                "_emuch_index": "1", "_ga": "GA1.2.1902872401.1522564172",
                                "_gat": "1"})

    def handle_author(self, response):
        hl_md5 = hashlib.md5()
        # Author info dict
        author = {}
        # The whole parsed document
        context = response.doc
        # Basic info: the table with class "userinfo base"
        basic_information = context("table.userinfo.base")
        # Registration time
        register_time = basic_information("td:eq(0)").text()
        if register_time:
            author["register_time"] = register_time
        # Other basic info (there are three tables with class "userinfo"; take the second)
        basic_information = context("table.userinfo:eq(1)")
        author["id"] = basic_information("tr:eq(0) td:eq(0)").text()
        author["name"] = context("div.space_index").find("a:eq(0)").text()
        author["sex"] = basic_information("tr:eq(4) td:eq(0)").text()
        birthday_time = basic_information("tr:eq(4) td:eq(2)").text()
        if (not birthday_time == "0000-00-00") and (not birthday_time == ""):
            author["birthday_time"] = birthday_time
        author["coin_num"] = basic_information("tr:eq(1) td:eq(1)").text()
        author["major"] = basic_information("tr:eq(3) td:eq(2)").text()
        author["help_num"] = basic_information("tr:eq(0) td:eq(2)").text()
        author["grant_num"] = basic_information("tr:eq(1) td:eq(2)").text()
        note_num_src = basic_information("tr:eq(2) td:eq(1)").text()
        note_num_desc = basic_information("tr:eq(2) td:eq(1) font").text()
        author["note_num"] = note_num_src.replace(note_num_desc, "").replace(
            "\n", "").replace(" ", "")
        composite_info = context("div.space_index table tr").find(
            "div:last").text()
        # Slice the string
        composite_info = composite_info[composite_info.find("听众"):].split(
            "\xa0")
        # Store in groups
        # if len(composite_info) > 0 and not composite_info[0] == "":
        #     author["fans_num"] = composite_info[0][composite_info[0].find(":") + 1:].replace(" ", "")
        #     print(author["fans_num"])
        # if len(composite_info) > 1 and not composite_info[1] == "":
        #     author["flower_num"] = composite_info[1][composite_info[1].find(":") + 1:].replace(" ", "")
        #     print(author["flower_num"])
        # Collect the red flowers the author has received
        flowers = context("table.userinfo:eq(2)").find("table")("tr td")
        # flower_num = 0
        for flower_row in flowers.items():
            flower = {}
            flower["owner_id"] = author["id"]
            flower["owner_name"] = author["name"]
            flower["sender_name"] = flower_row("a").text()
            flower_num = flower_row("font").text()[1:-1]
            flower["flower_num"] = "1" if flower_num == "" else flower_num
            # flower_num = flower_num + int(flower["flower_num"])
            self.flower_projectdb.es.index("flower", "project", flower)
        # author["flower_num"] = flower_num
        hl_md5.update(author["id"].encode(encoding='utf-8'))
        self.author_projectdb.es.index("author", "project", author, hl_md5.hexdigest())
def setUpClass(self):
    self.resultdb = database.connect_database(
        'sqlalchemy+mysql+mysqlconnector+resultdb://root@localhost/pyspider_test_resultdb'
    )
def connect_db(ctx, param, value):
    if not value:
        return
    return utils.Get(lambda: connect_database(value))
def setUpClass(self):
    self.projectdb = database.connect_database(
        'sqlalchemy+sqlite+projectdb://'
    )
def cli(ctx, **kwargs):
    """
    A powerful spider system in python.
    """
    logging.config.fileConfig(kwargs['logging_config'])

    # get db from env
    for db in ('taskdb', 'projectdb', 'resultdb'):
        if kwargs[db] is not None:
            continue
        if os.environ.get('MYSQL_NAME'):
            kwargs[db] = utils.Get(lambda db=db: connect_database('mysql+%s://%s:%s/%s' % (
                db, os.environ['MYSQL_PORT_3306_TCP_ADDR'],
                os.environ['MYSQL_PORT_3306_TCP_PORT'], db)))
        elif os.environ.get('MONGODB_NAME'):
            kwargs[db] = utils.Get(lambda db=db: connect_database('mongodb+%s://%s:%s/%s' % (
                db, os.environ['MONGODB_PORT_27017_TCP_ADDR'],
                os.environ['MONGODB_PORT_27017_TCP_PORT'], db)))
        elif ctx.invoked_subcommand == 'bench':
            if kwargs['data_path'] == './data':
                kwargs['data_path'] += '/bench'
                shutil.rmtree(kwargs['data_path'], ignore_errors=True)
                os.mkdir(kwargs['data_path'])
            if db in ('taskdb', 'resultdb'):
                kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s://' % (db)))
            else:
                kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % (
                    db, kwargs['data_path'], db[:-2])))
        else:
            if not os.path.exists(kwargs['data_path']):
                os.mkdir(kwargs['data_path'])
            kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % (
                db, kwargs['data_path'], db[:-2])))
            kwargs['is_%s_default' % db] = True

    # create folder for counter.dump
    if not os.path.exists(kwargs['data_path']):
        os.mkdir(kwargs['data_path'])

    # queue
    if kwargs.get('amqp_url'):
        from pyspider.libs.rabbitmq import Queue
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = utils.Get(lambda name=name: Queue(
                name, amqp_url=kwargs['amqp_url'], maxsize=kwargs['queue_maxsize']))
    elif os.environ.get('RABBITMQ_NAME'):
        from pyspider.libs.rabbitmq import Queue
        amqp_url = ("amqp://*****:*****@%(RABBITMQ_PORT_5672_TCP_ADDR)s"
                    ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ)
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = utils.Get(lambda name=name: Queue(
                name, amqp_url=amqp_url, maxsize=kwargs['queue_maxsize']))
    elif kwargs.get('beanstalk'):
        from pyspider.libs.beanstalk import Queue
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = utils.Get(lambda name=name: Queue(
                name, host=kwargs.get('beanstalk'), maxsize=kwargs['queue_maxsize']))
    else:
        from multiprocessing import Queue
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = Queue(kwargs['queue_maxsize'])

    # phantomjs-proxy
    if kwargs.get('phantomjs_proxy'):
        pass
    elif os.environ.get('PHANTOMJS_NAME'):
        kwargs['phantomjs_proxy'] = os.environ['PHANTOMJS_PORT_25555_TCP'][len('tcp://'):]

    ctx.obj = utils.ObjectDict(ctx.obj or {})
    ctx.obj['instances'] = []
    ctx.obj.update(kwargs)

    if ctx.invoked_subcommand is None and not ctx.obj.get('testing_mode'):
        ctx.invoke(all)
    return ctx
def setUpClass(self):
    self.projectdb = database.connect_database(
        'sqlalchemy+postgresql+projectdb://[email protected]:5432/pyspider_test_projectdb'
    )
def one(ctx, interactive, enable_phantomjs, enable_puppeteer, scripts):
    """
    One mode not only means all-in-one, it runs everything in one process over
    tornado.ioloop, for debug purpose
    """
    ctx.obj['debug'] = False
    g = ctx.obj
    g['testing_mode'] = True

    if scripts:
        from pyspider.database.local.projectdb import ProjectDB
        g['projectdb'] = ProjectDB(scripts)
        if g.get('is_taskdb_default'):
            g['taskdb'] = connect_database('sqlite+taskdb://')
        if g.get('is_resultdb_default'):
            g['resultdb'] = None

    if enable_phantomjs:
        phantomjs_config = g.config.get('phantomjs', {})
        phantomjs_obj = ctx.invoke(phantomjs, **phantomjs_config)
        if phantomjs_obj:
            g.setdefault('phantomjs_proxy', '127.0.0.1:%s' % phantomjs_obj.port)
    else:
        phantomjs_obj = None

    if enable_puppeteer:
        puppeteer_config = g.config.get('puppeteer', {})
        puppeteer_obj = ctx.invoke(puppeteer, **puppeteer_config)
        if puppeteer_obj:
            g.setdefault('puppeteer_proxy', '127.0.0.1:%s' % puppeteer_obj.port)
    else:
        puppeteer_obj = None

    result_worker_config = g.config.get('result_worker', {})
    if g.resultdb is None:
        result_worker_config.setdefault('result_cls', 'pyspider.result.OneResultWorker')
    result_worker_obj = ctx.invoke(result_worker, **result_worker_config)

    processor_config = g.config.get('processor', {})
    processor_config.setdefault('enable_stdout_capture', False)
    processor_obj = ctx.invoke(processor, **processor_config)

    fetcher_config = g.config.get('fetcher', {})
    fetcher_config.setdefault('xmlrpc', False)
    fetcher_obj = ctx.invoke(fetcher, **fetcher_config)

    scheduler_config = g.config.get('scheduler', {})
    scheduler_config.setdefault('xmlrpc', False)
    scheduler_config.setdefault('scheduler_cls', 'pyspider.scheduler.OneScheduler')
    scheduler_obj = ctx.invoke(scheduler, **scheduler_config)

    scheduler_obj.init_one(ioloop=fetcher_obj.ioloop,
                           fetcher=fetcher_obj,
                           processor=processor_obj,
                           result_worker=result_worker_obj,
                           interactive=interactive)
    if scripts:
        for project in g.projectdb.projects:
            scheduler_obj.trigger_on_start(project)

    try:
        scheduler_obj.run()
    finally:
        scheduler_obj.quit()
        if phantomjs_obj:
            phantomjs_obj.quit()
        if puppeteer_obj:
            puppeteer_obj.quit()
def setUpClass(self):
    self.taskdb = database.connect_database('redis+taskdb://localhost:6379/15')
    self.taskdb.__prefix__ = 'testtaskdb_'
def setUpClass(self):
    self.resultdb = database.connect_database(
        'elasticsearch+resultdb://127.0.0.1:9200/?index=test_pyspider_resultdb'
    )
    assert self.resultdb.index == 'test_pyspider_resultdb'
def setUpClass(self):
    self.resultdb = database.connect_database(
        'sqlalchemy+postgresql+resultdb://[email protected]/pyspider_test_resultdb'
    )
    self.assertIsNotNone(self, self.resultdb)
    self.tearDownClass()
class Handler(BaseHandler):
    crawl_config = {}
    author_projectdb = database.connect_database(
        'elasticsearch+projectdb://127.0.0.1:9200/?index=author')
    flower_projectdb = database.connect_database(
        'elasticsearch+projectdb://127.0.0.1:9200/?index=flower')

    @every(minutes=1)
    def on_start(self):
        self.crawl('http://muchong.com/bbs/space.php?uid=3583297',
                   callback=self.handle_author,
                   cookies={"Hm_lpvt_2207ecfb7b2633a3bc5c4968feb58569": "1522564279",
                            "Hm_lvt_2207ecfb7b2633a3bc5c4968feb58569": "1522564172",
                            "_discuz_pw": "9a1449a8990d49a6", "_discuz_uid": "3302227",
                            "_emuch_index": "1", "_ga": "GA1.2.1902872401.1522564172",
                            "_gat": "1"})

    @config(age=1)
    def handle_author(self, response):
        hl_md5 = hashlib.md5()
        # Author info dict
        author = {}
        # The whole parsed document
        context = response.doc
        # Basic info: the table with class "userinfo base"
        basic_information = context("table.userinfo.base")
        # Registration time
        register_time = basic_information("td:eq(0)").text()
        if register_time:
            author["register_time"] = register_time
        # Other basic info (there are three tables with class "userinfo"; take the second)
        basic_information = context("table.userinfo:eq(1)")
        author["id"] = basic_information("tr:eq(0) td:eq(0)").text()
        author["name"] = context("div.space_index").find("a:eq(0)").text()
        author["sex"] = basic_information("tr:eq(4) td:eq(0)").text()
        birthday_time = basic_information("tr:eq(4) td:eq(2)").text()
        if (not birthday_time == "0000-00-00") and (not birthday_time == ""):
            author["birthday_time"] = birthday_time
        author["coin_num"] = basic_information("tr:eq(1) td:eq(1)").text()
        author["major"] = basic_information("tr:eq(3) td:eq(2)").text()
        author["help_num"] = basic_information("tr:eq(0) td:eq(2)").text()
        author["grant_num"] = basic_information("tr:eq(1) td:eq(2)").text()
        note_num_src = basic_information("tr:eq(2) td:eq(1)").text()
        note_num_desc = basic_information("tr:eq(2) td:eq(1) font").text()
        author["note_num"] = note_num_src.replace(note_num_desc, "").replace(
            "\n", "").replace(" ", "")
        hl_md5.update(author["id"].encode(encoding='utf-8'))
        es_author_id = hl_md5.hexdigest()
        composite_info = context("div.space_index table tr").find(
            "div:last").text()
        # Slice the string
        composite_info = composite_info[composite_info.find("听众"):].split(
            "\xa0")
        # Store in groups
        # if len(composite_info) > 0 and not composite_info[0] == "":
        #     author["fans_num"] = composite_info[0][composite_info[0].find(":") + 1:].replace(" ", "")
        #     print(author["fans_num"])
        # if len(composite_info) > 1 and not composite_info[1] == "":
        #     author["flower_num"] = composite_info[1][composite_info[1].find(":") + 1:].replace(" ", "")
        #     print(author["flower_num"])
        # Collect the red flowers the author has received
        flowers = context("table.userinfo:eq(2)").find("table")("tr td")
        # flower_num = 0
        for flower_row in flowers.items():
            hl_md5_flower = hashlib.md5()
            flower = {}
            flower["owner_id"] = author["id"]
            flower["owner_name"] = author["name"]
            flower["sender_name"] = flower_row("a").text()
            flower_num = flower_row("font").text()[1:-1]
            flower["flower_num"] = "1" if flower_num == "" else flower_num
            raw_index_id = flower["owner_id"] + flower["sender_name"]
            hl_md5_flower.update(raw_index_id.encode(encoding='utf-8'))
            flower_es_id = hl_md5_flower.hexdigest()
            # flower_num = flower_num + int(flower["flower_num"])
            print(flower_es_id)
            print(flower)
            self.flower_projectdb.es.index("flower", "project", flower, flower_es_id)
        # author["flower_num"] = flower_num
        self.author_projectdb.es.index("author", "project", author, es_author_id)
def cli(ctx, **kwargs):
    """
    A powerful spider system in python.
    """
    if kwargs['add_sys_path']:
        sys.path.append(os.getcwd())

    logging.config.fileConfig(kwargs['logging_config'])

    # get db from env
    for db in ('taskdb', 'projectdb', 'resultdb'):
        if kwargs[db] is not None:
            continue
        if os.environ.get('MYSQL_NAME'):
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'sqlalchemy+mysql+{0!s}://{1!s}:{2!s}/{3!s}'.format(
                    db, os.environ['MYSQL_PORT_3306_TCP_ADDR'],
                    os.environ['MYSQL_PORT_3306_TCP_PORT'], db)))
        elif os.environ.get('MONGODB_NAME'):
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'mongodb+{0!s}://{1!s}:{2!s}/{3!s}'.format(
                    db, os.environ['MONGODB_PORT_27017_TCP_ADDR'],
                    os.environ['MONGODB_PORT_27017_TCP_PORT'], db)))
        elif ctx.invoked_subcommand == 'bench':
            if kwargs['data_path'] == './data':
                kwargs['data_path'] += '/bench'
                shutil.rmtree(kwargs['data_path'], ignore_errors=True)
                os.mkdir(kwargs['data_path'])
            if db in ('taskdb', 'resultdb'):
                kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+{0!s}://'.format(db)))
            else:
                kwargs[db] = utils.Get(lambda db=db: connect_database(
                    'sqlite+{0!s}:///{1!s}/{2!s}.db'.format(db, kwargs['data_path'], db[:-2])))
        else:
            if not os.path.exists(kwargs['data_path']):
                os.mkdir(kwargs['data_path'])
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'sqlite+{0!s}:///{1!s}/{2!s}.db'.format(db, kwargs['data_path'], db[:-2])))
            kwargs['is_{0!s}_default'.format(db)] = True

    # create folder for counter.dump
    if not os.path.exists(kwargs['data_path']):
        os.mkdir(kwargs['data_path'])

    # message queue, compatible with old version
    if kwargs.get('message_queue'):
        pass
    elif kwargs.get('amqp_url'):
        kwargs['message_queue'] = kwargs['amqp_url']
    elif os.environ.get('RABBITMQ_NAME'):
        kwargs['message_queue'] = ("amqp://*****:*****@%(RABBITMQ_PORT_5672_TCP_ADDR)s"
                                   ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ)
    elif kwargs.get('beanstalk'):
        kwargs['message_queue'] = "beanstalk://{0!s}/".format(kwargs['beanstalk'])

    for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                 'fetcher2processor', 'processor2result'):
        if kwargs.get('message_queue'):
            kwargs[name] = utils.Get(lambda name=name: connect_message_queue(
                name, kwargs.get('message_queue'), kwargs['queue_maxsize']))
        else:
            kwargs[name] = connect_message_queue(name, kwargs.get('message_queue'),
                                                 kwargs['queue_maxsize'])

    # phantomjs-proxy
    if kwargs.get('phantomjs_proxy'):
        pass
    elif os.environ.get('PHANTOMJS_NAME'):
        kwargs['phantomjs_proxy'] = os.environ['PHANTOMJS_PORT_25555_TCP'][len('tcp://'):]

    ctx.obj = utils.ObjectDict(ctx.obj or {})
    ctx.obj['instances'] = []
    ctx.obj.update(kwargs)

    if ctx.invoked_subcommand is None and not ctx.obj.get('testing_mode'):
        ctx.invoke(all)
    return ctx
def setUpClass(self):
    self.taskdb = database.connect_database(
        'elasticsearch+taskdb://127.0.0.1:9200/?index=test_pyspider_taskdb'
    )
    self.assertIsNotNone(self, self.taskdb)
    assert self.taskdb.index == 'test_pyspider_taskdb'
def one(ctx, interactive, enable_phantomjs, scripts):
    """
    One mode not only means all-in-one, it runs everything in one process over
    tornado.ioloop, for debug purpose
    """
    ctx.obj['debug'] = False
    g = ctx.obj
    g['testing_mode'] = True

    if scripts:
        from pyspider.database.local.projectdb import ProjectDB
        g['projectdb'] = ProjectDB(scripts)
        if g.get('is_taskdb_default'):
            g['taskdb'] = connect_database('sqlite+taskdb://')
        if g.get('is_resultdb_default'):
            g['resultdb'] = None

    if enable_phantomjs:
        phantomjs_config = g.config.get('phantomjs', {})
        phantomjs_obj = ctx.invoke(phantomjs, **phantomjs_config)
        if phantomjs_obj:
            g.setdefault('phantomjs_proxy', 'localhost:%s' % phantomjs_obj.port)
    else:
        phantomjs_obj = None

    result_worker_config = g.config.get('result_worker', {})
    if g.resultdb is None:
        result_worker_config.setdefault('result_cls', 'pyspider.result.OneResultWorker')
    result_worker_obj = ctx.invoke(result_worker, **result_worker_config)

    processor_config = g.config.get('processor', {})
    processor_config.setdefault('enable_stdout_capture', False)
    processor_obj = ctx.invoke(processor, **processor_config)

    fetcher_config = g.config.get('fetcher', {})
    fetcher_config.setdefault('xmlrpc', False)
    fetcher_obj = ctx.invoke(fetcher, **fetcher_config)

    scheduler_config = g.config.get('scheduler', {})
    scheduler_config.setdefault('xmlrpc', False)
    scheduler_config.setdefault('scheduler_cls', 'pyspider.scheduler.OneScheduler')
    scheduler_obj = ctx.invoke(scheduler, **scheduler_config)

    scheduler_obj.init_one(ioloop=fetcher_obj.ioloop,
                           fetcher=fetcher_obj,
                           processor=processor_obj,
                           result_worker=result_worker_obj,
                           interactive=interactive)
    if scripts:
        for project in g.projectdb.projects:
            scheduler_obj.trigger_on_start(project)

    try:
        scheduler_obj.run()
    finally:
        scheduler_obj.quit()
        if phantomjs_obj:
            phantomjs_obj.quit()
def setUpClass(self):
    self.taskdb = database.connect_database(
        'sqlalchemy+postgresql+taskdb://[email protected]:5432/pyspider_test_taskdb'
    )
    self.tearDownClass()
def setUpClass(self):
    self.taskdb = database.connect_database(
        'elasticsearch+taskdb://127.0.0.1:9200/?index=test_pyspider'
    )
def cli(ctx, **kwargs):
    """
    A powerful spider system in python.
    """
    if kwargs['add_sys_path']:
        sys.path.append(os.getcwd())

    logging.config.fileConfig(kwargs['logging_config'])

    # get db from env
    for db in ('taskdb', 'projectdb', 'resultdb'):
        if kwargs[db] is not None:
            continue
        if os.environ.get('MYSQL_NAME'):
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'sqlalchemy+mysql+%s://%s:%s/%s' % (
                    db, os.environ['MYSQL_PORT_3306_TCP_ADDR'],
                    os.environ['MYSQL_PORT_3306_TCP_PORT'], db)))
        elif os.environ.get('MONGODB_NAME'):
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'mongodb+%s://%s:%s/%s' % (
                    db, os.environ['MONGODB_PORT_27017_TCP_ADDR'],
                    os.environ['MONGODB_PORT_27017_TCP_PORT'], db)))
        elif ctx.invoked_subcommand == 'bench':
            if kwargs['data_path'] == './data':
                kwargs['data_path'] += '/bench'
                shutil.rmtree(kwargs['data_path'], ignore_errors=True)
                os.mkdir(kwargs['data_path'])
            if db in ('taskdb', 'resultdb'):
                kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s://' % (db)))
            elif db in ('projectdb', ):
                kwargs[db] = utils.Get(lambda db=db: connect_database('local+%s://%s' % (
                    db, os.path.join(os.path.dirname(__file__), 'libs/bench.py'))))
        else:
            if not os.path.exists(kwargs['data_path']):
                os.mkdir(kwargs['data_path'])
            kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % (
                db, kwargs['data_path'], db[:-2])))
            kwargs['is_%s_default' % db] = True

    # create folder for counter.dump
    if not os.path.exists(kwargs['data_path']):
        os.mkdir(kwargs['data_path'])

    # message queue, compatible with old version
    if kwargs.get('message_queue'):
        pass
    elif kwargs.get('amqp_url'):
        kwargs['message_queue'] = kwargs['amqp_url']
    elif os.environ.get('RABBITMQ_NAME'):
        kwargs['message_queue'] = ("amqp://*****:*****@%(RABBITMQ_PORT_5672_TCP_ADDR)s"
                                   ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ)
    elif kwargs.get('beanstalk'):
        kwargs['message_queue'] = "beanstalk://%s/" % kwargs['beanstalk']

    for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                 'fetcher2processor', 'processor2result'):
        if kwargs.get('message_queue'):
            kwargs[name] = utils.Get(lambda name=name: connect_message_queue(
                name, kwargs.get('message_queue'), kwargs['queue_maxsize']))
        else:
            kwargs[name] = connect_message_queue(name, kwargs.get('message_queue'),
                                                 kwargs['queue_maxsize'])

    # phantomjs-proxy
    if kwargs.get('phantomjs_proxy'):
        pass
    elif os.environ.get('PHANTOMJS_NAME'):
        kwargs['phantomjs_proxy'] = os.environ['PHANTOMJS_PORT_25555_TCP'][len('tcp://'):]

    # puppeteer-proxy
    if kwargs.get('puppeteer_proxy'):
        pass
    elif os.environ.get('PUPPETEER_NAME'):
        kwargs['puppeteer_proxy'] = os.environ['PUPPETEER_PORT_22222_TCP'][len('tcp://'):]

    ctx.obj = utils.ObjectDict(ctx.obj or {})
    ctx.obj['instances'] = []
    ctx.obj.update(kwargs)

    if ctx.invoked_subcommand is None and not ctx.obj.get('testing_mode'):
        ctx.invoke(all)
    return ctx
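Taken together, the snippets above exercise most of the connection URL forms that connect_database accepts. A short reference sketch collecting them (every URL is copied from an example above; only the in-memory sqlite one is actually opened here, since the rest need a running backend):

from pyspider.database import connect_database

# connection URL forms seen in the examples above
EXAMPLE_URLS = [
    'sqlite+projectdb://',                                  # sqlite, in-memory
    'sqlite+taskdb:///data/tests/task.db',                  # sqlite, file-backed
    'sqlalchemy+sqlite+resultdb://',                        # sqlite via sqlalchemy
    'mysql+taskdb://localhost/pyspider_test_taskdb',
    'sqlalchemy+mysql+mysqlconnector+projectdb://root@localhost/pyspider_test_projectdb',
    'sqlalchemy+postgresql+resultdb://postgres:@10.1.36.183:5432/resultdb',
    'mongodb+taskdb://localhost:27017/pyspider_test_taskdb',
    'redis+taskdb://localhost:6379/15',
    'elasticsearch+resultdb://127.0.0.1:9200/?index=test_pyspider_resultdb',
    'couchdb+taskdb://localhost:5984/',
]

projectdb = connect_database(EXAMPLE_URLS[0])
print(type(projectdb).__name__)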