def _load_starturl_from_schedule(self):
    try:
        start_urls = self._task_schedule.get(block=False)
        name = start_urls[0]
        start_url = start_urls[1]
        # logger.debug("Currently crawling: %s" % start_urls)
        logger.info(*self.lfm.crawled("CrawlerRunner", self.name,
                                      'current crawl target', start_url))
        crawler = Crawler(self.spidercls, self.settings, self.lfm, self,
                          self.middlewares)
        crawler.create_spider_from_task(name, start_url)
        return crawler
    except Empty:
        logger.debug(*self.lfm.crawled("CrawlerRunner", self.name,
                                       'all tasks in the queue have been dispatched'))
        if not self._push_task_finish:
            self._create_task()
        else:
            self._pull_task_finish = True
    except Exception as e:
        logger.error(*self.lfm.error("CrawlerRunner", self.name, "",
                                     'error occurred:'),
                     extra={'exception': e},
                     exc_info=True)
    return None
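# Illustrative sketch only: _load_starturl_from_schedule() indexes each task as
# start_urls[0]/start_urls[1], so the schedule is assumed to hold (name, start_url)
# pairs in a queue.Queue-like object that raises Empty when drained. The names
# below are examples, not the project's real tasks.
from queue import Queue, Empty

task_schedule = Queue()
task_schedule.put(("books", "http://example.com/books/page/1"))
task_schedule.put(("films", "http://example.com/films/page/1"))

while True:
    try:
        name, start_url = task_schedule.get(block=False)
    except Empty:
        break
    print(name, start_url)  # the runner would build one Crawler per task here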
def test_crawler(self):
    from core.crawler import Crawler, tbhtmlOnParser
    from DBManagement.models import Shop

    crawler = Crawler()
    crawler._parser.register(tbhtmlOnParser)
    # entryHttpUrl = "http://s.taobao.com/search?"
    entryHttpUrl = "http://s.taobao.com/search?initiative_id=staobaoz_20120515&q=手表&suggest=history_1&_input_charset=utf-8&source=suggest&tab=all&bcoffset=1&s="
    results = []
    tag = '手表'  # "watch"
    for page in range(0, 100):
        # modify the data template; `values` is a query-parameter dict assumed
        # to be defined elsewhere in the original test module
        values['s'] = str(page)
        tbData = crawler.craw(Shop(), entryHttpUrl + str(page * 44), tag, **values)
        # record the page number
        tbData['page'] = page
        results.append(tbData)
        print(tbData['data'])
    return results
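# Sketch only: one way the query-parameter template used by test_crawler() could
# be assembled. The parameter names mirror the hard-coded entry URL above; the
# exact set the real test relies on is defined elsewhere and is assumed here.
from urllib.parse import urlencode

values = {
    "q": "手表",                 # search keyword ("watch")
    "_input_charset": "utf-8",
    "tab": "all",
    "s": "0",                    # result offset; Taobao paginates 44 items per page
}
entry = "http://s.taobao.com/search?" + urlencode(values)
print(entry)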
def _crawlTarget(self):
    print "\n[+] Crawling links..."

    # Build a queue and start crawlers
    queue = self._getTargetsQueue()
    crawlers = []
    for i in range(min(self.getOption('threads'), len(self.targets))):
        c = Crawler(self, queue, crawl_links=True)
        c.setDaemon(True)
        crawlers.append(c)
        c.start()

    # Little hack to kill threads on SIGINT
    while True:
        try:
            if queue.empty() is True:
                break
            # sys.stdout.write("\r Remaining targets: %s" % queue.qsize())
            # sys.stdout.flush()
        except KeyboardInterrupt:
            print "\n |- " + colored.yellow("INTERRUPT!") + " Killing threads..."
            queue = Queue.Queue()
            break
    queue.join()

    # Harvest results
    results = []
    errors = {}
    for c in crawlers:
        # results
        for r in c.results:
            results.append(r)
        # errors
        for ek, ev in c.errors.iteritems():
            if ek in errors:
                errors[ek] += ev
            else:
                errors[ek] = ev
    results = set(results)

    if errors:
        print " |--[+] " + colored.red("CRAWL ERRORS!")
        for ek, ev in errors.iteritems():
            print " | |- %sx: %s" % (len(ev), ek)

    if len(results) > 0:
        print " |- " + colored.green("SUCCESS: ") + "Found %s unique targets." % len(results)
    else:
        print " |- " + colored.yellow("WARNING: ") + "No new targets found."

    # Add targets
    for t in results:
        self.targets.append(t)
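# Hypothetical helper in the spirit of _getTargetsQueue(): the consumer loop above
# only needs a shared Queue pre-loaded with the current targets so that N crawler
# threads can drain it. Python 2 is used to match the snippet above; the function
# name is an assumption, not the project's actual implementation.
import Queue

def build_targets_queue(targets):
    q = Queue.Queue()
    for t in targets:
        q.put(t)
    return q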
def start_crawler(self):
    self.print_info("[✔] Crawler started!")
    self.crawler = Crawler(self.url, self.login_form)
    self.crawler.signals.result_list.connect(self.print_result)
    self.crawler.signals.finish_control.connect(self.finish_control)
    self.crawler.signals.info_box.connect(self.print_info)
    self.threadpool.start(self.crawler)
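# start_crawler() hands the Crawler to a QThreadPool and connects three Qt signals,
# so the Crawler is assumed to be a QRunnable exposing a signals object. A minimal
# sketch of that shape follows; the payload types and class name are assumptions,
# not the project's actual definitions.
from PyQt5.QtCore import QObject, QRunnable, pyqtSignal

class CrawlerSignals(QObject):
    # signal names mirror the connections made in start_crawler()
    result_list = pyqtSignal(list)
    finish_control = pyqtSignal()
    info_box = pyqtSignal(str)

class Crawler(QRunnable):
    def __init__(self, url, login_form=None):
        super().__init__()
        self.url = url
        self.login_form = login_form
        self.signals = CrawlerSignals()

    def run(self):
        self.signals.info_box.emit("crawling %s" % self.url)
        # real crawl logic would go here
        self.signals.result_list.emit([])
        self.signals.finish_control.emit()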
def scan(self):
    level = int(self.args.get('level', 1))
    # POST scan
    if not self.target.startswith(('http', 'HTTP')):
        self.target = 'http://' + self.target
    if not self.target.endswith('/'):
        self.target += '/'
    '''
    for target in gethosts(self.target):
        self.portscan(target)
        pass
    '''
    headers = json.loads(self.args.get('headers', "{}"))
    self.crawle = Crawler(self.target, headers=headers)
    self.crawle.settings.update(level=level)
    # self.crawle.settings.update(proxy={'http': 'http://127.0.0.1:1111', 'https': 'http://127.0.0.1:1111'})
    self.crawle.settings.update(self.args)

    th = []
    th.append(threading.Thread(target=self.crawle.run1))
    th.append(threading.Thread(target=self.webscan))
    th.append(threading.Thread(target=self.httpscan))
    for t in th:
        # t.daemon = True
        t.start()
    for t in th:
        t.join()
    # once scanning is finished, write out the collected HTTP results
    self.writewebsite(self.crawle.website)
class TestCrawler(unittest.TestCase):

    def setUp(self):
        self.crawler = Crawler()

    def test_xici(self):
        res = self.crawler.xici()
        print(res)
        self.assertTrue(len(res) != 0)

    def test_kuaidaili(self):
        res = self.crawler.kuaidaili()
        print(res)
        self.assertTrue(len(res) != 0)

    def test_ip3366(self):
        res = self.crawler.ip3366()
        print(res)
        self.assertTrue(len(res) != 0)

    def test_jiangxianli(self):
        res = self.crawler.jiangxianli()
        print(res)
        self.assertTrue(len(res) != 0)
def _crawlForms(self):
    print "\n[+] Crawling for forms..."

    queue = self._getTargetsQueue()
    crawlers = []
    for i in range(min(self.getOption('threads'), len(self.targets))):
        c = Crawler(self, queue, crawl_forms=True)
        c.setDaemon(True)
        crawlers.append(c)
        c.start()

    # Little hack to kill threads on SIGINT
    while True:
        try:
            if queue.empty() is True:
                break
            sys.stderr.write("\r |- Remaining targets: %s " % queue.qsize())
            sys.stderr.flush()
        except KeyboardInterrupt:
            print "\n |- " + colored.yellow("INTERRUPT!") + " Killing threads..."
            queue = Queue.Queue()
            break
    queue.join()

    # Harvest results
    results = []
    errors = {}
    for c in crawlers:
        # results
        for r in c.results:
            results.append(r)
        # errors
        for ek, ev in c.errors.iteritems():
            if ek in errors:
                errors[ek] += ev
            else:
                errors[ek] = ev
    results = set(results)

    if errors:
        print " |--[+] " + colored.red("CRAWL ERRORS!")
        for ek, ev in errors.iteritems():
            print " | |- %sx: %s" % (len(ev), ek)

    if len(results) > 0:
        print " |- " + colored.green("SUCCESS: ") + "Found %s unique forms." % len(results)
    else:
        print " |- " + colored.yellow("WARNING: ") + "No forms found."

    # Add targets
    for t in results:
        self.targets.append(t)
class Adder:

    def __init__(self):
        self._crawler = Crawler()

    def _crawl_fns(self):
        return [
            getattr(self._crawler, fn_name)
            for fn_name in self._crawler.fn_names
        ]

    def add(self):
        raw_ips = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(fn) for fn in self._crawl_fns()]
            for future in concurrent.futures.as_completed(futures):
                try:
                    data = future.result()
                    raw_ips.extend(data)
                except Exception as exc:
                    print('generated an exception: {}'.format(exc))
                else:
                    print('crawl success')
        return [raw_ips[i:i + 100] for i in range(0, len(raw_ips), 100)]
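# Adder._crawl_fns() looks each entry of Crawler.fn_names up on the instance, so the
# proxy Crawler is assumed to expose a fn_names list naming zero-argument methods
# that each return an iterable of address strings. A hedged sketch of that
# convention (method names and values are illustrative assumptions):
class Crawler(object):
    fn_names = ["crawl_source_a", "crawl_source_b"]

    def crawl_source_a(self):
        return ["1.2.3.4:8080"]

    def crawl_source_b(self):
        return ["5.6.7.8:3128"]

# With that in place, Adder().add() fans the methods out over the thread pool and
# returns the harvested addresses in chunks of 100.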
def run(self, start_urls, scopes=None):
    start_url = start_urls[0]
    self.start()
    start_time = time.time()
    scope = Scope(start_url, options=self.options.scope_options)
    if scopes:
        scope.scopes = [x.strip() for x in scopes.split(',')]
    self.db.start(start_url, scope.host)

    c = None
    s = None
    loader = None

    self.logger.debug("Parsing scan options")
    login = LoginAction(logger=self.logger.getEffectiveLevel())
    pre_run = login.pre_parse(self.options)
    if pre_run:
        self.scan_cookies = dict(login.session_obj.cookies)

    scanoptions = []
    if self.options.custom_options:
        scan_vars = self.options.custom_options.split(',')
        for v in scan_vars:
            opt = v.strip()
            scanoptions.append(opt)
            self.logger.debug("Enabled option %s" % opt)

    if self.options.scanner or self.options.allin:
        s = ScriptEngine(options=scanoptions,
                         logger=self.logger.getEffectiveLevel(),
                         database=self.db)

    if self.options.use_adv_scripts or self.options.allin:
        loader = modules.CustomModuleLoader(
            options=scanoptions,
            logger=self.logger.getEffectiveLevel(),
            database=self.db,
            scope=scope)
        loader.sslverify = self.options.sslverify
        loader.headers = login.headers
        loader.cookies = self.scan_cookies

    todo = []
    c = Crawler(base_url=start_url, logger=self.logger.getEffectiveLevel())
    for login_header in login.headers:
        c.headers[login_header] = login.headers[login_header]

    if self.options.use_crawler or self.options.allin:
        if pre_run:
            c.login = True
            # set cookies from Login module
            cookies = dict(login.session_obj.cookies)
            if cookies and len(cookies):
                self.logger.debug(
                    "Setting crawler cookies from login module: %s" % str(cookies))
                c.cookie.append(cookies)
        c.thread_count = self.thread_count
        c.max_urls = int(self.options.maxurls)
        c.scope = scope
        if self.options.user_agent:
            c.headers = {'User-Agent': self.options.user_agent}
        if len(start_urls) != 1:
            for extra_url in start_urls[1:]:
                c.parse_url(extra_url, extra_url)

        # discovery scripts, pre-run scripts and advanced modules
        if self.options.scanner or self.options.allin:
            self.logger.info("Starting filesystem discovery (pre-crawler)")
            new_links = s.run_fs(start_url)
            for newlink in new_links:
                c.parse_url(newlink[0], newlink[0])
        if self.options.use_adv_scripts or self.options.allin:
            self.logger.info("Running custom scripts (pre-crawler)")
            links = loader.base_crawler(start_url)
            for link in links:
                self.logger.debug("Adding link %s from post scripts" % link)
                c.parse_url(link, link)
        if self.options.wl_file:
            wf = WebFinder(url=start_url,
                           logger=self.logger.getEffectiveLevel(),
                           word_list=self.options.wl_file,
                           append=self.options.wl_ext,
                           ok_status_codes=self.options.wl_codes,
                           invalid_text=self.options.wl_404,
                           threads=self.thread_count)
            for wf_result in wf.output:
                c.parse_url(wf_result, start_url)

        self.logger.info("Starting Crawler")
        c.run_scraper()
        self.logger.debug("Cookies set during scan: %s" % (str(c.cookie.cookies)))
        self.scan_cookies = c.cookie.cookies
        self.logger.info("Creating unique link/post data list")
        todo = uniquinize(c.scraped_pages)
    else:
        todo = [[start_url, None]]

    if self.options.driver:
        self.logger.info("Running GhostDriver")
        m = Mefjus(logger=self.logger.getEffectiveLevel(),
                   driver_path=self.options.driver_path,
                   use_proxy=self.options.proxy,
                   proxy_port=self.options.proxy_port,
                   use_https=scope.is_https,
                   show_driver=self.options.show_driver or self.options.interactive)
        results = m.run(todo, interactive=self.options.interactive)
        for res in results:
            if not scope.in_scope(res[0]):
                self.logger.debug("IGNORE %s.. out-of-scope" % res)
                continue
            if c.get_filetype(res[0]) in c.blocked_filetypes:
                self.logger.debug("IGNORE %s.. bad file-type" % res)
                continue
            if res in c.scraped_pages:
                self.logger.debug("IGNORE %s.. exists" % res)
                continue
            else:
                todo.append(res)
                self.logger.debug("QUEUE %s" % res)
        self.logger.info("Creating unique link/post data list")
        old_num = len(todo)
        todo = uniquinize(todo)
        self.logger.debug(
            "WebDriver discovered %d more url/post data pairs" % (len(todo) - old_num))

    scanner_obj = None
    if self.options.scanner or self.options.allin:
        self.logger.info("Starting scan sequence")
        if len(todo) < self.thread_count:
            # for performance sake
            self.thread_count = len(todo)
        scanner_obj = scanner.Scanner(
            logger=self.logger.getEffectiveLevel(),
            script_engine=s,
            thread_count=self.thread_count)
        scanner_obj.copy_engine = self.options.optimize
        for page in todo:
            url, data = page
            req = Request(url,
                          data=data,
                          agent=self.options.user_agent,
                          headers=login.headers,
                          cookies=self.scan_cookies)
            req.run()
            scanner_obj.queue.put(req)
            scanner_obj.logger.debug("Queued %s %s" % (url, data))
        scanner_obj.run()

    post_results = []
    if self.options.use_adv_scripts or self.options.allin:
        self.logger.info("Running post scripts")
        post_results = loader.run_post(todo, cookies=self.scan_cookies)

    cms_results = None
    if self.options.cms_enabled or self.options.allin:
        cms_loader = ext.libcms.cms_scanner_core.CustomModuleLoader(
            log_level=self.logger.getEffectiveLevel())
        cms_results = cms_loader.run_scripts(start_url)
        if cms_results:
            for cms in cms_results:
                for cms_result in cms_results[cms]:
                    self.db.put(result_type="CMS Script",
                                script=cms,
                                severity=0,
                                text=cms_result)

    webapp_results = None
    if self.options.webapp_enabled or self.options.allin:
        webapp_loader = WebAppModuleLoader(
            log_level=self.logger.getEffectiveLevel())
        webapp_loader.load_modules()
        webapp_results = webapp_loader.run_scripts(
            start_url, scope=scope, cookies=self.scan_cookies, headers=login.headers)
        if webapp_results:
            for webapp in webapp_results:
                for webapp_result in webapp_results[webapp]:
                    self.db.put(result_type="WebApp Script",
                                script=webapp,
                                severity=0,
                                text=json.dumps(webapp_result))

    meta = {}
    if self.options.msf:
        monster = metamonster.MetaMonster(
            log_level=self.logger.getEffectiveLevel())
        creds = self.options.msf_creds.split(':')
        monster.username = creds[0]
        monster.password = creds[1]
        monster.host = self.options.msf_host
        monster.port = self.options.msf_port
        monster.ssl = self.options.msf_ssl
        monster.endpoint = self.options.msf_uri
        monster.should_start = self.options.msf_autostart

        monster.connect(start_url)
        if monster.client and monster.client.is_working:
            monster.get_exploits()
            monster.detect()
            queries = monster.create_queries()
            monster.run_queries(queries)
            meta = monster.results
            for working in meta['working']:
                msf_module, msf_output = working
                self.db.put(result_type="Metasploit",
                            script=msf_module,
                            severity=3,
                            text=json.dumps(msf_output))

    scan_tree = {
        'start': start_time,
        'end': time.time(),
        'scope': scope.host,
        'starturl': start_url,
        'crawled': len(c.scraped_pages) if c else 0,
        'scanned': len(todo) if self.options.scanner else 0,
        'results': scanner_obj.script_engine.results if scanner_obj else [],
        'metasploit': meta,
        'cms': cms_results,
        'webapps': webapp_results,
        'post': post_results if self.options.use_adv_scripts else []
    }

    self.db.end()

    if self.options.outfile:
        with open(self.options.outfile, 'w') as f:
            f.write(json.dumps(scan_tree))
            self.logger.info("Wrote results to %s" % self.options.outfile)
def setUp(self):
    self.crawler = Crawler()
def run_crawler():
    # areas = [
    #     ["Africa", "Africa", "Nigeria", "160", "nigeria"],
    #     ["Africa", "Africa", "South Africa 南非", "112", "south-africa"],
    #     ["Africa", "Africa", "Morocco", "159", "morocco"],
    #     ["Africa", "Africa", "Kenya", "247", "kenya"],
    #     ["Asia", "East Asia", "China 中國", "117", "china"],
    #     ["Asia", "East Asia", "Japan 日本", "121", "japan"],
    #     ["Asia", "East Asia", "South Korea", "125", "south-korea"],
    #     ["Asia", "East Asia", "Hong Kong 香港", "118", "hong-kong"],
    #     ["Asia", "SEA", "Indonesia 印尼", "120", "indonesia"],
    #     ["Asia", "SEA", "Thailand 泰國", "126", "thailand"],
    #     ["Asia", "SEA", "Vietnam 越南", "127", "vietnam"],
    #     ["Asia", "South Asia", "India 印度", "119", "india"],
    #     ["Asia", "South Asia", "Pakistan 巴基斯坦", "294", "pakistan"],
    #     ["Asia", "West Asia", "Saudi Arabia 沙烏地阿拉伯", "110", "saudi-arabia"],
    #     ["Australia & Oceania", "Australia & Oceania", "Australia 澳洲", "107", "australia"],
    #     ["Australia & Oceania", "Australia & Oceania", "New Zealand", "161", "new-zealand"],
    #     ["Europe", "Central & West Europe", "Germany 德國", "137", "germany"],
    #     ["Europe", "Central & West Europe", "UK 英國", "156", "united-kingdom"],
    #     ["Europe", "Central & West Europe", "France 法國", "136", "france"],
    #     ["Europe", "Central & West Europe", "Poland 波蘭", "146", "poland"],
    #     ["Europe", "Central & West Europe", "Netherlands 荷蘭", "144", "netherlands"],
    #     ["Europe", "Central & West Europe", "Switzerland 瑞士", "155", "switzerland"],
    #     ["Europe", "Central & West Europe", "Belgium 比利時", "129", "belgium"],
    #     ["Europe", "Central & West Europe", "Austria 奧地利", "128", "austria"],
    #     ["Europe", "Central & West Europe", "Czechia", "132", "czechia"],
    #     ["Europe", "Central & West Europe", "Ireland", "140", "ireland"],
    #     ["Europe", "Eastern Europe", "Russia 俄羅斯", "149", "russia"],
    #     ["Europe", "Northern Europe", "Sweden 瑞典", "154", "sweden"],
    #     ["Europe", "Northern Europe", "Norway", "145", "norway"],
    #     ["Europe", "Northern Europe", "Denmark", "133", "denmark"],
    #     ["Europe", "Northern Europe", "Finland", "135", "finland"],
    #     ["Europe", "Southern Europe", "Italy 義大利", "141", "italy"],
    #     ["Europe", "Southern Europe", "Spain 西班牙", "153", "spain"],
    #     ["Europe", "Southern Europe", "Turkey 土耳其", "113", "turkey"],
    #     ["Europe", "Southern Europe", "Portugal", "147", "portugal"],
    #     ["Europe", "Southern Europe", "Greece 希臘", "138", "greece"],
    #     ["North America", "North America", "USA 美國", "109", "united-states"],
    #     ["North America", "North America", "Mexico 墨西哥", "116", "mexico"],
    #     ["North America", "North America", "Canada 加拿大", "108", "canada"],
    #     ["South America", "South America", "Brazil 巴西", "115", "brazil"],
    #     ["South America", "South America", "Argentina 阿根廷", "114", "argentina"],
    #     ["South America", "South America", "Colombia 哥倫比亞", "158", "colombia"],
    #     ["South America", "South America", "Chile", "157", "chile"],
    #     ["Worldwide", "Worldwide", "", "100", "worldwide"]
    # ]
    prefix_label = ["國家", ""]
    areas = [["Worldwide", "全球", "100", "worldwide"],
             ["HongKong", "香港", "118", "hong-kong"],
             ["China", "中國", "117", "china"],
             ["USA", "美國", "109", "united-states"],
             ["India", "印度", "119", "india"],
             ["Japan", "日本", "121", "japan"],
             ["Brazil", "巴西", "115", "brazil"],
             ["Germany", "德國", "137", "germany"],
             ["UK", "英國", "156", "united-kingdom"],
             ["France", "法國", "136", "france"],
             ["Russia", "俄羅斯", "149", "russia"],
             ["Mexico", "墨西哥", "116", "mexico"],
             ["Indonesia", "印尼", "120", "indonesia"],
             ["South Korea", "南韓", "125", "south-korea"],
             ["Italy", "義大利", "141", "italy"],
             ["Canada", "加拿大", "108", "canada"],
             ["Spain", "西班牙", "153", "spain"],
             ["Australia", "澳洲", "107", "australia"],
             ["Turkey", "土耳其", "113", "turkey"],
             ["Argentina", "阿根廷", "114", "argentina"],
             ["Saudi Arabia", "沙烏地阿拉伯", "110", "saudi-arabia"],
             ["Poland", "波蘭", "146", "poland"],
             ["Netherlands", "荷蘭", "144", "netherlands"],
             ["Thailand", "泰國", "126", "thailand"],
             ["Vietnam", "越南", "127", "vietnam"],
             ["Colombia", "哥倫比亞", "158", "colombia"],
             ["Switzerland", "瑞士", "155", "switzerland"],
             ["Sweden", "瑞典", "154", "sweden"],
             ["Belgium", "比利時", "129", "belgium"]]
    tab = {
        "食品類": [
            {"title": "Consumer Market(Non-Carbonated Soft Drinks)", "r_type": "20020200", "i_type": "non-carbonated-soft-drinks",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(Roast Coffee)", "r_type": "30010100", "i_type": "roast-coffee",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(Instant Coffee)", "r_type": "30010200", "i_type": "instant-coffee",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(Tea)", "r_type": "30020000", "i_type": "tea",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(Processed & Frozen Fruits)", "r_type": "40040200", "i_type": "processed-frozen-fruits",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(Nuts)", "r_type": "40110300", "i_type": "nuts",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(Spices & Culinary Herbs)", "r_type": "40070300", "i_type": "spices-culinary-herbs",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(Snack Food)", "r_type": "40110000", "i_type": "snack-food",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(Baby Food)", "r_type": "40120000", "i_type": "baby-food",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(Pet Food)", "r_type": "40130000", "i_type": "pet-food",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Digital Market(Food & Beverage)", "r_type": "253", "i_type": "food-beverages",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]}],
        "服飾精品類": [
            {"title": "Digital Market(Fashion)", "r_type": "244", "i_type": "fashion",
             "labels": ["Revenue", "yoy", "CAGR"], "fields": ["revenue", "revenue_yoy", "cagr"]},
            {"title": "Consumer Market(Apparel)", "r_type": "90000000", "i_type": "apparel",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(Footwear)", "r_type": "11000000", "i_type": "footwear",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(Eyewear)", "r_type": "12020000", "i_type": "sunglasses",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(Accessories)", "r_type": "13000000", "i_type": "accessories",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(Luxury Goods)", "r_type": "21000000", "i_type": "luxury-goods",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]}],
        "3C類": [
            {"title": "Consumer Market", "r_type": "15000000", "i_type": "consumer-electronics",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Digital Market", "r_type": "251", "i_type": "consumer-electronics",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(Digital Camera)", "r_type": "15010400", "i_type": "digital-cameras",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(mobile phones)", "r_type": "15020100", "i_type": "mobile-phones",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "(Laptops & Tablets)", "r_type": "251", "i_type": "consumer-electronics",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]}],
        "家電類": [
            {"title": "Consumer Market", "r_type": "16000000", "i_type": "household-appliances",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Digital Market", "r_type": "256", "i_type": "household-appliances",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(Vacuum Cleaners)", "r_type": "16020100", "i_type": "vacuum%25C2%25A0cleaners",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(Small Kitchen appliance)", "r_type": "16020200", "i_type": "small-kitchen-appliances",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(Coffee Machine)", "r_type": "16021000", "i_type": "coffee-machines",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]}],
        "美妝類": [
            {"title": "Consumer Market(Beauty & Personal Care)", "r_type": "70000000", "i_type": "beauty-personal-care",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Digital Market", "r_type": "254", "i_type": "personal-care",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]}],
        "美妝次分類": [
            {"title": "Consumer Market(cosmetics)", "r_type": "70010000", "i_type": "cosmetics",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(Face Skincare)", "r_type": "70020100", "i_type": "face",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(Body Skincare)", "r_type": "70020200", "i_type": "body",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(personal - hair care)", "r_type": "70040000", "i_type": "hair-care",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(personal - oral care)", "r_type": "70060000", "i_type": "oral-care",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(personal - fragrances)", "r_type": "70050000", "i_type": "fragrances",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]}],
        "其他": [
            {"title": "Digital Market - Toys, Hobby & DIY", "r_type": "248", "i_type": "toys-hobby-diy",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Digital Market - Sports & Outdoor", "r_type": "259", "i_type": "sports-outdoor",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Digital Market - Hobby & Stationary", "r_type": "260", "i_type": "hobby-stationery",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market - Home & Laundry Care", "r_type": "60000000", "i_type": "home-laundry-care",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Tissue & Hygiene Paper", "r_type": "80000000", "i_type": "tissue-hygiene-paper",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Toys & Hobby", "r_type": "19000000", "i_type": "toys-hobby",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]}],
        "五金類": [
            {"title": "Consumer Market(Lamps & Lighting)", "r_type": "17060000", "i_type": "lamps-lighting",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Consumer Market(Floor Covering)", "r_type": "17060000", "i_type": "lamps-lighting",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]},
            {"title": "Digital Market(DIY, Garden & Pets)", "r_type": "357", "i_type": "diy-garden-pets",
             "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"], "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]}]
    }

    sheet_index = -1
    wb = Workbook()
    for tab_title in tab.keys():
        print(tab_title)
        # create sheet
        sheet_index += 1
        ws = wb.create_sheet(tab_title, sheet_index)
        # ws.cell(row=4, column=2, value=10)
        # ws.cell(column=col, row=row, value="{0}".format(get_column_letter(col)))
        cols_index = 1
        for data_group_id in range(len(tab[tab_title])):
            data_group = tab[tab_title][data_group_id]
            row_id = 1
            # print('\t{}\t{}\t{}\t{}'.format(data_group['title'], data_group['r_type'],
            #                                 data_group['i_type'], data_group['fields']))
            if data_group_id == 0:
                cols = prefix_label + [data_group['title']] + data_group['labels']
                col_start = 1
            else:
                cols = [data_group['title']] + data_group['labels']
            for c in range(len(cols)):
                _ = ws.cell(row=row_id, column=col_start + c, value="{}".format(cols[c]))
            for area in areas:
                row_id += 1
                url = "https://www.statista.com/outlook/{}/{}/{}/{}".format(
                    data_group['r_type'], area[-2], data_group['i_type'], area[-1])
                # _ = ws.cell(row=row_id, column=cols_count, value="{}".format(cols[c]))
                # print("\t\t{}".format(url))
                crawler = Crawler(url)
                d = crawler.data()
                if data_group_id == 0:
                    row_data = [area[0], area[1], url]
                else:
                    row_data = [url]
                for f in data_group['fields']:
                    row_data.append(d[f])
                    # c = "{}\t{}-{}".format(c, f, d[f])
                for c in range(len(row_data)):
                    _ = ws.cell(row=row_id, column=col_start + c, value="{}".format(row_data[c]))
                print(row_data)
                sl = random.choice(SLEEP_SECOND)
                print("*************** sleep {} second ***************".format(sl))
                # if row_id >= 5:
                #     break
                time.sleep(sl)
            col_start += len(cols)
    wb.save(filename="/tmp/statista.xlsx")
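# run_crawler() assumes each Crawler(url).data() call returns a dict keyed by the
# field names listed in `tab` (revenue, revenue_yoy, cagr, arpc), and that
# SLEEP_SECOND is a sequence of delay choices. A hedged sketch of that interface;
# the real parsing of the Statista outlook page is not shown and the values here
# are placeholders, not the project's implementation.
import requests

SLEEP_SECOND = [3, 5, 8]  # assumed delay choices consumed by random.choice() above

class Crawler(object):
    """Illustrative stand-in: fetches a Statista outlook page and returns the
    per-market figures in the shape run_crawler() expects."""

    def __init__(self, url):
        self.url = url

    def data(self):
        resp = requests.get(self.url, timeout=30)
        resp.raise_for_status()
        # extraction of the figures from resp.text is omitted; only the
        # expected result shape is shown
        return {"revenue": None, "revenue_yoy": None, "cagr": None, "arpc": None}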
def __init__(self):
    self._crawler = Crawler()
def run_spider(cfg, s):
    crawl = Crawler(s, cfg['crawler'])
    crawl.start()
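# Hedged usage sketch for run_spider(): cfg is assumed to be a parsed configuration
# mapping with a 'crawler' section, and s the seed/settings argument the Crawler
# expects. The keys and values below are illustrative only.
cfg = {"crawler": {"threads": 4, "timeout": 30}}
run_spider(cfg, "http://example.com")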