Example 1
 def _load_starturl_from_schedule(self):
     try:
         start_urls = self._task_schedule.get(block=False)
         name = start_urls[0]
         start_url = start_urls[1]
         # logger.debug("currently crawling page: %s" % start_urls)
         logger.info(*self.lfm.crawled("CrawlerRunner", self.name,
                                       'currently crawling page', start_url))
         crawler = Crawler(self.spidercls, self.settings, self.lfm, self,
                           self.middlewares)
         crawler.create_spider_from_task(name, start_url)
         return crawler
     except Empty:
         logger.debug(
             *self.lfm.crawled("CrawlerRunner", self.name,
                               'all tasks in the queue have been dispatched'))
         if not self._push_task_finish:
             self._create_task()
         else:
             self._pull_task_finish = True
     except Exception as e:
         logger.error(*self.lfm.error("CrawlerRunner", self.name, "",
                                      'an error occurred:'),
                      extra={'exception': e},
                      exc_info=True)
     return None
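
A minimal sketch of the task queue this method consumes, assuming a standard queue.Queue holding (name, start_url) pairs; the real CrawlerRunner's _task_schedule and _create_task may differ.

# Hypothetical stand-in for self._task_schedule (assumption, not the project's code)
from queue import Queue, Empty

task_schedule = Queue()
task_schedule.put(("example_spider", "http://example.com/"))

try:
    name, start_url = task_schedule.get(block=False)
    print(name, start_url)
except Empty:
    print("no tasks left in the queue")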
Example 2
    def test_crawler(self):
        from core.crawler import Crawler, tbhtmlOnParser
        from DBManagement.models import Shop

        crawler = Crawler()
        crawler._parser.register(tbhtmlOnParser)

        # entryHttpUrl = "http://s.taobao.com/search?"
        entryHttpUrl = "http://s.taobao.com/search?initiative_id=staobaoz_20120515&q=手表&suggest=history_1&_input_charset=utf-8&source=suggest&tab=all&bcoffset=1&s="

        results = []
        tag = '手表'  # search keyword ("wristwatch")
        values = {}   # request-parameter template passed to craw()

        for page in range(0, 100):
            # update the paging parameter in the request template
            values['s'] = str(page)

            tbData = crawler.craw(Shop(), entryHttpUrl + str(page * 44), tag, **values)

            # record the page number on the result
            tbData['page'] = page
            results.append(tbData)
            print(tbData['data'])

        return results
Example 3
    def _crawlTarget(self):
        print "\n[+] Crawling links..."

        # Build a queue and start crawlers 
        queue = self._getTargetsQueue()
        crawlers = []
        for i in range(min(self.getOption('threads'), len(self.targets))):
            c = Crawler(self, queue, crawl_links=True)
            c.setDaemon(True)
            crawlers.append(c)
            c.start()
      
        # Little hack to kill threads on SIGINT
        while True:
            try:
                if queue.empty() is True:
                    break
                # sys.stdout.write("\r    Remaining targets: %s" % queue.qsize())
                # sys.stdout.flush()
            except KeyboardInterrupt:
                print "\n |- " + colored.yellow("INTERRUPT!") + " Killing threads..."
                queue = Queue.Queue()
                break
        
        queue.join()

        # Harvest results
        results = []
        errors = {}
        for c in crawlers:
            # results
            for r in c.results:
                results.append(r)
            # errors
            for ek, ev in c.errors.iteritems():
                if ek in errors:
                    errors[ek] += ev
                else:
                    errors[ek] = ev

        results = set(results)
        
        if errors:
            print " |--[+] " + colored.red("CRAWL ERRORS!")
            for ek, ev in errors.iteritems():
                print " |   |- %sx: %s" % (len(ev), ek)
        if len(results) > 0:
            print " |- " + colored.green("SUCCESS: ") +  "Found %s unique targets." % len(results)
        else:
            print " |- " + colored.yellow("WARNING: ") + "No new targets found."

        # Add targets
        for t in results:
            self.targets.append(t)
Example 4
 def start_crawler(self):
     self.print_info("[✔] Crawler started!")
     self.crawler = Crawler(self.url, self.login_form)
     self.crawler.signals.result_list.connect(self.print_result)
     self.crawler.signals.finish_control.connect(self.finish_control)
     self.crawler.signals.info_box.connect(self.print_info)
     self.threadpool.start(self.crawler)
Example 5
    def scan(self):
        level = int(self.args.get('level', 1))  # post scan level

        if not self.target.startswith(('http', 'HTTP')):
            self.target = 'http://' + self.target
        if not self.target.endswith('/'):
            self.target += '/'
        '''
        for target in gethosts(self.target):
            self.portscan(target)
            pass
        '''
        headers = json.loads(self.args.get('headers', "{}"))
        self.crawle = Crawler(self.target, headers=headers)
        self.crawle.settings.update(level=level)
        #self.crawle.settings.update(proxy={'http':'http://127.0.0.1:1111','https':'http://127.0.0.1:1111'})
        self.crawle.settings.update(self.args)

        th = []
        th.append(threading.Thread(target=self.crawle.run1))
        th.append(threading.Thread(target=self.webscan))
        th.append(threading.Thread(target=self.httpscan))
        for t in th:
            #t.daemon = True
            t.start()
        for t in th:
            t.join()

        # scan finished: write out the httpret results
        self.writewebsite(self.crawle.website)
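
A minimal sketch of the same start-all-then-join thread pattern, with placeholder workers standing in for crawle.run1, webscan and httpscan (names and sleep are illustrative only).

import threading
import time

def worker(tag):
    # placeholder for the real scan tasks
    time.sleep(0.1)
    print("{} finished".format(tag))

threads = [threading.Thread(target=worker, args=(name,))
           for name in ("crawler", "webscan", "httpscan")]
for t in threads:
    t.start()
for t in threads:
    t.join()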
Example 6
class TestCrawler(unittest.TestCase):
    def setUp(self):
        self.crawler = Crawler()

    def test_xici(self):
        res = self.crawler.xici()
        print(res)
        self.assertTrue(len(res) != 0)

    def test_kuaidaili(self):
        res = self.crawler.kuaidaili()
        print(res)
        self.assertTrue(len(res) != 0)

    def test_ip3366(self):
        res = self.crawler.ip3366()
        print(res)
        self.assertTrue(len(res) != 0)

    def test_jiangxianli(self):
        res = self.crawler.jiangxianli()
        print(res)
        self.assertTrue(len(res) != 0)
Example 7
    def _crawlForms(self):
        print "\n[+] Crawling for forms..."

        queue = self._getTargetsQueue()
        crawlers = []
        for i in range(min(self.getOption('threads'), len(self.targets))):
            c = Crawler(self, queue, crawl_forms=True)
            c.setDaemon(True)
            crawlers.append(c)
            c.start()

        # Little hack to kill threads on SIGINT
        while True:
            try:
                if queue.empty() is True:
                    break
                sys.stderr.write("\r |- Remaining targets: %s " %
                                 queue.qsize())
                sys.stderr.flush()
            except KeyboardInterrupt:
                print "\n |- " + colored.yellow(
                    "INTERRUPT!") + " Killing threads..."
                queue = Queue.Queue()
                break

        queue.join()

        # Harvest results
        results = []
        errors = {}
        for c in crawlers:
            # results
            for r in c.results:
                results.append(r)
            # errors
            for ek, ev in c.errors.iteritems():
                if ek in errors:
                    errors[ek] += ev
                else:
                    errors[ek] = ev

        results = set(results)

        if errors:
            print " |--[+] " + colored.red("CRAWL ERRORS!")
            for ek, ev in errors.iteritems():
                print " |   |- %sx: %s" % (len(ev), ek)

        if len(results) > 0:
            print " |- " + colored.green(
                "SUCCESS: ") + "Found %s unique forms." % len(results)
        else:
            print " |- " + colored.yellow("WARNING: ") + "No forms found."

        # Add targets
        for t in results:
            self.targets.append(t)
Example 8
class Adder:
    def __init__(self):
        self._crawler = Crawler()

    def _crawl_fns(self):
        return [
            getattr(self._crawler, fn_name)
            for fn_name in self._crawler.fn_names
        ]

    def add(self):
        raw_ips = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(fn) for fn in self._crawl_fns()]
            for future in concurrent.futures.as_completed(futures):
                try:
                    data = future.result()
                    raw_ips.extend(data)
                except Exception as exc:
                    print('generated an exception: {}'.format(exc))
                else:
                    print('crawl success')
        return [raw_ips[i:i + 100] for i in range(0, len(raw_ips), 100)]
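
A standalone illustration of the 100-per-batch chunking that Adder.add() returns, using hypothetical IP strings instead of real crawler output.

# hypothetical data; the real fn_names crawlers return scraped proxy IPs
raw_ips = ["10.0.0.{}".format(i) for i in range(250)]
batches = [raw_ips[i:i + 100] for i in range(0, len(raw_ips), 100)]
print([len(b) for b in batches])  # [100, 100, 50]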
Example 9
    def run(self, start_urls, scopes=None):
        start_url = start_urls[0]
        self.start()
        start_time = time.time()
        scope = Scope(start_url, options=self.options.scope_options)
        if scopes:
            scope.scopes = [x.strip() for x in scopes.split(',')]
        self.db.start(start_url, scope.host)
        c = None
        s = None
        loader = None

        self.logger.debug("Parsing scan options")
        login = LoginAction(logger=self.logger.getEffectiveLevel())
        pre_run = login.pre_parse(self.options)
        if pre_run:
            self.scan_cookies = dict(login.session_obj.cookies)
        scanoptions = []
        if self.options.custom_options:
            scan_vars = self.options.custom_options.split(',')
            for v in scan_vars:
                opt = v.strip()
                scanoptions.append(opt)
                self.logger.debug("Enabled option %s" % opt)
        if self.options.scanner or self.options.allin:
            s = ScriptEngine(options=scanoptions,
                             logger=self.logger.getEffectiveLevel(),
                             database=self.db)

        if self.options.use_adv_scripts or self.options.allin:
            loader = modules.CustomModuleLoader(
                options=scanoptions,
                logger=self.logger.getEffectiveLevel(),
                database=self.db,
                scope=scope)

            loader.sslverify = self.options.sslverify
            loader.headers = login.headers
            loader.cookies = self.scan_cookies

        todo = []

        c = Crawler(base_url=start_url, logger=self.logger.getEffectiveLevel())
        for login_header in login.headers:
            c.headers[login_header] = login.headers[login_header]
        if self.options.use_crawler or self.options.allin:
            if pre_run:
                c.login = True
                # set cookies from Login module
                cookies = dict(login.session_obj.cookies)
                if cookies and len(cookies):
                    self.logger.debug(
                        "Setting crawler cookies from login module: %s" %
                        str(cookies))
                    c.cookie.append(cookies)
            c.thread_count = self.thread_count
            c.max_urls = int(self.options.maxurls)
            c.scope = scope
            if self.options.user_agent:
                c.headers = {'User-Agent': self.options.user_agent}
            if len(start_urls) != 1:
                for extra_url in start_urls[1:]:
                    c.parse_url(extra_url, extra_url)
            # discovery scripts, pre-run scripts and advanced modules
            if self.options.scanner or self.options.allin:
                self.logger.info("Starting filesystem discovery (pre-crawler)")
                new_links = s.run_fs(start_url)

                for newlink in new_links:
                    c.parse_url(newlink[0], newlink[0])
                if self.options.use_adv_scripts or self.options.allin:
                    self.logger.info("Running custom scripts (pre-crawler)")
                    links = loader.base_crawler(start_url)
                    for link in links:
                        self.logger.debug("Adding link %s from post scripts" %
                                          link)
                        c.parse_url(link, link)

            if self.options.wl_file:
                wf = WebFinder(url=start_url,
                               logger=self.logger.getEffectiveLevel(),
                               word_list=self.options.wl_file,
                               append=self.options.wl_ext,
                               ok_status_codes=self.options.wl_codes,
                               invalid_text=self.options.wl_404,
                               threads=self.thread_count)
                for wf_result in wf.output:
                    c.parse_url(wf_result, start_url)

            self.logger.info("Starting Crawler")
            c.run_scraper()
            self.logger.debug("Cookies set during scan: %s" %
                              (str(c.cookie.cookies)))
            self.scan_cookies = c.cookie.cookies

            self.logger.info("Creating unique link/post data list")
            todo = uniquinize(c.scraped_pages)
        else:
            todo = [[start_url, None]]

        if self.options.driver:
            self.logger.info("Running GhostDriver")

            m = Mefjus(logger=self.logger.getEffectiveLevel(),
                       driver_path=self.options.driver_path,
                       use_proxy=self.options.proxy,
                       proxy_port=self.options.proxy_port,
                       use_https=scope.is_https,
                       show_driver=self.options.show_driver
                       or self.options.interactive)
            results = m.run(todo, interactive=self.options.interactive)
            for res in results:
                if not scope.in_scope(res[0]):
                    self.logger.debug("IGNORE %s.. out-of-scope" % res)
                    continue
                if c.get_filetype(res[0]) in c.blocked_filetypes:
                    self.logger.debug("IGNORE %s.. bad file-type" % res)
                    continue
                if res in c.scraped_pages:
                    self.logger.debug("IGNORE %s.. exists" % res)
                    continue
                else:
                    todo.append(res)
                    self.logger.debug("QUEUE %s" % res)
            self.logger.info("Creating unique link/post data list")
            old_num = len(todo)
            todo = uniquinize(todo)
            self.logger.debug(
                "WebDriver discovered %d more url/post data pairs" %
                (len(todo) - old_num))

        scanner_obj = None
        if self.options.scanner or self.options.allin:
            self.logger.info("Starting scan sequence")
            if len(todo) < self.thread_count:
                # for performance sake
                self.thread_count = len(todo)
            scanner_obj = scanner.Scanner(
                logger=self.logger.getEffectiveLevel(),
                script_engine=s,
                thread_count=self.thread_count)
            scanner_obj.copy_engine = self.options.optimize
            for page in todo:
                url, data = page
                req = Request(url,
                              data=data,
                              agent=self.options.user_agent,
                              headers=login.headers,
                              cookies=self.scan_cookies)
                req.run()
                scanner_obj.queue.put(req)
                scanner_obj.logger.debug("Queued %s %s" % (url, data))
            scanner_obj.run()

        post_results = []
        if self.options.use_adv_scripts or self.options.allin:
            self.logger.info("Running post scripts")
            post_results = loader.run_post(todo, cookies=self.scan_cookies)
        cms_results = None
        if self.options.cms_enabled or self.options.allin:
            cms_loader = ext.libcms.cms_scanner_core.CustomModuleLoader(
                log_level=self.logger.getEffectiveLevel())
            cms_results = cms_loader.run_scripts(start_url)
            if cms_results:
                for cms in cms_results:
                    for cms_result in cms_results[cms]:
                        self.db.put(result_type="CMS Script",
                                    script=cms,
                                    severity=0,
                                    text=cms_result)

        webapp_results = None
        if self.options.webapp_enabled or self.options.allin:
            webapp_loader = WebAppModuleLoader(
                log_level=self.logger.getEffectiveLevel())
            webapp_loader.load_modules()
            webapp_results = webapp_loader.run_scripts(
                start_url,
                scope=scope,
                cookies=self.scan_cookies,
                headers=login.headers)
            if webapp_results:
                for webapp in webapp_results:
                    for webapp_result in webapp_results[webapp]:
                        self.db.put(result_type="WebApp Script",
                                    script=webapp,
                                    severity=0,
                                    text=json.dumps(webapp_result))
        meta = {}
        if self.options.msf:
            monster = metamonster.MetaMonster(
                log_level=self.logger.getEffectiveLevel())
            creds = self.options.msf_creds.split(':')
            monster.username = creds[0]
            monster.password = creds[1]
            monster.host = self.options.msf_host
            monster.port = self.options.msf_port
            monster.ssl = self.options.msf_ssl
            monster.endpoint = self.options.msf_uri
            monster.should_start = self.options.msf_autostart

            monster.connect(start_url)
            if monster.client and monster.client.is_working:
                monster.get_exploits()
                monster.detect()
                queries = monster.create_queries()
                monster.run_queries(queries)
                meta = monster.results
                for working in meta['working']:
                    msf_module, msf_output = working
                    self.db.put(result_type="Metasploit",
                                script=msf_module,
                                severity=3,
                                text=json.dumps(msf_output))

        scan_tree = {
            'start': start_time,
            'end': time.time(),
            'scope': scope.host,
            'starturl': start_url,
            'crawled': len(c.scraped_pages) if c else 0,
            'scanned': len(todo) if self.options.scanner else 0,
            'results':
            scanner_obj.script_engine.results if scanner_obj else [],
            'metasploit': meta,
            'cms': cms_results,
            'webapps': webapp_results,
            'post': post_results if self.options.use_adv_scripts else []
        }

        self.db.end()

        if self.options.outfile:
            with open(self.options.outfile, 'w') as f:
                f.write(json.dumps(scan_tree))
                self.logger.info("Wrote results to %s" % self.options.outfile)
Example 10
 def setUp(self):
     self.crawler = Crawler()
Example 11
def run_crawler():
    # areas = [
    #     ["Africa", "Africa", "Nigeria", "160", "nigeria"],
    #     ["Africa", "Africa", "South Africa 南非", "112", "south-africa"],
    #     ["Africa", "Africa", "Morocco", "159", "morocco"],
    #     ["Africa", "Africa", "Kenya", "247", "kenya"],
    #     ["Asia", "East Asia", "China 中國", "117", "china"],
    #     ["Asia", "East Asia", "Japan 日本", "121", "japan"],
    #     ["Asia", "East Asia", "South Korea", "125", "south-korea"],
    #     ["Asia", "East Asia", "Hong Kong 香港", "118", "hong-kong"],
    #     ["Asia", "SEA", "Indonesia 印尼", "120", "indonesia"],
    #     ["Asia", "SEA", "Thailand 泰國", "126", "thailand"],
    #     ["Asia", "SEA", "Vietnam 越南", "127", "vietnam"],
    #     ["Asia", "South Asia", "India 印度", "119", "india"],
    #     ["Asia", "South Asia", "Pakistan 巴基斯坦", "294", "pakistan"],
    #     ["Asia", "West Asia", "Saudi Arabia 沙烏地阿拉伯", "110", "saudi-arabia"],
    #     ["Australia & Oceania", "Australia & Oceania", "Australia 澳洲", "107", "australia"],
    #     ["Australia & Oceania", "Australia & Oceania", "New Zealand", "161", "new-zealand"],
    #     ["Europe", "Central & West Europe", "Germany 德國", "137", "germany"],
    #     ["Europe", "Central & West Europe", "UK 英國", "156", "united-kingdom"],
    #     ["Europe", "Central & West Europe", "France 法國", "136", "france"],
    #     ["Europe", "Central & West Europe", "Poland 波蘭", "146", "poland"],
    #     ["Europe", "Central & West Europe", "Netherlands 荷蘭", "144", "netherlands"],
    #     ["Europe", "Central & West Europe", "Switzerland 瑞士", "155", "switzerland"],
    #     ["Europe", "Central & West Europe", "Belgium 比利時", "129", "belgium"],
    #     ["Europe", "Central & West Europe", "Austria 奧地利", "128", "austria"],
    #     ["Europe", "Central & West Europe", "Czechia", "132", "czechia"],
    #     ["Europe", "Central & West Europe", "Ireland", "140", "ireland"],
    #     ["Europe", "Eastern Europe", "Russia 俄羅斯", "149", "russia"],
    #     ["Europe", "Northern Europe", "Sweden 瑞典", "154", "sweden"],
    #     ["Europe", "Northern Europe", "Norway", "145", "norway"],
    #     ["Europe", "Northern Europe", "Denmark", "133", "denmark"],
    #     ["Europe", "Northern Europe", "Finland", "135", "finland"],
    #     ["Europe", "Southern Europe", "Italy 義大利", "141", "italy"],
    #     ["Europe", "Southern Europe", "Spain 西班牙", "153", "spain"],
    #     ["Europe", "Southern Europe", "Turkey 土耳其", "113", "turkey"],
    #     ["Europe", "Southern Europe", "Portugal", "147", "portugal"],
    #     ["Europe", "Southern Europe", "Greece 希臘", "138", "greece"],
    #     ["North America", "North America", "USA 美國", "109", "united-states"],
    #     ["North America", "North America", "Mexico 墨西哥", "116", "mexico"],
    #     ["North America", "North America", "Canada 加拿大", "108", "canada"],
    #     ["South America", "South America", "Brazil 巴西", "115", "brazil"],
    #     ["South America", "South America", "Argentina 阿根廷", "114", "argentina"],
    #     ["South America", "South America", "Colombia 哥倫比亞", "158", "colombia"],
    #     ["South America", "South America", "Chile", "157", "chile"],
    #     ["Worldwide", "Worldwide", "", "100", "worldwide"]
    # ]

    prefix_label = ["國家", ""]  # "Country" column header
    areas = [["Worldwide", "全球", "100", "worldwide"],
             ["HongKong", "香港", "118", "hong-kong"],
             ["China", "中國", "117", "china"],
             ["USA", "美國", "109", "united-states"],
             ["India", "印度", "119", "india"], ["Japan", "日本", "121", "japan"],
             ["Brazil", "巴西", "115", "brazil"],
             ["Germany", "德國", "137", "germany"],
             ["UK", "英國", "156", "united-kingdom"],
             ["France", "法國", "136", "france"],
             ["Russia", "俄羅斯", "149", "russia"],
             ["Mexico", "墨西哥", "116", "mexico"],
             ["Indonesia", "印尼", "120", "indonesiaa"],
             ["South Korea", "南韓", "125", "south-korea"],
             ["Italy", "義大利", "141", "italy"],
             ["Canada", "加拿大", "108", "canada"],
             ["Spain", "西班牙", "153", "spain"],
             ["Australia", "澳洲", "107", "australia"],
             ["Turkey", "土耳其", "113", "turkey"],
             ["Argentina", "阿根廷", "114", "argentina"],
             ["Saudi Arabia", "沙烏地阿拉伯", "110", "saudi-arabia"],
             ["Poland", "波蘭", "146", "poland"],
             ["Netherlands", "荷蘭", "144", "netherlands"],
             ["Thailand", "泰國", "126", "thailand"],
             ["Vietnam", "越南", "127", "vietnam"],
             ["Colombia", "哥倫比亞", "158", "colombia"],
             ["Switzerland", "瑞士", "155", "switzerland"],
             ["Sweden", "瑞典", "154", "sweden"],
             ["Belgium", "比利時", "129", "belgium"]]
    tab = {
        "食品類": [{
            "title": "Consumer Market(Non-Carbonated Soft Drinks)",
            "r_type": "20020200",
            "i_type": "non-carbonated-soft-drinks",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(Roast Coffee)",
            "r_type": "30010100",
            "i_type": "roast-coffee",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(Instant Coffee)",
            "r_type": "30010200",
            "i_type": "instant-coffee",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(Tea)",
            "r_type": "30020000",
            "i_type": "tea",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(Processed & Frozen Fruits)",
            "r_type": "40040200",
            "i_type": "processed-frozen-fruits",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(Nuts)",
            "r_type": "40110300",
            "i_type": "nuts",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(Spices & Culinary Herbs)",
            "r_type": "40070300",
            "i_type": "spices-culinary-herbs",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(Snack Food)",
            "r_type": "40110000",
            "i_type": "snack-food",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(Baby Food)",
            "r_type": "40120000",
            "i_type": "baby-food",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(Pet Food)",
            "r_type": "40130000",
            "i_type": "pet-food",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Digital Market(Food & Beverage) ",
            "r_type": "253",
            "i_type": "food-beverages",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }],
        "服飾精品類": [{
            "title": "Digital Market(Fashion)",
            "r_type": "244",
            "i_type": "fashion",
            "labels": ["Revenue", "yoy", "CAGR"],
            "fields": ["revenue", "revenue_yoy", "cagr"]
        }, {
            "title": "Consumer Market(Apparel)",
            "r_type": "90000000",
            "i_type": "apparel",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(Footwear)",
            "r_type": "11000000",
            "i_type": "footwear",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(Eyewear)",
            "r_type": "12020000",
            "i_type": "sunglasses",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(Accessories)",
            "r_type": "13000000",
            "i_type": "accessories",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(Luxury Goods)",
            "r_type": "21000000",
            "i_type": "luxury-goods",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }],
        "3C類": [{
            "title": "Consumer Market",
            "r_type": "15000000",
            "i_type": "consumer-electronics",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Digital Market",
            "r_type": "251",
            "i_type": "consumer-electronics",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(Digital Camera)",
            "r_type": "15010400",
            "i_type": "digital-cameras",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(mobile phones)",
            "r_type": "15020100",
            "i_type": "mobile-phones",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "(Laptops & Tablets)",
            "r_type": "251",
            "i_type": "consumer-electronics",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }],
        "家電類": [{
            "title": "Consumer Market",
            "r_type": "16000000",
            "i_type": "household-appliances",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Digital Market",
            "r_type": "256",
            "i_type": "household-appliances",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(Vacuum Cleaners)",
            "r_type": "16020100",
            "i_type": "vacuum%25C2%25A0cleaners",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(Small Kittchen appliance)",
            "r_type": "16020200",
            "i_type": "small-kitchen-appliances",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(Coffee Machine)",
            "r_type": "16021000",
            "i_type": "coffee-machines",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }],
        "美妝類": [{
            "title": "Consumer Market(Beauty & Personal Care)",
            "r_type": "70000000",
            "i_type": "beauty-personal-care",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Digital Market",
            "r_type": "254",
            "i_type": "personal-care",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }],
        "美妝次分類": [{
            "title": "Consumer Market(cosmetics)",
            "r_type": "70010000",
            "i_type": "cosmetics",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(Face Skincare)",
            "r_type": "70020100",
            "i_type": "face",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(Body Skincare)",
            "r_type": "70020200",
            "i_type": "body",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(personal - hair care)",
            "r_type": "70040000",
            "i_type": "hair-care",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(personal - oral care)",
            "r_type": "70060000",
            "i_type": "oral-care",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(personal - fragrances)",
            "r_type": "70050000",
            "i_type": "fragrances",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }],
        "其他": [{
            "title": "Digital Market - Toys, Hobby & DIY",
            "r_type": "248",
            "i_type": "toys-hobby-diy",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Digital Market - Sports & Outdoor",
            "r_type": "259",
            "i_type": "sports-outdoor",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Digital Market - Hobby & Stationary",
            "r_type": "260",
            "i_type": "hobby-stationery",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market - Home & Laundry Care",
            "r_type": "60000000",
            "i_type": "home-laundry-care",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Tissue & Hygiene Paper",
            "r_type": "80000000",
            "i_type": "tissue-hygiene-paper",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Toys & Hobby",
            "r_type": "19000000",
            "i_type": "toys-hobby",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }],
        "五金類": [{
            "title": "Consumer Market(Lamps & Lighting)",
            "r_type": "17060000",
            "i_type": "lamps-lighting",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Consumer Market(Floor Covering)",
            "r_type": "17060000",
            "i_type": "lamps-lighting",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }, {
            "title": "Digital Market(DIY, Garden & Pets)",
            "r_type": "357",
            "i_type": "diy-garden-pets",
            "labels": ["Revenue, mln USD", "yoy", "CAGR", "ARPC"],
            "fields": ["revenue", "revenue_yoy", "cagr", "arpc"]
        }]
    }

    sheet_index = -1
    wb = Workbook()
    for tab_title in tab.keys():
        print(tab_title)
        # create sheet
        sheet_index += 1
        ws = wb.create_sheet(tab_title, sheet_index)
        # ws.cell(row=4, column=2, value=10)
        # ws.cell(column=col, row=row, value="{0}".format(get_column_letter(col)))
        cols_index = 1
        for data_group_id in range(len(tab[tab_title])):
            data_group = tab[tab_title][data_group_id]
            row_id = 1

            # print('\t{}\t{}\t{}\t{}'.format(data_group['title'], data_group['r_type'], data_group['i_type'],
            #                                 data_group['fields']))

            if data_group_id == 0:
                cols = prefix_label + [data_group['title']
                                       ] + data_group['labels']
                col_start = 1
            else:
                cols = [data_group['title']] + data_group['labels']

            for c in range(len(cols)):
                _ = ws.cell(row=row_id,
                            column=col_start + c,
                            value="{}".format(cols[c]))

            for area in areas:
                row_id += 1
                url = "https://www.statista.com/outlook/{}/{}/{}/{}".format(
                    data_group['r_type'], area[-2], data_group['i_type'],
                    area[-1])

                # _ = ws.cell(row=row_id, column=cols_count, value="{}".format(cols[c]))
                # print("\t\t{}".format(url))
                crawler = Crawler(url)
                d = crawler.data()
                if data_group_id == 0:
                    row_data = [area[0], area[1], url]
                else:
                    row_data = [url]

                for f in data_group['fields']:
                    row_data.append(d[f])
                    # c = "{}\t{}-{}".format(c, f, d[f])

                for c in range(len(row_data)):
                    _ = ws.cell(row=row_id,
                                column=col_start + c,
                                value="{}".format(row_data[c]))

                print(row_data)

                sl = random.choice(SLEEP_SECOND)
                print("*************** sleep {} second ***************".format(
                    sl))
                # if row_id >= 5:
                #     break

                time.sleep(sl)
            col_start += len(cols)

    wb.save(filename="/tmp/statista.xlsx")
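
For reference, a minimal openpyxl sketch of the Workbook / create_sheet / cell / save calls used above (the sheet name and output path are illustrative).

from openpyxl import Workbook

wb = Workbook()                  # a new workbook with one default sheet
ws = wb.create_sheet("demo", 0)  # insert a named sheet at index 0
ws.cell(row=1, column=1, value="Revenue, mln USD")
wb.save(filename="/tmp/demo.xlsx")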
Example 12
 def __init__(self):
     self._crawler = Crawler()
Example 13
def run_spider(cfg, s):
    crawl = Crawler(s, cfg['crawler'])
    crawl.start()