Example #1
File: domain.py Project: zx273983653/ARL
    def site_spider(self):
        # Build one entry-URL list per site: the site itself plus any URLs the
        # search-engine step already collected for it.
        entry_urls_list = []
        for site in self.site_list:
            entry_urls = [site]
            entry_urls.extend(self.search_engines_result.get(site, []))
            entry_urls_list.append(entry_urls)

        site_spider_result = services.site_spider_thread(entry_urls_list)
        for site in site_spider_result:
            target_urls = site_spider_result[site]
            # Keep only URLs that have not been recorded yet.
            new_target_urls = []
            for url in target_urls:
                if url in self.page_url_list:
                    continue
                new_target_urls.append(url)

                self.page_url_list.append(url)

            page_map = services.page_fetch(new_target_urls)
            for url in page_map:
                item = {
                    "site": site,
                    "task_id": self.task_id,
                    "source": CollectSource.SITESPIDER
                }
                item.update(page_map[url])

                # Enrich the record with the site's first-level domain when it parses.
                domain_parsed = utils.domain_parsed(site)

                if domain_parsed:
                    item["fld"] = domain_parsed["fld"]

                utils.conn_db('url').insert_one(item)
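
The loops above lean on a few data shapes the snippet itself does not show: `search_engines_result` and `site_spider_result` are assumed to map a site to a list of URLs, and `services.page_fetch` to return a dict keyed by URL. A minimal sketch of the entry-list construction and the dedup step, with made-up values (hypothetical data, not from the ARL project):

    # Hypothetical shapes assumed by the example above (illustrative values only).
    site_list = ["http://example.com"]
    search_engines_result = {"http://example.com": ["http://example.com/login"]}  # site -> extra entry URLs
    page_url_list = ["http://example.com/login"]                                  # URLs collected earlier

    # One entry-URL list per site: the site itself plus its search-engine hits.
    entry_urls_list = []
    for site in site_list:
        entry_urls = [site]
        entry_urls.extend(search_engines_result.get(site, []))
        entry_urls_list.append(entry_urls)

    # The spider result is assumed to map each site to the URLs discovered under it.
    site_spider_result = {"http://example.com": ["http://example.com/login", "http://example.com/admin"]}
    for site, target_urls in site_spider_result.items():
        new_target_urls = [u for u in target_urls if u not in page_url_list]
        page_url_list.extend(new_target_urls)
        print(site, new_target_urls)  # -> http://example.com ['http://example.com/admin']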
Example #2
File: domain.py Project: zx273983653/ARL
    def search_engines(self):
        # Query the search engines for every site, then fetch the pages they return.
        self.search_engines_result = search_engines(self.site_list)
        for site in self.search_engines_result:
            target_urls = self.search_engines_result[site]
            page_map = services.page_fetch(target_urls)

            for url in page_map:
                self.page_url_list.append(url)
                item = {
                    "site": site,
                    "task_id": self.task_id,
                    "source": CollectSource.SEARCHENGINE
                }

                item.update(page_map[url])

                domain_parsed = utils.domain_parsed(site)

                if domain_parsed:
                    item["fld"] = domain_parsed["fld"]

                utils.conn_db('url').insert_one(item)
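
Both examples attach the site's first-level domain ("fld") to every record via `utils.domain_parsed`, whose implementation is not shown here. As an illustration only, a comparable result could be produced with the tldextract package (a hypothetical stand-in, not the ARL helper):

    import tldextract

    def domain_parsed_sketch(site):
        """Hypothetical stand-in for utils.domain_parsed: return a dict carrying
        the registered (first-level) domain, or None when nothing can be parsed."""
        ext = tldextract.extract(site)
        if not ext.registered_domain:
            return None
        return {"fld": ext.registered_domain, "subdomain": ext.subdomain, "domain": ext.domain}

    print(domain_parsed_sketch("https://www.example.co.uk"))
    # -> {'fld': 'example.co.uk', 'subdomain': 'www', 'domain': 'example'}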
Example #3
    def site_spider(self):
        # In this variant the spider is seeded with the site URL alone.
        entry_urls_list = []
        for site in self.site_list:
            entry_urls_list.append([site])

        site_spider_result = services.site_spider_thread(entry_urls_list)
        for site in site_spider_result:
            target_urls = []
            target_urls.extend(site_spider_result[site])

            # Skip sites for which the spider found nothing.
            if not target_urls:
                continue

            page_map = services.page_fetch(target_urls)
            for url in page_map:
                item = {
                    "site": site,
                    "task_id": self.task_id,
                    "source": CollectSource.SITESPIDER
                }
                item.update(page_map[url])

                utils.conn_db('url').insert_one(item)
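
All three examples persist each record with `utils.conn_db('url').insert_one(item)`. Assuming `conn_db` hands back a pymongo Collection (the helper is not shown, so this is an assumption), the equivalent direct call would look roughly like this; the connection string, database name, and field values are placeholders:

    from pymongo import MongoClient

    # Placeholder connection details; assumes conn_db('url') wraps a pymongo Collection.
    client = MongoClient("mongodb://127.0.0.1:27017")
    url_collection = client["arl"]["url"]

    item = {
        "site": "http://example.com",
        "task_id": "demo-task-id",   # placeholder
        "source": "site_spider",     # placeholder for CollectSource.SITESPIDER
        "fld": "example.com",
    }
    url_collection.insert_one(item)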