def site_spider(self):
    """Crawl each target site and persist newly discovered page URLs.

    Seeds the spider with the site itself plus any URLs previously found by
    the search engines (``self.search_engines_result``), fetches pages for
    URLs not already in ``self.page_url_list``, and inserts one document per
    fetched page into the ``url`` collection tagged with this task id and
    ``CollectSource.SITESPIDER``.
    """
    entry_urls_list = []
    for site in self.site_list:
        entry_urls = [site]
        # Seed the crawl with URLs the search-engine stage already found.
        entry_urls.extend(self.search_engines_result.get(site, []))
        entry_urls_list.append(entry_urls)

    site_spider_result = services.site_spider_thread(entry_urls_list)

    # Local set mirror of self.page_url_list gives O(1) membership tests;
    # self.page_url_list itself stays a list (append order preserved).
    seen_urls = set(self.page_url_list)
    for site in site_spider_result:
        new_target_urls = []
        for url in site_spider_result[site]:
            if url in seen_urls:
                continue
            seen_urls.add(url)
            new_target_urls.append(url)
            self.page_url_list.append(url)

        # Nothing new to fetch for this site — skip the fetch entirely
        # (matches the behavior of the simpler site_spider variant).
        if not new_target_urls:
            continue

        # domain_parsed depends only on the site, not on each URL — hoist
        # it out of the per-URL loop instead of recomputing every iteration.
        domain_parsed = utils.domain_parsed(site)

        page_map = services.page_fetch(new_target_urls)
        for url in page_map:
            item = {
                "site": site,
                "task_id": self.task_id,
                "source": CollectSource.SITESPIDER
            }
            item.update(page_map[url])
            if domain_parsed:
                item["fld"] = domain_parsed["fld"]
            utils.conn_db('url').insert_one(item)
def search_engines(self):
    """Collect page URLs for each site via search engines and persist them.

    Runs the module-level ``search_engines`` helper over ``self.site_list``,
    caches the per-site result on ``self.search_engines_result``, fetches each
    returned URL, records it in ``self.page_url_list``, and inserts one
    document per fetched page into the ``url`` collection tagged with this
    task id and ``CollectSource.SEARCHENGINE``.
    """
    self.search_engines_result = search_engines(self.site_list)
    for site in self.search_engines_result:
        target_urls = self.search_engines_result[site]
        page_map = services.page_fetch(target_urls)

        # domain_parsed depends only on the site, not on each URL — hoist
        # it out of the per-URL loop instead of recomputing every iteration.
        domain_parsed = utils.domain_parsed(site)

        for url in page_map:
            self.page_url_list.append(url)
            item = {
                "site": site,
                "task_id": self.task_id,
                "source": CollectSource.SEARCHENGINE
            }
            item.update(page_map[url])
            if domain_parsed:
                item["fld"] = domain_parsed["fld"]
            utils.conn_db('url').insert_one(item)
def site_spider(self):
    """Crawl each target site and persist the fetched page records.

    Each site in ``self.site_list`` is used as its own crawl seed. Every
    page returned by the spider is fetched, and one document per fetched
    URL is inserted into the ``url`` collection, tagged with this task id
    and ``CollectSource.SITESPIDER``.
    """
    # One single-element seed list per site.
    entry_urls_list = [[site] for site in self.site_list]

    spider_result = services.site_spider_thread(entry_urls_list)
    for site, discovered in spider_result.items():
        target_urls = list(discovered)
        if not target_urls:
            continue  # the spider found nothing for this site

        page_map = services.page_fetch(target_urls)
        for url, page_info in page_map.items():
            record = {
                "site": site,
                "task_id": self.task_id,
                "source": CollectSource.SITESPIDER
            }
            record.update(page_info)
            utils.conn_db('url').insert_one(record)