def main():
    log.info('Program started')
    try:
        # start the proxy check thread and the update thread
        threading.Thread(target=checkIpMain).start()
        threading.Thread(target=updata).start()
    except Exception:
        traceback.print_exc()
        log.error('Failed to start worker threads, restarting')
        main()
def insertOne(ip, fun, db, sign):
    try:
        # fun is the validation function for this proxy pool; store the ip only if it passes
        if fun(ip):
            db.insert_mongo(ip)
            log.info('Stored {} ip: {}'.format(sign, ip))
    except Exception:
        # a failed check or insert for a single ip is ignored
        pass
def updata():
    log.info('Update thread started')
    while True:
        try:
            acquire(1)
            time.sleep(6)
        except Exception:
            traceback.print_exc()
            log.error('Exception while updating')
            time.sleep(2)
def acquireIp():
    aUrl = getUrl()
    log.info('Fetching ips from: {}'.format(aUrl))
    try:
        response = requests.get(aUrl, headers=header, timeout=5)
        if response.status_code == 200:
            parseHtml(response.text)
    except Exception:
        # traceback.print_exc()
        log.error('Request for ips failed: {}'.format(aUrl))
def checkIpMain():
    while True:
        try:
            log.info('Check thread running')
            testIp()
            deleteIp()
            time.sleep(6)
        except Exception:
            traceback.print_exc()
            log.error('Exception while checking')
            time.sleep(2)
def write(self, data):
    if self.channelsClosed:
        log.warn("Chan closed for %r" % data, self.defname)
        return
    log.info(data, self.defname)
    self.inChannel.write(data)
    if data.endswith("\n"):
        try:
            self.inChannel.flush()
        except gobject.GError as e:
            log.error(str(e) + ". Last line wasn't sent.\n", self.defname)
def save(self, url: str, title: str, keywords):
    # skip already indexed urls
    if self.url_already_indexed(url):
        log.info(f"URL {url} already indexed, skipping")
        return
    doc = {
        'url_hash': self.to_md5(url),
        'url': url,
        'title': title,
        'keywords': keywords,
        'timestamp': datetime.now(),
    }
    res = self._es.index(index=self._index_name, body=doc)
    if res['result'] != "created":
        log.warning(f"Could not index {url}, result was: {res['result']}")
    else:
        log.info(f"Indexed {url}: {title}")
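# save() above relies on to_md5() and url_already_indexed(), which are not shown in this
# section. Below is a minimal sketch of how they might look, written as standalone
# functions against the same Elasticsearch client and index used in the constructor
# below; the names and the term-query approach are assumptions, not taken from the source.
import hashlib
from elasticsearch import Elasticsearch

def to_md5(url: str) -> str:
    # hash the URL so it can be stored and matched as a fixed-length keyword
    return hashlib.md5(url.encode('utf-8')).hexdigest()

def url_already_indexed(es: Elasticsearch, index_name: str, url: str) -> bool:
    # look for a document whose url_hash matches the hash of this URL
    res = es.search(index=index_name, body={"query": {"term": {"url_hash": to_md5(url)}}})
    return res['hits']['total']['value'] > 0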
def __init__(self, hostname: str, port: str, user: str = None, password: str = None, index_name="crawler_main"):
    self._index_name = index_name
    if user and password:
        url = f"http://{user}:{password}@{hostname}:{port}"
    elif not user and not password:
        url = f"http://{hostname}:{port}"
    else:
        raise RuntimeError(
            "Please specify both user and password for the elasticsearch connection"
        )
    self._es = Elasticsearch([url])
    if not self._es.indices.exists(index=self._index_name):
        mapping = {
            "mappings": {
                "properties": {
                    "url_hash": {
                        "type": "keyword",
                        "index": "true"
                    }
                }
            }
        }
        self._es.indices.create(index=self._index_name, body=mapping)
        log.info(f"Index {self._index_name} created")
    else:
        res = self._es.cat.count(self._index_name, params={"format": "json"})
        count = res[0]['count']
        log.info(f"Index {self._index_name} already contains {count} entries")
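# Hypothetical usage of the constructor above. The enclosing class name "Indexer" and the
# connection values are assumptions for illustration, not taken from the source:
#
#     indexer = Indexer(hostname="localhost", port="9200")                          # anonymous access
#     indexer = Indexer("localhost", "9200", user="elastic", password="changeme")   # credentials in the URL
#     indexer.save("https://example.com/", "Example Domain", ["example", "domain"])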
def crawl(self, root_url: str, depth=0):
    self._url_list.append(root_url)
    current_idx = 0
    while current_idx < len(self._url_list):
        url = self._url_list[current_idx]
        print(80 * "-")
        log.info(f"Processing {url}")
        current_idx += 1
        if is_url_filtered(url):
            log.info("URL is filtered, skipping")
            continue
        if len(url) >= self._max_url_length:
            log.info(f"URL is too long (max_length={self._max_url_length}), skipping")
            continue
        try:
            if not self.is_html(url):
                log.info("URL is not HTML, skipping")
                continue
            req = Request(url, headers=self._header)
            response = urlopen(req, timeout=3)
            content = response.read().decode(errors='ignore')
        except Exception as e:
            log.error(e)
            log.info("An error occurred while opening URL, skipping")
            continue
        # detect if url is an entirely new domain and reset counter
        if self.get_domain(url) != self._previous_domain:
            self._current_on_site = 0
        else:
            self._current_on_site += 1
        self._previous_domain = self.get_domain(url)
        # get title and check whether it's latin
        title = self.get_title(content)
        if title:
            title = title.strip()
            if self.is_latin(title):
                keywords = get_keywords(content)
                self._indexer.save(url, title, keywords)
            else:
                log.info(f"Skipping because: Title not latin ('{title}')")
                continue
        # extract links from html
        soup = BeautifulSoup(content, features="lxml")
        to_crawl = []
        cnt = 0
        for link in soup.findAll('a'):
            l = link.get('href')
            if l and (l.startswith("http") or l[0] == '/'):
                if l[0] == '/':
                    l = urljoin(url, l)
                # discard too many links on same domain to prevent cycles
                if self._current_on_site <= self._max_stay_on_site:
                    if not self._indexer.url_already_indexed(l) and l not in self._url_list:
                        self._url_list.append(l)
                        cnt += 1
                # but make sure to append 'foreign' URLs in every case
                elif self.get_domain(url) != self.get_domain(l):
                    self._url_list.append(l)
                    cnt += 1
                if cnt >= self._max_new_urls_per_page:
                    break
        log.info(f"URLs found: {len(self._url_list)} ({cnt} new)")
        # check whether to clean URL list so it doesn't get too big
        if len(self._url_list) >= self._max_urls_in_list:
            len_before = len(self._url_list)
            self.purge_url_list(self.get_domain(url))
            len_after = len(self._url_list)
            log.info(f"Purged URL list (removed {len_before - len_after} entries)")
            current_idx = 0
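# crawl() above calls self.is_html() before downloading a page, but that helper is not
# shown in this section. A minimal sketch of one way it could work, written as a
# standalone function; the name, the HEAD-request approach, and the header handling are
# assumptions, not taken from the source.
from urllib.request import Request, urlopen

def is_html(url: str, timeout: int = 3) -> bool:
    # issue a HEAD request and inspect the Content-Type header
    req = Request(url, method="HEAD")
    try:
        with urlopen(req, timeout=timeout) as response:
            content_type = response.headers.get("Content-Type", "")
    except Exception:
        return False
    return "text/html" in content_type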
def insert(ip):
    log.info("Got new ip: {}".format(str(ip)))
    for db in pools:
        threading.Thread(target=insertOne, args=(ip, db.getInputMethod(), db, db.getDesc())).start()
def delect(self):
    log.info('Deleted useless ips: {} --> {}'.format(self.getDesc(), self.instance.delect()))
def delete_index(self):
    self._es.indices.delete(index=self._index_name)
    log.info(f"Deleted index {self._index_name}")