Code Example #1
def main():
    log.info('程序启动')  # "program started"
    try:
        threading.Thread(target=checkIpMain).start()  # proxy-testing thread (Code Example #5)
        threading.Thread(target=updata).start()       # pool-update thread (Code Example #3)
    except:
        main()  # restart on any startup failure
Code Example #2
File: Ip2Db.py  Project: zhouboboya/spider
def insertOne(ip, fun, db, sign):
    try:
        if fun(ip):  # fun is the validator returned by db.getInputMethod() (see Code Example #10)
            db.insert_mongo(ip)
            log.info('入库{}ip:{}'.format(sign, ip))  # "stored ip {ip} into {sign}"
    except:
        pass  # silently ignore validation / insert errors
Code Example #3
def updata():
    log.info('更新线程启动!!!')  # "update thread started!!!"
    while True:
        try:
            acquire(1)
            time.sleep(6)
        except:
            traceback.print_exc()
            log.error("更新时有异常。。。。")  # "exception while updating"
            time.sleep(2)
Code Example #4
def acquireIp():
    aUrl = getUrl()
    log.info('获取ip地址:{}'.format(aUrl))  # "fetching ip addresses from: {url}"
    try:
        response = requests.get(aUrl, headers=header, timeout=5)
        if response.status_code == 200:
            parseHtml(response.text)
    except:
        # traceback.print_exc()
        log.error('请求ip异常:{}'.format(aUrl))  # "ip request failed: {url}"
Code Example #5
def checkIpMain():
    while True:
        try:
            log.info('测试线程执行!!!')  # "test thread running!!!"
            testIp()
            deleteIp()
            time.sleep(6)
        except:
            traceback.print_exc()
            log.error("测试时有异常。。。。")  # "exception while testing"
            time.sleep(2)
Code Example #6
File: SubProcess.py  Project: btrent/knave
 def write (self, data):
     if self.channelsClosed:
         log.warn("Chan closed for %r" % data, self.defname)
         return
     log.info(data, self.defname)
     self.inChannel.write(data)
     if data.endswith("\n"):
         try:
             self.inChannel.flush()
         except gobject.GError as e:
             log.error(str(e)+". Last line wasn't sent.\n", self.defname)
Code Example #7
File: Indexer.py  Project: snakearrow/crawler
    def save(self, url: str, title: str, keywords):
        # skip already indexed urls
        if self.url_already_indexed(url):
            log.info(f"URL {url} already indexed, skipping")
            return

        doc = {
            'url_hash': self.to_md5(url),
            'url': url,
            'title': title,
            'keywords': keywords,
            'timestamp': datetime.now(),
        }
        res = self._es.index(index=self._index_name, body=doc)
        if res['result'] != "created":
            log.warning(f"Could not index {url}, result was: {res['result']}")
        else:
            log.info(f"Indexed {url}: {title}")
Code Example #8
File: Indexer.py  Project: snakearrow/crawler
    def __init__(self,
                 hostname: str,
                 port: str,
                 user: str = None,
                 password: str = None,
                 index_name="crawler_main"):
        self._index_name = index_name
        if user and password:
            url = f"http://{user}:{password}@{hostname}:{port}"
        elif not user and not password:
            url = f"http://{hostname}:{port}"
        else:
            # only one of user / password was provided
            raise RuntimeError(
                "Please specify user and password for elasticsearch connection"
            )

        self._es = Elasticsearch([url])

        if not self._es.indices.exists(index=self._index_name):
            mapping = {
                "mappings": {
                    "properties": {
                        "url_hash": {
                            "type": "keyword",
                            "index": "true"
                        }
                    }
                }
            }
            self._es.indices.create(index=self._index_name, body=mapping)
            log.info(f"Index {self._index_name} created")
        else:
            res = self._es.cat.count(self._index_name,
                                     params={"format": "json"})
            count = res[0]['count']
            log.info(
                f"Index {self._index_name} already contains {count} entries")
Code Example #9
 def crawl(self, root_url: str, depth=0):
     self._url_list.append(root_url)
     current_idx = 0
     
     while current_idx < len(self._url_list):
         url = self._url_list[current_idx]
         print(80*"-")
         log.info(f"Processing {url}")
         current_idx += 1
         
         if is_url_filtered(url):
             log.info("URL is filtered, skipping")
             continue
             
         if len(url) >= self._max_url_length:
             log.info(f"URL is too long (max_length={self._max_url_length}), skipping")
             continue
     
         try:
             if not self.is_html(url):
                 log.info("URL is not HTML, skipping")
                 continue
             
             req = Request(url, headers=self._header)
             response = urlopen(req, timeout=3)
             content = response.read().decode(errors='ignore')
         except Exception as e:
             log.error(e)
             log.info("An error occurred while opening URL, skipping")
             continue
     
         # detect if url is an entirely new domain and reset counter
         if self.get_domain(url) != self._previous_domain:
             self._current_on_site = 0
         else:
             self._current_on_site += 1
         
         self._previous_domain = self.get_domain(url)
     
         # get title and check whether it's latin
         title = self.get_title(content)
         if title:
             title = title.strip()
             if self.is_latin(title):
                 keywords = get_keywords(content)
                 self._indexer.save(url, title, keywords)
             else:
                 log.info(f"Skipping because: Title not latin ('{title}')")
                 continue
 
         # extract links from html
         soup = BeautifulSoup(content, features="lxml")
         to_crawl = []
         cnt = 0
         for link in soup.findAll('a'):
             l = link.get('href')
             if l and (l.startswith("http") or l[0] == '/'):
                 if l[0] == '/':
                     l = urljoin(url, l)
                     
                 # discard too many links on same domain to prevent cycles
                 if self._current_on_site <= self._max_stay_on_site:
                     if not self._indexer.url_already_indexed(l) and l not in self._url_list:
                         self._url_list.append(l)
                         cnt += 1
                 # but make sure to append 'foreign' URLs in every case
                 if self.get_domain(url) != self.get_domain(l):
                     self._url_list.append(l)
                     cnt += 1
             if cnt >= self._max_new_urls_per_page:
                 break
             
         log.info(f"URLs found: {len(self._url_list)} ({cnt} new)")
     
         # check whether to clean URL list so it doesn't get too big
         if len(self._url_list) >= self._max_urls_in_list:
             len_before = len(self._url_list)
             self.purge_url_list(self.get_domain(url))
             len_after = len(self._url_list)
             log.info(f"Purged URL list (removed {len_before - len_after} entries)")
             current_idx = 0
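As a hedged sketch only: the method above presumably lives on a crawler class in the snakearrow/crawler project; the class name Crawler, its module, and its constructor arguments are assumptions, since the snippet shows just the method body:

from Crawler import Crawler  # hypothetical import; the real module/class name is not shown

crawler = Crawler()                    # constructor arguments are not shown in the snippet
crawler.crawl("https://example.org/")  # seeds _url_list with the root url and works through it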
Code Example #10
File: Ip2Db.py  Project: zhouboboya/spider
def insert(ip):
    log.info("获取新的ip:{}".format(str(ip)))  # "got new ip: {ip}"
    # one insertOne thread per configured db pool (see Code Example #2)
    for db in pools:
        threading.Thread(target=insertOne,
                         args=(ip, db.getInputMethod(), db,
                               db.getDesc())).start()
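insert fans each new ip out to one insertOne thread per entry in pools. A self-contained sketch of what one such pool entry might look like, assuming only the getInputMethod/getDesc/insert_mongo interface implied by the two snippets; the class name DummyPool and its trivial validator are illustrative, not part of the project:

class DummyPool:
    """Illustrative stand-in for one entry of `pools` in Ip2Db.py."""

    def __init__(self, desc):
        self._desc = desc
        self._stored = []

    def getInputMethod(self):
        # validator passed to insertOne as `fun`; the ip is stored only if it returns truthy
        return lambda ip: bool(ip)

    def getDesc(self):
        return self._desc

    def insert_mongo(self, ip):
        self._stored.append(ip)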
Code Example #11
 def delect(self):
     log.info('删除无用ip:{}-->{}'.format(self.getDesc(), self.instance.delect()))  # "removed stale ips: {desc} --> {result}"
Code Example #12
File: Indexer.py  Project: snakearrow/crawler
 def delete_index(self):
     self._es.indices.delete(index=self._index_name)
     log.info(f"Deleted index {self._index_name}")