from bs4 import BeautifulSoup


def parseAndSave(content, currentWebsite):
    # save_html_content(currentWebsite.id, websiteContents)
    soup = BeautifulSoup(content, 'lxml')
    items = soup.find_all('a')
    print("A Items", len(items))
    COUNT = 0
    if items:
        for a in items:
            if a.string:
                # Note: the original encoded text to UTF-8 bytes here; keep it
                # as text so save_info_feed receives the same type as in extract().
                url, text = a.get('href'), a.string.strip()
                check_pass = check_content(url, text)
                if check_pass:
                    url = complement_url(url, currentWebsite.url)
                    if url:
                        result = save_info_feed(url, text, currentWebsite.id,
                                                currentWebsite.company.id)
                        if result:
                            COUNT += 1
    if COUNT == 0:
        log(NOTICE, "#{id} {name} {site} fetched no updates ({count})".format(
            id=currentWebsite.company.id,
            name=currentWebsite.company.name_cn,
            site=currentWebsite.url, count=COUNT))
    else:
        log(RECORD, "#{id} {name} {site} fetched {count} updates".format(
            id=currentWebsite.company.id,
            name=currentWebsite.company.name_cn,
            site=currentWebsite.url, count=COUNT))
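# parseAndSave leans on check_content, complement_url, and save_info_feed,
# which are defined elsewhere in this repo. As a rough, hypothetical sketch of
# the contract complement_url is assumed to satisfy (resolve a relative href
# against the page URL and drop non-HTTP links); this is not the project's
# actual implementation:

from urllib.parse import urljoin, urlparse


def complement_url_sketch(href, base_url):
    # Skip empty hrefs and same-page fragments.
    if not href or href.startswith('#'):
        return None
    absolute = urljoin(base_url, href)
    # Drop mailto:, javascript:, and other non-HTTP schemes.
    if urlparse(absolute).scheme not in ('http', 'https'):
        return None
    return absolute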
def extract(w_id): """ :param w_id: :return: """ try: # 列举出所有没能成功抓取更新的情况,并在log中记录。 w = get_website(w_id) # log(NOTICE, "开始 #{id} {name} {site} ".format(id=w.id, name=w.company.name_cn, site=w.url)) # Todo 此处尝试调用Scrapy new_html_content = crawl(w.url) if not new_html_content: log( NOTICE, "#{id} {name} {site} 抓到更新 0 条".format(id=w.company.id, name=w.company.name_cn, site=w.url)) return # if current website 'w' already have html_content. compare it with 'new_content' and save those when 'diff' exist. if w.html_content: old_html_content = w.html_content.content else: save_html_content(w.id, new_html_content) log( NOTICE, "#{id} {name} {site} 抓到更新 0 条".format(id=w.company.id, name=w.company.name_cn, site=w.url)) return diff_text = diff_file(old_html_content, new_html_content) if not diff_text: log( NOTICE, "#{id} {name} {site} 抓到更新 0 条".format(id=w.company.id, name=w.company.name_cn, site=w.url)) return save_html_content(w.id, new_html_content) # lxml是一个html解析器,与它类似的还有html5lib等。 soup = BeautifulSoup(diff_text, 'lxml') items = soup.find_all('a') COUNT = 0 # 基本逻辑:抓取所有<a href>标签,check内容是否合规,是则该标签的url补全,存入info_feed表中。 if items: for a in items: if a.string: url, text = a.get('href'), a.string check_pass = check_content(url, text) if check_pass: url = complement_url(url, w.url) if url: result = save_info_feed(url, text, w.id, w.company.id) if result: COUNT += 1 # log(RECORD, "[name] [+] [{url} {text}]".format(name=w.company.name_cn, url=url, text=text.strip())) if COUNT == 0: log( NOTICE, "#{id} {name} {site} 抓到更新 {count} 条".format( id=w.company.id, name=w.company.name_cn, site=w.url, count=COUNT)) else: log( RECORD, "#{id} {name} {site} 抓到更新 {count} 条".format( id=w.company.id, name=w.company.name_cn, site=w.url, count=COUNT)) except Exception as e: try: w = get_website(w_id) log( ERROR, "#{id} {name} {site} {err}".format(id=w.id, name=w.company.name_cn, site=w.url, err=str(e))) except Exception as e: log(ERROR, str(e))