def getResponse(host, movie_url):
    """Request ``movie_url`` through a proxy, retrying until it succeeds.

    Each attempt asks ``getProxy()`` for a proxy, builds an
    ``HttpRequest.httpRequest`` for the URL with that proxy, and increments
    the module-level ``number`` counter.  An attempt whose ``_request``
    attribute is falsy counts as failed and triggers another attempt.

    The previous implementation retried by calling itself recursively but
    discarded the result of that call, so every invocation that needed at
    least one retry returned ``None``.  Retrying in a loop returns the
    successful response in all cases and no longer consumes one stack frame
    per failed attempt.

    Retries are not capped: the function keeps trying until an attempt
    succeeds.

    :param host: forwarded as the first positional argument of
        ``HttpRequest.httpRequest``.
    :param movie_url: forwarded as the ``url`` keyword argument.
    :return: the first ``HttpRequest.httpRequest`` object whose ``_request``
        attribute is truthy.
    """
    global number
    while True:
        # A proxy is requested again on every attempt, as in the original.
        # NOTE(review): presumably getProxy() can hand out a different proxy
        # each time, which is what makes retrying useful -- confirm.
        _proxy = getProxy()
        response = HttpRequest.httpRequest(host, url=movie_url, proxy=_proxy)
        number += 1  # counts attempts, successful or not
        if response._request:
            return response
import os, sys from util import HttpRequest from util import Configs from util import MysqlClient from bs4 import BeautifulSoup reload(sys) sys.setdefaultencoding('utf8') config = Configs.Configs() db = MysqlClient.MysqlDB() request = HttpRequest.httpRequest("cn-proxy.com", "http://cn-proxy.com") html_code = BeautifulSoup(request.response().text) def saveDb(sqls): return db.execute(sqls) def queryDb(sqls): return db.queryFirst(sqls) for tables in html_code.find_all('table', class_="sortable"): for trs in tables.find_all('tr'): tds = [] n = 0 for td in trs.select("td"): try:
<p class="tt cl"> <span>2015/07/31</span> <a href="/subject/26978.html" target="_blank"> <b> <font color="#FF6600">小时代4:灵魂尽头迅雷下载<i>/小时代4:灵魂尽头</i>.2015</font> </b> </a> </p> <p>又名:<a href="/subject/26978.html" target="_blank">小时代4/Tiny Time 4.0</a></p> <p class="des">2015(中国大陆)<em>/</em>杨幂<em>/</em>郭采洁<em>/</em>陈学冬<em>/</em>郭碧婷<em>/</em>谢依霖<em>/</em>李贤宰<em>/</em>锦荣<em>/</em>任言恺<em>/</em>姜潮<em>/</em>王琳<em>/</em>商侃<em>/</em>郭敬明</p> <p class="rt">豆瓣评分:<strong>4</strong><em class="dian">.</em><em class="fm">7</em> <a href="/jumpto.php?aid=26978" rel="nofollow" target="_blank" title="去豆瓣查看影片介绍"><em class="e_db"></em></a></p> ''' reload(sys) sys.setdefaultencoding('utf8') host = "www.bttiantang.com" for p in range(1, 687): response = HttpRequest.httpRequest(host, url="http://"+host+"/?PageNo="+str(p)) html_code = BeautifulSoup(response.response().text) for i in html_code.find_all('div', class_="item"): for j in i.select(".title > .tt"): try: http_path = "http://"+host+j.a['href'] spider_movie.SpiderMovies.delay(host, str(http_path)) except Exception, e: print e,j
Created on PyCharm @author: Edison @date: 15/8/6 下午4:31 @summary: @note: @version: ''' __author__ = 'Edison' import os,sys import spider_movie from util import HttpRequest from bs4 import BeautifulSoup reload(sys) sys.setdefaultencoding('utf8') host = "rrmj.tv" response = HttpRequest.httpRequest(host) html_code = BeautifulSoup(response.response()) for i in html_code.select("#menu3"): print i.href