def download_page(self, urlmd5, url, proxy):
    downloader = PageDownload(proxy=proxy, timeout=5)
    page = downloader.simple_download(url=url)
    if page is not None:
        company_name = self.extract_field_from_page(
            page=page,
            reg=r"""<meta name="keywords" content="([^=]+)"/>""",
            index=-1)
        company_net = self.extract_field_from_page(
            page=page,
            reg=r'"/companyurlimg.php\?url=([^<]+)" alt="myImage" style="border:none')
        address = self.extract_field_from_page(
            page=page.decode("utf-8"),
            reg="<p>([^=]+)</p>",
            index=-1)
        # if file_name is None:
        #     file_name = self.extract_field_from_page(page=page, reg=r'<h1>([^<]+)下载?</h1>')
        sql = "insert into " + self.page_table + " values (%s,%s,%s,%s,%s,now())"
        db = MysqlHandle()
        db.insert(sql=sql, value_list=[(urlmd5, url, company_name, company_net, address)])
        db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def generate_samples():
    base_url = "http://api.map.baidu.com/geoconv/v1/?coords=%s,%s&from=5&to=6&ak=XwpZGfXMn45W9Czd1UwmC6RwMMULD1Ue"
    work_book = xlwt.Workbook()
    sheet = work_book.add_sheet("sheet")
    sql = "select x , y from bdmap_api_school_218_page_table LIMIT 1000"
    db = MysqlHandle()
    query_res = db.query(sql)
    i = 0
    for (x, y) in query_res:
        url = base_url % (str(x), str(y))
        downloader = PageDownload()
        page = downloader.simple_download(url)
        if is_json(page):
            json_page = json.loads(page)
            status = json_page["status"]
            if status == 0:
                new_x = json_page["result"][0]["x"]
                new_y = json_page["result"][0]["y"]
                print (x, y, new_x, new_y)
                sheet.write(i, 0, x)
                sheet.write(i, 1, y)
                sheet.write(i, 2, new_x)
                sheet.write(i, 3, new_y)
                i = i + 1
    work_book.save("sample.xls")
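# Note: is_json() is called by several functions in this section but is not
# defined here. The helper below is a minimal sketch of the assumed behaviour
# (return True only if the text parses as JSON), not the project's actual
# implementation.
import json

def is_json(text):
    if text is None:
        return False
    try:
        json.loads(text)
        return True
    except ValueError:
        return False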
def download_page(self, urlmd5, url, proxy):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if page is not None:
        # print page.decode("utf-8")
        file_name = self.extract_field_from_page(page=page, reg=r'专辑名称:([^<]+)')
        if file_name is None:
            file_name = self.extract_field_from_page(page=page, reg=r'<h1 class="title">([^<]+)</h1>')
        music_type = self.extract_field_from_page(page=page, reg=r' <a href="/\w+/">([^<]+)</a>')
        # if file_name is None:
        #     file_name = self.extract_field_from_page(page=page, reg=r'<h1>([^<]+)下载?</h1>')
        singer_name = self.extract_field_from_page(page=page, reg=r'专辑艺人:([^<]+)')
        baiduyun_url = self.extract_field_from_page(
            page=page,
            reg=r"""<a href="#ecms" onclick="window.open\('([^<]+)','','width=300,height=300,resizable=yes'\)""")
        print baiduyun_url
        if baiduyun_url is None:
            return False
        baiduyun_url = self.domain + baiduyun_url
        baiduyun_password = self.extract_field_from_page(page=page, reg=r'密码: (\w+)')
        sql = "insert into " + self.page_table + " values (%s,%s,%s,%s,%s,%s,%s,now())"
        db = MysqlHandle()
        db.insert(sql=sql, value_list=[(urlmd5, url, file_name, music_type, singer_name, baiduyun_url, baiduyun_password)])
        db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def download_page(self, urlmd5, url, proxy, url_table, page_table):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if page is not None:
        file_name = self.extract_field_from_page(
            page=page,
            reg=r'<h1 class="yh mt_1 f_32">([^<]+\.[a-z]+)</h1>')
        file_size = self.extract_field_from_page(
            page=page,
            reg=r'<h3 class="c999 fl mt_05 f_12 n yh"><em class="n ml_1 mr_1">·</em>(\d+\.?\d+M)</h3>')
        baiduyun_url = self.extract_field_from_page(
            page=page,
            reg=r'href="(https?://pan.baidu.com/[^\s]+)"')
        baiduyun_password = self.extract_field_from_page(
            page=page,
            reg=r'<em class="dn"></em>密码:(\w+)</b>')
        sql = "insert into " + page_table + " values (%s,%s,%s,%s,%s,%s,now())"
        db = MysqlHandle()
        db.insert(sql=sql, value_list=[(urlmd5, url, file_name, file_size, baiduyun_url, baiduyun_password)])
        db.close()
        update_sql = "update " + url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def download_page(self, urlmd5, url, proxy):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if page is not None:
        file_name = self.extract_field_from_page(
            page=page,
            reg=r'<li class="fl ml_1 mt_08 c999">([^=]+)</li>')
        # if file_name is None:
        #     file_name = self.extract_field_from_page(page=page, reg=r'<h1>([^<]+)下载?</h1>')
        singer_name = self.extract_field_from_page(
            page=page,
            reg=r'<li><a class="fl c3b ml_1 mt_08" href="http://www.51ape.com/[^=]+/" title="[^=]+">([^=]+)</a></li>')
        baiduyun_url = self.extract_field_from_page(
            page=page,
            reg=r'href="(https?://pan.baidu.com/s/[^=]+)"')
        baiduyun_password = self.extract_field_from_page(
            page=page,
            reg=r'提取<em class="dn"></em>密码:(\w+)</b>')
        sql = "insert into " + self.page_table + " values (%s,%s,%s,%s,%s,%s,now())"
        db = MysqlHandle()
        db.insert(sql=sql, value_list=[(urlmd5, url, file_name, singer_name, baiduyun_url, baiduyun_password)])
        db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def download_page(self, urlmd5, url, proxy):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if page is not None:
        file_name = self.extract_field_from_page(
            page=page,
            reg=r'<h1>([^<]+)\[FLAC格式\]下载</h1>')
        if file_name is None:
            file_name = self.extract_field_from_page(
                page=page,
                reg=r'<h1>([^<]+)下载?</h1>')
        baiduyun_url = self.extract_field_from_page(
            page=page,
            reg=r'<meta name="description" content="[^<]*(https?://pan.baidu.com/[^\s]+) [^<]+"/>')
        baiduyun_password = self.extract_field_from_page(
            page=page,
            reg=r'<meta name="description" content="[^<]+密码[\W]+(\w+)..."/>')
        sql = "insert into " + self.page_table + " values (%s,%s,%s,%s,%s,now())"
        db = MysqlHandle()
        db.insert(sql=sql, value_list=[(urlmd5, url, file_name, baiduyun_url, baiduyun_password)])
        db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def filter_avaliable_ips(test_url=None):
    if test_url is None:
        return
    db = MysqlManage.MysqlHandle()
    is_success = db.delete('DELETE FROM TEMP_IPS_MANAGE')
    if not is_success:
        db.close()
        return
    db.close()
    sql = 'SELECT PROXY FROM AI_PROXY_IPS'
    db = MysqlManage.MysqlHandle()
    # query all proxy ips from the database
    IP_LIST = db.query(sql)
    db.close()
    for ip in IP_LIST:
        PROXY = {'http': 'http://' + ip[0]}  # proxy to test
        print 'filtering ip:' + ip[0]
        downloader = PageDownload(hd=mobike_headers, proxy=PROXY, timeout=3)
        try:
            # post_data = {
            #     'longitude': '121.1883',
            #     'latitude': '31.05147',
            #     'citycode': '021',
            #     'errMsg': 'getMapCenterLocation:ok'
            # }
            page = downloader.simple_download(url=test_url)
            if page is not None:
                AVALIABLE_IPS.append(ip)
                print ip[0] + " is ok!"
            else:
                pass
        except Exception, e:
            print str(e)
            pass
def download_page(self, base_url, geo, proxy, city_code, sec, page_table):
    # POST the map-center location to the Mobike API and store every bike
    # returned for this grid point.
    downloader = PageDownload(proxy=proxy, hd=mobike_headers)
    post_dic = {
        'longitude': str(geo[0]),
        'latitude': str(geo[1]),
        'citycode': str(city_code),
        'errMsg': 'getMapCenterLocation:ok'
    }
    page = downloader.download_with_post(url=base_url, post_data=post_dic)
    if is_json(page):
        json_page = json.loads(page)
        if json_page.has_key("object"):
            mobike_object = json_page["object"]
            items = []
            for mobike in mobike_object:
                bike_id = mobike["distId"]
                bike_type = mobike["biketype"]
                b_type = mobike["type"]
                lng = mobike["distX"]
                lat = mobike["distY"]
                dis_source = str(geo[0]) + "," + str(geo[1])
                item = (bike_id, bike_type, b_type, lat, lng, dis_source, sec)
                items.append(item)
            db = MysqlHandle()
            sql = "insert into " + page_table + " values(%s,%s,%s,%s,%s,%s,%s,now())"
            db.insert(sql=sql, value_list=items)
            return True
    else:
        return False
def get_page_url(line_name, line_type, city_code, coords):
    base_url = 'http://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap&da_par=direct&pcevaname=pc4.1&qt=bl&da_src=searchBox.button&wd=%s&c=%d&l=13&b=(%s)&from=webmap&sug_forward=&tn=B_NORMAL_MAP&nn=0'
    downloader = PageDownload(timeout=5)
    page = downloader.simple_download(base_url % (line_name, city_code, coords))
    if is_json(page):
        json_data = json.loads(page)
        if not json_data.has_key('content'):
            print base_url
            return
        contents = json_data['content']
        line_list = []
        for item in contents:
            name = item['name']
            if not item.has_key("uid"):
                print name, base_url
                continue
            uid = item['uid']
            page_url = 'http://map.baidu.com/?qt=bsl&tps=&newmap=1&uid=' + uid + '&c=%d' % (city_code)
            line_list.append((name, uid, page_url, line_type))
        db = MysqlHandle()
        insert_sql = "insert into baidu_busline_url_analyse values(%s,%s,%s,%s,0)"
        db.insert(insert_sql, line_list)
        db.close()
def download_page():
    db = MysqlHandle()
    query_sql = "select uid,min(name),min(line_type), min(page_url) from baidu_busline_url_analyse where status=0 group by uid "
    page_infs = db.query(query_sql)
    db.close()
    downloader = PageDownload()
    for item in page_infs:
        print(item[0])
        page = downloader.simple_download(item[3])
        # if is_json(page):
        #     json_page = json.loads(page)
        #     if json_page.has_key("content"):
        #         main_info = json_page["content"][0]
        #         name = main_info["name"]
        #         timeable = main_info["timeable"]
        db = MysqlHandle()
        is_success = False
        if page is not None:
            insert_sql = "insert into baidu_busline_page values(%s,%s,%s,%s,NULL )"
            is_success = db.insert(insert_sql, [(item[0], item[1], item[2], page)])
        if is_success and page is not None:
            update_sql = "update baidu_busline_url_analyse set status=200 where uid='%s'" % (item[0])
            db.update(update_sql)
        db.close()
def download_info_page(self, e_id):
    page_url = self.page_base_url % (e_id)
    print page_url
    downloader = PageDownload()
    page = downloader.simple_download(page_url)
    if page:
        site = re.findall(r"<br />网址:([^<]+)<br />", page)
        if site:
            site = site[0]
        else:
            site = None
        company_name = re.findall(r"<strong>([^<]+)</strong>", page)
        if company_name:
            company_name = company_name[0]
        else:
            company_name = None
        zw_num = re.findall(r'<span class="glyphicon glyphicon-envelope"></span> 展位号: (\w+)', page)
        if zw_num:
            zw_num = zw_num[0]
        else:
            zw_num = None
        mail = re.findall(r'</span> 邮箱:<a href="mailto:([^<]+@[^<]+)">[^<]+@[^<]+</a>', page)
        if mail:
            mail = mail[0]
        else:
            mail = None
        db = MysqlHandle()
        sql = "insert into diecast VALUES (%s,%s,%s,%s)"
        db.insert(sql, [(site, company_name, zw_num, mail)])
        db.close()
def download_list_page(self, urlmd5, url, proxy, boundary, domain=None):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if is_json(page):
        json_page = json.loads(page)
        result = json_page["result"]
        total_count = result["total"]
        print ("total:" + str(total_count))
        if int(total_count) <= 10 and int(total_count) > 0:
            content = json_page["content"]
            for item in content:
                uid = item["uid"]
                primary_uid = item["primary_uid"]
                new_url = self.page_url % (uid, primary_uid)
                new_urlmd5 = to_md5(in_str=new_url)
                url_type = 0
                boundary = None
                status = 0
                sql = "select * from " + self.url_table + " where urlmd5='%s'" % (new_urlmd5)
                db = MysqlHandle()
                results = db.query(sql=sql)
                db.close()
                if not results:
                    db = MysqlHandle()
                    insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,%s,now())"
                    db.insert(sql=insert_sql, value_list=[(new_urlmd5, new_url, url_type, boundary, status)])
                    db.close()
                else:
                    print "This url is already in the database!!"
        elif int(total_count) <= 0:
            pass
        else:
            min_interval = boundary.split(";")[0]
            max_interval = boundary.split(";")[1]
            lat_min = min_interval.split(",")[1]
            lat_max = max_interval.split(",")[1]
            lng_min = min_interval.split(",")[0]
            lng_max = max_interval.split(",")[0]
            boundarys = split_boundary(int(float(lat_max)), int(float(lat_min)),
                                       int(float(lng_max)), int(float(lng_min)), 4, 0.2)
            for _boundary in boundarys:
                _boundary_st = str(_boundary[1][0]) + "," + str(_boundary[0][0]) + ";" + \
                               str(_boundary[1][1]) + "," + str(_boundary[0][1])
                new_url = self.list_url % (self.city_code, self.keyword, _boundary_st)
                new_urlmd5 = to_md5(in_str=new_url)
                url_type = 1
                boundary = _boundary_st
                status = 0
                db = MysqlHandle()
                insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,%s,now())"
                db.insert(sql=insert_sql, value_list=[(new_urlmd5, new_url, url_type, boundary, status)])
                db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
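# Note: to_md5() is used above and in several later functions but is not
# defined in this section. A plausible minimal version (hex MD5 digest of the
# input string) is sketched below as an assumption, not the project's actual
# helper.
import hashlib

def to_md5(in_str):
    return hashlib.md5(in_str).hexdigest()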
def get_lines(url, type):
    downloader = PageDownload()
    page = downloader.simple_download(url)
    reg = r'<a href="/x_[^<]+" >([^<]+)</a>'
    res = re.findall(reg, page)
    lines = []
    for item in res:
        lines.append((item, type))
    return lines
def download_list_page(self):
    for x in range(self.page_num):
        list_url = self.list_base_url % (x + 1)
        downloader = PageDownload()
        page = downloader.simple_download(list_url)
        if page:
            ids = re.findall(r"onclick='javascript:Eshow\((\d+)\)'", page)
            for e_id in ids:
                self.download_info_page(e_id)
def download_page(self, urlmd5, url, proxy):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if is_json(page):
        page_json = json.loads(page)
        content = page_json["content"]
        uid = content["uid"]
        name = content["name"]
        address = content["addr"]
        if content.has_key("phone"):
            phone = content["phone"]
        else:
            phone = None
        x = content["navi_x"]
        y = content["navi_y"]
        # geo = content["geo"]
        ext = content["ext"]
        if isinstance(ext, dict):
            detail_info = ext["detail_info"]
        else:
            detail_info = {"info": ""}
        if detail_info.has_key("tag"):
            tag = detail_info["tag"]
        else:
            tag = None
        if detail_info.has_key("image"):
            image = detail_info["image"]
        else:
            image = None
        if detail_info.has_key("display_info_redu"):
            display_redu = detail_info["display_info_redu"]
        else:
            display_redu = None
        if detail_info.has_key("price"):
            price = detail_info["price"]
        else:
            price = None
        sql = "insert into " + self.page_table + " values (%s,%s,%s,%s,%s,%s,%s,%s,null,null,null,%s,%s,%s,null,%s,now())"
        db = MysqlHandle()
        db.insert(sql=sql, value_list=[(urlmd5, url, uid, name, address, phone, x, y, tag, image, price, display_redu)])
        db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def download_list_page(self, urlmd5, url, proxy, url_table, filter_table, domain=None):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if page is not None:
        new_urls = re.findall(self.reg, page)
        for _url in new_urls:
            if domain is not None:
                if _url.startswith("/"):
                    new_url = domain + _url
                else:
                    new_url = _url
            else:
                new_url = _url
            url_type = self.filter_url(url=new_url, filter_table=filter_table)
            if url_type is not None:
                new_urlmd5 = to_md5(in_str=new_url)
                sql = "select * from " + url_table + " where urlmd5='%s'" % (new_urlmd5)
                db = MysqlHandle()
                results = db.query(sql=sql)
                db.close()
                if not results:
                    db = MysqlHandle()
                    insert_sql = "insert into " + url_table + " values (%s,%s,%s,%s,now())"
                    db.insert(sql=insert_sql, value_list=[(new_urlmd5, new_url, url_type, 0)])
                    db.close()
                else:
                    print "This url is already in the database!!"
            else:
                pass
        update_sql = "update " + url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def transform(self):
    urls = []
    while not self.queue.empty():
        (urlmd5, baiduyun_url) = self.queue.get_nowait()
        downloader = PageDownload(timeout=8)
        page = downloader.simple_download(baiduyun_url)
        if page is not None:
            _url = re.findall(r'<a href="..(/doaction.php\?enews=DownSoft&classid=\d+&id=\d+&pathid=\d+&pass=\w+&p=:::)"', page)
            # the original test "_url is not []" was always true; check for a non-empty match instead
            if _url:
                _url = "http://www.carflac.com/e/DownSys" + _url[0]
                respond = requests.get(_url)
                url = respond.url
                print url
                urls.append([urlmd5, baiduyun_url, url])
    print "url queue is empty,we will quit"
    return urls
def update_geo_data(uid, l_type, city_code, name):
    page_table = "bdmap_api_" + name + "_" + str(city_code) + "_page_table"
    url = "http://map.baidu.com/?qt=ext&newmap=1&uid=%s&c=%d&nn=0&l=%d&ext_ver=new" % (uid, city_code, l_type)
    downloader = PageDownload()
    page = downloader.simple_download(url)
    if is_json(page):
        json_data = json.loads(page)
        if json_data.has_key("content"):
            content = json_data["content"]
            if content.has_key("geo"):
                geo = content["geo"]
                print(uid)
                md5 = to_md5(uid)
                sql = "update " + page_table + ' set geo="' + geo + '" where md5="' + md5 + '"'
                db = MysqlHandle()
                db.update(sql)
                db.close()
    time.sleep(random.uniform(0.5, 1.0))
def download_list_page(self, urlmd5, url, proxy, domain=None):
    downloader = PageDownload(proxy=proxy, timeout=10)
    page = downloader.simple_download(url=url)
    if page is not None:
        new_urls = re.findall(self.reg, page)
        # singer_names = re.findall(self.js0_reg, page)
        # for singer_name in singer_names:
        #     merge_url = "http://www.51ape.com/skin/ape/php/qx_2.php?qx=" + singer_name
        #     new_urls.append(merge_url)
        for _url in new_urls:
            if domain is not None:
                if _url.startswith("/"):
                    new_url = domain + _url
                else:
                    new_url = _url
            else:
                new_url = _url
            url_type = self.filter_url(url=new_url)
            if url_type is not None:
                new_urlmd5 = to_md5(in_str=new_url)
                sql = "select * from " + self.url_table + " where urlmd5='%s'" % (new_urlmd5)
                db = MysqlHandle()
                results = db.query(sql=sql)
                db.close()
                if not results:
                    db = MysqlHandle()
                    insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,now())"
                    db.insert(sql=insert_sql, value_list=[(new_urlmd5, new_url, url_type, 0)])
                    db.close()
                else:
                    print "This url is already in the database!!"
            else:
                pass
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def get_busline(origin, destination, city_code, key):
    url = base_api_url % (origin, destination, city_code, key)
    downloader = PageDownload(timeout=10)
    page = downloader.simple_download(url)
    if page is not None:
        if is_json(page):
            json_data = json.loads(page)
            if json_data["status"] == "1":
                print "successful!!"
                return page
            else:
                if json_data["info"] == "DAILY_QUERY_OVER_LIMIT":
                    return False
                else:
                    return None
        else:
            print "return not json!!"
            return None
    else:
        print "request error!!"
        return None
def download_page(self, urlmd5, url, proxy):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if page is not None:
        file_name = self.extract_field_from_page(
            page=page,
            reg=r'<h2>([^\]]+) <span class="c?b?">WAV</span>')
        # if file_name is None:
        #     file_name = self.extract_field_from_page(page=page, reg=r'<h1>([^<]+)下载?</h1>')
        singer_name = self.extract_field_from_page(
            page=page,
            reg=r'<p><a href="/detail/\d+.html" style="color:#217fbc">([^=]+)</a></p>')
        result = self.extract_field_from_page(
            page=page,
            reg=r'<p class="downurl">链接: (https?://pan.baidu.com/s/[^=]+) 密码: (\w+)</p>')
        if result is not None:
            [baiduyun_url, baiduyun_password] = result
        else:
            baiduyun_url = None
            baiduyun_password = None
        sql = "insert into " + self.page_table + " values (%s,%s,%s,%s,%s,%s,now())"
        db = MysqlHandle()
        db.insert(sql=sql, value_list=[(urlmd5, url, file_name, singer_name, baiduyun_url, baiduyun_password)])
        db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def filter_avaliable_ips():
    db = MysqlHandle()
    is_success = db.delete('DELETE FROM TEMP_IPS_MANAGE')
    if not is_success:
        db.close()
        return
    db.close()
    sql = 'SELECT PROXY FROM AI_PROXY_IPS'
    db = MysqlHandle()
    # query all proxy ips from the database
    IP_LIST = db.query(sql)
    db.close()
    for ip in IP_LIST:
        PROXY = {'http': 'http://' + ip[0]}  # proxy to test
        print 'filtering ip:' + ip[0]
        downloader = PageDownload(hd=ofo_headers, proxy=PROXY)
        try:
            post_data = MultipartEncoder(
                {
                    "lat": "30.515133",
                    "lng": "114.346161",
                    "token": "7eb9b200-3d7f-11e8-b714-e9d19c19f7b0",
                    "source": "-5",
                    "source-version": "10005"
                },
                boundary='----ofo-boundary-MC40MjcxMzUw'
            )
            page = downloader.download_with_post(url=TEST_URL, post_data=post_data)
            if page is not None:
                AVALIABLE_IPS.append(ip)
                print ip[0] + " is ok!"
            else:
                pass
        except Exception, e:
            print str(e)
            pass
def bdll_2_bdmc(points):
    base_url = "http://api.map.baidu.com/geoconv/v1/?coords=%s&from=5&to=6" \
               "&ak=BF0Y5lHmGGMReuSFBBldFOyWjEuRgdpO"
    coords = ""
    for point in points:
        coords = coords + str(point[0]) + "," + str(point[1]) + ";"
    coords = coords.strip(";")
    url = base_url % (coords)
    downloader = PageDownload()
    page = downloader.simple_download(url)
    if page is not None:
        json_page = json.loads(page)
        status = json_page["status"]
        if status == 0:
            xy_points = json_page["result"]
            res_points = []
            for xy_point in xy_points:
                res_points.append([xy_point["x"], xy_point["y"]])
            return res_points
        else:
            return None
    else:
        return None
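# Illustrative call of bdll_2_bdmc() above (example coordinates, not taken
# from the original code): converts lng/lat pairs to Baidu Mercator, or
# returns None on failure.
# sample_points = [[116.404, 39.915], [116.405, 39.916]]
# mc_points = bdll_2_bdmc(sample_points)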
def spider(city_code, name, keyword, key_token):
    # Query the Baidu Place API for every unfinished boundary rectangle,
    # store the POIs it returns, then mark the boundary as done.
    boundary_table = "bdmap_api_" + name + "_" + str(city_code) + "_boundary_table"
    page_table = "bdmap_api_" + name + "_" + str(city_code) + "_page_table"
    base_url = "http://api.map.baidu.com/place/v2/search?query=%s&scope=2&bounds=%s&output=json&ak=%s&page_num=%d"
    sql = "select md5, boundary from " + boundary_table + " where status=0"
    db = MysqlHandle()
    res_data = db.query(sql)
    for (md5, boundary) in res_data:
        url = base_url % (keyword, boundary, key_token, 0)
        downloader = PageDownload()
        page = downloader.simple_download(url)
        if is_json(page):
            json_data = json.loads(page)
            status = json_data["status"]
            total = json_data["total"]
            print(boundary, url, total)
            if status == 0 and int(total) > 0:
                page_count = int(total) / 10
                for x in range(0, page_count + 2):
                    _url = base_url % (keyword, boundary, key_token, x)
                    downloader = PageDownload()
                    _page = downloader.simple_download(_url)
                    if is_json(_page):
                        _json_data = json.loads(_page)
                        results = _json_data["results"]
                        for item in results:
                            name = item["name"]
                            address = item["address"]
                            province = item["province"]
                            city = item["city"]
                            area = item["area"]
                            uid = item["uid"]
                            _md5 = to_md5(uid)
                            lat = item["location"]["lat"]
                            lng = item["location"]["lng"]
                            try:
                                tag = item["detail_info"]["tag"]
                            except Exception, e:
                                tag = None
                                print(e.message)
                            sql = "insert into " + page_table + " values(%s,%s,%s,%s,null,%s,%s,%s,%s,%s,null,null,%s,null,now(),null)"
                            db = MysqlHandle()
                            db.insert(sql, [[_md5, uid, name, address, province, city, area, lng, lat, tag]])
                            db.close()
            sql = 'update ' + boundary_table + ' set status=200,total_count=' + str(total) + ' where md5="' + md5 + '"'
            db = MysqlHandle()
            db.update(sql)
            db.close()
def run(self):
    proxy = self.proxy_mgr.get_proxy()[1]
    time.sleep(2)
    s_time = time.time()   # start time
    sl_time = time.time()  # start time of a proxy
    while 1:
        tuple_from_queue = self.queue.get()
        if tuple_from_queue is not None:
            try:
                self.lock.acquire()
                (num, origin, destination) = tuple_from_queue
                url = "http://restapi.amap.com/v3/direction/transit/integrated?origin=%s&" \
                      "destination=%s&city=%d&output=json&key=%s" % (origin, destination, 131, self.key)
                # print url
                downloader = PageDownload()
                page = downloader.simple_download(url)
                print num, origin, destination
                self.count += 1
                if page is not None:
                    if is_json(page):
                        json_data = json.loads(page)
                        if json_data["status"] == "1":
                            route = json_data['route']
                            transits = route["transits"]
                            distance = None
                            duration = None
                            cost = None
                            for transit in transits:
                                distance = float(transit["distance"])
                                distance = round(distance / 1000, 2)
                                duration = float(transit['duration']) / 3600
                                duration = round(duration, 2)
                                cost = transit['cost']
                                if type(cost) is not list:
                                    cost = float(transit['cost'])
                                    cost = round(cost, 2)
                                    break
                                else:
                                    continue
                            db = MysqlHandle()
                            # print "insert into table "
                            sql = "insert into amap_busline_route VALUES (%s,%s,%s,%s,%s,%s)"
                            db.insert(sql=sql, value_list=[(num, origin, destination, distance, duration, cost)])
                            db.close()
                        else:
                            if json_data["info"] == "DAILY_QUERY_OVER_LIMIT":
                                # the daily quota of this key is used up
                                print "key: " + self.key + " use out"
                                self.lock.release()  # release the lock before leaving the loop
                                break
                            else:
                                print json_data["info"]
                    else:
                        print "result is not json format"
                        self.queue.put_nowait((num, origin, destination))
                else:
                    self.queue.put_nowait((num, origin, destination))
                    print "the page is None"
                    time.sleep(2)
                self.lock.release()
                e_time = time.time()
                if self.count == 50:
                    # make sure that we do not fetch more than 50 results in a minute
                    if e_time - s_time < 60:
                        time.sleep(60 - e_time + s_time)
                    s_time = time.time()
                if e_time - sl_time > 300:
                    # if a proxy ip has been used for more than 300 seconds, change it
                    proxy = self.proxy_mgr.change_proxy(proxy)[1]
                    print "proxy has changed to: " + proxy
                    sl_time = time.time()
            except Exception:
                with open("error.txt", "a") as f:
                    f.write(str(tuple_from_queue[0]) + "\n")
                self.queue.put_nowait(tuple_from_queue)
        else:
            print 'queue is empty,please wait'
            time.sleep(10)
class Spider(object):
    def __init__(self, url):
        self.begin_url = url
        self.downloader = PageDownload()

    def get_type_url_list(self):
        page = self.downloader.simple_download(self.begin_url)
        _types = re.findall(r'<a href="top/(\w+).html"', page)
        _labels = re.findall(
            r' <td style="height: 22px" align="center"><a href="top/\w+.html" title="[^<]+历史天气查询">([^<]+)</a></td>',
            page)
        type_url_list = [
            "http://lishi.tianqi.com/" + _type + "/index.html" for _type in _types
        ]
        results = {}
        for _label, url in zip(_labels, type_url_list):
            results[_label] = url
        return results

    def get_month_url(self, url):
        page = self.downloader.simple_download(url)
        mon_urls = re.findall(
            r'<li><a href="(http://lishi.tianqi.com/\w+/\d+.html)">', page)
        return mon_urls

    def get_day_info(self, url):
        page = self.downloader.simple_download(url)
        days = re.findall(r'(\d+\-\d+\-\d+)', page)
        tems = re.findall(r'<li>(\-?\d+)</li>', page)
        result = {}
        print len(days), len(tems)
        for i, day in enumerate(days):
            result[day] = {}
            result[day]["max"] = tems[2 * i]
            result[day]["min"] = tems[2 * i + 1]
        return result

    def main(self):
        url_res = self.get_type_url_list()
        index = 0
        for k, v in url_res.items():
            if index < 65:
                index = index + 1
                continue
            work_book = xlwt.Workbook()
            sheet = work_book.add_sheet("sheet1")
            print v
            month_urls = self.get_month_url(v)
            i = 0
            for month_url in month_urls:
                print k, month_url
                _res = self.get_day_info(month_url)
                res = sorted(_res)
                sorted_dict = map(lambda x: {x: _res[x]}, res)
                for item in sorted_dict:
                    day = item.keys()[0]
                    info = item.values()[0]
                    max_t = info["max"]
                    min_t = info["min"]
                    sheet.write(i, 0, k)
                    sheet.write(i, 1, day)
                    sheet.write(i, 2, max_t)
                    sheet.write(i, 3, min_t)
                    i = i + 1
            index = index + 1
            work_book.save(k + str(index) + ".xls")
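# Illustrative usage of the Spider class above; the start URL is an assumption
# inferred from the regexes, which target lishi.tianqi.com.
# weather_spider = Spider("http://lishi.tianqi.com/")
# weather_spider.main()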