# -*- coding: utf-8 -*-
import json
import random
import re
import time
from Queue import Queue

import arcpy
import web
import xlwt
from requests_toolbelt import MultipartEncoder

# Project helpers referenced below (MysqlHandle, PageDownload, WebSelenium,
# mobike_headers / ofo_headers, TEST_URL, AVALIABLE_IPS) are defined in other
# modules of this repository; hedged sketches of is_json, to_md5 and
# split_boundary appear near their first use.


def generate_samples():
    base_url = "http://api.map.baidu.com/geoconv/v1/?coords=%s,%s&from=5&to=6&ak=XwpZGfXMn45W9Czd1UwmC6RwMMULD1Ue"
    work_book = xlwt.Workbook()
    sheet = work_book.add_sheet("sheet")
    sql = "select x, y from bdmap_api_school_218_page_table LIMIT 1000"
    db = MysqlHandle()
    query_res = db.query(sql)
    i = 0
    for (x, y) in query_res:
        url = base_url % (str(x), str(y))
        downloader = PageDownload()
        page = downloader.simple_download(url)
        if is_json(page):
            json_page = json.loads(page)
            status = json_page["status"]
            if status == 0:
                new_x = json_page["result"][0]["x"]
                new_y = json_page["result"][0]["y"]
                print(x, y, new_x, new_y)
                sheet.write(i, 0, x)
                sheet.write(i, 1, y)
                sheet.write(i, 2, new_x)
                sheet.write(i, 3, new_y)
                i = i + 1
    work_book.save("sample.xls")

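# `is_json` is called throughout this file but lives elsewhere in the project.
# A minimal sketch consistent with its call sites (true only when the
# downloaded page is a non-empty string that parses as JSON); this is an
# assumption, not the project's actual helper.
def is_json(page):
    if not page:
        return False
    try:
        json.loads(page)
        return True
    except (ValueError, TypeError):
        return False
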
def download(city_code, l_type, name, tag, city):
    page_table = "bdmap_api_" + name + "_" + str(city_code) + "_page_table"
    sql = 'select uid from ' + page_table + ' where city="' + city + '" and geo is null'
    db = MysqlHandle()
    query_data = db.query(sql)
    for (uid,) in query_data:
        update_geo_data(uid, l_type, city_code, name)

def download():
    db = MysqlHandle()
    sql = ("select s.division, s.parent from divisions t, divisions s "
           "where t.division=s.parent and t.parent='湖北省'")
    query_res = db.query(sql)
    webSelenium = WebSelenium()
    for (division, parent) in query_res:
        division = division.replace("+", "").replace("☆", "")
        search_word = parent + division
        # skip divisions whose boundary shapefile was already exported
        if arcpy.Exists(r'G:\xin.data\spiders_data\hbs' + "\\" + search_word + ".shp"):
            continue
        print(division)
        try:
            webdriver = webSelenium.simple_download(
                "http://127.0.0.1/common/get_bmap_boundary?city=" + division,
                "chrome")
            # webdriver = webSelenium.login_with_cookies(login_url="http://pan.baidu.com/s/1c03zJGW", cookies_data=cookies, domain="pan.baidu.com")
            button_path = webdriver.find_elements_by_xpath("/html/body/input[2]")[0]
            button_path.click()
            time.sleep(5)
            button_path.click()
            button_download = webdriver.find_element_by_xpath("/html/body/input[3]")
            time.sleep(5)
            button_download.click()
            time.sleep(3)
            webdriver.close()
        except Exception as e:
            print(e.message)

def download_page(self, base_url, geo, proxy, city_code, sec, page_table):
    downloader = PageDownload(proxy=proxy, hd=mobike_headers)
    post_dic = {
        'longitude': str(geo[0]),
        'latitude': str(geo[1]),
        'citycode': str(city_code),
        'errMsg': 'getMapCenterLocation:ok'
    }
    page = downloader.download_with_post(url=base_url, post_data=post_dic)
    if is_json(page):
        json_page = json.loads(page)
        if json_page.has_key("object"):
            mobike_object = json_page["object"]
            items = []
            for mobike in mobike_object:
                bike_id = mobike["distId"]
                bike_type = mobike["biketype"]
                b_type = mobike["type"]
                lng = mobike["distX"]
                lat = mobike["distY"]
                dis_source = str(geo[0]) + "," + str(geo[1])
                item = (bike_id, bike_type, b_type, lat, lng, dis_source, sec)
                items.append(item)
            db = MysqlHandle()
            sql = "insert into " + page_table + " values(%s,%s,%s,%s,%s,%s,%s,now())"
            db.insert(sql=sql, value_list=items)
            return True
        else:
            return False

def GET(self):
    inputs = web.input()
    if inputs.has_key("token"):
        token = inputs["token"]
        if token == "whanys":
            if inputs.has_key("username") and inputs.has_key("password"):
                username = inputs["username"]
                password = inputs["password"]
                # the credential literals were masked ('******') in the source;
                # the query presumably interpolates the request parameters
                sql = ("select * from user_table_yixin where username='%s' "
                       "and password='%s' and status=0") % (username, password)
                db = MysqlHandle()
                res = db.query(sql)
                if res:
                    result = {"status": "0", "msg": "success"}
                else:
                    result = {"status": "1", "msg": "failed!"}
            else:
                result = {"status": "1", "msg": "failed,parameters not enough!"}
        else:
            result = {"status": "1", "msg": "failed,your token is not true!"}
    else:
        result = {"status": "1", "msg": "failed,you need a token!"}
    return result

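# Example login request against the handler above (the URL route is
# hypothetical; only the token value comes from the source):
#   GET /login?token=whanys&username=alice&password=secret
# -> {"status": "0", "msg": "success"} when a matching row with status=0 exists,
#    {"status": "1", "msg": "failed!"} otherwise.
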
def filter_url(self, url):
    sql = "select type, filter from " + self.filter_table
    db = MysqlHandle()
    filter_data = db.query(sql=sql)
    for _filter in filter_data:
        _type = _filter[0]
        _url = _filter[1]
        if re.match(_url, url) is not None:
            return _type
    return None

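# Example: with a filter row ('2', r'http://www\.example\.com/list/\w+') in
# self.filter_table (both values hypothetical), filter_url returns '2' for a
# matching URL and None otherwise, so callers can route list pages and detail
# pages differently.
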
def load_urls():
    db = MysqlHandle()
    sql = ("select urlmd5,url from cphi_page_table "
           "where urlmd5 not in (select urlmd5 FROM cphi_info_table)")
    results = db.query(sql)
    queue = Queue()
    for item in results:
        _url = item[1].replace("/introduce/", "/")
        # print _url
        queue.put_nowait((item[0], _url + "contactinfo/"))
    return queue

def split_boundary_outline(b_type, city_code, name):
    boundary_table = "bdmap_api_" + name + "_" + str(city_code) + "_boundary_table"
    # cells whose earlier crawl recorded total_count=400 (the apparent result
    # cap) are split into a finer grid
    sql = ('select lng_min,lat_min,lng_max,lat_max from ' + boundary_table +
           " where type=" + str(b_type) + " and total_count=400")
    db = MysqlHandle()
    query_res = db.query(sql)
    for (lng_min, lat_min, lng_max, lat_max) in query_res:
        boundarys = split_boundary(float(lat_max), float(lat_min),
                                   float(lng_max), float(lng_min), 10, 0.1)
        for _boundary in boundarys:
            _lng_min = _boundary[1][0]
            _lat_min = _boundary[0][0]
            _lng_max = _boundary[1][1]
            _lat_max = _boundary[0][1]
            _boundary_st = (str(_boundary[0][0]) + "," + str(_boundary[1][0]) + "," +
                            str(_boundary[0][1]) + "," + str(_boundary[1][1]))
            md5 = to_md5(_boundary_st)
            db = MysqlHandle()
            sql = "insert into " + boundary_table + " values(%s,%s,2,%s,%s,%s,%s,0,0,now())"
            db.insert(sql, [[md5, _boundary_st, _lng_min, _lat_min, _lng_max, _lat_max]])
            db.close()

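# `to_md5` is another project helper, used for row keys throughout; a minimal
# sketch of the assumed behavior (hex digest of the UTF-8 bytes) -- not
# necessarily the original implementation.
import hashlib

def to_md5(in_str):
    if isinstance(in_str, unicode):
        in_str = in_str.encode("utf-8")
    return hashlib.md5(in_str).hexdigest()
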
def spider(d_type):
    db = MysqlHandle()
    sql = ("select division, abb_name from divisions "
           "where status=0 and type=" + str(d_type))
    query_res = db.query(sql)
    for (division, abb_name) in query_res:
        print division
        try:
            download(division, d_type, abb_name)
        except Exception as e:
            print(e.message)

def insert_into_shp(shp, workspace, query_item):
    uid = query_item[0]
    name = query_item[1]
    geo_type = query_item[2]
    page = query_item[3]
    json_page = json.loads(page)
    if not json_page.has_key("content"):
        return
    content = json_page["content"]
    item_info = content[0]
    geo = item_info["geo"]
    _geo = geo.split("|")[2].strip(";")
    # rebuild the polyline as WKT: segments are separated by ';',
    # coordinates within a segment by ','
    real_geo = "MULTILINESTRING("
    for segment in _geo.split(";"):
        real_geo = real_geo + "("
        los = segment.split(",")
        for i in range(0, len(los), 2):
            # if i > 2:
            #     if los[i] == los[i - 2] and los[i + 1] == los[i - 1]:
            #         continue
            real_geo = real_geo + los[i] + " " + los[i + 1] + ","
        real_geo = real_geo.strip(",") + "),"
    real_geo = real_geo.strip(",") + ")"
    timetable = item_info["timetable"]
    if timetable is None:
        timetable = ""
    price = int(item_info["ticketPrice"]) / 100.0
    current_city = json_page["current_city"]
    city = current_city["name"]
    if city is None:
        city = ""
    province = current_city["up_province_name"]
    if province is None:
        province = ""
    arcpy.env.workspace = workspace
    polyline = arcpy.FromWKT(real_geo)
    fields = ["UID", "NAME", "PROVINCE", "CITY", "GEO_TYPE", "TIMETABLE", "PRICE"]
    fields.append("SHAPE@")
    values = [uid, name, province, city, geo_type, timetable, price, polyline]
    cursor = arcpy.da.InsertCursor(shp, fields)
    cursor.insertRow(values)
    del cursor
    db = MysqlHandle()
    # the source referenced `item[0]` here, which is undefined in this scope;
    # `uid` (query_item[0]) is what the update needs
    sql = 'update baidu_busline_page set status=200 where uid="' + uid + '"'
    db.update(sql)

def run(self):
    download_count = 0
    total_count = 0
    fail_count_20 = 0
    start_time = time.time()
    while 1:
        try:
            self.tuple_from_queue = self.queue.get()
            if self.tuple_from_queue is None:
                print u'urls pool is empty, please wait for 90s'
                time.sleep(90)
                break
            else:
                print self.tuple_from_queue
                start_time_this_round = time.time()
                download_count += 1
                self.lock.acquire()
                if not self.need_proxy:
                    self.proxy = None
                download_result = self.spider.process(
                    tuple_from_queue=self.tuple_from_queue, proxy=self.proxy)
                self.lock.release()
                total_count += download_result['total']
                if download_result['total'] != download_result['success']:
                    # not every URL in this batch succeeded: count the misses
                    fail_count_20 += download_result['total'] - download_result['success']
                    for failed_data in download_result['failed_list']:
                        # push the failed record back onto the persistent queue
                        self.queue.put(failed_data)
                        print failed_data
                batch = download_count % self.CHKTHD
                if batch == 0:
                    # every CHKTHD downloads, recompute this proxy's success rate
                    self.activity = 1 - float(fail_count_20) / float(total_count)
                    print '[%s]COUNT: %d, FAIL-IN-this%d:%d , avail:%f:' % (
                        self.proxy_ip, self.CHKTHD, total_count, fail_count_20,
                        self.activity)
                    fail_count_20 = 0
                    total_count = 0
                if self.activity < 0.3:
                    print '[%s]rate of download is %f,too low' % (self.proxy_ip, self.activity)
                    db = MysqlHandle()
                    sql = "update temp_ips_manage set availabity=%s where proxy='%s'" % (
                        self.activity, self.proxy_ip)
                    db.update(sql=sql)
                    db.close()
                    # self.change_proxy()
                    # swap in a fresh proxy from the pool
                    self.proxy = {'http': 'http://' + self.proxy_manger.change_proxy(self.proxy_ip)[1]}
                spider_time = time.time() - start_time
                if spider_time > 600:
                    # rotate the proxy after ten minutes regardless of its rate
                    print '[%s]timeout,we will quit' % (self.proxy_ip)
                    self.proxy = {'http': 'http://' + self.proxy_manger.change_proxy(self.proxy_ip)[1]}
                    start_time = time.time()
                elapsed = time.time() - start_time_this_round
                interval = random.randint(self.interval_down, self.interval_upp)
                if elapsed < interval:
                    # keep a randomized minimum interval between requests
                    time.sleep(interval - elapsed)
        except Exception as e:
            print e.message
            print self.tuple_from_queue
            if type(self.tuple_from_queue) == list and len(self.tuple_from_queue) == 3:
                self.queue.put(self.tuple_from_queue)

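# `proxy_manger.change_proxy` is only ever used as change_proxy(old_ip)[1] in
# run() above; a sketch of the assumed contract (retire the old proxy, return
# an (old_ip, new_ip) pair). Table and column names are hypothetical.
# def change_proxy(self, old_ip):
#     db = MysqlHandle()
#     db.update("update temp_ips_manage set status=-1 where proxy='%s'" % old_ip)
#     new_ip = db.query("select proxy from temp_ips_manage where status=0 limit 1")[0][0]
#     db.close()
#     return (old_ip, new_ip)
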
def get_info(urlmd5, url, driver):
    driver.get(url)
    page = driver.page_source.encode("utf-8")
    # print page
    contacts = re.findall(r"<strong>联系人</strong>:([^=]+)<a", page)
    if contacts:
        contacts = contacts[0]
    else:
        contacts = None
    e_mail = re.findall(r'<a href="mailto:([^"]+@[^"]+)"', page)
    if e_mail:
        e_mail = e_mail[0]
    else:
        e_mail = None
    phone_num = re.findall(r"<p><strong>电话</strong>: ([^/]+)</p>", page)
    if phone_num:
        phone_num = phone_num[0]
    else:
        phone_num = None
    # print phone_num
    db = MysqlHandle()
    sql = "insert into cphi_info_table values(%s,%s,%s,%s,%s)"
    print e_mail
    db.insert(sql, [(urlmd5, url, contacts, e_mail, phone_num)])
    db.close()

def download_info_page(self, e_id):
    page_url = self.page_base_url % (e_id)
    print page_url
    downloader = PageDownload()
    page = downloader.simple_download(page_url)
    if page:
        site = re.findall(r"<br />网址:([^<]+)<br />", page)
        if site:
            site = site[0]
        else:
            site = None
        company_name = re.findall(r"<strong>([^<]+)</strong>", page)
        if company_name:
            company_name = company_name[0]
        else:
            company_name = None
        zw_num = re.findall(r'<span class="glyphicon glyphicon-envelope"></span> 展位号: (\w+)', page)
        if zw_num:
            zw_num = zw_num[0]
        else:
            zw_num = None
        mail = re.findall(r'</span> 邮箱:<a href="mailto:([^<]+@[^<]+)">[^<]+@[^<]+</a>', page)
        if mail:
            mail = mail[0]
        else:
            mail = None
        db = MysqlHandle()
        sql = "insert into diecast VALUES (%s,%s,%s,%s)"
        db.insert(sql, [(site, company_name, zw_num, mail)])
        db.close()

def get_page_url(line_name, line_type, city_code, coords):
    base_url = ('http://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap'
                '&da_par=direct&pcevaname=pc4.1&qt=bl&da_src=searchBox.button'
                '&wd=%s&c=%d&l=13&b=(%s)&from=webmap&sug_forward=&tn=B_NORMAL_MAP&nn=0')
    downloader = PageDownload(timeout=5)
    url = base_url % (line_name, city_code, coords)
    page = downloader.simple_download(url)
    if is_json(page):
        json_data = json.loads(page)
        if not json_data.has_key('content'):
            # print the concrete query URL (the source printed the template)
            print url
            return
        contents = json_data['content']
        line_list = []
        for item in contents:
            name = item['name']
            if not item.has_key("uid"):
                print name, url
                continue
            uid = item['uid']
            page_url = ('http://map.baidu.com/?qt=bsl&tps=&newmap=1&uid=' + uid +
                        '&c=%d' % (city_code))
            line_list.append((name, uid, page_url, line_type))
        db = MysqlHandle()
        insert_sql = "insert into baidu_busline_url_analyse values(%s,%s,%s,%s,0)"
        db.insert(insert_sql, line_list)
        db.close()

def add_init_url(self, url_table_name, filter_config, city_code, keyword):
    list_url = ('https://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap'
                '&da_par=direct&pcevaname=pc4.1&qt=spot&from=webmap&c=%d&wd=%s&wd2='
                '&pn=0&nn=0&db=0&sug=0&addr=0&pl_data_type=life&pl_sort_type=data_type'
                '&pl_sort_rule=0&pl_business_type=cinema&pl_business_id='
                '&da_src=pcmappg.poi.page&on_gel=1&src=7&gr=3&l=12&rn=10'
                '&tn=B_NORMAL_MAP&ie=utf-8&b=(%s)')
    url = filter_config["url"]
    # db = MysqlHandle()
    # insert_sql = "insert into " + url_table_name + " values(%s,%s,%s,%s,%s,now())"
    # db.insert(sql=insert_sql, value_list=[(url["urlmd5"], url["url"], url["type"], url["boundary"], url["status"])])
    # db.close()
    boundary = url["boundary"]
    min_interval = boundary.split(";")[0]
    max_interval = boundary.split(";")[1]
    lat_min = min_interval.split(",")[1]
    lat_max = max_interval.split(",")[1]
    lng_min = min_interval.split(",")[0]
    lng_max = max_interval.split(",")[0]
    boundarys = split_boundary(int(float(lat_max)), int(float(lat_min)),
                               int(float(lng_max)), int(float(lng_min)), 20, 0.2)
    for _boundary in boundarys:
        _boundary_st = (str(_boundary[1][0]) + "," + str(_boundary[0][0]) + ";" +
                        str(_boundary[1][1]) + "," + str(_boundary[0][1]))
        new_url = list_url % (city_code, keyword, _boundary_st)
        new_urlmd5 = to_md5(in_str=new_url)
        url_type = 2
        boundary = _boundary_st
        status = 0
        db = MysqlHandle()
        insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,%s,now())"
        db.insert(sql=insert_sql,
                  value_list=[(new_urlmd5, new_url, url_type, boundary, status)])
        db.close()

def insert_into_stations(shp, workspace, query_item):
    l_uid = query_item[0]
    name = query_item[1]
    page = query_item[3]
    json_page = json.loads(page)
    if not json_page.has_key("content"):
        return
    content = json_page["content"]
    item_info = content[0]
    stations = item_info["stations"]
    current_city = json_page["current_city"]
    city = current_city["name"]
    if city is None:
        city = ""
    province = current_city["up_province_name"]
    if province is None:
        province = ""
    for station in stations:
        station_name = station["name"]
        station_geo = station["geo"].strip(";").split("|")[-1].replace(",", " ")
        geo_str = "POINT(%s)" % (station_geo)
        station_uid = station["uid"]
        arcpy.env.workspace = workspace
        point = arcpy.FromWKT(geo_str)
        fields = ["UID", "NAME", "PROVINCE", "CITY", "L_UID", "L_NAME"]
        fields.append("SHAPE@")
        values = [station_uid, station_name, province, city, l_uid, name, point]
        cursor = arcpy.da.InsertCursor(shp, fields)
        cursor.insertRow(values)
        del cursor
    db = MysqlHandle()
    # the source referenced `item[0]` here, which is undefined in this scope;
    # `l_uid` (query_item[0]) is the line uid the update needs
    sql = 'update baidu_busline_page set status=200 where uid="' + l_uid + '"'
    db.update(sql)

def add_filter_urls(self, filter_table_name, filter_config):
    filters = filter_config["filters"]
    value_list = []
    for _filter in filters:  # renamed to avoid shadowing the builtin `filter`
        value_list.append((_filter["type"], _filter["filter"]))
    db = MysqlHandle()
    insert_sql = "insert into " + filter_table_name + " values(%s,%s)"
    db.insert(sql=insert_sql, value_list=value_list)
    db.close()

def update_geo_data(uid, l_type, city_code, name):
    page_table = "bdmap_api_" + name + "_" + str(city_code) + "_page_table"
    url = ("http://map.baidu.com/?qt=ext&newmap=1&uid=%s&c=%d&nn=0&l=%d&ext_ver=new"
           % (uid, city_code, l_type))
    downloader = PageDownload()
    page = downloader.simple_download(url)
    if is_json(page):
        json_data = json.loads(page)
        if json_data.has_key("content"):
            content = json_data["content"]
            if content.has_key("geo"):
                geo = content["geo"]
                print(uid)
                md5 = to_md5(uid)
                sql = ("update " + page_table + ' set geo="' + geo +
                       '" where md5="' + md5 + '"')
                db = MysqlHandle()
                db.update(sql)
                db.close()
    time.sleep(random.uniform(0.5, 1.0))

def init_spider(city_code, name, boundary):
    # Initializer(source="bdmap_api_"+name+"_"+str(city_code), table_config="table_config.json", filter_config=None, need_proxy=False)
    boundary_table = "bdmap_api_" + name + "_" + str(city_code) + "_boundary_table"
    lng_min, lat_min, lng_max, lat_max = boundary
    boundarys = split_boundary(float(lat_max), float(lat_min),
                               float(lng_max), float(lng_min), 10, 0.1)
    for _boundary in boundarys:
        _lng_min = _boundary[1][0]
        _lat_min = _boundary[0][0]
        _lng_max = _boundary[1][1]
        _lat_max = _boundary[0][1]
        _boundary_st = (str(_boundary[0][0]) + "," + str(_boundary[1][0]) + "," +
                        str(_boundary[0][1]) + "," + str(_boundary[1][1]))
        md5 = to_md5(_boundary_st)
        db = MysqlHandle()
        sql = "insert into " + boundary_table + " values(%s,%s,1,%s,%s,%s,%s,0,0,now())"
        db.insert(sql, [[md5, _boundary_st, _lng_min, _lat_min, _lng_max, _lat_max]])
        db.close()

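# `split_boundary` is a project helper not shown in this file. From its call
# sites it takes (lat_max, lat_min, lng_max, lng_min, n, overlap) and returns
# sub-cells shaped ((lat_min, lat_max), (lng_min, lng_max)). A minimal n-by-n
# grid sketch under those assumptions (treating the sixth argument as an
# overlap margin in degrees is a guess):
def split_boundary(lat_max, lat_min, lng_max, lng_min, n, overlap=0.0):
    boundarys = []
    lat_step = (lat_max - lat_min) / float(n)
    lng_step = (lng_max - lng_min) / float(n)
    for i in range(n):
        for j in range(n):
            cell_lat_min = lat_min + i * lat_step
            cell_lng_min = lng_min + j * lng_step
            boundarys.append(((cell_lat_min, cell_lat_min + lat_step + overlap),
                              (cell_lng_min, cell_lng_min + lng_step + overlap)))
    return boundarys
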
def GET(self):
    inputs = web.input()
    if inputs.has_key("token"):
        token = inputs["token"]
        if token == "whanys":
            if inputs.has_key("username") and inputs.has_key("password"):
                username = inputs["username"]
                password = inputs["password"]
                # the username literal was masked ('******') in the source; the
                # existence check presumably interpolates the request parameter
                sql = "select * from user_table_yixin where username='%s'" % (username)
                db = MysqlHandle()
                res = db.query(sql)
                if res:
                    result = {"status": "1",
                              "msg": "failed,the username is already exist!"}
                else:
                    db = MysqlHandle()
                    sql = "insert into user_table_yixin values(%s,%s,now(),%s)"
                    res = db.insert(sql, [(username, password, 0)])
                    if res:
                        result = {"status": "0", "msg": "success"}
                    else:
                        # the source had no branch for a failed insert, which
                        # left `result` undefined; fail explicitly instead
                        result = {"status": "1", "msg": "failed!"}
            else:
                result = {"status": "1", "msg": "failed,parameters not enough!"}
        else:
            result = {"status": "1", "msg": "failed,your token is not true!"}
    else:
        result = {"status": "1", "msg": "failed,you need a token!"}
    return result

def load_geo_page(table):
    db = MysqlHandle()
    sql = "select uid,name,type, page from " + table + " where status is null limit 40000"
    res_query = db.query(sql)
    return res_query

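# Example driver (shapefile names and workspace path are hypothetical): feed
# the cached busline pages into the two shapefile writers defined earlier.
# for query_item in load_geo_page("baidu_busline_page"):
#     insert_into_shp("busline.shp", r"G:\workspace", query_item)
#     insert_into_stations("busline_station.shp", r"G:\workspace", query_item)
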
def download_page(self, urlmd5, url, proxy):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if page is not None:
        # print page.decode("utf-8")
        file_name = self.extract_field_from_page(page=page, reg=r'专辑名称:([^<]+)')
        if file_name is None:
            file_name = self.extract_field_from_page(page=page, reg=r'<h1 class="title">([^<]+)</h1>')
        music_type = self.extract_field_from_page(page=page, reg=r' <a href="/\w+/">([^<]+)</a>')
        # if file_name is None:
        #     file_name = self.extract_field_from_page(page=page, reg=r'<h1>([^<]+)下载?</h1>')
        singer_name = self.extract_field_from_page(page=page, reg=r'专辑艺人:([^<]+)')
        baiduyun_url = self.extract_field_from_page(
            page=page,
            reg=r"""<a href="#ecms" onclick="window.open\('([^<]+)','','width=300,height=300,resizable=yes'\)""")
        print baiduyun_url
        if baiduyun_url is None:
            return False
        baiduyun_url = self.domain + baiduyun_url
        baiduyun_password = self.extract_field_from_page(page=page, reg=r'密码: (\w+)')
        sql = "insert into " + self.page_table + " values (%s,%s,%s,%s,%s,%s,%s,now())"
        db = MysqlHandle()
        db.insert(sql=sql, value_list=[(urlmd5, url, file_name, music_type,
                                        singer_name, baiduyun_url, baiduyun_password)])
        db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False

def spider(city_code, name, keyword, key_token):
    boundary_table = "bdmap_api_" + name + "_" + str(city_code) + "_boundary_table"
    page_table = "bdmap_api_" + name + "_" + str(city_code) + "_page_table"
    base_url = ("http://api.map.baidu.com/place/v2/search?query=%s&scope=2"
                "&bounds=%s&output=json&ak=%s&page_num=%d")
    sql = "select md5, boundary from " + boundary_table + " where status=0"
    db = MysqlHandle()
    res_data = db.query(sql)
    for (md5, boundary) in res_data:
        url = base_url % (keyword, boundary, key_token, 0)
        downloader = PageDownload()
        page = downloader.simple_download(url)
        if is_json(page):
            json_data = json.loads(page)
            status = json_data["status"]
            total = json_data["total"]
            print(boundary, url, total)
            if status == 0 and int(total) > 0:
                # the API pages results 10 at a time
                page_count = int(total) / 10
                for x in range(0, page_count + 2):
                    _url = base_url % (keyword, boundary, key_token, x)
                    downloader = PageDownload()
                    _page = downloader.simple_download(_url)
                    if is_json(_page):
                        _json_data = json.loads(_page)
                        results = _json_data["results"]
                        for item in results:
                            # renamed from the source's `name`, which shadowed
                            # the function parameter
                            poi_name = item["name"]
                            address = item["address"]
                            province = item["province"]
                            city = item["city"]
                            area = item["area"]
                            uid = item["uid"]
                            _md5 = to_md5(uid)
                            lat = item["location"]["lat"]
                            lng = item["location"]["lng"]
                            try:
                                tag = item["detail_info"]["tag"]
                            except Exception as e:
                                tag = None
                                print(e.message)
                            sql = ("insert into " + page_table +
                                   " values(%s,%s,%s,%s,null,%s,%s,%s,%s,%s,null,null,%s,null,now(),null)")
                            db = MysqlHandle()
                            db.insert(sql, [[_md5, uid, poi_name, address, province,
                                             city, area, lng, lat, tag]])
                            db.close()
            sql = ('update ' + boundary_table + ' set status=200,total_count=' +
                   str(total) + ' where md5="' + md5 + '"')
            db = MysqlHandle()
            db.update(sql)
            db.close()

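# Example invocation, matching the bdmap_api_school_218_page_table sampled in
# generate_samples() above (the ak value is a placeholder, not a real key):
# spider(218, "school", u"学校", "YOUR_BAIDU_AK")
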
def filter_avaliable_ips():
    db = MysqlHandle()
    is_success = db.delete('DELETE FROM TEMP_IPS_MANAGE')
    if not is_success:
        db.close()
        return
    db.close()
    sql = 'SELECT PROXY FROM AI_PROXY_IPS'
    db = MysqlHandle()
    # query all candidate proxy IPs
    IP_LIST = db.query(sql)
    db.close()
    for ip in IP_LIST:
        PROXY = {'http': 'http://' + ip[0]}  # proxy for this probe
        print 'filtering ip:' + ip[0]
        downloader = PageDownload(hd=mobike_headers, proxy=PROXY, timeout=3)
        try:
            post_data = {
                'longitude': '121.1883',
                'latitude': '31.05147',
                'citycode': '021',
                'errMsg': 'getMapCenterLocation:ok'
            }
            page = downloader.download_with_post(url=TEST_URL, post_data=post_data)
            if page is not None:
                AVALIABLE_IPS.append(ip)
                print ip[0] + " is ok!"
            else:
                pass
        except Exception as e:
            print str(e)
            pass

def download_page():
    db = MysqlHandle()
    query_sql = ("select uid,min(name),min(line_type), min(page_url) "
                 "from baidu_busline_url_analyse where status=0 group by uid")
    page_infs = db.query(query_sql)
    db.close()
    downloader = PageDownload()
    for item in page_infs:
        print(item[0])
        page = downloader.simple_download(item[3])
        # if is_json(page):
        #     json_page = json.loads(page)
        #     if json_page.has_key("content"):
        #         main_info = json_page["content"][0]
        #         name = main_info["name"]
        #         timeable = main_info["timeable"]
        db = MysqlHandle()
        is_success = False
        if page is not None:
            insert_sql = "insert into baidu_busline_page values(%s,%s,%s,%s,NULL )"
            is_success = db.insert(insert_sql, [(item[0], item[1], item[2], page)])
        if is_success and page is not None:
            update_sql = "update baidu_busline_url_analyse set status=200 where uid='%s'" % (item[0])
            db.update(update_sql)
        db.close()

# deduplicate the proxy table: keep one row per distinct proxy
def district_table(table_name):
    query_sql = 'select distinct proxy from ' + table_name
    db = MysqlHandle()
    proxys = db.query(query_sql)
    db.close()
    delete_sql = 'delete from ' + table_name
    db = MysqlHandle()
    db.delete(delete_sql)
    db.close()
    db = MysqlHandle()
    insert_sql = 'insert into ' + table_name + ' values (%s,now(),0,100)'
    is_success = db.insert(insert_sql, proxys)
    if is_success:
        print u'The filtering has finished!'
    db.close()

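# Design note: the distinct-select / delete / re-insert sequence above is not
# transactional; a failure between the delete and the insert empties the pool.
# A unique key on `proxy` would allow deduplicating in place, at the cost of
# schema changes to every table this helper serves.
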
def filter_avaliable_ips():
    db = MysqlHandle()
    is_success = db.delete('DELETE FROM TEMP_IPS_MANAGE')
    if not is_success:
        db.close()
        return
    db.close()
    sql = 'SELECT PROXY FROM AI_PROXY_IPS'
    db = MysqlHandle()
    # query all candidate proxy IPs
    IP_LIST = db.query(sql)
    db.close()
    for ip in IP_LIST:
        PROXY = {'http': 'http://' + ip[0]}  # proxy for this probe
        print 'filtering ip:' + ip[0]
        downloader = PageDownload(hd=ofo_headers, proxy=PROXY)
        try:
            post_data = MultipartEncoder(
                {
                    "lat": "30.515133",
                    "lng": "114.346161",
                    "token": "7eb9b200-3d7f-11e8-b714-e9d19c19f7b0",
                    "source": "-5",
                    "source-version": "10005"
                },
                boundary='----ofo-boundary-MC40MjcxMzUw'
            )
            page = downloader.download_with_post(url=TEST_URL, post_data=post_data)
            if page is not None:
                AVALIABLE_IPS.append(ip)
                print ip[0] + " is ok!"
            else:
                pass
        except Exception as e:
            print str(e)
            pass
    db = MysqlHandle()
    db.insert('INSERT INTO TEMP_IPS_MANAGE VALUES (%s,now(),0,100)', AVALIABLE_IPS)
    db.close()
    district_table('TEMP_IPS_MANAGE')

def add_init_url(self, url_table_name, filter_config, city_code, keyword):
    url = filter_config["url"]
    db = MysqlHandle()
    insert_sql = "insert into " + url_table_name + " values(%s,%s,%s,%s,now())"
    db.insert(sql=insert_sql,
              value_list=[(url["urlmd5"], url["url"], url["type"], url["status"])])
    db.close()

def download_list_page(self, urlmd5, url, proxy, domain=None):
    downloader = PageDownload(proxy=proxy, timeout=10)
    page = downloader.simple_download(url=url)
    if page is not None:
        new_urls = re.findall(self.reg, page)
        # singer_names = re.findall(self.js0_reg, page)
        # for singer_name in singer_names:
        #     merge_url = "http://www.51ape.com/skin/ape/php/qx_2.php?qx=" + singer_name
        #     new_urls.append(merge_url)
        for _url in new_urls:
            if domain is not None and _url.startswith("/"):
                new_url = domain + _url
            else:
                new_url = _url
            url_type = self.filter_url(url=new_url)
            if url_type is not None:
                new_urlmd5 = to_md5(in_str=new_url)
                sql = "select * from " + self.url_table + " where urlmd5='%s'" % (new_urlmd5)
                db = MysqlHandle()
                results = db.query(sql=sql)
                db.close()
                if not results:
                    db = MysqlHandle()
                    insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,now())"
                    db.insert(sql=insert_sql, value_list=[(new_urlmd5, new_url, url_type, 0)])
                    db.close()
                else:
                    print "This url is already in the database!!"
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False

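# Example of driving download_list_page for pending rows (the table layout
# follows the inserts above; the domain comes from the commented 51ape.com
# URLs in this crawler, and the proxy is omitted for brevity):
# db = MysqlHandle()
# pending = db.query("select urlmd5,url from " + self.url_table + " where status=0")
# db.close()
# for (urlmd5, url) in pending:
#     self.download_list_page(urlmd5, url, proxy=None, domain="http://www.51ape.com")
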