def filter_avaliable_ips():
    db = MysqlHandle()
    is_success = db.delete('DELETE FROM TEMP_IPS_MANAGE')
    if not is_success:
        db.close()
        return
    db.close()
    sql = 'SELECT PROXY FROM AI_PROXY_IPS'
    db = MysqlHandle()
    # query all proxy IPs
    IP_LIST = db.query(sql)
    db.close()
    for ip in IP_LIST:
        PROXY = {'http': 'http://' + ip[0]}  # proxy for this request
        print 'filtering ip:' + ip[0]
        downloader = PageDownload(hd=mobike_headers, proxy=PROXY, timeout=3)
        try:
            post_data = {
                'longitude': '121.1883',
                'latitude': '31.05147',
                'citycode': '021',
                'errMsg': 'getMapCenterLocation:ok'
            }
            page = downloader.download_with_post(url=TEST_URL, post_data=post_data)
            if page is not None:
                AVALIABLE_IPS.append(ip)
                print ip[0] + " is ok!"
        except Exception, e:
            print str(e)
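# ---------------------------------------------------------------------------
# The functions in this file call a MysqlHandle helper that is defined
# elsewhere in the project. The sketch below is a minimal, hypothetical
# implementation inferred only from how it is used here (query / insert /
# update / delete / close); it assumes the MySQLdb driver and hard-coded
# connection settings, both of which are assumptions, not the real module.
# ---------------------------------------------------------------------------
import MySQLdb


class MysqlHandle(object):
    def __init__(self):
        # connection settings are placeholders; the real project presumably
        # reads them from its own configuration
        self.conn = MySQLdb.connect(host='localhost', user='root',
                                    passwd='', db='spider', charset='utf8')

    def query(self, sql):
        cursor = self.conn.cursor()
        cursor.execute(sql)
        rows = cursor.fetchall()
        cursor.close()
        return rows

    def insert(self, sql, value_list):
        # value_list: sequence of tuples matching the %s placeholders in sql
        try:
            cursor = self.conn.cursor()
            cursor.executemany(sql, value_list)
            self.conn.commit()
            cursor.close()
            return True
        except Exception, e:
            print str(e)
            return False

    def update(self, sql):
        try:
            cursor = self.conn.cursor()
            cursor.execute(sql)
            self.conn.commit()
            cursor.close()
            return True
        except Exception, e:
            print str(e)
            return False

    delete = update  # delete is called with the same shape as update

    def close(self):
        self.conn.close()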
def GET(self):
    inputs = web.input()
    if inputs.has_key("token"):
        token = inputs["token"]
        if token == "whanys":
            if inputs.has_key("username") and inputs.has_key("password"):
                username = inputs["username"]
                password = inputs["password"]
                # the comparison values were masked in the source ("******");
                # reconstructed here from the extracted variables (note that
                # interpolating user input into SQL is injection-prone)
                sql = ("select * from user_table_yixin where username='%s' "
                       "and password='%s' and status=0" % (username, password))
                db = MysqlHandle()
                res = db.query(sql)
                if res:
                    result = {"status": "0", "msg": "success"}
                else:
                    result = {"status": "1", "msg": "failed!"}
            else:
                result = {"status": "1", "msg": "failed,parameters not enough!"}
        else:
            result = {"status": "1", "msg": "failed,your token is not true!"}
    else:
        result = {"status": "1", "msg": "failed,you need a token!"}
    return result
def download():
    db = MysqlHandle()
    sql = ("select s.division, s.parent from divisions t, divisions s "
           "where t.division=s.parent and t.parent='湖北省'")
    query_res = db.query(sql)
    webSelenium = WebSelenium()
    for (division, parent) in query_res:
        division = division.replace("+", "")
        division = division.replace("☆", "")
        search_word = parent + division
        # skip divisions whose shapefile has already been exported
        if arcpy.Exists(r'G:\xin.data\spiders_data\hbs' + "\\" + search_word + ".shp"):
            continue
        print(division)
        try:
            webdriver = webSelenium.simple_download(
                "http://127.0.0.1/common/get_bmap_boundary?city=" + division,
                "chrome")
            # webdriver = webSelenium.login_with_cookies(login_url="http://pan.baidu.com/s/1c03zJGW", cookies_data=cookies, domain="pan.baidu.com")
            button_path = webdriver.find_elements_by_xpath("/html/body/input[2]")[0]
            button_path.click()
            time.sleep(5)
            button_path.click()
            button_download = webdriver.find_element_by_xpath("/html/body/input[3]")
            time.sleep(5)
            button_download.click()
            time.sleep(3)
            webdriver.close()
        except Exception, e:
            print(e.message)
def download(city_code, l_type, name, tag, city):
    page_table = "bdmap_api_" + name + "_" + str(city_code) + "_page_table"
    sql = ('select uid from ' + page_table +
           ' where city="' + city + '" and geo is null')
    db = MysqlHandle()
    query_data = db.query(sql)
    for (uid,) in query_data:
        update_geo_data(uid, l_type, city_code, name)
def download_page():
    db = MysqlHandle()
    query_sql = ("select uid,min(name),min(line_type),min(page_url) "
                 "from baidu_busline_url_analyse where status=0 group by uid")
    page_infs = db.query(query_sql)
    db.close()
    downloader = PageDownload()
    for item in page_infs:
        print(item[0])
        page = downloader.simple_download(item[3])
        # if is_json(page):
        #     json_page = json.loads(page)
        #     if json_page.has_key("content"):
        #         main_info = json_page["content"][0]
        #         name = main_info["name"]
        #         timeable = main_info["timeable"]
        db = MysqlHandle()
        is_success = False
        if page is not None:
            insert_sql = "insert into baidu_busline_page values(%s,%s,%s,%s,NULL)"
            is_success = db.insert(insert_sql, [(item[0], item[1], item[2], page)])
        if is_success and page is not None:
            update_sql = ("update baidu_busline_url_analyse set status=200 "
                          "where uid='%s'" % (item[0]))
            db.update(update_sql)
        db.close()
def generate_samples():
    base_url = ("http://api.map.baidu.com/geoconv/v1/?coords=%s,%s"
                "&from=5&to=6&ak=XwpZGfXMn45W9Czd1UwmC6RwMMULD1Ue")
    work_book = xlwt.Workbook()
    sheet = work_book.add_sheet("sheet")
    sql = "select x, y from bdmap_api_school_218_page_table LIMIT 1000"
    db = MysqlHandle()
    query_res = db.query(sql)
    i = 0
    for (x, y) in query_res:
        url = base_url % (str(x), str(y))
        downloader = PageDownload()
        page = downloader.simple_download(url)
        if is_json(page):
            json_page = json.loads(page)
            status = json_page["status"]
            if status == 0:
                new_x = json_page["result"][0]["x"]
                new_y = json_page["result"][0]["y"]
                print(x, y, new_x, new_y)
                sheet.write(i, 0, x)
                sheet.write(i, 1, y)
                sheet.write(i, 2, new_x)
                sheet.write(i, 3, new_y)
                i = i + 1
    work_book.save("sample.xls")
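# is_json() and to_md5() are helpers imported from elsewhere in the project.
# Minimal sketches, inferred from their call sites (is_json(page) guards
# json.loads; to_md5(in_str=...) returns a hex digest used as a row key):
import hashlib
import json


def is_json(page):
    # True only if page is a string containing valid JSON
    if page is None:
        return False
    try:
        json.loads(page)
        return True
    except ValueError:
        return False


def to_md5(in_str):
    # hex MD5 of the input string, used as a stable primary key
    if isinstance(in_str, unicode):
        in_str = in_str.encode('utf-8')
    return hashlib.md5(in_str).hexdigest()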
def split_boundary_outline(b_type, city_code, name):
    boundary_table = "bdmap_api_" + name + "_" + str(city_code) + "_boundary_table"
    sql = ('select lng_min,lat_min,lng_max,lat_max from ' + boundary_table +
           " where type=" + str(b_type) + " and total_count=400")
    db = MysqlHandle()
    query_res = db.query(sql)
    for (lng_min, lat_min, lng_max, lat_max) in query_res:
        boundarys = split_boundary(float(lat_max), float(lat_min),
                                   float(lng_max), float(lng_min), 10, 0.1)
        for _boundary in boundarys:
            _lng_min = _boundary[1][0]
            _lat_min = _boundary[0][0]
            _lng_max = _boundary[1][1]
            _lat_max = _boundary[0][1]
            _boundary_st = (str(_boundary[0][0]) + "," + str(_boundary[1][0]) + "," +
                            str(_boundary[0][1]) + "," + str(_boundary[1][1]))
            md5 = to_md5(_boundary_st)
            db = MysqlHandle()
            sql = "insert into " + boundary_table + " values(%s,%s,2,%s,%s,%s,%s,0,0,now())"
            db.insert(sql, [[md5, _boundary_st, _lng_min, _lat_min, _lng_max, _lat_max]])
            db.close()
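# split_boundary() comes from a shared utility module. A hypothetical sketch,
# inferred from its call sites: it divides the box (lat_min..lat_max,
# lng_min..lng_max) into a grid and returns ((lat_lo, lat_hi), (lng_lo, lng_hi))
# pairs. Interpreting the last two arguments as divisions per axis and a
# minimum cell size in degrees is a guess, not confirmed by the source.
def split_boundary(lat_max, lat_min, lng_max, lng_min, divisions, min_size):
    lat_step = max((lat_max - lat_min) / float(divisions), min_size)
    lng_step = max((lng_max - lng_min) / float(divisions), min_size)
    boundarys = []
    lat = lat_min
    while lat < lat_max:
        lng = lng_min
        while lng < lng_max:
            boundarys.append(((lat, min(lat + lat_step, lat_max)),
                              (lng, min(lng + lng_step, lng_max))))
            lng += lng_step
        lat += lat_step
    return boundarys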
def download_list_page(self, urlmd5, url, proxy, boundary, domain=None):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if is_json(page):
        json_page = json.loads(page)
        result = json_page["result"]
        total_count = result["total"]
        print("total:" + str(total_count))
        if 0 < int(total_count) <= 10:
            # few enough hits: queue a detail-page URL for each result
            content = json_page["content"]
            for item in content:
                uid = item["uid"]
                primary_uid = item["primary_uid"]
                new_url = self.page_url % (uid, primary_uid)
                new_urlmd5 = to_md5(in_str=new_url)
                url_type = 0
                boundary = None
                status = 0
                sql = "select * from " + self.url_table + " where urlmd5='%s'" % (new_urlmd5)
                db = MysqlHandle()
                results = db.query(sql=sql)
                db.close()
                if not results:
                    db = MysqlHandle()
                    insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,%s,now())"
                    db.insert(sql=insert_sql,
                              value_list=[(new_urlmd5, new_url, url_type, boundary, status)])
                    db.close()
                else:
                    print "This url is already in the database!!"
        elif int(total_count) <= 0:
            pass
        else:
            # too many hits: split the bounding box and queue the sub-boxes
            min_interval = boundary.split(";")[0]
            max_interval = boundary.split(";")[1]
            lat_min = min_interval.split(",")[1]
            lat_max = max_interval.split(",")[1]
            lng_min = min_interval.split(",")[0]
            lng_max = max_interval.split(",")[0]
            boundarys = split_boundary(int(float(lat_max)), int(float(lat_min)),
                                       int(float(lng_max)), int(float(lng_min)), 4, 0.2)
            for _boundary in boundarys:
                _boundary_st = (str(_boundary[1][0]) + "," + str(_boundary[0][0]) + ";" +
                                str(_boundary[1][1]) + "," + str(_boundary[0][1]))
                new_url = self.list_url % (self.city_code, self.keyword, _boundary_st)
                new_urlmd5 = to_md5(in_str=new_url)
                url_type = 1
                boundary = _boundary_st
                status = 0
                db = MysqlHandle()
                insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,%s,now())"
                db.insert(sql=insert_sql,
                          value_list=[(new_urlmd5, new_url, url_type, boundary, status)])
                db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def load_urls():
    db = MysqlHandle()
    sql = ("select urlmd5,url from cphi_page_table where urlmd5 not in "
           "(select urlmd5 FROM cphi_info_table)")
    results = db.query(sql)
    queue = Queue()
    for item in results:
        _url = item[1].replace("/introduce/", "/")
        # print _url
        queue.put_nowait((item[0], _url + "contactinfo/"))
    return queue
def filter_url(self, url):
    sql = "select type, filter from " + self.filter_table
    db = MysqlHandle()
    filter_data = db.query(sql=sql)
    for _filter in filter_data:
        _type = _filter[0]
        _url = _filter[1]
        # the filter column holds a regex; the first match wins
        if re.match(_url, url) is not None:
            return _type
    return None
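# Expected behaviour of filter_url(), assuming the filter table holds
# (type, regex) rows such as (0, r'^https?://\S+/detail/\d+') -- the sample
# row is hypothetical. Matching URLs get their type back; everything else
# returns None and is dropped by the callers:
#
#   url_type = self.filter_url("http://example.com/detail/42")
#   if url_type is not None:
#       ...queue the URL together with its type...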
def spider(city_code, name, keyword, key_token):
    boundary_table = "bdmap_api_" + name + "_" + str(city_code) + "_boundary_table"
    page_table = "bdmap_api_" + name + "_" + str(city_code) + "_page_table"
    base_url = ("http://api.map.baidu.com/place/v2/search?query=%s&scope=2"
                "&bounds=%s&output=json&ak=%s&page_num=%d")
    sql = "select md5, boundary from " + boundary_table + " where status=0"
    db = MysqlHandle()
    res_data = db.query(sql)
    for (md5, boundary) in res_data:
        url = base_url % (keyword, boundary, key_token, 0)
        downloader = PageDownload()
        page = downloader.simple_download(url)
        if is_json(page):
            json_data = json.loads(page)
            status = json_data["status"]
            total = json_data["total"]
            print(boundary, url, total)
            if status == 0 and int(total) > 0:
                # 10 results per page; walk every page for this boundary
                page_count = int(total) / 10
                for x in range(0, page_count + 2):
                    _url = base_url % (keyword, boundary, key_token, x)
                    downloader = PageDownload()
                    _page = downloader.simple_download(_url)
                    if is_json(_page):
                        _json_data = json.loads(_page)
                        results = _json_data["results"]
                        for item in results:
                            name = item["name"]
                            address = item["address"]
                            province = item["province"]
                            city = item["city"]
                            area = item["area"]
                            uid = item["uid"]
                            _md5 = to_md5(uid)
                            lat = item["location"]["lat"]
                            lng = item["location"]["lng"]
                            try:
                                tag = item["detail_info"]["tag"]
                            except Exception, e:
                                tag = None
                                print(e.message)
                            sql = ("insert into " + page_table +
                                   " values(%s,%s,%s,%s,null,%s,%s,%s,%s,%s,null,null,%s,null,now(),null)")
                            db = MysqlHandle()
                            db.insert(sql, [[_md5, uid, name, address, province,
                                             city, area, lng, lat, tag]])
                            db.close()
            sql = ('update ' + boundary_table + ' set status=200,total_count=' +
                   str(total) + ' where md5="' + md5 + '"')
            db = MysqlHandle()
            db.update(sql)
            db.close()
def spider(d_type):
    db = MysqlHandle()
    sql = ("select division, abb_name from divisions "
           "where status=0 and type=" + str(d_type))
    query_res = db.query(sql)
    for (division, abb_name) in query_res:
        print division
        try:
            download(division, d_type, abb_name)
        except Exception, e:
            print(e.message)
def district_table(table_name):
    # deduplicate proxies: read distinct rows, wipe the table, reinsert
    query_sql = 'select distinct proxy from ' + table_name
    db = MysqlHandle()
    proxys = db.query(query_sql)
    db.close()
    delete_sql = 'delete from ' + table_name
    db = MysqlHandle()
    db.delete(delete_sql)
    db.close()
    db = MysqlHandle()
    insert_sql = 'insert into ' + table_name + ' values (%s,now(),0,100)'
    is_success = db.insert(insert_sql, proxys)
    if is_success:
        print u'The filtering has finished!'
    db.close()
def download_list_page(self, urlmd5, url, proxy, url_table, filter_table, domain=None):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if page is not None:
        new_urls = re.findall(self.reg, page)
        for _url in new_urls:
            # resolve root-relative links against the domain
            if domain is not None and _url.startswith("/"):
                new_url = domain + _url
            else:
                new_url = _url
            url_type = self.filter_url(url=new_url, filter_table=filter_table)
            if url_type is not None:
                new_urlmd5 = to_md5(in_str=new_url)
                sql = "select * from " + url_table + " where urlmd5='%s'" % (new_urlmd5)
                db = MysqlHandle()
                results = db.query(sql=sql)
                db.close()
                if not results:
                    db = MysqlHandle()
                    insert_sql = "insert into " + url_table + " values (%s,%s,%s,%s,now())"
                    db.insert(sql=insert_sql,
                              value_list=[(new_urlmd5, new_url, url_type, 0)])
                    db.close()
                else:
                    print "This url is already in the database!!"
        update_sql = "update " + url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def start_feed(self):
    round_num = 0
    last_url_count = 0
    this_url_count = 0
    same_count = 0
    while self.run_sign:
        # Queue has no len(); qsize() is assumed here (the original called
        # len(self.queue), which would raise TypeError on a standard Queue)
        if self.queue.qsize() < 2500:
            start_time = time.time()
            round_index = round_num % 10
            db = MysqlHandle()
            url_list = db.query(self.sql)
            db.close()
            count = 0
            for url_data in url_list:
                self.queue.put((url_data[0], url_data[1], url_data[2], url_data[3]))
                update_sql = self.update_sql_base % url_data[0]
                db = MysqlHandle()
                db.update(update_sql)
                db.close()
                count += 1
            print 'FinishedQueue-' + self.url_table + ': %d TIME ELAPSED: %f ' % (
                count, time.time() - start_time)
            if round_index == 0:
                # every 10 rounds, check whether the fed URL count has stalled
                if this_url_count == last_url_count:
                    same_count += 1
                else:
                    last_url_count = this_url_count
                    same_count = 0
                this_url_count = 0
            else:
                this_url_count += 1
            round_num += 1
        else:
            print 'The Queue is full!'
        if same_count == 100:
            # URL count unchanged for 100 consecutive checks: back off and stop
            print 'URL count %d unchanged for 100 checks, stopping the feeder.' % last_url_count
            time.sleep(360)
            self.set_stop()
        else:
            time.sleep(5)
    print 'Exited successfully!!'
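# A hypothetical driver for the feeder above: one thread keeps the shared
# queue topped up while workers consume (urlmd5, url, type, boundary) tuples.
# Every name except start_feed/set_stop/queue is illustrative.
#
#   import threading
#   feeder = UrlFeeder()                       # hypothetical constructor
#   t = threading.Thread(target=feeder.start_feed)
#   t.start()
#   ...worker threads call feeder.queue.get() in a loop...
#   feeder.set_stop()                          # ask the feed loop to exit
#   t.join()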
def download_list_page(self, urlmd5, url, proxy, domain=None):
    downloader = PageDownload(proxy=proxy, timeout=10)
    page = downloader.simple_download(url=url)
    if page is not None:
        new_urls = re.findall(self.reg, page)
        # singer_names = re.findall(self.js0_reg, page)
        # for singer_name in singer_names:
        #     merge_url = "http://www.51ape.com/skin/ape/php/qx_2.php?qx=" + singer_name
        #     new_urls.append(merge_url)
        for _url in new_urls:
            # resolve root-relative links against the domain
            if domain is not None and _url.startswith("/"):
                new_url = domain + _url
            else:
                new_url = _url
            url_type = self.filter_url(url=new_url)
            if url_type is not None:
                new_urlmd5 = to_md5(in_str=new_url)
                sql = "select * from " + self.url_table + " where urlmd5='%s'" % (new_urlmd5)
                db = MysqlHandle()
                results = db.query(sql=sql)
                db.close()
                if not results:
                    db = MysqlHandle()
                    insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,now())"
                    db.insert(sql=insert_sql,
                              value_list=[(new_urlmd5, new_url, url_type, 0)])
                    db.close()
                else:
                    print "This url is already in the database!!"
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def GET(self):
    inputs = web.input()
    if inputs.has_key("token"):
        token = inputs["token"]
        if token == "whanys":
            if inputs.has_key("username") and inputs.has_key("password"):
                username = inputs["username"]
                password = inputs["password"]
                # the username filter was masked in the source ("******");
                # reconstructed here from the extracted variable
                sql = "select * from user_table_yixin where username='%s'" % (username)
                db = MysqlHandle()
                res = db.query(sql)
                if res:
                    result = {"status": "1",
                              "msg": "failed,the username is already exist!"}
                else:
                    db = MysqlHandle()
                    sql = "insert into user_table_yixin values(%s,%s,now(),%s)"
                    res = db.insert(sql, [(username, password, 0)])
                    if res:
                        result = {"status": "0", "msg": "success"}
                    else:
                        # added branch: the original left result unset when
                        # the insert failed, which would raise NameError
                        result = {"status": "1", "msg": "failed!"}
            else:
                result = {"status": "1", "msg": "failed,parameters not enough!"}
        else:
            result = {"status": "1", "msg": "failed,your token is not true!"}
    else:
        result = {"status": "1", "msg": "failed,you need a token!"}
    return result
def filter_avaliable_ips():
    db = MysqlHandle()
    is_success = db.delete('DELETE FROM TEMP_IPS_MANAGE')
    if not is_success:
        db.close()
        return
    db.close()
    sql = 'SELECT PROXY FROM AI_PROXY_IPS'
    db = MysqlHandle()
    # query all proxy IPs
    IP_LIST = db.query(sql)
    db.close()
    for ip in IP_LIST:
        PROXY = {'http': 'http://' + ip[0]}  # proxy for this request
        print 'filtering ip:' + ip[0]
        downloader = PageDownload(hd=ofo_headers, proxy=PROXY)
        try:
            post_data = MultipartEncoder(
                {
                    "lat": "30.515133",
                    "lng": "114.346161",
                    "token": "7eb9b200-3d7f-11e8-b714-e9d19c19f7b0",
                    "source": "-5",
                    "source-version": "10005"
                },
                boundary='----ofo-boundary-MC40MjcxMzUw'
            )
            page = downloader.download_with_post(url=TEST_URL, post_data=post_data)
            if page is not None:
                AVALIABLE_IPS.append(ip)
                print ip[0] + " is ok!"
        except Exception, e:
            print str(e)
def GET(self):
    inputs = web.input()
    if inputs.has_key("token"):
        token = inputs.get("token")
        sql = "select * from token_table where token='" + token + "'"
        db = MysqlHandle()
        res = db.query(sql=sql)
        if not res:
            result = {"status": "1", "msg": "failed,your token is false!"}
        else:
            if inputs.has_key("name"):
                music_name = inputs["name"]
                db = MysqlHandle()
                sql = ("select file_name,singer,baiduyun_url,baiduyun_password "
                       "from sq688_page_table where file_name "
                       "like '%" + "%s" % (music_name) + "%'")
                print sql
                res = db.query(sql=sql)
                db.close()
                if not res:
                    # fall back to the second source table
                    sql = ("select file_name,singer,baiduyun_url,baiduyun_password "
                           "from 51ape_page_table where file_name "
                           "like '%" + "%s" % (music_name) + "%'")
                    db = MysqlHandle()
                    res = db.query(sql=sql)
                    db.close()
                items = []
                for item in res:
                    file_name = item[0]
                    singer = item[1]
                    baiduyun_url = item[2]
                    baiduyun_password = item[3]
                    item_dic = {
                        "music": file_name,
                        "singer": singer,
                        "url": baiduyun_url,
                        "password": baiduyun_password
                    }
                    items.append(item_dic)
                # status "0" marks success, matching the singer branch below
                # (the original returned "1" here)
                result = {"status": "0", "msg": "success", "results": items}
            elif inputs.has_key("singer"):
                singer = inputs["singer"]
                db = MysqlHandle()
                sql = ("select file_name,singer,baiduyun_url,baiduyun_password "
                       "from sq688_page_table where singer "
                       "like '%s'" % (singer))
                res = db.query(sql=sql)
                db.close()
                if not res:
                    # note: the original concatenation produced "singerlike",
                    # which is a SQL syntax error; a space is required
                    sql = ("select file_name,singer,baiduyun_url,baiduyun_password "
                           "from 51ape_page_table where singer "
                           "like '%s'" % (singer))
                    db = MysqlHandle()
                    res = db.query(sql=sql)
                    db.close()
                items = []
                for item in res:
                    file_name = item[0]
                    singer = item[1]
                    baiduyun_url = item[2]
                    baiduyun_password = item[3]
                    item_dic = {
                        "music": file_name,
                        "singer": singer,
                        "url": baiduyun_url,
                        "password": baiduyun_password
                    }
                    items.append(item_dic)
                result = {"status": "0", "msg": "success", "results": items}
            else:
                result = {"status": "1", "msg": "failed,params is not enough!"}
    else:
        result = {"status": "1", "msg": "failed,you need a token!"}
    result = json.dumps(result)
    return result
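# Hypothetical client-side usage of the handler above (the endpoint path,
# port, and token value are illustrative, not taken from the source), using
# the requests library:
#
#   import requests
#   r = requests.get("http://localhost:8080/search",
#                    params={"token": "...", "name": "some song"})
#   print r.json()["results"]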
def load_urls(self):
    sql = "select urlmd5,baiduyun_url from carflac_page_table"
    db = MysqlHandle()
    res = db.query(sql)
    for item in res:
        self.queue.put_nowait(item)
def load_geo_page(table):
    db = MysqlHandle()
    sql = "select uid,name,type,page from " + table + " where status is null limit 40000"
    res_query = db.query(sql)
    return res_query
def download(division, d_type, abb_name=None):
    base_url = "http://xzqh.mca.gov.cn/defaultQuery?shengji=%s&diji=%s&xianji=%s"
    webSelenium = WebSelenium()
    if d_type == 1:
        url = base_url % (urllib.quote(
            (division + "(" + abb_name + ")").encode("gb2312")), "-1", "-1")
        driver = webSelenium.simple_download(url, "chrome")
        print url
        rows = driver.find_elements_by_xpath(
            "/html/body/div[@id='center']/div[@class='mid_con_qt']"
            "/table[@class='info_table']/tbody/tr[@class='shi_nub']")
        for row in rows:
            c_division = row.find_element_by_xpath(
                "td[@class='name_left']/a[@class='name_text']").text
            population = row.find_element_by_xpath("td[3]").text
            area = row.find_element_by_xpath("td[4]").text
            code = row.find_element_by_xpath("td[5]").text
            zone = row.find_element_by_xpath("td[6]").text
            zip_code = row.find_element_by_xpath("td[7]").text
            # normalize empty cells to NULL
            if population == u'':
                population = None
            if area == u'':
                area = None
            if code == u'':
                code = None
            if zone == u'':
                zone = None
            if zip_code == u'':
                zip_code = None
            print(c_division, population, area, code, zone, zip_code)
            db = MysqlHandle()
            sql = "insert into divisions values(%s,%s,NULL,%s,%s,%s,%s,%s,%s,0)"
            is_ok = db.insert(sql, [(c_division, code, 2, population, area,
                                     zone, zip_code, division)])
            db.close()
            if is_ok:
                db = MysqlHandle()
                sql = ('update divisions set status=200 where division="' +
                       division + '" and type=' + str(d_type))
                db.update(sql)
    elif d_type == 2:
        db = MysqlHandle()
        # sql = 'select parent from divisions where division="'+division+'"'
        sql = ('SELECT division, abb_name FROM divisions where division in '
               '(select parent from divisions where division="' + division + '")')
        res = db.query(sql)
        parent_division = res[0][0] + "(" + res[0][1] + ")"
        url = base_url % (urllib.quote(parent_division.encode("gb2312")),
                          urllib.quote(division.encode("gb2312")), "-1")
        driver = webSelenium.simple_download(url, "chrome")
        print url
        rows = driver.find_elements_by_xpath(
            "/html/body/div[@id='center']/div[@class='mid_con_qt']"
            "/table[@class='info_table']/tbody/tr")
        for row in rows[2:]:
            c_division = row.find_element_by_xpath("td[@class='name_left']").text
            population = row.find_element_by_xpath("td[3]").text
            if population == u'':
                population = None
            area = row.find_element_by_xpath("td[4]").text
            if area == u'':
                area = None
            code = row.find_element_by_xpath("td[5]").text
            if code == u'':
                code = None
            zone = row.find_element_by_xpath("td[6]").text
            if zone == u'':
                zone = None
            zip_code = row.find_element_by_xpath("td[7]").text
            if zip_code == u'':
                zip_code = None
            print(c_division, population, area, code, zone, zip_code)
            db = MysqlHandle()
            sql = "insert into divisions values(%s,%s,NULL,%s,%s,%s,%s,%s,%s,0)"
            is_ok = db.insert(sql, [(c_division, code, 3, population, area,
                                     zone, zip_code, division)])
            if is_ok:
                db = MysqlHandle()
                sql = ('update divisions set status=200 where division="' +
                       division + '" and type=' + str(d_type))
                db.update(sql)
    else:
        pass