def download_page(self, urlmd5, url, proxy):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if page is not None:
        file_name = self.extract_field_from_page(
            page=page,
            reg=r'<li class="fl ml_1 mt_08 c999">([^=]+)</li>')
        # if file_name is None:
        #     file_name = self.extract_field_from_page(page=page, reg=r'<h1>([^<]+)下载?</h1>')
        singer_name = self.extract_field_from_page(
            page=page,
            reg=r'<li><a class="fl c3b ml_1 mt_08" href="http:'
                r'//www.51ape.com/[^=]+/" title="[^=]+">([^=]+)'
                r'</a></li>')
        baiduyun_url = self.extract_field_from_page(
            page=page,
            reg=r'href="(https?://pan.baidu.com/s/[^=]+)"')
        baiduyun_password = self.extract_field_from_page(
            page=page,
            reg=r'提取<em class="dn"></em>密码:(\w+)</b>')
        sql = "insert into " + self.page_table + " values (%s,%s,%s,%s,%s,%s,now())"
        db = MysqlHandle()
        db.insert(sql=sql, value_list=[(urlmd5, url, file_name, singer_name,
                                        baiduyun_url, baiduyun_password)])
        db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
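# Note: extract_field_from_page is called throughout this section but not defined
# here. A minimal sketch of what it is assumed to do (hypothetical, inferred only
# from how it is called): run the regex against the page and return the capture
# group(s) of the match selected by `index`, or None when nothing matches. In the
# original it appears to be a method on the spider classes.
import re


def extract_field_from_page(page, reg, index=0):
    matches = re.findall(reg, page)
    if not matches:
        return None
    # With a single capture group this is a string; with several groups it is a
    # tuple of strings, which callers may unpack.
    return matches[index]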
def download_page(self, urlmd5, url, proxy):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if page is not None:
        # print page.decode("utf-8")
        file_name = self.extract_field_from_page(page=page, reg=r'专辑名称:([^<]+)')
        if file_name is None:
            file_name = self.extract_field_from_page(page=page, reg=r'<h1 class="title">([^<]+)</h1>')
        music_type = self.extract_field_from_page(page=page, reg=r' <a href="/\w+/">([^<]+)</a>')
        # if file_name is None:
        #     file_name = self.extract_field_from_page(page=page, reg=r'<h1>([^<]+)下载?</h1>')
        singer_name = self.extract_field_from_page(page=page, reg=r'专辑艺人:([^<]+)')
        baiduyun_url = self.extract_field_from_page(
            page=page,
            reg=r"""<a href="#ecms" onclick="window.open\('([^<]+)','','width=300,height=300,resizable=yes'\)""")
        print baiduyun_url
        if baiduyun_url is None:
            return False
        baiduyun_url = self.domain + baiduyun_url
        baiduyun_password = self.extract_field_from_page(page=page, reg=r'密码: (\w+)')
        sql = "insert into " + self.page_table + " values (%s,%s,%s,%s,%s,%s,%s,now())"
        db = MysqlHandle()
        db.insert(sql=sql, value_list=[(urlmd5, url, file_name, music_type, singer_name,
                                        baiduyun_url, baiduyun_password)])
        db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def run(self):
    download_count = 0
    total_count = 0
    fail_count_20 = 0
    start_time = time.time()
    while 1:
        try:
            self.tuple_from_queue = self.queue.get()
            if self.tuple_from_queue is None:
                print u'urls pool is empty, please wait for 90s'
                time.sleep(90)
                break
            else:
                print self.tuple_from_queue
                start_time_this_round = time.time()
                download_count += 1
                self.lock.acquire()
                if not self.need_proxy:
                    self.proxy = None
                download_result = self.spider.process(tuple_from_queue=self.tuple_from_queue,
                                                      proxy=self.proxy)
                self.lock.release()
                total_count += download_result['total']
                if download_result['total'] != download_result['success']:
                    # If not every url in this batch succeeded, count the failures
                    fail_count_20 += download_result['total'] - download_result['success']
                    for failed_data in download_result['failed_list']:
                        # Put the failed url records back into the persistent queue
                        self.queue.put(failed_data)
                        print failed_data
                batch = download_count % self.CHKTHD
                if batch == 0:
                    self.activity = 1 - float(fail_count_20) / float(total_count)
                    print '[%s]COUNT: %d, FAIL-IN-this%d:%d , avail:%f:' % (
                        self.proxy_ip, self.CHKTHD, total_count, fail_count_20, self.activity)
                    fail_count_20 = 0
                    total_count = 0
                    if self.activity < 0.3:
                        print '[%s]rate of download is %f, too low' % (self.proxy_ip, self.activity)
                        db = MysqlHandle()
                        sql = "update temp_ips_manage set availabity=%s where proxy='%s'" % (
                            self.activity, self.proxy_ip)
                        db.update(sql=sql)
                        db.close()
                        # self.change_proxy()
                        self.proxy = {'http': 'http://' + self.proxy_manger.change_proxy(self.proxy_ip)[1]}
                spider_time = time.time() - start_time
                if spider_time > 600:
                    print '[%s]timeout, we will quit' % (self.proxy_ip)
                    self.proxy = {'http': 'http://' + self.proxy_manger.change_proxy(self.proxy_ip)[1]}
                    start_time = time.time()
                elapsed = time.time() - start_time_this_round
                interval = random.randint(self.interval_down, self.interval_upp)
                if elapsed < interval:
                    time.sleep(interval - elapsed)
        except Exception, e:
            print e.message
            print self.tuple_from_queue
            if type(self.tuple_from_queue) == list and len(self.tuple_from_queue) == 3:
                self.queue.put(self.tuple_from_queue)
def download_page(self, urlmd5, url, proxy):
    downloader = PageDownload(proxy=proxy, timeout=5)
    page = downloader.simple_download(url=url)
    if page is not None:
        company_name = self.extract_field_from_page(
            page=page,
            reg=r"""<meta name="keywords" content="([^=]+)"/>""",
            index=-1)
        company_net = self.extract_field_from_page(
            page=page,
            reg=r'"/companyurlimg.php\?url=([^<]+)" alt="myImage" style="border:none')
        address = self.extract_field_from_page(page=page.decode("utf-8"),
                                               reg="<p>([^=]+)</p>", index=-1)
        # if file_name is None:
        #     file_name = self.extract_field_from_page(page=page, reg=r'<h1>([^<]+)下载?</h1>')
        sql = "insert into " + self.page_table + " values (%s,%s,%s,%s,%s,now())"
        db = MysqlHandle()
        db.insert(sql=sql, value_list=[(urlmd5, url, company_name, company_net, address)])
        db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
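# PageDownload is not shown in this section either. A minimal sketch of the
# interface the code above relies on (hypothetical; the real class may add
# headers, retries, or encoding handling): fetch a url, optionally through a
# proxy, and return the response body, or None on any failure.
import requests


class PageDownload(object):
    def __init__(self, proxy=None, timeout=10):
        self.proxy = proxy      # e.g. {'http': 'http://1.2.3.4:8080'}
        self.timeout = timeout

    def simple_download(self, url):
        try:
            resp = requests.get(url, proxies=self.proxy, timeout=self.timeout)
            if resp.status_code == 200:
                return resp.content
            return None
        except requests.RequestException:
            return None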
def download_page():
    db = MysqlHandle()
    query_sql = "select uid,min(name),min(line_type), min(page_url) from baidu_busline_url_analyse where status=0 group by uid "
    page_infs = db.query(query_sql)
    db.close()
    downloader = PageDownload()
    for item in page_infs:
        print(item[0])
        page = downloader.simple_download(item[3])
        # if is_json(page):
        #     json_page = json.loads(page)
        #     if json_page.has_key("content"):
        #         main_info = json_page["content"][0]
        #         name = main_info["name"]
        #         timeable = main_info["timeable"]
        db = MysqlHandle()
        is_success = False
        if page is not None:
            insert_sql = "insert into baidu_busline_page values(%s,%s,%s,%s,NULL )"
            is_success = db.insert(insert_sql, [(item[0], item[1], item[2], page)])
        if is_success and page is not None:
            update_sql = "update baidu_busline_url_analyse set status=200 where uid='%s'" % (item[0])
            db.update(update_sql)
        db.close()
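# MysqlHandle is assumed rather than shown. A sketch of the interface the code
# in this section relies on (hypothetical; connection parameters are
# placeholders): query() returns all rows, insert() takes a parameterised
# statement plus a list of value tuples, update() executes a raw statement,
# close() closes the connection.
import MySQLdb


class MysqlHandle(object):
    def __init__(self):
        self.conn = MySQLdb.connect(host="localhost", user="root", passwd="",
                                    db="spider", charset="utf8")

    def query(self, sql):
        cursor = self.conn.cursor()
        cursor.execute(sql)
        return cursor.fetchall()

    def insert(self, sql, value_list):
        try:
            cursor = self.conn.cursor()
            cursor.executemany(sql, value_list)
            self.conn.commit()
            return True
        except MySQLdb.MySQLError:
            self.conn.rollback()
            return False

    def update(self, sql):
        cursor = self.conn.cursor()
        cursor.execute(sql)
        self.conn.commit()
        return True

    def close(self):
        self.conn.close()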
def download_page(self, urlmd5, url, proxy):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if page is not None:
        file_name = self.extract_field_from_page(
            page=page, reg=r'<h1>([^<]+)\[FLAC格式\]下载</h1>')
        if file_name is None:
            file_name = self.extract_field_from_page(
                page=page, reg=r'<h1>([^<]+)下载?</h1>')
        baiduyun_url = self.extract_field_from_page(
            page=page,
            reg=r'<meta name="description" content="'
                r'[^<]*(https?://pan.baidu.com/[^\s]+) [^<]+"/>')
        baiduyun_password = self.extract_field_from_page(
            page=page,
            reg=r'<meta name="description" content'
                r'="[^<]+密码[\W]+(\w+)..."/>')
        sql = "insert into " + self.page_table + " values (%s,%s,%s,%s,%s,now())"
        db = MysqlHandle()
        db.insert(sql=sql, value_list=[(urlmd5, url, file_name, baiduyun_url, baiduyun_password)])
        db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def download_page(self, urlmd5, url, proxy, url_table, page_table):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if page is not None:
        file_name = self.extract_field_from_page(
            page=page, reg=r'<h1 class="yh mt_1 f_32">([^<]+\.[a-z]+)</h1>')
        file_size = self.extract_field_from_page(
            page=page,
            reg=r'<h3 class="c999 fl mt_05 f_12 n yh">'
                r'<em class="n ml_1 mr_1">·</em>(\d+\.?\d+M)</h3>')
        baiduyun_url = self.extract_field_from_page(
            page=page, reg=r'href="(https?://pan.baidu.com/[^\s]+)"')
        baiduyun_password = self.extract_field_from_page(
            page=page, reg=r'<em class="dn"></em>密码:(\w+)</b>')
        sql = "insert into " + page_table + " values (%s,%s,%s,%s,%s,%s,now())"
        db = MysqlHandle()
        db.insert(sql=sql, value_list=[(urlmd5, url, file_name, file_size,
                                        baiduyun_url, baiduyun_password)])
        db.close()
        update_sql = "update " + url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def download_list_page(self, urlmd5, url, proxy, boundary, domain=None):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if is_json(page):
        json_page = json.loads(page)
        result = json_page["result"]
        total_count = result["total"]
        print ("total:" + str(total_count))
        if int(total_count) <= 10 and int(total_count) > 0:
            # Few enough results: queue a detail-page url for every item in this boundary
            content = json_page["content"]
            for item in content:
                uid = item["uid"]
                primary_uid = item["primary_uid"]
                new_url = self.page_url % (uid, primary_uid)
                new_urlmd5 = to_md5(in_str=new_url)
                url_type = 0
                boundary = None
                status = 0
                sql = "select * from " + self.url_table + " where urlmd5='%s'" % (new_urlmd5)
                db = MysqlHandle()
                results = db.query(sql=sql)
                db.close()
                if not results:
                    db = MysqlHandle()
                    insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,%s,now())"
                    db.insert(sql=insert_sql,
                              value_list=[(new_urlmd5, new_url, url_type, boundary, status)])
                    db.close()
                else:
                    print "This url is already in the database!!"
        elif int(total_count) <= 0:
            pass
        else:
            # Too many results: split the boundary into smaller cells and queue a list url per cell
            min_interval = boundary.split(";")[0]
            max_interval = boundary.split(";")[1]
            lat_min = min_interval.split(",")[1]
            lat_max = max_interval.split(",")[1]
            lng_min = min_interval.split(",")[0]
            lng_max = max_interval.split(",")[0]
            boundarys = split_boundary(int(float(lat_max)), int(float(lat_min)),
                                       int(float(lng_max)), int(float(lng_min)), 4, 0.2)
            for _boundary in boundarys:
                _boundary_st = (str(_boundary[1][0]) + "," + str(_boundary[0][0]) + ";" +
                                str(_boundary[1][1]) + "," + str(_boundary[0][1]))
                new_url = self.list_url % (self.city_code, self.keyword, _boundary_st)
                new_urlmd5 = to_md5(in_str=new_url)
                url_type = 1
                boundary = _boundary_st
                status = 0
                db = MysqlHandle()
                insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,%s,now())"
                db.insert(sql=insert_sql,
                          value_list=[(new_urlmd5, new_url, url_type, boundary, status)])
                db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
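# is_json is assumed to be a small guard used before json.loads on downloaded
# pages; a minimal sketch under that assumption (hypothetical):
import json


def is_json(text):
    if text is None:
        return False
    try:
        json.loads(text)
        return True
    except ValueError:
        return False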
def spider(city_code, name, keyword, key_token):
    boundary_table = "bdmap_api_" + name + "_" + str(city_code) + "_boundary_table"
    page_table = "bdmap_api_" + name + "_" + str(city_code) + "_page_table"
    base_url = "http://api.map.baidu.com/place/v2/search?query=%s&scope=2&bounds=%s&output=json&ak=%s&page_num=%d"
    sql = "select md5, boundary from " + boundary_table + " where status=0"
    db = MysqlHandle()
    res_data = db.query(sql)
    for (md5, boundary) in res_data:
        url = base_url % (keyword, boundary, key_token, 0)
        downloader = PageDownload()
        page = downloader.simple_download(url)
        if is_json(page):
            json_data = json.loads(page)
            status = json_data["status"]
            total = json_data["total"]
            print(boundary, url, total)
            if status == 0 and int(total) > 0:
                # Page through the results, 10 records per page
                page_count = int(total) / 10
                for x in range(0, page_count + 2):
                    _url = base_url % (keyword, boundary, key_token, x)
                    downloader = PageDownload()
                    _page = downloader.simple_download(_url)
                    if is_json(_page):
                        _json_data = json.loads(_page)
                        results = _json_data["results"]
                        for item in results:
                            name = item["name"]
                            address = item["address"]
                            province = item["province"]
                            city = item["city"]
                            area = item["area"]
                            uid = item["uid"]
                            _md5 = to_md5(uid)
                            lat = item["location"]["lat"]
                            lng = item["location"]["lng"]
                            try:
                                tag = item["detail_info"]["tag"]
                            except Exception, e:
                                tag = None
                                print(e.message)
                            sql = "insert into " + page_table + " values(%s,%s,%s,%s,null,%s,%s,%s,%s,%s,null,null,%s,null,now(),null)"
                            db = MysqlHandle()
                            db.insert(sql, [[_md5, uid, name, address, province,
                                             city, area, lng, lat, tag]])
                            db.close()
            sql = ('update ' + boundary_table + ' set status=200,total_count=' + str(total) +
                   ' where md5="' + md5 + '"')
            db = MysqlHandle()
            db.update(sql)
            db.close()
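# to_md5 is assumed to turn a url or uid into the hex MD5 digest used as the
# primary key of the url/page tables; a minimal sketch (hypothetical):
import hashlib


def to_md5(in_str):
    return hashlib.md5(in_str).hexdigest()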
def insert_into_shp(shp, workspace, query_item):
    uid = query_item[0]
    name = query_item[1]
    geo_type = query_item[2]
    page = query_item[3]
    json_page = json.loads(page)
    if not json_page.has_key("content"):
        return
    content = json_page["content"]
    item_info = content[0]
    geo = item_info["geo"]
    _geo = geo.split("|")[2].strip(";")
    # Build a WKT MULTILINESTRING from the "lng,lat,lng,lat,..." segments
    real_geo = "MULTILINESTRING("
    for segment in _geo.split(";"):
        real_geo = real_geo + "("
        los = segment.split(",")
        for i in range(0, len(los), 2):
            # if i > 2:
            #     if los[i] == los[i - 2] and los[i + 1] == los[i - 1]:
            #         continue
            real_geo = real_geo + los[i] + " " + los[i + 1] + ","
        real_geo = real_geo.strip(",") + "),"
    real_geo = real_geo.strip(",") + ")"
    timetable = item_info["timetable"]
    if timetable is None:
        timetable = ""
    price = int(item_info["ticketPrice"]) / 100.0
    current_city = json_page["current_city"]
    city = current_city["name"]
    if city is None:
        city = ""
    province = current_city["up_province_name"]
    if province is None:
        province = ""
    arcpy.env.workspace = workspace
    polyline = arcpy.FromWKT(real_geo)
    fields = ["UID", "NAME", "PROVINCE", "CITY", "GEO_TYPE", "TIMETABLE", "PRICE"]
    fields.append("SHAPE@")
    values = [uid, name, province, city, geo_type, timetable, price, polyline]
    cursor = arcpy.da.InsertCursor(shp, fields)
    cursor.insertRow(values)
    del cursor
    db = MysqlHandle()
    sql = 'update baidu_busline_page set status=200 where uid="' + uid + '"'
    db.update(sql)
def download_page(self, urlmd5, url, proxy):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if is_json(page):
        page_json = json.loads(page)
        content = page_json["content"]
        uid = content["uid"]
        name = content["name"]
        address = content["addr"]
        if content.has_key("phone"):
            phone = content["phone"]
        else:
            phone = None
        x = content["navi_x"]
        y = content["navi_y"]
        # geo = content["geo"]
        ext = content["ext"]
        if isinstance(ext, dict):
            detail_info = ext["detail_info"]
        else:
            detail_info = {"info": ""}
        if detail_info.has_key("tag"):
            tag = detail_info["tag"]
        else:
            tag = None
        if detail_info.has_key("image"):
            image = detail_info["image"]
        else:
            image = None
        if detail_info.has_key("display_info_redu"):
            display_redu = detail_info["display_info_redu"]
        else:
            display_redu = None
        if detail_info.has_key("price"):
            price = detail_info["price"]
        else:
            price = None
        sql = ("insert into " + self.page_table +
               " values (%s,%s,%s,%s,%s,%s,%s,%s,null,null,null,%s,%s,%s,null,%s,now())")
        db = MysqlHandle()
        db.insert(sql=sql, value_list=[(urlmd5, url, uid, name, address, phone,
                                        x, y, tag, image, price, display_redu)])
        db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def download_list_page(self, urlmd5, url, proxy, url_table, filter_table, domain=None):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if page is not None:
        new_urls = re.findall(self.reg, page)
        for _url in new_urls:
            if domain is not None:
                if _url.startswith("/"):
                    new_url = domain + _url
                else:
                    new_url = _url
            else:
                new_url = _url
            url_type = self.filter_url(url=new_url, filter_table=filter_table)
            if url_type is not None:
                new_urlmd5 = to_md5(in_str=new_url)
                sql = "select * from " + url_table + " where urlmd5='%s'" % (new_urlmd5)
                db = MysqlHandle()
                results = db.query(sql=sql)
                db.close()
                if not results:
                    db = MysqlHandle()
                    insert_sql = "insert into " + url_table + " values (%s,%s,%s,%s,now())"
                    db.insert(sql=insert_sql, value_list=[(new_urlmd5, new_url, url_type, 0)])
                    db.close()
                else:
                    print "This url is already in the database!!"
            else:
                pass
        update_sql = "update " + url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def start_feed(self):
    round_num = 0
    last_url_count = 0
    this_url_count = 0
    same_count = 0
    while self.run_sign:
        if len(self.queue) < 2500:
            start_time = time.time()
            round_index = round_num % 10
            db = MysqlHandle()
            url_list = db.query(self.sql)
            db.close()
            count = 0
            for url_data in url_list:
                self.queue.put((url_data[0], url_data[1], url_data[2], url_data[3]))
                update_sql = self.update_sql_base % url_data[0]
                db = MysqlHandle()
                db.update(update_sql)
                db.close()
                count += 1
            print 'FinishedQueue-' + self.url_table + ': %d TIME ELAPSED: %f ' % (
                count, time.time() - start_time)
            if round_index == 0:
                if this_url_count == last_url_count:
                    same_count += 1
                else:
                    last_url_count = this_url_count
                    same_count = 0
                this_url_count = 0
            else:
                this_url_count += 1
            round_num += 1
        else:
            print 'The Queue is full!'
        if same_count == 100:
            print 'THE SAME NUM %d appeared for 10 rounds, feeding frequency turned down.' % last_url_count
            time.sleep(360)
            self.set_stop()
        else:
            time.sleep(5)
    print 'Exited successfully!!'
def update_geo_data(uid, l_type, city_code, name):
    page_table = "bdmap_api_" + name + "_" + str(city_code) + "_page_table"
    url = "http://map.baidu.com/?qt=ext&newmap=1&uid=%s&c=%d&nn=0&l=%d&ext_ver=new" % (
        uid, city_code, l_type)
    downloader = PageDownload()
    page = downloader.simple_download(url)
    if is_json(page):
        json_data = json.loads(page)
        if json_data.has_key("content"):
            content = json_data["content"]
            if content.has_key("geo"):
                geo = content["geo"]
                print(uid)
                md5 = to_md5(uid)
                sql = "update " + page_table + ' set geo="' + geo + '" where md5="' + md5 + '"'
                db = MysqlHandle()
                db.update(sql)
                db.close()
    time.sleep(random.uniform(0.5, 1.0))
def download_list_page(self, urlmd5, url, proxy, domain=None):
    downloader = PageDownload(proxy=proxy, timeout=10)
    page = downloader.simple_download(url=url)
    if page is not None:
        new_urls = re.findall(self.reg, page)
        # singer_names = re.findall(self.js0_reg, page)
        # for singer_name in singer_names:
        #     merge_url = "http://www.51ape.com/skin/ape/php/qx_2.php?qx=" + singer_name
        #     new_urls.append(merge_url)
        for _url in new_urls:
            if domain is not None:
                if _url.startswith("/"):
                    new_url = domain + _url
                else:
                    new_url = _url
            else:
                new_url = _url
            url_type = self.filter_url(url=new_url)
            if url_type is not None:
                new_urlmd5 = to_md5(in_str=new_url)
                sql = "select * from " + self.url_table + " where urlmd5='%s'" % (new_urlmd5)
                db = MysqlHandle()
                results = db.query(sql=sql)
                db.close()
                if not results:
                    db = MysqlHandle()
                    insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,now())"
                    db.insert(sql=insert_sql, value_list=[(new_urlmd5, new_url, url_type, 0)])
                    db.close()
                else:
                    print "This url is already in the database!!"
            else:
                pass
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def download_page(self, urlmd5, url, proxy):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if page is not None:
        file_name = self.extract_field_from_page(
            page=page,
            reg=r'<h2>([^\]]+) <span class="c?b?">'
                r'WAV</span>')
        # if file_name is None:
        #     file_name = self.extract_field_from_page(page=page, reg=r'<h1>([^<]+)下载?</h1>')
        singer_name = self.extract_field_from_page(
            page=page,
            reg=r'<p><a href="/detail/\d+.html" style="color:'
                r'#217fbc">([^=]+)</a></p>')
        result = self.extract_field_from_page(
            page=page,
            reg=r'<p class="downurl">链接: (https?://pan.baidu.com/s/[^=]+) 密码: (\w+)</p>')
        if result is not None:
            [baiduyun_url, baiduyun_password] = result
        else:
            baiduyun_url = None
            baiduyun_password = None
        sql = "insert into " + self.page_table + " values (%s,%s,%s,%s,%s,%s,now())"
        db = MysqlHandle()
        db.insert(sql=sql, value_list=[(urlmd5, url, file_name, singer_name,
                                        baiduyun_url, baiduyun_password)])
        db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def insert_into_stations(shp, workspace, query_item):
    l_uid = query_item[0]
    name = query_item[1]
    page = query_item[3]
    json_page = json.loads(page)
    if not json_page.has_key("content"):
        return
    content = json_page["content"]
    item_info = content[0]
    stations = item_info["stations"]
    current_city = json_page["current_city"]
    city = current_city["name"]
    if city is None:
        city = ""
    province = current_city["up_province_name"]
    if province is None:
        province = ""
    for station in stations:
        station_name = station["name"]
        station_geo = station["geo"].strip(";").split("|")[-1].replace(",", " ")
        geo_str = "POINT(%s)" % (station_geo)
        station_uid = station["uid"]
        arcpy.env.workspace = workspace
        point = arcpy.FromWKT(geo_str)
        fields = ["UID", "NAME", "PROVINCE", "CITY", "L_UID", "L_NAME"]
        fields.append("SHAPE@")
        values = [station_uid, station_name, province, city, l_uid, name, point]
        cursor = arcpy.da.InsertCursor(shp, fields)
        cursor.insertRow(values)
        del cursor
    db = MysqlHandle()
    sql = 'update baidu_busline_page set status=200 where uid="' + l_uid + '"'
    db.update(sql)
def download(division, d_type, abb_name=None):
    base_url = "http://xzqh.mca.gov.cn/defaultQuery?shengji=%s&diji=%s&xianji=%s"
    webSelenium = WebSelenium()
    if d_type == 1:
        url = base_url % (urllib.quote((division + "(" + abb_name + ")").encode("gb2312")),
                          "-1", "-1")
        driver = webSelenium.simple_download(url, "chrome")
        print url
        rows = driver.find_elements_by_xpath(
            "/html/body/div[@id='center']/div[@class='mid_con_qt']/table[@class='info_table']/tbody/tr[@class='shi_nub']")
        for row in rows:
            c_division = row.find_element_by_xpath(
                "td[@class='name_left']/a[@class='name_text']").text
            population = row.find_element_by_xpath("td[3]").text
            area = row.find_element_by_xpath("td[4]").text
            code = row.find_element_by_xpath("td[5]").text
            zone = row.find_element_by_xpath("td[6]").text
            zip_code = row.find_element_by_xpath("td[7]").text
            if population == u'':
                population = None
            if area == u'':
                area = None
            if code == u'':
                code = None
            if zone == u'':
                zone = None
            if zip_code == u'':
                zip_code = None
            print(c_division, population, area, code, zone, zip_code)
            db = MysqlHandle()
            sql = "insert into divisions values(%s,%s,NULL ,%s,%s,%s,%s,%s,%s,0)"
            is_ok = db.insert(sql, [(c_division, code, 2, population, area, zone,
                                     zip_code, division)])
            db.close()
            if is_ok:
                db = MysqlHandle()
                sql = ('update divisions set status=200 where division="' + division +
                       '" and type=' + str(d_type))
                db.update(sql)
    elif d_type == 2:
        db = MysqlHandle()
        # sql = 'select parent from divisions where division="'+division+'"'
        sql = ('SELECT division, abb_name FROM divisions where division in '
               '(select parent from divisions where division="' + division + '")')
        res = db.query(sql)
        parent_division = res[0][0] + "(" + res[0][1] + ")"
        url = base_url % (urllib.quote(parent_division.encode("gb2312")),
                          urllib.quote(division.encode("gb2312")), "-1")
        driver = webSelenium.simple_download(url, "chrome")
        print url
        rows = driver.find_elements_by_xpath(
            "/html/body/div[@id='center']/div[@class='mid_con_qt']/table[@class='info_table']/tbody/tr")
        for row in rows[2:]:
            c_division = row.find_element_by_xpath("td[@class='name_left']").text
            population = row.find_element_by_xpath("td[3]").text
            if population == u'':
                population = None
            area = row.find_element_by_xpath("td[4]").text
            if area == u'':
                area = None
            code = row.find_element_by_xpath("td[5]").text
            if code == u'':
                code = None
            zone = row.find_element_by_xpath("td[6]").text
            if zone == u'':
                zone = None
            zip_code = row.find_element_by_xpath("td[7]").text
            if zip_code == u'':
                zip_code = None
            print(c_division, population, area, code, zone, zip_code)
            db = MysqlHandle()
            sql = "insert into divisions values(%s,%s,NULL ,%s,%s,%s,%s,%s,%s,0)"
            is_ok = db.insert(sql, [(c_division, code, 3, population, area, zone,
                                     zip_code, division)])
            if is_ok:
                db = MysqlHandle()
                sql = ('update divisions set status=200 where division="' + division +
                       '" and type=' + str(d_type))
                db.update(sql)
    else:
        pass