Ejemplo n.º 1
0
def generate_samples():
    """Build ``sample.xls`` with original and converted coordinate pairs.

    Reads up to 1000 ``x, y`` rows from ``bdmap_api_school_218_page_table``,
    sends each pair to the Baidu ``geoconv`` endpoint (``from=5&to=6``) and
    writes ``x, y, new_x, new_y`` into consecutive spreadsheet rows.  Rows
    whose response is not JSON or whose ``status`` is not 0 are skipped.
    """
    base_url = "http://api.map.baidu.com/geoconv/v1/?coords=%s,%s&from=5&to=6&ak=XwpZGfXMn45W9Czd1UwmC6RwMMULD1Ue"
    work_book = xlwt.Workbook()
    sheet = work_book.add_sheet("sheet")
    sql = "select x , y from bdmap_api_school_218_page_table LIMIT 1000"
    db = MysqlHandle()
    try:
        query_res = db.query(sql)
    finally:
        # The handle was previously left open for the whole run.
        db.close()
    # One downloader is enough; it does not depend on the row being processed.
    downloader = PageDownload()
    row = 0
    for (x, y) in query_res:
        page = downloader.simple_download(base_url % (str(x), str(y)))
        if not is_json(page):
            continue
        json_page = json.loads(page)
        if json_page["status"] != 0:
            continue
        converted = json_page["result"][0]
        new_x = converted["x"]
        new_y = converted["y"]
        print (x, y, new_x, new_y)
        for col, value in enumerate((x, y, new_x, new_y)):
            sheet.write(row, col, value)
        # Only successful conversions consume a spreadsheet row.
        row += 1

    work_book.save("sample.xls")
Ejemplo n.º 2
0
def download(city_code, l_type, name, tag, city):
    """Fetch geometry for every row of a city's page table that has none yet.

    Selects the ``uid`` of each row in ``bdmap_api_<name>_<city_code>_page_table``
    whose ``city`` matches and whose ``geo`` is NULL, and calls
    ``update_geo_data`` for it.  ``tag`` is accepted but not used here.
    """
    page_table = "bdmap_api_" + name + "_" + str(city_code) + "_page_table"
    # NOTE(review): ``city`` is concatenated straight into the statement; if it
    # can come from outside the program it should be bound as a parameter.
    sql = 'select uid from ' + page_table + ' where  city="' + city + '" and geo is null'
    db = MysqlHandle()
    try:
        query_data = db.query(sql)
    finally:
        # Previously the handle was never closed.
        db.close()
    for (uid, ) in query_data:
        update_geo_data(uid, l_type, city_code, name)
def download():
    """Drive a browser to export boundary shapefiles for queried divisions.

    For every ``(division, parent)`` row returned by the query, the "+" and
    star markers are stripped from the division name; if
    ``<parent><division>.shp`` already exists in the output folder the row is
    skipped.  Otherwise the local ``get_bmap_boundary`` page is opened in
    Chrome and its second input is clicked twice, then its third input once,
    with fixed pauses between the clicks.  Errors for one division are printed
    and do not stop the loop.
    """
    db = MysqlHandle()
    sql = "select s.division, s.parent from divisions t,divisions s where t.division=s.parent and t.parent='湖北省' "
    try:
        query_res = db.query(sql)
    finally:
        # Previously the handle was never closed.
        db.close()
    webSelenium = WebSelenium()
    for (division, parent) in query_res:
        division = division.replace("+", "").replace("☆", "")
        search_word = parent + division
        if arcpy.Exists(r'G:\xin.data\spiders_data\hbs' + "\\" + search_word +
                        ".shp"):
            continue
        print(division)
        browser = None
        try:
            browser = webSelenium.simple_download(
                "http://127.0.0.1/common/get_bmap_boundary?city=" + division,
                "chrome")
            button_path = browser.find_elements_by_xpath(
                "/html/body/input[2]")[0]

            button_path.click()
            time.sleep(5)
            button_path.click()

            button_download = browser.find_element_by_xpath(
                "/html/body/input[3]")
            time.sleep(5)
            button_download.click()
            time.sleep(3)
        except Exception as e:
            print(e.message)
        finally:
            # Close the window even when a click or lookup failed; before, a
            # failure left the browser open for every failing division.
            if browser is not None:
                try:
                    browser.close()
                except Exception as e:
                    print(e.message)
Ejemplo n.º 4
0
 def download_page(self, base_url, geo, proxy, city_code, sec, page_table):
     """POST one map-centre query and store the bikes listed in the response.

     ``geo`` supplies longitude (index 0) and latitude (index 1).  Every entry
     of the response's ``"object"`` list is inserted into ``page_table`` as
     ``(distId, biketype, type, distY, distX, "<lng>,<lat>", sec)``.

     Returns True when the response was JSON containing ``"object"`` and the
     insert was issued, False otherwise.
     """
     downloader = PageDownload(proxy=proxy, hd=mobike_headers)
     post_dic = {
         'longitude': str(geo[0]),
         'latitude': str(geo[1]),
         'citycode': str(city_code),
         'errMsg': 'getMapCenterLocation:ok'
     }
     page = downloader.download_with_post(url=base_url, post_data=post_dic)
     if not is_json(page):
         return False
     json_page = json.loads(page)
     if "object" not in json_page:
         # This path used to fall off the end and return None, so callers got
         # three different results (True / False / None).
         return False
     dis_source = str(geo[0]) + "," + str(geo[1])
     items = [
         (mobike["distId"], mobike["biketype"], mobike["type"],
          mobike["distY"], mobike["distX"], dis_source, sec)
         for mobike in json_page["object"]
     ]
     sql = "insert into " + page_table + " values(%s,%s,%s,%s,%s,%s,%s,now())"
     db = MysqlHandle()
     try:
         db.insert(sql=sql, value_list=items)
     finally:
         # Previously the handle was never closed.
         db.close()
     return True
Ejemplo n.º 5
0
 def GET(self):
     """Check a login request and return a ``{"status", "msg"}`` dict.

     The request must carry ``token`` equal to "whanys" plus ``username`` and
     ``password``.  Status "0" is returned when the user query yields a row,
     status "1" with an explanatory message in every other case.
     """
     inputs = web.input()
     if "token" not in inputs:
         return {"status": "1", "msg": "failed,you need a token!"}
     if inputs["token"] != "whanys":
         return {
             "status": "1",
             "msg": "failed,your token is not true!"
         }
     if "username" not in inputs or "password" not in inputs:
         return {
             "status": "1",
             "msg": "failed,parameters not enough!"
         }
     username = inputs["username"]
     password = inputs["password"]
     # NOTE(review): the statement below contains literal '******' values and
     # never uses ``username`` / ``password``; this looks like redacted text.
     # Confirm, and bind the two values as parameters when restoring it.
     sql = "select * from user_table_yixin where username='******' and password='******' and status=0"
     db = MysqlHandle()
     try:
         res = db.query(sql)
     finally:
         # Previously the handle was never closed.
         db.close()
     if res:
         return {"status": "0", "msg": "success"}
     return {"status": "1", "msg": "failed!"}
Ejemplo n.º 6
0
 def filter_url(self, url):
     """Return the type of the first stored filter whose regex matches *url*.

     Filters are read as ``(type, filter)`` rows from ``self.filter_table`` and
     tried in the order returned, each with ``re.match`` (anchored at the start
     of *url*).  Returns None when no filter matches.
     """
     sql = "select type, filter from "+self.filter_table
     db = MysqlHandle()
     try:
         filter_data = db.query(sql=sql)
     finally:
         # Previously the handle was never closed.
         db.close()
     for _type, pattern in filter_data:
         if re.match(pattern, url) is not None:
             return _type
     return None
Ejemplo n.º 7
0
def load_urls():
    """Return a queue of ``(urlmd5, contact_url)`` pairs still to be crawled.

    Selects rows of ``cphi_page_table`` whose ``urlmd5`` is not yet present in
    ``cphi_info_table``.  Each URL has ``/introduce/`` replaced by ``/`` and
    ``contactinfo/`` appended.
    """
    db = MysqlHandle()
    sql = "select urlmd5,url from cphi_page_table where urlmd5 not in (select urlmd5 FROM cphi_info_table)"
    try:
        results = db.query(sql)
    finally:
        # Previously the handle was never closed.
        db.close()
    queue = Queue()
    for urlmd5, url in results:
        contact_url = url.replace("/introduce/", "/") + "contactinfo/"
        queue.put_nowait((urlmd5, contact_url))
    return queue
Ejemplo n.º 8
0
def split_boundary_outline(b_type, city_code, name):
    """Subdivide stored boundaries that have ``total_count=400``.

    Reads the ``(lng_min, lat_min, lng_max, lat_max)`` rows of the given type
    from ``bdmap_api_<name>_<city_code>_boundary_table``, splits each with
    ``split_boundary(..., 10, 0.1)`` and inserts every resulting cell back into
    the same table with type 2.  The key of a new row is the MD5 of its
    ``"lat_min,lng_min,lat_max,lng_max"`` text.
    """
    boundary_table = "bdmap_api_" + name + "_" + str(
        city_code) + "_boundary_table"
    sql = 'select lng_min,lat_min,lng_max,lat_max from ' + boundary_table + " where type=" + str(
        b_type) + " and total_count=400"
    insert_sql = "insert into " + boundary_table + " values(%s,%s,2,%s,%s,%s,%s,0,0,now())"
    db = MysqlHandle()
    try:
        query_res = db.query(sql)
    finally:
        # The loop below used to overwrite ``db`` with fresh handles, so this
        # one was never closed.
        db.close()
    for (lng_min, lat_min, lng_max, lat_max) in query_res:
        boundarys = split_boundary(float(lat_max), float(lat_min),
                                   float(lng_max), float(lng_min), 10, 0.1)
        for _boundary in boundarys:
            # Index 0 holds the latitude pair, index 1 the longitude pair.
            _lat_min = _boundary[0][0]
            _lat_max = _boundary[0][1]
            _lng_min = _boundary[1][0]
            _lng_max = _boundary[1][1]
            _boundary_st = ",".join(
                [str(_lat_min), str(_lng_min), str(_lat_max), str(_lng_max)])
            md5 = to_md5(_boundary_st)
            insert_db = MysqlHandle()
            insert_db.insert(
                insert_sql,
                [[md5, _boundary_st, _lng_min, _lat_min, _lng_max, _lat_max]])
            insert_db.close()
Ejemplo n.º 9
0
def spider(d_type):
    """Run ``download`` for every pending division of the given type.

    Pending divisions are the ``(division, abb_name)`` rows of ``divisions``
    with ``status=0`` and ``type = d_type``.  A failure for one division is
    printed and the loop moves on to the next one.
    """
    db = MysqlHandle()
    sql = "select division, abb_name from divisions where status=0 and type =" + str(
        d_type)
    try:
        query_res = db.query(sql)
    finally:
        # Previously the handle was never closed.
        db.close()
    for (division, abb_name) in query_res:
        print(division)
        try:
            download(division, d_type, abb_name)
        except Exception as e:
            print(e.message)
Ejemplo n.º 10
0
def insert_into_shp(shp, workspace, query_item):
    """Insert one bus line as a polyline feature and mark its page as done.

    ``query_item`` is ``(uid, name, geo_type, page)`` where ``page`` is the
    JSON text of the line.  Nothing happens when the JSON has no ``"content"``.
    Otherwise the third ``|``-separated part of ``content[0]["geo"]`` is turned
    into a ``MULTILINESTRING`` WKT (segments separated by ``;``, coordinates
    listed as ``x,y,x,y,...``), a row is inserted into ``shp`` and the matching
    ``baidu_busline_page`` row gets ``status=200``.
    """
    uid = query_item[0]
    name = query_item[1]
    geo_type = query_item[2]
    page = query_item[3]
    json_page = json.loads(page)
    if "content" not in json_page:
        return
    item_info = json_page["content"][0]
    _geo = item_info["geo"].split("|")[2].strip(";")

    segments = []
    for segment in _geo.split(";"):
        los = segment.split(",")
        # Coordinates come as a flat x,y,x,y,... list; pair them up as "x y".
        pairs = [los[i] + " " + los[i + 1] for i in range(0, len(los), 2)]
        segments.append("(" + ",".join(pairs) + ")")
    real_geo = "MULTILINESTRING(" + ",".join(segments) + ")"

    timetable = item_info["timetable"]
    if timetable is None:
        timetable = ""
    # ticketPrice is divided by 100 before being stored.
    price = int(item_info["ticketPrice"]) / 100.0

    current_city = json_page["current_city"]
    city = current_city["name"]
    if city is None:
        city = ""
    province = current_city["up_province_name"]
    if province is None:
        province = ""

    arcpy.env.workspace = workspace
    polyline = arcpy.FromWKT(real_geo)
    fields = [
        "UID", "NAME", "PROVINCE", "CITY", "GEO_TYPE", "TIMETABLE", "PRICE",
        "SHAPE@"
    ]
    values = [uid, name, province, city, geo_type, timetable, price, polyline]
    cursor = arcpy.da.InsertCursor(shp, fields)
    try:
        cursor.insertRow(values)
    finally:
        # Release the cursor even if the insert fails.
        del cursor

    db = MysqlHandle()
    # The statement used to read ``item[0]``, a name that does not exist in
    # this function; the uid of the row being processed is what is meant.
    sql = 'update baidu_busline_page set status=200 where uid="' + uid + '"'
    try:
        db.update(sql)
    finally:
        db.close()
Ejemplo n.º 11
0
    def run(self):
        """Worker loop: take tasks from the queue and hand them to the spider.

        Each task is passed to ``self.spider.process`` while ``self.lock`` is
        held.  Entries of the result's ``failed_list`` are put back on the
        queue.  Every ``self.CHKTHD`` downloads the success ratio since the
        last check is computed; below 0.3 it is written to ``temp_ips_manage``
        and a new proxy is requested.  A new proxy is also requested once more
        than 600 seconds have passed since the last timer reset.  Between
        tasks the loop sleeps so that a round lasts at least a random number of
        seconds between ``self.interval_down`` and ``self.interval_upp``.
        A ``None`` task ends the loop after a 90 second pause.
        """
        download_count = 0
        total_count = 0
        fail_count_20 = 0
        start_time = time.time()
        while 1:
            try:
                self.tuple_from_queue = self.queue.get()
                if self.tuple_from_queue is None:
                    print(u'urls pool is empty,please wait for 90s')
                    time.sleep(90)
                    break
                else:
                    print(self.tuple_from_queue)
                    start_time_this_round = time.time()
                    download_count += 1
                    self.lock.acquire()
                    try:
                        if not self.need_proxy:
                            self.proxy = None
                        download_result = self.spider.process(tuple_from_queue=self.tuple_from_queue, proxy=self.proxy)
                    finally:
                        # Release even when process() raises; otherwise every
                        # later acquire() on this lock would block forever.
                        self.lock.release()

                    total_count += download_result['total']
                    if download_result['total'] != download_result['success']:  # not every URL succeeded: count the failures
                        fail_count_20 += download_result['total'] - download_result['success']
                        for failed_data in download_result['failed_list']:  # put the failed records back on the queue
                            self.queue.put(failed_data)
                            print(failed_data)
                batch = download_count % self.CHKTHD
                if batch == 0:
                    # NOTE(review): a window with total_count == 0 raises
                    # ZeroDivisionError here, handled by the except below.
                    self.activity = 1 - float(fail_count_20) / float(total_count)
                    print('[%s]COUNT: %d, FAIL-IN-this%d:%d , avail:%f:' % (
                        self.proxy_ip, self.CHKTHD, total_count, fail_count_20,
                        self.activity))
                    fail_count_20 = 0
                    total_count = 0
                    if self.activity < 0.3:
                        print('[%s]rate of download is %f,too low' % (self.proxy_ip, self.activity))
                        db = MysqlHandle()
                        sql = "update temp_ips_manage set availabity=%s where proxy='%s'" % (
                            self.activity, self.proxy_ip)
                        db.update(sql=sql)
                        db.close()
                        self.proxy = {'http': 'http://' + self.proxy_manger.change_proxy(self.proxy_ip)[1]}
                spider_time = time.time() - start_time
                if spider_time > 600:
                    print('[%s]timeout,we will quit' % (self.proxy_ip))
                    self.proxy = {'http': 'http://' + self.proxy_manger.change_proxy(self.proxy_ip)[1]}
                    start_time = time.time()
                elaspsed = time.time() - start_time_this_round
                interval = random.randint(self.interval_down, self.interval_upp)
                if elaspsed < interval:
                    time.sleep(interval - elaspsed)
            except Exception as e:
                print(e.message)
                print(self.tuple_from_queue)
                # Only a task that is a 3-item list is put back for a retry.
                if type(self.tuple_from_queue)==list and len(self.tuple_from_queue)==3:

                    self.queue.put(self.tuple_from_queue)
Ejemplo n.º 12
0
def get_info(urlmd5, url, driver):
    """Load *url* in *driver* and store the contact details found on the page.

    The contact person, e-mail address and phone number are each taken from
    the first match of their pattern in the UTF-8 encoded page source (None
    when a pattern does not match) and inserted into ``cphi_info_table``
    together with *urlmd5* and *url*.
    """
    driver.get(url)
    page = driver.page_source.encode("utf-8")

    patterns = (
        r"<strong>联系人</strong>:([^=]+)<a",
        r'<a href="mailto:([^"]+@[^"]+)"',
        r"<p><strong>电话</strong>: ([^/]+)</p>",
    )
    found = []
    for pattern in patterns:
        matches = re.findall(pattern, page)
        found.append(matches[0] if matches else None)
    contacts, e_mail, phone_num = found

    db = MysqlHandle()
    sql = "insert into cphi_info_table values(%s,%s,%s,%s,%s)"
    print(e_mail)
    row = (urlmd5, url, contacts, e_mail, phone_num)
    db.insert(sql, [row])
    db.close()
Ejemplo n.º 13
0
    def download_info_page(self, e_id):
        """Download the info page for *e_id* and store four fields from it.

        The page URL is ``self.page_base_url % e_id``.  When a page comes back,
        the web site, company name, booth number and e-mail address are each
        taken from the first match of their pattern (None when absent) and
        inserted into ``diecast``.  Nothing is stored for an empty download.
        """
        page_url = self.page_base_url % (e_id)
        print(page_url)
        page = PageDownload().simple_download(page_url)
        if not page:
            return

        patterns = (
            r"<br />网址:([^<]+)<br />",
            r"<strong>([^<]+)</strong>",
            r'<span class="glyphicon glyphicon-envelope"></span> 展位号: (\w+)',
            r'</span> 邮箱:<a href="mailto:([^<]+@[^<]+)">[^<]+@[^<]+</a>',
        )
        found = []
        for pattern in patterns:
            matches = re.findall(pattern, page)
            found.append(matches[0] if matches else None)
        site, company_name, zw_num, mail = found

        db = MysqlHandle()
        sql = "insert into diecast VALUES (%s,%s,%s,%s)"
        db.insert(sql, [(site, company_name, zw_num, mail)])
        db.close()
Ejemplo n.º 14
0
def get_page_url(line_name, line_type, city_code, coords):
    base_url = 'http://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap&da_par=direct&pcevaname=pc4.1&qt=bl&da_src=searchBox.button&wd=%s&c=%d&l=13&b=(%s)&from=webmap&sug_forward=&tn=B_NORMAL_MAP&nn=0'
    downloader = PageDownload(timeout=5)
    page = downloader.simple_download(base_url %
                                      (line_name, city_code, coords))
    if is_json(page):
        json_data = json.loads(page)
        if not json_data.has_key('content'):
            print base_url
            return
        contents = json_data['content']
        line_list = []
        for item in contents:
            name = item['name']

            if not item.has_key("uid"):
                print name, base_url
                continue
            uid = item['uid']
            page_url = 'http://map.baidu.com/?qt=bsl&tps=&newmap=1&uid=' + uid + '&c=%d' % (
                city_code)
            line_list.append((name, uid, page_url, line_type))
        db = MysqlHandle()
        insert_sql = "insert into baidu_busline_url_analyse values(%s,%s,%s,%s,0)"
        db.insert(insert_sql, line_list)
        db.close()
Ejemplo n.º 15
0
    def add_init_url(self, url_table_name, filter_config, city_code, keyword):
        """Seed the URL table with one list-page URL per sub-boundary.

        ``filter_config["url"]["boundary"]`` has the form
        ``"lng_min,lat_min;lng_max,lat_max"``.  Its corners are truncated to
        integers and split with ``split_boundary(..., 20, 0.2)``.  For every
        cell a list URL is built from *city_code*, *keyword* and the cell text
        and inserted into ``self.url_table`` with type 2 and status 0.
        ``url_table_name`` is not used here, and the seed URL itself is not
        inserted.
        """
        list_url = 'https://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap&da_par=direct&pcevaname=pc4.1&qt=spot&from=webmap&c=%d&wd=%s&wd2=&pn=0&nn=0&db=0&sug=0&addr=0&pl_data_type=life&pl_sort_type=data_type&pl_sort_rule=0&pl_business_type=cinema&pl_business_id=&da_src=pcmappg.poi.page&on_gel=1&src=7&gr=3&l=12&rn=10&tn=B_NORMAL_MAP&ie=utf-8&b=(%s)'
        seed = filter_config["url"]
        corners = seed["boundary"].split(";")
        low_corner = corners[0]
        high_corner = corners[1]
        lat_min = low_corner.split(",")[1]
        lat_max = high_corner.split(",")[1]
        lng_min = low_corner.split(",")[0]
        lng_max = high_corner.split(",")[0]

        cells = split_boundary(int(float(lat_max)), int(float(lat_min)),
                               int(float(lng_max)), int(float(lng_min)),
                               20, 0.2)
        insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,%s,now())"
        for cell in cells:
            # cell[0] holds the latitude pair, cell[1] the longitude pair;
            # the text is written longitude first.
            low_text = str(cell[1][0]) + "," + str(cell[0][0])
            high_text = str(cell[1][1]) + "," + str(cell[0][1])
            cell_text = low_text + ";" + high_text
            new_url = list_url % (city_code, keyword, cell_text)
            row = (to_md5(in_str=new_url), new_url, 2, cell_text, 0)
            db = MysqlHandle()
            db.insert(sql=insert_sql, value_list=[row])
            db.close()
Ejemplo n.º 16
0
def insert_into_stations(shp, workspace, query_item):
    """Insert every station of one bus line as a point feature.

    ``query_item`` is indexed as ``[0]`` line uid, ``[1]`` line name and
    ``[3]`` JSON page text.  Nothing happens when the JSON has no
    ``"content"``.  For each entry of ``content[0]["stations"]`` the last
    ``|``-separated part of its ``geo`` becomes a ``POINT`` WKT and a row is
    inserted into ``shp``.  Afterwards the matching ``baidu_busline_page`` row
    gets ``status=200``.
    """
    l_uid = query_item[0]
    name = query_item[1]
    page = query_item[3]
    json_page = json.loads(page)
    if "content" not in json_page:
        return
    stations = json_page["content"][0]["stations"]

    current_city = json_page["current_city"]
    city = current_city["name"]
    if city is None:
        city = ""
    province = current_city["up_province_name"]
    if province is None:
        province = ""

    fields = ["UID", "NAME", "PROVINCE", "CITY", "L_UID", "L_NAME", "SHAPE@"]
    for station in stations:
        station_name = station["name"]
        # "x,y" at the end of the geo string becomes the WKT form "x y".
        station_geo = station["geo"].strip(";").split("|")[-1].replace(
            ",", " ")
        geo_str = "POINT(%s)" % (station_geo)
        station_uid = station["uid"]
        arcpy.env.workspace = workspace
        point = arcpy.FromWKT(geo_str)
        values = [
            station_uid, station_name, province, city, l_uid, name, point
        ]
        cursor = arcpy.da.InsertCursor(shp, fields)
        try:
            cursor.insertRow(values)
        finally:
            # Release the cursor even if the insert fails.
            del cursor

    db = MysqlHandle()
    # The statement used to read ``item[0]``, a name that does not exist in
    # this function; the uid of the line being processed is what is meant.
    sql = 'update baidu_busline_page set status=200 where uid="' + l_uid + '"'
    try:
        db.update(sql)
    finally:
        db.close()
Ejemplo n.º 17
0
 def add_filter_urls(self, filter_table_name, filter_config):
     """Insert the ``(type, filter)`` pair of every configured filter.

     The pairs come from ``filter_config["filters"]`` and are written to
     *filter_table_name* in a single insert call.
     """
     rows = [(entry["type"], entry["filter"])
             for entry in filter_config["filters"]]
     db = MysqlHandle()
     insert_sql = "insert into "+filter_table_name+" values(%s,%s)"
     db.insert(sql=insert_sql, value_list=rows)
     db.close()
Ejemplo n.º 18
0
def update_geo_data(uid, l_type, city_code, name):
    """Download the ``geo`` value for *uid* and write it to the page table.

    The row of ``bdmap_api_<name>_<city_code>_page_table`` whose ``md5`` equals
    ``to_md5(uid)`` is updated when the response is JSON with a
    ``content.geo`` value.  The function then sleeps for a random 0.5-1.0
    seconds whether or not an update was made.
    """
    page_table = "bdmap_api_" + name + "_" + str(city_code) + "_page_table"
    url = "http://map.baidu.com/?qt=ext&newmap=1&uid=%s&c=%d&nn=0&l=%d&ext_ver=new" % (
        uid, city_code, l_type)
    page = PageDownload().simple_download(url)
    if is_json(page):
        json_data = json.loads(page)
        if json_data.has_key("content") and json_data["content"].has_key("geo"):
            geo = json_data["content"]["geo"]
            print(uid)
            md5 = to_md5(uid)
            # NOTE(review): ``geo`` comes from the remote response and is
            # placed in the statement unescaped.
            sql = "".join(
                ["update ", page_table, ' set geo="', geo, '" where md5="', md5, '"'])
            db = MysqlHandle()
            db.update(sql)
            db.close()
    # Pause between requests.
    time.sleep(random.uniform(0.5, 1.0))
Ejemplo n.º 19
0
def init_spider(city_code, name, boundary):
    """Split *boundary* into cells and insert each one as a type-1 boundary.

    *boundary* unpacks as ``(lng_min, lat_min, lng_max, lat_max)``.  Cells come
    from ``split_boundary(..., 10, 0.1)`` and are written to
    ``bdmap_api_<name>_<city_code>_boundary_table``, keyed by the MD5 of their
    ``"lat_min,lng_min,lat_max,lng_max"`` text.
    """
    boundary_table = "".join(
        ["bdmap_api_", name, "_", str(city_code), "_boundary_table"])
    lng_min, lat_min, lng_max, lat_max = boundary
    cells = split_boundary(float(lat_max), float(lat_min), float(lng_max),
                           float(lng_min), 10, 0.1)
    insert_sql = "insert into " + boundary_table + " values(%s,%s,1,%s,%s,%s,%s,0,0,now())"
    for cell in cells:
        # cell[0] holds the latitude pair, cell[1] the longitude pair.
        lat_pair = cell[0]
        lng_pair = cell[1]
        cell_lat_min, cell_lat_max = lat_pair[0], lat_pair[1]
        cell_lng_min, cell_lng_max = lng_pair[0], lng_pair[1]
        cell_text = ",".join([
            str(cell_lat_min), str(cell_lng_min),
            str(cell_lat_max), str(cell_lng_max)
        ])
        row = [
            to_md5(cell_text), cell_text, cell_lng_min, cell_lat_min,
            cell_lng_max, cell_lat_max
        ]
        db = MysqlHandle()
        db.insert(insert_sql, [row])
        db.close()
Ejemplo n.º 20
0
    def GET(self):
        """Register a user and return a ``{"status", "msg"}`` dict.

        The request must carry ``token`` equal to "whanys" plus ``username``
        and ``password``.  When the lookup query returns a row the request is
        rejected as a duplicate; otherwise ``(username, password, 0)`` is
        inserted.  Status "0" means the insert succeeded, status "1" covers
        every failure.
        """
        inputs = web.input()
        if "token" not in inputs:
            return {"status": "1", "msg": "failed,you need a token!"}
        if inputs["token"] != "whanys":
            return {
                "status": "1",
                "msg": "failed,your token is not true!"
            }
        if "username" not in inputs or "password" not in inputs:
            return {
                "status": "1",
                "msg": "failed,parameters not enough!"
            }
        username = inputs["username"]
        password = inputs["password"]

        # NOTE(review): the lookup contains a literal '******' and never uses
        # ``username``; this looks like redacted text.  Confirm, and bind the
        # value as a parameter when restoring it.
        sql = "select * from user_table_yixin where username='******'"
        db = MysqlHandle()
        try:
            res = db.query(sql)
        finally:
            # This handle used to be overwritten without being closed.
            db.close()
        if res:
            return {
                "status": "1",
                "msg": "failed,the username is already exist!"
            }

        # NOTE(review): the password is stored exactly as received.
        sql = "insert into user_table_yixin values(%s,%s,now(),%s)"
        db = MysqlHandle()
        try:
            inserted = db.insert(sql, [(username, password, 0)])
        finally:
            db.close()
        if inserted:
            return {"status": "0", "msg": "success"}
        # A failed insert used to leave ``result`` unassigned, so the final
        # return raised UnboundLocalError instead of reporting the failure.
        return {"status": "1", "msg": "failed!"}
Ejemplo n.º 21
0
def load_geo_page(table):
    """Return up to 40000 ``(uid, name, type, page)`` rows with NULL status.

    Rows are read from *table*, whose name is placed directly in the statement.
    """
    db = MysqlHandle()
    sql = "select uid,name,type, page from " + table + " where status is null limit 40000 "
    try:
        return db.query(sql)
    finally:
        # Previously the handle was never closed.
        db.close()
Ejemplo n.º 22
0
    def download_page(self, urlmd5, url, proxy):
        """Download an album page, store its fields and mark the URL as done.

        The album name (with the page title as fallback), category, artist,
        download link and password are extracted with
        ``self.extract_field_from_page``.  Returns False when the download
        fails or no download link is found.  Otherwise the row is inserted
        into ``self.page_table``, the URL gets ``status=200`` in
        ``self.url_table`` and True is returned.
        """
        page = PageDownload(proxy=proxy).simple_download(url=url)
        if page is None:
            return False

        extract = self.extract_field_from_page
        file_name = extract(page=page, reg=r'专辑名称:([^<]+)')
        if file_name is None:
            file_name = extract(page=page, reg=r'<h1 class="title">([^<]+)</h1>')
        music_type = extract(page=page, reg=r'&nbsp;<a href="/\w+/">([^<]+)</a>')
        singer_name = extract(page=page, reg=r'专辑艺人:([^<]+)')
        baiduyun_url = extract(page=page, reg=r"""<a href="#ecms" onclick="window.open\('([^<]+)','','width=300,height=300,resizable=yes'\)""")
        print(baiduyun_url)
        if baiduyun_url is None:
            return False
        # The extracted link is prefixed with the site domain.
        baiduyun_url = self.domain+baiduyun_url
        baiduyun_password = extract(page=page, reg=r'密码: (\w+)')

        row = (urlmd5, url, file_name, music_type, singer_name, baiduyun_url,
               baiduyun_password)
        sql = "insert into " + self.page_table + "  values (%s,%s,%s,%s,%s,%s,%s,now())"
        db = MysqlHandle()
        db.insert(sql=sql, value_list=[row])
        db.close()

        update_sql = "update " + self.url_table + (" set status=200 where urlmd5='%s'" % (urlmd5))
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
Ejemplo n.º 23
0
def spider(city_code, name, keyword, key_token):
    """Crawl Baidu place search results for every pending boundary.

    Pending boundaries are the ``status=0`` rows of
    ``bdmap_api_<name>_<city_code>_boundary_table``.  For each one the first
    result page is requested; when its ``status`` is 0 and ``total`` is
    positive, pages ``0 .. total // 10 + 1`` are requested and every result is
    inserted into the matching page table.  Whenever the first response was
    JSON the boundary is updated to ``status=200`` with its ``total``.
    """
    boundary_table = "bdmap_api_" + name + "_" + str(
        city_code) + "_boundary_table"
    page_table = "bdmap_api_" + name + "_" + str(city_code) + "_page_table"
    base_url = "http://api.map.baidu.com/place/v2/search?query=%s&scope=2&bounds=%s&output=json&ak=%s&page_num=%d"
    insert_sql = "insert into " + page_table + " values(%s,%s,%s,%s,null,%s,%s,%s,%s,%s,null,null,%s,null,now(),null)"
    sql = "select md5, boundary from " + boundary_table + " where status=0"
    db = MysqlHandle()
    try:
        res_data = db.query(sql)
    finally:
        # The loop below used to overwrite ``db`` with fresh handles, so this
        # one was never closed.
        db.close()
    for (md5, boundary) in res_data:
        url = base_url % (keyword, boundary, key_token, 0)
        downloader = PageDownload()
        page = downloader.simple_download(url)
        if not is_json(page):
            continue
        json_data = json.loads(page)
        status = json_data["status"]
        total = json_data["total"]
        print(boundary, url, total)
        if status == 0 and int(total) > 0:
            # Floor division spelled out so the range stays an int under any
            # Python version (a bare "/" only floors on Python 2).
            page_count = int(total) // 10
            for x in range(0, page_count + 2):
                _url = base_url % (keyword, boundary, key_token, x)
                downloader = PageDownload()
                _page = downloader.simple_download(_url)
                if not is_json(_page):
                    continue
                _json_data = json.loads(_page)
                for item in _json_data["results"]:
                    # Named poi_name so the ``name`` argument is not shadowed.
                    poi_name = item["name"]
                    address = item["address"]
                    province = item["province"]
                    city = item["city"]
                    area = item["area"]
                    uid = item["uid"]
                    _md5 = to_md5(uid)
                    lat = item["location"]["lat"]
                    lng = item["location"]["lng"]
                    try:
                        tag = item["detail_info"]["tag"]
                    except Exception as e:
                        # The tag is optional; store NULL when it is missing.
                        tag = None
                        print(e.message)
                    row_db = MysqlHandle()
                    row_db.insert(insert_sql, [[
                        _md5, uid, poi_name, address, province, city, area,
                        lng, lat, tag
                    ]])
                    row_db.close()

        update_sql = 'update ' + boundary_table + ' set status=200,total_count=' + str(
            total) + ' where md5="' + md5 + '"'
        update_db = MysqlHandle()
        update_db.update(update_sql)
        update_db.close()
0
def filter_avaliable_ips():
    """Probe every stored proxy and collect those that return a page.

    ``TEMP_IPS_MANAGE`` is emptied first; if that delete fails the function
    returns.  Each proxy from ``AI_PROXY_IPS`` is then used for one POST to
    ``TEST_URL`` with a 3 second timeout, and rows whose request returns a page
    are appended to ``AVALIABLE_IPS``.  Errors for one proxy are printed and
    the loop moves on.
    """
    db = MysqlHandle()
    is_success = db.delete('DELETE  FROM TEMP_IPS_MANAGE')
    db.close()
    if not is_success:
        return

    # Fetch all proxy IPs.
    db = MysqlHandle()
    IP_LIST = db.query('SELECT PROXY FROM AI_PROXY_IPS')
    db.close()

    for ip in IP_LIST:
        PROXY = {'http': 'http://' + ip[0]}  # proxy
        print('filtering ip:' + ip[0])
        downloader = PageDownload(hd=mobike_headers, proxy=PROXY, timeout=3)
        post_data = {
            'longitude': '121.1883',
            'latitude': '31.05147',
            'citycode': '021',
            'errMsg': 'getMapCenterLocation:ok'
        }
        try:
            page = downloader.download_with_post(url=TEST_URL,
                                                 post_data=post_data)
            if page is not None:
                AVALIABLE_IPS.append(ip)
                print(ip[0] + " is ok!")
        except Exception as e:
            print(str(e))
Ejemplo n.º 25
0
def download_page():
    """Download the detail page of every pending bus line and store it.

    Pending lines are the ``status=0`` rows of ``baidu_busline_url_analyse``,
    one per ``uid``.  A downloaded page is inserted into
    ``baidu_busline_page``; only when that insert succeeds is the source row
    updated to ``status=200``.
    """
    db = MysqlHandle()
    query_sql = "select uid,min(name),min(line_type), min(page_url) from baidu_busline_url_analyse where status=0 group by uid "
    page_infs = db.query(query_sql)
    db.close()
    downloader = PageDownload()

    insert_sql = "insert into baidu_busline_page values(%s,%s,%s,%s,NULL )"
    for uid, name, line_type, page_url in page_infs:
        print(uid)
        page = downloader.simple_download(page_url)
        db = MysqlHandle()
        stored = False
        if page is not None:
            stored = db.insert(insert_sql, [(uid, name, line_type, page)])
        if stored:
            update_sql = "update baidu_busline_url_analyse set status=200 where uid='%s'" % (
                uid)
            db.update(update_sql)
        db.close()
Ejemplo n.º 26
0
def district_table(table_name):
    """Rewrite *table_name* so that every proxy appears only once.

    The distinct ``proxy`` values are read, all rows are deleted, and one row
    ``(proxy, now(), 0, 100)`` is inserted per distinct value.  A message is
    printed when the insert reports success.
    """
    reader = MysqlHandle()
    proxys = reader.query('select distinct proxy  from '+table_name)
    reader.close()

    cleaner = MysqlHandle()
    cleaner.delete('delete from '+table_name)
    cleaner.close()

    writer = MysqlHandle()
    insert_sql = 'insert into '+table_name+' values (%s,now(),0,100)'
    if writer.insert(insert_sql, proxys):
        print(u'The filtering has finished!')
    writer.close()
Ejemplo n.º 27
0
def filter_avaliable_ips():
    """Probe every stored proxy and collect those that return a page.

    ``TEMP_IPS_MANAGE`` is emptied first; if that delete fails the function
    returns.  Each proxy from ``AI_PROXY_IPS`` is then used for one multipart
    POST to ``TEST_URL``, and rows whose request returns a page are appended to
    ``AVALIABLE_IPS``.  Errors for one proxy are printed and the loop moves on.
    """
    db = MysqlHandle()
    is_success = db.delete('DELETE  FROM TEMP_IPS_MANAGE')
    db.close()
    if not is_success:
        return

    # Fetch all proxy IPs.
    db = MysqlHandle()
    IP_LIST = db.query('SELECT PROXY FROM AI_PROXY_IPS')
    db.close()

    for ip in IP_LIST:
        PROXY = {'http': 'http://'+ip[0]}  # proxy
        print('filtering ip:'+ip[0])
        downloader = PageDownload(hd=ofo_headers,proxy=PROXY)
        try:
            form_fields = {
                "lat": "30.515133",
                "lng": "114.346161",
                "token": "7eb9b200-3d7f-11e8-b714-e9d19c19f7b0",
                "source": "-5",
                "source-version": "10005"
            }
            post_data = MultipartEncoder(
                form_fields,
                boundary='----ofo-boundary-MC40MjcxMzUw')
            page = downloader.download_with_post(url=TEST_URL,post_data=post_data)
            if page is not None:
                AVALIABLE_IPS.append(ip)
                print(ip[0]+" is ok!")
        except Exception as e:
            print(str(e))
Ejemplo n.º 28
0
        "token": "7eb9b200-3d7f-11e8-b714-e9d19c19f7b0",
        "source": "-5",
        "source-version": "10005"
    },
		boundary='----ofo-boundary-MC40MjcxMzUw'
	)
            page = downloader.download_with_post(url=TEST_URL,post_data=post_data)
            if page is not None:
                AVALIABLE_IPS.append(ip)
                print ip[0]+" is ok!"
            else:
                pass
        except Exception, e:
            print str(e)
            pass
    db = MysqlHandle()
    db.insert('INSERT INTO TEMP_IPS_MANAGE VALUES (%s,now(),0,100)', AVALIABLE_IPS)
    db.close()
    district_table('TEMP_IPS_MANAGE')


#  Remove duplicate proxy IPs
def district_table(table_name):
    """Read the distinct ``proxy`` values of *table_name*, then delete its rows.

    NOTE(review): the values fetched into ``proxys`` are not used in the lines
    visible here, so the table is left empty.  This definition may be
    truncated -- confirm against the complete source.
    """
    query_sql = 'select distinct proxy  from '+table_name
    db = MysqlHandle()
    proxys = db.query(query_sql)
    db.close()
    # Every row is removed; the statement has no WHERE clause.
    delete_sql = 'delete from '+table_name
    db = MysqlHandle()
    db.delete(delete_sql)
    db.close()
Ejemplo n.º 29
0
 def add_init_url(self, url_table_name, filter_config, city_code, keyword):
     """Insert the configured seed URL into *url_table_name*.

     The row is built from the ``urlmd5``, ``url``, ``type`` and ``status``
     entries of ``filter_config["url"]``.  ``city_code`` and ``keyword`` are
     not used here.
     """
     seed = filter_config["url"]
     row = (seed["urlmd5"], seed["url"], seed["type"], seed["status"])
     insert_sql = "insert into " + url_table_name + " values(%s,%s,%s,%s,now())"
     db = MysqlHandle()
     db.insert(sql=insert_sql, value_list=[row])
     db.close()
Ejemplo n.º 30
0
 def download_list_page(self, urlmd5, url, proxy, domain=None):
     """Download a list page and queue every new URL that passes the filters.

     Links are found with ``self.reg``.  A link starting with "/" is prefixed
     with *domain* when one is given.  Links for which ``self.filter_url``
     returns a type and whose MD5 is not yet in ``self.url_table`` are inserted
     with status 0.  The list page itself is then updated to ``status=200``.

     Returns True when the page was downloaded, False otherwise.
     """
     page = PageDownload(proxy=proxy,timeout=10).simple_download(url=url)
     if page is None:
         return False

     for found in re.findall(self.reg, page):
         if domain is not None and found.startswith("/"):
             new_url = domain + found
         else:
             new_url = found

         url_type = self.filter_url(url=new_url)
         if url_type is None:
             continue

         new_urlmd5 = to_md5(in_str=new_url)
         lookup_sql = "select * from  "+self.url_table+(" where urlmd5='%s'" % (new_urlmd5))
         db = MysqlHandle()
         results = db.query(sql=lookup_sql)
         db.close()
         if results:
             print("This url is already in the database!!")
             continue

         db = MysqlHandle()
         insert_sql = "insert into "+self.url_table+" values (%s,%s,%s,%s,now())"
         db.insert(sql=insert_sql, value_list=[(new_urlmd5, new_url, url_type, 0)])
         db.close()

     update_sql = "update "+self.url_table+(" set status=200 where urlmd5='%s'" % (urlmd5))
     db = MysqlHandle()
     db.update(sql=update_sql)
     db.close()
     return True